/src/php-src/ext/pcre/pcre2lib/pcre2_substitute.c
Line | Count | Source |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | | New API code Copyright (c) 2016-2022 University of Cambridge |
11 | | |
12 | | ----------------------------------------------------------------------------- |
13 | | Redistribution and use in source and binary forms, with or without |
14 | | modification, are permitted provided that the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the University of Cambridge nor the names of its |
24 | | contributors may be used to endorse or promote products derived from |
25 | | this software without specific prior written permission. |
26 | | |
27 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | | POSSIBILITY OF SUCH DAMAGE. |
38 | | ----------------------------------------------------------------------------- |
39 | | */ |
40 | | |
41 | | |
42 | | #ifdef HAVE_CONFIG_H |
43 | | #include "config.h" |
44 | | #endif |
45 | | |
46 | | #include "pcre2_internal.h" |
47 | | |
/* Capacity of the pointer stack that pcre2_substitute() uses while it is
processing nested replacement texts. Two entries are saved for each nested
text (the resume pointer and the previous end pointer). */

#define PTR_STACK_SIZE 20

/* All the option bits that belong to substitution itself. pcre2_substitute()
copies these bits into its own variable and clears them from the options
that it hands to pcre2_match(). */

#define SUBSTITUTE_OPTIONS \
  (PCRE2_SUBSTITUTE_EXTENDED         | \
   PCRE2_SUBSTITUTE_GLOBAL           | \
   PCRE2_SUBSTITUTE_LITERAL          | \
   PCRE2_SUBSTITUTE_MATCHED          | \
   PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  | \
   PCRE2_SUBSTITUTE_REPLACEMENT_ONLY | \
   PCRE2_SUBSTITUTE_UNKNOWN_UNSET    | \
   PCRE2_SUBSTITUTE_UNSET_EMPTY)
55 | | |
56 | | |
57 | | |
58 | | /************************************************* |
59 | | * Find end of substitute text * |
60 | | *************************************************/ |
61 | | |
62 | | /* In extended mode, we recognize ${name:+set text:unset text} and similar |
63 | | constructions. This requires the identification of unescaped : and } |
64 | | characters. This function scans for such. It must deal with nested ${ |
65 | | constructions. The pointer to the text is updated, either to the required end |
66 | | character, or to where an error was detected. |
67 | | |
68 | | Arguments: |
69 | | code points to the compiled expression (for options) |
70 | | ptrptr points to the pointer to the start of the text (updated) |
71 | | ptrend end of the whole string |
72 | | last TRUE if the last expected string (only } recognized) |
73 | | |
74 | | Returns: 0 on success |
75 | | negative error code on failure |
76 | | */ |
77 | | |
78 | | static int |
79 | | find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, |
80 | | BOOL last) |
81 | 0 | { |
82 | 0 | int rc = 0; |
83 | 0 | uint32_t nestlevel = 0; |
84 | 0 | BOOL literal = FALSE; |
85 | 0 | PCRE2_SPTR ptr = *ptrptr; |
86 | |
|
87 | 0 | for (; ptr < ptrend; ptr++) |
88 | 0 | { |
89 | 0 | if (literal) |
90 | 0 | { |
91 | 0 | if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E) |
92 | 0 | { |
93 | 0 | literal = FALSE; |
94 | 0 | ptr += 1; |
95 | 0 | } |
96 | 0 | } |
97 | | |
98 | 0 | else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) |
99 | 0 | { |
100 | 0 | if (nestlevel == 0) goto EXIT; |
101 | 0 | nestlevel--; |
102 | 0 | } |
103 | | |
104 | 0 | else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT; |
105 | | |
106 | 0 | else if (*ptr == CHAR_DOLLAR_SIGN) |
107 | 0 | { |
108 | 0 | if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET) |
109 | 0 | { |
110 | 0 | nestlevel++; |
111 | 0 | ptr += 1; |
112 | 0 | } |
113 | 0 | } |
114 | | |
115 | 0 | else if (*ptr == CHAR_BACKSLASH) |
116 | 0 | { |
117 | 0 | int erc; |
118 | 0 | int errorcode; |
119 | 0 | uint32_t ch; |
120 | |
|
121 | 0 | if (ptr < ptrend - 1) switch (ptr[1]) |
122 | 0 | { |
123 | 0 | case CHAR_L: |
124 | 0 | case CHAR_l: |
125 | 0 | case CHAR_U: |
126 | 0 | case CHAR_u: |
127 | 0 | ptr += 1; |
128 | 0 | continue; |
129 | 0 | } |
130 | | |
131 | 0 | ptr += 1; /* Must point after \ */ |
132 | 0 | erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode, |
133 | 0 | code->overall_options, code->extra_options, FALSE, NULL); |
134 | 0 | ptr -= 1; /* Back to last code unit of escape */ |
135 | 0 | if (errorcode != 0) |
136 | 0 | { |
137 | 0 | rc = errorcode; |
138 | 0 | goto EXIT; |
139 | 0 | } |
140 | | |
141 | 0 | switch(erc) |
142 | 0 | { |
143 | 0 | case 0: /* Data character */ |
144 | 0 | case ESC_E: /* Isolated \E is ignored */ |
145 | 0 | break; |
146 | | |
147 | 0 | case ESC_Q: |
148 | 0 | literal = TRUE; |
149 | 0 | break; |
150 | | |
151 | 0 | default: |
152 | 0 | rc = PCRE2_ERROR_BADREPESCAPE; |
153 | 0 | goto EXIT; |
154 | 0 | } |
155 | 0 | } |
156 | 0 | } |
157 | | |
158 | 0 | rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */ |
159 | |
|
160 | 0 | EXIT: |
161 | 0 | *ptrptr = ptr; |
162 | 0 | return rc; |
163 | 0 | } |
164 | | |
165 | | |
166 | | |
167 | | /************************************************* |
168 | | * Match and substitute * |
169 | | *************************************************/ |
170 | | |
171 | | /* This function applies a compiled re to a subject string and creates a new |
172 | | string with substitutions. The first 7 arguments are the same as for |
173 | | pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED. |
174 | | |
175 | | Arguments: |
176 | | code points to the compiled expression |
177 | | subject points to the subject string |
178 | | length length of subject string (may contain binary zeros) |
179 | | start_offset where to start in the subject string |
180 | | options option bits |
181 | | match_data points to a match_data block, or is NULL |
182 | | context points to a PCRE2 context |
183 | | replacement points to the replacement string |
184 | | rlength length of replacement string |
185 | | buffer where to put the substituted string |
186 | | blength points to length of buffer; updated to length of string |
187 | | |
188 | | Returns: >= 0 number of substitutions made |
189 | | < 0 an error code |
190 | | PCRE2_ERROR_BADREPLACEMENT means invalid use of $ |
191 | | */ |
192 | | |
/* This macro checks for space in the buffer before copying into it. It is
meant for use inside pcre2_substitute() and relies on these names being in
scope at the point of use: buffer, buff_offset, lengthleft, extra_needed,
overflowed, suboptions, and the label NOROOM.

  from      start of the code units to copy
  length    number of code units to copy

When there is not enough room, control goes to NOROOM unless
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, in which case nothing more is copied
and the shortfall is accumulated in extra_needed. Once an overflow has been
recorded, later uses only add to extra_needed.

Each use of an argument is parenthesized so that an expression such as
"cond? a : b" can be passed safely. The length argument is evaluated more than
once, so it must not have side effects. The expansion is a brace block rather
than a single statement. */

#define CHECKMEMCPY(from,length) \
  { \
  if (!overflowed && lengthleft < (length)) \
    { \
    if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
    overflowed = TRUE; \
    extra_needed = (length) - lengthleft; \
    } \
  else if (overflowed) \
    { \
    extra_needed += (length); \
    } \
  else \
    {  \
    memcpy(buffer + buff_offset, (from), CU2BYTES(length)); \
    buff_offset += (length); \
    lengthleft -= (length); \
    } \
  }
216 | | |
217 | | /* Here's the function */ |
218 | | |
219 | | PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION |
220 | | pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, |
221 | | PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, |
222 | | pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength, |
223 | | PCRE2_UCHAR *buffer, PCRE2_SIZE *blength) |
224 | 0 | { |
225 | 0 | int rc; |
226 | 0 | int subs; |
227 | 0 | int forcecase = 0; |
228 | 0 | int forcecasereset = 0; |
229 | 0 | uint32_t ovector_count; |
230 | 0 | uint32_t goptions = 0; |
231 | 0 | uint32_t suboptions; |
232 | 0 | pcre2_match_data *internal_match_data = NULL; |
233 | 0 | BOOL escaped_literal = FALSE; |
234 | 0 | BOOL overflowed = FALSE; |
235 | 0 | BOOL use_existing_match; |
236 | 0 | BOOL replacement_only; |
237 | 0 | #ifdef SUPPORT_UNICODE |
238 | 0 | BOOL utf = (code->overall_options & PCRE2_UTF) != 0; |
239 | 0 | BOOL ucp = (code->overall_options & PCRE2_UCP) != 0; |
240 | 0 | #endif |
241 | 0 | PCRE2_UCHAR temp[6]; |
242 | 0 | PCRE2_SPTR ptr; |
243 | 0 | PCRE2_SPTR repend; |
244 | 0 | PCRE2_SIZE extra_needed = 0; |
245 | 0 | PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength; |
246 | 0 | PCRE2_SIZE *ovector; |
247 | 0 | PCRE2_SIZE ovecsave[3]; |
248 | 0 | pcre2_substitute_callout_block scb; |
249 | | |
250 | | /* General initialization */ |
251 | |
|
252 | 0 | buff_offset = 0; |
253 | 0 | lengthleft = buff_length = *blength; |
254 | 0 | *blength = PCRE2_UNSET; |
255 | 0 | ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET; |
256 | | |
257 | | /* Partial matching is not valid. This must come after setting *blength to |
258 | | PCRE2_UNSET, so as not to imply an offset in the replacement. */ |
259 | |
|
260 | 0 | if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0) |
261 | 0 | return PCRE2_ERROR_BADOPTION; |
262 | | |
263 | | /* Validate length and find the end of the replacement. A NULL replacement of |
264 | | zero length is interpreted as an empty string. */ |
265 | | |
266 | 0 | if (replacement == NULL) |
267 | 0 | { |
268 | 0 | if (rlength != 0) return PCRE2_ERROR_NULL; |
269 | 0 | replacement = (PCRE2_SPTR)""; |
270 | 0 | } |
271 | | |
272 | 0 | if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement); |
273 | 0 | repend = replacement + rlength; |
274 | | |
275 | | /* Check for using a match that has already happened. Note that the subject |
276 | | pointer in the match data may be NULL after a no-match. */ |
277 | |
|
278 | 0 | use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0); |
279 | 0 | replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0); |
280 | | |
281 | | /* If starting from an existing match, there must be an externally provided |
282 | | match data block. We create an internal match_data block in two cases: (a) an |
283 | | external one is not supplied (and we are not starting from an existing match); |
284 | | (b) an existing match is to be used for the first substitution. In the latter |
285 | | case, we copy the existing match into the internal block, except for any cached |
286 | | heap frame size and pointer. This ensures that no changes are made to the |
287 | | external match data block. */ |
288 | |
|
289 | 0 | if (match_data == NULL) |
290 | 0 | { |
291 | 0 | pcre2_general_context *gcontext; |
292 | 0 | if (use_existing_match) return PCRE2_ERROR_NULL; |
293 | 0 | gcontext = (mcontext == NULL)? |
294 | 0 | (pcre2_general_context *)code : |
295 | 0 | (pcre2_general_context *)mcontext; |
296 | 0 | match_data = internal_match_data = |
297 | 0 | pcre2_match_data_create_from_pattern(code, gcontext); |
298 | 0 | if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; |
299 | 0 | } |
300 | | |
301 | 0 | else if (use_existing_match) |
302 | 0 | { |
303 | 0 | pcre2_general_context *gcontext = (mcontext == NULL)? |
304 | 0 | (pcre2_general_context *)code : |
305 | 0 | (pcre2_general_context *)mcontext; |
306 | 0 | int pairs = (code->top_bracket + 1 < match_data->oveccount)? |
307 | 0 | code->top_bracket + 1 : match_data->oveccount; |
308 | 0 | internal_match_data = pcre2_match_data_create(match_data->oveccount, |
309 | 0 | gcontext); |
310 | 0 | if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; |
311 | 0 | memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector) |
312 | 0 | + 2*pairs*sizeof(PCRE2_SIZE)); |
313 | 0 | internal_match_data->heapframes = NULL; |
314 | 0 | internal_match_data->heapframes_size = 0; |
315 | 0 | match_data = internal_match_data; |
316 | 0 | } |
317 | | |
318 | | /* Remember ovector details */ |
319 | | |
320 | 0 | ovector = pcre2_get_ovector_pointer(match_data); |
321 | 0 | ovector_count = pcre2_get_ovector_count(match_data); |
322 | | |
323 | | /* Fixed things in the callout block */ |
324 | |
|
325 | 0 | scb.version = 0; |
326 | 0 | scb.input = subject; |
327 | 0 | scb.output = (PCRE2_SPTR)buffer; |
328 | 0 | scb.ovector = ovector; |
329 | | |
330 | | /* A NULL subject of zero length is treated as an empty string. */ |
331 | |
|
332 | 0 | if (subject == NULL) |
333 | 0 | { |
334 | 0 | if (length != 0) return PCRE2_ERROR_NULL; |
335 | 0 | subject = (PCRE2_SPTR)""; |
336 | 0 | } |
337 | | |
338 | | /* Find length of zero-terminated subject */ |
339 | | |
340 | 0 | if (length == PCRE2_ZERO_TERMINATED) |
341 | 0 | length = subject? PRIV(strlen)(subject) : 0; |
342 | | |
343 | | /* Check UTF replacement string if necessary. */ |
344 | |
|
345 | 0 | #ifdef SUPPORT_UNICODE |
346 | 0 | if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) |
347 | 0 | { |
348 | 0 | rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar)); |
349 | 0 | if (rc != 0) |
350 | 0 | { |
351 | 0 | match_data->leftchar = 0; |
352 | 0 | goto EXIT; |
353 | 0 | } |
354 | 0 | } |
355 | 0 | #endif /* SUPPORT_UNICODE */ |
356 | | |
357 | | /* Save the substitute options and remove them from the match options. */ |
358 | | |
359 | 0 | suboptions = options & SUBSTITUTE_OPTIONS; |
360 | 0 | options &= ~SUBSTITUTE_OPTIONS; |
361 | | |
362 | | /* Error if the start match offset is greater than the length of the subject. */ |
363 | |
|
364 | 0 | if (start_offset > length) |
365 | 0 | { |
366 | 0 | match_data->leftchar = 0; |
367 | 0 | rc = PCRE2_ERROR_BADOFFSET; |
368 | 0 | goto EXIT; |
369 | 0 | } |
370 | | |
371 | | /* Copy up to the start offset, unless only the replacement is required. */ |
372 | | |
373 | 0 | if (!replacement_only) CHECKMEMCPY(subject, start_offset); |
374 | | |
375 | | /* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first |
376 | | match is taken from the match_data that was passed in. */ |
377 | |
|
378 | 0 | subs = 0; |
379 | 0 | do |
380 | 0 | { |
381 | 0 | PCRE2_SPTR ptrstack[PTR_STACK_SIZE]; |
382 | 0 | uint32_t ptrstackptr = 0; |
383 | |
|
384 | 0 | if (use_existing_match) |
385 | 0 | { |
386 | 0 | rc = match_data->rc; |
387 | 0 | use_existing_match = FALSE; |
388 | 0 | } |
389 | 0 | else rc = pcre2_match(code, subject, length, start_offset, options|goptions, |
390 | 0 | match_data, mcontext); |
391 | |
|
392 | 0 | #ifdef SUPPORT_UNICODE |
393 | 0 | if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */ |
394 | 0 | #endif |
395 | | |
396 | | /* Any error other than no match returns the error code. No match when not |
397 | | doing the special after-empty-match global rematch, or when at the end of the |
398 | | subject, breaks the global loop. Otherwise, advance the starting point by one |
399 | | character, copying it to the output, and try again. */ |
400 | |
|
401 | 0 | if (rc < 0) |
402 | 0 | { |
403 | 0 | PCRE2_SIZE save_start; |
404 | |
|
405 | 0 | if (rc != PCRE2_ERROR_NOMATCH) goto EXIT; |
406 | 0 | if (goptions == 0 || start_offset >= length) break; |
407 | | |
408 | | /* Advance by one code point. Then, if CRLF is a valid newline sequence and |
409 | | we have advanced into the middle of it, advance one more code point. In |
410 | | other words, do not start in the middle of CRLF, even if CR and LF on their |
411 | | own are valid newlines. */ |
412 | | |
413 | 0 | save_start = start_offset++; |
414 | 0 | if (subject[start_offset-1] == CHAR_CR && |
415 | 0 | code->newline_convention != PCRE2_NEWLINE_CR && |
416 | 0 | code->newline_convention != PCRE2_NEWLINE_LF && |
417 | 0 | start_offset < length && |
418 | 0 | subject[start_offset] == CHAR_LF) |
419 | 0 | start_offset++; |
420 | | |
421 | | /* Otherwise, in UTF mode, advance past any secondary code points. */ |
422 | | |
423 | 0 | else if ((code->overall_options & PCRE2_UTF) != 0) |
424 | 0 | { |
425 | 0 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
426 | 0 | while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80) |
427 | 0 | start_offset++; |
428 | | #elif PCRE2_CODE_UNIT_WIDTH == 16 |
429 | | while (start_offset < length && |
430 | | (subject[start_offset] & 0xfc00) == 0xdc00) |
431 | | start_offset++; |
432 | | #endif |
433 | 0 | } |
434 | | |
435 | | /* Copy what we have advanced past (unless not required), reset the special |
436 | | global options, and continue to the next match. */ |
437 | |
|
438 | 0 | fraglength = start_offset - save_start; |
439 | 0 | if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength); |
440 | 0 | goptions = 0; |
441 | 0 | continue; |
442 | 0 | } |
443 | | |
444 | | /* Handle a successful match. Matches that use \K to end before they start |
445 | | or start before the current point in the subject are not supported. */ |
446 | | |
447 | 0 | if (ovector[1] < ovector[0] || ovector[0] < start_offset) |
448 | 0 | { |
449 | 0 | rc = PCRE2_ERROR_BADSUBSPATTERN; |
450 | 0 | goto EXIT; |
451 | 0 | } |
452 | | |
453 | | /* Check for the same match as previous. This is legitimate after matching an |
454 | | empty string that starts after the initial match offset. We have tried again |
455 | | at the match point in case the pattern is one like /(?<=\G.)/ which can never |
456 | | match at its starting point, so running the match achieves the bumpalong. If |
457 | | we do get the same (null) match at the original match point, it isn't such a |
458 | | pattern, so we now do the empty string magic. In all other cases, a repeat |
459 | | match should never occur. */ |
460 | | |
461 | 0 | if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1]) |
462 | 0 | { |
463 | 0 | if (ovector[0] == ovector[1] && ovecsave[2] != start_offset) |
464 | 0 | { |
465 | 0 | goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; |
466 | 0 | ovecsave[2] = start_offset; |
467 | 0 | continue; /* Back to the top of the loop */ |
468 | 0 | } |
469 | 0 | rc = PCRE2_ERROR_INTERNAL_DUPMATCH; |
470 | 0 | goto EXIT; |
471 | 0 | } |
472 | | |
473 | | /* Count substitutions with a paranoid check for integer overflow; surely no |
474 | | real call to this function would ever hit this! */ |
475 | | |
476 | 0 | if (subs == INT_MAX) |
477 | 0 | { |
478 | 0 | rc = PCRE2_ERROR_TOOMANYREPLACE; |
479 | 0 | goto EXIT; |
480 | 0 | } |
481 | 0 | subs++; |
482 | | |
483 | | /* Copy the text leading up to the match (unless not required), and remember |
484 | | where the insert begins and how many ovector pairs are set. */ |
485 | |
|
486 | 0 | if (rc == 0) rc = ovector_count; |
487 | 0 | fraglength = ovector[0] - start_offset; |
488 | 0 | if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength); |
489 | 0 | scb.output_offsets[0] = buff_offset; |
490 | 0 | scb.oveccount = rc; |
491 | | |
492 | | /* Process the replacement string. If the entire replacement is literal, just |
493 | | copy it with length check. */ |
494 | |
|
495 | 0 | ptr = replacement; |
496 | 0 | if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0) |
497 | 0 | { |
498 | 0 | CHECKMEMCPY(ptr, rlength); |
499 | 0 | } |
500 | | |
501 | | /* Within a non-literal replacement, which must be scanned character by |
502 | | character, local literal mode can be set by \Q, but only in extended mode |
503 | | when backslashes are being interpreted. In extended mode we must handle |
504 | | nested substrings that are to be reprocessed. */ |
505 | | |
506 | 0 | else for (;;) |
507 | 0 | { |
508 | 0 | uint32_t ch; |
509 | 0 | unsigned int chlen; |
510 | | |
511 | | /* If at the end of a nested substring, pop the stack. */ |
512 | |
|
513 | 0 | if (ptr >= repend) |
514 | 0 | { |
515 | 0 | if (ptrstackptr == 0) break; /* End of replacement string */ |
516 | 0 | repend = ptrstack[--ptrstackptr]; |
517 | 0 | ptr = ptrstack[--ptrstackptr]; |
518 | 0 | continue; |
519 | 0 | } |
520 | | |
521 | | /* Handle the next character */ |
522 | | |
523 | 0 | if (escaped_literal) |
524 | 0 | { |
525 | 0 | if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E) |
526 | 0 | { |
527 | 0 | escaped_literal = FALSE; |
528 | 0 | ptr += 2; |
529 | 0 | continue; |
530 | 0 | } |
531 | 0 | goto LOADLITERAL; |
532 | 0 | } |
533 | | |
534 | | /* Not in literal mode. */ |
535 | | |
536 | 0 | if (*ptr == CHAR_DOLLAR_SIGN) |
537 | 0 | { |
538 | 0 | int group, n; |
539 | 0 | uint32_t special = 0; |
540 | 0 | BOOL inparens; |
541 | 0 | BOOL star; |
542 | 0 | PCRE2_SIZE sublength; |
543 | 0 | PCRE2_SPTR text1_start = NULL; |
544 | 0 | PCRE2_SPTR text1_end = NULL; |
545 | 0 | PCRE2_SPTR text2_start = NULL; |
546 | 0 | PCRE2_SPTR text2_end = NULL; |
547 | 0 | PCRE2_UCHAR next; |
548 | 0 | PCRE2_UCHAR name[33]; |
549 | |
|
550 | 0 | if (++ptr >= repend) goto BAD; |
551 | 0 | if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL; |
552 | | |
553 | 0 | group = -1; |
554 | 0 | n = 0; |
555 | 0 | inparens = FALSE; |
556 | 0 | star = FALSE; |
557 | |
|
558 | 0 | if (next == CHAR_LEFT_CURLY_BRACKET) |
559 | 0 | { |
560 | 0 | if (++ptr >= repend) goto BAD; |
561 | 0 | next = *ptr; |
562 | 0 | inparens = TRUE; |
563 | 0 | } |
564 | | |
565 | 0 | if (next == CHAR_ASTERISK) |
566 | 0 | { |
567 | 0 | if (++ptr >= repend) goto BAD; |
568 | 0 | next = *ptr; |
569 | 0 | star = TRUE; |
570 | 0 | } |
571 | | |
572 | 0 | if (!star && next >= CHAR_0 && next <= CHAR_9) |
573 | 0 | { |
574 | 0 | group = next - CHAR_0; |
575 | 0 | while (++ptr < repend) |
576 | 0 | { |
577 | 0 | next = *ptr; |
578 | 0 | if (next < CHAR_0 || next > CHAR_9) break; |
579 | 0 | group = group * 10 + next - CHAR_0; |
580 | | |
581 | | /* A check for a number greater than the highest captured group |
582 | | is sufficient here; no need for a separate overflow check. If unknown |
583 | | groups are to be treated as unset, just skip over any remaining |
584 | | digits and carry on. */ |
585 | |
|
586 | 0 | if (group > code->top_bracket) |
587 | 0 | { |
588 | 0 | if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) |
589 | 0 | { |
590 | 0 | while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9); |
591 | 0 | break; |
592 | 0 | } |
593 | 0 | else |
594 | 0 | { |
595 | 0 | rc = PCRE2_ERROR_NOSUBSTRING; |
596 | 0 | goto PTREXIT; |
597 | 0 | } |
598 | 0 | } |
599 | 0 | } |
600 | 0 | } |
601 | 0 | else |
602 | 0 | { |
603 | 0 | const uint8_t *ctypes = code->tables + ctypes_offset; |
604 | 0 | while (MAX_255(next) && (ctypes[next] & ctype_word) != 0) |
605 | 0 | { |
606 | 0 | name[n++] = next; |
607 | 0 | if (n > 32) goto BAD; |
608 | 0 | if (++ptr >= repend) break; |
609 | 0 | next = *ptr; |
610 | 0 | } |
611 | 0 | if (n == 0) goto BAD; |
612 | 0 | name[n] = 0; |
613 | 0 | } |
614 | | |
615 | | /* In extended mode we recognize ${name:+set text:unset text} and |
616 | | ${name:-default text}. */ |
617 | | |
618 | 0 | if (inparens) |
619 | 0 | { |
620 | 0 | if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && |
621 | 0 | !star && ptr < repend - 2 && next == CHAR_COLON) |
622 | 0 | { |
623 | 0 | special = *(++ptr); |
624 | 0 | if (special != CHAR_PLUS && special != CHAR_MINUS) |
625 | 0 | { |
626 | 0 | rc = PCRE2_ERROR_BADSUBSTITUTION; |
627 | 0 | goto PTREXIT; |
628 | 0 | } |
629 | | |
630 | 0 | text1_start = ++ptr; |
631 | 0 | rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS); |
632 | 0 | if (rc != 0) goto PTREXIT; |
633 | 0 | text1_end = ptr; |
634 | |
|
635 | 0 | if (special == CHAR_PLUS && *ptr == CHAR_COLON) |
636 | 0 | { |
637 | 0 | text2_start = ++ptr; |
638 | 0 | rc = find_text_end(code, &ptr, repend, TRUE); |
639 | 0 | if (rc != 0) goto PTREXIT; |
640 | 0 | text2_end = ptr; |
641 | 0 | } |
642 | 0 | } |
643 | | |
644 | 0 | else |
645 | 0 | { |
646 | 0 | if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET) |
647 | 0 | { |
648 | 0 | rc = PCRE2_ERROR_REPMISSINGBRACE; |
649 | 0 | goto PTREXIT; |
650 | 0 | } |
651 | 0 | } |
652 | | |
653 | 0 | ptr++; |
654 | 0 | } |
655 | | |
656 | | /* Have found a syntactically correct group number or name, or *name. |
657 | | Only *MARK is currently recognized. */ |
658 | | |
659 | 0 | if (star) |
660 | 0 | { |
661 | 0 | if (PRIV(strcmp_c8)(name, STRING_MARK) == 0) |
662 | 0 | { |
663 | 0 | PCRE2_SPTR mark = pcre2_get_mark(match_data); |
664 | 0 | if (mark != NULL) |
665 | 0 | { |
666 | 0 | PCRE2_SPTR mark_start = mark; |
667 | 0 | while (*mark != 0) mark++; |
668 | 0 | fraglength = mark - mark_start; |
669 | 0 | CHECKMEMCPY(mark_start, fraglength); |
670 | 0 | } |
671 | 0 | } |
672 | 0 | else goto BAD; |
673 | 0 | } |
674 | | |
675 | | /* Substitute the contents of a group. We don't use substring_copy |
676 | | functions any more, in order to support case forcing. */ |
677 | | |
678 | 0 | else |
679 | 0 | { |
680 | 0 | PCRE2_SPTR subptr, subptrend; |
681 | | |
682 | | /* Find a number for a named group. In case there are duplicate names, |
683 | | search for the first one that is set. If the name is not found when |
684 | | PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set, set the group number to a |
685 | | non-existent group. */ |
686 | |
|
687 | 0 | if (group < 0) |
688 | 0 | { |
689 | 0 | PCRE2_SPTR first, last, entry; |
690 | 0 | rc = pcre2_substring_nametable_scan(code, name, &first, &last); |
691 | 0 | if (rc == PCRE2_ERROR_NOSUBSTRING && |
692 | 0 | (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) |
693 | 0 | { |
694 | 0 | group = code->top_bracket + 1; |
695 | 0 | } |
696 | 0 | else |
697 | 0 | { |
698 | 0 | if (rc < 0) goto PTREXIT; |
699 | 0 | for (entry = first; entry <= last; entry += rc) |
700 | 0 | { |
701 | 0 | uint32_t ng = GET2(entry, 0); |
702 | 0 | if (ng < ovector_count) |
703 | 0 | { |
704 | 0 | if (group < 0) group = ng; /* First in ovector */ |
705 | 0 | if (ovector[ng*2] != PCRE2_UNSET) |
706 | 0 | { |
707 | 0 | group = ng; /* First that is set */ |
708 | 0 | break; |
709 | 0 | } |
710 | 0 | } |
711 | 0 | } |
712 | | |
713 | | /* If group is still negative, it means we did not find a group |
714 | | that is in the ovector. Just set the first group. */ |
715 | |
|
716 | 0 | if (group < 0) group = GET2(first, 0); |
717 | 0 | } |
718 | 0 | } |
719 | | |
720 | | /* We now have a group that is identified by number. Find the length of |
721 | | the captured string. If a group in a non-special substitution is unset |
722 | | when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */ |
723 | | |
724 | 0 | rc = pcre2_substring_length_bynumber(match_data, group, &sublength); |
725 | 0 | if (rc < 0) |
726 | 0 | { |
727 | 0 | if (rc == PCRE2_ERROR_NOSUBSTRING && |
728 | 0 | (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) |
729 | 0 | { |
730 | 0 | rc = PCRE2_ERROR_UNSET; |
731 | 0 | } |
732 | 0 | if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */ |
733 | 0 | if (special == 0) /* Plain substitution */ |
734 | 0 | { |
735 | 0 | if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue; |
736 | 0 | goto PTREXIT; /* Else error */ |
737 | 0 | } |
738 | 0 | } |
739 | | |
740 | | /* If special is '+' we have a 'set' and possibly an 'unset' text, |
741 | | both of which are reprocessed when used. If special is '-' we have a |
742 | | default text for when the group is unset; it must be reprocessed. */ |
743 | | |
744 | 0 | if (special != 0) |
745 | 0 | { |
746 | 0 | if (special == CHAR_MINUS) |
747 | 0 | { |
748 | 0 | if (rc == 0) goto LITERAL_SUBSTITUTE; |
749 | 0 | text2_start = text1_start; |
750 | 0 | text2_end = text1_end; |
751 | 0 | } |
752 | | |
753 | 0 | if (ptrstackptr >= PTR_STACK_SIZE) goto BAD; |
754 | 0 | ptrstack[ptrstackptr++] = ptr; |
755 | 0 | ptrstack[ptrstackptr++] = repend; |
756 | |
|
757 | 0 | if (rc == 0) |
758 | 0 | { |
759 | 0 | ptr = text1_start; |
760 | 0 | repend = text1_end; |
761 | 0 | } |
762 | 0 | else |
763 | 0 | { |
764 | 0 | ptr = text2_start; |
765 | 0 | repend = text2_end; |
766 | 0 | } |
767 | 0 | continue; |
768 | 0 | } |
769 | | |
770 | | /* Otherwise we have a literal substitution of a group's contents. */ |
771 | | |
772 | 0 | LITERAL_SUBSTITUTE: |
773 | 0 | subptr = subject + ovector[group*2]; |
774 | 0 | subptrend = subject + ovector[group*2 + 1]; |
775 | | |
776 | | /* Substitute a literal string, possibly forcing alphabetic case. */ |
777 | |
|
778 | 0 | while (subptr < subptrend) |
779 | 0 | { |
780 | 0 | GETCHARINCTEST(ch, subptr); |
781 | 0 | if (forcecase != 0) |
782 | 0 | { |
783 | 0 | #ifdef SUPPORT_UNICODE |
784 | 0 | if (utf || ucp) |
785 | 0 | { |
786 | 0 | uint32_t type = UCD_CHARTYPE(ch); |
787 | 0 | if (PRIV(ucp_gentype)[type] == ucp_L && |
788 | 0 | type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) |
789 | 0 | ch = UCD_OTHERCASE(ch); |
790 | 0 | } |
791 | 0 | else |
792 | 0 | #endif |
793 | 0 | { |
794 | 0 | if (((code->tables + cbits_offset + |
795 | 0 | ((forcecase > 0)? cbit_upper:cbit_lower) |
796 | 0 | )[ch/8] & (1u << (ch%8))) == 0) |
797 | 0 | ch = (code->tables + fcc_offset)[ch]; |
798 | 0 | } |
799 | 0 | forcecase = forcecasereset; |
800 | 0 | } |
801 | |
|
802 | 0 | #ifdef SUPPORT_UNICODE |
803 | 0 | if (utf) chlen = PRIV(ord2utf)(ch, temp); else |
804 | 0 | #endif |
805 | 0 | { |
806 | 0 | temp[0] = ch; |
807 | 0 | chlen = 1; |
808 | 0 | } |
809 | 0 | CHECKMEMCPY(temp, chlen); |
810 | 0 | } |
811 | 0 | } |
812 | 0 | } |
813 | | |
814 | | /* Handle an escape sequence in extended mode. We can use check_escape() |
815 | | to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but |
816 | | the case-forcing escapes are not supported in pcre2_compile() so must be |
817 | | recognized here. */ |
818 | | |
819 | 0 | else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && |
820 | 0 | *ptr == CHAR_BACKSLASH) |
821 | 0 | { |
822 | 0 | int errorcode; |
823 | |
|
824 | 0 | if (ptr < repend - 1) switch (ptr[1]) |
825 | 0 | { |
826 | 0 | case CHAR_L: |
827 | 0 | forcecase = forcecasereset = -1; |
828 | 0 | ptr += 2; |
829 | 0 | continue; |
830 | | |
831 | 0 | case CHAR_l: |
832 | 0 | forcecase = -1; |
833 | 0 | forcecasereset = 0; |
834 | 0 | ptr += 2; |
835 | 0 | continue; |
836 | | |
837 | 0 | case CHAR_U: |
838 | 0 | forcecase = forcecasereset = 1; |
839 | 0 | ptr += 2; |
840 | 0 | continue; |
841 | | |
842 | 0 | case CHAR_u: |
843 | 0 | forcecase = 1; |
844 | 0 | forcecasereset = 0; |
845 | 0 | ptr += 2; |
846 | 0 | continue; |
847 | | |
848 | 0 | default: |
849 | 0 | break; |
850 | 0 | } |
851 | | |
852 | 0 | ptr++; /* Point after \ */ |
853 | 0 | rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode, |
854 | 0 | code->overall_options, code->extra_options, FALSE, NULL); |
855 | 0 | if (errorcode != 0) goto BADESCAPE; |
856 | | |
857 | 0 | switch(rc) |
858 | 0 | { |
859 | 0 | case ESC_E: |
860 | 0 | forcecase = forcecasereset = 0; |
861 | 0 | continue; |
862 | | |
863 | 0 | case ESC_Q: |
864 | 0 | escaped_literal = TRUE; |
865 | 0 | continue; |
866 | | |
867 | 0 | case 0: /* Data character */ |
868 | 0 | goto LITERAL; |
869 | | |
870 | 0 | default: |
871 | 0 | goto BADESCAPE; |
872 | 0 | } |
873 | 0 | } |
874 | | |
875 | | /* Handle a literal code unit */ |
876 | | |
877 | 0 | else |
878 | 0 | { |
879 | 0 | LOADLITERAL: |
880 | 0 | GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */ |
881 | |
|
882 | 0 | LITERAL: |
883 | 0 | if (forcecase != 0) |
884 | 0 | { |
885 | 0 | #ifdef SUPPORT_UNICODE |
886 | 0 | if (utf || ucp) |
887 | 0 | { |
888 | 0 | uint32_t type = UCD_CHARTYPE(ch); |
889 | 0 | if (PRIV(ucp_gentype)[type] == ucp_L && |
890 | 0 | type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) |
891 | 0 | ch = UCD_OTHERCASE(ch); |
892 | 0 | } |
893 | 0 | else |
894 | 0 | #endif |
895 | 0 | { |
896 | 0 | if (((code->tables + cbits_offset + |
897 | 0 | ((forcecase > 0)? cbit_upper:cbit_lower) |
898 | 0 | )[ch/8] & (1u << (ch%8))) == 0) |
899 | 0 | ch = (code->tables + fcc_offset)[ch]; |
900 | 0 | } |
901 | 0 | forcecase = forcecasereset; |
902 | 0 | } |
903 | |
|
904 | 0 | #ifdef SUPPORT_UNICODE |
905 | 0 | if (utf) chlen = PRIV(ord2utf)(ch, temp); else |
906 | 0 | #endif |
907 | 0 | { |
908 | 0 | temp[0] = ch; |
909 | 0 | chlen = 1; |
910 | 0 | } |
911 | 0 | CHECKMEMCPY(temp, chlen); |
912 | 0 | } /* End handling a literal code unit */ |
913 | 0 | } /* End of loop for scanning the replacement. */ |
914 | | |
915 | | /* The replacement has been copied to the output, or its size has been |
916 | | remembered. Do the callout if there is one and we have done an actual |
917 | | replacement. */ |
918 | | |
919 | 0 | if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL) |
920 | 0 | { |
921 | 0 | scb.subscount = subs; |
922 | 0 | scb.output_offsets[1] = buff_offset; |
923 | 0 | rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data); |
924 | | |
925 | | /* A non-zero return means cancel this substitution. Instead, copy the |
926 | | matched string fragment. */ |
927 | |
|
928 | 0 | if (rc != 0) |
929 | 0 | { |
930 | 0 | PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0]; |
931 | 0 | PCRE2_SIZE oldlength = ovector[1] - ovector[0]; |
932 | |
|
933 | 0 | buff_offset -= newlength; |
934 | 0 | lengthleft += newlength; |
935 | 0 | if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength); |
936 | | |
937 | | /* A negative return means do not do any more. */ |
938 | |
|
939 | 0 | if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL); |
940 | 0 | } |
941 | 0 | } |
942 | | |
943 | | /* Save the details of this match. See above for how this data is used. If we |
944 | | matched an empty string, do the magic for global matches. Update the start |
945 | | offset to point to the rest of the subject string. If we re-used an existing |
946 | | match for the first match, switch to the internal match data block. */ |
947 | | |
948 | 0 | ovecsave[0] = ovector[0]; |
949 | 0 | ovecsave[1] = ovector[1]; |
950 | 0 | ovecsave[2] = start_offset; |
951 | |
|
952 | 0 | goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 : |
953 | 0 | PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART; |
954 | 0 | start_offset = ovector[1]; |
955 | 0 | } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */ |
956 | | |
957 | | /* Copy the rest of the subject unless not required, and terminate the output |
958 | | with a binary zero. */ |
959 | | |
960 | 0 | if (!replacement_only) |
961 | 0 | { |
962 | 0 | fraglength = length - start_offset; |
963 | 0 | CHECKMEMCPY(subject + start_offset, fraglength); |
964 | 0 | } |
965 | | |
966 | 0 | temp[0] = 0; |
967 | 0 | CHECKMEMCPY(temp, 1); |
968 | | |
969 | | /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, |
970 | | and matching has carried on after a full buffer, in order to compute the length |
971 | | needed. Otherwise, an overflow generates an immediate error return. */ |
972 | |
|
973 | 0 | if (overflowed) |
974 | 0 | { |
975 | 0 | rc = PCRE2_ERROR_NOMEMORY; |
976 | 0 | *blength = buff_length + extra_needed; |
977 | 0 | } |
978 | | |
979 | | /* After a successful execution, return the number of substitutions and set the |
980 | | length of buffer used, excluding the trailing zero. */ |
981 | | |
982 | 0 | else |
983 | 0 | { |
984 | 0 | rc = subs; |
985 | 0 | *blength = buff_offset - 1; |
986 | 0 | } |
987 | |
|
988 | 0 | EXIT: |
989 | 0 | if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data); |
990 | 0 | else match_data->rc = rc; |
991 | 0 | return rc; |
992 | | |
993 | 0 | NOROOM: |
994 | 0 | rc = PCRE2_ERROR_NOMEMORY; |
995 | 0 | goto EXIT; |
996 | | |
997 | 0 | BAD: |
998 | 0 | rc = PCRE2_ERROR_BADREPLACEMENT; |
999 | 0 | goto PTREXIT; |
1000 | | |
1001 | 0 | BADESCAPE: |
1002 | 0 | rc = PCRE2_ERROR_BADREPESCAPE; |
1003 | |
|
1004 | 0 | PTREXIT: |
1005 | 0 | *blength = (PCRE2_SIZE)(ptr - replacement); |
1006 | 0 | goto EXIT; |
1007 | 0 | } |
1008 | | |
1009 | | /* End of pcre2_substitute.c */ |