/src/php-src/ext/pcre/php_pcre.c
Line | Count | Source |
1 | | /* |
2 | | +----------------------------------------------------------------------+ |
3 | | | Copyright © The PHP Group and Contributors. | |
4 | | +----------------------------------------------------------------------+ |
5 | | | This source file is subject to the Modified BSD License that is | |
6 | | | bundled with this package in the file LICENSE, and is available | |
7 | | | through the World Wide Web at <https://www.php.net/license/>. | |
8 | | | | |
9 | | | SPDX-License-Identifier: BSD-3-Clause | |
10 | | +----------------------------------------------------------------------+ |
11 | | | Author: Andrei Zmievski <andrei@php.net> | |
12 | | +----------------------------------------------------------------------+ |
13 | | */ |
14 | | |
15 | | #include "php.h" |
16 | | #include "php_ini.h" |
17 | | #include "php_pcre.h" |
18 | | #include "ext/standard/info.h" |
19 | | #include "ext/standard/basic_functions.h" |
20 | | #include "zend_smart_str.h" |
21 | | #include "SAPI.h" |
22 | | |
23 | 0 | #define PREG_PATTERN_ORDER 1 |
24 | 0 | #define PREG_SET_ORDER 2 |
25 | 129 | #define PREG_OFFSET_CAPTURE (1<<8) |
26 | 129 | #define PREG_UNMATCHED_AS_NULL (1<<9) |
27 | | /* NOTE(review): the PREG_* values above and below are presumably the flag arguments of the preg_*() functions; their consumers are outside this part of the file. */ |
28 | 0 | #define PREG_SPLIT_NO_EMPTY (1<<0) |
29 | 0 | #define PREG_SPLIT_DELIM_CAPTURE (1<<1) |
30 | 0 | #define PREG_SPLIT_OFFSET_CAPTURE (1<<2) |
31 | | |
32 | 0 | #define PREG_GREP_INVERT (1<<0) |
33 | | /* PREG_JIT is an internal bit stored in pcre_cache_entry.preg_options when JIT compilation produced code for the pattern. */ |
34 | | #define PREG_JIT (1<<3) |
35 | | /* Entry count at which pcre_get_compiled_regex_cache_ex() evicts up to PCRE_CACHE_SIZE / 8 unreferenced patterns before adding another. */ |
36 | 517 | #define PCRE_CACHE_SIZE 4096 |
37 | | /* PHP_PCRE_JIT_SUPPORT: 1 when HAVE_PCRE_JIT_SUPPORT is defined, 0 otherwise. */ |
38 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
39 | | #define PHP_PCRE_JIT_SUPPORT 1 |
40 | | #else |
41 | | #define PHP_PCRE_JIT_SUPPORT 0 |
42 | | #endif |
43 | | |
44 | | char *php_pcre_version; |
45 | | |
46 | | #include "php_pcre_arginfo.h" |
47 | | |
48 | | struct _pcre_cache_entry { |
49 | | pcre2_code *re; /* compiled pattern; released with pcre2_code_free() in php_free_pcre_cache() */ |
50 | | /* Pointer is not NULL (during request) when there are named captures. |
51 | | * Length is equal to capture_count + 1 to account for capture group 0. |
52 | | * This table cache is only valid during request. |
53 | | * Trying to store this over multiple requests causes issues when the keys are exposed in user arrays |
54 | | * (see GH-17122 and GH-17132). */ |
55 | | zend_string **subpats_table; |
56 | | uint32_t preg_options; /* PHP-level flags; PREG_JIT is set when JIT code was generated */ |
57 | | uint32_t name_count; /* PCRE2_INFO_NAMECOUNT of the compiled pattern */ |
58 | | uint32_t capture_count; /* PCRE2_INFO_CAPTURECOUNT of the compiled pattern */ |
59 | | uint32_t compile_options; /* option bits that were passed to pcre2_compile() */ |
60 | | uint32_t refcount; /* pcre_clean_cache() only evicts entries whose refcount is 0 */ |
61 | | }; |
62 | | |
63 | | PHPAPI ZEND_DECLARE_MODULE_GLOBALS(pcre) |
64 | | |
65 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
66 | | #define PCRE_JIT_STACK_MIN_SIZE (32 * 1024) /* start size passed to pcre2_jit_stack_create() */ |
67 | | #define PCRE_JIT_STACK_MAX_SIZE (192 * 1024) /* maximum size passed to pcre2_jit_stack_create() */ |
68 | | ZEND_TLS pcre2_jit_stack *jit_stack = NULL; |
69 | | #endif |
70 | | /* General context using (infallible) system allocator. */ |
71 | | ZEND_TLS pcre2_general_context *gctx = NULL; |
72 | | /* The compile and match contexts are global per thread for now. They could |
73 | | instead be per pattern: either copied into each pce, or created for every |
74 | | pce with no global contexts at all. */ |
75 | | ZEND_TLS pcre2_compile_context *cctx = NULL; |
76 | | ZEND_TLS pcre2_match_context *mctx = NULL; |
77 | | ZEND_TLS pcre2_match_data *mdata = NULL; /* preallocated with PHP_PCRE_PREALLOC_MDATA_SIZE; handed out by php_pcre_create_match_data() */ |
78 | | ZEND_TLS bool mdata_used = 0; /* set while mdata is handed out */ |
79 | | ZEND_TLS uint8_t pcre2_init_ok = 0; /* 1 once php_pcre_init_pcre2() has created every object it needs */ |
80 | | #if defined(ZTS) && defined(HAVE_PCRE_JIT_SUPPORT) /* the mutex helpers expand to nothing unless both are defined */ |
81 | | static MUTEX_T pcre_mt = NULL; |
82 | | #define php_pcre_mutex_alloc() \ |
83 | | if (tsrm_is_main_thread() && !pcre_mt) pcre_mt = tsrm_mutex_alloc(); |
84 | | #define php_pcre_mutex_free() \ |
85 | | if (tsrm_is_main_thread() && pcre_mt) { tsrm_mutex_free(pcre_mt); pcre_mt = NULL; } |
86 | | #define php_pcre_mutex_lock() tsrm_mutex_lock(pcre_mt); |
87 | | #define php_pcre_mutex_unlock() tsrm_mutex_unlock(pcre_mt); |
88 | | #else |
89 | | #define php_pcre_mutex_alloc() |
90 | | #define php_pcre_mutex_free() |
91 | | #define php_pcre_mutex_lock() |
92 | | #define php_pcre_mutex_unlock() |
93 | | #endif |
94 | | /* Character tables keyed by BG(ctype_string); values come from pcre2_maketables() and are freed by php_pcre_free_char_table(). */ |
95 | | ZEND_TLS HashTable char_tables; |
96 | | |
97 | | static void free_subpats_table(zend_string **subpat_names, uint32_t num_subpats); |
98 | | |
99 | | static void php_pcre_free_char_table(zval *data) |
100 | 0 | {/*{{{*/ |
101 | 0 | void *ptr = Z_PTR_P(data); |
102 | 0 | pefree(ptr, 1); |
103 | 0 | }/*}}}*/ |
104 | | |
105 | | static void pcre_handle_exec_error(int pcre_code) /* {{{ */ |
106 | 796 | { |
107 | 796 | int preg_code = 0; |
108 | | |
109 | 796 | switch (pcre_code) { |
110 | 0 | case PCRE2_ERROR_MATCHLIMIT: |
111 | 0 | preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR; |
112 | 0 | break; |
113 | | |
114 | 0 | case PCRE2_ERROR_RECURSIONLIMIT: |
115 | 0 | preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR; |
116 | 0 | break; |
117 | | |
118 | 0 | case PCRE2_ERROR_BADUTFOFFSET: |
119 | 0 | preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR; |
120 | 0 | break; |
121 | | |
122 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
123 | | case PCRE2_ERROR_JIT_STACKLIMIT: |
124 | | preg_code = PHP_PCRE_JIT_STACKLIMIT_ERROR; |
125 | | break; |
126 | | #endif |
127 | | |
128 | 796 | default: |
129 | 796 | if (pcre_code <= PCRE2_ERROR_UTF8_ERR1 && pcre_code >= PCRE2_ERROR_UTF8_ERR21) { |
130 | 99 | preg_code = PHP_PCRE_BAD_UTF8_ERROR; |
131 | 697 | } else { |
132 | 697 | preg_code = PHP_PCRE_INTERNAL_ERROR; |
133 | 697 | } |
134 | 796 | break; |
135 | 796 | } |
136 | | |
137 | 796 | PCRE_G(error_code) = preg_code; |
138 | 796 | } |
139 | | /* }}} */ |
140 | | |
141 | | static const char *php_pcre_get_error_msg(php_pcre_error_code error_code) /* {{{ */ |
142 | 0 | { |
143 | 0 | switch (error_code) { |
144 | 0 | case PHP_PCRE_NO_ERROR: |
145 | 0 | return "No error"; |
146 | 0 | case PHP_PCRE_INTERNAL_ERROR: |
147 | 0 | return "Internal error"; |
148 | 0 | case PHP_PCRE_BAD_UTF8_ERROR: |
149 | 0 | return "Malformed UTF-8 characters, possibly incorrectly encoded"; |
150 | 0 | case PHP_PCRE_BAD_UTF8_OFFSET_ERROR: |
151 | 0 | return "The offset did not correspond to the beginning of a valid UTF-8 code point"; |
152 | 0 | case PHP_PCRE_BACKTRACK_LIMIT_ERROR: |
153 | 0 | return "Backtrack limit exhausted"; |
154 | 0 | case PHP_PCRE_RECURSION_LIMIT_ERROR: |
155 | 0 | return "Recursion limit exhausted"; |
156 | | |
157 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
158 | | case PHP_PCRE_JIT_STACKLIMIT_ERROR: |
159 | | return "JIT stack limit exhausted"; |
160 | | #endif |
161 | | |
162 | 0 | default: |
163 | 0 | return "Unknown error"; |
164 | 0 | } |
165 | 0 | } |
166 | | /* }}} */ |
167 | | |
168 | | static void php_free_pcre_cache(zval *data) /* {{{ */ |
169 | 0 | { |
170 | 0 | pcre_cache_entry *pce = (pcre_cache_entry *) Z_PTR_P(data); |
171 | 0 | if (!pce) return; |
172 | 0 | if (pce->subpats_table) { |
173 | 0 | free_subpats_table(pce->subpats_table, pce->capture_count + 1); |
174 | 0 | } |
175 | 0 | pcre2_code_free(pce->re); |
176 | 0 | free(pce); |
177 | 0 | } |
178 | | /* }}} */ |
179 | | |
180 | | static void *php_pcre_malloc(PCRE2_SIZE size, void *data) |
181 | 602 | { |
182 | 602 | return pemalloc(size, 1); |
183 | 602 | } |
184 | | |
/* PCRE2 release callback paired with php_pcre_malloc(); `data` is not used. */
static void php_pcre_free(void *block, void *data)
{
	pefree(block, /* persistent */ 1);
}
189 | | |
190 | | static void *php_pcre_emalloc(PCRE2_SIZE size, void *data) |
191 | 44.6k | { |
192 | 44.6k | return emalloc(size); |
193 | 44.6k | } |
194 | | |
/* PCRE2 release callback paired with php_pcre_emalloc(); `data` is not used. */
static void php_pcre_efree(void *block, void *data)
{
	efree(block);
	return;
}
199 | | |
200 | 1.97k | #define PHP_PCRE_PREALLOC_MDATA_SIZE 32 |
201 | | |
202 | | static void php_pcre_init_pcre2(uint8_t jit) |
203 | 2 | {/*{{{*/ |
204 | 2 | if (!gctx) { |
205 | 2 | gctx = pcre2_general_context_create(php_pcre_malloc, php_pcre_free, NULL); |
206 | 2 | if (!gctx) { |
207 | 0 | pcre2_init_ok = 0; |
208 | 0 | return; |
209 | 0 | } |
210 | 2 | } |
211 | | |
212 | 2 | if (!cctx) { |
213 | 2 | cctx = pcre2_compile_context_create(gctx); |
214 | 2 | if (!cctx) { |
215 | 0 | pcre2_init_ok = 0; |
216 | 0 | return; |
217 | 0 | } |
218 | 2 | } |
219 | | |
220 | 2 | if (!mctx) { |
221 | 2 | mctx = pcre2_match_context_create(gctx); |
222 | 2 | if (!mctx) { |
223 | 0 | pcre2_init_ok = 0; |
224 | 0 | return; |
225 | 0 | } |
226 | 2 | } |
227 | | |
228 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
229 | | if (jit && !jit_stack) { |
230 | | jit_stack = pcre2_jit_stack_create(PCRE_JIT_STACK_MIN_SIZE, PCRE_JIT_STACK_MAX_SIZE, gctx); |
231 | | if (!jit_stack) { |
232 | | pcre2_init_ok = 0; |
233 | | return; |
234 | | } |
235 | | } |
236 | | #endif |
237 | | |
238 | 2 | if (!mdata) { |
239 | 2 | mdata = pcre2_match_data_create(PHP_PCRE_PREALLOC_MDATA_SIZE, gctx); |
240 | 2 | if (!mdata) { |
241 | 0 | pcre2_init_ok = 0; |
242 | 0 | return; |
243 | 0 | } |
244 | 2 | } |
245 | | |
246 | 2 | pcre2_init_ok = 1; |
247 | 2 | }/*}}}*/ |
248 | | |
249 | | static void php_pcre_shutdown_pcre2(void) |
250 | 0 | {/*{{{*/ |
251 | 0 | if (gctx) { |
252 | 0 | pcre2_general_context_free(gctx); |
253 | 0 | gctx = NULL; |
254 | 0 | } |
255 | |
|
256 | 0 | if (cctx) { |
257 | 0 | pcre2_compile_context_free(cctx); |
258 | 0 | cctx = NULL; |
259 | 0 | } |
260 | |
|
261 | 0 | if (mctx) { |
262 | 0 | pcre2_match_context_free(mctx); |
263 | 0 | mctx = NULL; |
264 | 0 | } |
265 | |
|
266 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
267 | | /* Stack may only be destroyed when no cached patterns |
268 | | possibly associated with it do exist. */ |
269 | | if (jit_stack) { |
270 | | pcre2_jit_stack_free(jit_stack); |
271 | | jit_stack = NULL; |
272 | | } |
273 | | #endif |
274 | |
|
275 | 0 | if (mdata) { |
276 | 0 | pcre2_match_data_free(mdata); |
277 | 0 | mdata = NULL; |
278 | 0 | } |
279 | |
|
280 | 0 | pcre2_init_ok = 0; |
281 | 0 | }/*}}}*/ |
282 | | |
283 | | static PHP_GINIT_FUNCTION(pcre) /* {{{ */ |
284 | 2 | { |
285 | 2 | php_pcre_mutex_alloc(); |
286 | | |
287 | 2 | zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1); |
288 | | |
289 | 2 | pcre_globals->backtrack_limit = 0; |
290 | 2 | pcre_globals->recursion_limit = 0; |
291 | 2 | pcre_globals->error_code = PHP_PCRE_NO_ERROR; |
292 | 2 | ZVAL_UNDEF(&pcre_globals->unmatched_null_pair); |
293 | 2 | ZVAL_UNDEF(&pcre_globals->unmatched_empty_pair); |
294 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
295 | | pcre_globals->jit = 1; |
296 | | #endif |
297 | | |
298 | 2 | php_pcre_init_pcre2(1); |
299 | 2 | zend_hash_init(&char_tables, 1, NULL, php_pcre_free_char_table, 1); |
300 | 2 | } |
301 | | /* }}} */ |
302 | | |
303 | | static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */ |
304 | 0 | { |
305 | 0 | zend_hash_destroy(&pcre_globals->pcre_cache); |
306 | |
|
307 | 0 | php_pcre_shutdown_pcre2(); |
308 | 0 | zend_hash_destroy(&char_tables); |
309 | 0 | php_pcre_mutex_free(); |
310 | 0 | } |
311 | | /* }}} */ |
312 | | |
313 | | static PHP_INI_MH(OnUpdateBacktrackLimit) |
314 | 2 | {/*{{{*/ |
315 | 2 | OnUpdateLong(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage); |
316 | 2 | if (mctx) { |
317 | 2 | pcre2_set_match_limit(mctx, (uint32_t)PCRE_G(backtrack_limit)); |
318 | 2 | } |
319 | | |
320 | 2 | return SUCCESS; |
321 | 2 | }/*}}}*/ |
322 | | |
323 | | static PHP_INI_MH(OnUpdateRecursionLimit) |
324 | 2 | {/*{{{*/ |
325 | 2 | OnUpdateLong(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage); |
326 | 2 | if (mctx) { |
327 | 2 | pcre2_set_depth_limit(mctx, (uint32_t)PCRE_G(recursion_limit)); |
328 | 2 | } |
329 | | |
330 | 2 | return SUCCESS; |
331 | 2 | }/*}}}*/ |
332 | | |
333 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
334 | | static PHP_INI_MH(OnUpdateJit) |
335 | | {/*{{{*/ |
336 | | OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage); |
337 | | if (PCRE_G(jit) && jit_stack) { |
338 | | pcre2_jit_stack_assign(mctx, NULL, jit_stack); |
339 | | } else { |
340 | | pcre2_jit_stack_assign(mctx, NULL, NULL); |
341 | | } |
342 | | |
343 | | return SUCCESS; |
344 | | }/*}}}*/ |
345 | | #endif |
346 | | |
347 | | PHP_INI_BEGIN() /* each entry: directive name, default value, access level, change handler, bound field of zend_pcre_globals */ |
348 | | STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateBacktrackLimit, backtrack_limit, zend_pcre_globals, pcre_globals) |
349 | | STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000", PHP_INI_ALL, OnUpdateRecursionLimit, recursion_limit, zend_pcre_globals, pcre_globals) |
350 | | #ifdef HAVE_PCRE_JIT_SUPPORT /* pcre.jit is only registered when JIT support is compiled in */ |
351 | | STD_PHP_INI_BOOLEAN("pcre.jit", "1", PHP_INI_ALL, OnUpdateJit, jit, zend_pcre_globals, pcre_globals) |
352 | | #endif |
353 | | PHP_INI_END() |
354 | | |
/* Fetches a string-valued pcre2_config() item into a freshly malloc()ed,
 * NUL-terminated buffer.
 *
 * what: one of the string-valued PCRE2_CONFIG_* selectors.
 *
 * Returns the buffer, which the caller must free(), or NULL when the item is
 * unavailable (pcre2_config() reports an error or an empty result) or the
 * allocation fails. */
static char *_pcre2_config_str(uint32_t what)
{/*{{{*/
	char *ret;
	int len = pcre2_config(what, NULL);

	/* A negative value is a PCRE2 error code, not a length; it must never
	 * reach the size computation below. */
	if (len < 0) {
		return NULL;
	}

	ret = malloc((size_t)len + 1);
	if (ret == NULL) {
		return NULL;
	}

	len = pcre2_config(what, ret);
	if (len <= 0) {
		free(ret);
		return NULL;
	}

	return ret;
}/*}}}*/
368 | | |
369 | | /* {{{ PHP_MINFO_FUNCTION(pcre) */ |
370 | | static PHP_MINFO_FUNCTION(pcre) |
371 | 3 | { |
372 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
373 | | uint32_t flag = 0; |
374 | | char *jit_target = _pcre2_config_str(PCRE2_CONFIG_JITTARGET); |
375 | | #endif |
376 | 3 | char *version = _pcre2_config_str(PCRE2_CONFIG_VERSION); |
377 | 3 | char *unicode = _pcre2_config_str(PCRE2_CONFIG_UNICODE_VERSION); |
378 | | |
379 | 3 | php_info_print_table_start(); |
380 | 3 | php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled" ); |
381 | 3 | php_info_print_table_row(2, "PCRE Library Version", version); |
382 | 3 | free(version); |
383 | 3 | php_info_print_table_row(2, "PCRE Unicode Version", unicode); |
384 | 3 | free(unicode); |
385 | | |
386 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
387 | | if (!pcre2_config(PCRE2_CONFIG_JIT, &flag)) { |
388 | | php_info_print_table_row(2, "PCRE JIT Support", flag ? "enabled" : "disabled"); |
389 | | } else { |
390 | | php_info_print_table_row(2, "PCRE JIT Support", "unknown" ); |
391 | | } |
392 | | if (jit_target) { |
393 | | php_info_print_table_row(2, "PCRE JIT Target", jit_target); |
394 | | } |
395 | | free(jit_target); |
396 | | #else |
397 | 3 | php_info_print_table_row(2, "PCRE JIT Support", "not compiled in" ); |
398 | 3 | #endif |
399 | | |
400 | | #ifdef HAVE_PCRE_VALGRIND_SUPPORT |
401 | | php_info_print_table_row(2, "PCRE Valgrind Support", "enabled" ); |
402 | | #endif |
403 | | |
404 | 3 | php_info_print_table_end(); |
405 | | |
406 | 3 | DISPLAY_INI_ENTRIES(); |
407 | 3 | } |
408 | | /* }}} */ |
409 | | |
410 | | /* {{{ PHP_MINIT_FUNCTION(pcre) */ |
411 | | static PHP_MINIT_FUNCTION(pcre) |
412 | 2 | { |
413 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
414 | | if (UNEXPECTED(!pcre2_init_ok)) { |
415 | | /* Retry. */ |
416 | | php_pcre_init_pcre2(PCRE_G(jit)); |
417 | | if (!pcre2_init_ok) { |
418 | | return FAILURE; |
419 | | } |
420 | | } |
421 | | #endif |
422 | | |
423 | 2 | REGISTER_INI_ENTRIES(); |
424 | | |
425 | 2 | php_pcre_version = _pcre2_config_str(PCRE2_CONFIG_VERSION); |
426 | | |
427 | 2 | register_php_pcre_symbols(module_number); |
428 | | |
429 | 2 | return SUCCESS; |
430 | 2 | } |
431 | | /* }}} */ |
432 | | |
433 | | /* {{{ PHP_MSHUTDOWN_FUNCTION(pcre) */ |
434 | | static PHP_MSHUTDOWN_FUNCTION(pcre) |
435 | 0 | { |
436 | 0 | UNREGISTER_INI_ENTRIES(); |
437 | |
|
438 | 0 | free(php_pcre_version); |
439 | |
|
440 | 0 | return SUCCESS; |
441 | 0 | } |
442 | | /* }}} */ |
443 | | |
444 | | /* {{{ PHP_RINIT_FUNCTION(pcre) */ |
445 | | static PHP_RINIT_FUNCTION(pcre) |
446 | 44.4k | { |
447 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
448 | | if (UNEXPECTED(!pcre2_init_ok)) { |
449 | | /* Retry. */ |
450 | | php_pcre_mutex_lock(); |
451 | | php_pcre_init_pcre2(PCRE_G(jit)); |
452 | | if (!pcre2_init_ok) { |
453 | | php_pcre_mutex_unlock(); |
454 | | return FAILURE; |
455 | | } |
456 | | php_pcre_mutex_unlock(); |
457 | | } |
458 | | |
459 | | mdata_used = 0; |
460 | | #endif |
461 | | |
462 | 44.4k | PCRE_G(error_code) = PHP_PCRE_NO_ERROR; |
463 | 44.4k | PCRE_G(gctx_zmm) = pcre2_general_context_create(php_pcre_emalloc, php_pcre_efree, NULL); |
464 | 44.4k | if (!PCRE_G(gctx_zmm)) { |
465 | 0 | return FAILURE; |
466 | 0 | } |
467 | | |
468 | 44.4k | return SUCCESS; |
469 | 44.4k | } |
470 | | /* }}} */ |
471 | | |
472 | | static PHP_RSHUTDOWN_FUNCTION(pcre) |
473 | 44.4k | { |
474 | 44.4k | pcre_cache_entry *pce; |
475 | 23.0M | ZEND_HASH_MAP_FOREACH_PTR(&PCRE_G(pcre_cache), pce) { |
476 | 23.0M | if (pce->subpats_table) { |
477 | 0 | free_subpats_table(pce->subpats_table, pce->capture_count + 1); |
478 | 0 | pce->subpats_table = NULL; |
479 | 0 | } |
480 | 23.0M | } ZEND_HASH_FOREACH_END(); |
481 | | |
482 | 44.4k | pcre2_general_context_free(PCRE_G(gctx_zmm)); |
483 | 44.4k | PCRE_G(gctx_zmm) = NULL; |
484 | | |
485 | 44.4k | zval_ptr_dtor(&PCRE_G(unmatched_null_pair)); |
486 | 44.4k | zval_ptr_dtor(&PCRE_G(unmatched_empty_pair)); |
487 | 44.4k | ZVAL_UNDEF(&PCRE_G(unmatched_null_pair)); |
488 | 44.4k | ZVAL_UNDEF(&PCRE_G(unmatched_empty_pair)); |
489 | 44.4k | return SUCCESS; |
490 | 44.4k | } |
491 | | |
492 | | /* {{{ static pcre_clean_cache */ |
493 | | static int pcre_clean_cache(zval *data, void *arg) |
494 | 0 | { |
495 | 0 | pcre_cache_entry *pce = (pcre_cache_entry *) Z_PTR_P(data); |
496 | 0 | int *num_clean = (int *)arg; |
497 | |
|
498 | 0 | if (!pce->refcount) { |
499 | 0 | if (--(*num_clean) == 0) { |
500 | 0 | return ZEND_HASH_APPLY_REMOVE|ZEND_HASH_APPLY_STOP; |
501 | 0 | } |
502 | 0 | return ZEND_HASH_APPLY_REMOVE; |
503 | 0 | } else { |
504 | 0 | return ZEND_HASH_APPLY_KEEP; |
505 | 0 | } |
506 | 0 | } |
507 | | /* }}} */ |
508 | | |
509 | 0 | static void free_subpats_table(zend_string **subpat_names, uint32_t num_subpats) { |
510 | 0 | uint32_t i; |
511 | 0 | for (i = 0; i < num_subpats; i++) { |
512 | 0 | if (subpat_names[i]) { |
513 | 0 | zend_string_release_ex(subpat_names[i], false); |
514 | 0 | } |
515 | 0 | } |
516 | 0 | efree(subpat_names); |
517 | 0 | } |
518 | | |
519 | | /* {{{ static make_subpats_table */ |
520 | | static zend_string **make_subpats_table(uint32_t name_cnt, pcre_cache_entry *pce) |
521 | 0 | { |
522 | 0 | uint32_t num_subpats = pce->capture_count + 1; |
523 | 0 | uint32_t name_size, ni = 0; |
524 | 0 | char *name_table; |
525 | 0 | zend_string **subpat_names; |
526 | 0 | int rc1, rc2; |
527 | |
|
528 | 0 | rc1 = pcre2_pattern_info(pce->re, PCRE2_INFO_NAMETABLE, &name_table); |
529 | 0 | rc2 = pcre2_pattern_info(pce->re, PCRE2_INFO_NAMEENTRYSIZE, &name_size); |
530 | 0 | if (rc1 < 0 || rc2 < 0) { |
531 | 0 | php_error_docref(NULL, E_WARNING, "Internal pcre2_pattern_info() error %d", rc1 < 0 ? rc1 : rc2); |
532 | 0 | return NULL; |
533 | 0 | } |
534 | | |
535 | 0 | subpat_names = ecalloc(num_subpats, sizeof(zend_string *)); |
536 | 0 | while (ni++ < name_cnt) { |
537 | 0 | unsigned short name_idx = 0x100 * (unsigned char)name_table[0] + (unsigned char)name_table[1]; |
538 | 0 | const char *name = name_table + 2; |
539 | 0 | subpat_names[name_idx] = zend_string_init(name, strlen(name), false); |
540 | 0 | name_table += name_size; |
541 | 0 | } |
542 | 0 | return subpat_names; |
543 | 0 | } |
544 | | /* }}} */ |
545 | | |
546 | | static zend_string **ensure_subpats_table(uint32_t name_cnt, pcre_cache_entry *pce) |
547 | 0 | { |
548 | 0 | if (!pce->subpats_table) { |
549 | 0 | pce->subpats_table = make_subpats_table(name_cnt, pce); |
550 | 0 | } |
551 | 0 | return pce->subpats_table; |
552 | 0 | } |
553 | | |
554 | | /* {{{ static calculate_unit_length */ |
555 | | /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE2_UTF. */ |
556 | | static zend_always_inline size_t calculate_unit_length(pcre_cache_entry *pce, const char *start) |
557 | 57 | { |
558 | 57 | size_t unit_len; |
559 | | |
560 | 57 | if (pce->compile_options & PCRE2_UTF) { |
561 | 18 | const char *end = start; |
562 | | |
563 | | /* skip continuation bytes */ |
564 | 18 | while ((*++end & 0xC0) == 0x80); |
565 | 18 | unit_len = end - start; |
566 | 39 | } else { |
567 | 39 | unit_len = 1; |
568 | 39 | } |
569 | 57 | return unit_len; |
570 | 57 | } |
571 | | /* }}} */ |
572 | | |
573 | | /* {{{ pcre_get_compiled_regex_cache_ex: returns the cache entry for `regex` (delimiters and modifiers included), compiling and caching the pattern on a miss. Every NULL return is preceded by an E_WARNING and a call to pcre_handle_exec_error(). */ |
574 | | PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bool locale_aware) |
575 | 2.75k | { |
576 | 2.75k | pcre2_code *re = NULL; |
577 | | #if 10 == PCRE2_MAJOR && 37 == PCRE2_MINOR && !defined(HAVE_BUNDLED_PCRE) |
578 | | uint32_t coptions = PCRE2_NO_START_OPTIMIZE; |
579 | | #else |
580 | 2.75k | uint32_t coptions = 0; |
581 | 2.75k | #endif |
582 | 2.75k | uint32_t eoptions = 0; |
583 | 2.75k | PCRE2_UCHAR error[128]; |
584 | 2.75k | PCRE2_SIZE erroffset; |
585 | 2.75k | int errnumber; |
586 | 2.75k | char delimiter; |
587 | 2.75k | char start_delimiter; |
588 | 2.75k | char end_delimiter; |
589 | 2.75k | char *p, *pp; |
590 | 2.75k | char *pattern; |
591 | 2.75k | size_t pattern_len; |
592 | 2.75k | uint32_t poptions = 0; |
593 | 2.75k | const uint8_t *tables = NULL; |
594 | 2.75k | zval *zv; |
595 | 2.75k | pcre_cache_entry new_entry; |
596 | 2.75k | int rc; |
597 | 2.75k | zend_string *key; |
598 | 2.75k | pcre_cache_entry *ret; |
599 | | /* With a ctype locale set, the cache key is the locale name followed by the regex; from here on `key != regex` means this function owns a temporary key string that must be released on every exit path. */ |
600 | 2.75k | if (locale_aware && BG(ctype_string)) { |
601 | 0 | key = zend_string_concat2( |
602 | 0 | ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string)), |
603 | 0 | ZSTR_VAL(regex), ZSTR_LEN(regex)); |
604 | 2.75k | } else { |
605 | 2.75k | key = regex; |
606 | 2.75k | } |
607 | | |
608 | | /* Try to lookup the cached regex entry, and if successful, just pass |
609 | | back the compiled pattern, otherwise go on and compile it. */ |
610 | 2.75k | zv = zend_hash_find(&PCRE_G(pcre_cache), key); |
611 | 2.75k | if (zv) { |
612 | 1.55k | if (key != regex) { |
613 | 0 | zend_string_release_ex(key, 0); |
614 | 0 | } |
615 | 1.55k | return (pcre_cache_entry*)Z_PTR_P(zv); |
616 | 1.55k | } |
617 | | |
618 | 1.19k | p = ZSTR_VAL(regex); |
619 | 1.19k | const char* end_p = ZSTR_VAL(regex) + ZSTR_LEN(regex); |
620 | | |
621 | | /* Parse through the leading whitespace, and display a warning if we |
622 | | get to the end without encountering a delimiter. */ |
623 | 1.19k | while (isspace((unsigned char)*p)) p++; |
624 | 1.19k | if (p >= end_p) { |
625 | 3 | if (key != regex) { |
626 | 0 | zend_string_release_ex(key, 0); |
627 | 0 | } |
628 | 3 | php_error_docref(NULL, E_WARNING, "Empty regular expression"); |
629 | 3 | pcre_handle_exec_error(PCRE2_ERROR_INTERNAL); |
630 | 3 | return NULL; |
631 | 3 | } |
632 | | |
633 | | /* Get the delimiter and display a warning if it is alphanumeric |
634 | | or a backslash. */ |
635 | 1.19k | delimiter = *p++; |
636 | 1.19k | if (isalnum((unsigned char)delimiter) || delimiter == '\\' || delimiter == '\0') { |
637 | 9 | if (key != regex) { |
638 | 0 | zend_string_release_ex(key, 0); |
639 | 0 | } |
640 | 9 | php_error_docref(NULL, E_WARNING, "Delimiter must not be alphanumeric, backslash, or NUL byte"); |
641 | 9 | pcre_handle_exec_error(PCRE2_ERROR_INTERNAL); |
642 | 9 | return NULL; |
643 | 9 | } |
644 | | /* In the lookup string below, each opening bracket maps to its closing counterpart five characters further on, and a closing bracket (or a space) maps to itself. */ |
645 | 1.18k | start_delimiter = delimiter; |
646 | 1.18k | if ((pp = strchr("([{< )]}> )]}>", delimiter))) |
647 | 48 | delimiter = pp[5]; |
648 | 1.18k | end_delimiter = delimiter; |
649 | | |
650 | 1.18k | pp = p; |
651 | | |
652 | 1.18k | if (start_delimiter == end_delimiter) { |
653 | | /* We need to iterate through the pattern, searching for the ending delimiter, |
654 | | but skipping the backslashed delimiters. If the ending delimiter is not |
655 | | found, display a warning. */ |
656 | 454k | while (pp < end_p) { |
657 | 454k | if (*pp == '\\' && pp + 1 < end_p) pp++; |
658 | 439k | else if (*pp == delimiter) |
659 | 1.13k | break; |
660 | 453k | pp++; |
661 | 453k | } |
662 | 1.14k | } else { |
663 | | /* We iterate through the pattern, searching for the matching ending |
664 | | * delimiter. For each matching starting delimiter, we increment nesting |
665 | | * level, and decrement it for each matching ending delimiter. If we |
666 | | * reach the end of the pattern without matching, display a warning. |
667 | | */ |
668 | 39 | int brackets = 1; /* brackets nesting level */ |
669 | 25.4k | while (pp < end_p) { |
670 | 25.3k | if (*pp == '\\' && pp + 1 < end_p) pp++; |
671 | 25.0k | else if (*pp == end_delimiter && --brackets <= 0) |
672 | 0 | break; |
673 | 25.0k | else if (*pp == start_delimiter) |
674 | 1.36k | brackets++; |
675 | 25.3k | pp++; |
676 | 25.3k | } |
677 | 39 | } |
678 | | /* pp reaching end_p means neither loop above found the closing delimiter. */ |
679 | 1.18k | if (pp >= end_p) { |
680 | 51 | if (key != regex) { |
681 | 0 | zend_string_release_ex(key, 0); |
682 | 0 | } |
683 | 51 | if (start_delimiter == end_delimiter) { |
684 | 12 | php_error_docref(NULL,E_WARNING, "No ending delimiter '%c' found", delimiter); |
685 | 39 | } else { |
686 | 39 | php_error_docref(NULL,E_WARNING, "No ending matching delimiter '%c' found", delimiter); |
687 | 39 | } |
688 | 51 | pcre_handle_exec_error(PCRE2_ERROR_INTERNAL); |
689 | 51 | return NULL; |
690 | 51 | } |
691 | | |
692 | | /* Make a copy of the actual pattern. */ |
693 | 1.13k | pattern_len = pp - p; |
694 | 1.13k | pattern = estrndup(p, pattern_len); |
695 | | |
696 | | /* Move on to the options */ |
697 | 1.13k | pp++; |
698 | | |
699 | | /* Parse through the options, setting appropriate flags. Display |
700 | | a warning if we encounter an unknown modifier. */ |
701 | 2.17k | while (pp < end_p) { |
702 | 1.07k | switch (*pp++) { |
703 | | /* Perl compatible options */ |
704 | 561 | case 'i': coptions |= PCRE2_CASELESS; break; |
705 | 34 | case 'm': coptions |= PCRE2_MULTILINE; break; |
706 | 8 | case 'n': coptions |= PCRE2_NO_AUTO_CAPTURE; break; |
707 | 1 | case 's': coptions |= PCRE2_DOTALL; break; |
708 | 12 | case 'x': coptions |= PCRE2_EXTENDED; break; |
709 | | |
710 | | /* PCRE specific options */ |
711 | 11 | case 'A': coptions |= PCRE2_ANCHORED; break; |
712 | 0 | case 'D': coptions |= PCRE2_DOLLAR_ENDONLY;break; |
713 | 0 | #ifdef PCRE2_EXTRA_CASELESS_RESTRICT |
714 | 0 | case 'r': eoptions |= PCRE2_EXTRA_CASELESS_RESTRICT; break; |
715 | 0 | #endif |
716 | 0 | case 'S': /* Pass. */ break; |
717 | 0 | case 'X': /* Pass. */ break; |
718 | 156 | case 'U': coptions |= PCRE2_UNGREEDY; break; |
719 | 239 | case 'u': coptions |= PCRE2_UTF; |
720 | | /* In PCRE, by default, \d, \D, \s, \S, \w, and \W recognize only ASCII |
721 | | characters, even in UTF-8 mode. However, this can be changed by setting |
722 | | the PCRE2_UCP option. */ |
723 | 239 | #ifdef PCRE2_UCP |
724 | 239 | coptions |= PCRE2_UCP; |
725 | 239 | #endif |
726 | 239 | break; |
727 | 0 | case 'J': coptions |= PCRE2_DUPNAMES; break; |
728 | | /* Spaces and line breaks between or after modifiers are ignored. */ |
729 | 3 | case ' ': |
730 | 3 | case '\n': |
731 | 18 | case '\r': |
732 | 18 | break; |
733 | | |
734 | 0 | case 'e': /* legacy eval */ |
735 | 39 | default: |
736 | 39 | if (pp[-1]) { |
737 | 18 | php_error_docref(NULL, E_WARNING, "Unknown modifier '%c'", pp[-1]); |
738 | 21 | } else { |
739 | 21 | php_error_docref(NULL, E_WARNING, "NUL byte is not a valid modifier"); |
740 | 21 | } |
741 | 39 | pcre_handle_exec_error(PCRE2_ERROR_INTERNAL); |
742 | 39 | efree(pattern); |
743 | 39 | if (key != regex) { |
744 | 0 | zend_string_release_ex(key, 0); |
745 | 0 | } |
746 | 39 | return NULL; |
747 | 1.07k | } |
748 | 1.07k | } |
749 | | /* A separate key string implies a ctype locale is active: fetch, or build and store, the character tables for BG(ctype_string). */ |
750 | 1.09k | if (key != regex) { |
751 | 0 | zv = zend_hash_str_lookup(&char_tables, ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string))); |
752 | 0 | if (Z_ISNULL_P(zv)) { |
753 | 0 | tables = pcre2_maketables(gctx); |
754 | 0 | if (UNEXPECTED(!tables)) { |
755 | | /* Remove the placeholder entry created by zend_hash_str_lookup(), |
756 | | * set ptr to NULL first so the destructor (pefree) is safe. */ |
757 | 0 | ZVAL_PTR(zv, NULL); |
758 | 0 | zend_hash_str_del(&char_tables, ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string))); |
759 | 0 | php_error_docref(NULL,E_WARNING, "Failed to generate locale character tables"); |
760 | 0 | pcre_handle_exec_error(PCRE2_ERROR_NOMEMORY); |
761 | 0 | zend_string_release_ex(key, 0); |
762 | 0 | efree(pattern); |
763 | 0 | return NULL; |
764 | 0 | } |
765 | 0 | ZVAL_PTR(zv, (void *)tables); |
766 | 0 | } else { |
767 | 0 | tables = Z_PTR_P(zv); |
768 | 0 | } |
769 | 0 | } |
770 | 1.09k | pcre2_set_character_tables(cctx, tables); |
771 | | /* cctx is a per-thread global, so the tables (NULL unless set above) and the extra options are written to it before every compile. */ |
772 | 1.09k | pcre2_set_compile_extra_options(cctx, eoptions); |
773 | | |
774 | | /* Compile pattern and display a warning if compilation failed. */ |
775 | 1.09k | re = pcre2_compile((PCRE2_SPTR)pattern, pattern_len, coptions, &errnumber, &erroffset, cctx); |
776 | | |
777 | 1.09k | if (re == NULL) { |
778 | 580 | if (key != regex) { |
779 | 0 | zend_string_release_ex(key, 0); |
780 | 0 | } |
781 | 580 | pcre2_get_error_message(errnumber, error, sizeof(error)); |
782 | 580 | php_error_docref(NULL,E_WARNING, "Compilation failed: %s at offset %zu", error, erroffset); |
783 | 580 | pcre_handle_exec_error(PCRE2_ERROR_INTERNAL); |
784 | 580 | efree(pattern); |
785 | 580 | return NULL; |
786 | 580 | } |
787 | | /* A JIT failure below only raises a warning; the pattern is still cached and returned, just without PREG_JIT set. */ |
788 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
789 | | if (PCRE_G(jit)) { |
790 | | /* Enable PCRE JIT compiler */ |
791 | | rc = pcre2_jit_compile(re, PCRE2_JIT_COMPLETE); |
792 | | if (EXPECTED(rc >= 0)) { |
793 | | size_t jit_size = 0; |
794 | | if (!pcre2_pattern_info(re, PCRE2_INFO_JITSIZE, &jit_size) && jit_size > 0) { |
795 | | poptions |= PREG_JIT; |
796 | | } |
797 | | } else if (rc == PCRE2_ERROR_NOMEMORY) { |
798 | | php_error_docref(NULL, E_WARNING, |
799 | | "Allocation of JIT memory failed, PCRE JIT will be disabled. " |
800 | | "This is likely caused by security restrictions. " |
801 | | "Either grant PHP permission to allocate executable memory, or set pcre.jit=0"); |
802 | | PCRE_G(jit) = 0; |
803 | | } else { |
804 | | pcre2_get_error_message(rc, error, sizeof(error)); |
805 | | php_error_docref(NULL, E_WARNING, "JIT compilation failed: %s", error); |
806 | | pcre_handle_exec_error(PCRE2_ERROR_INTERNAL); |
807 | | } |
808 | | } |
809 | | #endif |
810 | 517 | efree(pattern); |
811 | | |
812 | | /* |
813 | | * If we reached cache limit, clean out the items from the head of the list; |
814 | | * these are supposedly the oldest ones (but not necessarily the least used |
815 | | * ones). |
816 | | */ |
817 | 517 | if (zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) { |
818 | 0 | int num_clean = PCRE_CACHE_SIZE / 8; |
819 | 0 | zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean); |
820 | 0 | } |
821 | | |
822 | | /* Store the compiled pattern and extra info in the cache. */ |
823 | 517 | new_entry.re = re; |
824 | 517 | new_entry.preg_options = poptions; |
825 | 517 | new_entry.compile_options = coptions; |
826 | 517 | new_entry.refcount = 0; |
827 | 517 | new_entry.subpats_table = NULL; |
828 | | /* capture_count and name_count are read back from the compiled pattern; on failure the pattern is freed and nothing is cached. */ |
829 | 517 | if ((rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &new_entry.capture_count)) < 0 || |
830 | 517 | (rc = pcre2_pattern_info(re, PCRE2_INFO_NAMECOUNT, &new_entry.name_count)) < 0) { |
831 | 0 | if (key != regex) { |
832 | 0 | zend_string_release_ex(key, 0); |
833 | 0 | } |
834 | 0 | pcre2_code_free(new_entry.re); |
835 | 0 | php_error_docref(NULL, E_WARNING, "Internal pcre_pattern_info() error %d", rc); |
836 | 0 | pcre_handle_exec_error(PCRE2_ERROR_INTERNAL); |
837 | 0 | return NULL; |
838 | 0 | } |
839 | | |
840 | | /* |
841 | | * Interned strings are not duplicated when stored in HashTable, |
842 | | * but all the interned strings created during HTTP request are removed |
843 | | * at end of request. However PCRE_G(pcre_cache) must be consistent |
844 | | * on the next request as well. So we disable usage of interned strings |
845 | | * as hash keys especially for this table. |
846 | | * See bug #63180 |
847 | | */ |
848 | 517 | if (!(GC_FLAGS(key) & IS_STR_PERMANENT)) { |
849 | 117 | zend_string *str = zend_string_init(ZSTR_VAL(key), ZSTR_LEN(key), 1); |
850 | 117 | GC_MAKE_PERSISTENT_LOCAL(str); |
851 | | |
852 | 117 | ret = zend_hash_add_new_mem(&PCRE_G(pcre_cache), str, &new_entry, sizeof(pcre_cache_entry)); |
853 | 117 | zend_string_release(str); |
854 | 400 | } else { |
855 | 400 | ret = zend_hash_add_new_mem(&PCRE_G(pcre_cache), key, &new_entry, sizeof(pcre_cache_entry)); |
856 | 400 | } |
857 | | /* ret is the pointer returned by zend_hash_add_new_mem(), not the address of the stack-local new_entry. */ |
858 | 517 | if (key != regex) { |
859 | 0 | zend_string_release_ex(key, 0); |
860 | 0 | } |
861 | | |
862 | 517 | return ret; |
863 | 517 | } |
864 | | /* }}} */ |
865 | | |
866 | | /* {{{ pcre_get_compiled_regex_cache */ |
867 | | PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_string *regex) |
868 | 2.75k | { |
869 | 2.75k | return pcre_get_compiled_regex_cache_ex(regex, true); |
870 | 2.75k | } |
871 | | /* }}} */ |
872 | | |
873 | | /* {{{ pcre_get_compiled_regex */ |
874 | | PHPAPI pcre2_code *pcre_get_compiled_regex(zend_string *regex, uint32_t *capture_count) |
875 | 0 | { |
876 | 0 | pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex); |
877 | |
|
878 | 0 | if (capture_count) { |
879 | 0 | *capture_count = pce ? pce->capture_count : 0; |
880 | 0 | } |
881 | |
|
882 | 0 | return pce ? pce->re : NULL; |
883 | 0 | } |
884 | | /* }}} */ |
885 | | |
886 | | /* XXX For the cases where it's only about match yes/no and no capture |
887 | | required, perhaps just a minimum sized data would suffice. */ |
888 | | PHPAPI pcre2_match_data *php_pcre_create_match_data(uint32_t capture_count, pcre2_code *re) |
889 | 0 | {/*{{{*/ |
890 | |
|
891 | 0 | assert(NULL != re); |
892 | |
|
893 | 0 | if (EXPECTED(!mdata_used)) { |
894 | 0 | int rc = 0; |
895 | |
|
896 | 0 | if (!capture_count) { |
897 | | /* As we deal with a non cached pattern, no other way to gather this info. */ |
898 | 0 | rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &capture_count); |
899 | 0 | } |
900 | |
|
901 | 0 | if (rc >= 0 && capture_count + 1 <= PHP_PCRE_PREALLOC_MDATA_SIZE) { |
902 | 0 | mdata_used = 1; |
903 | 0 | return mdata; |
904 | 0 | } |
905 | 0 | } |
906 | | |
907 | 0 | return pcre2_match_data_create_from_pattern(re, gctx); |
908 | 0 | }/*}}}*/ |
909 | | |
910 | | PHPAPI void php_pcre_free_match_data(pcre2_match_data *match_data) |
911 | 0 | {/*{{{*/ |
912 | 0 | if (UNEXPECTED(match_data != mdata)) { |
913 | 0 | pcre2_match_data_free(match_data); |
914 | 0 | } else { |
915 | 0 | mdata_used = 0; |
916 | 0 | } |
917 | 0 | }/*}}}*/ |
918 | | |
919 | 0 | static void init_unmatched_null_pair(zval *pair) { |
920 | 0 | zval val1, val2; |
921 | 0 | ZVAL_NULL(&val1); |
922 | 0 | ZVAL_LONG(&val2, -1); |
923 | 0 | ZVAL_ARR(pair, zend_new_pair(&val1, &val2)); |
924 | 0 | } |
925 | | |
926 | 0 | static void init_unmatched_empty_pair(zval *pair) { |
927 | 0 | zval val1, val2; |
928 | 0 | ZVAL_EMPTY_STRING(&val1); |
929 | 0 | ZVAL_LONG(&val2, -1); |
930 | 0 | ZVAL_ARR(pair, zend_new_pair(&val1, &val2)); |
931 | 0 | } |
932 | | |
933 | | static zend_always_inline void populate_match_value_str( |
934 | 252 | zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset) { |
935 | 252 | ZVAL_STRINGL_FAST(val, subject + start_offset, end_offset - start_offset); |
936 | 252 | } |
937 | | |
938 | | static zend_always_inline void populate_match_value( |
939 | | zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset, |
940 | 252 | bool unmatched_as_null) { |
941 | 252 | if (PCRE2_UNSET == start_offset) { |
942 | 0 | if (unmatched_as_null) { |
943 | 0 | ZVAL_NULL(val); |
944 | 0 | } else { |
945 | 0 | ZVAL_EMPTY_STRING(val); |
946 | 0 | } |
947 | 252 | } else { |
948 | 252 | populate_match_value_str(val, subject, start_offset, end_offset); |
949 | 252 | } |
950 | 252 | } |
951 | | |
952 | | static inline void add_named( |
953 | 0 | HashTable *const subpats, zend_string *name, zval *val, bool unmatched) { |
954 | 0 | ZEND_ASSERT(!(GC_FLAGS(name) & IS_STR_PERSISTENT)); |
955 | | |
956 | | /* If the DUPNAMES option is used, multiple subpatterns might have the same name. |
957 | | * In this case we want to preserve the one that actually has a value. */ |
958 | 0 | if (!unmatched) { |
959 | 0 | zend_hash_update(subpats, name, val); |
960 | 0 | } else { |
961 | 0 | if (!zend_hash_add(subpats, name, val)) { |
962 | 0 | return; |
963 | 0 | } |
964 | 0 | } |
965 | 0 | Z_TRY_ADDREF_P(val); |
966 | 0 | } |
967 | | |
968 | | /* {{{ add_offset_pair */ |
969 | | static inline void add_offset_pair( |
970 | | HashTable *const result, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset, |
971 | | zend_string *name, zend_long unmatched_as_null) |
972 | 0 | { |
973 | 0 | zval match_pair; |
974 | | |
975 | | /* Add (match, offset) to the return value */ |
976 | 0 | if (PCRE2_UNSET == start_offset) { |
977 | 0 | if (unmatched_as_null) { |
978 | 0 | do { |
979 | 0 | if (Z_ISUNDEF(PCRE_G(unmatched_null_pair))) { |
980 | 0 | if (UNEXPECTED(EG(flags) & EG_FLAGS_IN_SHUTDOWN)) { |
981 | 0 | init_unmatched_null_pair(&match_pair); |
982 | 0 | break; |
983 | 0 | } else { |
984 | 0 | init_unmatched_null_pair(&PCRE_G(unmatched_null_pair)); |
985 | 0 | } |
986 | 0 | } |
987 | 0 | ZVAL_COPY(&match_pair, &PCRE_G(unmatched_null_pair)); |
988 | 0 | } while (0); |
989 | 0 | } else { |
990 | 0 | do { |
991 | 0 | if (Z_ISUNDEF(PCRE_G(unmatched_empty_pair))) { |
992 | 0 | if (UNEXPECTED(EG(flags) & EG_FLAGS_IN_SHUTDOWN)) { |
993 | 0 | init_unmatched_empty_pair(&match_pair); |
994 | 0 | break; |
995 | 0 | } else { |
996 | 0 | init_unmatched_empty_pair(&PCRE_G(unmatched_empty_pair)); |
997 | 0 | } |
998 | 0 | } |
999 | 0 | ZVAL_COPY(&match_pair, &PCRE_G(unmatched_empty_pair)); |
1000 | 0 | } while (0); |
1001 | 0 | } |
1002 | 0 | } else { |
1003 | 0 | zval val1, val2; |
1004 | 0 | populate_match_value_str(&val1, subject, start_offset, end_offset); |
1005 | 0 | ZVAL_LONG(&val2, start_offset); |
1006 | 0 | ZVAL_ARR(&match_pair, zend_new_pair(&val1, &val2)); |
1007 | 0 | } |
1008 | |
|
1009 | 0 | if (name) { |
1010 | 0 | add_named(result, name, &match_pair, start_offset == PCRE2_UNSET); |
1011 | 0 | } |
1012 | 0 | zend_hash_next_index_insert_new(result, &match_pair); |
1013 | 0 | } |
1014 | | /* }}} */ |
1015 | | |
1016 | | static void populate_subpat_array( |
1017 | | HashTable *subpats_ht, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names, |
1018 | 129 | uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags) { |
1019 | 129 | zend_long offset_capture = flags & PREG_OFFSET_CAPTURE; |
1020 | 129 | zend_long unmatched_as_null = flags & PREG_UNMATCHED_AS_NULL; |
1021 | 129 | zval val; |
1022 | 129 | int i; |
1023 | 129 | if (subpat_names) { |
1024 | 0 | if (offset_capture) { |
1025 | 0 | for (i = 0; i < count; i++) { |
1026 | 0 | add_offset_pair( |
1027 | 0 | subpats_ht, subject, offsets[2*i], offsets[2*i+1], |
1028 | 0 | subpat_names[i], unmatched_as_null); |
1029 | 0 | } |
1030 | 0 | if (unmatched_as_null) { |
1031 | 0 | for (i = count; i < num_subpats; i++) { |
1032 | 0 | add_offset_pair(subpats_ht, NULL, PCRE2_UNSET, PCRE2_UNSET, subpat_names[i], 1); |
1033 | 0 | } |
1034 | 0 | } |
1035 | 0 | } else { |
1036 | 0 | for (i = 0; i < count; i++) { |
1037 | 0 | populate_match_value( |
1038 | 0 | &val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null); |
1039 | 0 | if (subpat_names[i]) { |
1040 | 0 | add_named(subpats_ht, subpat_names[i], &val, offsets[2*i] == PCRE2_UNSET); |
1041 | 0 | } |
1042 | 0 | zend_hash_next_index_insert_new(subpats_ht, &val); |
1043 | 0 | } |
1044 | 0 | if (unmatched_as_null) { |
1045 | 0 | for (i = count; i < num_subpats; i++) { |
1046 | 0 | ZVAL_NULL(&val); |
1047 | 0 | if (subpat_names[i]) { |
1048 | 0 | zend_hash_add(subpats_ht, subpat_names[i], &val); |
1049 | 0 | } |
1050 | 0 | zend_hash_next_index_insert_new(subpats_ht, &val); |
1051 | 0 | } |
1052 | 0 | } |
1053 | 0 | } |
1054 | 129 | } else { |
1055 | 129 | if (offset_capture) { |
1056 | 0 | for (i = 0; i < count; i++) { |
1057 | 0 | add_offset_pair( |
1058 | 0 | subpats_ht, subject, offsets[2*i], offsets[2*i+1], NULL, unmatched_as_null); |
1059 | 0 | } |
1060 | 0 | if (unmatched_as_null) { |
1061 | 0 | for (i = count; i < num_subpats; i++) { |
1062 | 0 | add_offset_pair(subpats_ht, NULL, PCRE2_UNSET, PCRE2_UNSET, NULL, 1); |
1063 | 0 | } |
1064 | 0 | } |
1065 | 129 | } else { |
1066 | 381 | for (i = 0; i < count; i++) { |
1067 | 252 | populate_match_value( |
1068 | 252 | &val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null); |
1069 | 252 | zend_hash_next_index_insert_new(subpats_ht, &val); |
1070 | 252 | } |
1071 | 129 | if (unmatched_as_null) { |
1072 | 0 | ZVAL_NULL(&val); |
1073 | 0 | for (i = count; i < num_subpats; i++) { |
1074 | 0 | zend_hash_next_index_insert_new(subpats_ht, &val); |
1075 | 0 | } |
1076 | 0 | } |
1077 | 129 | } |
1078 | 129 | } |
1079 | | /* Add MARK, if available */ |
1080 | 129 | if (mark) { |
1081 | 0 | ZVAL_STRING(&val, (char *)mark); |
1082 | 0 | zend_hash_str_update(subpats_ht, ZEND_STRL("MARK"), &val); |
1083 | 0 | } |
1084 | 129 | } |
1085 | | |
1086 | | static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, bool global) /* {{{ */ |
1087 | 2.54k | { |
1088 | | /* parameters */ |
1089 | 2.54k | zend_string *regex; /* Regular expression */ |
1090 | 2.54k | zend_string *subject; /* String to match against */ |
1091 | 2.54k | pcre_cache_entry *pce; /* Compiled regular expression */ |
1092 | 2.54k | zval *subpats = NULL; /* Array for subpatterns */ |
1093 | 2.54k | zend_long flags = 0; /* Match control flags */ |
1094 | 2.54k | zend_long start_offset = 0; /* Where the new search starts */ |
1095 | | |
1096 | 7.63k | ZEND_PARSE_PARAMETERS_START(2, 5) |
1097 | 10.1k | Z_PARAM_STR(regex) |
1098 | 12.7k | Z_PARAM_STR(subject) |
1099 | 2.54k | Z_PARAM_OPTIONAL |
1100 | 5.08k | Z_PARAM_ZVAL(subpats) |
1101 | 5.08k | Z_PARAM_LONG(flags) |
1102 | 0 | Z_PARAM_LONG(start_offset) |
1103 | 2.54k | ZEND_PARSE_PARAMETERS_END(); |
1104 | | |
1105 | | /* Compile regex or get it from cache. */ |
1106 | 2.54k | if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) { |
1107 | 667 | RETURN_FALSE; |
1108 | 667 | } |
1109 | | |
1110 | 1.87k | if (start_offset == ZEND_LONG_MIN) { |
1111 | 0 | zend_argument_value_error(5, "must be greater than " ZEND_LONG_FMT, ZEND_LONG_MIN); |
1112 | 0 | RETURN_THROWS(); |
1113 | 0 | } |
1114 | | |
1115 | 1.87k | pce->refcount++; |
1116 | 1.87k | php_pcre_match_impl(pce, subject, return_value, subpats, |
1117 | 1.87k | global, flags, start_offset); |
1118 | 1.87k | pce->refcount--; |
1119 | 1.87k | } |
1120 | | /* }}} */ |
1121 | | |
1122 | | static zend_always_inline bool is_known_valid_utf8( |
1123 | 561 | zend_string *subject_str, PCRE2_SIZE start_offset) { |
1124 | 561 | if (!ZSTR_IS_VALID_UTF8(subject_str)) { |
1125 | | /* We don't know whether the string is valid UTF-8 or not. */ |
1126 | 561 | return false; |
1127 | 561 | } |
1128 | | |
1129 | 0 | if (start_offset == ZSTR_LEN(subject_str)) { |
1130 | | /* Degenerate case: Offset points to end of string. */ |
1131 | 0 | return true; |
1132 | 0 | } |
1133 | | |
1134 | | /* Check that the offset does not point to an UTF-8 continuation byte. */ |
1135 | 0 | return (ZSTR_VAL(subject_str)[start_offset] & 0xc0) != 0x80; |
1136 | 0 | } |
1137 | | |
1138 | | /* {{{ php_pcre_match_impl() */ |
1139 | | PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value, |
1140 | | zval *subpats, bool global, zend_long flags, zend_off_t start_offset) |
1141 | 1.87k | { |
1142 | 1.87k | zval result_set; /* Holds a set of subpatterns after |
1143 | | a global match */ |
1144 | 1.87k | HashTable **match_sets = NULL; /* An array of sets of matches for each |
1145 | | subpattern after a global match */ |
1146 | 1.87k | uint32_t options; /* Execution options */ |
1147 | 1.87k | int count; /* Count of matched subpatterns */ |
1148 | 1.87k | uint32_t num_subpats; /* Number of captured subpatterns */ |
1149 | 1.87k | int matched; /* Has anything matched */ |
1150 | 1.87k | zend_string **subpat_names; /* Array for named subpatterns */ |
1151 | 1.87k | size_t i; |
1152 | 1.87k | uint32_t subpats_order; /* Order of subpattern matches */ |
1153 | 1.87k | uint32_t offset_capture; /* Capture match offsets: yes/no */ |
1154 | 1.87k | zend_long unmatched_as_null; /* Null non-matches: yes/no */ |
1155 | 1.87k | PCRE2_SPTR mark = NULL; /* Target for MARK name */ |
1156 | 1.87k | HashTable *marks = NULL; /* Array of marks for PREG_PATTERN_ORDER */ |
1157 | 1.87k | pcre2_match_data *match_data; |
1158 | 1.87k | PCRE2_SIZE start_offset2, orig_start_offset; |
1159 | 1.87k | bool old_mdata_used; |
1160 | | |
1161 | 1.87k | char *subject = ZSTR_VAL(subject_str); |
1162 | 1.87k | size_t subject_len = ZSTR_LEN(subject_str); |
1163 | | |
1164 | | /* Overwrite the passed-in value for subpatterns with an empty array. */ |
1165 | 1.87k | if (subpats != NULL) { |
1166 | 0 | subpats = zend_try_array_init(subpats); |
1167 | 0 | if (!subpats) { |
1168 | 0 | RETURN_THROWS(); |
1169 | 0 | } |
1170 | 0 | } |
1171 | | |
1172 | 1.87k | subpats_order = global ? PREG_PATTERN_ORDER : 0; |
1173 | | |
1174 | 1.87k | if (flags) { |
1175 | 0 | offset_capture = flags & PREG_OFFSET_CAPTURE; |
1176 | 0 | unmatched_as_null = flags & PREG_UNMATCHED_AS_NULL; |
1177 | | |
1178 | | /* |
1179 | | * subpats_order is pre-set to pattern mode so we change it only if |
1180 | | * necessary. |
1181 | | */ |
1182 | 0 | if (flags & 0xff) { |
1183 | 0 | subpats_order = flags & 0xff; |
1184 | 0 | if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) || |
1185 | 0 | (!global && subpats_order != 0)) { |
1186 | 0 | zend_argument_value_error(4, "must be a PREG_* constant"); |
1187 | 0 | RETURN_THROWS(); |
1188 | 0 | } |
1189 | 0 | } |
1190 | 1.87k | } else { |
1191 | 1.87k | offset_capture = 0; |
1192 | 1.87k | unmatched_as_null = 0; |
1193 | 1.87k | } |
1194 | | |
1195 | | /* Negative offset counts from the end of the string. */ |
1196 | 1.87k | if (start_offset < 0) { |
1197 | 0 | if ((PCRE2_SIZE)-start_offset <= subject_len) { |
1198 | 0 | start_offset2 = subject_len + start_offset; |
1199 | 0 | } else { |
1200 | 0 | start_offset2 = 0; |
1201 | 0 | } |
1202 | 1.87k | } else { |
1203 | 1.87k | start_offset2 = (PCRE2_SIZE)start_offset; |
1204 | 1.87k | } |
1205 | | |
1206 | 1.87k | if (start_offset2 > subject_len) { |
1207 | 0 | pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET); |
1208 | 0 | RETURN_FALSE; |
1209 | 0 | } |
1210 | | |
1211 | | /* Calculate the size of the offsets array, and allocate memory for it. */ |
1212 | 1.87k | num_subpats = pce->capture_count + 1; |
1213 | | |
1214 | | /* |
1215 | | * Build a mapping from subpattern numbers to their names. We will |
1216 | | * allocate the table only if there are any named subpatterns. |
1217 | | */ |
1218 | 1.87k | subpat_names = NULL; |
1219 | 1.87k | if (subpats && pce->name_count > 0) { |
1220 | 0 | subpat_names = ensure_subpats_table(pce->name_count, pce); |
1221 | 0 | if (UNEXPECTED(!subpat_names)) { |
1222 | 0 | RETURN_FALSE; |
1223 | 0 | } |
1224 | 0 | } |
1225 | | |
1226 | 1.87k | matched = 0; |
1227 | 1.87k | PCRE_G(error_code) = PHP_PCRE_NO_ERROR; |
1228 | | |
1229 | 1.87k | old_mdata_used = mdata_used; |
1230 | 1.87k | if (!old_mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) { |
1231 | 1.84k | mdata_used = true; |
1232 | 1.84k | match_data = mdata; |
1233 | 1.84k | } else { |
1234 | 30 | match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm)); |
1235 | 30 | if (!match_data) { |
1236 | 0 | PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR; |
1237 | 0 | RETURN_FALSE; |
1238 | 0 | } |
1239 | 30 | } |
1240 | | |
1241 | | /* Allocate match sets array and initialize the values. */ |
1242 | 1.87k | if (global && subpats && subpats_order == PREG_PATTERN_ORDER) { |
1243 | 0 | match_sets = safe_emalloc(num_subpats, sizeof(HashTable *), 0); |
1244 | 0 | for (i=0; i<num_subpats; i++) { |
1245 | 0 | match_sets[i] = zend_new_array(0); |
1246 | 0 | } |
1247 | 0 | } |
1248 | | |
1249 | | /* Array of subpattern offsets */ |
1250 | 1.87k | PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data); |
1251 | | |
1252 | 1.87k | orig_start_offset = start_offset2; |
1253 | 1.87k | options = |
1254 | 1.87k | (pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, orig_start_offset) |
1255 | 1.87k | ? 0 : PCRE2_NO_UTF_CHECK; |
1256 | | |
1257 | | /* Execute the regular expression. */ |
1258 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
1259 | | if ((pce->preg_options & PREG_JIT) && options) { |
1260 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2, |
1261 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
1262 | | } else |
1263 | | #endif |
1264 | 1.87k | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2, |
1265 | 1.87k | options, match_data, mctx); |
1266 | | |
1267 | 1.87k | while (1) { |
1268 | | /* If something has matched */ |
1269 | 1.87k | if (count >= 0) { |
1270 | | /* Check for too many substrings condition. */ |
1271 | 126 | if (UNEXPECTED(count == 0)) { |
1272 | 0 | php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings"); |
1273 | 0 | count = num_subpats; |
1274 | 0 | } |
1275 | | |
1276 | 126 | matched: |
1277 | 126 | matched++; |
1278 | | |
1279 | | /* If subpatterns array has been passed, fill it in with values. */ |
1280 | 126 | if (subpats != NULL) { |
1281 | | /* Try to get the list of substrings and display a warning if failed. */ |
1282 | 0 | if (UNEXPECTED(offsets[1] < offsets[0])) { |
1283 | 0 | if (match_sets) { |
1284 | 0 | for (i = 0; i < num_subpats; i++) { |
1285 | 0 | zend_array_destroy(match_sets[i]); |
1286 | 0 | } |
1287 | 0 | efree(match_sets); |
1288 | 0 | } |
1289 | 0 | if (marks) { |
1290 | 0 | zend_array_destroy(marks); |
1291 | 0 | } |
1292 | 0 | if (match_data != mdata) { |
1293 | 0 | pcre2_match_data_free(match_data); |
1294 | 0 | } |
1295 | 0 | php_error_docref(NULL, E_WARNING, "Get subpatterns list failed"); |
1296 | 0 | RETURN_FALSE; |
1297 | 0 | } |
1298 | | |
1299 | 0 | if (global) { /* global pattern matching */ |
1300 | 0 | if (subpats_order == PREG_PATTERN_ORDER) { |
1301 | | /* For each subpattern, insert it into the appropriate array. */ |
1302 | 0 | if (offset_capture) { |
1303 | 0 | for (i = 0; i < count; i++) { |
1304 | 0 | add_offset_pair( |
1305 | 0 | match_sets[i], subject, offsets[2*i], offsets[2*i+1], |
1306 | 0 | NULL, unmatched_as_null); |
1307 | 0 | } |
1308 | 0 | } else { |
1309 | 0 | for (i = 0; i < count; i++) { |
1310 | 0 | zval val; |
1311 | 0 | populate_match_value( |
1312 | 0 | &val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null); |
1313 | 0 | zend_hash_next_index_insert_new(match_sets[i], &val); |
1314 | 0 | } |
1315 | 0 | } |
1316 | 0 | mark = pcre2_get_mark(match_data); |
1317 | | /* Add MARK, if available */ |
1318 | 0 | if (mark) { |
1319 | 0 | if (!marks) { |
1320 | 0 | marks = zend_new_array(0); |
1321 | 0 | } |
1322 | 0 | zval tmp; |
1323 | 0 | ZVAL_STRING(&tmp, (char *) mark); |
1324 | 0 | zend_hash_index_add_new(marks, matched - 1, &tmp); |
1325 | 0 | } |
1326 | | /* |
1327 | | * If the number of captured subpatterns on this run is |
1328 | | * less than the total possible number, pad the result |
1329 | | * arrays with NULLs or empty strings. |
1330 | | */ |
1331 | 0 | if (count < num_subpats) { |
1332 | 0 | for (int i = count; i < num_subpats; i++) { |
1333 | 0 | if (offset_capture) { |
1334 | 0 | add_offset_pair( |
1335 | 0 | match_sets[i], NULL, PCRE2_UNSET, PCRE2_UNSET, |
1336 | 0 | NULL, unmatched_as_null); |
1337 | 0 | } else if (unmatched_as_null) { |
1338 | 0 | zval tmp; |
1339 | 0 | ZVAL_NULL(&tmp); |
1340 | 0 | zend_hash_next_index_insert_new(match_sets[i], &tmp); |
1341 | 0 | } else { |
1342 | 0 | zval tmp; |
1343 | 0 | ZVAL_EMPTY_STRING(&tmp); |
1344 | 0 | zend_hash_next_index_insert_new(match_sets[i], &tmp); |
1345 | 0 | } |
1346 | 0 | } |
1347 | 0 | } |
1348 | 0 | } else { |
1349 | | /* Allocate and populate the result set array */ |
1350 | 0 | mark = pcre2_get_mark(match_data); |
1351 | 0 | array_init_size(&result_set, count + (mark ? 1 : 0)); |
1352 | 0 | populate_subpat_array( |
1353 | 0 | Z_ARRVAL(result_set), subject, offsets, subpat_names, |
1354 | 0 | num_subpats, count, mark, flags); |
1355 | | /* And add it to the output array */ |
1356 | 0 | zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &result_set); |
1357 | 0 | } |
1358 | 0 | } else { /* single pattern matching */ |
1359 | | /* For each subpattern, insert it into the subpatterns array. */ |
1360 | 0 | mark = pcre2_get_mark(match_data); |
1361 | 0 | populate_subpat_array( |
1362 | 0 | Z_ARRVAL_P(subpats), subject, offsets, subpat_names, num_subpats, count, mark, flags); |
1363 | 0 | break; |
1364 | 0 | } |
1365 | 0 | } |
1366 | | |
1367 | | /* Advance to the next piece. */ |
1368 | 126 | start_offset2 = offsets[1]; |
1369 | | |
1370 | | /* If we have matched an empty string, mimic what Perl's /g options does. |
1371 | | This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try |
1372 | | the match again at the same point. If this fails (picked up above) we |
1373 | | advance to the next character. */ |
1374 | 126 | if (start_offset2 == offsets[0]) { |
1375 | 66 | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2, |
1376 | 66 | PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx); |
1377 | 66 | if (count >= 0) { |
1378 | 0 | if (global) { |
1379 | 0 | goto matched; |
1380 | 0 | } else { |
1381 | 0 | break; |
1382 | 0 | } |
1383 | 66 | } else if (count == PCRE2_ERROR_NOMATCH) { |
1384 | | /* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match, |
1385 | | this is not necessarily the end. We need to advance |
1386 | | the start offset, and continue. Fudge the offset values |
1387 | | to achieve this, unless we're already at the end of the string. */ |
1388 | 60 | if (start_offset2 < subject_len) { |
1389 | 57 | size_t unit_len = calculate_unit_length(pce, subject + start_offset2); |
1390 | | |
1391 | 57 | start_offset2 += unit_len; |
1392 | 57 | } else { |
1393 | 3 | break; |
1394 | 3 | } |
1395 | 60 | } else { |
1396 | 6 | goto error; |
1397 | 6 | } |
1398 | 66 | } |
1399 | 1.74k | } else if (count == PCRE2_ERROR_NOMATCH) { |
1400 | 1.65k | break; |
1401 | 1.65k | } else { |
1402 | 105 | error: |
1403 | 105 | pcre_handle_exec_error(count); |
1404 | 105 | break; |
1405 | 99 | } |
1406 | | |
1407 | 117 | if (!global) { |
1408 | 117 | break; |
1409 | 117 | } |
1410 | | |
1411 | | /* Execute the regular expression. */ |
1412 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
1413 | | if ((pce->preg_options & PREG_JIT)) { |
1414 | | if (start_offset2 > subject_len) { |
1415 | | pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET); |
1416 | | break; |
1417 | | } |
1418 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2, |
1419 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
1420 | | } else |
1421 | | #endif |
1422 | 0 | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2, |
1423 | 0 | PCRE2_NO_UTF_CHECK, match_data, mctx); |
1424 | 0 | } |
1425 | 1.87k | if (match_data != mdata) { |
1426 | 30 | pcre2_match_data_free(match_data); |
1427 | 30 | } |
1428 | 1.87k | mdata_used = old_mdata_used; |
1429 | | |
1430 | | /* Add the match sets to the output array and clean up */ |
1431 | 1.87k | if (match_sets) { |
1432 | 0 | if (subpat_names) { |
1433 | 0 | for (i = 0; i < num_subpats; i++) { |
1434 | 0 | zval wrapper; |
1435 | 0 | ZVAL_ARR(&wrapper, match_sets[i]); |
1436 | 0 | if (subpat_names[i]) { |
1437 | 0 | zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i], &wrapper); |
1438 | 0 | GC_ADDREF(match_sets[i]); |
1439 | 0 | } |
1440 | 0 | zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &wrapper); |
1441 | 0 | } |
1442 | 0 | } else { |
1443 | 0 | for (i = 0; i < num_subpats; i++) { |
1444 | 0 | zval wrapper; |
1445 | 0 | ZVAL_ARR(&wrapper, match_sets[i]); |
1446 | 0 | zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &wrapper); |
1447 | 0 | } |
1448 | 0 | } |
1449 | 0 | efree(match_sets); |
1450 | |
|
1451 | 0 | if (marks) { |
1452 | 0 | zval tmp; |
1453 | 0 | ZVAL_ARR(&tmp, marks); |
1454 | 0 | zend_hash_str_update(Z_ARRVAL_P(subpats), "MARK", sizeof("MARK") - 1, &tmp); |
1455 | 0 | } |
1456 | 0 | } |
1457 | | |
1458 | 1.87k | if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) { |
1459 | | /* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */ |
1460 | 1.77k | if ((pce->compile_options & PCRE2_UTF) |
1461 | 462 | && !ZSTR_IS_INTERNED(subject_str) && orig_start_offset == 0) { |
1462 | 72 | GC_ADD_FLAGS(subject_str, IS_STR_VALID_UTF8); |
1463 | 72 | } |
1464 | | |
1465 | 1.77k | RETVAL_LONG(matched); |
1466 | 1.77k | } else { |
1467 | 105 | RETVAL_FALSE; |
1468 | 105 | } |
1469 | 1.87k | } |
1470 | | /* }}} */ |
1471 | | |
1472 | | /* {{{ Perform a Perl-style regular expression match */ |
1473 | | PHP_FUNCTION(preg_match) |
1474 | 2.54k | { |
1475 | 2.54k | php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, false); |
1476 | 2.54k | } |
1477 | | /* }}} */ |
1478 | | |
1479 | | ZEND_FRAMELESS_FUNCTION(preg_match, 2) |
1480 | 0 | { |
1481 | 0 | zval regex_tmp, subject_tmp; |
1482 | 0 | zend_string *regex, *subject; |
1483 | |
|
1484 | 0 | Z_FLF_PARAM_STR(1, regex, regex_tmp); |
1485 | 0 | Z_FLF_PARAM_STR(2, subject, subject_tmp); |
1486 | | |
1487 | | /* Compile regex or get it from cache. */ |
1488 | 0 | pcre_cache_entry *pce; |
1489 | 0 | if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) { |
1490 | 0 | RETVAL_FALSE; |
1491 | 0 | goto flf_clean; |
1492 | 0 | } |
1493 | | |
1494 | 0 | pce->refcount++; |
1495 | 0 | php_pcre_match_impl(pce, subject, return_value, /* subpats */ NULL, |
1496 | 0 | /* global */ false, /* flags */ 0, /* start_offset */ 0); |
1497 | 0 | pce->refcount--; |
1498 | |
|
1499 | 0 | flf_clean: |
1500 | 0 | Z_FLF_PARAM_FREE_STR(1, regex_tmp); |
1501 | 0 | Z_FLF_PARAM_FREE_STR(2, subject_tmp); |
1502 | 0 | } |
1503 | | |
1504 | | /* {{{ Perform a Perl-style global regular expression match */ |
1505 | | PHP_FUNCTION(preg_match_all) |
1506 | 0 | { |
1507 | 0 | php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, true); |
1508 | 0 | } |
1509 | | /* }}} */ |
1510 | | |
1511 | | /* {{{ preg_get_backref */ |
/* Parse a one- or two-digit back reference at *str.
 *
 * *str points at the introducer character. The accepted shapes are
 * <introducer>N, <introducer>NN and the braced forms ${N} / ${NN}.
 * On success the number is stored in *backref, *str is moved past the
 * reference and 1 is returned. On failure 0 is returned and *str is left
 * unchanged (*backref may already have been written). */
static int preg_get_backref(char **str, int *backref)
{
	char *cursor = *str;
	int braced = 0;

	/* An introducer at the very end of the string references nothing. */
	if (cursor[1] == '\0') {
		return 0;
	}

	/* Step over the introducer, and over the '{' of the "${" form. */
	if (cursor[0] == '$' && cursor[1] == '{') {
		braced = 1;
		cursor += 2;
	} else {
		cursor += 1;
	}

	/* The first digit is mandatory. */
	if (*cursor < '0' || *cursor > '9') {
		return 0;
	}
	*backref = *cursor - '0';
	cursor++;

	/* The second digit is optional. */
	if (*cursor >= '0' && *cursor <= '9') {
		*backref = *backref * 10 + *cursor - '0';
		cursor++;
	}

	/* The braced form must be closed right after the digits. */
	if (braced) {
		if (*cursor != '}') {
			return 0;
		}
		cursor++;
	}

	*str = cursor;
	return 1;
}
1547 | | /* }}} */ |
1548 | | |
1549 | | /* Return NULL if an exception has occurred */ |
1550 | | static zend_string *preg_do_repl_func(zend_fcall_info *fci, zend_fcall_info_cache *fcc, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names, uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags) |
1551 | 129 | { |
1552 | 129 | zend_string *result_str = NULL; |
1553 | 129 | zval retval; /* Function return value */ |
1554 | 129 | zval arg; /* Argument to pass to function */ |
1555 | | |
1556 | 129 | array_init_size(&arg, count + (mark ? 1 : 0)); |
1557 | 129 | populate_subpat_array(Z_ARRVAL(arg), subject, offsets, subpat_names, num_subpats, count, mark, flags); |
1558 | | |
1559 | 129 | fci->retval = &retval; |
1560 | 129 | fci->param_count = 1; |
1561 | 129 | fci->params = &arg; |
1562 | 129 | fci->consumed_args = zend_fci_consumed_arg(0); |
1563 | 129 | zend_call_function(fci, fcc); |
1564 | 129 | zval_ptr_dtor(&arg); |
1565 | 129 | if (EXPECTED(Z_TYPE(retval) == IS_STRING)) { |
1566 | 12 | return Z_STR(retval); |
1567 | 12 | } |
1568 | | /* No Exception has occurred */ |
1569 | 117 | else if (EXPECTED(Z_TYPE(retval) != IS_UNDEF)) { |
1570 | 108 | result_str = zval_try_get_string_func(&retval); |
1571 | 108 | } |
1572 | 117 | zval_ptr_dtor(&retval); |
1573 | | |
1574 | 117 | return result_str; |
1575 | 129 | } |
1576 | | |
1577 | | /* {{{ php_pcre_replace */ |
1578 | | PHPAPI zend_string *php_pcre_replace(zend_string *regex, |
1579 | | zend_string *subject_str, |
1580 | | const char *subject, size_t subject_len, |
1581 | | zend_string *replace_str, |
1582 | | size_t limit, size_t *replace_count) |
1583 | 117 | { |
1584 | 117 | pcre_cache_entry *pce; /* Compiled regular expression */ |
1585 | 117 | zend_string *result; /* Function result */ |
1586 | | |
1587 | | /* Abort on pending exception, e.g. thrown from __toString(). */ |
1588 | 117 | if (UNEXPECTED(EG(exception))) { |
1589 | 0 | return NULL; |
1590 | 0 | } |
1591 | | |
1592 | | /* Compile regex or get it from cache. */ |
1593 | 117 | if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) { |
1594 | 0 | return NULL; |
1595 | 0 | } |
1596 | 117 | pce->refcount++; |
1597 | 117 | result = php_pcre_replace_impl(pce, subject_str, subject, subject_len, replace_str, |
1598 | 117 | limit, replace_count); |
1599 | 117 | pce->refcount--; |
1600 | | |
1601 | 117 | return result; |
1602 | 117 | } |
1603 | | /* }}} */ |
1604 | | |
1605 | | /* {{{ php_pcre_replace_impl() */ |
1606 | | PHPAPI zend_string *php_pcre_replace_impl(pcre_cache_entry *pce, zend_string *subject_str, const char *subject, size_t subject_len, zend_string *replace_str, size_t limit, size_t *replace_count) |
1607 | 117 | { |
1608 | 117 | uint32_t options; /* Execution options */ |
1609 | 117 | int count; /* Count of matched subpatterns */ |
1610 | 117 | uint32_t num_subpats; /* Number of captured subpatterns */ |
1611 | 117 | size_t new_len; /* Length of needed storage */ |
1612 | 117 | size_t alloc_len; /* Actual allocated length */ |
1613 | 117 | size_t match_len; /* Length of the current match */ |
1614 | 117 | int backref; /* Backreference number */ |
1615 | 117 | PCRE2_SIZE start_offset; /* Where the new search starts */ |
1616 | 117 | size_t last_end_offset; /* Where the last search ended */ |
1617 | 117 | char *walkbuf, /* Location of current replacement in the result */ |
1618 | 117 | *walk, /* Used to walk the replacement string */ |
1619 | 117 | walk_last; /* Last walked character */ |
1620 | 117 | const char *match, /* The current match */ |
1621 | 117 | *piece, /* The current piece of subject */ |
1622 | 117 | *replace_end; /* End of replacement string */ |
1623 | 117 | size_t result_len; /* Length of result */ |
1624 | 117 | zend_string *result; /* Result of replacement */ |
1625 | 117 | pcre2_match_data *match_data; |
1626 | 117 | bool old_mdata_used; |
1627 | | |
1628 | | /* Calculate the size of the offsets array, and allocate memory for it. */ |
1629 | 117 | num_subpats = pce->capture_count + 1; |
1630 | 117 | alloc_len = 0; |
1631 | 117 | result = NULL; |
1632 | | |
1633 | | /* Initialize */ |
1634 | 117 | match = NULL; |
1635 | 117 | start_offset = 0; |
1636 | 117 | last_end_offset = 0; |
1637 | 117 | result_len = 0; |
1638 | 117 | PCRE_G(error_code) = PHP_PCRE_NO_ERROR; |
1639 | | |
1640 | 117 | old_mdata_used = mdata_used; |
1641 | 117 | if (!old_mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) { |
1642 | 9 | mdata_used = true; |
1643 | 9 | match_data = mdata; |
1644 | 108 | } else { |
1645 | 108 | match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm)); |
1646 | 108 | if (!match_data) { |
1647 | 0 | PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR; |
1648 | 0 | return NULL; |
1649 | 0 | } |
1650 | 108 | } |
1651 | | |
1652 | 117 | options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK; |
1653 | | |
1654 | | /* Array of subpattern offsets */ |
1655 | 117 | PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data); |
1656 | | |
1657 | | /* Execute the regular expression. */ |
1658 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
1659 | | if ((pce->preg_options & PREG_JIT) && options) { |
1660 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset, |
1661 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
1662 | | } else |
1663 | | #endif |
1664 | 117 | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset, |
1665 | 117 | options, match_data, mctx); |
1666 | | |
1667 | 171 | while (1) { |
1668 | 171 | piece = subject + last_end_offset; |
1669 | | |
1670 | 171 | if (count >= 0 && limit > 0) { |
1671 | 54 | bool simple_string; |
1672 | | |
1673 | | /* Check for too many substrings condition. */ |
1674 | 54 | if (UNEXPECTED(count == 0)) { |
1675 | 0 | php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings"); |
1676 | 0 | count = num_subpats; |
1677 | 0 | } |
1678 | | |
1679 | 54 | matched: |
1680 | 54 | if (UNEXPECTED(offsets[1] < offsets[0])) { |
1681 | 0 | PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR; |
1682 | 0 | if (result) { |
1683 | 0 | zend_string_release_ex(result, 0); |
1684 | 0 | result = NULL; |
1685 | 0 | } |
1686 | 0 | break; |
1687 | 0 | } |
1688 | | |
1689 | 54 | if (replace_count) { |
1690 | 54 | ++*replace_count; |
1691 | 54 | } |
1692 | | |
1693 | | /* Set the match location in subject */ |
1694 | 54 | match = subject + offsets[0]; |
1695 | | |
1696 | 54 | new_len = result_len + offsets[0] - last_end_offset; /* part before the match */ |
1697 | | |
1698 | 54 | walk = ZSTR_VAL(replace_str); |
1699 | 54 | replace_end = walk + ZSTR_LEN(replace_str); |
1700 | 54 | walk_last = 0; |
1701 | 54 | simple_string = true; |
1702 | 117 | while (walk < replace_end) { |
1703 | 63 | if ('\\' == *walk || '$' == *walk) { |
1704 | 0 | simple_string = false; |
1705 | 0 | if (walk_last == '\\') { |
1706 | 0 | walk++; |
1707 | 0 | walk_last = 0; |
1708 | 0 | continue; |
1709 | 0 | } |
1710 | 0 | if (preg_get_backref(&walk, &backref)) { |
1711 | 0 | if (backref < count) |
1712 | 0 | new_len += offsets[(backref<<1)+1] - offsets[backref<<1]; |
1713 | 0 | continue; |
1714 | 0 | } |
1715 | 0 | } |
1716 | 63 | new_len++; |
1717 | 63 | walk++; |
1718 | 63 | walk_last = walk[-1]; |
1719 | 63 | } |
1720 | | |
1721 | 54 | if (new_len >= alloc_len) { |
1722 | 54 | alloc_len = zend_safe_address_guarded(2, new_len, ZSTR_MAX_OVERHEAD) - ZSTR_MAX_OVERHEAD; |
1723 | 54 | if (result == NULL) { |
1724 | 54 | result = zend_string_alloc(alloc_len, 0); |
1725 | 54 | } else { |
1726 | 0 | result = zend_string_extend(result, alloc_len, 0); |
1727 | 0 | } |
1728 | 54 | } |
1729 | | |
1730 | 54 | if (match-piece > 0) { |
1731 | | /* copy the part of the string before the match */ |
1732 | 54 | memcpy(&ZSTR_VAL(result)[result_len], piece, match-piece); |
1733 | 54 | result_len += (match-piece); |
1734 | 54 | } |
1735 | | |
1736 | 54 | if (simple_string) { |
1737 | | /* copy replacement */ |
1738 | 54 | memcpy(&ZSTR_VAL(result)[result_len], ZSTR_VAL(replace_str), ZSTR_LEN(replace_str)+1); |
1739 | 54 | result_len += ZSTR_LEN(replace_str); |
1740 | 54 | } else { |
1741 | | /* copy replacement and backrefs */ |
1742 | 0 | walkbuf = ZSTR_VAL(result) + result_len; |
1743 | |
|
1744 | 0 | walk = ZSTR_VAL(replace_str); |
1745 | 0 | walk_last = 0; |
1746 | 0 | while (walk < replace_end) { |
1747 | 0 | if ('\\' == *walk || '$' == *walk) { |
1748 | 0 | if (walk_last == '\\') { |
1749 | 0 | *(walkbuf-1) = *walk++; |
1750 | 0 | walk_last = 0; |
1751 | 0 | continue; |
1752 | 0 | } |
1753 | 0 | if (preg_get_backref(&walk, &backref)) { |
1754 | 0 | if (backref < count) { |
1755 | 0 | if (offsets[backref<<1] < SIZE_MAX) { |
1756 | 0 | match_len = offsets[(backref<<1)+1] - offsets[backref<<1]; |
1757 | 0 | walkbuf = zend_mempcpy(walkbuf, subject + offsets[backref << 1], match_len); |
1758 | 0 | } |
1759 | 0 | } |
1760 | 0 | continue; |
1761 | 0 | } |
1762 | 0 | } |
1763 | 0 | *walkbuf++ = *walk++; |
1764 | 0 | walk_last = walk[-1]; |
1765 | 0 | } |
1766 | 0 | *walkbuf = '\0'; |
1767 | | /* increment the result length by how much we've added to the string */ |
1768 | 0 | result_len += (walkbuf - (ZSTR_VAL(result) + result_len)); |
1769 | 0 | } |
1770 | | |
1771 | 54 | limit--; |
1772 | | |
1773 | | /* Advance to the next piece. */ |
1774 | 54 | start_offset = last_end_offset = offsets[1]; |
1775 | | |
1776 | | /* If we have matched an empty string, mimic what Perl's /g options does. |
1777 | | This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try |
1778 | | the match again at the same point. If this fails (picked up above) we |
1779 | | advance to the next character. */ |
1780 | 54 | if (start_offset == offsets[0]) { |
1781 | 0 | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset, |
1782 | 0 | PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx); |
1783 | |
|
1784 | 0 | piece = subject + start_offset; |
1785 | 0 | if (count >= 0 && limit > 0) { |
1786 | 0 | goto matched; |
1787 | 0 | } else if (count == PCRE2_ERROR_NOMATCH || limit == 0) { |
1788 | | /* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match, |
1789 | | this is not necessarily the end. We need to advance |
1790 | | the start offset, and continue. Fudge the offset values |
1791 | | to achieve this, unless we're already at the end of the string. */ |
1792 | 0 | if (start_offset < subject_len) { |
1793 | 0 | size_t unit_len = calculate_unit_length(pce, piece); |
1794 | 0 | start_offset += unit_len; |
1795 | 0 | } else { |
1796 | 0 | goto not_matched; |
1797 | 0 | } |
1798 | 0 | } else { |
1799 | 0 | goto error; |
1800 | 0 | } |
1801 | 0 | } |
1802 | | |
1803 | 117 | } else if (count == PCRE2_ERROR_NOMATCH || limit == 0) { |
1804 | 117 | not_matched: |
1805 | 117 | if (!result && subject_str) { |
1806 | 63 | result = zend_string_copy(subject_str); |
1807 | 63 | break; |
1808 | 63 | } |
1809 | | /* now we know exactly how long it is */ |
1810 | 54 | alloc_len = result_len + subject_len - last_end_offset; |
1811 | 54 | if (NULL != result) { |
1812 | 54 | result = zend_string_realloc(result, alloc_len, 0); |
1813 | 54 | } else { |
1814 | 0 | result = zend_string_alloc(alloc_len, 0); |
1815 | 0 | } |
1816 | | /* stick that last bit of string on our output */ |
1817 | 54 | memcpy(ZSTR_VAL(result) + result_len, piece, subject_len - last_end_offset); |
1818 | 54 | result_len += subject_len - last_end_offset; |
1819 | 54 | ZSTR_VAL(result)[result_len] = '\0'; |
1820 | 54 | ZSTR_LEN(result) = result_len; |
1821 | 54 | break; |
1822 | 117 | } else { |
1823 | 0 | error: |
1824 | 0 | pcre_handle_exec_error(count); |
1825 | 0 | if (result) { |
1826 | 0 | zend_string_release_ex(result, 0); |
1827 | 0 | result = NULL; |
1828 | 0 | } |
1829 | 0 | break; |
1830 | 0 | } |
1831 | | |
1832 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
1833 | | if (pce->preg_options & PREG_JIT) { |
1834 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset, |
1835 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
1836 | | } else |
1837 | | #endif |
1838 | 54 | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset, |
1839 | 54 | PCRE2_NO_UTF_CHECK, match_data, mctx); |
1840 | 54 | } |
1841 | 117 | if (match_data != mdata) { |
1842 | 108 | pcre2_match_data_free(match_data); |
1843 | 108 | } |
1844 | 117 | mdata_used = old_mdata_used; |
1845 | | |
1846 | 117 | return result; |
1847 | 117 | } |
1848 | | /* }}} */ |
1849 | | |
/*
 * php_pcre_replace_func_impl: replace matches of the compiled pattern `pce`
 * in `subject_str`, taking the text of every replacement from
 * preg_do_repl_func() (invoked with fci/fcc and `flags`).
 *
 * limit          maximum number of replacements; decremented once per
 *                replacement, matching stops when it reaches 0.
 * replace_count  optional counter, incremented once per match that is
 *                about to be replaced (may be NULL).
 *
 * Returns a newly built string, a zend_string_copy() of subject_str when no
 * replacement was performed, or NULL on failure (named-subpattern table
 * unavailable, match data could not be created, the replacement helper
 * returned NULL, PCRE2 reported an end offset before the start offset, or
 * the match failed with anything other than PCRE2_ERROR_NOMATCH).
 *
 * PCRE_G(error_code) is reset to PHP_PCRE_NO_ERROR before matching starts.
 */
1850 | | static zend_string *php_pcre_replace_func_impl(pcre_cache_entry *pce, zend_string *subject_str, |
1851 | | zend_fcall_info *fci, zend_fcall_info_cache *fcc, |
1852 | | size_t limit, size_t *replace_count, zend_long flags |
1853 | 84 | ) { |
1854 | 84 | uint32_t options; /* Execution options */ |
1855 | 84 | int count; /* Count of matched subpatterns */ |
1856 | 84 | zend_string **subpat_names; /* Array for named subpatterns */ |
1857 | 84 | uint32_t num_subpats; /* Number of captured subpatterns */ |
1858 | 84 | size_t alloc_len; /* Actual allocated length */ |
1859 | 84 | PCRE2_SIZE start_offset; /* Where the new search starts */ |
1860 | 84 | size_t last_end_offset; /* Where the last search ended */ |
1861 | 84 | const char *match, /* The current match */ |
1862 | 84 | *piece; /* The current piece of subject */ |
1863 | 84 | size_t result_len; /* Length of result */ |
1864 | 84 | zend_string *result; /* Result of replacement */ |
1865 | 84 | pcre2_match_data *match_data; |
1866 | 84 | bool old_mdata_used; |
1867 | | |
1868 | | /* Number of subpatterns: the capture groups plus capture group 0. The names table is only needed when the pattern has named groups. */ |
1869 | 84 | num_subpats = pce->capture_count + 1; |
1870 | 84 | if (pce->name_count > 0) { |
1871 | 0 | subpat_names = ensure_subpats_table(pce->name_count, pce); |
1872 | 0 | if (UNEXPECTED(!subpat_names)) { |
1873 | 0 | return NULL; |
1874 | 0 | } |
1875 | 84 | } else { |
1876 | 84 | subpat_names = NULL; |
1877 | 84 | } |
1878 | | |
1879 | 84 | alloc_len = 0; |
1880 | 84 | result = NULL; |
1881 | | |
1882 | | /* Initialize */ |
1883 | 84 | match = NULL; |
1884 | 84 | start_offset = 0; |
1885 | 84 | last_end_offset = 0; |
1886 | 84 | result_len = 0; |
1887 | 84 | PCRE_G(error_code) = PHP_PCRE_NO_ERROR; |
1888 | | |
/* Use the shared preallocated match data (mdata) when it is not already in
 * use and the pattern fits into PHP_PCRE_PREALLOC_MDATA_SIZE; otherwise
 * create match data just for this call. The previous mdata_used value is
 * remembered so it can be restored on every exit path below. */
1889 | 84 | old_mdata_used = mdata_used; |
1890 | 84 | if (!old_mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) { |
1891 | 84 | mdata_used = 1; |
1892 | 84 | match_data = mdata; |
1893 | 84 | } else { |
1894 | 0 | match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm)); |
1895 | 0 | if (!match_data) { |
1896 | 0 | PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR; |
1897 | 0 | mdata_used = old_mdata_used; |
1898 | 0 | return NULL; |
1899 | 0 | } |
1900 | 0 | } |
1901 | | |
/* Only the first match leaves the UTF check enabled (options == 0), and only
 * for patterns compiled with PCRE2_UTF; every later match in this function
 * passes PCRE2_NO_UTF_CHECK. */
1902 | 84 | options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK; |
1903 | | |
1904 | | /* Array of subpattern offsets */ |
1905 | 84 | PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data); |
1906 | | |
1907 | | /* Execute the regular expression. The JIT entry point is used for this first match only when options is non-zero, i.e. when no UTF check is requested. */ |
1908 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
1909 | | if ((pce->preg_options & PREG_JIT) && options) { |
1910 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), start_offset, |
1911 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
1912 | | } else |
1913 | | #endif |
1914 | 84 | count = pcre2_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), start_offset, |
1915 | 84 | options, match_data, mctx); |
1916 | | |
/* One iteration per match attempt. `count` always holds the outcome of the
 * most recent pcre2_match()/pcre2_jit_match() call. */
1917 | 204 | while (1) { |
1918 | 204 | piece = ZSTR_VAL(subject_str) + last_end_offset; |
1919 | | |
1920 | 204 | if (count >= 0 && limit) { |
1921 | | /* Check for too many substrings condition. */ |
1922 | 129 | if (UNEXPECTED(count == 0)) { |
1923 | 0 | php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings"); |
1924 | 0 | count = num_subpats; |
1925 | 0 | } |
1926 | | |
/* Also entered via goto from the empty-match retry further down. */
1927 | 129 | matched: |
1928 | 129 | if (UNEXPECTED(offsets[1] < offsets[0])) { |
1929 | 0 | PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR; |
1930 | 0 | if (result) { |
1931 | 0 | zend_string_release_ex(result, 0); |
1932 | 0 | result = NULL; |
1933 | 0 | } |
1934 | 0 | break; |
1935 | 0 | } |
1936 | | |
1937 | 129 | if (replace_count) { |
1938 | 129 | ++*replace_count; |
1939 | 129 | } |
1940 | | |
1941 | | /* Set the match location in subject */ |
1942 | 129 | match = ZSTR_VAL(subject_str) + offsets[0]; |
1943 | | |
1944 | | /* Length of needed storage */ |
1945 | 129 | size_t new_len = result_len + offsets[0] - last_end_offset; /* part before the match */ |
1946 | | |
1947 | | /* Use custom function to get replacement string and its length. */ |
1948 | 129 | zend_string *eval_result = preg_do_repl_func( |
1949 | 129 | fci, fcc, ZSTR_VAL(subject_str), offsets, subpat_names, num_subpats, count, |
1950 | 129 | pcre2_get_mark(match_data), flags); |
1951 | | |
/* NOTE(review): count is >= 0 on this path, so the error label hands
 * pcre_handle_exec_error() a match count rather than a PCRE2 error code;
 * confirm that helper maps it to the intended error state. */
1952 | 129 | if (UNEXPECTED(eval_result == NULL)) { |
1953 | 9 | goto error; |
1954 | 9 | } |
/* new_len becomes the total needed so far (text before the match plus the
 * replacement). When the buffer is too small it is re-sized from
 * zend_safe_address_guarded(2, new_len, ...), presumably twice the needed
 * length with overflow guarding (verify against the helper). */
1955 | 120 | new_len = zend_safe_address_guarded(1, ZSTR_LEN(eval_result) + ZSTR_MAX_OVERHEAD, new_len) -ZSTR_MAX_OVERHEAD; |
1956 | 120 | if (new_len >= alloc_len) { |
1957 | 99 | alloc_len = zend_safe_address_guarded(2, new_len, ZSTR_MAX_OVERHEAD) - ZSTR_MAX_OVERHEAD; |
1958 | 99 | if (result == NULL) { |
1959 | 60 | result = zend_string_alloc(alloc_len, 0); |
1960 | 60 | } else { |
1961 | 39 | result = zend_string_extend(result, alloc_len, 0); |
1962 | 39 | } |
1963 | 99 | } |
1964 | | |
1965 | 120 | if (match-piece > 0) { |
1966 | | /* copy the part of the string before the match */ |
1967 | 120 | memcpy(ZSTR_VAL(result) + result_len, piece, match-piece); |
1968 | 120 | result_len += (match-piece); |
1969 | 120 | } |
1970 | | |
1971 | | /* Copy the replacement returned by the helper into the buffer and release it. No terminating NUL is written here; that happens once at not_matched. */ |
1972 | 120 | memcpy(ZSTR_VAL(result) + result_len, ZSTR_VAL(eval_result), ZSTR_LEN(eval_result)); |
1973 | 120 | result_len += ZSTR_LEN(eval_result); |
1974 | 120 | zend_string_release_ex(eval_result, 0); |
1975 | | |
1976 | 120 | limit--; |
1977 | | |
1978 | | /* Advance to the next piece. */ |
1979 | 120 | start_offset = last_end_offset = offsets[1]; |
1980 | | |
1981 | | /* If we have matched an empty string, mimic what Perl's /g option does. |
1982 | | This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try |
1983 | | the match again at the same point. If this fails (picked up above) we |
1984 | | advance to the next character. */ |
1985 | 120 | if (start_offset == offsets[0]) { |
1986 | 0 | count = pcre2_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), start_offset, |
1987 | 0 | PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx); |
1988 |

1989 | 0 | piece = ZSTR_VAL(subject_str) + start_offset; |
1990 | 0 | if (count >= 0 && limit) { |
1991 | 0 | goto matched; |
1992 | 0 | } else if (count == PCRE2_ERROR_NOMATCH || limit == 0) { |
1993 | | /* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match, |
1994 | | this is not necessarily the end. We need to advance |
1995 | | the start offset, and continue. Fudge the offset values |
1996 | | to achieve this, unless we're already at the end of the string. */ |
1997 | 0 | if (start_offset < ZSTR_LEN(subject_str)) { |
1998 | 0 | size_t unit_len = calculate_unit_length(pce, piece); |
1999 | 0 | start_offset += unit_len; |
2000 | 0 | } else { |
2001 | 0 | goto not_matched; |
2002 | 0 | } |
2003 | 0 | } else { |
2004 | 0 | goto error; |
2005 | 0 | } |
2006 | 0 | } |
2007 | | |
2008 | 120 | } else if (count == PCRE2_ERROR_NOMATCH || limit == 0) { |
2009 | 75 | not_matched: |
/* Nothing was replaced at all: hand back the subject itself with an extra
 * reference instead of building a copy. */
2010 | 75 | if (result == NULL) { |
2011 | 15 | result = zend_string_copy(subject_str); |
2012 | 15 | break; |
2013 | 15 | } |
2014 | | /* now we know exactly how long it is */ |
2015 | 60 | size_t segment_len = ZSTR_LEN(subject_str) - last_end_offset; |
2016 | 60 | alloc_len = result_len + segment_len; |
2017 | 60 | result = zend_string_realloc(result, alloc_len, 0); |
2018 | | /* stick that last bit of string on our output */ |
2019 | 60 | memcpy(ZSTR_VAL(result) + result_len, piece, segment_len); |
2020 | 60 | result_len += segment_len; |
2021 | 60 | ZSTR_VAL(result)[result_len] = '\0'; |
2022 | 60 | ZSTR_LEN(result) = result_len; |
2023 | 60 | break; |
2024 | 75 | } else { |
2025 | 9 | error: |
2026 | 9 | pcre_handle_exec_error(count); |
2027 | 9 | if (result) { |
2028 | 0 | zend_string_release_ex(result, 0); |
2029 | 0 | result = NULL; |
2030 | 0 | } |
2031 | 9 | break; |
2032 | 0 | } |
/* Look for the next match starting at start_offset. */
2033 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
2034 | | if ((pce->preg_options & PREG_JIT)) { |
2035 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), start_offset, |
2036 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
2037 | | } else |
2038 | | #endif |
2039 | 120 | count = pcre2_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), start_offset, |
2040 | 120 | PCRE2_NO_UTF_CHECK, match_data, mctx); |
2041 | 120 | } |
/* Free match data that was created for this call only, then restore the
 * in-use flag of the shared match data. */
2042 | 84 | if (match_data != mdata) { |
2043 | 0 | pcre2_match_data_free(match_data); |
2044 | 0 | } |
2045 | 84 | mdata_used = old_mdata_used; |
2046 | | |
2047 | 84 | return result; |
2048 | 84 | } |
2049 | | |
/*
 * php_pcre_replace_func: compile `regex` (or fetch it from the cache) and run
 * the callback-based replacement on `subject_str`.
 *
 * Returns NULL when pcre_get_compiled_regex_cache() fails; otherwise returns
 * whatever php_pcre_replace_func_impl() returns. The cache entry's refcount
 * is raised for the duration of the replacement and dropped again afterwards.
 */
2050 | | static zend_always_inline zend_string *php_pcre_replace_func(zend_string *regex, |
2051 | | zend_string *subject_str, |
2052 | | zend_fcall_info *fci, zend_fcall_info_cache *fcc, |
2053 | | size_t limit, size_t *replace_count, zend_long flags) |
2054 | 99 | { |
2055 | 99 | pcre_cache_entry *pce; /* Compiled regular expression */ |
2056 | 99 | zend_string *result; /* Function result */ |
2057 | | |
2058 | | /* Compile regex or get it from cache. */ |
2059 | 99 | if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) { |
2060 | 15 | return NULL; |
2061 | 15 | } |
2062 | 84 | pce->refcount++; |
2063 | 84 | result = php_pcre_replace_func_impl(pce, subject_str, fci, fcc, limit, replace_count, flags); |
2064 | 84 | pce->refcount--; |
2065 | | |
2066 | 84 | return result; |
2067 | 99 | } |
2068 | | |
2069 | | /* {{{ php_pcre_replace_array */ |
/*
 * Applies every pattern in `regex`, in iteration order, to the subject: the
 * result of one replacement becomes the subject of the next.
 *
 * When `replace_ht` is given, replacements are taken from its element slots
 * in order (IS_UNDEF slots are skipped); once the slots are used up the empty
 * string is used. Otherwise `replace_str` (must be non-NULL) is used for
 * every pattern.
 *
 * The function takes its own reference on `subject_str` and releases the
 * current subject after each step. Returns the final string, or NULL as soon
 * as one php_pcre_replace() call returns NULL.
 */
2070 | | static zend_string *php_pcre_replace_array(HashTable *regex, |
2071 | | zend_string *replace_str, HashTable *replace_ht, |
2072 | | zend_string *subject_str, size_t limit, size_t *replace_count) |
2073 | 0 | { |
2074 | 0 | zval *regex_entry; |
2075 | 0 | zend_string *result; |
2076 |

2077 | 0 | zend_string_addref(subject_str); |
2078 |

2079 | 0 | if (replace_ht) { |
2080 | 0 | uint32_t replace_idx = 0; |
2081 | | |
2082 | | /* For each entry in the regex array, get the entry */ |
2083 | 0 | ZEND_HASH_FOREACH_VAL(regex, regex_entry) { |
2084 | | /* Make sure we're dealing with strings. */ |
2085 | 0 | zend_string *tmp_regex_str; |
2086 | 0 | zend_string *regex_str = zval_get_tmp_string(regex_entry, &tmp_regex_str); |
2087 | 0 | zend_string *replace_entry_str, *tmp_replace_entry_str; |
2088 | 0 | zval *zv; |
2089 | | |
2090 | | /* Get the next used replacement slot, or the empty string once replace_idx reaches nNumUsed. */ |
2091 | 0 | while (1) { |
2092 | 0 | if (replace_idx == replace_ht->nNumUsed) { |
2093 | 0 | replace_entry_str = ZSTR_EMPTY_ALLOC(); |
2094 | 0 | tmp_replace_entry_str = NULL; |
2095 | 0 | break; |
2096 | 0 | } |
2097 | 0 | zv = ZEND_HASH_ELEMENT(replace_ht, replace_idx); |
2098 | 0 | replace_idx++; |
2099 | 0 | if (Z_TYPE_P(zv) != IS_UNDEF) { |
2100 | 0 | replace_entry_str = zval_get_tmp_string(zv, &tmp_replace_entry_str); |
2101 | 0 | break; |
2102 | 0 | } |
2103 | 0 | } |
2104 | | |
2105 | | /* Do the actual replacement and put the result back into subject_str |
2106 | | for further replacements. */ |
2107 | 0 | result = php_pcre_replace(regex_str, subject_str, ZSTR_VAL(subject_str), |
2108 | 0 | ZSTR_LEN(subject_str), replace_entry_str, limit, replace_count); |
2109 | 0 | zend_tmp_string_release(tmp_replace_entry_str); |
2110 | 0 | zend_tmp_string_release(tmp_regex_str); |
2111 | 0 | zend_string_release_ex(subject_str, 0); |
2112 | 0 | subject_str = result; |
2113 | 0 | if (UNEXPECTED(result == NULL)) { |
2114 | 0 | break; |
2115 | 0 | } |
2116 | 0 | } ZEND_HASH_FOREACH_END(); |
2117 |

2118 | 0 | } else { |
2119 | 0 | ZEND_ASSERT(replace_str != NULL); |
2120 | | |
2121 | | /* For each entry in the regex array, get the entry */ |
2122 | 0 | ZEND_HASH_FOREACH_VAL(regex, regex_entry) { |
2123 | | /* Make sure we're dealing with strings. */ |
2124 | 0 | zend_string *tmp_regex_str; |
2125 | 0 | zend_string *regex_str = zval_get_tmp_string(regex_entry, &tmp_regex_str); |
2126 | | |
2127 | | /* Do the actual replacement and put the result back into subject_str |
2128 | | for further replacements. */ |
2129 | 0 | result = php_pcre_replace(regex_str, subject_str, ZSTR_VAL(subject_str), |
2130 | 0 | ZSTR_LEN(subject_str), replace_str, limit, replace_count); |
2131 | 0 | zend_tmp_string_release(tmp_regex_str); |
2132 | 0 | zend_string_release_ex(subject_str, 0); |
2133 | 0 | subject_str = result; |
2134 |

2135 | 0 | if (UNEXPECTED(result == NULL)) { |
2136 | 0 | break; |
2137 | 0 | } |
2138 | 0 | } ZEND_HASH_FOREACH_END(); |
2139 | 0 | } |
2140 |

2141 | 0 | return subject_str; |
2142 | 0 | } |
2143 | | /* }}} */ |
2144 | | |
2145 | | /* {{{ php_replace_in_subject */ |
/*
 * Replaces in a single subject string, dispatching on the pattern kind:
 * a string pattern (`regex_str`, requires a non-NULL `replace_str`) goes
 * straight to php_pcre_replace(); otherwise `regex_ht` must be set and the
 * work is delegated to php_pcre_replace_array().
 *
 * Returns whatever the chosen helper returns (NULL on failure).
 */
2146 | | static zend_always_inline zend_string *php_replace_in_subject( |
2147 | | zend_string *regex_str, HashTable *regex_ht, |
2148 | | zend_string *replace_str, HashTable *replace_ht, |
2149 | | zend_string *subject, size_t limit, size_t *replace_count) |
2150 | 117 | { |
2151 | 117 | zend_string *result; |
2152 | | |
2153 | 117 | if (regex_str) { |
2154 | 117 | ZEND_ASSERT(replace_str != NULL); |
2155 | 117 | result = php_pcre_replace(regex_str, subject, ZSTR_VAL(subject), ZSTR_LEN(subject), |
2156 | 117 | replace_str, limit, replace_count); |
2157 | 117 | } else { |
2158 | 0 | ZEND_ASSERT(regex_ht != NULL); |
2159 | 0 | result = php_pcre_replace_array(regex_ht, replace_str, replace_ht, subject, |
2160 | 0 | limit, replace_count); |
2161 | 0 | } |
2162 | 117 | return result; |
2163 | 117 | } |
2164 | | /* }}} */ |
2165 | | |
/*
 * php_replace_in_subject_func: callback-based replacement in one subject
 * string, for either a single pattern or an array of patterns.
 *
 * With `regex_str` the result of php_pcre_replace_func() is returned as is.
 * With `regex_ht` the patterns are applied in iteration order, each result
 * becoming the next subject; the function holds its own reference on the
 * current subject and releases it after every step. The loop stops early
 * when a pattern entry cannot be converted to a string (the current subject
 * is then returned) or when a replacement returns NULL (NULL is returned).
 */
2166 | | static zend_string *php_replace_in_subject_func(zend_string *regex_str, const HashTable *regex_ht, |
2167 | | zend_fcall_info *fci, zend_fcall_info_cache *fcc, |
2168 | | zend_string *subject, size_t limit, size_t *replace_count, zend_long flags) |
2169 | 99 | { |
2170 | 99 | zend_string *result; |
2171 | | |
2172 | 99 | if (regex_str) { |
2173 | 99 | result = php_pcre_replace_func(regex_str, subject, fci, fcc, limit, replace_count, flags); |
2174 | 99 | return result; |
2175 | 99 | } else { |
2176 | | /* If regex is an array */ |
2177 | 0 | zval *regex_entry; |
2178 |

2179 | 0 | ZEND_ASSERT(regex_ht != NULL); |
2180 |

2181 | 0 | zend_string_addref(subject); |
2182 | | |
2183 | | /* For each entry in the regex array, get the entry */ |
2184 | 0 | ZEND_HASH_FOREACH_VAL(regex_ht, regex_entry) { |
2185 | | /* Make sure we're dealing with strings. */ |
2186 | 0 | zend_string *tmp_regex_entry_str; |
2187 | 0 | zend_string *regex_entry_str = zval_try_get_tmp_string(regex_entry, &tmp_regex_entry_str); |
2188 | 0 | if (UNEXPECTED(regex_entry_str == NULL)) { |
2189 | 0 | break; |
2190 | 0 | } |
2191 | | |
2192 | | /* Do the actual replacement and put the result back into subject |
2193 | | for further replacements. */ |
2194 | 0 | result = php_pcre_replace_func( |
2195 | 0 | regex_entry_str, subject, fci, fcc, limit, replace_count, flags); |
2196 | 0 | zend_tmp_string_release(tmp_regex_entry_str); |
2197 | 0 | zend_string_release(subject); |
2198 | 0 | subject = result; |
2199 | 0 | if (UNEXPECTED(result == NULL)) { |
2200 | 0 | break; |
2201 | 0 | } |
2202 | 0 | } ZEND_HASH_FOREACH_END(); |
2203 |

2204 | 0 | return subject; |
2205 | 0 | } |
2206 | 99 | } |
2207 | | |
/*
 * php_preg_replace_func_impl: shared body of the callback-based replace
 * functions. Fills `return_value` and returns the number of replacements.
 *
 * String subject: return_value becomes the resulting string, or NULL when
 * the replacement returned NULL.
 * Array subject (`subject_ht`): return_value becomes an array holding one
 * result per subject entry under the entry's original string or integer key;
 * entries whose replacement returned NULL are left out, and iteration stops
 * at the first entry that cannot be converted to a string.
 *
 * `limit_val` is passed on to a size_t parameter, so a negative value such as
 * the callers' default of -1 turns into a very large limit.
 */
2208 | | static size_t php_preg_replace_func_impl(zval *return_value, |
2209 | | zend_string *regex_str, const HashTable *regex_ht, |
2210 | | zend_fcall_info *fci, zend_fcall_info_cache *fcc, |
2211 | | zend_string *subject_str, const HashTable *subject_ht, zend_long limit_val, zend_long flags) |
2212 | 99 | { |
2213 | 99 | zend_string *result; |
2214 | 99 | size_t replace_count = 0; |
2215 | | |
2216 | 99 | if (subject_str) { |
2217 | 99 | result = php_replace_in_subject_func( |
2218 | 99 | regex_str, regex_ht, fci, fcc, subject_str, limit_val, &replace_count, flags); |
2219 | 99 | if (result != NULL) { |
2220 | 75 | RETVAL_STR(result); |
2221 | 75 | } else { |
2222 | 24 | RETVAL_NULL(); |
2223 | 24 | } |
2224 | 99 | } else { |
2225 | | /* if subject is an array */ |
2226 | 0 | zval *subject_entry, zv; |
2227 | 0 | zend_string *string_key; |
2228 | 0 | zend_ulong num_key; |
2229 |

2230 | 0 | ZEND_ASSERT(subject_ht != NULL); |
2231 |

2232 | 0 | array_init_size(return_value, zend_hash_num_elements(subject_ht)); |
2233 | 0 | HashTable *return_value_ht = Z_ARRVAL_P(return_value); |
2234 | | |
2235 | | /* For each subject entry, convert it to string, then perform replacement |
2236 | | and add the result to the return_value array. */ |
2237 | 0 | ZEND_HASH_FOREACH_KEY_VAL(subject_ht, num_key, string_key, subject_entry) { |
2238 | 0 | zend_string *tmp_subject_entry_str; |
2239 | 0 | zend_string *subject_entry_str = zval_try_get_tmp_string(subject_entry, &tmp_subject_entry_str); |
2240 | 0 | if (UNEXPECTED(subject_entry_str == NULL)) { |
2241 | 0 | break; |
2242 | 0 | } |
2243 | | |
2244 | 0 | result = php_replace_in_subject_func( |
2245 | 0 | regex_str, regex_ht, fci, fcc, subject_entry_str, limit_val, &replace_count, flags); |
2246 | 0 | if (result != NULL) { |
2247 | | /* Add to return array */ |
2248 | 0 | ZVAL_STR(&zv, result); |
2249 | 0 | if (string_key) { |
2250 | 0 | zend_hash_add_new(return_value_ht, string_key, &zv); |
2251 | 0 | } else { |
2252 | 0 | zend_hash_index_add_new(return_value_ht, num_key, &zv); |
2253 | 0 | } |
2254 | 0 | } |
2255 | 0 | zend_tmp_string_release(tmp_subject_entry_str); |
2256 | 0 | } ZEND_HASH_FOREACH_END(); |
2257 | 0 | } |
2258 | | |
2259 | 99 | return replace_count; |
2260 | 99 | } |
2261 | | |
/*
 * _preg_replace_common: shared body of the string-replacement entry points,
 * operating on already parsed arguments. Exactly one of each *_ht / *_str
 * pair is expected to be set.
 *
 * Raises an argument type error when the replacement is an array but the
 * pattern is a string.
 *
 * String subject: return_value becomes the resulting string, or NULL when
 * the replacement failed.
 * Array subject: return_value becomes an array of results stored under the
 * subjects' original keys; failed entries are left out.
 * With `is_filter` set, results for which the replace count did not increase
 * are dropped as well (NULL for a string subject, omitted for an array).
 *
 * When `zcount` is non-NULL the total replace count is assigned to it.
 */
2262 | | static void _preg_replace_common( |
2263 | | zval *return_value, |
2264 | | HashTable *regex_ht, zend_string *regex_str, |
2265 | | HashTable *replace_ht, zend_string *replace_str, |
2266 | | HashTable *subject_ht, zend_string *subject_str, |
2267 | | zend_long limit, |
2268 | | zval *zcount, |
2269 | | bool is_filter |
2270 | 117 | ) { |
2271 | 117 | size_t replace_count = 0; |
2272 | 117 | zend_string *result; |
2273 | 117 | size_t old_replace_count; |
2274 | | |
2275 | | /* If replace is an array then the regex argument needs to also be an array */ |
2276 | 117 | if (replace_ht && !regex_ht) { |
2277 | 0 | zend_argument_type_error(1, "must be of type array when argument #2 ($replacement) is an array, string given"); |
2278 | 0 | RETURN_THROWS(); |
2279 | 0 | } |
2280 | | |
2281 | 117 | if (subject_str) { |
2282 | 117 | old_replace_count = replace_count; |
2283 | 117 | result = php_replace_in_subject(regex_str, regex_ht, replace_str, replace_ht, |
2284 | 117 | subject_str, limit, &replace_count); |
2285 | 117 | if (result != NULL) { |
2286 | 117 | if (!is_filter || replace_count > old_replace_count) { |
2287 | 117 | RETVAL_STR(result); |
2288 | 117 | } else { |
2289 | 0 | zend_string_release_ex(result, 0); |
2290 | 0 | RETVAL_NULL(); |
2291 | 0 | } |
2292 | 117 | } else { |
2293 | 0 | RETVAL_NULL(); |
2294 | 0 | } |
2295 | 117 | } else { |
2296 | | /* if subject is an array */ |
2297 | 0 | zval *subject_entry, zv; |
2298 | 0 | zend_string *string_key; |
2299 | 0 | zend_ulong num_key; |
2300 |

2301 | 0 | ZEND_ASSERT(subject_ht != NULL); |
2302 |

2303 | 0 | array_init_size(return_value, zend_hash_num_elements(subject_ht)); |
2304 | 0 | HashTable *return_value_ht = Z_ARRVAL_P(return_value); |
2305 | | |
2306 | | /* For each subject entry, convert it to string, then perform replacement |
2307 | | and add the result to the return_value array. */ |
2308 | 0 | ZEND_HASH_FOREACH_KEY_VAL(subject_ht, num_key, string_key, subject_entry) { |
/* Snapshot the running total so this entry's own replacements can be
 * detected for the is_filter check below. */
2309 | 0 | old_replace_count = replace_count; |
2310 | 0 | zend_string *tmp_subject_entry_str; |
2311 | 0 | zend_string *subject_entry_str = zval_get_tmp_string(subject_entry, &tmp_subject_entry_str); |
2312 | 0 | result = php_replace_in_subject(regex_str, regex_ht, replace_str, replace_ht, |
2313 | 0 | subject_entry_str, limit, &replace_count); |
2314 |

2315 | 0 | if (result != NULL) { |
2316 | 0 | if (!is_filter || replace_count > old_replace_count) { |
2317 | | /* Add to return array */ |
2318 | 0 | ZVAL_STR(&zv, result); |
2319 | 0 | if (string_key) { |
2320 | 0 | zend_hash_add_new(return_value_ht, string_key, &zv); |
2321 | 0 | } else { |
2322 | 0 | zend_hash_index_add_new(return_value_ht, num_key, &zv); |
2323 | 0 | } |
2324 | 0 | } else { |
2325 | 0 | zend_string_release_ex(result, 0); |
2326 | 0 | } |
2327 | 0 | } |
2328 | 0 | zend_tmp_string_release(tmp_subject_entry_str); |
2329 | 0 | } ZEND_HASH_FOREACH_END(); |
2330 | 0 | } |
2331 | | |
2332 | 117 | if (zcount) { |
2333 | 0 | ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count); |
2334 | 0 | } |
2335 | 117 | } |
2336 | | |
2337 | | /* {{{ preg_replace_common */ |
/*
 * Parses the userland arguments (3 to 5: pattern, replacement and subject,
 * each an array or a string, then the optional limit, default -1, and the
 * optional count zval) and forwards them to _preg_replace_common() together
 * with `is_filter`.
 */
2338 | | static void preg_replace_common(INTERNAL_FUNCTION_PARAMETERS, bool is_filter) |
2339 | 120 | { |
2340 | 120 | zend_string *regex_str, *replace_str, *subject_str; |
2341 | 120 | HashTable *regex_ht, *replace_ht, *subject_ht; |
2342 | 120 | zend_long limit = -1; |
2343 | 120 | zval *zcount = NULL; |
2344 | | |
2345 | | /* Get function parameters and do error-checking. */ |
2346 | 360 | ZEND_PARSE_PARAMETERS_START(3, 5) |
2347 | 600 | Z_PARAM_ARRAY_HT_OR_STR(regex_ht, regex_str) |
2348 | 600 | Z_PARAM_ARRAY_HT_OR_STR(replace_ht, replace_str) |
2349 | 600 | Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str) |
2350 | 600 | Z_PARAM_OPTIONAL |
2351 | 600 | Z_PARAM_LONG(limit) |
2352 | 342 | Z_PARAM_ZVAL(zcount) |
2353 | 342 | ZEND_PARSE_PARAMETERS_END(); |
2354 | | |
2355 | 117 | _preg_replace_common( |
2356 | 117 | return_value, |
2357 | 117 | regex_ht, regex_str, |
2358 | 117 | replace_ht, replace_str, |
2359 | 117 | subject_ht, subject_str, |
2360 | 117 | limit, zcount, is_filter); |
2361 | 117 | } |
2362 | | /* }}} */ |
2363 | | |
2364 | | /* {{{ Perform Perl-style regular expression replacement. Delegates to preg_replace_common() with is_filter = false. */ |
2365 | | PHP_FUNCTION(preg_replace) |
2366 | 120 | { |
2367 | 120 | preg_replace_common(INTERNAL_FUNCTION_PARAM_PASSTHRU, false); |
2368 | 120 | } |
2369 | | /* }}} */ |
2370 | | |
/*
 * Frameless three-argument variant of preg_replace: fetches pattern,
 * replacement and subject (each an array or a string) and calls
 * _preg_replace_common() with limit -1, no count zval and is_filter = false.
 * The statements after the flf_clean label release the temporaries created
 * while fetching the arguments; presumably the Z_FLF_PARAM_* fetch macros
 * jump to that label on failure (confirm against their definitions).
 */
2371 | | ZEND_FRAMELESS_FUNCTION(preg_replace, 3) |
2372 | 0 | { |
2373 | 0 | zend_string *regex_str, *replace_str, *subject_str; |
2374 | 0 | HashTable *regex_ht, *replace_ht, *subject_ht; |
2375 | 0 | zval regex_tmp, replace_tmp, subject_tmp; |
2376 |

2377 | 0 | Z_FLF_PARAM_ARRAY_HT_OR_STR(1, regex_ht, regex_str, regex_tmp); |
2378 | 0 | Z_FLF_PARAM_ARRAY_HT_OR_STR(2, replace_ht, replace_str, replace_tmp); |
2379 | 0 | Z_FLF_PARAM_ARRAY_HT_OR_STR(3, subject_ht, subject_str, subject_tmp); |
2380 |

2381 | 0 | _preg_replace_common( |
2382 | 0 | return_value, |
2383 | 0 | regex_ht, regex_str, |
2384 | 0 | replace_ht, replace_str, |
2385 | 0 | subject_ht, subject_str, |
2386 | 0 | /* limit */ -1, /* zcount */ NULL, /* is_filter */ false); |
2387 |

2388 | 0 | flf_clean:; |
2389 | 0 | Z_FLF_PARAM_FREE_STR(1, regex_tmp); |
2390 | 0 | Z_FLF_PARAM_FREE_STR(2, replace_tmp); |
2391 | 0 | Z_FLF_PARAM_FREE_STR(3, subject_tmp); |
2392 | 0 | } |
2393 | | |
2394 | | /* {{{ Perform Perl-style regular expression replacement using replacement callback. */ |
/*
 * Accepts 3 to 6 arguments: pattern (array or string), callback, subject
 * (array or string), then the optional limit (default -1), count zval and
 * flags (default 0). The work is done by php_preg_replace_func_impl(), which
 * fills return_value; its returned replace count is assigned to the count
 * zval when one was passed.
 */
2395 | | PHP_FUNCTION(preg_replace_callback) |
2396 | 99 | { |
2397 | 99 | zval *zcount = NULL; |
2398 | 99 | zend_string *regex_str; |
2399 | 99 | HashTable *regex_ht; |
2400 | 99 | zend_string *subject_str; |
2401 | 99 | HashTable *subject_ht; |
2402 | 99 | zend_long limit = -1, flags = 0; |
2403 | 99 | size_t replace_count; |
2404 | 99 | zend_fcall_info fci = empty_fcall_info; |
2405 | 99 | zend_fcall_info_cache fcc = empty_fcall_info_cache; |
2406 | | |
2407 | | /* Get function parameters and do error-checking. */ |
2408 | 297 | ZEND_PARSE_PARAMETERS_START(3, 6) |
2409 | 495 | Z_PARAM_ARRAY_HT_OR_STR(regex_ht, regex_str) |
2410 | 495 | Z_PARAM_FUNC(fci, fcc) |
2411 | 594 | Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str) |
2412 | 594 | Z_PARAM_OPTIONAL |
2413 | 594 | Z_PARAM_LONG(limit) |
2414 | 0 | Z_PARAM_ZVAL(zcount) |
2415 | 0 | Z_PARAM_LONG(flags) |
2416 | 99 | ZEND_PARSE_PARAMETERS_END(); |
2417 | | |
2418 | 99 | replace_count = php_preg_replace_func_impl(return_value, regex_str, regex_ht, |
2419 | 99 | &fci, &fcc, |
2420 | 99 | subject_str, subject_ht, limit, flags); |
2421 | 99 | if (zcount) { |
2422 | 0 | ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count); |
2423 | 0 | } |
2424 | 99 | } |
2425 | | /* }}} */ |
2426 | | |
2427 | | /* {{{ Perform Perl-style regular expression replacement using an array that maps patterns to callbacks. |
 | | * The first argument is an array whose string keys are the patterns and whose values are the |
 | | * callbacks. The patterns are applied one after another: the result produced for one pattern |
 | | * becomes the subject for the next one. The replacement counts of all patterns are summed and |
 | | * stored into the count argument when one was passed. A non-string key or a non-callable value |
 | | * raises an argument type error for argument 1; a NULL intermediate result makes the function |
 | | * return NULL. */ |
2428 | | PHP_FUNCTION(preg_replace_callback_array) |
2429 | 0 | { |
2430 | 0 | zval *replace, *zcount = NULL; |
2431 | 0 | HashTable *pattern, *subject_ht; |
2432 | 0 | zend_string *subject_str, *str_idx_regex; |
2433 | 0 | zend_long limit = -1, flags = 0; |
2434 | 0 | size_t replace_count = 0; |
2435 | | |
2436 | | /* Get function parameters and do error-checking (2 required, 3 optional). */ |
2437 | 0 | ZEND_PARSE_PARAMETERS_START(2, 5) |
2438 | 0 | Z_PARAM_ARRAY_HT(pattern) |
2439 | 0 | Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str) |
2440 | 0 | Z_PARAM_OPTIONAL |
2441 | 0 | Z_PARAM_LONG(limit) |
2442 | 0 | Z_PARAM_ZVAL(zcount) |
2443 | 0 | Z_PARAM_LONG(flags) |
2444 | 0 | ZEND_PARSE_PARAMETERS_END(); |
2445 | | |
2446 | 0 | if (subject_ht) { /* take our own reference: the current subject is released after every pattern and on error */ |
2447 | 0 | GC_TRY_ADDREF(subject_ht); |
2448 | 0 | } else { |
2449 | 0 | GC_TRY_ADDREF(subject_str); |
2450 | 0 | } |
2451 | |
|
2452 | 0 | ZEND_HASH_FOREACH_STR_KEY_VAL(pattern, str_idx_regex, replace) { |
2453 | 0 | if (!str_idx_regex) { /* entry has no string key, so there is no pattern to use */ |
2454 | 0 | zend_argument_type_error(1, "must contain only string patterns as keys"); |
2455 | 0 | goto error; |
2456 | 0 | } |
2457 | | |
2458 | 0 | zend_fcall_info_cache fcc = empty_fcall_info_cache; |
2459 | 0 | zend_fcall_info fci = empty_fcall_info; |
2460 | 0 | fci.size = sizeof(zend_fcall_info); |
2461 | | /* Copy potential trampoline */ |
2462 | 0 | ZVAL_COPY_VALUE(&fci.function_name, replace); |
2463 | |
|
2464 | 0 | if (!zend_is_callable_ex(replace, NULL, 0, NULL, &fcc, NULL)) { |
2465 | 0 | zend_argument_type_error(1, "must contain only valid callbacks"); |
2466 | 0 | goto error; |
2467 | 0 | } |
2468 | | |
2469 | 0 | zval retval; |
2470 | 0 | replace_count += php_preg_replace_func_impl(&retval, str_idx_regex, /* regex_ht */ NULL, &fci, &fcc, |
2471 | 0 | subject_str, subject_ht, limit, flags); |
2472 | 0 | zend_release_fcall_info_cache(&fcc); |
2473 | |
|
2474 | 0 | switch (Z_TYPE(retval)) { /* the result of this pattern replaces the current subject */ |
2475 | 0 | case IS_ARRAY: |
2476 | 0 | ZEND_ASSERT(subject_ht); |
2477 | 0 | zend_array_release(subject_ht); |
2478 | 0 | subject_ht = Z_ARR(retval); |
2479 | 0 | break; |
2480 | 0 | case IS_STRING: |
2481 | 0 | ZEND_ASSERT(subject_str); |
2482 | 0 | zend_string_release(subject_str); |
2483 | 0 | subject_str = Z_STR(retval); |
2484 | 0 | break; |
2485 | 0 | case IS_NULL: /* no usable result: return NULL and release the subject still held */ |
2486 | 0 | RETVAL_NULL(); |
2487 | 0 | goto error; |
2488 | 0 | default: ZEND_UNREACHABLE(); |
2489 | 0 | } |
2490 | | |
2491 | 0 | if (EG(exception)) { /* an exception is pending: do not process the remaining patterns */ |
2492 | 0 | goto error; |
2493 | 0 | } |
2494 | 0 | } ZEND_HASH_FOREACH_END(); |
2495 | | |
2496 | 0 | if (zcount) { |
2497 | 0 | ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count); |
2498 | 0 | } |
2499 | |
|
2500 | 0 | if (subject_ht) { /* success: the reference held on the final subject is handed to return_value */ |
2501 | 0 | RETVAL_ARR(subject_ht); |
2502 | | // Unset the type_flags of immutable arrays to prevent the VM from performing refcounting |
2503 | 0 | if (GC_FLAGS(subject_ht) & IS_ARRAY_IMMUTABLE) { |
2504 | 0 | Z_TYPE_FLAGS_P(return_value) = 0; |
2505 | 0 | } |
2506 | 0 | return; |
2507 | 0 | } else { |
2508 | 0 | RETURN_STR(subject_str); |
2509 | 0 | } |
2510 | | |
2511 | 0 | error: /* reached only via goto: drop the reference held on the current subject */ |
2512 | 0 | if (subject_ht) { |
2513 | 0 | zend_array_release(subject_ht); |
2514 | 0 | } else { |
2515 | 0 | zend_string_release(subject_str); |
2516 | 0 | } |
2517 | 0 | } |
2518 | | /* }}} */ |
2519 | | |
2520 | | /* {{{ Perform Perl-style regular expression replacement and only return matches. |
 | | * Thin wrapper: forwards its arguments to preg_replace_common() with the trailing boolean |
 | | * set to true (presumably the is_filter switch; confirm against preg_replace_common). */ |
2521 | | PHP_FUNCTION(preg_filter) |
2522 | 0 | { |
2523 | 0 | preg_replace_common(INTERNAL_FUNCTION_PARAM_PASSTHRU, true); |
2524 | 0 | } |
2525 | | /* }}} */ |
2526 | | |
2527 | | /* {{{ Split string into an array using a perl-style regular expression as a delimiter. |
 | | * Takes the pattern and the subject as strings plus an optional limit (defaults to -1) and |
 | | * flags (default 0). Returns false when the compiled pattern cannot be obtained; otherwise the |
 | | * result is produced by php_pcre_split_impl(). */ |
2528 | | PHP_FUNCTION(preg_split) |
2529 | 0 | { |
2530 | 0 | zend_string *regex; /* Regular expression */ |
2531 | 0 | zend_string *subject; /* String to match against */ |
2532 | 0 | zend_long limit_val = -1;/* Integer value of limit */ |
2533 | 0 | zend_long flags = 0; /* Match control flags */ |
2534 | 0 | pcre_cache_entry *pce; /* Compiled regular expression */ |
2535 | | |
2536 | | /* Get function parameters and do error checking */ |
2537 | 0 | ZEND_PARSE_PARAMETERS_START(2, 4) |
2538 | 0 | Z_PARAM_STR(regex) |
2539 | 0 | Z_PARAM_STR(subject) |
2540 | 0 | Z_PARAM_OPTIONAL |
2541 | 0 | Z_PARAM_LONG(limit_val) |
2542 | 0 | Z_PARAM_LONG(flags) |
2543 | 0 | ZEND_PARSE_PARAMETERS_END(); |
2544 | | |
2545 | | /* Compile regex or get it from cache. */ |
2546 | 0 | if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) { |
2547 | 0 | RETURN_FALSE; |
2548 | 0 | } |
2549 | | |
2550 | 0 | pce->refcount++; /* raised for the duration of the call; presumably keeps the cached entry alive while in use */ |
2551 | 0 | php_pcre_split_impl(pce, subject, return_value, limit_val, flags); |
2552 | 0 | pce->refcount--; |
2553 | 0 | } |
2554 | | /* }}} */ |
2555 | | |
2556 | | /* {{{ php_pcre_split_impl |
 | | * Splits subject_str at every match of the compiled pattern pce and stores the pieces in |
 | | * return_value as an array; return_value becomes false when an error code was recorded. |
 | | * flags: PREG_SPLIT_NO_EMPTY skips empty pieces, PREG_SPLIT_DELIM_CAPTURE also appends the |
 | | * capture groups of every delimiter match, PREG_SPLIT_OFFSET_CAPTURE adds entries through |
 | | * add_offset_pair() instead of plain strings. |
 | | * limit_val: -1 and 0 both mean no limit; any other value <= 1 skips matching altogether, so |
 | | * the whole subject ends up as the only piece. */ |
2557 | | PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value, |
2558 | | zend_long limit_val, zend_long flags) |
2559 | 0 | { |
2560 | 0 | uint32_t options; /* Execution options */ |
2561 | 0 | int count; /* Count of matched subpatterns */ |
2562 | 0 | PCRE2_SIZE start_offset; /* Where the new search starts */ |
2563 | 0 | PCRE2_SIZE last_match_offset; /* Location of last match */ |
2564 | 0 | uint32_t no_empty; /* If NO_EMPTY flag is set */ |
2565 | 0 | uint32_t delim_capture; /* If delimiters should be captured */ |
2566 | 0 | uint32_t offset_capture; /* If offsets should be captured */ |
2567 | 0 | uint32_t num_subpats; /* Number of captured subpatterns */ |
2568 | 0 | zval tmp; |
2569 | 0 | pcre2_match_data *match_data; |
2570 | 0 | bool old_mdata_used; |
2571 | 0 | char *subject = ZSTR_VAL(subject_str); |
2572 | |
|
2573 | 0 | no_empty = flags & PREG_SPLIT_NO_EMPTY; |
2574 | 0 | delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE; |
2575 | 0 | offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE; |
2576 | | |
2577 | | /* Initialize return value */ |
2578 | 0 | array_init(return_value); |
2579 | 0 | HashTable *return_value_ht = Z_ARRVAL_P(return_value); |
2580 | | |
2581 | | /* Number of capture groups plus one for group 0; nothing is allocated here. */ |
2582 | 0 | num_subpats = pce->capture_count + 1; |
2583 | | |
2584 | | /* Start at the beginning of the string */ |
2585 | 0 | start_offset = 0; |
2586 | 0 | last_match_offset = 0; |
2587 | 0 | PCRE_G(error_code) = PHP_PCRE_NO_ERROR; |
2588 | |
|
2589 | 0 | if (limit_val == -1) { |
2590 | | /* pass */ |
2591 | 0 | } else if (limit_val == 0) { /* a limit of 0 is treated exactly like -1 */ |
2592 | 0 | limit_val = -1; |
2593 | 0 | } else if (limit_val <= 1) { /* 1, or a negative value other than -1: no matching is attempted */ |
2594 | 0 | goto last; |
2595 | 0 | } |
2596 | | |
2597 | 0 | old_mdata_used = mdata_used; /* remembered so the flag can be restored once matching is done */ |
2598 | 0 | if (!old_mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) { |
2599 | 0 | mdata_used = true; |
2600 | 0 | match_data = mdata; |
2601 | 0 | } else { /* mdata is already in use, or the pattern has too many groups for it */ |
2602 | 0 | match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm)); |
2603 | 0 | if (!match_data) { |
2604 | 0 | PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR; |
2605 | 0 | zval_ptr_dtor(return_value); |
2606 | 0 | RETURN_FALSE; |
2607 | 0 | } |
2608 | 0 | } |
2609 | | |
2610 | 0 | options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK; |
2611 | | |
2612 | | /* Array of subpattern offsets */ |
2613 | 0 | PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data); |
2614 | |
|
2615 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
2616 | | if ((pce->preg_options & PREG_JIT) && options) { |
2617 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset, |
2618 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
2619 | | } else |
2620 | | #endif |
2621 | 0 | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset, |
2622 | 0 | options, match_data, mctx); |
2623 | |
|
2624 | 0 | while (1) { |
2625 | | /* If something matched */ |
2626 | 0 | if (count >= 0) { |
2627 | | /* Check for too many substrings condition. */ |
2628 | 0 | if (UNEXPECTED(count == 0)) { |
2629 | 0 | php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings"); |
2630 | 0 | count = num_subpats; |
2631 | 0 | } |
2632 | |
|
2633 | 0 | matched: /* also entered by goto from the empty-match retry further down */ |
2634 | 0 | if (UNEXPECTED(offsets[1] < offsets[0])) { /* match end lies before match start: flag an internal error */ |
2635 | 0 | PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR; |
2636 | 0 | break; |
2637 | 0 | } |
2638 | | |
2639 | 0 | if (!no_empty || offsets[0] != last_match_offset) { |
2640 | 0 | if (offset_capture) { |
2641 | | /* Add (match, offset) pair to the return value */ |
2642 | 0 | add_offset_pair( |
2643 | 0 | return_value_ht, subject, last_match_offset, offsets[0], |
2644 | 0 | NULL, 0); |
2645 | 0 | } else { |
2646 | | /* Add the piece to the return value */ |
2647 | 0 | populate_match_value_str(&tmp, subject, last_match_offset, offsets[0]); |
2648 | 0 | zend_hash_next_index_insert_new(return_value_ht, &tmp); |
2649 | 0 | } |
2650 | | |
2651 | | /* One less left to do */ |
2652 | 0 | if (limit_val != -1) |
2653 | 0 | limit_val--; |
2654 | 0 | } |
2655 | |
|
2656 | 0 | if (delim_capture) { |
2657 | 0 | size_t i; |
2658 | 0 | for (i = 1; i < count; i++) { /* group 0 is skipped; count is non-negative whenever this is reached */ |
2659 | | /* If we have matched a delimiter */ |
2660 | 0 | if (!no_empty || offsets[2*i] != offsets[2*i+1]) { |
2661 | 0 | if (offset_capture) { |
2662 | 0 | add_offset_pair( |
2663 | 0 | return_value_ht, subject, offsets[2*i], offsets[2*i+1], NULL, 0); |
2664 | 0 | } else { |
2665 | 0 | populate_match_value_str(&tmp, subject, offsets[2*i], offsets[2*i+1]); |
2666 | 0 | zend_hash_next_index_insert_new(return_value_ht, &tmp); |
2667 | 0 | } |
2668 | 0 | } |
2669 | 0 | } |
2670 | 0 | } |
2671 | | |
2672 | | /* Advance to the position right after the last full match */ |
2673 | 0 | start_offset = last_match_offset = offsets[1]; |
2674 | | |
2675 | | /* If we have matched an empty string, mimic what Perl's /g option does. |
2676 | | This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try |
2677 | | the match again at the same point. If this fails (picked up above) we |
2678 | | advance to the next character. */ |
2679 | 0 | if (start_offset == offsets[0]) { |
2680 | | /* Get next piece if no limit or limit not yet reached and something matched */ |
2681 | 0 | if (limit_val != -1 && limit_val <= 1) { |
2682 | 0 | break; |
2683 | 0 | } |
2684 | 0 | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset, |
2685 | 0 | PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx); |
2686 | 0 | if (count >= 0) { /* NOTE(review): a count of 0 bypasses the "too many substrings" notice above - confirm this is intended */ |
2687 | 0 | goto matched; |
2688 | 0 | } else if (count == PCRE2_ERROR_NOMATCH) { |
2689 | | /* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match, |
2690 | | this is not necessarily the end. We need to advance |
2691 | | the start offset, and continue. Fudge the offset values |
2692 | | to achieve this, unless we're already at the end of the string. */ |
2693 | 0 | if (start_offset < ZSTR_LEN(subject_str)) { |
2694 | 0 | start_offset += calculate_unit_length(pce, subject + start_offset); |
2695 | 0 | } else { |
2696 | 0 | break; |
2697 | 0 | } |
2698 | 0 | } else { |
2699 | 0 | goto error; |
2700 | 0 | } |
2701 | 0 | } |
2702 | |
|
2703 | 0 | } else if (count == PCRE2_ERROR_NOMATCH) { |
2704 | 0 | break; |
2705 | 0 | } else { |
2706 | 0 | error: /* any other negative result, also reached by goto from the retry above */ |
2707 | 0 | pcre_handle_exec_error(count); |
2708 | 0 | break; |
2709 | 0 | } |
2710 | | |
2711 | | /* Get next piece if no limit or limit not yet reached and something matched */ |
2712 | 0 | if (limit_val != -1 && limit_val <= 1) { |
2713 | 0 | break; |
2714 | 0 | } |
2715 | | |
2716 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
2717 | | if (pce->preg_options & PREG_JIT) { |
2718 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset, |
2719 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
2720 | | } else |
2721 | | #endif |
2722 | 0 | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset, |
2723 | 0 | PCRE2_NO_UTF_CHECK, match_data, mctx); |
2724 | 0 | } |
2725 | 0 | if (match_data != mdata) { /* only match data created by this call is freed */ |
2726 | 0 | pcre2_match_data_free(match_data); |
2727 | 0 | } |
2728 | 0 | mdata_used = old_mdata_used; |
2729 | |
|
2730 | 0 | if (PCRE_G(error_code) != PHP_PCRE_NO_ERROR) { /* an error was recorded: discard the pieces and return false */ |
2731 | 0 | zval_ptr_dtor(return_value); |
2732 | 0 | RETURN_FALSE; |
2733 | 0 | } |
2734 | | |
2735 | 0 | last: /* also the direct target when the limit check above skipped matching */ |
2736 | 0 | start_offset = last_match_offset; /* the offset might have been incremented, but without further successful matches */ |
2737 | |
|
2738 | 0 | if (!no_empty || start_offset < ZSTR_LEN(subject_str)) { |
2739 | 0 | if (offset_capture) { |
2740 | | /* Add the last (match, offset) pair to the return value */ |
2741 | 0 | add_offset_pair(return_value_ht, subject, start_offset, ZSTR_LEN(subject_str), NULL, 0); |
2742 | 0 | } else { |
2743 | | /* Add the last piece to the return value */ |
2744 | 0 | if (start_offset == 0) { /* the remainder is the entire subject: reuse the subject string itself */ |
2745 | 0 | ZVAL_STR_COPY(&tmp, subject_str); |
2746 | 0 | } else { |
2747 | 0 | populate_match_value_str(&tmp, subject, start_offset, ZSTR_LEN(subject_str)); |
2748 | 0 | } |
2749 | 0 | zend_hash_next_index_insert_new(return_value_ht, &tmp); |
2750 | 0 | } |
2751 | 0 | } |
2752 | 0 | } |
2753 | | /* }}} */ |
2754 | | |
2755 | | /* {{{ Quote regular expression characters plus an optional character. |
 | | * Works in two passes over the input: the first counts how many extra bytes the quoting needs, |
 | | * the second writes the quoted copy. Each listed special character gets a backslash in front, |
 | | * a NUL byte is written as a backslash followed by 000, and a character equal to the first byte |
 | | * of the optional delimiter argument is backslash-quoted as well. An empty input yields an empty |
 | | * string; an input that needs no quoting is returned as is. */ |
2756 | | PHP_FUNCTION(preg_quote) |
2757 | 30 | { |
2758 | 30 | zend_string *str; /* Input string argument */ |
2759 | 30 | zend_string *delim = NULL; /* Additional delimiter argument */ |
2760 | 30 | char *in_str; /* Input string */ |
2761 | 30 | char *in_str_end; /* End of the input string */ |
2762 | 30 | zend_string *out_str; /* Output string with quoted characters */ |
2763 | 30 | size_t extra_len; /* Number of additional characters */ |
2764 | 30 | char *p, /* Iterator for input string */ |
2765 | 30 | *q, /* Iterator for output string */ |
2766 | 30 | delim_char = '\0', /* Delimiter character to be quoted */ |
2767 | 30 | c; /* Current character */ |
2768 | | |
2769 | | /* Get the arguments and check for errors */ |
2770 | 90 | ZEND_PARSE_PARAMETERS_START(1, 2) |
2771 | 120 | Z_PARAM_STR(str) |
2772 | 30 | Z_PARAM_OPTIONAL |
2773 | 66 | Z_PARAM_STR_OR_NULL(delim) |
2774 | 30 | ZEND_PARSE_PARAMETERS_END(); |
2775 | | |
2776 | | /* Nothing to do if we got an empty string; this also lets the do/while loops below assume at least one byte */ |
2777 | 30 | if (ZSTR_LEN(str) == 0) { |
2778 | 0 | RETURN_EMPTY_STRING(); |
2779 | 0 | } |
2780 | | |
2781 | 30 | in_str = ZSTR_VAL(str); |
2782 | 30 | in_str_end = in_str + ZSTR_LEN(str); |
2783 | | |
2784 | 30 | if (delim) { |
2785 | 3 | delim_char = ZSTR_VAL(delim)[0]; /* only the first byte of the delimiter argument is used */ |
2786 | 3 | } |
2787 | | |
2788 | | /* First pass: go through the string and count the extra bytes needed for quoting */ |
2789 | 30 | extra_len = 0; |
2790 | 30 | p = in_str; |
2791 | 50.4k | do { |
2792 | 50.4k | c = *p; |
2793 | 50.4k | switch(c) { |
2794 | 735 | case '.': |
2795 | 921 | case '\\': |
2796 | 1.28k | case '+': |
2797 | 1.30k | case '*': |
2798 | 1.43k | case '?': |
2799 | 1.57k | case '[': |
2800 | 1.63k | case '^': |
2801 | 1.75k | case ']': |
2802 | 1.75k | case '$': |
2803 | 1.98k | case '(': |
2804 | 2.61k | case ')': |
2805 | 2.68k | case '{': |
2806 | 3.03k | case '}': |
2807 | 3.48k | case '=': |
2808 | 3.48k | case '!': |
2809 | 3.70k | case '>': |
2810 | 3.74k | case '<': |
2811 | 3.81k | case '|': |
2812 | 4.23k | case ':': |
2813 | 4.47k | case '-': |
2814 | 4.80k | case '#': |
2815 | 4.80k | extra_len++; /* one backslash in front of the character */ |
2816 | 4.80k | break; |
2817 | | |
2818 | 1.91k | case '\0': |
2819 | 1.91k | extra_len+=3; /* one input byte becomes four output bytes (backslash and 000) */ |
2820 | 1.91k | break; |
2821 | | |
2822 | 43.7k | default: |
2823 | 43.7k | if (c == delim_char) { |
2824 | 0 | extra_len++; |
2825 | 0 | } |
2826 | 43.7k | break; |
2827 | 50.4k | } |
2828 | 50.4k | p++; |
2829 | 50.4k | } while (p != in_str_end); |
2830 | | |
2831 | 30 | if (extra_len == 0) { /* nothing needs quoting: hand back the input string itself */ |
2832 | 0 | RETURN_STR_COPY(str); |
2833 | 0 | } |
2834 | | |
2835 | | /* Allocate the output, sized from the input length and the |
2836 | | extra_len counted in the first pass */ |
2837 | 30 | out_str = zend_string_safe_alloc(1, ZSTR_LEN(str), extra_len, 0); |
2838 | 30 | q = ZSTR_VAL(out_str); |
2839 | 30 | p = in_str; |
2840 | | |
2841 | 50.4k | do { /* second pass: same classification as above, now writing the output */ |
2842 | 50.4k | c = *p; |
2843 | 50.4k | switch(c) { |
2844 | 735 | case '.': |
2845 | 921 | case '\\': |
2846 | 1.28k | case '+': |
2847 | 1.30k | case '*': |
2848 | 1.43k | case '?': |
2849 | 1.57k | case '[': |
2850 | 1.63k | case '^': |
2851 | 1.75k | case ']': |
2852 | 1.75k | case '$': |
2853 | 1.98k | case '(': |
2854 | 2.61k | case ')': |
2855 | 2.68k | case '{': |
2856 | 3.03k | case '}': |
2857 | 3.48k | case '=': |
2858 | 3.48k | case '!': |
2859 | 3.70k | case '>': |
2860 | 3.74k | case '<': |
2861 | 3.81k | case '|': |
2862 | 4.23k | case ':': |
2863 | 4.47k | case '-': |
2864 | 4.80k | case '#': |
2865 | 4.80k | *q++ = '\\'; |
2866 | 4.80k | *q++ = c; |
2867 | 4.80k | break; |
2868 | | |
2869 | 1.91k | case '\0': |
2870 | 1.91k | *q++ = '\\'; |
2871 | 1.91k | *q++ = '0'; |
2872 | 1.91k | *q++ = '0'; |
2873 | 1.91k | *q++ = '0'; |
2874 | 1.91k | break; |
2875 | | |
2876 | 43.7k | default: |
2877 | 43.7k | if (c == delim_char) { |
2878 | 0 | *q++ = '\\'; |
2879 | 0 | } |
2880 | 43.7k | *q++ = c; |
2881 | 43.7k | break; |
2882 | 50.4k | } |
2883 | 50.4k | p++; |
2884 | 50.4k | } while (p != in_str_end); |
2885 | 30 | *q = '\0'; /* terminate the output string */ |
2886 | | |
2887 | 30 | RETURN_NEW_STR(out_str); |
2888 | 30 | } |
2889 | | /* }}} */ |
2890 | | |
2891 | | /* {{{ Searches array and returns entries which match regex. |
 | | * Takes the pattern as a string, the input array and optional flags (default 0). Returns false |
 | | * when the compiled pattern cannot be obtained; otherwise the result is produced by |
 | | * php_pcre_grep_impl(). */ |
2892 | | PHP_FUNCTION(preg_grep) |
2893 | 0 | { |
2894 | 0 | zend_string *regex; /* Regular expression */ |
2895 | 0 | zval *input; /* Input array */ |
2896 | 0 | zend_long flags = 0; /* Match control flags */ |
2897 | 0 | pcre_cache_entry *pce; /* Compiled regular expression */ |
2898 | | |
2899 | | /* Get arguments and do error checking */ |
2900 | 0 | ZEND_PARSE_PARAMETERS_START(2, 3) |
2901 | 0 | Z_PARAM_STR(regex) |
2902 | 0 | Z_PARAM_ARRAY(input) |
2903 | 0 | Z_PARAM_OPTIONAL |
2904 | 0 | Z_PARAM_LONG(flags) |
2905 | 0 | ZEND_PARSE_PARAMETERS_END(); |
2906 | | |
2907 | | /* Compile regex or get it from cache. */ |
2908 | 0 | if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) { |
2909 | 0 | RETURN_FALSE; |
2910 | 0 | } |
2911 | | |
2912 | 0 | pce->refcount++; /* raised for the duration of the call; presumably keeps the cached entry alive while in use */ |
2913 | 0 | php_pcre_grep_impl(pce, input, return_value, flags); |
2914 | 0 | pce->refcount--; |
2915 | 0 | } |
2916 | | /* }}} */ |
2917 | | |
2918 | | PHPAPI void php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, zend_long flags) /* {{{ |
 | | * Builds in return_value an array holding the entries of the input array whose string form |
 | | * matches the compiled pattern pce, or the entries that do not match when PREG_GREP_INVERT is |
 | | * set in flags. Entries keep their original string or integer key. When an error code was |
 | | * recorded during matching, the array is destroyed and return_value becomes false. */ |
2919 | 0 | { |
2920 | 0 | zval *entry; /* An entry in the input array */ |
2921 | 0 | uint32_t num_subpats; /* Number of captured subpatterns */ |
2922 | 0 | int count; /* Count of matched subpatterns */ |
2923 | 0 | uint32_t options; /* Execution options */ |
2924 | 0 | zend_string *string_key; |
2925 | 0 | zend_ulong num_key; |
2926 | 0 | bool invert; /* Whether to return non-matching |
2927 | | entries */ |
2928 | 0 | bool old_mdata_used; |
2929 | 0 | pcre2_match_data *match_data; |
2930 | 0 | invert = flags & PREG_GREP_INVERT ? 1 : 0; |
2931 | | |
2932 | | /* Number of capture groups plus one for group 0; nothing is allocated here. */ |
2933 | 0 | num_subpats = pce->capture_count + 1; |
2934 | | |
2935 | | /* Initialize return array */ |
2936 | 0 | array_init(return_value); |
2937 | 0 | HashTable *return_value_ht = Z_ARRVAL_P(return_value); |
2938 | |
|
2939 | 0 | PCRE_G(error_code) = PHP_PCRE_NO_ERROR; |
2940 | |
|
2941 | 0 | old_mdata_used = mdata_used; /* remembered so the flag can be restored once matching is done */ |
2942 | 0 | if (!old_mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) { |
2943 | 0 | mdata_used = true; |
2944 | 0 | match_data = mdata; |
2945 | 0 | } else { /* mdata is already in use, or the pattern has too many groups for it */ |
2946 | 0 | match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm)); |
2947 | 0 | if (!match_data) { |
2948 | 0 | PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR; |
2949 | 0 | return; /* NOTE(review): return_value still holds the empty array here, unlike the false result set in php_pcre_split_impl - confirm this is intended */ |
2950 | 0 | } |
2951 | 0 | } |
2952 | | |
2953 | 0 | options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK; |
2954 | | |
2955 | | /* Go through the input array */ |
2956 | 0 | ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(input), num_key, string_key, entry) { |
2957 | 0 | zend_string *tmp_subject_str; |
2958 | 0 | zend_string *subject_str = zval_get_tmp_string(entry, &tmp_subject_str); |
2959 | | |
2960 | | /* Perform the match */ |
2961 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
2962 | | if ((pce->preg_options & PREG_JIT) && options) { |
2963 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), 0, |
2964 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
2965 | | } else |
2966 | | #endif |
2967 | 0 | count = pcre2_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), 0, |
2968 | 0 | options, match_data, mctx); |
2969 | | |
2970 | | /* The entry matched */ |
2971 | 0 | if (count >= 0) { |
2972 | | /* Check for too many substrings condition. */ |
2973 | 0 | if (UNEXPECTED(count == 0)) { |
2974 | 0 | php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings"); |
2975 | 0 | } |
2976 | 0 | if (!invert) { |
2977 | 0 | Z_TRY_ADDREF_P(entry); /* the original entry (not its string form) is stored in the result */ |
2978 | | |
2979 | | /* Add to return array */ |
2980 | 0 | if (string_key) { |
2981 | 0 | zend_hash_update(return_value_ht, string_key, entry); |
2982 | 0 | } else { |
2983 | 0 | zend_hash_index_update(return_value_ht, num_key, entry); |
2984 | 0 | } |
2985 | 0 | } |
2986 | 0 | } else if (count == PCRE2_ERROR_NOMATCH) { |
2987 | 0 | if (invert) { /* non-matching entries are the ones wanted in inverted mode */ |
2988 | 0 | Z_TRY_ADDREF_P(entry); |
2989 | | |
2990 | | /* Add to return array */ |
2991 | 0 | if (string_key) { |
2992 | 0 | zend_hash_update(return_value_ht, string_key, entry); |
2993 | 0 | } else { |
2994 | 0 | zend_hash_index_update(return_value_ht, num_key, entry); |
2995 | 0 | } |
2996 | 0 | } |
2997 | 0 | } else { /* any other negative result: hand it to the error handler and stop iterating */ |
2998 | 0 | pcre_handle_exec_error(count); |
2999 | 0 | zend_tmp_string_release(tmp_subject_str); |
3000 | 0 | break; |
3001 | 0 | } |
3002 | | |
3003 | 0 | zend_tmp_string_release(tmp_subject_str); |
3004 | 0 | } ZEND_HASH_FOREACH_END(); |
3005 | 0 | if (match_data != mdata) { /* only match data created by this call is freed */ |
3006 | 0 | pcre2_match_data_free(match_data); |
3007 | 0 | } |
3008 | |
|
3009 | 0 | mdata_used = old_mdata_used; |
3010 | |
|
3011 | 0 | if (PCRE_G(error_code) != PHP_PCRE_NO_ERROR) { /* an error was recorded: discard the partial result and return false */ |
3012 | 0 | zend_array_destroy(Z_ARR_P(return_value)); |
3013 | 0 | RETURN_FALSE; |
3014 | 0 | } |
3015 | 0 | } |
3016 | | /* }}} */ |
3017 | | |
3018 | | /* {{{ Returns the error code of the last regexp execution. |
 | | * Takes no arguments; the value is read from PCRE_G(error_code). */ |
3019 | | PHP_FUNCTION(preg_last_error) |
3020 | 0 | { |
3021 | 0 | ZEND_PARSE_PARAMETERS_NONE(); |
3022 | | |
3023 | 0 | RETURN_LONG(PCRE_G(error_code)); |
3024 | 0 | } |
3025 | | /* }}} */ |
3026 | | |
3027 | | /* {{{ Returns the error message of the last regexp execution. |
 | | * Takes no arguments; the text comes from php_pcre_get_error_msg() applied to PCRE_G(error_code). */ |
3028 | | PHP_FUNCTION(preg_last_error_msg) |
3029 | 0 | { |
3030 | 0 | ZEND_PARSE_PARAMETERS_NONE(); |
3031 | | |
3032 | 0 | RETURN_STRING(php_pcre_get_error_msg(PCRE_G(error_code))); |
3033 | 0 | } |
3034 | | /* }}} */ |
3035 | | |
3036 | | /* {{{ module definition structures |
 | | * pcre_module_entry lists the extension name, its function table (ext_functions), the |
 | | * MINIT/MSHUTDOWN/RINIT/RSHUTDOWN/MINFO hooks, the version string and the module globals with |
 | | * their GINIT/GSHUTDOWN handlers. ZEND_GET_MODULE(pcre) is emitted only when COMPILE_DL_PCRE is |
 | | * defined (presumably the dynamically loadable build; confirm against the build configuration). */ |
3037 | | |
3038 | | zend_module_entry pcre_module_entry = { |
3039 | | STANDARD_MODULE_HEADER, |
3040 | | "pcre", |
3041 | | ext_functions, |
3042 | | PHP_MINIT(pcre), |
3043 | | PHP_MSHUTDOWN(pcre), |
3044 | | PHP_RINIT(pcre), |
3045 | | PHP_RSHUTDOWN(pcre), |
3046 | | PHP_MINFO(pcre), |
3047 | | PHP_PCRE_VERSION, |
3048 | | PHP_MODULE_GLOBALS(pcre), |
3049 | | PHP_GINIT(pcre), |
3050 | | PHP_GSHUTDOWN(pcre), |
3051 | | NULL, |
3052 | | STANDARD_MODULE_PROPERTIES_EX |
3053 | | }; |
3054 | | |
3055 | | #ifdef COMPILE_DL_PCRE |
3056 | | ZEND_GET_MODULE(pcre) |
3057 | | #endif |
3058 | | |
3059 | | /* }}} */ |
3060 | | |
3061 | | PHPAPI pcre2_match_context *php_pcre_mctx(void) |
3062 | 0 | {/*{{{ Accessor: returns the mctx match context pointer, the same one passed to the matching calls in this file. */ |
3063 | 0 | return mctx; |
3064 | 0 | }/*}}}*/ |
3065 | | |
3066 | | PHPAPI pcre2_general_context *php_pcre_gctx(void) |
3067 | 0 | {/*{{{ Accessor: returns the gctx general context pointer. */ |
3068 | 0 | return gctx; |
3069 | 0 | }/*}}}*/ |
3070 | | |
3071 | | PHPAPI pcre2_compile_context *php_pcre_cctx(void) |
3072 | 0 | {/*{{{ Accessor: returns the cctx compile context pointer. */ |
3073 | 0 | return cctx; |
3074 | 0 | }/*}}}*/ |
3075 | | |
3076 | | PHPAPI void php_pcre_pce_incref(pcre_cache_entry *pce) |
3077 | 0 | {/*{{{ Increments the refcount of the given cache entry; pce must not be NULL (asserted). */ |
3078 | 0 | assert(NULL != pce); |
3079 | 0 | pce->refcount++; |
3080 | 0 | }/*}}}*/ |
3081 | | |
3082 | | PHPAPI void php_pcre_pce_decref(pcre_cache_entry *pce) |
3083 | 0 | {/*{{{ Decrements the refcount of the given cache entry; pce must not be NULL and its refcount must be non-zero (both asserted). */ |
3084 | 0 | assert(NULL != pce); |
3085 | 0 | assert(0 != pce->refcount); |
3086 | 0 | pce->refcount--; |
3087 | 0 | }/*}}}*/ |
3088 | | |
3089 | | PHPAPI pcre2_code *php_pcre_pce_re(pcre_cache_entry *pce) |
3090 | 0 | {/*{{{ Returns the compiled pattern (re member) stored in the given cache entry; pce must not be NULL (asserted). */ |
3091 | 0 | assert(NULL != pce); |
3092 | 0 | return pce->re; |
3093 | 0 | }/*}}}*/ |