/src/php-src/ext/pcre/php_pcre.c
Line | Count | Source |
1 | | /* |
2 | | +----------------------------------------------------------------------+ |
3 | | | Copyright (c) The PHP Group | |
4 | | +----------------------------------------------------------------------+ |
5 | | | This source file is subject to version 3.01 of the PHP license, | |
6 | | | that is bundled with this package in the file LICENSE, and is | |
7 | | | available through the world-wide-web at the following url: | |
8 | | | https://www.php.net/license/3_01.txt | |
9 | | | If you did not receive a copy of the PHP license and are unable to | |
10 | | | obtain it through the world-wide-web, please send a note to | |
11 | | | license@php.net so we can mail you a copy immediately. | |
12 | | +----------------------------------------------------------------------+ |
13 | | | Author: Andrei Zmievski <andrei@php.net> | |
14 | | +----------------------------------------------------------------------+ |
15 | | */ |
16 | | |
17 | | #include "php.h" |
18 | | #include "php_ini.h" |
19 | | #include "php_pcre.h" |
20 | | #include "ext/standard/info.h" |
21 | | #include "ext/standard/basic_functions.h" |
22 | | #include "zend_smart_str.h" |
23 | | #include "SAPI.h" |
24 | | |
/* Result ordering and capture flags. */
#define PREG_PATTERN_ORDER			1
#define PREG_SET_ORDER				2
#define PREG_OFFSET_CAPTURE			(1 << 8)
#define PREG_UNMATCHED_AS_NULL		(1 << 9)

/* PREG_SPLIT_* flags. */
#define PREG_SPLIT_NO_EMPTY			(1 << 0)
#define PREG_SPLIT_DELIM_CAPTURE	(1 << 1)
#define PREG_SPLIT_OFFSET_CAPTURE	(1 << 2)

/* PREG_GREP_* flags. */
#define PREG_GREP_INVERT			(1 << 0)

/* Stored in pcre_cache_entry.preg_options when JIT code exists for the pattern. */
#define PREG_JIT					(1 << 3)

/* Number of cached patterns at which old entries start being evicted. */
#define PCRE_CACHE_SIZE 4096

/* 1 when built with HAVE_PCRE_JIT_SUPPORT, 0 otherwise. */
#if defined(HAVE_PCRE_JIT_SUPPORT)
#define PHP_PCRE_JIT_SUPPORT 1
#else
#define PHP_PCRE_JIT_SUPPORT 0
#endif

/* PCRE2 version string; set in MINIT, freed in MSHUTDOWN. */
char *php_pcre_version;
47 | | |
48 | | #include "php_pcre_arginfo.h" |
49 | | |
50 | | struct _pcre_cache_entry { |
51 | | pcre2_code *re; |
52 | | /* Pointer is not NULL (during request) when there are named captures. |
53 | | * Length is equal to capture_count + 1 to account for capture group 0. |
54 | | * This table cache is only valid during request. |
55 | | * Trying to store this over multiple requests causes issues when the keys are exposed in user arrays |
56 | | * (see GH-17122 and GH-17132). */ |
57 | | zend_string **subpats_table; |
58 | | uint32_t preg_options; |
59 | | uint32_t name_count; |
60 | | uint32_t capture_count; |
61 | | uint32_t compile_options; |
62 | | uint32_t refcount; |
63 | | }; |
64 | | |
65 | | PHPAPI ZEND_DECLARE_MODULE_GLOBALS(pcre) |
66 | | |
67 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
68 | | #define PCRE_JIT_STACK_MIN_SIZE (32 * 1024) |
69 | | #define PCRE_JIT_STACK_MAX_SIZE (192 * 1024) |
70 | | ZEND_TLS pcre2_jit_stack *jit_stack = NULL; |
71 | | #endif |
72 | | /* General context using (infallible) system allocator. */ |
73 | | ZEND_TLS pcre2_general_context *gctx = NULL; |
74 | | /* These two are global per thread for now. Though it is possible to use these |
75 | | per pattern. Either one can copy it and use in pce, or one does no global |
76 | | contexts at all, but creates for every pce. */ |
77 | | ZEND_TLS pcre2_compile_context *cctx = NULL; |
78 | | ZEND_TLS pcre2_match_context *mctx = NULL; |
79 | | ZEND_TLS pcre2_match_data *mdata = NULL; |
80 | | ZEND_TLS bool mdata_used = 0; |
81 | | ZEND_TLS uint8_t pcre2_init_ok = 0; |
82 | | #if defined(ZTS) && defined(HAVE_PCRE_JIT_SUPPORT) |
83 | | static MUTEX_T pcre_mt = NULL; |
84 | | #define php_pcre_mutex_alloc() \ |
85 | | if (tsrm_is_main_thread() && !pcre_mt) pcre_mt = tsrm_mutex_alloc(); |
86 | | #define php_pcre_mutex_free() \ |
87 | | if (tsrm_is_main_thread() && pcre_mt) { tsrm_mutex_free(pcre_mt); pcre_mt = NULL; } |
88 | | #define php_pcre_mutex_lock() tsrm_mutex_lock(pcre_mt); |
89 | | #define php_pcre_mutex_unlock() tsrm_mutex_unlock(pcre_mt); |
90 | | #else |
91 | | #define php_pcre_mutex_alloc() |
92 | | #define php_pcre_mutex_free() |
93 | | #define php_pcre_mutex_lock() |
94 | | #define php_pcre_mutex_unlock() |
95 | | #endif |
96 | | |
97 | | ZEND_TLS HashTable char_tables; |
98 | | |
99 | | static void free_subpats_table(zend_string **subpat_names, uint32_t num_subpats); |
100 | | |
101 | | static void php_pcre_free_char_table(zval *data) |
102 | 0 | {/*{{{*/ |
103 | 0 | void *ptr = Z_PTR_P(data); |
104 | 0 | pefree(ptr, 1); |
105 | 0 | }/*}}}*/ |
106 | | |
107 | | static void pcre_handle_exec_error(int pcre_code) /* {{{ */ |
108 | 1.45k | { |
109 | 1.45k | int preg_code = 0; |
110 | | |
111 | 1.45k | switch (pcre_code) { |
112 | 28 | case PCRE2_ERROR_MATCHLIMIT: |
113 | 28 | preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR; |
114 | 28 | break; |
115 | | |
116 | 0 | case PCRE2_ERROR_RECURSIONLIMIT: |
117 | 0 | preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR; |
118 | 0 | break; |
119 | | |
120 | 0 | case PCRE2_ERROR_BADUTFOFFSET: |
121 | 0 | preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR; |
122 | 0 | break; |
123 | | |
124 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
125 | | case PCRE2_ERROR_JIT_STACKLIMIT: |
126 | | preg_code = PHP_PCRE_JIT_STACKLIMIT_ERROR; |
127 | | break; |
128 | | #endif |
129 | | |
130 | 1.43k | default: |
131 | 1.43k | if (pcre_code <= PCRE2_ERROR_UTF8_ERR1 && pcre_code >= PCRE2_ERROR_UTF8_ERR21) { |
132 | 164 | preg_code = PHP_PCRE_BAD_UTF8_ERROR; |
133 | 1.26k | } else { |
134 | 1.26k | preg_code = PHP_PCRE_INTERNAL_ERROR; |
135 | 1.26k | } |
136 | 1.43k | break; |
137 | 1.45k | } |
138 | | |
139 | 1.45k | PCRE_G(error_code) = preg_code; |
140 | 1.45k | } |
141 | | /* }}} */ |
142 | | |
143 | | static const char *php_pcre_get_error_msg(php_pcre_error_code error_code) /* {{{ */ |
144 | 0 | { |
145 | 0 | switch (error_code) { |
146 | 0 | case PHP_PCRE_NO_ERROR: |
147 | 0 | return "No error"; |
148 | 0 | case PHP_PCRE_INTERNAL_ERROR: |
149 | 0 | return "Internal error"; |
150 | 0 | case PHP_PCRE_BAD_UTF8_ERROR: |
151 | 0 | return "Malformed UTF-8 characters, possibly incorrectly encoded"; |
152 | 0 | case PHP_PCRE_BAD_UTF8_OFFSET_ERROR: |
153 | 0 | return "The offset did not correspond to the beginning of a valid UTF-8 code point"; |
154 | 0 | case PHP_PCRE_BACKTRACK_LIMIT_ERROR: |
155 | 0 | return "Backtrack limit exhausted"; |
156 | 0 | case PHP_PCRE_RECURSION_LIMIT_ERROR: |
157 | 0 | return "Recursion limit exhausted"; |
158 | | |
159 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
160 | | case PHP_PCRE_JIT_STACKLIMIT_ERROR: |
161 | | return "JIT stack limit exhausted"; |
162 | | #endif |
163 | | |
164 | 0 | default: |
165 | 0 | return "Unknown error"; |
166 | 0 | } |
167 | 0 | } |
168 | | /* }}} */ |
169 | | |
170 | | static void php_free_pcre_cache(zval *data) /* {{{ */ |
171 | 0 | { |
172 | 0 | pcre_cache_entry *pce = (pcre_cache_entry *) Z_PTR_P(data); |
173 | 0 | if (!pce) return; |
174 | 0 | if (pce->subpats_table) { |
175 | 0 | free_subpats_table(pce->subpats_table, pce->capture_count + 1); |
176 | 0 | } |
177 | 0 | pcre2_code_free(pce->re); |
178 | 0 | free(pce); |
179 | 0 | } |
180 | | /* }}} */ |
181 | | |
182 | | static void *php_pcre_malloc(PCRE2_SIZE size, void *data) |
183 | 1.89k | { |
184 | 1.89k | return pemalloc(size, 1); |
185 | 1.89k | } |
186 | | |
/* PCRE2 release callback paired with php_pcre_malloc(); `data` is unused. */
static void php_pcre_free(void *block, void *data)
{
	pefree(block, 1);
}
191 | | |
192 | | static void *php_pcre_emalloc(PCRE2_SIZE size, void *data) |
193 | 226k | { |
194 | 226k | return emalloc(size); |
195 | 226k | } |
196 | | |
/* PCRE2 release callback paired with php_pcre_emalloc(); `data` is unused. */
static void php_pcre_efree(void *block, void *data)
{
	efree(block);
}
201 | | |
202 | 3.42k | #define PHP_PCRE_PREALLOC_MDATA_SIZE 32 |
203 | | |
204 | | static void php_pcre_init_pcre2(uint8_t jit) |
205 | 16 | {/*{{{*/ |
206 | 16 | if (!gctx) { |
207 | 16 | gctx = pcre2_general_context_create(php_pcre_malloc, php_pcre_free, NULL); |
208 | 16 | if (!gctx) { |
209 | 0 | pcre2_init_ok = 0; |
210 | 0 | return; |
211 | 0 | } |
212 | 16 | } |
213 | | |
214 | 16 | if (!cctx) { |
215 | 16 | cctx = pcre2_compile_context_create(gctx); |
216 | 16 | if (!cctx) { |
217 | 0 | pcre2_init_ok = 0; |
218 | 0 | return; |
219 | 0 | } |
220 | 16 | } |
221 | | |
222 | 16 | if (!mctx) { |
223 | 16 | mctx = pcre2_match_context_create(gctx); |
224 | 16 | if (!mctx) { |
225 | 0 | pcre2_init_ok = 0; |
226 | 0 | return; |
227 | 0 | } |
228 | 16 | } |
229 | | |
230 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
231 | | if (jit && !jit_stack) { |
232 | | jit_stack = pcre2_jit_stack_create(PCRE_JIT_STACK_MIN_SIZE, PCRE_JIT_STACK_MAX_SIZE, gctx); |
233 | | if (!jit_stack) { |
234 | | pcre2_init_ok = 0; |
235 | | return; |
236 | | } |
237 | | } |
238 | | #endif |
239 | | |
240 | 16 | if (!mdata) { |
241 | 16 | mdata = pcre2_match_data_create(PHP_PCRE_PREALLOC_MDATA_SIZE, gctx); |
242 | 16 | if (!mdata) { |
243 | 0 | pcre2_init_ok = 0; |
244 | 0 | return; |
245 | 0 | } |
246 | 16 | } |
247 | | |
248 | 16 | pcre2_init_ok = 1; |
249 | 16 | }/*}}}*/ |
250 | | |
251 | | static void php_pcre_shutdown_pcre2(void) |
252 | 0 | {/*{{{*/ |
253 | 0 | if (gctx) { |
254 | 0 | pcre2_general_context_free(gctx); |
255 | 0 | gctx = NULL; |
256 | 0 | } |
257 | |
|
258 | 0 | if (cctx) { |
259 | 0 | pcre2_compile_context_free(cctx); |
260 | 0 | cctx = NULL; |
261 | 0 | } |
262 | |
|
263 | 0 | if (mctx) { |
264 | 0 | pcre2_match_context_free(mctx); |
265 | 0 | mctx = NULL; |
266 | 0 | } |
267 | |
|
268 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
269 | | /* Stack may only be destroyed when no cached patterns |
270 | | possibly associated with it do exist. */ |
271 | | if (jit_stack) { |
272 | | pcre2_jit_stack_free(jit_stack); |
273 | | jit_stack = NULL; |
274 | | } |
275 | | #endif |
276 | |
|
277 | 0 | if (mdata) { |
278 | 0 | pcre2_match_data_free(mdata); |
279 | 0 | mdata = NULL; |
280 | 0 | } |
281 | |
|
282 | 0 | pcre2_init_ok = 0; |
283 | 0 | }/*}}}*/ |
284 | | |
285 | | static PHP_GINIT_FUNCTION(pcre) /* {{{ */ |
286 | 16 | { |
287 | 16 | php_pcre_mutex_alloc(); |
288 | | |
289 | 16 | zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1); |
290 | | |
291 | 16 | pcre_globals->backtrack_limit = 0; |
292 | 16 | pcre_globals->recursion_limit = 0; |
293 | 16 | pcre_globals->error_code = PHP_PCRE_NO_ERROR; |
294 | 16 | ZVAL_UNDEF(&pcre_globals->unmatched_null_pair); |
295 | 16 | ZVAL_UNDEF(&pcre_globals->unmatched_empty_pair); |
296 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
297 | | pcre_globals->jit = 1; |
298 | | #endif |
299 | | |
300 | 16 | php_pcre_init_pcre2(1); |
301 | 16 | zend_hash_init(&char_tables, 1, NULL, php_pcre_free_char_table, 1); |
302 | 16 | } |
303 | | /* }}} */ |
304 | | |
305 | | static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */ |
306 | 0 | { |
307 | 0 | zend_hash_destroy(&pcre_globals->pcre_cache); |
308 | |
|
309 | 0 | php_pcre_shutdown_pcre2(); |
310 | 0 | zend_hash_destroy(&char_tables); |
311 | 0 | php_pcre_mutex_free(); |
312 | 0 | } |
313 | | /* }}} */ |
314 | | |
315 | | static PHP_INI_MH(OnUpdateBacktrackLimit) |
316 | 16 | {/*{{{*/ |
317 | 16 | OnUpdateLong(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage); |
318 | 16 | if (mctx) { |
319 | 16 | pcre2_set_match_limit(mctx, (uint32_t)PCRE_G(backtrack_limit)); |
320 | 16 | } |
321 | | |
322 | 16 | return SUCCESS; |
323 | 16 | }/*}}}*/ |
324 | | |
325 | | static PHP_INI_MH(OnUpdateRecursionLimit) |
326 | 16 | {/*{{{*/ |
327 | 16 | OnUpdateLong(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage); |
328 | 16 | if (mctx) { |
329 | 16 | pcre2_set_depth_limit(mctx, (uint32_t)PCRE_G(recursion_limit)); |
330 | 16 | } |
331 | | |
332 | 16 | return SUCCESS; |
333 | 16 | }/*}}}*/ |
334 | | |
#ifdef HAVE_PCRE_JIT_SUPPORT
/* INI handler for pcre.jit: stores the flag through OnUpdateBool() and then
 * either attaches the per-thread JIT stack to the match context (JIT enabled
 * and a stack exists) or detaches any stack from it. */
static PHP_INI_MH(OnUpdateJit)
{/*{{{*/
	pcre2_jit_stack *stack = NULL;

	OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);

	if (PCRE_G(jit) && jit_stack) {
		stack = jit_stack;
	}
	pcre2_jit_stack_assign(mctx, NULL, stack);

	return SUCCESS;
}/*}}}*/
#endif
348 | | |
349 | | PHP_INI_BEGIN() |
350 | | STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateBacktrackLimit, backtrack_limit, zend_pcre_globals, pcre_globals) |
351 | | STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000", PHP_INI_ALL, OnUpdateRecursionLimit, recursion_limit, zend_pcre_globals, pcre_globals) |
352 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
353 | | STD_PHP_INI_BOOLEAN("pcre.jit", "1", PHP_INI_ALL, OnUpdateJit, jit, zend_pcre_globals, pcre_globals) |
354 | | #endif |
355 | | PHP_INI_END() |
356 | | |
/* Fetches a string-valued PCRE2 build-time setting.
 *
 * what: one of the PCRE2_CONFIG_* selectors that yield a string.
 *
 * Returns a malloc()ed, NUL-terminated copy that the caller must free(), or
 * NULL when the setting is unavailable, PCRE2 reports an error, or the
 * allocation fails. */
static char *_pcre2_config_str(uint32_t what)
{/*{{{*/
	char *ret;
	/* With a NULL output buffer pcre2_config() only reports the required
	   length, or a negative error code for an unsupported selector. */
	int len = pcre2_config(what, NULL);

	if (len <= 0) {
		/* Never turn an error code into an allocation size. */
		return NULL;
	}

	ret = (char *) malloc((size_t) len + 1);
	if (ret == NULL) {
		return NULL;
	}

	len = pcre2_config(what, ret);
	if (len <= 0) {
		free(ret);
		return NULL;
	}

	return ret;
}/*}}}*/
370 | | |
371 | | /* {{{ PHP_MINFO_FUNCTION(pcre) */ |
372 | | static PHP_MINFO_FUNCTION(pcre) |
373 | 4 | { |
374 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
375 | | uint32_t flag = 0; |
376 | | char *jit_target = _pcre2_config_str(PCRE2_CONFIG_JITTARGET); |
377 | | #endif |
378 | 4 | char *version = _pcre2_config_str(PCRE2_CONFIG_VERSION); |
379 | 4 | char *unicode = _pcre2_config_str(PCRE2_CONFIG_UNICODE_VERSION); |
380 | | |
381 | 4 | php_info_print_table_start(); |
382 | 4 | php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled" ); |
383 | 4 | php_info_print_table_row(2, "PCRE Library Version", version); |
384 | 4 | free(version); |
385 | 4 | php_info_print_table_row(2, "PCRE Unicode Version", unicode); |
386 | 4 | free(unicode); |
387 | | |
388 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
389 | | if (!pcre2_config(PCRE2_CONFIG_JIT, &flag)) { |
390 | | php_info_print_table_row(2, "PCRE JIT Support", flag ? "enabled" : "disabled"); |
391 | | } else { |
392 | | php_info_print_table_row(2, "PCRE JIT Support", "unknown" ); |
393 | | } |
394 | | if (jit_target) { |
395 | | php_info_print_table_row(2, "PCRE JIT Target", jit_target); |
396 | | } |
397 | | free(jit_target); |
398 | | #else |
399 | 4 | php_info_print_table_row(2, "PCRE JIT Support", "not compiled in" ); |
400 | 4 | #endif |
401 | | |
402 | | #ifdef HAVE_PCRE_VALGRIND_SUPPORT |
403 | | php_info_print_table_row(2, "PCRE Valgrind Support", "enabled" ); |
404 | | #endif |
405 | | |
406 | 4 | php_info_print_table_end(); |
407 | | |
408 | 4 | DISPLAY_INI_ENTRIES(); |
409 | 4 | } |
410 | | /* }}} */ |
411 | | |
412 | | /* {{{ PHP_MINIT_FUNCTION(pcre) */ |
413 | | static PHP_MINIT_FUNCTION(pcre) |
414 | 16 | { |
415 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
416 | | if (UNEXPECTED(!pcre2_init_ok)) { |
417 | | /* Retry. */ |
418 | | php_pcre_init_pcre2(PCRE_G(jit)); |
419 | | if (!pcre2_init_ok) { |
420 | | return FAILURE; |
421 | | } |
422 | | } |
423 | | #endif |
424 | | |
425 | 16 | REGISTER_INI_ENTRIES(); |
426 | | |
427 | 16 | php_pcre_version = _pcre2_config_str(PCRE2_CONFIG_VERSION); |
428 | | |
429 | 16 | register_php_pcre_symbols(module_number); |
430 | | |
431 | 16 | return SUCCESS; |
432 | 16 | } |
433 | | /* }}} */ |
434 | | |
435 | | /* {{{ PHP_MSHUTDOWN_FUNCTION(pcre) */ |
436 | | static PHP_MSHUTDOWN_FUNCTION(pcre) |
437 | 0 | { |
438 | 0 | UNREGISTER_INI_ENTRIES(); |
439 | |
|
440 | 0 | free(php_pcre_version); |
441 | |
|
442 | 0 | return SUCCESS; |
443 | 0 | } |
444 | | /* }}} */ |
445 | | |
446 | | /* {{{ PHP_RINIT_FUNCTION(pcre) */ |
447 | | static PHP_RINIT_FUNCTION(pcre) |
448 | 224k | { |
449 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
450 | | if (UNEXPECTED(!pcre2_init_ok)) { |
451 | | /* Retry. */ |
452 | | php_pcre_mutex_lock(); |
453 | | php_pcre_init_pcre2(PCRE_G(jit)); |
454 | | if (!pcre2_init_ok) { |
455 | | php_pcre_mutex_unlock(); |
456 | | return FAILURE; |
457 | | } |
458 | | php_pcre_mutex_unlock(); |
459 | | } |
460 | | |
461 | | mdata_used = 0; |
462 | | #endif |
463 | | |
464 | 224k | PCRE_G(error_code) = PHP_PCRE_NO_ERROR; |
465 | 224k | PCRE_G(gctx_zmm) = pcre2_general_context_create(php_pcre_emalloc, php_pcre_efree, NULL); |
466 | 224k | if (!PCRE_G(gctx_zmm)) { |
467 | 0 | return FAILURE; |
468 | 0 | } |
469 | | |
470 | 224k | return SUCCESS; |
471 | 224k | } |
472 | | /* }}} */ |
473 | | |
474 | | static PHP_RSHUTDOWN_FUNCTION(pcre) |
475 | 224k | { |
476 | 224k | pcre_cache_entry *pce; |
477 | 68.8M | ZEND_HASH_MAP_FOREACH_PTR(&PCRE_G(pcre_cache), pce) { |
478 | 68.8M | if (pce->subpats_table) { |
479 | 0 | free_subpats_table(pce->subpats_table, pce->capture_count + 1); |
480 | 0 | pce->subpats_table = NULL; |
481 | 0 | } |
482 | 68.8M | } ZEND_HASH_FOREACH_END(); |
483 | | |
484 | 224k | pcre2_general_context_free(PCRE_G(gctx_zmm)); |
485 | 224k | PCRE_G(gctx_zmm) = NULL; |
486 | | |
487 | 224k | zval_ptr_dtor(&PCRE_G(unmatched_null_pair)); |
488 | 224k | zval_ptr_dtor(&PCRE_G(unmatched_empty_pair)); |
489 | 224k | ZVAL_UNDEF(&PCRE_G(unmatched_null_pair)); |
490 | 224k | ZVAL_UNDEF(&PCRE_G(unmatched_empty_pair)); |
491 | 224k | return SUCCESS; |
492 | 224k | } |
493 | | |
494 | | /* {{{ static pcre_clean_cache */ |
495 | | static int pcre_clean_cache(zval *data, void *arg) |
496 | 0 | { |
497 | 0 | pcre_cache_entry *pce = (pcre_cache_entry *) Z_PTR_P(data); |
498 | 0 | int *num_clean = (int *)arg; |
499 | |
|
500 | 0 | if (!pce->refcount) { |
501 | 0 | if (--(*num_clean) == 0) { |
502 | 0 | return ZEND_HASH_APPLY_REMOVE|ZEND_HASH_APPLY_STOP; |
503 | 0 | } |
504 | 0 | return ZEND_HASH_APPLY_REMOVE; |
505 | 0 | } else { |
506 | 0 | return ZEND_HASH_APPLY_KEEP; |
507 | 0 | } |
508 | 0 | } |
509 | | /* }}} */ |
510 | | |
511 | 0 | static void free_subpats_table(zend_string **subpat_names, uint32_t num_subpats) { |
512 | 0 | uint32_t i; |
513 | 0 | for (i = 0; i < num_subpats; i++) { |
514 | 0 | if (subpat_names[i]) { |
515 | 0 | zend_string_release_ex(subpat_names[i], false); |
516 | 0 | } |
517 | 0 | } |
518 | 0 | efree(subpat_names); |
519 | 0 | } |
520 | | |
521 | | /* {{{ static make_subpats_table */ |
522 | | static zend_string **make_subpats_table(uint32_t name_cnt, pcre_cache_entry *pce) |
523 | 0 | { |
524 | 0 | uint32_t num_subpats = pce->capture_count + 1; |
525 | 0 | uint32_t name_size, ni = 0; |
526 | 0 | char *name_table; |
527 | 0 | zend_string **subpat_names; |
528 | 0 | int rc1, rc2; |
529 | |
|
530 | 0 | rc1 = pcre2_pattern_info(pce->re, PCRE2_INFO_NAMETABLE, &name_table); |
531 | 0 | rc2 = pcre2_pattern_info(pce->re, PCRE2_INFO_NAMEENTRYSIZE, &name_size); |
532 | 0 | if (rc1 < 0 || rc2 < 0) { |
533 | 0 | php_error_docref(NULL, E_WARNING, "Internal pcre2_pattern_info() error %d", rc1 < 0 ? rc1 : rc2); |
534 | 0 | return NULL; |
535 | 0 | } |
536 | | |
537 | 0 | subpat_names = ecalloc(num_subpats, sizeof(zend_string *)); |
538 | 0 | while (ni++ < name_cnt) { |
539 | 0 | unsigned short name_idx = 0x100 * (unsigned char)name_table[0] + (unsigned char)name_table[1]; |
540 | 0 | const char *name = name_table + 2; |
541 | 0 | subpat_names[name_idx] = zend_string_init(name, strlen(name), false); |
542 | 0 | name_table += name_size; |
543 | 0 | } |
544 | 0 | return subpat_names; |
545 | 0 | } |
546 | | /* }}} */ |
547 | | |
548 | | static zend_string **ensure_subpats_table(uint32_t name_cnt, pcre_cache_entry *pce) |
549 | 0 | { |
550 | 0 | if (!pce->subpats_table) { |
551 | 0 | pce->subpats_table = make_subpats_table(name_cnt, pce); |
552 | 0 | } |
553 | 0 | return pce->subpats_table; |
554 | 0 | } |
555 | | |
556 | | /* {{{ static calculate_unit_length */ |
557 | | /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE2_UTF. */ |
558 | | static zend_always_inline size_t calculate_unit_length(pcre_cache_entry *pce, const char *start) |
559 | 769 | { |
560 | 769 | size_t unit_len; |
561 | | |
562 | 769 | if (pce->compile_options & PCRE2_UTF) { |
563 | 28 | const char *end = start; |
564 | | |
565 | | /* skip continuation bytes */ |
566 | 28 | while ((*++end & 0xC0) == 0x80); |
567 | 28 | unit_len = end - start; |
568 | 741 | } else { |
569 | 741 | unit_len = 1; |
570 | 741 | } |
571 | 769 | return unit_len; |
572 | 769 | } |
573 | | /* }}} */ |
574 | | |
575 | | /* {{{ pcre_get_compiled_regex_cache */ |
576 | | PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bool locale_aware) |
577 | 5.35k | { |
578 | 5.35k | pcre2_code *re = NULL; |
579 | | #if 10 == PCRE2_MAJOR && 37 == PCRE2_MINOR && !defined(HAVE_BUNDLED_PCRE) |
580 | | uint32_t coptions = PCRE2_NO_START_OPTIMIZE; |
581 | | #else |
582 | 5.35k | uint32_t coptions = 0; |
583 | 5.35k | #endif |
584 | 5.35k | uint32_t eoptions = 0; |
585 | 5.35k | PCRE2_UCHAR error[128]; |
586 | 5.35k | PCRE2_SIZE erroffset; |
587 | 5.35k | int errnumber; |
588 | 5.35k | char delimiter; |
589 | 5.35k | char start_delimiter; |
590 | 5.35k | char end_delimiter; |
591 | 5.35k | char *p, *pp; |
592 | 5.35k | char *pattern; |
593 | 5.35k | size_t pattern_len; |
594 | 5.35k | uint32_t poptions = 0; |
595 | 5.35k | const uint8_t *tables = NULL; |
596 | 5.35k | zval *zv; |
597 | 5.35k | pcre_cache_entry new_entry; |
598 | 5.35k | int rc; |
599 | 5.35k | zend_string *key; |
600 | 5.35k | pcre_cache_entry *ret; |
601 | | |
602 | 5.35k | if (locale_aware && BG(ctype_string)) { |
603 | 0 | key = zend_string_concat2( |
604 | 0 | ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string)), |
605 | 0 | ZSTR_VAL(regex), ZSTR_LEN(regex)); |
606 | 5.35k | } else { |
607 | 5.35k | key = regex; |
608 | 5.35k | } |
609 | | |
610 | | /* Try to lookup the cached regex entry, and if successful, just pass |
611 | | back the compiled pattern, otherwise go on and compile it. */ |
612 | 5.35k | zv = zend_hash_find(&PCRE_G(pcre_cache), key); |
613 | 5.35k | if (zv) { |
614 | 2.49k | if (key != regex) { |
615 | 0 | zend_string_release_ex(key, 0); |
616 | 0 | } |
617 | 2.49k | return (pcre_cache_entry*)Z_PTR_P(zv); |
618 | 2.49k | } |
619 | | |
620 | 2.86k | p = ZSTR_VAL(regex); |
621 | 2.86k | const char* end_p = ZSTR_VAL(regex) + ZSTR_LEN(regex); |
622 | | |
623 | | /* Parse through the leading whitespace, and display a warning if we |
624 | | get to the end without encountering a delimiter. */ |
625 | 2.86k | while (isspace((int)*(unsigned char *)p)) p++; |
626 | 2.86k | if (p >= end_p) { |
627 | 12 | if (key != regex) { |
628 | 0 | zend_string_release_ex(key, 0); |
629 | 0 | } |
630 | 12 | php_error_docref(NULL, E_WARNING, "Empty regular expression"); |
631 | 12 | pcre_handle_exec_error(PCRE2_ERROR_INTERNAL); |
632 | 12 | return NULL; |
633 | 12 | } |
634 | | |
635 | | /* Get the delimiter and display a warning if it is alphanumeric |
636 | | or a backslash. */ |
637 | 2.85k | delimiter = *p++; |
638 | 2.85k | if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\' || delimiter == '\0') { |
639 | 21 | if (key != regex) { |
640 | 0 | zend_string_release_ex(key, 0); |
641 | 0 | } |
642 | 21 | php_error_docref(NULL, E_WARNING, "Delimiter must not be alphanumeric, backslash, or NUL byte"); |
643 | 21 | pcre_handle_exec_error(PCRE2_ERROR_INTERNAL); |
644 | 21 | return NULL; |
645 | 21 | } |
646 | | |
647 | 2.83k | start_delimiter = delimiter; |
648 | 2.83k | if ((pp = strchr("([{< )]}> )]}>", delimiter))) |
649 | 76 | delimiter = pp[5]; |
650 | 2.83k | end_delimiter = delimiter; |
651 | | |
652 | 2.83k | pp = p; |
653 | | |
654 | 2.83k | if (start_delimiter == end_delimiter) { |
655 | | /* We need to iterate through the pattern, searching for the ending delimiter, |
656 | | but skipping the backslashed delimiters. If the ending delimiter is not |
657 | | found, display a warning. */ |
658 | 924k | while (pp < end_p) { |
659 | 924k | if (*pp == '\\' && pp + 1 < end_p) pp++; |
660 | 878k | else if (*pp == delimiter) |
661 | 2.74k | break; |
662 | 921k | pp++; |
663 | 921k | } |
664 | 2.76k | } else { |
665 | | /* We iterate through the pattern, searching for the matching ending |
666 | | * delimiter. For each matching starting delimiter, we increment nesting |
667 | | * level, and decrement it for each matching ending delimiter. If we |
668 | | * reach the end of the pattern without matching, display a warning. |
669 | | */ |
670 | 65 | int brackets = 1; /* brackets nesting level */ |
671 | 34.4k | while (pp < end_p) { |
672 | 34.4k | if (*pp == '\\' && pp + 1 < end_p) pp++; |
673 | 33.7k | else if (*pp == end_delimiter && --brackets <= 0) |
674 | 9 | break; |
675 | 33.6k | else if (*pp == start_delimiter) |
676 | 1.71k | brackets++; |
677 | 34.3k | pp++; |
678 | 34.3k | } |
679 | 65 | } |
680 | | |
681 | 2.83k | if (pp >= end_p) { |
682 | 81 | if (key != regex) { |
683 | 0 | zend_string_release_ex(key, 0); |
684 | 0 | } |
685 | 81 | if (start_delimiter == end_delimiter) { |
686 | 25 | php_error_docref(NULL,E_WARNING, "No ending delimiter '%c' found", delimiter); |
687 | 56 | } else { |
688 | 56 | php_error_docref(NULL,E_WARNING, "No ending matching delimiter '%c' found", delimiter); |
689 | 56 | } |
690 | 81 | pcre_handle_exec_error(PCRE2_ERROR_INTERNAL); |
691 | 81 | return NULL; |
692 | 81 | } |
693 | | |
694 | | /* Make a copy of the actual pattern. */ |
695 | 2.75k | pattern_len = pp - p; |
696 | 2.75k | pattern = estrndup(p, pattern_len); |
697 | | |
698 | | /* Move on to the options */ |
699 | 2.75k | pp++; |
700 | | |
701 | | /* Parse through the options, setting appropriate flags. Display |
702 | | a warning if we encounter an unknown modifier. */ |
703 | 6.37k | while (pp < end_p) { |
704 | 3.86k | switch (*pp++) { |
705 | | /* Perl compatible options */ |
706 | 1.12k | case 'i': coptions |= PCRE2_CASELESS; break; |
707 | 318 | case 'm': coptions |= PCRE2_MULTILINE; break; |
708 | 36 | case 'n': coptions |= PCRE2_NO_AUTO_CAPTURE; break; |
709 | 275 | case 's': coptions |= PCRE2_DOTALL; break; |
710 | 18 | case 'x': coptions |= PCRE2_EXTENDED; break; |
711 | | |
712 | | /* PCRE specific options */ |
713 | 363 | case 'A': coptions |= PCRE2_ANCHORED; break; |
714 | 7 | case 'D': coptions |= PCRE2_DOLLAR_ENDONLY;break; |
715 | 0 | #ifdef PCRE2_EXTRA_CASELESS_RESTRICT |
716 | 27 | case 'r': eoptions |= PCRE2_EXTRA_CASELESS_RESTRICT; break; |
717 | 0 | #endif |
718 | 3 | case 'S': /* Pass. */ break; |
719 | 97 | case 'X': /* Pass. */ break; |
720 | 266 | case 'U': coptions |= PCRE2_UNGREEDY; break; |
721 | 731 | case 'u': coptions |= PCRE2_UTF; |
722 | | /* In PCRE, by default, \d, \D, \s, \S, \w, and \W recognize only ASCII |
723 | | characters, even in UTF-8 mode. However, this can be changed by setting |
724 | | the PCRE2_UCP option. */ |
725 | 731 | #ifdef PCRE2_UCP |
726 | 731 | coptions |= PCRE2_UCP; |
727 | 731 | #endif |
728 | 731 | break; |
729 | 36 | case 'J': coptions |= PCRE2_DUPNAMES; break; |
730 | | |
731 | 61 | case ' ': |
732 | 302 | case '\n': |
733 | 319 | case '\r': |
734 | 319 | break; |
735 | | |
736 | 2 | case 'e': /* legacy eval */ |
737 | 242 | default: |
738 | 242 | if (pp[-1]) { |
739 | 219 | php_error_docref(NULL, E_WARNING, "Unknown modifier '%c'", pp[-1]); |
740 | 219 | } else { |
741 | 23 | php_error_docref(NULL, E_WARNING, "NUL byte is not a valid modifier"); |
742 | 23 | } |
743 | 242 | pcre_handle_exec_error(PCRE2_ERROR_INTERNAL); |
744 | 242 | efree(pattern); |
745 | 242 | if (key != regex) { |
746 | 0 | zend_string_release_ex(key, 0); |
747 | 0 | } |
748 | 242 | return NULL; |
749 | 3.86k | } |
750 | 3.86k | } |
751 | | |
752 | 2.50k | if (key != regex) { |
753 | 0 | tables = (uint8_t *)zend_hash_find_ptr(&char_tables, BG(ctype_string)); |
754 | 0 | if (!tables) { |
755 | 0 | zend_string *_k; |
756 | 0 | tables = pcre2_maketables(gctx); |
757 | 0 | if (UNEXPECTED(!tables)) { |
758 | 0 | php_error_docref(NULL,E_WARNING, "Failed to generate locale character tables"); |
759 | 0 | pcre_handle_exec_error(PCRE2_ERROR_NOMEMORY); |
760 | 0 | zend_string_release_ex(key, 0); |
761 | 0 | efree(pattern); |
762 | 0 | return NULL; |
763 | 0 | } |
764 | 0 | _k = zend_string_init(ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string)), 1); |
765 | 0 | GC_MAKE_PERSISTENT_LOCAL(_k); |
766 | 0 | zend_hash_add_ptr(&char_tables, _k, (void *)tables); |
767 | 0 | zend_string_release(_k); |
768 | 0 | } |
769 | 0 | } |
770 | 2.50k | pcre2_set_character_tables(cctx, tables); |
771 | | |
772 | 2.50k | pcre2_set_compile_extra_options(cctx, eoptions); |
773 | | |
774 | | /* Compile pattern and display a warning if compilation failed. */ |
775 | 2.50k | re = pcre2_compile((PCRE2_SPTR)pattern, pattern_len, coptions, &errnumber, &erroffset, cctx); |
776 | | |
777 | 2.50k | if (re == NULL) { |
778 | 890 | if (key != regex) { |
779 | 0 | zend_string_release_ex(key, 0); |
780 | 0 | } |
781 | 890 | pcre2_get_error_message(errnumber, error, sizeof(error)); |
782 | 890 | php_error_docref(NULL,E_WARNING, "Compilation failed: %s at offset %zu", error, erroffset); |
783 | 890 | pcre_handle_exec_error(PCRE2_ERROR_INTERNAL); |
784 | 890 | efree(pattern); |
785 | 890 | return NULL; |
786 | 890 | } |
787 | | |
788 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
789 | | if (PCRE_G(jit)) { |
790 | | /* Enable PCRE JIT compiler */ |
791 | | rc = pcre2_jit_compile(re, PCRE2_JIT_COMPLETE); |
792 | | if (EXPECTED(rc >= 0)) { |
793 | | size_t jit_size = 0; |
794 | | if (!pcre2_pattern_info(re, PCRE2_INFO_JITSIZE, &jit_size) && jit_size > 0) { |
795 | | poptions |= PREG_JIT; |
796 | | } |
797 | | } else if (rc == PCRE2_ERROR_NOMEMORY) { |
798 | | php_error_docref(NULL, E_WARNING, |
799 | | "Allocation of JIT memory failed, PCRE JIT will be disabled. " |
800 | | "This is likely caused by security restrictions. " |
801 | | "Either grant PHP permission to allocate executable memory, or set pcre.jit=0"); |
802 | | PCRE_G(jit) = 0; |
803 | | } else { |
804 | | pcre2_get_error_message(rc, error, sizeof(error)); |
805 | | php_error_docref(NULL, E_WARNING, "JIT compilation failed: %s", error); |
806 | | pcre_handle_exec_error(PCRE2_ERROR_INTERNAL); |
807 | | } |
808 | | } |
809 | | #endif |
810 | 1.61k | efree(pattern); |
811 | | |
812 | | /* |
813 | | * If we reached cache limit, clean out the items from the head of the list; |
814 | | * these are supposedly the oldest ones (but not necessarily the least used |
815 | | * ones). |
816 | | */ |
817 | 1.61k | if (zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) { |
818 | 0 | int num_clean = PCRE_CACHE_SIZE / 8; |
819 | 0 | zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean); |
820 | 0 | } |
821 | | |
822 | | /* Store the compiled pattern and extra info in the cache. */ |
823 | 1.61k | new_entry.re = re; |
824 | 1.61k | new_entry.preg_options = poptions; |
825 | 1.61k | new_entry.compile_options = coptions; |
826 | 1.61k | new_entry.refcount = 0; |
827 | 1.61k | new_entry.subpats_table = NULL; |
828 | | |
829 | 1.61k | rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &new_entry.capture_count); |
830 | 1.61k | if (rc < 0) { |
831 | 0 | if (key != regex) { |
832 | 0 | zend_string_release_ex(key, 0); |
833 | 0 | } |
834 | 0 | pcre2_code_free(new_entry.re); |
835 | 0 | php_error_docref(NULL, E_WARNING, "Internal pcre2_pattern_info() error %d", rc); |
836 | 0 | pcre_handle_exec_error(PCRE2_ERROR_INTERNAL); |
837 | 0 | return NULL; |
838 | 0 | } |
839 | | |
840 | 1.61k | rc = pcre2_pattern_info(re, PCRE2_INFO_NAMECOUNT, &new_entry.name_count); |
841 | 1.61k | if (rc < 0) { |
842 | 0 | if (key != regex) { |
843 | 0 | zend_string_release_ex(key, 0); |
844 | 0 | } |
845 | 0 | pcre2_code_free(new_entry.re); |
846 | 0 | php_error_docref(NULL, E_WARNING, "Internal pcre_pattern_info() error %d", rc); |
847 | 0 | pcre_handle_exec_error(PCRE2_ERROR_INTERNAL); |
848 | 0 | return NULL; |
849 | 0 | } |
850 | | |
851 | | /* |
852 | | * Interned strings are not duplicated when stored in HashTable, |
853 | | * but all the interned strings created during HTTP request are removed |
854 | | * at end of request. However PCRE_G(pcre_cache) must be consistent |
855 | | * on the next request as well. So we disable usage of interned strings |
856 | | * as hash keys especially for this table. |
857 | | * See bug #63180 |
858 | | */ |
859 | 1.61k | if (!(GC_FLAGS(key) & IS_STR_PERMANENT)) { |
860 | 850 | zend_string *str = zend_string_init(ZSTR_VAL(key), ZSTR_LEN(key), 1); |
861 | 850 | GC_MAKE_PERSISTENT_LOCAL(str); |
862 | | |
863 | 850 | ret = zend_hash_add_new_mem(&PCRE_G(pcre_cache), str, &new_entry, sizeof(pcre_cache_entry)); |
864 | 850 | zend_string_release(str); |
865 | 850 | } else { |
866 | 769 | ret = zend_hash_add_new_mem(&PCRE_G(pcre_cache), key, &new_entry, sizeof(pcre_cache_entry)); |
867 | 769 | } |
868 | | |
869 | 1.61k | if (key != regex) { |
870 | 0 | zend_string_release_ex(key, 0); |
871 | 0 | } |
872 | | |
873 | 1.61k | return ret; |
874 | 1.61k | } |
875 | | /* }}} */ |
876 | | |
877 | | /* {{{ pcre_get_compiled_regex_cache */ |
878 | | PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_string *regex) |
879 | 5.35k | { |
880 | 5.35k | return pcre_get_compiled_regex_cache_ex(regex, true); |
881 | 5.35k | } |
882 | | /* }}} */ |
883 | | |
884 | | /* {{{ pcre_get_compiled_regex */ |
885 | | PHPAPI pcre2_code *pcre_get_compiled_regex(zend_string *regex, uint32_t *capture_count) |
886 | 0 | { |
887 | 0 | pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex); |
888 | |
|
889 | 0 | if (capture_count) { |
890 | 0 | *capture_count = pce ? pce->capture_count : 0; |
891 | 0 | } |
892 | |
|
893 | 0 | return pce ? pce->re : NULL; |
894 | 0 | } |
895 | | /* }}} */ |
896 | | |
897 | | /* XXX For the cases where it's only about match yes/no and no capture |
898 | | required, perhaps just a minimum sized data would suffice. */ |
899 | | PHPAPI pcre2_match_data *php_pcre_create_match_data(uint32_t capture_count, pcre2_code *re) |
900 | 0 | {/*{{{*/ |
901 | |
|
902 | 0 | assert(NULL != re); |
903 | |
|
904 | 0 | if (EXPECTED(!mdata_used)) { |
905 | 0 | int rc = 0; |
906 | |
|
907 | 0 | if (!capture_count) { |
908 | | /* As we deal with a non cached pattern, no other way to gather this info. */ |
909 | 0 | rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &capture_count); |
910 | 0 | } |
911 | |
|
912 | 0 | if (rc >= 0 && capture_count + 1 <= PHP_PCRE_PREALLOC_MDATA_SIZE) { |
913 | 0 | mdata_used = 1; |
914 | 0 | return mdata; |
915 | 0 | } |
916 | 0 | } |
917 | | |
918 | 0 | return pcre2_match_data_create_from_pattern(re, gctx); |
919 | 0 | }/*}}}*/ |
920 | | |
921 | | PHPAPI void php_pcre_free_match_data(pcre2_match_data *match_data) |
922 | 0 | {/*{{{*/ |
923 | 0 | if (UNEXPECTED(match_data != mdata)) { |
924 | 0 | pcre2_match_data_free(match_data); |
925 | 0 | } else { |
926 | 0 | mdata_used = 0; |
927 | 0 | } |
928 | 0 | }/*}}}*/ |
929 | | |
930 | 0 | static void init_unmatched_null_pair(zval *pair) { |
931 | 0 | zval val1, val2; |
932 | 0 | ZVAL_NULL(&val1); |
933 | 0 | ZVAL_LONG(&val2, -1); |
934 | 0 | ZVAL_ARR(pair, zend_new_pair(&val1, &val2)); |
935 | 0 | } |
936 | | |
937 | 0 | static void init_unmatched_empty_pair(zval *pair) { |
938 | 0 | zval val1, val2; |
939 | 0 | ZVAL_EMPTY_STRING(&val1); |
940 | 0 | ZVAL_LONG(&val2, -1); |
941 | 0 | ZVAL_ARR(pair, zend_new_pair(&val1, &val2)); |
942 | 0 | } |
943 | | |
944 | | static zend_always_inline void populate_match_value_str( |
945 | 744 | zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset) { |
946 | 744 | ZVAL_STRINGL_FAST(val, subject + start_offset, end_offset - start_offset); |
947 | 744 | } |
948 | | |
949 | | static zend_always_inline void populate_match_value( |
950 | | zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset, |
951 | 744 | bool unmatched_as_null) { |
952 | 744 | if (PCRE2_UNSET == start_offset) { |
953 | 0 | if (unmatched_as_null) { |
954 | 0 | ZVAL_NULL(val); |
955 | 0 | } else { |
956 | 0 | ZVAL_EMPTY_STRING(val); |
957 | 0 | } |
958 | 744 | } else { |
959 | 744 | populate_match_value_str(val, subject, start_offset, end_offset); |
960 | 744 | } |
961 | 744 | } |
962 | | |
963 | | static inline void add_named( |
964 | 0 | HashTable *const subpats, zend_string *name, zval *val, bool unmatched) { |
965 | 0 | ZEND_ASSERT(!(GC_FLAGS(name) & IS_STR_PERSISTENT)); |
966 | | |
967 | | /* If the DUPNAMES option is used, multiple subpatterns might have the same name. |
968 | | * In this case we want to preserve the one that actually has a value. */ |
969 | 0 | if (!unmatched) { |
970 | 0 | zend_hash_update(subpats, name, val); |
971 | 0 | } else { |
972 | 0 | if (!zend_hash_add(subpats, name, val)) { |
973 | 0 | return; |
974 | 0 | } |
975 | 0 | } |
976 | 0 | Z_TRY_ADDREF_P(val); |
977 | 0 | } |
978 | | |
979 | | /* {{{ add_offset_pair */ |
980 | | static inline void add_offset_pair( |
981 | | HashTable *const result, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset, |
982 | | zend_string *name, zend_long unmatched_as_null) |
983 | 0 | { |
984 | 0 | zval match_pair; |
985 | | |
986 | | /* Add (match, offset) to the return value */ |
987 | 0 | if (PCRE2_UNSET == start_offset) { |
988 | 0 | if (unmatched_as_null) { |
989 | 0 | do { |
990 | 0 | if (Z_ISUNDEF(PCRE_G(unmatched_null_pair))) { |
991 | 0 | if (UNEXPECTED(EG(flags) & EG_FLAGS_IN_SHUTDOWN)) { |
992 | 0 | init_unmatched_null_pair(&match_pair); |
993 | 0 | break; |
994 | 0 | } else { |
995 | 0 | init_unmatched_null_pair(&PCRE_G(unmatched_null_pair)); |
996 | 0 | } |
997 | 0 | } |
998 | 0 | ZVAL_COPY(&match_pair, &PCRE_G(unmatched_null_pair)); |
999 | 0 | } while (0); |
1000 | 0 | } else { |
1001 | 0 | do { |
1002 | 0 | if (Z_ISUNDEF(PCRE_G(unmatched_empty_pair))) { |
1003 | 0 | if (UNEXPECTED(EG(flags) & EG_FLAGS_IN_SHUTDOWN)) { |
1004 | 0 | init_unmatched_empty_pair(&match_pair); |
1005 | 0 | break; |
1006 | 0 | } else { |
1007 | 0 | init_unmatched_empty_pair(&PCRE_G(unmatched_empty_pair)); |
1008 | 0 | } |
1009 | 0 | } |
1010 | 0 | ZVAL_COPY(&match_pair, &PCRE_G(unmatched_empty_pair)); |
1011 | 0 | } while (0); |
1012 | 0 | } |
1013 | 0 | } else { |
1014 | 0 | zval val1, val2; |
1015 | 0 | populate_match_value_str(&val1, subject, start_offset, end_offset); |
1016 | 0 | ZVAL_LONG(&val2, start_offset); |
1017 | 0 | ZVAL_ARR(&match_pair, zend_new_pair(&val1, &val2)); |
1018 | 0 | } |
1019 | |
|
1020 | 0 | if (name) { |
1021 | 0 | add_named(result, name, &match_pair, start_offset == PCRE2_UNSET); |
1022 | 0 | } |
1023 | 0 | zend_hash_next_index_insert_new(result, &match_pair); |
1024 | 0 | } |
1025 | | /* }}} */ |
1026 | | |
1027 | | static void populate_subpat_array( |
1028 | | HashTable *subpats_ht, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names, |
1029 | 377 | uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags) { |
1030 | 377 | zend_long offset_capture = flags & PREG_OFFSET_CAPTURE; |
1031 | 377 | zend_long unmatched_as_null = flags & PREG_UNMATCHED_AS_NULL; |
1032 | 377 | zval val; |
1033 | 377 | int i; |
1034 | 377 | if (subpat_names) { |
1035 | 0 | if (offset_capture) { |
1036 | 0 | for (i = 0; i < count; i++) { |
1037 | 0 | add_offset_pair( |
1038 | 0 | subpats_ht, subject, offsets[2*i], offsets[2*i+1], |
1039 | 0 | subpat_names[i], unmatched_as_null); |
1040 | 0 | } |
1041 | 0 | if (unmatched_as_null) { |
1042 | 0 | for (i = count; i < num_subpats; i++) { |
1043 | 0 | add_offset_pair(subpats_ht, NULL, PCRE2_UNSET, PCRE2_UNSET, subpat_names[i], 1); |
1044 | 0 | } |
1045 | 0 | } |
1046 | 0 | } else { |
1047 | 0 | for (i = 0; i < count; i++) { |
1048 | 0 | populate_match_value( |
1049 | 0 | &val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null); |
1050 | 0 | if (subpat_names[i]) { |
1051 | 0 | add_named(subpats_ht, subpat_names[i], &val, offsets[2*i] == PCRE2_UNSET); |
1052 | 0 | } |
1053 | 0 | zend_hash_next_index_insert_new(subpats_ht, &val); |
1054 | 0 | } |
1055 | 0 | if (unmatched_as_null) { |
1056 | 0 | for (i = count; i < num_subpats; i++) { |
1057 | 0 | ZVAL_NULL(&val); |
1058 | 0 | if (subpat_names[i]) { |
1059 | 0 | zend_hash_add(subpats_ht, subpat_names[i], &val); |
1060 | 0 | } |
1061 | 0 | zend_hash_next_index_insert_new(subpats_ht, &val); |
1062 | 0 | } |
1063 | 0 | } |
1064 | 0 | } |
1065 | 377 | } else { |
1066 | 377 | if (offset_capture) { |
1067 | 0 | for (i = 0; i < count; i++) { |
1068 | 0 | add_offset_pair( |
1069 | 0 | subpats_ht, subject, offsets[2*i], offsets[2*i+1], NULL, unmatched_as_null); |
1070 | 0 | } |
1071 | 0 | if (unmatched_as_null) { |
1072 | 0 | for (i = count; i < num_subpats; i++) { |
1073 | 0 | add_offset_pair(subpats_ht, NULL, PCRE2_UNSET, PCRE2_UNSET, NULL, 1); |
1074 | 0 | } |
1075 | 0 | } |
1076 | 377 | } else { |
1077 | 1.12k | for (i = 0; i < count; i++) { |
1078 | 744 | populate_match_value( |
1079 | 744 | &val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null); |
1080 | 744 | zend_hash_next_index_insert_new(subpats_ht, &val); |
1081 | 744 | } |
1082 | 377 | if (unmatched_as_null) { |
1083 | 0 | ZVAL_NULL(&val); |
1084 | 0 | for (i = count; i < num_subpats; i++) { |
1085 | 0 | zend_hash_next_index_insert_new(subpats_ht, &val); |
1086 | 0 | } |
1087 | 0 | } |
1088 | 377 | } |
1089 | 377 | } |
1090 | | /* Add MARK, if available */ |
1091 | 377 | if (mark) { |
1092 | 0 | ZVAL_STRING(&val, (char *)mark); |
1093 | 0 | zend_hash_str_add_new(subpats_ht, ZEND_STRL("MARK"), &val); |
1094 | 0 | } |
1095 | 377 | } |
1096 | | |
1097 | | static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, bool global) /* {{{ */ |
1098 | 4.78k | { |
1099 | | /* parameters */ |
1100 | 4.78k | zend_string *regex; /* Regular expression */ |
1101 | 4.78k | zend_string *subject; /* String to match against */ |
1102 | 4.78k | pcre_cache_entry *pce; /* Compiled regular expression */ |
1103 | 4.78k | zval *subpats = NULL; /* Array for subpatterns */ |
1104 | 4.78k | zend_long flags = 0; /* Match control flags */ |
1105 | 4.78k | zend_long start_offset = 0; /* Where the new search starts */ |
1106 | | |
1107 | 14.3k | ZEND_PARSE_PARAMETERS_START(2, 5) |
1108 | 19.1k | Z_PARAM_STR(regex) |
1109 | 23.8k | Z_PARAM_STR(subject) |
1110 | 4.77k | Z_PARAM_OPTIONAL |
1111 | 9.56k | Z_PARAM_ZVAL(subpats) |
1112 | 9.56k | Z_PARAM_LONG(flags) |
1113 | 0 | Z_PARAM_LONG(start_offset) |
1114 | 4.78k | ZEND_PARSE_PARAMETERS_END(); |
1115 | | |
1116 | | /* Compile regex or get it from cache. */ |
1117 | 4.77k | if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) { |
1118 | 1.02k | RETURN_FALSE; |
1119 | 1.02k | } |
1120 | | |
1121 | 3.75k | if (start_offset == ZEND_LONG_MIN) { |
1122 | 0 | zend_argument_value_error(5, "must be greater than " ZEND_LONG_FMT, ZEND_LONG_MIN); |
1123 | 0 | RETURN_THROWS(); |
1124 | 0 | } |
1125 | | |
1126 | 3.75k | pce->refcount++; |
1127 | 3.75k | php_pcre_match_impl(pce, subject, return_value, subpats, |
1128 | 3.75k | global, flags, start_offset); |
1129 | 3.75k | pce->refcount--; |
1130 | 3.75k | } |
1131 | | /* }}} */ |
1132 | | |
1133 | | static zend_always_inline bool is_known_valid_utf8( |
1134 | 1.22k | zend_string *subject_str, PCRE2_SIZE start_offset) { |
1135 | 1.22k | if (!ZSTR_IS_VALID_UTF8(subject_str)) { |
1136 | | /* We don't know whether the string is valid UTF-8 or not. */ |
1137 | 1.21k | return false; |
1138 | 1.21k | } |
1139 | | |
1140 | 1 | if (start_offset == ZSTR_LEN(subject_str)) { |
1141 | | /* Degenerate case: Offset points to end of string. */ |
1142 | 1 | return true; |
1143 | 1 | } |
1144 | | |
1145 | | /* Check that the offset does not point to an UTF-8 continuation byte. */ |
1146 | 0 | return (ZSTR_VAL(subject_str)[start_offset] & 0xc0) != 0x80; |
1147 | 1 | } |
1148 | | |
1149 | | /* {{{ php_pcre_match_impl() */ |
1150 | | PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value, |
1151 | | zval *subpats, bool global, zend_long flags, zend_off_t start_offset) |
1152 | 3.75k | { |
1153 | 3.75k | zval result_set; /* Holds a set of subpatterns after |
1154 | | a global match */ |
1155 | 3.75k | HashTable **match_sets = NULL; /* An array of sets of matches for each |
1156 | | subpattern after a global match */ |
1157 | 3.75k | uint32_t options; /* Execution options */ |
1158 | 3.75k | int count; /* Count of matched subpatterns */ |
1159 | 3.75k | uint32_t num_subpats; /* Number of captured subpatterns */ |
1160 | 3.75k | int matched; /* Has anything matched */ |
1161 | 3.75k | zend_string **subpat_names; /* Array for named subpatterns */ |
1162 | 3.75k | size_t i; |
1163 | 3.75k | uint32_t subpats_order; /* Order of subpattern matches */ |
1164 | 3.75k | uint32_t offset_capture; /* Capture match offsets: yes/no */ |
1165 | 3.75k | zend_long unmatched_as_null; /* Null non-matches: yes/no */ |
1166 | 3.75k | PCRE2_SPTR mark = NULL; /* Target for MARK name */ |
1167 | 3.75k | HashTable *marks = NULL; /* Array of marks for PREG_PATTERN_ORDER */ |
1168 | 3.75k | pcre2_match_data *match_data; |
1169 | 3.75k | PCRE2_SIZE start_offset2, orig_start_offset; |
1170 | 3.75k | bool old_mdata_used; |
1171 | | |
1172 | 3.75k | char *subject = ZSTR_VAL(subject_str); |
1173 | 3.75k | size_t subject_len = ZSTR_LEN(subject_str); |
1174 | | |
1175 | | /* Overwrite the passed-in value for subpatterns with an empty array. */ |
1176 | 3.75k | if (subpats != NULL) { |
1177 | 1 | subpats = zend_try_array_init(subpats); |
1178 | 1 | if (!subpats) { |
1179 | 0 | RETURN_THROWS(); |
1180 | 0 | } |
1181 | 1 | } |
1182 | | |
1183 | 3.75k | subpats_order = global ? PREG_PATTERN_ORDER : 0; |
1184 | | |
1185 | 3.75k | if (flags) { |
1186 | 0 | offset_capture = flags & PREG_OFFSET_CAPTURE; |
1187 | 0 | unmatched_as_null = flags & PREG_UNMATCHED_AS_NULL; |
1188 | | |
1189 | | /* |
1190 | | * subpats_order is pre-set to pattern mode so we change it only if |
1191 | | * necessary. |
1192 | | */ |
1193 | 0 | if (flags & 0xff) { |
1194 | 0 | subpats_order = flags & 0xff; |
1195 | 0 | if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) || |
1196 | 0 | (!global && subpats_order != 0)) { |
1197 | 0 | zend_argument_value_error(4, "must be a PREG_* constant"); |
1198 | 0 | RETURN_THROWS(); |
1199 | 0 | } |
1200 | 0 | } |
1201 | 3.75k | } else { |
1202 | 3.75k | offset_capture = 0; |
1203 | 3.75k | unmatched_as_null = 0; |
1204 | 3.75k | } |
1205 | | |
1206 | | /* Negative offset counts from the end of the string. */ |
1207 | 3.75k | if (start_offset < 0) { |
1208 | 0 | if ((PCRE2_SIZE)-start_offset <= subject_len) { |
1209 | 0 | start_offset2 = subject_len + start_offset; |
1210 | 0 | } else { |
1211 | 0 | start_offset2 = 0; |
1212 | 0 | } |
1213 | 3.75k | } else { |
1214 | 3.75k | start_offset2 = (PCRE2_SIZE)start_offset; |
1215 | 3.75k | } |
1216 | | |
1217 | 3.75k | if (start_offset2 > subject_len) { |
1218 | 0 | pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET); |
1219 | 0 | RETURN_FALSE; |
1220 | 0 | } |
1221 | | |
1222 | | /* Calculate the size of the offsets array, and allocate memory for it. */ |
1223 | 3.75k | num_subpats = pce->capture_count + 1; |
1224 | | |
1225 | | /* |
1226 | | * Build a mapping from subpattern numbers to their names. We will |
1227 | | * allocate the table only if there are any named subpatterns. |
1228 | | */ |
1229 | 3.75k | subpat_names = NULL; |
1230 | 3.75k | if (subpats && pce->name_count > 0) { |
1231 | 0 | subpat_names = ensure_subpats_table(pce->name_count, pce); |
1232 | 0 | if (UNEXPECTED(!subpat_names)) { |
1233 | 0 | RETURN_FALSE; |
1234 | 0 | } |
1235 | 0 | } |
1236 | | |
1237 | 3.75k | matched = 0; |
1238 | 3.75k | PCRE_G(error_code) = PHP_PCRE_NO_ERROR; |
1239 | | |
1240 | 3.75k | old_mdata_used = mdata_used; |
1241 | 3.75k | if (!old_mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) { |
1242 | 3.19k | mdata_used = true; |
1243 | 3.19k | match_data = mdata; |
1244 | 3.19k | } else { |
1245 | 554 | match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm)); |
1246 | 554 | if (!match_data) { |
1247 | 0 | PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR; |
1248 | 0 | RETURN_FALSE; |
1249 | 0 | } |
1250 | 554 | } |
1251 | | |
1252 | | /* Allocate match sets array and initialize the values. */ |
1253 | 3.75k | if (global && subpats && subpats_order == PREG_PATTERN_ORDER) { |
1254 | 0 | match_sets = safe_emalloc(num_subpats, sizeof(HashTable *), 0); |
1255 | 0 | for (i=0; i<num_subpats; i++) { |
1256 | 0 | match_sets[i] = zend_new_array(0); |
1257 | 0 | } |
1258 | 0 | } |
1259 | | |
1260 | | /* Array of subpattern offsets */ |
1261 | 3.75k | PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data); |
1262 | | |
1263 | 3.75k | orig_start_offset = start_offset2; |
1264 | 3.75k | options = |
1265 | 3.75k | (pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, orig_start_offset) |
1266 | 3.75k | ? 0 : PCRE2_NO_UTF_CHECK; |
1267 | | |
1268 | | /* Execute the regular expression. */ |
1269 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
1270 | | if ((pce->preg_options & PREG_JIT) && options) { |
1271 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2, |
1272 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
1273 | | } else |
1274 | | #endif |
1275 | 3.75k | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2, |
1276 | 3.75k | options, match_data, mctx); |
1277 | | |
1278 | 3.75k | while (1) { |
1279 | | /* If something has matched */ |
1280 | 3.75k | if (count >= 0) { |
1281 | | /* Check for too many substrings condition. */ |
1282 | 244 | if (UNEXPECTED(count == 0)) { |
1283 | 0 | php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings"); |
1284 | 0 | count = num_subpats; |
1285 | 0 | } |
1286 | | |
1287 | 244 | matched: |
1288 | 244 | matched++; |
1289 | | |
1290 | | /* If subpatterns array has been passed, fill it in with values. */ |
1291 | 244 | if (subpats != NULL) { |
1292 | | /* Try to get the list of substrings and display a warning if failed. */ |
1293 | 0 | if (UNEXPECTED(offsets[1] < offsets[0])) { |
1294 | 0 | if (match_sets) { |
1295 | 0 | for (i = 0; i < num_subpats; i++) { |
1296 | 0 | zend_array_destroy(match_sets[i]); |
1297 | 0 | } |
1298 | 0 | efree(match_sets); |
1299 | 0 | } |
1300 | 0 | if (marks) { |
1301 | 0 | zend_array_destroy(marks); |
1302 | 0 | } |
1303 | 0 | if (match_data != mdata) { |
1304 | 0 | pcre2_match_data_free(match_data); |
1305 | 0 | } |
1306 | 0 | php_error_docref(NULL, E_WARNING, "Get subpatterns list failed"); |
1307 | 0 | RETURN_FALSE; |
1308 | 0 | } |
1309 | | |
1310 | 0 | if (global) { /* global pattern matching */ |
1311 | 0 | if (subpats_order == PREG_PATTERN_ORDER) { |
1312 | | /* For each subpattern, insert it into the appropriate array. */ |
1313 | 0 | if (offset_capture) { |
1314 | 0 | for (i = 0; i < count; i++) { |
1315 | 0 | add_offset_pair( |
1316 | 0 | match_sets[i], subject, offsets[2*i], offsets[2*i+1], |
1317 | 0 | NULL, unmatched_as_null); |
1318 | 0 | } |
1319 | 0 | } else { |
1320 | 0 | for (i = 0; i < count; i++) { |
1321 | 0 | zval val; |
1322 | 0 | populate_match_value( |
1323 | 0 | &val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null); |
1324 | 0 | zend_hash_next_index_insert_new(match_sets[i], &val); |
1325 | 0 | } |
1326 | 0 | } |
1327 | 0 | mark = pcre2_get_mark(match_data); |
1328 | | /* Add MARK, if available */ |
1329 | 0 | if (mark) { |
1330 | 0 | if (!marks) { |
1331 | 0 | marks = zend_new_array(0); |
1332 | 0 | } |
1333 | 0 | zval tmp; |
1334 | 0 | ZVAL_STRING(&tmp, (char *) mark); |
1335 | 0 | zend_hash_index_add_new(marks, matched - 1, &tmp); |
1336 | 0 | } |
1337 | | /* |
1338 | | * If the number of captured subpatterns on this run is |
1339 | | * less than the total possible number, pad the result |
1340 | | * arrays with NULLs or empty strings. |
1341 | | */ |
1342 | 0 | if (count < num_subpats) { |
1343 | 0 | for (int i = count; i < num_subpats; i++) { |
1344 | 0 | if (offset_capture) { |
1345 | 0 | add_offset_pair( |
1346 | 0 | match_sets[i], NULL, PCRE2_UNSET, PCRE2_UNSET, |
1347 | 0 | NULL, unmatched_as_null); |
1348 | 0 | } else if (unmatched_as_null) { |
1349 | 0 | zval tmp; |
1350 | 0 | ZVAL_NULL(&tmp); |
1351 | 0 | zend_hash_next_index_insert_new(match_sets[i], &tmp); |
1352 | 0 | } else { |
1353 | 0 | zval tmp; |
1354 | 0 | ZVAL_EMPTY_STRING(&tmp); |
1355 | 0 | zend_hash_next_index_insert_new(match_sets[i], &tmp); |
1356 | 0 | } |
1357 | 0 | } |
1358 | 0 | } |
1359 | 0 | } else { |
1360 | | /* Allocate and populate the result set array */ |
1361 | 0 | mark = pcre2_get_mark(match_data); |
1362 | 0 | array_init_size(&result_set, count + (mark ? 1 : 0)); |
1363 | 0 | populate_subpat_array( |
1364 | 0 | Z_ARRVAL(result_set), subject, offsets, subpat_names, |
1365 | 0 | num_subpats, count, mark, flags); |
1366 | | /* And add it to the output array */ |
1367 | 0 | zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &result_set); |
1368 | 0 | } |
1369 | 0 | } else { /* single pattern matching */ |
1370 | | /* For each subpattern, insert it into the subpatterns array. */ |
1371 | 0 | mark = pcre2_get_mark(match_data); |
1372 | 0 | populate_subpat_array( |
1373 | 0 | Z_ARRVAL_P(subpats), subject, offsets, subpat_names, num_subpats, count, mark, flags); |
1374 | 0 | break; |
1375 | 0 | } |
1376 | 0 | } |
1377 | | |
1378 | | /* Advance to the next piece. */ |
1379 | 244 | start_offset2 = offsets[1]; |
1380 | | |
1381 | | /* If we have matched an empty string, mimic what Perl's /g options does. |
1382 | | This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try |
1383 | | the match again at the same point. If this fails (picked up above) we |
1384 | | advance to the next character. */ |
1385 | 244 | if (start_offset2 == offsets[0]) { |
1386 | 91 | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2, |
1387 | 91 | PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx); |
1388 | 91 | if (count >= 0) { |
1389 | 5 | if (global) { |
1390 | 0 | goto matched; |
1391 | 5 | } else { |
1392 | 5 | break; |
1393 | 5 | } |
1394 | 86 | } else if (count == PCRE2_ERROR_NOMATCH) { |
1395 | | /* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match, |
1396 | | this is not necessarily the end. We need to advance |
1397 | | the start offset, and continue. Fudge the offset values |
1398 | | to achieve this, unless we're already at the end of the string. */ |
1399 | 83 | if (start_offset2 < subject_len) { |
1400 | 72 | size_t unit_len = calculate_unit_length(pce, subject + start_offset2); |
1401 | | |
1402 | 72 | start_offset2 += unit_len; |
1403 | 72 | } else { |
1404 | 11 | break; |
1405 | 11 | } |
1406 | 83 | } else { |
1407 | 3 | goto error; |
1408 | 3 | } |
1409 | 91 | } |
1410 | 3.50k | } else if (count == PCRE2_ERROR_NOMATCH) { |
1411 | 3.31k | break; |
1412 | 3.31k | } else { |
1413 | 196 | error: |
1414 | 196 | pcre_handle_exec_error(count); |
1415 | 196 | break; |
1416 | 193 | } |
1417 | | |
1418 | 225 | if (!global) { |
1419 | 225 | break; |
1420 | 225 | } |
1421 | | |
1422 | | /* Execute the regular expression. */ |
1423 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
1424 | | if ((pce->preg_options & PREG_JIT)) { |
1425 | | if (start_offset2 > subject_len) { |
1426 | | pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET); |
1427 | | break; |
1428 | | } |
1429 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2, |
1430 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
1431 | | } else |
1432 | | #endif |
1433 | 0 | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2, |
1434 | 0 | PCRE2_NO_UTF_CHECK, match_data, mctx); |
1435 | 0 | } |
1436 | 3.75k | if (match_data != mdata) { |
1437 | 554 | pcre2_match_data_free(match_data); |
1438 | 554 | } |
1439 | 3.75k | mdata_used = old_mdata_used; |
1440 | | |
1441 | | /* Add the match sets to the output array and clean up */ |
1442 | 3.75k | if (match_sets) { |
1443 | 0 | if (subpat_names) { |
1444 | 0 | for (i = 0; i < num_subpats; i++) { |
1445 | 0 | zval wrapper; |
1446 | 0 | ZVAL_ARR(&wrapper, match_sets[i]); |
1447 | 0 | if (subpat_names[i]) { |
1448 | 0 | zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i], &wrapper); |
1449 | 0 | GC_ADDREF(match_sets[i]); |
1450 | 0 | } |
1451 | 0 | zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &wrapper); |
1452 | 0 | } |
1453 | 0 | } else { |
1454 | 0 | for (i = 0; i < num_subpats; i++) { |
1455 | 0 | zval wrapper; |
1456 | 0 | ZVAL_ARR(&wrapper, match_sets[i]); |
1457 | 0 | zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &wrapper); |
1458 | 0 | } |
1459 | 0 | } |
1460 | 0 | efree(match_sets); |
1461 | |
|
1462 | 0 | if (marks) { |
1463 | 0 | zval tmp; |
1464 | 0 | ZVAL_ARR(&tmp, marks); |
1465 | 0 | zend_hash_str_update(Z_ARRVAL_P(subpats), "MARK", sizeof("MARK") - 1, &tmp); |
1466 | 0 | } |
1467 | 0 | } |
1468 | | |
1469 | 3.75k | if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) { |
1470 | | /* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */ |
1471 | 3.55k | if ((pce->compile_options & PCRE2_UTF) |
1472 | 1.03k | && !ZSTR_IS_INTERNED(subject_str) && orig_start_offset == 0) { |
1473 | 441 | GC_ADD_FLAGS(subject_str, IS_STR_VALID_UTF8); |
1474 | 441 | } |
1475 | | |
1476 | 3.55k | RETVAL_LONG(matched); |
1477 | 3.55k | } else { |
1478 | 196 | RETVAL_FALSE; |
1479 | 196 | } |
1480 | 3.75k | } |
1481 | | /* }}} */ |
1482 | | |
1483 | | /* {{{ Perform a Perl-style regular expression match */ |
1484 | | PHP_FUNCTION(preg_match) |
1485 | 4.78k | { |
1486 | 4.78k | php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, false); |
1487 | 4.78k | } |
1488 | | /* }}} */ |
1489 | | |
1490 | | ZEND_FRAMELESS_FUNCTION(preg_match, 2) |
1491 | 0 | { |
1492 | 0 | zval regex_tmp, subject_tmp; |
1493 | 0 | zend_string *regex, *subject; |
1494 | |
|
1495 | 0 | Z_FLF_PARAM_STR(1, regex, regex_tmp); |
1496 | 0 | Z_FLF_PARAM_STR(2, subject, subject_tmp); |
1497 | | |
1498 | | /* Compile regex or get it from cache. */ |
1499 | 0 | pcre_cache_entry *pce; |
1500 | 0 | if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) { |
1501 | 0 | RETVAL_FALSE; |
1502 | 0 | goto flf_clean; |
1503 | 0 | } |
1504 | | |
1505 | 0 | pce->refcount++; |
1506 | 0 | php_pcre_match_impl(pce, subject, return_value, /* subpats */ NULL, |
1507 | 0 | /* global */ false, /* flags */ 0, /* start_offset */ 0); |
1508 | 0 | pce->refcount--; |
1509 | |
|
1510 | 0 | flf_clean: |
1511 | 0 | Z_FLF_PARAM_FREE_STR(1, regex_tmp); |
1512 | 0 | Z_FLF_PARAM_FREE_STR(2, subject_tmp); |
1513 | 0 | } |
1514 | | |
1515 | | /* {{{ Perform a Perl-style global regular expression match */ |
1516 | | PHP_FUNCTION(preg_match_all) |
1517 | 0 | { |
1518 | 0 | php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, true); |
1519 | 0 | } |
1520 | | /* }}} */ |
1521 | | |
1522 | | /* {{{ preg_get_backref */ |
/*
 * Parse a back-reference at *str. The first character is the introducer
 * (skipped without inspection unless it starts the "${" form); it must be
 * followed by one or two decimal digits, optionally wrapped as "${N}" or
 * "${NN}".
 *
 * On success the reference number is stored in *backref, *str is advanced
 * past the reference and 1 is returned. On failure 0 is returned and *str is
 * left untouched; *backref may already have been written if digits were read
 * before a missing '}' was detected.
 */
static int preg_get_backref(char **str, int *backref)
{
	char *cursor = *str;
	int braced = 0;

	/* An introducer at the very end of the string cannot be a reference. */
	if (cursor[1] == '\0') {
		return 0;
	}

	/* Step over the introducer, and over '{' for the braced form. */
	if (cursor[0] == '$' && cursor[1] == '{') {
		braced = 1;
		cursor += 2;
	} else {
		cursor += 1;
	}

	/* First digit is mandatory. */
	if (*cursor < '0' || *cursor > '9') {
		return 0;
	}
	*backref = *cursor - '0';
	cursor++;

	/* Second digit is optional. */
	if (*cursor >= '0' && *cursor <= '9') {
		*backref = *backref * 10 + (*cursor - '0');
		cursor++;
	}

	if (braced) {
		if (*cursor != '}') {
			return 0;
		}
		cursor++;
	}

	*str = cursor;
	return 1;
}
1558 | | /* }}} */ |
1559 | | |
1560 | | /* Return NULL if an exception has occurred */ |
1561 | | static zend_string *preg_do_repl_func(zend_fcall_info *fci, zend_fcall_info_cache *fcc, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names, uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags) |
1562 | 377 | { |
1563 | 377 | zend_string *result_str = NULL; |
1564 | 377 | zval retval; /* Function return value */ |
1565 | 377 | zval arg; /* Argument to pass to function */ |
1566 | | |
1567 | 377 | array_init_size(&arg, count + (mark ? 1 : 0)); |
1568 | 377 | populate_subpat_array(Z_ARRVAL(arg), subject, offsets, subpat_names, num_subpats, count, mark, flags); |
1569 | | |
1570 | 377 | fci->retval = &retval; |
1571 | 377 | fci->param_count = 1; |
1572 | 377 | fci->params = &arg; |
1573 | 377 | zend_call_function(fci, fcc); |
1574 | 377 | zval_ptr_dtor(&arg); |
1575 | 377 | if (EXPECTED(Z_TYPE(retval) == IS_STRING)) { |
1576 | 42 | return Z_STR(retval); |
1577 | 42 | } |
1578 | | /* No Exception has occurred */ |
1579 | 335 | else if (EXPECTED(Z_TYPE(retval) != IS_UNDEF)) { |
1580 | 316 | result_str = zval_try_get_string_func(&retval); |
1581 | 316 | } |
1582 | 335 | zval_ptr_dtor(&retval); |
1583 | | |
1584 | 335 | return result_str; |
1585 | 377 | } |
1586 | | |
1587 | | /* {{{ php_pcre_replace */ |
1588 | | PHPAPI zend_string *php_pcre_replace(zend_string *regex, |
1589 | | zend_string *subject_str, |
1590 | | const char *subject, size_t subject_len, |
1591 | | zend_string *replace_str, |
1592 | | size_t limit, size_t *replace_count) |
1593 | 403 | { |
1594 | 403 | pcre_cache_entry *pce; /* Compiled regular expression */ |
1595 | 403 | zend_string *result; /* Function result */ |
1596 | | |
1597 | | /* Abort on pending exception, e.g. thrown from __toString(). */ |
1598 | 403 | if (UNEXPECTED(EG(exception))) { |
1599 | 0 | return NULL; |
1600 | 0 | } |
1601 | | |
1602 | | /* Compile regex or get it from cache. */ |
1603 | 403 | if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) { |
1604 | 208 | return NULL; |
1605 | 208 | } |
1606 | 195 | pce->refcount++; |
1607 | 195 | result = php_pcre_replace_impl(pce, subject_str, subject, subject_len, replace_str, |
1608 | 195 | limit, replace_count); |
1609 | 195 | pce->refcount--; |
1610 | | |
1611 | 195 | return result; |
1612 | 403 | } |
1613 | | /* }}} */ |
1614 | | |
1615 | | /* {{{ php_pcre_replace_impl() */ |
1616 | | PHPAPI zend_string *php_pcre_replace_impl(pcre_cache_entry *pce, zend_string *subject_str, const char *subject, size_t subject_len, zend_string *replace_str, size_t limit, size_t *replace_count) |
1617 | 195 | { |
1618 | 195 | uint32_t options; /* Execution options */ |
1619 | 195 | int count; /* Count of matched subpatterns */ |
1620 | 195 | uint32_t num_subpats; /* Number of captured subpatterns */ |
1621 | 195 | size_t new_len; /* Length of needed storage */ |
1622 | 195 | size_t alloc_len; /* Actual allocated length */ |
1623 | 195 | size_t match_len; /* Length of the current match */ |
1624 | 195 | int backref; /* Backreference number */ |
1625 | 195 | PCRE2_SIZE start_offset; /* Where the new search starts */ |
1626 | 195 | size_t last_end_offset; /* Where the last search ended */ |
1627 | 195 | char *walkbuf, /* Location of current replacement in the result */ |
1628 | 195 | *walk, /* Used to walk the replacement string */ |
1629 | 195 | walk_last; /* Last walked character */ |
1630 | 195 | const char *match, /* The current match */ |
1631 | 195 | *piece, /* The current piece of subject */ |
1632 | 195 | *replace_end; /* End of replacement string */ |
1633 | 195 | size_t result_len; /* Length of result */ |
1634 | 195 | zend_string *result; /* Result of replacement */ |
1635 | 195 | pcre2_match_data *match_data; |
1636 | 195 | bool old_mdata_used; |
1637 | | |
1638 | | /* Calculate the size of the offsets array, and allocate memory for it. */ |
1639 | 195 | num_subpats = pce->capture_count + 1; |
1640 | 195 | alloc_len = 0; |
1641 | 195 | result = NULL; |
1642 | | |
1643 | | /* Initialize */ |
1644 | 195 | match = NULL; |
1645 | 195 | start_offset = 0; |
1646 | 195 | last_end_offset = 0; |
1647 | 195 | result_len = 0; |
1648 | 195 | PCRE_G(error_code) = PHP_PCRE_NO_ERROR; |
1649 | | |
1650 | 195 | old_mdata_used = mdata_used; |
1651 | 195 | if (!old_mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) { |
1652 | 59 | mdata_used = true; |
1653 | 59 | match_data = mdata; |
1654 | 136 | } else { |
1655 | 136 | match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm)); |
1656 | 136 | if (!match_data) { |
1657 | 0 | PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR; |
1658 | 0 | return NULL; |
1659 | 0 | } |
1660 | 136 | } |
1661 | | |
1662 | 195 | options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK; |
1663 | | |
1664 | | /* Array of subpattern offsets */ |
1665 | 195 | PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data); |
1666 | | |
1667 | | /* Execute the regular expression. */ |
1668 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
1669 | | if ((pce->preg_options & PREG_JIT) && options) { |
1670 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset, |
1671 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
1672 | | } else |
1673 | | #endif |
1674 | 195 | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset, |
1675 | 195 | options, match_data, mctx); |
1676 | | |
1677 | 858 | while (1) { |
1678 | 858 | piece = subject + last_end_offset; |
1679 | | |
1680 | 858 | if (count >= 0 && limit > 0) { |
1681 | 685 | bool simple_string; |
1682 | | |
1683 | | /* Check for too many substrings condition. */ |
1684 | 685 | if (UNEXPECTED(count == 0)) { |
1685 | 0 | php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings"); |
1686 | 0 | count = num_subpats; |
1687 | 0 | } |
1688 | | |
1689 | 717 | matched: |
1690 | 717 | if (UNEXPECTED(offsets[1] < offsets[0])) { |
1691 | 0 | PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR; |
1692 | 0 | if (result) { |
1693 | 0 | zend_string_release_ex(result, 0); |
1694 | 0 | result = NULL; |
1695 | 0 | } |
1696 | 0 | break; |
1697 | 0 | } |
1698 | | |
1699 | 717 | if (replace_count) { |
1700 | 717 | ++*replace_count; |
1701 | 717 | } |
1702 | | |
1703 | | /* Set the match location in subject */ |
1704 | 717 | match = subject + offsets[0]; |
1705 | | |
1706 | 717 | new_len = result_len + offsets[0] - last_end_offset; /* part before the match */ |
1707 | | |
1708 | 717 | walk = ZSTR_VAL(replace_str); |
1709 | 717 | replace_end = walk + ZSTR_LEN(replace_str); |
1710 | 717 | walk_last = 0; |
1711 | 717 | simple_string = true; |
1712 | 1.22k | while (walk < replace_end) { |
1713 | 509 | if ('\\' == *walk || '$' == *walk) { |
1714 | 5 | simple_string = false; |
1715 | 5 | if (walk_last == '\\') { |
1716 | 0 | walk++; |
1717 | 0 | walk_last = 0; |
1718 | 0 | continue; |
1719 | 0 | } |
1720 | 5 | if (preg_get_backref(&walk, &backref)) { |
1721 | 0 | if (backref < count) |
1722 | 0 | new_len += offsets[(backref<<1)+1] - offsets[backref<<1]; |
1723 | 0 | continue; |
1724 | 0 | } |
1725 | 5 | } |
1726 | 509 | new_len++; |
1727 | 509 | walk++; |
1728 | 509 | walk_last = walk[-1]; |
1729 | 509 | } |
1730 | | |
1731 | 717 | if (new_len >= alloc_len) { |
1732 | 203 | alloc_len = zend_safe_address_guarded(2, new_len, ZSTR_MAX_OVERHEAD) - ZSTR_MAX_OVERHEAD; |
1733 | 203 | if (result == NULL) { |
1734 | 96 | result = zend_string_alloc(alloc_len, 0); |
1735 | 107 | } else { |
1736 | 107 | result = zend_string_extend(result, alloc_len, 0); |
1737 | 107 | } |
1738 | 203 | } |
1739 | | |
1740 | 717 | if (match-piece > 0) { |
1741 | | /* copy the part of the string before the match */ |
1742 | 623 | memcpy(&ZSTR_VAL(result)[result_len], piece, match-piece); |
1743 | 623 | result_len += (match-piece); |
1744 | 623 | } |
1745 | | |
1746 | 717 | if (simple_string) { |
1747 | | /* copy replacement */ |
1748 | 712 | memcpy(&ZSTR_VAL(result)[result_len], ZSTR_VAL(replace_str), ZSTR_LEN(replace_str)+1); |
1749 | 712 | result_len += ZSTR_LEN(replace_str); |
1750 | 712 | } else { |
1751 | | /* copy replacement and backrefs */ |
1752 | 5 | walkbuf = ZSTR_VAL(result) + result_len; |
1753 | | |
1754 | 5 | walk = ZSTR_VAL(replace_str); |
1755 | 5 | walk_last = 0; |
1756 | 102 | while (walk < replace_end) { |
1757 | 97 | if ('\\' == *walk || '$' == *walk) { |
1758 | 5 | if (walk_last == '\\') { |
1759 | 0 | *(walkbuf-1) = *walk++; |
1760 | 0 | walk_last = 0; |
1761 | 0 | continue; |
1762 | 0 | } |
1763 | 5 | if (preg_get_backref(&walk, &backref)) { |
1764 | 0 | if (backref < count) { |
1765 | 0 | if (offsets[backref<<1] < SIZE_MAX) { |
1766 | 0 | match_len = offsets[(backref<<1)+1] - offsets[backref<<1]; |
1767 | 0 | walkbuf = zend_mempcpy(walkbuf, subject + offsets[backref << 1], match_len); |
1768 | 0 | } |
1769 | 0 | } |
1770 | 0 | continue; |
1771 | 0 | } |
1772 | 5 | } |
1773 | 97 | *walkbuf++ = *walk++; |
1774 | 97 | walk_last = walk[-1]; |
1775 | 97 | } |
1776 | 5 | *walkbuf = '\0'; |
1777 | | /* increment the result length by how much we've added to the string */ |
1778 | 5 | result_len += (walkbuf - (ZSTR_VAL(result) + result_len)); |
1779 | 5 | } |
1780 | | |
1781 | 717 | limit--; |
1782 | | |
1783 | | /* Advance to the next piece. */ |
1784 | 717 | start_offset = last_end_offset = offsets[1]; |
1785 | | |
1786 | | /* If we have matched an empty string, mimic what Perl's /g options does. |
1787 | | This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try |
1788 | | the match again at the same point. If this fails (picked up above) we |
1789 | | advance to the next character. */ |
1790 | 717 | if (start_offset == offsets[0]) { |
1791 | 603 | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset, |
1792 | 603 | PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx); |
1793 | | |
1794 | 603 | piece = subject + start_offset; |
1795 | 603 | if (count >= 0 && limit > 0) { |
1796 | 32 | goto matched; |
1797 | 571 | } else if (count == PCRE2_ERROR_NOMATCH || limit == 0) { |
1798 | | /* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match, |
1799 | | this is not necessarily the end. We need to advance |
1800 | | the start offset, and continue. Fudge the offset values |
1801 | | to achieve this, unless we're already at the end of the string. */ |
1802 | 571 | if (start_offset < subject_len) { |
1803 | 549 | size_t unit_len = calculate_unit_length(pce, piece); |
1804 | 549 | start_offset += unit_len; |
1805 | 549 | } else { |
1806 | 22 | goto not_matched; |
1807 | 22 | } |
1808 | 571 | } else { |
1809 | 0 | goto error; |
1810 | 0 | } |
1811 | 603 | } |
1812 | | |
1813 | 717 | } else if (count == PCRE2_ERROR_NOMATCH || limit == 0) { |
1814 | 195 | not_matched: |
1815 | 195 | if (!result && subject_str) { |
1816 | 99 | result = zend_string_copy(subject_str); |
1817 | 99 | break; |
1818 | 99 | } |
1819 | | /* now we know exactly how long it is */ |
1820 | 96 | alloc_len = result_len + subject_len - last_end_offset; |
1821 | 96 | if (NULL != result) { |
1822 | 96 | result = zend_string_realloc(result, alloc_len, 0); |
1823 | 96 | } else { |
1824 | 0 | result = zend_string_alloc(alloc_len, 0); |
1825 | 0 | } |
1826 | | /* stick that last bit of string on our output */ |
1827 | 96 | memcpy(ZSTR_VAL(result) + result_len, piece, subject_len - last_end_offset); |
1828 | 96 | result_len += subject_len - last_end_offset; |
1829 | 96 | ZSTR_VAL(result)[result_len] = '\0'; |
1830 | 96 | ZSTR_LEN(result) = result_len; |
1831 | 96 | break; |
1832 | 195 | } else { |
1833 | 0 | error: |
1834 | 0 | pcre_handle_exec_error(count); |
1835 | 0 | if (result) { |
1836 | 0 | zend_string_release_ex(result, 0); |
1837 | 0 | result = NULL; |
1838 | 0 | } |
1839 | 0 | break; |
1840 | 0 | } |
1841 | | |
1842 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
1843 | | if (pce->preg_options & PREG_JIT) { |
1844 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset, |
1845 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
1846 | | } else |
1847 | | #endif |
1848 | 663 | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset, |
1849 | 663 | PCRE2_NO_UTF_CHECK, match_data, mctx); |
1850 | 663 | } |
1851 | 195 | if (match_data != mdata) { |
1852 | 136 | pcre2_match_data_free(match_data); |
1853 | 136 | } |
1854 | 195 | mdata_used = old_mdata_used; |
1855 | | |
1856 | 195 | return result; |
1857 | 195 | } |
1858 | | /* }}} */ |
1859 | | |
1860 | | static zend_string *php_pcre_replace_func_impl(pcre_cache_entry *pce, zend_string *subject_str, |
1861 | | zend_fcall_info *fci, zend_fcall_info_cache *fcc, |
1862 | | size_t limit, size_t *replace_count, zend_long flags |
1863 | 164 | ) { |
1864 | 164 | uint32_t options; /* Execution options */ |
1865 | 164 | int count; /* Count of matched subpatterns */ |
1866 | 164 | zend_string **subpat_names; /* Array for named subpatterns */ |
1867 | 164 | uint32_t num_subpats; /* Number of captured subpatterns */ |
1868 | 164 | size_t alloc_len; /* Actual allocated length */ |
1869 | 164 | PCRE2_SIZE start_offset; /* Where the new search starts */ |
1870 | 164 | size_t last_end_offset; /* Where the last search ended */ |
1871 | 164 | const char *match, /* The current match */ |
1872 | 164 | *piece; /* The current piece of subject */ |
1873 | 164 | size_t result_len; /* Length of result */ |
1874 | 164 | zend_string *result; /* Result of replacement */ |
1875 | 164 | pcre2_match_data *match_data; |
1876 | 164 | bool old_mdata_used; |
1877 | | |
1878 | | /* Calculate the size of the offsets array, and allocate memory for it. */ |
1879 | 164 | num_subpats = pce->capture_count + 1; |
1880 | 164 | if (pce->name_count > 0) { |
1881 | 0 | subpat_names = ensure_subpats_table(pce->name_count, pce); |
1882 | 0 | if (UNEXPECTED(!subpat_names)) { |
1883 | 0 | return NULL; |
1884 | 0 | } |
1885 | 164 | } else { |
1886 | 164 | subpat_names = NULL; |
1887 | 164 | } |
1888 | | |
1889 | 164 | alloc_len = 0; |
1890 | 164 | result = NULL; |
1891 | | |
1892 | | /* Initialize */ |
1893 | 164 | match = NULL; |
1894 | 164 | start_offset = 0; |
1895 | 164 | last_end_offset = 0; |
1896 | 164 | result_len = 0; |
1897 | 164 | PCRE_G(error_code) = PHP_PCRE_NO_ERROR; |
1898 | | |
1899 | 164 | old_mdata_used = mdata_used; |
1900 | 164 | if (!old_mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) { |
1901 | 128 | mdata_used = 1; |
1902 | 128 | match_data = mdata; |
1903 | 128 | } else { |
1904 | 36 | match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm)); |
1905 | 36 | if (!match_data) { |
1906 | 0 | PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR; |
1907 | 0 | mdata_used = old_mdata_used; |
1908 | 0 | return NULL; |
1909 | 0 | } |
1910 | 36 | } |
1911 | | |
1912 | 164 | options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK; |
1913 | | |
1914 | | /* Array of subpattern offsets */ |
1915 | 164 | PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data); |
1916 | | |
1917 | | /* Execute the regular expression. */ |
1918 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
1919 | | if ((pce->preg_options & PREG_JIT) && options) { |
1920 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), start_offset, |
1921 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
1922 | | } else |
1923 | | #endif |
1924 | 164 | count = pcre2_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), start_offset, |
1925 | 164 | options, match_data, mctx); |
1926 | | |
1927 | 516 | while (1) { |
1928 | 514 | piece = ZSTR_VAL(subject_str) + last_end_offset; |
1929 | | |
1930 | 514 | if (count >= 0 && limit) { |
1931 | | /* Check for too many substrings condition. */ |
1932 | 377 | if (UNEXPECTED(count == 0)) { |
1933 | 0 | php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings"); |
1934 | 0 | count = num_subpats; |
1935 | 0 | } |
1936 | | |
1937 | 377 | matched: |
1938 | 377 | if (UNEXPECTED(offsets[1] < offsets[0])) { |
1939 | 0 | PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR; |
1940 | 0 | if (result) { |
1941 | 0 | zend_string_release_ex(result, 0); |
1942 | 0 | result = NULL; |
1943 | 0 | } |
1944 | 0 | break; |
1945 | 0 | } |
1946 | | |
1947 | 377 | if (replace_count) { |
1948 | 377 | ++*replace_count; |
1949 | 377 | } |
1950 | | |
1951 | | /* Set the match location in subject */ |
1952 | 377 | match = ZSTR_VAL(subject_str) + offsets[0]; |
1953 | | |
1954 | | /* Length of needed storage */ |
1955 | 377 | size_t new_len = result_len + offsets[0] - last_end_offset; /* part before the match */ |
1956 | | |
1957 | | /* Use custom function to get replacement string and its length. */ |
1958 | 377 | zend_string *eval_result = preg_do_repl_func( |
1959 | 377 | fci, fcc, ZSTR_VAL(subject_str), offsets, subpat_names, num_subpats, count, |
1960 | 377 | pcre2_get_mark(match_data), flags); |
1961 | | |
1962 | 377 | if (UNEXPECTED(eval_result == NULL)) { |
1963 | 17 | goto error; |
1964 | 17 | } |
1965 | 360 | new_len = zend_safe_address_guarded(1, ZSTR_LEN(eval_result) + ZSTR_MAX_OVERHEAD, new_len) -ZSTR_MAX_OVERHEAD; |
1966 | 360 | if (new_len >= alloc_len) { |
1967 | 210 | alloc_len = zend_safe_address_guarded(2, new_len, ZSTR_MAX_OVERHEAD) - ZSTR_MAX_OVERHEAD; |
1968 | 210 | if (result == NULL) { |
1969 | 110 | result = zend_string_alloc(alloc_len, 0); |
1970 | 110 | } else { |
1971 | 100 | result = zend_string_extend(result, alloc_len, 0); |
1972 | 100 | } |
1973 | 210 | } |
1974 | | |
1975 | 360 | if (match-piece > 0) { |
1976 | | /* copy the part of the string before the match */ |
1977 | 348 | memcpy(ZSTR_VAL(result) + result_len, piece, match-piece); |
1978 | 348 | result_len += (match-piece); |
1979 | 348 | } |
1980 | | |
1981 | | /* If using custom function, copy result to the buffer and clean up. */ |
1982 | 360 | memcpy(ZSTR_VAL(result) + result_len, ZSTR_VAL(eval_result), ZSTR_LEN(eval_result)); |
1983 | 360 | result_len += ZSTR_LEN(eval_result); |
1984 | 360 | zend_string_release_ex(eval_result, 0); |
1985 | | |
1986 | 360 | limit--; |
1987 | | |
1988 | | /* Advance to the next piece. */ |
1989 | 360 | start_offset = last_end_offset = offsets[1]; |
1990 | | |
1991 | | /* If we have matched an empty string, mimic what Perl's /g options does. |
1992 | | This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try |
1993 | | the match again at the same point. If this fails (picked up above) we |
1994 | | advance to the next character. */ |
1995 | 360 | if (start_offset == offsets[0]) { |
1996 | 156 | count = pcre2_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), start_offset, |
1997 | 156 | PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx); |
1998 | | |
1999 | 156 | piece = ZSTR_VAL(subject_str) + start_offset; |
2000 | 156 | if (count >= 0 && limit) { |
2001 | 0 | goto matched; |
2002 | 156 | } else if (count == PCRE2_ERROR_NOMATCH || limit == 0) { |
2003 | | /* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match, |
2004 | | this is not necessarily the end. We need to advance |
2005 | | the start offset, and continue. Fudge the offset values |
2006 | | to achieve this, unless we're already at the end of the string. */ |
2007 | 156 | if (start_offset < ZSTR_LEN(subject_str)) { |
2008 | 148 | size_t unit_len = calculate_unit_length(pce, piece); |
2009 | 148 | start_offset += unit_len; |
2010 | 148 | } else { |
2011 | 8 | goto not_matched; |
2012 | 8 | } |
2013 | 156 | } else { |
2014 | 0 | goto error; |
2015 | 0 | } |
2016 | 156 | } |
2017 | | |
2018 | 360 | } else if (count == PCRE2_ERROR_NOMATCH || limit == 0) { |
2019 | 145 | not_matched: |
2020 | 145 | if (result == NULL) { |
2021 | 37 | result = zend_string_copy(subject_str); |
2022 | 37 | break; |
2023 | 37 | } |
2024 | | /* now we know exactly how long it is */ |
2025 | 108 | size_t segment_len = ZSTR_LEN(subject_str) - last_end_offset; |
2026 | 108 | alloc_len = result_len + segment_len; |
2027 | 108 | result = zend_string_realloc(result, alloc_len, 0); |
2028 | | /* stick that last bit of string on our output */ |
2029 | 108 | memcpy(ZSTR_VAL(result) + result_len, piece, segment_len); |
2030 | 108 | result_len += segment_len; |
2031 | 108 | ZSTR_VAL(result)[result_len] = '\0'; |
2032 | 108 | ZSTR_LEN(result) = result_len; |
2033 | 108 | break; |
2034 | 145 | } else { |
2035 | 17 | error: |
2036 | 17 | pcre_handle_exec_error(count); |
2037 | 17 | if (result) { |
2038 | 0 | zend_string_release_ex(result, 0); |
2039 | 0 | result = NULL; |
2040 | 0 | } |
2041 | 17 | break; |
2042 | 0 | } |
2043 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
2044 | | if ((pce->preg_options & PREG_JIT)) { |
2045 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), start_offset, |
2046 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
2047 | | } else |
2048 | | #endif |
2049 | 352 | count = pcre2_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), start_offset, |
2050 | 352 | PCRE2_NO_UTF_CHECK, match_data, mctx); |
2051 | 352 | } |
2052 | 164 | if (match_data != mdata) { |
2053 | 35 | pcre2_match_data_free(match_data); |
2054 | 35 | } |
2055 | 164 | mdata_used = old_mdata_used; |
2056 | | |
2057 | 164 | return result; |
2058 | 164 | } |
2059 | | |
2060 | | static zend_always_inline zend_string *php_pcre_replace_func(zend_string *regex, |
2061 | | zend_string *subject_str, |
2062 | | zend_fcall_info *fci, zend_fcall_info_cache *fcc, |
2063 | | size_t limit, size_t *replace_count, zend_long flags) |
2064 | 175 | { |
2065 | 175 | pcre_cache_entry *pce; /* Compiled regular expression */ |
2066 | 175 | zend_string *result; /* Function result */ |
2067 | | |
2068 | | /* Compile regex or get it from cache. */ |
2069 | 175 | if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) { |
2070 | 11 | return NULL; |
2071 | 11 | } |
2072 | 164 | pce->refcount++; |
2073 | 164 | result = php_pcre_replace_func_impl(pce, subject_str, fci, fcc, limit, replace_count, flags); |
2074 | 164 | pce->refcount--; |
2075 | | |
2076 | 164 | return result; |
2077 | 175 | } |
2078 | | |
2079 | | /* {{{ php_pcre_replace_array */ |
2080 | | static zend_string *php_pcre_replace_array(HashTable *regex, |
2081 | | zend_string *replace_str, HashTable *replace_ht, |
2082 | | zend_string *subject_str, size_t limit, size_t *replace_count) |
2083 | 0 | { |
2084 | 0 | zval *regex_entry; |
2085 | 0 | zend_string *result; |
2086 | |
|
2087 | 0 | zend_string_addref(subject_str); |
2088 | |
|
2089 | 0 | if (replace_ht) { |
2090 | 0 | uint32_t replace_idx = 0; |
2091 | | |
2092 | | /* For each entry in the regex array, get the entry */ |
2093 | 0 | ZEND_HASH_FOREACH_VAL(regex, regex_entry) { |
2094 | | /* Make sure we're dealing with strings. */ |
2095 | 0 | zend_string *tmp_regex_str; |
2096 | 0 | zend_string *regex_str = zval_get_tmp_string(regex_entry, &tmp_regex_str); |
2097 | 0 | zend_string *replace_entry_str, *tmp_replace_entry_str; |
2098 | 0 | zval *zv; |
2099 | | |
2100 | | /* Get current entry */ |
2101 | 0 | while (1) { |
2102 | 0 | if (replace_idx == replace_ht->nNumUsed) { |
2103 | 0 | replace_entry_str = ZSTR_EMPTY_ALLOC(); |
2104 | 0 | tmp_replace_entry_str = NULL; |
2105 | 0 | break; |
2106 | 0 | } |
2107 | 0 | zv = ZEND_HASH_ELEMENT(replace_ht, replace_idx); |
2108 | 0 | replace_idx++; |
2109 | 0 | if (Z_TYPE_P(zv) != IS_UNDEF) { |
2110 | 0 | replace_entry_str = zval_get_tmp_string(zv, &tmp_replace_entry_str); |
2111 | 0 | break; |
2112 | 0 | } |
2113 | 0 | } |
2114 | | |
2115 | | /* Do the actual replacement and put the result back into subject_str |
2116 | | for further replacements. */ |
2117 | 0 | result = php_pcre_replace(regex_str, subject_str, ZSTR_VAL(subject_str), |
2118 | 0 | ZSTR_LEN(subject_str), replace_entry_str, limit, replace_count); |
2119 | 0 | zend_tmp_string_release(tmp_replace_entry_str); |
2120 | 0 | zend_tmp_string_release(tmp_regex_str); |
2121 | 0 | zend_string_release_ex(subject_str, 0); |
2122 | 0 | subject_str = result; |
2123 | 0 | if (UNEXPECTED(result == NULL)) { |
2124 | 0 | break; |
2125 | 0 | } |
2126 | 0 | } ZEND_HASH_FOREACH_END(); |
2127 | |
|
2128 | 0 | } else { |
2129 | 0 | ZEND_ASSERT(replace_str != NULL); |
2130 | | |
2131 | | /* For each entry in the regex array, get the entry */ |
2132 | 0 | ZEND_HASH_FOREACH_VAL(regex, regex_entry) { |
2133 | | /* Make sure we're dealing with strings. */ |
2134 | 0 | zend_string *tmp_regex_str; |
2135 | 0 | zend_string *regex_str = zval_get_tmp_string(regex_entry, &tmp_regex_str); |
2136 | | |
2137 | | /* Do the actual replacement and put the result back into subject_str |
2138 | | for further replacements. */ |
2139 | 0 | result = php_pcre_replace(regex_str, subject_str, ZSTR_VAL(subject_str), |
2140 | 0 | ZSTR_LEN(subject_str), replace_str, limit, replace_count); |
2141 | 0 | zend_tmp_string_release(tmp_regex_str); |
2142 | 0 | zend_string_release_ex(subject_str, 0); |
2143 | 0 | subject_str = result; |
2144 | |
|
2145 | 0 | if (UNEXPECTED(result == NULL)) { |
2146 | 0 | break; |
2147 | 0 | } |
2148 | 0 | } ZEND_HASH_FOREACH_END(); |
2149 | 0 | } |
2150 | |
|
2151 | 0 | return subject_str; |
2152 | 0 | } |
2153 | | /* }}} */ |
2154 | | |
2155 | | /* {{{ php_replace_in_subject */ |
2156 | | static zend_always_inline zend_string *php_replace_in_subject( |
2157 | | zend_string *regex_str, HashTable *regex_ht, |
2158 | | zend_string *replace_str, HashTable *replace_ht, |
2159 | | zend_string *subject, size_t limit, size_t *replace_count) |
2160 | 403 | { |
2161 | 403 | zend_string *result; |
2162 | | |
2163 | 403 | if (regex_str) { |
2164 | 403 | ZEND_ASSERT(replace_str != NULL); |
2165 | 403 | result = php_pcre_replace(regex_str, subject, ZSTR_VAL(subject), ZSTR_LEN(subject), |
2166 | 403 | replace_str, limit, replace_count); |
2167 | 403 | } else { |
2168 | 0 | ZEND_ASSERT(regex_ht != NULL); |
2169 | 0 | result = php_pcre_replace_array(regex_ht, replace_str, replace_ht, subject, |
2170 | 0 | limit, replace_count); |
2171 | 0 | } |
2172 | 403 | return result; |
2173 | 403 | } |
2174 | | /* }}} */ |
2175 | | |
2176 | | static zend_string *php_replace_in_subject_func(zend_string *regex_str, const HashTable *regex_ht, |
2177 | | zend_fcall_info *fci, zend_fcall_info_cache *fcc, |
2178 | | zend_string *subject, size_t limit, size_t *replace_count, zend_long flags) |
2179 | 175 | { |
2180 | 175 | zend_string *result; |
2181 | | |
2182 | 175 | if (regex_str) { |
2183 | 175 | result = php_pcre_replace_func(regex_str, subject, fci, fcc, limit, replace_count, flags); |
2184 | 175 | return result; |
2185 | 175 | } else { |
2186 | | /* If regex is an array */ |
2187 | 0 | zval *regex_entry; |
2188 | |
|
2189 | 0 | ZEND_ASSERT(regex_ht != NULL); |
2190 | |
|
2191 | 0 | zend_string_addref(subject); |
2192 | | |
2193 | | /* For each entry in the regex array, get the entry */ |
2194 | 0 | ZEND_HASH_FOREACH_VAL(regex_ht, regex_entry) { |
2195 | | /* Make sure we're dealing with strings. */ |
2196 | 0 | zend_string *tmp_regex_entry_str; |
2197 | 0 | zend_string *regex_entry_str = zval_try_get_tmp_string(regex_entry, &tmp_regex_entry_str); |
2198 | 0 | if (UNEXPECTED(regex_entry_str == NULL)) { |
2199 | 0 | break; |
2200 | 0 | } |
2201 | | |
2202 | | /* Do the actual replacement and put the result back into subject |
2203 | | for further replacements. */ |
2204 | 0 | result = php_pcre_replace_func( |
2205 | 0 | regex_entry_str, subject, fci, fcc, limit, replace_count, flags); |
2206 | 0 | zend_tmp_string_release(tmp_regex_entry_str); |
2207 | 0 | zend_string_release(subject); |
2208 | 0 | subject = result; |
2209 | 0 | if (UNEXPECTED(result == NULL)) { |
2210 | 0 | break; |
2211 | 0 | } |
2212 | 0 | } ZEND_HASH_FOREACH_END(); |
2213 | |
|
2214 | 0 | return subject; |
2215 | 0 | } |
2216 | 175 | } |
2217 | | |
2218 | | static size_t php_preg_replace_func_impl(zval *return_value, |
2219 | | zend_string *regex_str, const HashTable *regex_ht, |
2220 | | zend_fcall_info *fci, zend_fcall_info_cache *fcc, |
2221 | | zend_string *subject_str, const HashTable *subject_ht, zend_long limit_val, zend_long flags) |
2222 | 175 | { |
2223 | 175 | zend_string *result; |
2224 | 175 | size_t replace_count = 0; |
2225 | | |
2226 | 175 | if (subject_str) { |
2227 | 175 | result = php_replace_in_subject_func( |
2228 | 175 | regex_str, regex_ht, fci, fcc, subject_str, limit_val, &replace_count, flags); |
2229 | 175 | if (result != NULL) { |
2230 | 145 | RETVAL_STR(result); |
2231 | 145 | } else { |
2232 | 30 | RETVAL_NULL(); |
2233 | 30 | } |
2234 | 175 | } else { |
2235 | | /* if subject is an array */ |
2236 | 0 | zval *subject_entry, zv; |
2237 | 0 | zend_string *string_key; |
2238 | 0 | zend_ulong num_key; |
2239 | |
|
2240 | 0 | ZEND_ASSERT(subject_ht != NULL); |
2241 | |
|
2242 | 0 | array_init_size(return_value, zend_hash_num_elements(subject_ht)); |
2243 | 0 | HashTable *return_value_ht = Z_ARRVAL_P(return_value); |
2244 | | |
2245 | | /* For each subject entry, convert it to string, then perform replacement |
2246 | | and add the result to the return_value array. */ |
2247 | 0 | ZEND_HASH_FOREACH_KEY_VAL(subject_ht, num_key, string_key, subject_entry) { |
2248 | 0 | zend_string *tmp_subject_entry_str; |
2249 | 0 | zend_string *subject_entry_str = zval_try_get_tmp_string(subject_entry, &tmp_subject_entry_str); |
2250 | 0 | if (UNEXPECTED(subject_entry_str == NULL)) { |
2251 | 0 | break; |
2252 | 0 | } |
2253 | | |
2254 | 0 | result = php_replace_in_subject_func( |
2255 | 0 | regex_str, regex_ht, fci, fcc, subject_entry_str, limit_val, &replace_count, flags); |
2256 | 0 | if (result != NULL) { |
2257 | | /* Add to return array */ |
2258 | 0 | ZVAL_STR(&zv, result); |
2259 | 0 | if (string_key) { |
2260 | 0 | zend_hash_add_new(return_value_ht, string_key, &zv); |
2261 | 0 | } else { |
2262 | 0 | zend_hash_index_add_new(return_value_ht, num_key, &zv); |
2263 | 0 | } |
2264 | 0 | } |
2265 | 0 | zend_tmp_string_release(tmp_subject_entry_str); |
2266 | 0 | } ZEND_HASH_FOREACH_END(); |
2267 | 0 | } |
2268 | | |
2269 | 175 | return replace_count; |
2270 | 175 | } |
2271 | | |
2272 | | static void _preg_replace_common( |
2273 | | zval *return_value, |
2274 | | HashTable *regex_ht, zend_string *regex_str, |
2275 | | HashTable *replace_ht, zend_string *replace_str, |
2276 | | HashTable *subject_ht, zend_string *subject_str, |
2277 | | zend_long limit, |
2278 | | zval *zcount, |
2279 | | bool is_filter |
2280 | 403 | ) { |
2281 | 403 | size_t replace_count = 0; |
2282 | 403 | zend_string *result; |
2283 | 403 | size_t old_replace_count; |
2284 | | |
2285 | | /* If replace is an array then the regex argument needs to also be an array */ |
2286 | 403 | if (replace_ht && !regex_ht) { |
2287 | 0 | zend_argument_type_error(1, "must be of type array when argument #2 ($replacement) is an array, string given"); |
2288 | 0 | RETURN_THROWS(); |
2289 | 0 | } |
2290 | | |
2291 | 403 | if (subject_str) { |
2292 | 403 | old_replace_count = replace_count; |
2293 | 403 | result = php_replace_in_subject(regex_str, regex_ht, replace_str, replace_ht, |
2294 | 403 | subject_str, limit, &replace_count); |
2295 | 403 | if (result != NULL) { |
2296 | 195 | if (!is_filter || replace_count > old_replace_count) { |
2297 | 195 | RETVAL_STR(result); |
2298 | 195 | } else { |
2299 | 0 | zend_string_release_ex(result, 0); |
2300 | 0 | RETVAL_NULL(); |
2301 | 0 | } |
2302 | 208 | } else { |
2303 | 208 | RETVAL_NULL(); |
2304 | 208 | } |
2305 | 403 | } else { |
2306 | | /* if subject is an array */ |
2307 | 0 | zval *subject_entry, zv; |
2308 | 0 | zend_string *string_key; |
2309 | 0 | zend_ulong num_key; |
2310 | |
|
2311 | 0 | ZEND_ASSERT(subject_ht != NULL); |
2312 | |
|
2313 | 0 | array_init_size(return_value, zend_hash_num_elements(subject_ht)); |
2314 | 0 | HashTable *return_value_ht = Z_ARRVAL_P(return_value); |
2315 | | |
2316 | | /* For each subject entry, convert it to string, then perform replacement |
2317 | | and add the result to the return_value array. */ |
2318 | 0 | ZEND_HASH_FOREACH_KEY_VAL(subject_ht, num_key, string_key, subject_entry) { |
2319 | 0 | old_replace_count = replace_count; |
2320 | 0 | zend_string *tmp_subject_entry_str; |
2321 | 0 | zend_string *subject_entry_str = zval_get_tmp_string(subject_entry, &tmp_subject_entry_str); |
2322 | 0 | result = php_replace_in_subject(regex_str, regex_ht, replace_str, replace_ht, |
2323 | 0 | subject_entry_str, limit, &replace_count); |
2324 | |
|
2325 | 0 | if (result != NULL) { |
2326 | 0 | if (!is_filter || replace_count > old_replace_count) { |
2327 | | /* Add to return array */ |
2328 | 0 | ZVAL_STR(&zv, result); |
2329 | 0 | if (string_key) { |
2330 | 0 | zend_hash_add_new(return_value_ht, string_key, &zv); |
2331 | 0 | } else { |
2332 | 0 | zend_hash_index_add_new(return_value_ht, num_key, &zv); |
2333 | 0 | } |
2334 | 0 | } else { |
2335 | 0 | zend_string_release_ex(result, 0); |
2336 | 0 | } |
2337 | 0 | } |
2338 | 0 | zend_tmp_string_release(tmp_subject_entry_str); |
2339 | 0 | } ZEND_HASH_FOREACH_END(); |
2340 | 0 | } |
2341 | | |
2342 | 403 | if (zcount) { |
2343 | 0 | ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count); |
2344 | 0 | } |
2345 | 403 | } |
2346 | | |
2347 | | /* {{{ preg_replace_common */ |
2348 | | static void preg_replace_common(INTERNAL_FUNCTION_PARAMETERS, bool is_filter) |
2349 | 406 | { |
2350 | 406 | zend_string *regex_str, *replace_str, *subject_str; |
2351 | 406 | HashTable *regex_ht, *replace_ht, *subject_ht; |
2352 | 406 | zend_long limit = -1; |
2353 | 406 | zval *zcount = NULL; |
2354 | | |
2355 | | /* Get function parameters and do error-checking. */ |
2356 | 1.21k | ZEND_PARSE_PARAMETERS_START(3, 5) |
2357 | 2.02k | Z_PARAM_ARRAY_HT_OR_STR(regex_ht, regex_str) |
2358 | 2.02k | Z_PARAM_ARRAY_HT_OR_STR(replace_ht, replace_str) |
2359 | 2.02k | Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str) |
2360 | 2.02k | Z_PARAM_OPTIONAL |
2361 | 2.02k | Z_PARAM_LONG(limit) |
2362 | 978 | Z_PARAM_ZVAL(zcount) |
2363 | 978 | ZEND_PARSE_PARAMETERS_END(); |
2364 | | |
2365 | 403 | _preg_replace_common( |
2366 | 403 | return_value, |
2367 | 403 | regex_ht, regex_str, |
2368 | 403 | replace_ht, replace_str, |
2369 | 403 | subject_ht, subject_str, |
2370 | 403 | limit, zcount, is_filter); |
2371 | 403 | } |
2372 | | /* }}} */ |
2373 | | |
2374 | | /* {{{ Perform Perl-style regular expression replacement. */ |
2375 | | PHP_FUNCTION(preg_replace) |
2376 | 406 | { |
2377 | 406 | preg_replace_common(INTERNAL_FUNCTION_PARAM_PASSTHRU, false); |
2378 | 406 | } |
2379 | | /* }}} */ |
2380 | | |
2381 | | ZEND_FRAMELESS_FUNCTION(preg_replace, 3) |
2382 | 0 | { |
2383 | 0 | zend_string *regex_str, *replace_str, *subject_str; |
2384 | 0 | HashTable *regex_ht, *replace_ht, *subject_ht; |
2385 | 0 | zval regex_tmp, replace_tmp, subject_tmp; |
2386 | |
|
2387 | 0 | Z_FLF_PARAM_ARRAY_HT_OR_STR(1, regex_ht, regex_str, regex_tmp); |
2388 | 0 | Z_FLF_PARAM_ARRAY_HT_OR_STR(2, replace_ht, replace_str, replace_tmp); |
2389 | 0 | Z_FLF_PARAM_ARRAY_HT_OR_STR(3, subject_ht, subject_str, subject_tmp); |
2390 | |
|
2391 | 0 | _preg_replace_common( |
2392 | 0 | return_value, |
2393 | 0 | regex_ht, regex_str, |
2394 | 0 | replace_ht, replace_str, |
2395 | 0 | subject_ht, subject_str, |
2396 | 0 | /* limit */ -1, /* zcount */ NULL, /* is_filter */ false); |
2397 | |
|
2398 | 0 | flf_clean:; |
2399 | 0 | Z_FLF_PARAM_FREE_STR(1, regex_tmp); |
2400 | 0 | Z_FLF_PARAM_FREE_STR(2, replace_tmp); |
2401 | 0 | Z_FLF_PARAM_FREE_STR(3, subject_tmp); |
2402 | 0 | } |
2403 | | |
2404 | | /* {{{ Perform Perl-style regular expression replacement using replacement callback. */ |
2405 | | PHP_FUNCTION(preg_replace_callback) |
2406 | 178 | { |
2407 | 178 | zval *zcount = NULL; |
2408 | 178 | zend_string *regex_str; |
2409 | 178 | HashTable *regex_ht; |
2410 | 178 | zend_string *subject_str; |
2411 | 178 | HashTable *subject_ht; |
2412 | 178 | zend_long limit = -1, flags = 0; |
2413 | 178 | size_t replace_count; |
2414 | 178 | zend_fcall_info fci = empty_fcall_info; |
2415 | 178 | zend_fcall_info_cache fcc = empty_fcall_info_cache; |
2416 | | |
2417 | | /* Get function parameters and do error-checking. */ |
2418 | 533 | ZEND_PARSE_PARAMETERS_START(3, 6) |
2419 | 885 | Z_PARAM_ARRAY_HT_OR_STR(regex_ht, regex_str) |
2420 | 885 | Z_PARAM_FUNC(fci, fcc) |
2421 | 1.05k | Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str) |
2422 | 1.05k | Z_PARAM_OPTIONAL |
2423 | 1.05k | Z_PARAM_LONG(limit) |
2424 | 0 | Z_PARAM_ZVAL(zcount) |
2425 | 0 | Z_PARAM_LONG(flags) |
2426 | 178 | ZEND_PARSE_PARAMETERS_END(); |
2427 | | |
2428 | 175 | replace_count = php_preg_replace_func_impl(return_value, regex_str, regex_ht, |
2429 | 175 | &fci, &fcc, |
2430 | 175 | subject_str, subject_ht, limit, flags); |
2431 | 175 | if (zcount) { |
2432 | 0 | ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count); |
2433 | 0 | } |
2434 | 175 | } |
2435 | | /* }}} */ |
2436 | | |
2437 | | /* {{{ Perform Perl-style regular expression replacement using replacement callback. */ |
2438 | | PHP_FUNCTION(preg_replace_callback_array) |
2439 | 0 | { |
2440 | 0 | zval *replace, *zcount = NULL; |
2441 | 0 | HashTable *pattern, *subject_ht; |
2442 | 0 | zend_string *subject_str, *str_idx_regex; |
2443 | 0 | zend_long limit = -1, flags = 0; |
2444 | 0 | size_t replace_count = 0; |
2445 | | |
2446 | | /* Get function parameters and do error-checking. */ |
2447 | 0 | ZEND_PARSE_PARAMETERS_START(2, 5) |
2448 | 0 | Z_PARAM_ARRAY_HT(pattern) |
2449 | 0 | Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str) |
2450 | 0 | Z_PARAM_OPTIONAL |
2451 | 0 | Z_PARAM_LONG(limit) |
2452 | 0 | Z_PARAM_ZVAL(zcount) |
2453 | 0 | Z_PARAM_LONG(flags) |
2454 | 0 | ZEND_PARSE_PARAMETERS_END(); |
2455 | | |
2456 | 0 | if (subject_ht) { |
2457 | 0 | GC_TRY_ADDREF(subject_ht); |
2458 | 0 | } else { |
2459 | 0 | GC_TRY_ADDREF(subject_str); |
2460 | 0 | } |
2461 | |
|
2462 | 0 | ZEND_HASH_FOREACH_STR_KEY_VAL(pattern, str_idx_regex, replace) { |
2463 | 0 | if (!str_idx_regex) { |
2464 | 0 | zend_argument_type_error(1, "must contain only string patterns as keys"); |
2465 | 0 | goto error; |
2466 | 0 | } |
2467 | | |
2468 | 0 | zend_fcall_info_cache fcc = empty_fcall_info_cache; |
2469 | 0 | zend_fcall_info fci = empty_fcall_info; |
2470 | 0 | fci.size = sizeof(zend_fcall_info); |
2471 | | /* Copy potential trampoline */ |
2472 | 0 | ZVAL_COPY_VALUE(&fci.function_name, replace); |
2473 | |
|
2474 | 0 | if (!zend_is_callable_ex(replace, NULL, 0, NULL, &fcc, NULL)) { |
2475 | 0 | zend_argument_type_error(1, "must contain only valid callbacks"); |
2476 | 0 | goto error; |
2477 | 0 | } |
2478 | | |
2479 | 0 | zval retval; |
2480 | 0 | replace_count += php_preg_replace_func_impl(&retval, str_idx_regex, /* regex_ht */ NULL, &fci, &fcc, |
2481 | 0 | subject_str, subject_ht, limit, flags); |
2482 | 0 | zend_release_fcall_info_cache(&fcc); |
2483 | |
|
2484 | 0 | switch (Z_TYPE(retval)) { |
2485 | 0 | case IS_ARRAY: |
2486 | 0 | ZEND_ASSERT(subject_ht); |
2487 | 0 | zend_array_release(subject_ht); |
2488 | 0 | subject_ht = Z_ARR(retval); |
2489 | 0 | break; |
2490 | 0 | case IS_STRING: |
2491 | 0 | ZEND_ASSERT(subject_str); |
2492 | 0 | zend_string_release(subject_str); |
2493 | 0 | subject_str = Z_STR(retval); |
2494 | 0 | break; |
2495 | 0 | case IS_NULL: |
2496 | 0 | RETVAL_NULL(); |
2497 | 0 | goto error; |
2498 | 0 | EMPTY_SWITCH_DEFAULT_CASE() |
2499 | 0 | } |
2500 | | |
2501 | 0 | if (EG(exception)) { |
2502 | 0 | goto error; |
2503 | 0 | } |
2504 | 0 | } ZEND_HASH_FOREACH_END(); |
2505 | | |
2506 | 0 | if (zcount) { |
2507 | 0 | ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count); |
2508 | 0 | } |
2509 | |
|
2510 | 0 | if (subject_ht) { |
2511 | 0 | RETVAL_ARR(subject_ht); |
2512 | | // Unset the type_flags of immutable arrays to prevent the VM from performing refcounting |
2513 | 0 | if (GC_FLAGS(subject_ht) & IS_ARRAY_IMMUTABLE) { |
2514 | 0 | Z_TYPE_FLAGS_P(return_value) = 0; |
2515 | 0 | } |
2516 | 0 | return; |
2517 | 0 | } else { |
2518 | 0 | RETURN_STR(subject_str); |
2519 | 0 | } |
2520 | | |
2521 | 0 | error: |
2522 | 0 | if (subject_ht) { |
2523 | 0 | zend_array_release(subject_ht); |
2524 | 0 | } else { |
2525 | 0 | zend_string_release(subject_str); |
2526 | 0 | } |
2527 | 0 | } |
2528 | | /* }}} */ |
2529 | | |
2530 | | /* {{{ Perform Perl-style regular expression replacement and only return matches. */ |
2531 | | PHP_FUNCTION(preg_filter) |
2532 | 0 | { |
2533 | 0 | preg_replace_common(INTERNAL_FUNCTION_PARAM_PASSTHRU, true); |
2534 | 0 | } |
2535 | | /* }}} */ |
2536 | | |
2537 | | /* {{{ Split string into an array using a perl-style regular expression as a delimiter */ |
2538 | | PHP_FUNCTION(preg_split) |
2539 | 0 | { |
2540 | 0 | zend_string *regex; /* Regular expression */ |
2541 | 0 | zend_string *subject; /* String to match against */ |
2542 | 0 | zend_long limit_val = -1;/* Integer value of limit */ |
2543 | 0 | zend_long flags = 0; /* Match control flags */ |
2544 | 0 | pcre_cache_entry *pce; /* Compiled regular expression */ |
2545 | | |
2546 | | /* Get function parameters and do error checking */ |
2547 | 0 | ZEND_PARSE_PARAMETERS_START(2, 4) |
2548 | 0 | Z_PARAM_STR(regex) |
2549 | 0 | Z_PARAM_STR(subject) |
2550 | 0 | Z_PARAM_OPTIONAL |
2551 | 0 | Z_PARAM_LONG(limit_val) |
2552 | 0 | Z_PARAM_LONG(flags) |
2553 | 0 | ZEND_PARSE_PARAMETERS_END(); |
2554 | | |
2555 | | /* Compile regex or get it from cache. */ |
2556 | 0 | if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) { |
2557 | 0 | RETURN_FALSE; |
2558 | 0 | } |
2559 | | |
2560 | 0 | pce->refcount++; |
2561 | 0 | php_pcre_split_impl(pce, subject, return_value, limit_val, flags); |
2562 | 0 | pce->refcount--; |
2563 | 0 | } |
2564 | | /* }}} */ |
2565 | | |
2566 | | /* {{{ php_pcre_split
 *
 * Splits subject_str at every match of the compiled pattern pce and stores the
 * resulting pieces in return_value, which is initialised as an array here.
 *
 * limit_val: -1 and 0 both mean no limit. A limit of 1, or any negative value
 *            other than -1, skips matching entirely and yields the whole
 *            subject as the only piece. Otherwise matching stops once only one
 *            more piece may be produced, and the rest of the subject becomes
 *            the final piece.
 * flags:     PREG_SPLIT_NO_EMPTY skips empty pieces, PREG_SPLIT_DELIM_CAPTURE
 *            also appends the captured subpatterns of each delimiter match,
 *            PREG_SPLIT_OFFSET_CAPTURE stores pieces through add_offset_pair()
 *            instead of as plain strings.
 *
 * PCRE_G(error_code) is reset on entry. If it is not PHP_PCRE_NO_ERROR after
 * the match loop, the array is destroyed and return_value is set to false.
 */ |
2567 | | PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value, |
2568 | | zend_long limit_val, zend_long flags) |
2569 | 0 | { |
2570 | 0 | uint32_t options; /* Execution options */ |
2571 | 0 | int count; /* Count of matched subpatterns */ |
2572 | 0 | PCRE2_SIZE start_offset; /* Where the new search starts */ |
2573 | 0 | PCRE2_SIZE last_match_offset; /* Location of last match */ |
2574 | 0 | uint32_t no_empty; /* If NO_EMPTY flag is set */ |
2575 | 0 | uint32_t delim_capture; /* If delimiters should be captured */ |
2576 | 0 | uint32_t offset_capture; /* If offsets should be captured */ |
2577 | 0 | uint32_t num_subpats; /* Number of captured subpatterns */ |
2578 | 0 | zval tmp; /* Scratch zval for the piece being appended */ |
2579 | 0 | pcre2_match_data *match_data; |
2580 | 0 | bool old_mdata_used; /* Value of mdata_used on entry, restored before the result is finalised */ |
2581 | 0 | char *subject = ZSTR_VAL(subject_str); |
2582 | |
|
2583 | 0 | no_empty = flags & PREG_SPLIT_NO_EMPTY; |
2584 | 0 | delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE; |
2585 | 0 | offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE; |
2586 | | |
2587 | | /* Initialize return value */ |
2588 | 0 | array_init(return_value); |
2589 | 0 | HashTable *return_value_ht = Z_ARRVAL_P(return_value); |
2590 | | |
2591 | | /* Number of capture slots: one per capture group plus one for the whole match. */ |
2592 | 0 | num_subpats = pce->capture_count + 1; |
2593 | | |
2594 | | /* Start at the beginning of the string */ |
2595 | 0 | start_offset = 0; |
2596 | 0 | last_match_offset = 0; |
2597 | 0 | PCRE_G(error_code) = PHP_PCRE_NO_ERROR; |
2598 | |
|
2599 | 0 | if (limit_val == -1) { /* -1 means no limit */ |
2600 | | /* pass */ |
2601 | 0 | } else if (limit_val == 0) { /* 0 is normalised to the no-limit value */ |
2602 | 0 | limit_val = -1; |
2603 | 0 | } else if (limit_val <= 1) { /* 1 or a negative value other than -1: no matching, whole subject is the only piece */ |
2604 | 0 | goto last; |
2605 | 0 | } |
2606 | | |
2607 | 0 | old_mdata_used = mdata_used; |
2608 | 0 | if (!old_mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) { /* shared mdata is free and large enough: use it */ |
2609 | 0 | mdata_used = true; |
2610 | 0 | match_data = mdata; |
2611 | 0 | } else { /* otherwise create match data sized for this pattern */ |
2612 | 0 | match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm)); |
2613 | 0 | if (!match_data) { |
2614 | 0 | PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR; |
2615 | 0 | zval_ptr_dtor(return_value); |
2616 | 0 | RETURN_FALSE; |
2617 | 0 | } |
2618 | 0 | } |
2619 | | |
2620 | 0 | options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK; /* the UTF check is skipped unless the pattern was compiled with PCRE2_UTF */ |
2621 | | |
2622 | | /* Array of subpattern offsets */ |
2623 | 0 | PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data); |
2624 | |
|
2625 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
2626 | | if ((pce->preg_options & PREG_JIT) && options) { /* JIT is used for the first match only when no UTF check is requested */ |
2627 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset, |
2628 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
2629 | | } else |
2630 | | #endif |
2631 | 0 | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset, |
2632 | 0 | options, match_data, mctx); |
2633 | |
|
2634 | 0 | while (1) { |
2635 | | /* If something matched */ |
2636 | 0 | if (count >= 0) { |
2637 | | /* Check for too many substrings condition. */ |
2638 | 0 | if (UNEXPECTED(count == 0)) { |
2639 | 0 | php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings"); |
2640 | 0 | count = num_subpats; /* fall back to the full slot count of the pattern */ |
2641 | 0 | } |
2642 | |
|
2643 | 0 | matched: |
2644 | 0 | if (UNEXPECTED(offsets[1] < offsets[0])) { /* match end lies before match start: report an internal error */ |
2645 | 0 | PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR; |
2646 | 0 | break; |
2647 | 0 | } |
2648 | | |
2649 | 0 | if (!no_empty || offsets[0] != last_match_offset) { /* piece between the previous match end and this match start */ |
2650 | 0 | if (offset_capture) { |
2651 | | /* Add (match, offset) pair to the return value */ |
2652 | 0 | add_offset_pair( |
2653 | 0 | return_value_ht, subject, last_match_offset, offsets[0], |
2654 | 0 | NULL, 0); |
2655 | 0 | } else { |
2656 | | /* Add the piece to the return value */ |
2657 | 0 | populate_match_value_str(&tmp, subject, last_match_offset, offsets[0]); |
2658 | 0 | zend_hash_next_index_insert_new(return_value_ht, &tmp); |
2659 | 0 | } |
2660 | | |
2661 | | /* One less left to do */ |
2662 | 0 | if (limit_val != -1) |
2663 | 0 | limit_val--; |
2664 | 0 | } |
2665 | |
|
2666 | 0 | if (delim_capture) { |
2667 | 0 | size_t i; |
2668 | 0 | for (i = 1; i < count; i++) { /* starts at 1: slot 0 holds the whole delimiter match */ |
2669 | | /* If we have matched a delimiter */ |
2670 | 0 | if (!no_empty || offsets[2*i] != offsets[2*i+1]) { |
2671 | 0 | if (offset_capture) { |
2672 | 0 | add_offset_pair( |
2673 | 0 | return_value_ht, subject, offsets[2*i], offsets[2*i+1], NULL, 0); |
2674 | 0 | } else { |
2675 | 0 | populate_match_value_str(&tmp, subject, offsets[2*i], offsets[2*i+1]); |
2676 | 0 | zend_hash_next_index_insert_new(return_value_ht, &tmp); |
2677 | 0 | } |
2678 | 0 | } |
2679 | 0 | } |
2680 | 0 | } |
2681 | | |
2682 | | /* Advance to the position right after the last full match */ |
2683 | 0 | start_offset = last_match_offset = offsets[1]; |
2684 | | |
2685 | | /* If we have matched an empty string, mimic what Perl's /g options does. |
2686 | | This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try |
2687 | | the match again at the same point. If this fails (picked up above) we |
2688 | | advance to the next character. */ |
2689 | 0 | if (start_offset == offsets[0]) { |
2690 | | /* Get next piece if no limit or limit not yet reached and something matched*/ |
2691 | 0 | if (limit_val != -1 && limit_val <= 1) { |
2692 | 0 | break; |
2693 | 0 | } |
2694 | 0 | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset, |
2695 | 0 | PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx); |
2696 | 0 | if (count >= 0) { |
2697 | 0 | goto matched; |
2698 | 0 | } else if (count == PCRE2_ERROR_NOMATCH) { |
2699 | | /* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match, |
2700 | | this is not necessarily the end. We need to advance |
2701 | | the start offset, and continue. Fudge the offset values |
2702 | | to achieve this, unless we're already at the end of the string. */ |
2703 | 0 | if (start_offset < ZSTR_LEN(subject_str)) { |
2704 | 0 | start_offset += calculate_unit_length(pce, subject + start_offset); |
2705 | 0 | } else { |
2706 | 0 | break; |
2707 | 0 | } |
2708 | 0 | } else { |
2709 | 0 | goto error; |
2710 | 0 | } |
2711 | 0 | } |
2712 | |
|
2713 | 0 | } else if (count == PCRE2_ERROR_NOMATCH) { |
2714 | 0 | break; |
2715 | 0 | } else { |
2716 | 0 | error: |
2717 | 0 | pcre_handle_exec_error(count); |
2718 | 0 | break; |
2719 | 0 | } |
2720 | | |
2721 | | /* Get next piece if no limit or limit not yet reached and something matched*/ |
2722 | 0 | if (limit_val != -1 && limit_val <= 1) { |
2723 | 0 | break; |
2724 | 0 | } |
2725 | | |
2726 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
2727 | | if (pce->preg_options & PREG_JIT) { /* follow-up matches always pass PCRE2_NO_UTF_CHECK; presumably the first call already validated the subject */ |
2728 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset, |
2729 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
2730 | | } else |
2731 | | #endif |
2732 | 0 | count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset, |
2733 | 0 | PCRE2_NO_UTF_CHECK, match_data, mctx); |
2734 | 0 | } |
2735 | 0 | if (match_data != mdata) { /* only match data created in this call is freed; the shared mdata is kept */ |
2736 | 0 | pcre2_match_data_free(match_data); |
2737 | 0 | } |
2738 | 0 | mdata_used = old_mdata_used; |
2739 | |
|
2740 | 0 | if (PCRE_G(error_code) != PHP_PCRE_NO_ERROR) { /* any recorded error discards the partial result */ |
2741 | 0 | zval_ptr_dtor(return_value); |
2742 | 0 | RETURN_FALSE; |
2743 | 0 | } |
2744 | | |
2745 | 0 | last: |
2746 | 0 | start_offset = last_match_offset; /* the offset might have been incremented, but without further successful matches */ |
2747 | |
|
2748 | 0 | if (!no_empty || start_offset < ZSTR_LEN(subject_str)) { |
2749 | 0 | if (offset_capture) { |
2750 | | /* Add the last (match, offset) pair to the return value */ |
2751 | 0 | add_offset_pair(return_value_ht, subject, start_offset, ZSTR_LEN(subject_str), NULL, 0); |
2752 | 0 | } else { |
2753 | | /* Add the last piece to the return value */ |
2754 | 0 | if (start_offset == 0) { /* final piece is the whole subject: share the string instead of copying it */ |
2755 | 0 | ZVAL_STR_COPY(&tmp, subject_str); |
2756 | 0 | } else { |
2757 | 0 | populate_match_value_str(&tmp, subject, start_offset, ZSTR_LEN(subject_str)); |
2758 | 0 | } |
2759 | 0 | zend_hash_next_index_insert_new(return_value_ht, &tmp); |
2760 | 0 | } |
2761 | 0 | } |
2762 | 0 | } |
2763 | | /* }}} */ |
2764 | | |
2765 | | /* {{{ Quote regular expression characters plus an optional character */ |
2766 | | PHP_FUNCTION(preg_quote) |
2767 | 42 | { |
2768 | 42 | zend_string *str; /* Input string argument */ |
2769 | 42 | zend_string *delim = NULL; /* Additional delimiter argument */ |
2770 | 42 | char *in_str; /* Input string */ |
2771 | 42 | char *in_str_end; /* End of the input string */ |
2772 | 42 | zend_string *out_str; /* Output string with quoted characters */ |
2773 | 42 | size_t extra_len; /* Number of additional characters */ |
2774 | 42 | char *p, /* Iterator for input string */ |
2775 | 42 | *q, /* Iterator for output string */ |
2776 | 42 | delim_char = '\0', /* Delimiter character to be quoted */ |
2777 | 42 | c; /* Current character */ |
2778 | | |
2779 | | /* Get the arguments and check for errors */ |
2780 | 126 | ZEND_PARSE_PARAMETERS_START(1, 2) |
2781 | 168 | Z_PARAM_STR(str) |
2782 | 42 | Z_PARAM_OPTIONAL |
2783 | 84 | Z_PARAM_STR_OR_NULL(delim) |
2784 | 42 | ZEND_PARSE_PARAMETERS_END(); |
2785 | | |
2786 | | /* Nothing to do if we got an empty string */ |
2787 | 42 | if (ZSTR_LEN(str) == 0) { |
2788 | 0 | RETURN_EMPTY_STRING(); |
2789 | 0 | } |
2790 | | |
2791 | 42 | in_str = ZSTR_VAL(str); |
2792 | 42 | in_str_end = in_str + ZSTR_LEN(str); |
2793 | | |
2794 | 42 | if (delim) { |
2795 | 0 | delim_char = ZSTR_VAL(delim)[0]; |
2796 | 0 | } |
2797 | | |
2798 | | /* Go through the string and quote necessary characters */ |
2799 | 42 | extra_len = 0; |
2800 | 42 | p = in_str; |
2801 | 57.1k | do { |
2802 | 57.1k | c = *p; |
2803 | 57.1k | switch(c) { |
2804 | 733 | case '.': |
2805 | 913 | case '\\': |
2806 | 1.35k | case '+': |
2807 | 1.36k | case '*': |
2808 | 1.46k | case '?': |
2809 | 1.59k | case '[': |
2810 | 1.66k | case '^': |
2811 | 1.77k | case ']': |
2812 | 1.77k | case '$': |
2813 | 1.90k | case '(': |
2814 | 2.38k | case ')': |
2815 | 2.46k | case '{': |
2816 | 2.87k | case '}': |
2817 | 3.37k | case '=': |
2818 | 3.37k | case '!': |
2819 | 3.64k | case '>': |
2820 | 3.69k | case '<': |
2821 | 3.74k | case '|': |
2822 | 4.21k | case ':': |
2823 | 4.47k | case '-': |
2824 | 4.89k | case '#': |
2825 | 4.89k | extra_len++; |
2826 | 4.89k | break; |
2827 | | |
2828 | 2.60k | case '\0': |
2829 | 2.60k | extra_len+=3; |
2830 | 2.60k | break; |
2831 | | |
2832 | 49.6k | default: |
2833 | 49.6k | if (c == delim_char) { |
2834 | 0 | extra_len++; |
2835 | 0 | } |
2836 | 49.6k | break; |
2837 | 57.1k | } |
2838 | 57.1k | p++; |
2839 | 57.1k | } while (p != in_str_end); |
2840 | | |
2841 | 42 | if (extra_len == 0) { |
2842 | 1 | RETURN_STR_COPY(str); |
2843 | 1 | } |
2844 | | |
2845 | | /* Allocate enough memory so that even if each character |
2846 | | is quoted, we won't run out of room */ |
2847 | 41 | out_str = zend_string_safe_alloc(1, ZSTR_LEN(str), extra_len, 0); |
2848 | 41 | q = ZSTR_VAL(out_str); |
2849 | 41 | p = in_str; |
2850 | | |
2851 | 57.1k | do { |
2852 | 57.1k | c = *p; |
2853 | 57.1k | switch(c) { |
2854 | 733 | case '.': |
2855 | 913 | case '\\': |
2856 | 1.35k | case '+': |
2857 | 1.36k | case '*': |
2858 | 1.46k | case '?': |
2859 | 1.59k | case '[': |
2860 | 1.66k | case '^': |
2861 | 1.77k | case ']': |
2862 | 1.77k | case '$': |
2863 | 1.90k | case '(': |
2864 | 2.38k | case ')': |
2865 | 2.46k | case '{': |
2866 | 2.87k | case '}': |
2867 | 3.37k | case '=': |
2868 | 3.37k | case '!': |
2869 | 3.64k | case '>': |
2870 | 3.69k | case '<': |
2871 | 3.74k | case '|': |
2872 | 4.21k | case ':': |
2873 | 4.47k | case '-': |
2874 | 4.89k | case '#': |
2875 | 4.89k | *q++ = '\\'; |
2876 | 4.89k | *q++ = c; |
2877 | 4.89k | break; |
2878 | | |
2879 | 2.60k | case '\0': |
2880 | 2.60k | *q++ = '\\'; |
2881 | 2.60k | *q++ = '0'; |
2882 | 2.60k | *q++ = '0'; |
2883 | 2.60k | *q++ = '0'; |
2884 | 2.60k | break; |
2885 | | |
2886 | 49.6k | default: |
2887 | 49.6k | if (c == delim_char) { |
2888 | 0 | *q++ = '\\'; |
2889 | 0 | } |
2890 | 49.6k | *q++ = c; |
2891 | 49.6k | break; |
2892 | 57.1k | } |
2893 | 57.1k | p++; |
2894 | 57.1k | } while (p != in_str_end); |
2895 | 41 | *q = '\0'; |
2896 | | |
2897 | 41 | RETURN_NEW_STR(out_str); |
2898 | 41 | } |
2899 | | /* }}} */ |
2900 | | |
2901 | | /* {{{ Searches array and returns entries which match regex */ |
2902 | | PHP_FUNCTION(preg_grep) |
2903 | 0 | { |
2904 | 0 | zend_string *regex; /* Regular expression */ |
2905 | 0 | zval *input; /* Input array */ |
2906 | 0 | zend_long flags = 0; /* Match control flags */ |
2907 | 0 | pcre_cache_entry *pce; /* Compiled regular expression */ |
2908 | | |
2909 | | /* Get arguments and do error checking */ |
2910 | 0 | ZEND_PARSE_PARAMETERS_START(2, 3) |
2911 | 0 | Z_PARAM_STR(regex) |
2912 | 0 | Z_PARAM_ARRAY(input) |
2913 | 0 | Z_PARAM_OPTIONAL |
2914 | 0 | Z_PARAM_LONG(flags) |
2915 | 0 | ZEND_PARSE_PARAMETERS_END(); |
2916 | | |
2917 | | /* Compile regex or get it from cache. */ |
2918 | 0 | if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) { |
2919 | 0 | RETURN_FALSE; |
2920 | 0 | } |
2921 | | |
2922 | 0 | pce->refcount++; |
2923 | 0 | php_pcre_grep_impl(pce, input, return_value, flags); |
2924 | 0 | pce->refcount--; |
2925 | 0 | } |
2926 | | /* }}} */ |
2927 | | |
2928 | | PHPAPI void php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, zend_long flags) /* {{{ */ |
2929 | 0 | { |
2930 | 0 | zval *entry; /* An entry in the input array */ |
2931 | 0 | uint32_t num_subpats; /* Number of captured subpatterns */ |
2932 | 0 | int count; /* Count of matched subpatterns */ |
2933 | 0 | uint32_t options; /* Execution options */ |
2934 | 0 | zend_string *string_key; |
2935 | 0 | zend_ulong num_key; |
2936 | 0 | bool invert; /* Whether to return non-matching |
2937 | | entries */ |
2938 | 0 | bool old_mdata_used; |
2939 | 0 | pcre2_match_data *match_data; |
2940 | 0 | invert = flags & PREG_GREP_INVERT ? 1 : 0; |
2941 | | |
2942 | | /* Calculate the size of the offsets array, and allocate memory for it. */ |
2943 | 0 | num_subpats = pce->capture_count + 1; |
2944 | | |
2945 | | /* Initialize return array */ |
2946 | 0 | array_init(return_value); |
2947 | 0 | HashTable *return_value_ht = Z_ARRVAL_P(return_value); |
2948 | |
|
2949 | 0 | PCRE_G(error_code) = PHP_PCRE_NO_ERROR; |
2950 | |
|
2951 | 0 | old_mdata_used = mdata_used; |
2952 | 0 | if (!old_mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) { |
2953 | 0 | mdata_used = true; |
2954 | 0 | match_data = mdata; |
2955 | 0 | } else { |
2956 | 0 | match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm)); |
2957 | 0 | if (!match_data) { |
2958 | 0 | PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR; |
2959 | 0 | return; |
2960 | 0 | } |
2961 | 0 | } |
2962 | | |
2963 | 0 | options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK; |
2964 | | |
2965 | | /* Go through the input array */ |
2966 | 0 | ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(input), num_key, string_key, entry) { |
2967 | 0 | zend_string *tmp_subject_str; |
2968 | 0 | zend_string *subject_str = zval_get_tmp_string(entry, &tmp_subject_str); |
2969 | | |
2970 | | /* Perform the match */ |
2971 | | #ifdef HAVE_PCRE_JIT_SUPPORT |
2972 | | if ((pce->preg_options & PREG_JIT) && options) { |
2973 | | count = pcre2_jit_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), 0, |
2974 | | PCRE2_NO_UTF_CHECK, match_data, mctx); |
2975 | | } else |
2976 | | #endif |
2977 | 0 | count = pcre2_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), 0, |
2978 | 0 | options, match_data, mctx); |
2979 | | |
2980 | | /* If the entry fits our requirements */ |
2981 | 0 | if (count >= 0) { |
2982 | | /* Check for too many substrings condition. */ |
2983 | 0 | if (UNEXPECTED(count == 0)) { |
2984 | 0 | php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings"); |
2985 | 0 | } |
2986 | 0 | if (!invert) { |
2987 | 0 | Z_TRY_ADDREF_P(entry); |
2988 | | |
2989 | | /* Add to return array */ |
2990 | 0 | if (string_key) { |
2991 | 0 | zend_hash_update(return_value_ht, string_key, entry); |
2992 | 0 | } else { |
2993 | 0 | zend_hash_index_update(return_value_ht, num_key, entry); |
2994 | 0 | } |
2995 | 0 | } |
2996 | 0 | } else if (count == PCRE2_ERROR_NOMATCH) { |
2997 | 0 | if (invert) { |
2998 | 0 | Z_TRY_ADDREF_P(entry); |
2999 | | |
3000 | | /* Add to return array */ |
3001 | 0 | if (string_key) { |
3002 | 0 | zend_hash_update(return_value_ht, string_key, entry); |
3003 | 0 | } else { |
3004 | 0 | zend_hash_index_update(return_value_ht, num_key, entry); |
3005 | 0 | } |
3006 | 0 | } |
3007 | 0 | } else { |
3008 | 0 | pcre_handle_exec_error(count); |
3009 | 0 | zend_tmp_string_release(tmp_subject_str); |
3010 | 0 | break; |
3011 | 0 | } |
3012 | | |
3013 | 0 | zend_tmp_string_release(tmp_subject_str); |
3014 | 0 | } ZEND_HASH_FOREACH_END(); |
3015 | 0 | if (match_data != mdata) { |
3016 | 0 | pcre2_match_data_free(match_data); |
3017 | 0 | } |
3018 | |
|
3019 | 0 | mdata_used = old_mdata_used; |
3020 | |
|
3021 | 0 | if (PCRE_G(error_code) != PHP_PCRE_NO_ERROR) { |
3022 | 0 | zend_array_destroy(Z_ARR_P(return_value)); |
3023 | 0 | RETURN_FALSE; |
3024 | 0 | } |
3025 | 0 | } |
3026 | | /* }}} */ |
3027 | | |
3028 | | /* {{{ Returns the error code of the last regexp execution. */ |
3029 | | PHP_FUNCTION(preg_last_error) |
3030 | 0 | { |
3031 | 0 | ZEND_PARSE_PARAMETERS_NONE(); |
3032 | | |
3033 | 0 | RETURN_LONG(PCRE_G(error_code)); |
3034 | 0 | } |
3035 | | /* }}} */ |
3036 | | |
3037 | | /* {{{ Returns the error message of the last regexp execution. */ |
3038 | | PHP_FUNCTION(preg_last_error_msg) |
3039 | 0 | { |
3040 | 0 | ZEND_PARSE_PARAMETERS_NONE(); |
3041 | | |
3042 | 0 | RETURN_STRING(php_pcre_get_error_msg(PCRE_G(error_code))); |
3043 | 0 | } |
3044 | | /* }}} */ |
3045 | | |
3046 | | /* {{{ module definition structures
 *
 * Module entry that registers the pcre extension. The initializer is
 * positional, so the order of the entries below must not be changed.
 * NOTE(review): the per-field labels follow the zend_module_entry layout as
 * documented for the Zend API - confirm against zend_modules.h.
 */ |
3047 | | |
3048 | | zend_module_entry pcre_module_entry = { |
3049 | | STANDARD_MODULE_HEADER, |
3050 | | "pcre", /* extension name */ |
3051 | | ext_functions, /* function table, defined outside this section */ |
3052 | | PHP_MINIT(pcre), /* module startup */ |
3053 | | PHP_MSHUTDOWN(pcre), /* module shutdown */ |
3054 | | PHP_RINIT(pcre), /* request startup */ |
3055 | | PHP_RSHUTDOWN(pcre), /* request shutdown */ |
3056 | | PHP_MINFO(pcre), /* module info output */ |
3057 | | PHP_PCRE_VERSION, /* extension version */ |
3058 | | PHP_MODULE_GLOBALS(pcre), /* module globals */ |
3059 | | PHP_GINIT(pcre), /* globals constructor */ |
3060 | | PHP_GSHUTDOWN(pcre), /* globals destructor */ |
3061 | | NULL, /* presumably the post-deactivate handler slot, unused here */ |
3062 | | STANDARD_MODULE_PROPERTIES_EX |
3063 | | }; |
3064 | | |
3065 | | #ifdef COMPILE_DL_PCRE |
3066 | | ZEND_GET_MODULE(pcre) /* only emitted when COMPILE_DL_PCRE is defined; presumably the shared-extension build */ |
3067 | | #endif |
3068 | | |
3069 | | /* }}} */ |
3070 | | |
3071 | | PHPAPI pcre2_match_context *php_pcre_mctx(void) |
3072 | 9 | {/*{{{*/ |
3073 | 9 | return mctx; |
3074 | 9 | }/*}}}*/ |
3075 | | |
3076 | | PHPAPI pcre2_general_context *php_pcre_gctx(void) |
3077 | 0 | {/*{{{*/ |
3078 | 0 | return gctx; |
3079 | 0 | }/*}}}*/ |
3080 | | |
3081 | | PHPAPI pcre2_compile_context *php_pcre_cctx(void) |
3082 | 0 | {/*{{{*/ |
3083 | 0 | return cctx; |
3084 | 0 | }/*}}}*/ |
3085 | | |
3086 | | PHPAPI void php_pcre_pce_incref(pcre_cache_entry *pce) |
3087 | 0 | {/*{{{*/ |
3088 | 0 | assert(NULL != pce); |
3089 | 0 | pce->refcount++; |
3090 | 0 | }/*}}}*/ |
3091 | | |
3092 | | PHPAPI void php_pcre_pce_decref(pcre_cache_entry *pce) |
3093 | 0 | {/*{{{*/ |
3094 | 0 | assert(NULL != pce); |
3095 | 0 | assert(0 != pce->refcount); |
3096 | 0 | pce->refcount--; |
3097 | 0 | }/*}}}*/ |
3098 | | |
3099 | | PHPAPI pcre2_code *php_pcre_pce_re(pcre_cache_entry *pce) |
3100 | 0 | {/*{{{*/ |
3101 | 0 | assert(NULL != pce); |
3102 | 0 | return pce->re; |
3103 | 0 | }/*}}}*/ |