/src/fluent-bit/lib/onigmo/regparse.c
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | regparse.c - Onigmo (Oniguruma-mod) (regular expression library) |
3 | | **********************************************************************/ |
4 | | /*- |
5 | | * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> |
6 | | * Copyright (c) 2011-2019 K.Takata <kentkt AT csc DOT jp> |
7 | | * All rights reserved. |
8 | | * |
9 | | * Redistribution and use in source and binary forms, with or without |
10 | | * modification, are permitted provided that the following conditions |
11 | | * are met: |
12 | | * 1. Redistributions of source code must retain the above copyright |
13 | | * notice, this list of conditions and the following disclaimer. |
14 | | * 2. Redistributions in binary form must reproduce the above copyright |
15 | | * notice, this list of conditions and the following disclaimer in the |
16 | | * documentation and/or other materials provided with the distribution. |
17 | | * |
18 | | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
19 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
20 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
21 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
22 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
23 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
24 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
25 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
26 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
27 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
28 | | * SUCH DAMAGE. |
29 | | */ |
30 | | |
31 | | #include "regparse.h" |
32 | | #include <stdarg.h> |
33 | | |
34 | 5.70k | #define WARN_BUFSIZE 256 |
35 | | |
36 | | #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS |
37 | | |
38 | | |
39 | | const OnigSyntaxType OnigSyntaxRuby = { |
40 | | (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | |
41 | | ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | |
42 | | ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS | |
43 | | ONIG_SYN_OP_ESC_C_CONTROL ) |
44 | | & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) |
45 | | , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT | |
46 | | ONIG_SYN_OP2_OPTION_RUBY | |
47 | | ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF | |
48 | | ONIG_SYN_OP2_ESC_G_SUBEXP_CALL | |
49 | | ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY | |
50 | | ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | |
51 | | ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | |
52 | | ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | |
53 | | ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB | |
54 | | ONIG_SYN_OP2_ESC_H_XDIGIT | |
55 | | #ifndef RUBY |
56 | | ONIG_SYN_OP2_ESC_U_HEX4 | |
57 | | #endif |
58 | | ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER | |
59 | | ONIG_SYN_OP2_QMARK_LPAREN_CONDITION | |
60 | | ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK | |
61 | | ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP | |
62 | | ONIG_SYN_OP2_QMARK_TILDE_ABSENT ) |
63 | | , ( SYN_GNU_REGEX_BV | |
64 | | ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | |
65 | | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | |
66 | | ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | |
67 | | ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | |
68 | | ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | |
69 | | ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | |
70 | | ONIG_SYN_WARN_CC_DUP | |
71 | | ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) |
72 | | , ( ONIG_OPTION_ASCII_RANGE | ONIG_OPTION_POSIX_BRACKET_ALL_RANGE | |
73 | | ONIG_OPTION_WORD_BOUND_ALL_RANGE ) |
74 | | , |
75 | | { |
76 | | (OnigCodePoint )'\\' /* esc */ |
77 | | , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ |
78 | | , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ |
79 | | , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ |
80 | | , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ |
81 | | , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ |
82 | | } |
83 | | }; |
84 | | |
85 | | const OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY; |
86 | | |
87 | 5.70k | extern void onig_null_warn(const char* s ARG_UNUSED) { } |
88 | | |
89 | | #ifdef DEFAULT_WARN_FUNCTION |
90 | | static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION; |
91 | | #else |
92 | | static OnigWarnFunc onig_warn = onig_null_warn; |
93 | | #endif |
94 | | |
95 | | #ifdef DEFAULT_VERB_WARN_FUNCTION |
96 | | static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION; |
97 | | #else |
98 | | static OnigWarnFunc onig_verb_warn = onig_null_warn; |
99 | | #endif |
100 | | |
101 | | extern void onig_set_warn_func(OnigWarnFunc f) |
102 | 0 | { |
103 | 0 | onig_warn = f; |
104 | 0 | } |
105 | | |
106 | | extern void onig_set_verb_warn_func(OnigWarnFunc f) |
107 | 0 | { |
108 | 0 | onig_verb_warn = f; |
109 | 0 | } |
110 | | |
111 | | static void CC_DUP_WARN(ScanEnv *env, OnigCodePoint from, OnigCodePoint to); |
112 | | |
113 | | |
114 | | static unsigned int ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT; |
115 | | |
116 | | extern unsigned int |
117 | | onig_get_parse_depth_limit(void) |
118 | 0 | { |
119 | 0 | return ParseDepthLimit; |
120 | 0 | } |
121 | | |
122 | | extern int |
123 | | onig_set_parse_depth_limit(unsigned int depth) |
124 | 0 | { |
125 | 0 | if (depth == 0) |
126 | 0 | ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT; |
127 | 0 | else |
128 | 0 | ParseDepthLimit = depth; |
129 | 0 | return 0; |
130 | 0 | } |
131 | | |
132 | | |
133 | | static void |
134 | | bbuf_free(BBuf* bbuf) |
135 | 3.30M | { |
136 | 3.30M | if (IS_NOT_NULL(bbuf)) { |
137 | 2.83M | if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p); |
138 | 2.83M | xfree(bbuf); |
139 | 2.83M | } |
140 | 3.30M | } |
141 | | |
142 | | static int |
143 | | bbuf_clone(BBuf** rto, BBuf* from) |
144 | 2.37k | { |
145 | 2.37k | int r; |
146 | 2.37k | BBuf *to; |
147 | | |
148 | 2.37k | *rto = to = (BBuf* )xmalloc(sizeof(BBuf)); |
149 | 2.37k | CHECK_NULL_RETURN_MEMERR(to); |
150 | 2.37k | r = BBUF_INIT(to, from->alloc); |
151 | 2.37k | if (r != 0) return r; |
152 | 2.37k | to->used = from->used; |
153 | 2.37k | xmemcpy(to->p, from->p, from->used); |
154 | 2.37k | return 0; |
155 | 2.37k | } |
156 | | |
/* Turn a relative back-reference number into an absolute one, counted from
 * the number of groups recorded in env so far. */
#define BACKREF_REL_TO_ABS(rel_no, env) \
  ((env)->num_mem + 1 + (rel_no))
159 | | |
/* Clear the bits f in v when `negative` is true, set them otherwise.
 * The whole expansion is parenthesized so the macro behaves as a single
 * expression wherever it is used. */
#define ONOFF(v,f,negative)    ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))
161 | | |
/* First code point of the multi-byte range: 0 when the encoding's minimum
 * character length exceeds one byte, 0x80 otherwise. */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)

/* Add the range [MBCODE_START_POS(enc), ONIG_LAST_CODE_POINT] to pbuf.
 * Expects a variable named `env` in the calling scope. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, env, MBCODE_START_POS(enc), ONIG_LAST_CODE_POINT)

/* For encodings that are not single-byte, add the whole multi-byte range to
 * mbuf.  Expects `r` and `env` in the calling scope and returns from the
 * enclosing function when the add fails. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r) return r;\
  }\
} while (0)
174 | | |
175 | | |
/* Set bit `pos` in bs, calling CC_DUP_WARN first when it is already set.
 * Expects a variable named `env` in the calling scope; `pos` is evaluated
 * more than once. */
#define BITSET_SET_BIT_CHKDUP(bs, pos) do { \
  if (BITSET_AT(bs, pos)) CC_DUP_WARN(env, pos, pos); \
  BS_ROOM(bs, pos) |= BS_BIT(pos); \
} while (0)

/* Store 1 in `empty` when every word of bs is zero, 0 otherwise. */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int i;\
  empty = 1;\
  for (i = 0; i < BITSET_SIZE; i++) {\
    if ((bs)[i] != 0) {\
      empty = 0; break;\
    }\
  }\
} while (0)
190 | | |
191 | | static void |
192 | | bitset_set_range(ScanEnv *env, BitSetRef bs, int from, int to) |
193 | 461k | { |
194 | 461k | int i; |
195 | 59.3M | for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) { |
196 | 58.9M | BITSET_SET_BIT_CHKDUP(bs, i); |
197 | 58.9M | } |
198 | 461k | } |
199 | | |
200 | | #if 0 |
201 | | static void |
202 | | bitset_set_all(BitSetRef bs) |
203 | | { |
204 | | int i; |
205 | | for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); } |
206 | | } |
207 | | #endif |
208 | | |
209 | | static void |
210 | | bitset_invert(BitSetRef bs) |
211 | 0 | { |
212 | 0 | int i; |
213 | 0 | for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~(bs[i]); } |
214 | 0 | } |
215 | | |
216 | | static void |
217 | | bitset_invert_to(BitSetRef from, BitSetRef to) |
218 | 2.71k | { |
219 | 2.71k | int i; |
220 | 24.4k | for (i = 0; i < BITSET_SIZE; i++) { to[i] = ~(from[i]); } |
221 | 2.71k | } |
222 | | |
223 | | static void |
224 | | bitset_and(BitSetRef dest, BitSetRef bs) |
225 | 460k | { |
226 | 460k | int i; |
227 | 4.14M | for (i = 0; i < BITSET_SIZE; i++) { dest[i] &= bs[i]; } |
228 | 460k | } |
229 | | |
230 | | static void |
231 | | bitset_or(BitSetRef dest, BitSetRef bs) |
232 | 465k | { |
233 | 465k | int i; |
234 | 4.18M | for (i = 0; i < BITSET_SIZE; i++) { dest[i] |= bs[i]; } |
235 | 465k | } |
236 | | |
237 | | static void |
238 | | bitset_copy(BitSetRef dest, BitSetRef bs) |
239 | 0 | { |
240 | 0 | int i; |
241 | 0 | for (i = 0; i < BITSET_SIZE; i++) { dest[i] = bs[i]; } |
242 | 0 | } |
243 | | |
244 | | #if defined(USE_NAMED_GROUP) && !defined(USE_ST_LIBRARY) |
245 | | extern int |
246 | | onig_strncmp(const UChar* s1, const UChar* s2, int n) |
247 | | { |
248 | | int x; |
249 | | |
250 | | while (n-- > 0) { |
251 | | x = *s2++ - *s1++; |
252 | | if (x) return x; |
253 | | } |
254 | | return 0; |
255 | | } |
256 | | #endif |
257 | | |
258 | | extern void |
259 | | onig_strcpy(UChar* dest, const UChar* src, const UChar* end) |
260 | 13.7M | { |
261 | 13.7M | ptrdiff_t len = end - src; |
262 | 13.7M | if (len > 0) { |
263 | 13.7M | xmemcpy(dest, src, len); |
264 | 13.7M | dest[len] = (UChar )0; |
265 | 13.7M | } |
266 | 13.7M | } |
267 | | |
268 | | #ifdef USE_NAMED_GROUP |
269 | | static UChar* |
270 | | strdup_with_null(OnigEncoding enc, UChar* s, UChar* end) |
271 | 239k | { |
272 | 239k | ptrdiff_t slen; |
273 | 239k | int term_len, i; |
274 | 239k | UChar *r; |
275 | | |
276 | 239k | slen = end - s; |
277 | 239k | term_len = ONIGENC_MBC_MINLEN(enc); |
278 | | |
279 | 239k | r = (UChar* )xmalloc(slen + term_len); |
280 | 239k | CHECK_NULL_RETURN(r); |
281 | 239k | xmemcpy(r, s, slen); |
282 | | |
283 | 479k | for (i = 0; i < term_len; i++) |
284 | 239k | r[slen + i] = (UChar )0; |
285 | | |
286 | 239k | return r; |
287 | 239k | } |
288 | | #endif |
289 | | |
/* scan pattern methods
 *
 * These macros expect the calling scope to provide `p` (current position),
 * `end` (end of pattern) and `enc` (encoding); the non-_S forms also need
 * `pfetch_prev`, which PFETCH_READY declares. */
#define PEND_VALUE   0

#ifdef __GNUC__
/* get rid of Wunused-but-set-variable and Wuninitialized */
# define PFETCH_READY  UChar* pfetch_prev = NULL; (void)pfetch_prev
#else
# define PFETCH_READY  UChar* pfetch_prev
#endif
/* 1 when the position has reached the end, 0 otherwise */
#define PEND         (p < end ?  0 : 1)
/* step back to the position remembered by the last PFETCH/PINC */
#define PUNFETCH     p = pfetch_prev
/* remember the position, then skip one character */
#define PINC       do { \
  pfetch_prev = p; \
  p += enclen(enc, p, end); \
} while (0)
/* read one character into c, remember the position, then advance */
#define PFETCH(c)  do { \
  c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \
  pfetch_prev = p; \
  p += enclen(enc, p, end); \
} while (0)

/* same as PINC / PFETCH but without recording the previous position */
#define PINC_S     do { \
  p += enclen(enc, p, end); \
} while (0)
#define PFETCH_S(c) do { \
  c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \
  p += enclen(enc, p, end); \
} while (0)

/* look at the next character without advancing; PEND_VALUE at the end */
#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )c)
321 | | |
322 | | static UChar* |
323 | | strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end, |
324 | | size_t capa) |
325 | 2.40M | { |
326 | 2.40M | UChar* r; |
327 | | |
328 | 2.40M | if (dest) |
329 | 2.40M | r = (UChar* )xrealloc(dest, capa + 1); |
330 | 0 | else |
331 | 0 | r = (UChar* )xmalloc(capa + 1); |
332 | | |
333 | 2.40M | CHECK_NULL_RETURN(r); |
334 | 2.40M | onig_strcpy(r + (dest_end - dest), src, src_end); |
335 | 2.40M | return r; |
336 | 2.40M | } |
337 | | |
338 | | /* dest on static area */ |
339 | | static UChar* |
340 | | strcat_capa_from_static(UChar* dest, UChar* dest_end, |
341 | | const UChar* src, const UChar* src_end, size_t capa) |
342 | 133k | { |
343 | 133k | UChar* r; |
344 | | |
345 | 133k | r = (UChar* )xmalloc(capa + 1); |
346 | 133k | CHECK_NULL_RETURN(r); |
347 | 133k | onig_strcpy(r, dest, dest_end); |
348 | 133k | onig_strcpy(r + (dest_end - dest), src, src_end); |
349 | 133k | return r; |
350 | 133k | } |
351 | | |
352 | | |
353 | | #ifdef USE_ST_LIBRARY |
354 | | |
355 | | # ifdef RUBY |
356 | | # include "ruby/st.h" |
357 | | # else |
358 | | # include "st.h" |
359 | | # endif |
360 | | |
361 | | typedef struct { |
362 | | const UChar* s; |
363 | | const UChar* end; |
364 | | } st_str_end_key; |
365 | | |
366 | | static int |
367 | | str_end_cmp(st_data_t xp, st_data_t yp) |
368 | 97.3k | { |
369 | 97.3k | const st_str_end_key *x, *y; |
370 | 97.3k | const UChar *p, *q; |
371 | 97.3k | int c; |
372 | | |
373 | 97.3k | x = (const st_str_end_key *)xp; |
374 | 97.3k | y = (const st_str_end_key *)yp; |
375 | 97.3k | if ((x->end - x->s) != (y->end - y->s)) |
376 | 0 | return 1; |
377 | | |
378 | 97.3k | p = x->s; |
379 | 97.3k | q = y->s; |
380 | 490k | while (p < x->end) { |
381 | 393k | c = (int )*p - (int )*q; |
382 | 393k | if (c != 0) return c; |
383 | | |
384 | 393k | p++; q++; |
385 | 393k | } |
386 | | |
387 | 97.3k | return 0; |
388 | 97.3k | } |
389 | | |
390 | | static st_index_t |
391 | | str_end_hash(st_data_t xp) |
392 | 498k | { |
393 | 498k | const st_str_end_key *x = (const st_str_end_key *)xp; |
394 | 498k | const UChar *p; |
395 | 498k | st_index_t val = 0; |
396 | | |
397 | 498k | p = x->s; |
398 | 2.64M | while (p < x->end) { |
399 | 2.14M | val = val * 997 + (int )*p++; |
400 | 2.14M | } |
401 | | |
402 | 498k | return val + (val >> 5); |
403 | 498k | } |
404 | | |
405 | | extern hash_table_type* |
406 | | onig_st_init_strend_table_with_size(st_index_t size) |
407 | 78.0k | { |
408 | 78.0k | static const struct st_hash_type hashType = { |
409 | 78.0k | str_end_cmp, |
410 | 78.0k | str_end_hash, |
411 | 78.0k | }; |
412 | | |
413 | 78.0k | return (hash_table_type* ) |
414 | 78.0k | onig_st_init_table_with_size(&hashType, size); |
415 | 78.0k | } |
416 | | |
417 | | extern int |
418 | | onig_st_lookup_strend(hash_table_type* table, const UChar* str_key, |
419 | | const UChar* end_key, hash_data_type *value) |
420 | 258k | { |
421 | 258k | st_str_end_key key; |
422 | | |
423 | 258k | key.s = (UChar* )str_key; |
424 | 258k | key.end = (UChar* )end_key; |
425 | | |
426 | 258k | return onig_st_lookup(table, (st_data_t )(&key), value); |
427 | 258k | } |
428 | | |
429 | | extern int |
430 | | onig_st_insert_strend(hash_table_type* table, const UChar* str_key, |
431 | | const UChar* end_key, hash_data_type value) |
432 | 239k | { |
433 | 239k | st_str_end_key* key; |
434 | 239k | int result; |
435 | | |
436 | 239k | key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key)); |
437 | 239k | key->s = (UChar* )str_key; |
438 | 239k | key->end = (UChar* )end_key; |
439 | 239k | result = onig_st_insert(table, (st_data_t )key, value); |
440 | 239k | if (result) { |
441 | 0 | xfree(key); |
442 | 0 | } |
443 | 239k | return result; |
444 | 239k | } |
445 | | |
446 | | #endif /* USE_ST_LIBRARY */ |
447 | | |
448 | | |
449 | | #ifdef USE_NAMED_GROUP |
450 | | |
451 | 142 | # define INIT_NAME_BACKREFS_ALLOC_NUM 8 |
452 | | |
453 | | typedef struct { |
454 | | UChar* name; |
455 | | size_t name_len; /* byte length */ |
456 | | int back_num; /* number of backrefs */ |
457 | | int back_alloc; |
458 | | int back_ref1; |
459 | | int* back_refs; |
460 | | } NameEntry; |
461 | | |
462 | | # ifdef USE_ST_LIBRARY |
463 | | |
464 | | typedef st_table NameTable; |
465 | | typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */ |
466 | | |
467 | | # ifdef ONIG_DEBUG |
468 | | static int |
469 | | i_print_name_entry(UChar* key, NameEntry* e, void* arg) |
470 | | { |
471 | | int i; |
472 | | FILE* fp = (FILE* )arg; |
473 | | |
474 | | fprintf(fp, "%s: ", e->name); |
475 | | if (e->back_num == 0) |
476 | | fputs("-", fp); |
477 | | else if (e->back_num == 1) |
478 | | fprintf(fp, "%d", e->back_ref1); |
479 | | else { |
480 | | for (i = 0; i < e->back_num; i++) { |
481 | | if (i > 0) fprintf(fp, ", "); |
482 | | fprintf(fp, "%d", e->back_refs[i]); |
483 | | } |
484 | | } |
485 | | fputs("\n", fp); |
486 | | return ST_CONTINUE; |
487 | | } |
488 | | |
489 | | extern int |
490 | | onig_print_names(FILE* fp, regex_t* reg) |
491 | | { |
492 | | NameTable* t = (NameTable* )reg->name_table; |
493 | | |
494 | | if (IS_NOT_NULL(t)) { |
495 | | fprintf(fp, "name table\n"); |
496 | | onig_st_foreach(t, i_print_name_entry, (HashDataType )fp); |
497 | | fputs("\n", fp); |
498 | | } |
499 | | return 0; |
500 | | } |
501 | | # endif /* ONIG_DEBUG */ |
502 | | |
503 | | static int |
504 | | i_free_name_entry(st_data_t vkey, st_data_t ve, st_data_t arg_data ARG_UNUSED, int existing ARG_UNUSED) |
505 | 239k | { |
506 | 239k | NameEntry* e = (NameEntry*)ve; |
507 | 239k | UChar* key = (UChar*)vkey; |
508 | 239k | xfree(e->name); |
509 | 239k | if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); |
510 | 239k | xfree(key); |
511 | 239k | xfree(e); |
512 | 239k | return ST_DELETE; |
513 | 239k | } |
514 | | |
515 | | static int |
516 | | names_clear(regex_t* reg) |
517 | 1.67M | { |
518 | 1.67M | NameTable* t = (NameTable* )reg->name_table; |
519 | | |
520 | 1.67M | if (IS_NOT_NULL(t)) { |
521 | 78.0k | onig_st_foreach(t, i_free_name_entry, 0); |
522 | 78.0k | } |
523 | 1.67M | return 0; |
524 | 1.67M | } |
525 | | |
526 | | extern int |
527 | | onig_names_free(regex_t* reg) |
528 | 836k | { |
529 | 836k | int r; |
530 | 836k | NameTable* t; |
531 | | |
532 | 836k | r = names_clear(reg); |
533 | 836k | if (r) return r; |
534 | | |
535 | 836k | t = (NameTable* )reg->name_table; |
536 | 836k | if (IS_NOT_NULL(t)) onig_st_free_table(t); |
537 | 836k | reg->name_table = (void* )NULL; |
538 | 836k | return 0; |
539 | 836k | } |
540 | | |
541 | | static NameEntry* |
542 | | name_find(regex_t* reg, const UChar* name, const UChar* name_end) |
543 | 336k | { |
544 | 336k | NameEntry* e; |
545 | 336k | NameTable* t = (NameTable* )reg->name_table; |
546 | | |
547 | 336k | e = (NameEntry* )NULL; |
548 | 336k | if (IS_NOT_NULL(t)) { |
549 | 258k | onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e))); |
550 | 258k | } |
551 | 336k | return e; |
552 | 336k | } |
553 | | |
554 | | typedef struct { |
555 | | int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*); |
556 | | regex_t* reg; |
557 | | void* arg; |
558 | | int ret; |
559 | | OnigEncoding enc; |
560 | | } INamesArg; |
561 | | |
562 | | static int |
563 | | i_names(st_data_t key_data ARG_UNUSED, st_data_t value_data, st_data_t arg_data, int existing ARG_UNUSED) |
564 | 89.3k | { |
565 | | /* Cast back to original types */ |
566 | 89.3k | NameEntry* e = (NameEntry*)value_data; |
567 | 89.3k | INamesArg* arg = (INamesArg*)arg_data; |
568 | | |
569 | 89.3k | int r = (*(arg->func))(e->name, |
570 | 89.3k | e->name + e->name_len, |
571 | 89.3k | e->back_num, |
572 | 89.3k | (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), |
573 | 89.3k | arg->reg, arg->arg); |
574 | 89.3k | if (r != 0) { |
575 | 0 | arg->ret = r; |
576 | 0 | return ST_STOP; |
577 | 0 | } |
578 | 89.3k | return ST_CONTINUE; |
579 | 89.3k | } |
580 | | |
581 | | extern int |
582 | | onig_foreach_name(regex_t* reg, |
583 | | int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) |
584 | 28.1k | { |
585 | 28.1k | INamesArg narg; |
586 | 28.1k | NameTable* t = (NameTable* )reg->name_table; |
587 | | |
588 | 28.1k | narg.ret = 0; |
589 | 28.1k | if (IS_NOT_NULL(t)) { |
590 | 28.1k | narg.func = func; |
591 | 28.1k | narg.reg = reg; |
592 | 28.1k | narg.arg = arg; |
593 | 28.1k | narg.enc = reg->enc; /* should be pattern encoding. */ |
594 | 28.1k | onig_st_foreach(t, i_names, (st_data_t)&narg); |
595 | 28.1k | } |
596 | 28.1k | return narg.ret; |
597 | 28.1k | } |
598 | | |
599 | | static int |
600 | | i_renumber_name(st_data_t key_data ARG_UNUSED, st_data_t ve, st_data_t vmap, int existing ARG_UNUSED) |
601 | 2.34k | { |
602 | 2.34k | int i; |
603 | | |
604 | 2.34k | NameEntry* e = (NameEntry*)ve; |
605 | 2.34k | GroupNumRemap* map = (GroupNumRemap*)vmap; |
606 | 2.34k | if (e->back_num > 1) { |
607 | 1.83k | for (i = 0; i < e->back_num; i++) { |
608 | 1.72k | e->back_refs[i] = map[e->back_refs[i]].new_val; |
609 | 1.72k | } |
610 | 104 | } |
611 | 2.24k | else if (e->back_num == 1) { |
612 | 2.24k | e->back_ref1 = map[e->back_ref1].new_val; |
613 | 2.24k | } |
614 | | |
615 | 2.34k | return ST_CONTINUE; |
616 | 2.34k | } |
617 | | |
618 | | extern int |
619 | | onig_renumber_name_table(regex_t* reg, GroupNumRemap* map) |
620 | 2.32k | { |
621 | 2.32k | NameTable* t = (NameTable* )reg->name_table; |
622 | | |
623 | 2.32k | if (IS_NOT_NULL(t)) { |
624 | 2.32k | onig_st_foreach(t, i_renumber_name, (st_data_t)map); |
625 | 2.32k | } |
626 | 2.32k | return 0; |
627 | 2.32k | } |
628 | | |
629 | | |
630 | | extern int |
631 | | onig_number_of_names(const regex_t* reg) |
632 | 0 | { |
633 | 0 | NameTable* t = (NameTable* )reg->name_table; |
634 | |
|
635 | 0 | if (IS_NOT_NULL(t)) |
636 | 0 | return (int )t->num_entries; |
637 | 0 | else |
638 | 0 | return 0; |
639 | 0 | } |
640 | | |
641 | | # else /* USE_ST_LIBRARY */ |
642 | | |
643 | | # define INIT_NAMES_ALLOC_NUM 8 |
644 | | |
645 | | typedef struct { |
646 | | NameEntry* e; |
647 | | int num; |
648 | | int alloc; |
649 | | } NameTable; |
650 | | |
651 | | # ifdef ONIG_DEBUG |
652 | | extern int |
653 | | onig_print_names(FILE* fp, regex_t* reg) |
654 | | { |
655 | | int i, j; |
656 | | NameEntry* e; |
657 | | NameTable* t = (NameTable* )reg->name_table; |
658 | | |
659 | | if (IS_NOT_NULL(t) && t->num > 0) { |
660 | | fprintf(fp, "name table\n"); |
661 | | for (i = 0; i < t->num; i++) { |
662 | | e = &(t->e[i]); |
663 | | fprintf(fp, "%s: ", e->name); |
664 | | if (e->back_num == 0) { |
665 | | fputs("-", fp); |
666 | | } |
667 | | else if (e->back_num == 1) { |
668 | | fprintf(fp, "%d", e->back_ref1); |
669 | | } |
670 | | else { |
671 | | for (j = 0; j < e->back_num; j++) { |
672 | | if (j > 0) fprintf(fp, ", "); |
673 | | fprintf(fp, "%d", e->back_refs[j]); |
674 | | } |
675 | | } |
676 | | fputs("\n", fp); |
677 | | } |
678 | | fputs("\n", fp); |
679 | | } |
680 | | return 0; |
681 | | } |
682 | | # endif |
683 | | |
684 | | static int |
685 | | names_clear(regex_t* reg) |
686 | | { |
687 | | int i; |
688 | | NameEntry* e; |
689 | | NameTable* t = (NameTable* )reg->name_table; |
690 | | |
691 | | if (IS_NOT_NULL(t)) { |
692 | | for (i = 0; i < t->num; i++) { |
693 | | e = &(t->e[i]); |
694 | | if (IS_NOT_NULL(e->name)) { |
695 | | xfree(e->name); |
696 | | e->name = NULL; |
697 | | e->name_len = 0; |
698 | | e->back_num = 0; |
699 | | e->back_alloc = 0; |
700 | | if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); |
701 | | e->back_refs = (int* )NULL; |
702 | | } |
703 | | } |
704 | | if (IS_NOT_NULL(t->e)) { |
705 | | xfree(t->e); |
706 | | t->e = NULL; |
707 | | } |
708 | | t->num = 0; |
709 | | } |
710 | | return 0; |
711 | | } |
712 | | |
713 | | extern int |
714 | | onig_names_free(regex_t* reg) |
715 | | { |
716 | | int r; |
717 | | NameTable* t; |
718 | | |
719 | | r = names_clear(reg); |
720 | | if (r) return r; |
721 | | |
722 | | t = (NameTable* )reg->name_table; |
723 | | if (IS_NOT_NULL(t)) xfree(t); |
724 | | reg->name_table = NULL; |
725 | | return 0; |
726 | | } |
727 | | |
728 | | static NameEntry* |
729 | | name_find(regex_t* reg, const UChar* name, const UChar* name_end) |
730 | | { |
731 | | int i, len; |
732 | | NameEntry* e; |
733 | | NameTable* t = (NameTable* )reg->name_table; |
734 | | |
735 | | if (IS_NOT_NULL(t)) { |
736 | | len = name_end - name; |
737 | | for (i = 0; i < t->num; i++) { |
738 | | e = &(t->e[i]); |
739 | | if (len == e->name_len && onig_strncmp(name, e->name, len) == 0) |
740 | | return e; |
741 | | } |
742 | | } |
743 | | return (NameEntry* )NULL; |
744 | | } |
745 | | |
746 | | extern int |
747 | | onig_foreach_name(regex_t* reg, |
748 | | int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) |
749 | | { |
750 | | int i, r; |
751 | | NameEntry* e; |
752 | | NameTable* t = (NameTable* )reg->name_table; |
753 | | |
754 | | if (IS_NOT_NULL(t)) { |
755 | | for (i = 0; i < t->num; i++) { |
756 | | e = &(t->e[i]); |
757 | | r = (*func)(e->name, e->name + e->name_len, e->back_num, |
758 | | (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), |
759 | | reg, arg); |
760 | | if (r != 0) return r; |
761 | | } |
762 | | } |
763 | | return 0; |
764 | | } |
765 | | |
766 | | extern int |
767 | | onig_number_of_names(const regex_t* reg) |
768 | | { |
769 | | NameTable* t = (NameTable* )reg->name_table; |
770 | | |
771 | | if (IS_NOT_NULL(t)) |
772 | | return t->num; |
773 | | else |
774 | | return 0; |
775 | | } |
776 | | |
777 | | # endif /* else USE_ST_LIBRARY */ |
778 | | |
779 | | static int |
780 | | name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) |
781 | 246k | { |
782 | 246k | int alloc; |
783 | 246k | NameEntry* e; |
784 | 246k | NameTable* t = (NameTable* )reg->name_table; |
785 | | |
786 | 246k | if (name_end - name <= 0) |
787 | 0 | return ONIGERR_EMPTY_GROUP_NAME; |
788 | | |
789 | 246k | e = name_find(reg, name, name_end); |
790 | 246k | if (IS_NULL(e)) { |
791 | 239k | # ifdef USE_ST_LIBRARY |
792 | 239k | if (IS_NULL(t)) { |
793 | 78.0k | t = onig_st_init_strend_table_with_size(5); |
794 | 78.0k | reg->name_table = (void* )t; |
795 | 78.0k | } |
796 | 239k | e = (NameEntry* )xmalloc(sizeof(NameEntry)); |
797 | 239k | CHECK_NULL_RETURN_MEMERR(e); |
798 | | |
799 | 239k | e->name = strdup_with_null(reg->enc, name, name_end); |
800 | 239k | if (IS_NULL(e->name)) { |
801 | 0 | xfree(e); |
802 | 0 | return ONIGERR_MEMORY; |
803 | 0 | } |
804 | 239k | onig_st_insert_strend(t, e->name, (e->name + (name_end - name)), |
805 | 239k | (HashDataType )e); |
806 | | |
807 | 239k | e->name_len = name_end - name; |
808 | 239k | e->back_num = 0; |
809 | 239k | e->back_alloc = 0; |
810 | 239k | e->back_refs = (int* )NULL; |
811 | | |
812 | | # else |
813 | | |
814 | | if (IS_NULL(t)) { |
815 | | alloc = INIT_NAMES_ALLOC_NUM; |
816 | | t = (NameTable* )xmalloc(sizeof(NameTable)); |
817 | | CHECK_NULL_RETURN_MEMERR(t); |
818 | | t->e = NULL; |
819 | | t->alloc = 0; |
820 | | t->num = 0; |
821 | | |
822 | | t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc); |
823 | | if (IS_NULL(t->e)) { |
824 | | xfree(t); |
825 | | return ONIGERR_MEMORY; |
826 | | } |
827 | | t->alloc = alloc; |
828 | | reg->name_table = t; |
829 | | goto clear; |
830 | | } |
831 | | else if (t->num == t->alloc) { |
832 | | int i; |
833 | | NameEntry* p; |
834 | | |
835 | | alloc = t->alloc * 2; |
836 | | p = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc); |
837 | | CHECK_NULL_RETURN_MEMERR(p); |
838 | | t->e = p; |
839 | | t->alloc = alloc; |
840 | | |
841 | | clear: |
842 | | for (i = t->num; i < t->alloc; i++) { |
843 | | t->e[i].name = NULL; |
844 | | t->e[i].name_len = 0; |
845 | | t->e[i].back_num = 0; |
846 | | t->e[i].back_alloc = 0; |
847 | | t->e[i].back_refs = (int* )NULL; |
848 | | } |
849 | | } |
850 | | e = &(t->e[t->num]); |
851 | | t->num++; |
852 | | e->name = strdup_with_null(reg->enc, name, name_end); |
853 | | if (IS_NULL(e->name)) return ONIGERR_MEMORY; |
854 | | e->name_len = name_end - name; |
855 | | # endif |
856 | 239k | } |
857 | | |
858 | 246k | if (e->back_num >= 1 && |
859 | 246k | ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) { |
860 | 0 | onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME, |
861 | 0 | name, name_end); |
862 | 0 | return ONIGERR_MULTIPLEX_DEFINED_NAME; |
863 | 0 | } |
864 | | |
865 | 246k | e->back_num++; |
866 | 246k | if (e->back_num == 1) { |
867 | 239k | e->back_ref1 = backref; |
868 | 239k | } |
869 | 6.39k | else { |
870 | 6.39k | if (e->back_num == 2) { |
871 | 142 | alloc = INIT_NAME_BACKREFS_ALLOC_NUM; |
872 | 142 | e->back_refs = (int* )xmalloc(sizeof(int) * alloc); |
873 | 142 | CHECK_NULL_RETURN_MEMERR(e->back_refs); |
874 | 142 | e->back_alloc = alloc; |
875 | 142 | e->back_refs[0] = e->back_ref1; |
876 | 142 | e->back_refs[1] = backref; |
877 | 142 | } |
878 | 6.25k | else { |
879 | 6.25k | if (e->back_num > e->back_alloc) { |
880 | 98 | int* p; |
881 | 98 | alloc = e->back_alloc * 2; |
882 | 98 | p = (int* )xrealloc(e->back_refs, sizeof(int) * alloc); |
883 | 98 | CHECK_NULL_RETURN_MEMERR(p); |
884 | 98 | e->back_refs = p; |
885 | 98 | e->back_alloc = alloc; |
886 | 98 | } |
887 | 6.25k | e->back_refs[e->back_num - 1] = backref; |
888 | 6.25k | } |
889 | 6.39k | } |
890 | | |
891 | 246k | return 0; |
892 | 246k | } |
893 | | |
894 | | extern int |
895 | | onig_name_to_group_numbers(regex_t* reg, const UChar* name, |
896 | | const UChar* name_end, int** nums) |
897 | 90.9k | { |
898 | 90.9k | NameEntry* e = name_find(reg, name, name_end); |
899 | | |
900 | 90.9k | if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE; |
901 | | |
902 | 90.9k | switch (e->back_num) { |
903 | 0 | case 0: |
904 | 0 | *nums = 0; |
905 | 0 | break; |
906 | 89.4k | case 1: |
907 | 89.4k | *nums = &(e->back_ref1); |
908 | 89.4k | break; |
909 | 1.49k | default: |
910 | 1.49k | *nums = e->back_refs; |
911 | 1.49k | break; |
912 | 90.9k | } |
913 | 90.9k | return e->back_num; |
914 | 90.9k | } |
915 | | |
916 | | extern int |
917 | | onig_name_to_backref_number(regex_t* reg, const UChar* name, |
918 | | const UChar* name_end, const OnigRegion *region) |
919 | 89.3k | { |
920 | 89.3k | int i, n, *nums; |
921 | | |
922 | 89.3k | n = onig_name_to_group_numbers(reg, name, name_end, &nums); |
923 | 89.3k | if (n < 0) |
924 | 0 | return n; |
925 | 89.3k | else if (n == 0) |
926 | 0 | return ONIGERR_PARSER_BUG; |
927 | 89.3k | else if (n == 1) |
928 | 89.3k | return nums[0]; |
929 | 0 | else { |
930 | 0 | if (IS_NOT_NULL(region)) { |
931 | 0 | for (i = n - 1; i >= 0; i--) { |
932 | 0 | if (region->beg[nums[i]] != ONIG_REGION_NOTPOS) |
933 | 0 | return nums[i]; |
934 | 0 | } |
935 | 0 | } |
936 | 0 | return nums[n - 1]; |
937 | 0 | } |
938 | 89.3k | } |
939 | | |
940 | | #else /* USE_NAMED_GROUP */ |
941 | | |
942 | | extern int |
943 | | onig_name_to_group_numbers(regex_t* reg, const UChar* name, |
944 | | const UChar* name_end, int** nums) |
945 | | { |
946 | | return ONIG_NO_SUPPORT_CONFIG; |
947 | | } |
948 | | |
949 | | extern int |
950 | | onig_name_to_backref_number(regex_t* reg, const UChar* name, |
951 | | const UChar* name_end, const OnigRegion* region) |
952 | | { |
953 | | return ONIG_NO_SUPPORT_CONFIG; |
954 | | } |
955 | | |
956 | | extern int |
957 | | onig_foreach_name(regex_t* reg, |
958 | | int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) |
959 | | { |
960 | | return ONIG_NO_SUPPORT_CONFIG; |
961 | | } |
962 | | |
963 | | extern int |
964 | | onig_number_of_names(const regex_t* reg) |
965 | | { |
966 | | return 0; |
967 | | } |
968 | | #endif /* else USE_NAMED_GROUP */ |
969 | | |
970 | | extern int |
971 | | onig_noname_group_capture_is_active(const regex_t* reg) |
972 | 0 | { |
973 | 0 | if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP)) |
974 | 0 | return 0; |
975 | | |
976 | 0 | #ifdef USE_NAMED_GROUP |
977 | 0 | if (onig_number_of_names(reg) > 0 && |
978 | 0 | IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && |
979 | 0 | !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) { |
980 | 0 | return 0; |
981 | 0 | } |
982 | 0 | #endif |
983 | | |
984 | 0 | return 1; |
985 | 0 | } |
986 | | |
987 | | |
988 | 622 | #define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16 |
989 | | |
990 | | static void |
991 | | scan_env_clear(ScanEnv* env) |
992 | 836k | { |
993 | 836k | int i; |
994 | | |
995 | 836k | BIT_STATUS_CLEAR(env->capture_history); |
996 | 836k | BIT_STATUS_CLEAR(env->bt_mem_start); |
997 | 836k | BIT_STATUS_CLEAR(env->bt_mem_end); |
998 | 836k | BIT_STATUS_CLEAR(env->backrefed_mem); |
999 | 836k | env->error = (UChar* )NULL; |
1000 | 836k | env->error_end = (UChar* )NULL; |
1001 | 836k | env->num_call = 0; |
1002 | 836k | env->num_mem = 0; |
1003 | 836k | #ifdef USE_NAMED_GROUP |
1004 | 836k | env->num_named = 0; |
1005 | 836k | #endif |
1006 | 836k | env->mem_alloc = 0; |
1007 | 836k | env->mem_nodes_dynamic = (Node** )NULL; |
1008 | | |
1009 | 7.52M | for (i = 0; i < SCANENV_MEMNODES_SIZE; i++) |
1010 | 6.68M | env->mem_nodes_static[i] = NULL_NODE; |
1011 | | |
1012 | | #ifdef USE_COMBINATION_EXPLOSION_CHECK |
1013 | | env->num_comb_exp_check = 0; |
1014 | | env->comb_exp_max_regnum = 0; |
1015 | | env->curr_max_regnum = 0; |
1016 | | env->has_recursion = 0; |
1017 | | #endif |
1018 | 836k | env->parse_depth = 0; |
1019 | 836k | env->warnings_flag = 0; |
1020 | 836k | } |
1021 | | |
1022 | | static int |
1023 | | scan_env_add_mem_entry(ScanEnv* env) |
1024 | 355k | { |
1025 | 355k | int i, need, alloc; |
1026 | 355k | Node** p; |
1027 | | |
1028 | 355k | need = env->num_mem + 1; |
1029 | 355k | if (need > ONIG_MAX_CAPTURE_GROUP_NUM) |
1030 | 0 | return ONIGERR_TOO_MANY_CAPTURE_GROUPS; |
1031 | 355k | if (need >= SCANENV_MEMNODES_SIZE) { |
1032 | 51.5k | if (env->mem_alloc <= need) { |
1033 | 1.32k | if (IS_NULL(env->mem_nodes_dynamic)) { |
1034 | 622 | alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE; |
1035 | 622 | p = (Node** )xmalloc(sizeof(Node*) * alloc); |
1036 | 622 | CHECK_NULL_RETURN_MEMERR(p); |
1037 | 622 | xmemcpy(p, env->mem_nodes_static, |
1038 | 622 | sizeof(Node*) * SCANENV_MEMNODES_SIZE); |
1039 | 622 | } |
1040 | 702 | else { |
1041 | 702 | alloc = env->mem_alloc * 2; |
1042 | 702 | p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc); |
1043 | 702 | CHECK_NULL_RETURN_MEMERR(p); |
1044 | 702 | } |
1045 | | |
1046 | 85.1k | for (i = env->num_mem + 1; i < alloc; i++) |
1047 | 83.8k | p[i] = NULL_NODE; |
1048 | | |
1049 | 1.32k | env->mem_nodes_dynamic = p; |
1050 | 1.32k | env->mem_alloc = alloc; |
1051 | 1.32k | } |
1052 | 51.5k | } |
1053 | | |
1054 | 355k | env->num_mem++; |
1055 | 355k | return env->num_mem; |
1056 | 355k | } |
1057 | | |
1058 | | static int |
1059 | | scan_env_set_mem_node(ScanEnv* env, int num, Node* node) |
1060 | 329k | { |
1061 | 329k | if (env->num_mem >= num) |
1062 | 329k | SCANENV_MEM_NODES(env)[num] = node; |
1063 | 0 | else |
1064 | 0 | return ONIGERR_PARSER_BUG; |
1065 | 329k | return 0; |
1066 | 329k | } |
1067 | | |
1068 | | |
1069 | | extern void |
1070 | | onig_node_free(Node* node) |
1071 | 14.2M | { |
1072 | 22.5M | start: |
1073 | 22.5M | if (IS_NULL(node)) return ; |
1074 | | |
1075 | 19.7M | switch (NTYPE(node)) { |
1076 | 2.13M | case NT_STR: |
1077 | 2.13M | if (NSTR(node)->capa != 0 && |
1078 | 2.13M | IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) { |
1079 | 132k | xfree(NSTR(node)->s); |
1080 | 132k | } |
1081 | 2.13M | break; |
1082 | | |
1083 | 5.88M | case NT_LIST: |
1084 | 8.32M | case NT_ALT: |
1085 | 8.32M | onig_node_free(NCAR(node)); |
1086 | 8.32M | { |
1087 | 8.32M | Node* next_node = NCDR(node); |
1088 | | |
1089 | 8.32M | xfree(node); |
1090 | 8.32M | node = next_node; |
1091 | 8.32M | goto start; |
1092 | 5.88M | } |
1093 | 0 | break; |
1094 | | |
1095 | 3.23M | case NT_CCLASS: |
1096 | 3.23M | { |
1097 | 3.23M | CClassNode* cc = NCCLASS(node); |
1098 | | |
1099 | 3.23M | if (cc->mbuf) |
1100 | 2.36M | bbuf_free(cc->mbuf); |
1101 | 3.23M | } |
1102 | 3.23M | break; |
1103 | | |
1104 | 2.88M | case NT_QTFR: |
1105 | 2.88M | if (NQTFR(node)->target) |
1106 | 2.87M | onig_node_free(NQTFR(node)->target); |
1107 | 2.88M | break; |
1108 | | |
1109 | 1.54M | case NT_ENCLOSE: |
1110 | 1.54M | if (NENCLOSE(node)->target) |
1111 | 1.51M | onig_node_free(NENCLOSE(node)->target); |
1112 | 1.54M | break; |
1113 | | |
1114 | 9.67k | case NT_BREF: |
1115 | 9.67k | if (IS_NOT_NULL(NBREF(node)->back_dynamic)) |
1116 | 1.29k | xfree(NBREF(node)->back_dynamic); |
1117 | 9.67k | break; |
1118 | | |
1119 | 1.02M | case NT_ANCHOR: |
1120 | 1.02M | if (NANCHOR(node)->target) |
1121 | 6.04k | onig_node_free(NANCHOR(node)->target); |
1122 | 1.02M | break; |
1123 | 19.7M | } |
1124 | | |
1125 | 11.4M | xfree(node); |
1126 | 11.4M | } |
1127 | | |
1128 | | static Node* |
1129 | | node_new(void) |
1130 | 19.7M | { |
1131 | 19.7M | Node* node; |
1132 | | |
1133 | 19.7M | node = (Node* )xmalloc(sizeof(Node)); |
1134 | | /* xmemset(node, 0, sizeof(Node)); */ |
1135 | 19.7M | return node; |
1136 | 19.7M | } |
1137 | | |
1138 | | static void |
1139 | | initialize_cclass(CClassNode* cc) |
1140 | 4.16M | { |
1141 | 4.16M | BITSET_CLEAR(cc->bs); |
1142 | | /* cc->base.flags = 0; */ |
1143 | 4.16M | cc->flags = 0; |
1144 | 4.16M | cc->mbuf = NULL; |
1145 | 4.16M | } |
1146 | | |
1147 | | static Node* |
1148 | | node_new_cclass(void) |
1149 | 3.23M | { |
1150 | 3.23M | Node* node = node_new(); |
1151 | 3.23M | CHECK_NULL_RETURN(node); |
1152 | | |
1153 | 3.23M | SET_NTYPE(node, NT_CCLASS); |
1154 | 3.23M | initialize_cclass(NCCLASS(node)); |
1155 | 3.23M | return node; |
1156 | 3.23M | } |
1157 | | |
1158 | | static Node* |
1159 | | node_new_ctype(int type, int not, int ascii_range) |
1160 | 4.94k | { |
1161 | 4.94k | Node* node = node_new(); |
1162 | 4.94k | CHECK_NULL_RETURN(node); |
1163 | | |
1164 | 4.94k | SET_NTYPE(node, NT_CTYPE); |
1165 | 4.94k | NCTYPE(node)->ctype = type; |
1166 | 4.94k | NCTYPE(node)->not = not; |
1167 | 4.94k | NCTYPE(node)->ascii_range = ascii_range; |
1168 | 4.94k | return node; |
1169 | 4.94k | } |
1170 | | |
1171 | | static Node* |
1172 | | node_new_anychar(void) |
1173 | 564k | { |
1174 | 564k | Node* node = node_new(); |
1175 | 564k | CHECK_NULL_RETURN(node); |
1176 | | |
1177 | 564k | SET_NTYPE(node, NT_CANY); |
1178 | 564k | return node; |
1179 | 564k | } |
1180 | | |
1181 | | static Node* |
1182 | | node_new_list(Node* left, Node* right) |
1183 | 5.87M | { |
1184 | 5.87M | Node* node = node_new(); |
1185 | 5.87M | CHECK_NULL_RETURN(node); |
1186 | | |
1187 | 5.87M | SET_NTYPE(node, NT_LIST); |
1188 | 5.87M | NCAR(node) = left; |
1189 | 5.87M | NCDR(node) = right; |
1190 | 5.87M | return node; |
1191 | 5.87M | } |
1192 | | |
1193 | | extern Node* |
1194 | | onig_node_new_list(Node* left, Node* right) |
1195 | 19.3k | { |
1196 | 19.3k | return node_new_list(left, right); |
1197 | 19.3k | } |
1198 | | |
1199 | | extern Node* |
1200 | | onig_node_list_add(Node* list, Node* x) |
1201 | 13.4k | { |
1202 | 13.4k | Node *n; |
1203 | | |
1204 | 13.4k | n = onig_node_new_list(x, NULL); |
1205 | 13.4k | if (IS_NULL(n)) return NULL_NODE; |
1206 | | |
1207 | 13.4k | if (IS_NOT_NULL(list)) { |
1208 | 10.3k | while (IS_NOT_NULL(NCDR(list))) |
1209 | 441 | list = NCDR(list); |
1210 | | |
1211 | 9.87k | NCDR(list) = n; |
1212 | 9.87k | } |
1213 | | |
1214 | 13.4k | return n; |
1215 | 13.4k | } |
1216 | | |
1217 | | extern Node* |
1218 | | onig_node_new_alt(Node* left, Node* right) |
1219 | 2.44M | { |
1220 | 2.44M | Node* node = node_new(); |
1221 | 2.44M | CHECK_NULL_RETURN(node); |
1222 | | |
1223 | 2.44M | SET_NTYPE(node, NT_ALT); |
1224 | 2.44M | NCAR(node) = left; |
1225 | 2.44M | NCDR(node) = right; |
1226 | 2.44M | return node; |
1227 | 2.44M | } |
1228 | | |
1229 | | extern Node* |
1230 | | onig_node_new_anchor(int type) |
1231 | 1.02M | { |
1232 | 1.02M | Node* node = node_new(); |
1233 | 1.02M | CHECK_NULL_RETURN(node); |
1234 | | |
1235 | 1.02M | SET_NTYPE(node, NT_ANCHOR); |
1236 | 1.02M | NANCHOR(node)->type = type; |
1237 | 1.02M | NANCHOR(node)->target = NULL; |
1238 | 1.02M | NANCHOR(node)->char_len = -1; |
1239 | 1.02M | NANCHOR(node)->ascii_range = 0; |
1240 | 1.02M | return node; |
1241 | 1.02M | } |
1242 | | |
1243 | | static Node* |
1244 | | node_new_backref(int back_num, int* backrefs, int by_name, |
1245 | | #ifdef USE_BACKREF_WITH_LEVEL |
1246 | | int exist_level, int nest_level, |
1247 | | #endif |
1248 | | ScanEnv* env) |
1249 | 9.67k | { |
1250 | 9.67k | int i; |
1251 | 9.67k | Node* node = node_new(); |
1252 | | |
1253 | 9.67k | CHECK_NULL_RETURN(node); |
1254 | | |
1255 | 9.67k | SET_NTYPE(node, NT_BREF); |
1256 | 9.67k | NBREF(node)->state = 0; |
1257 | 9.67k | NBREF(node)->back_num = back_num; |
1258 | 9.67k | NBREF(node)->back_dynamic = (int* )NULL; |
1259 | 9.67k | if (by_name != 0) |
1260 | 1.37k | NBREF(node)->state |= NST_NAME_REF; |
1261 | | |
1262 | 9.67k | #ifdef USE_BACKREF_WITH_LEVEL |
1263 | 9.67k | if (exist_level != 0) { |
1264 | 1.75k | NBREF(node)->state |= NST_NEST_LEVEL; |
1265 | 1.75k | NBREF(node)->nest_level = nest_level; |
1266 | 1.75k | } |
1267 | 9.67k | #endif |
1268 | | |
1269 | 271k | for (i = 0; i < back_num; i++) { |
1270 | 265k | if (backrefs[i] <= env->num_mem && |
1271 | 265k | IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) { |
1272 | 3.81k | NBREF(node)->state |= NST_RECURSION; /* /...(\1).../ */ |
1273 | 3.81k | break; |
1274 | 3.81k | } |
1275 | 265k | } |
1276 | | |
1277 | 9.67k | if (back_num <= NODE_BACKREFS_SIZE) { |
1278 | 16.9k | for (i = 0; i < back_num; i++) |
1279 | 8.59k | NBREF(node)->back_static[i] = backrefs[i]; |
1280 | 8.37k | } |
1281 | 1.29k | else { |
1282 | 1.29k | int* p = (int* )xmalloc(sizeof(int) * back_num); |
1283 | 1.29k | if (IS_NULL(p)) { |
1284 | 0 | onig_node_free(node); |
1285 | 0 | return NULL; |
1286 | 0 | } |
1287 | 1.29k | NBREF(node)->back_dynamic = p; |
1288 | 279k | for (i = 0; i < back_num; i++) |
1289 | 277k | p[i] = backrefs[i]; |
1290 | 1.29k | } |
1291 | 9.67k | return node; |
1292 | 9.67k | } |
1293 | | |
#ifdef USE_SUBEXP_CALL
/*
 * Create an NT_CALL node that remembers the name span [name, name_end) and
 * the group number `gnum` (call by number if gnum != 0).  The target is
 * left unset.  Returns NULL when allocation fails.
 */
static Node*
node_new_call(UChar* name, UChar* name_end, int gnum)
{
  Node* call = node_new();

  if (IS_NOT_NULL(call)) {
    SET_NTYPE(call, NT_CALL);
    NCALL(call)->group_num = gnum;
    NCALL(call)->name_end  = name_end;
    NCALL(call)->name      = name;
    NCALL(call)->target    = NULL_NODE;
    NCALL(call)->state     = 0;
  }
  return call;
}
#endif
1310 | | |
1311 | | static Node* |
1312 | | node_new_quantifier(int lower, int upper, int by_number) |
1313 | 2.88M | { |
1314 | 2.88M | Node* node = node_new(); |
1315 | 2.88M | CHECK_NULL_RETURN(node); |
1316 | | |
1317 | 2.88M | SET_NTYPE(node, NT_QTFR); |
1318 | 2.88M | NQTFR(node)->state = 0; |
1319 | 2.88M | NQTFR(node)->target = NULL; |
1320 | 2.88M | NQTFR(node)->lower = lower; |
1321 | 2.88M | NQTFR(node)->upper = upper; |
1322 | 2.88M | NQTFR(node)->greedy = 1; |
1323 | 2.88M | NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY; |
1324 | 2.88M | NQTFR(node)->head_exact = NULL_NODE; |
1325 | 2.88M | NQTFR(node)->next_head_exact = NULL_NODE; |
1326 | 2.88M | NQTFR(node)->is_referred = 0; |
1327 | 2.88M | if (by_number != 0) |
1328 | 26.3k | NQTFR(node)->state |= NST_BY_NUMBER; |
1329 | | |
1330 | | #ifdef USE_COMBINATION_EXPLOSION_CHECK |
1331 | | NQTFR(node)->comb_exp_check_num = 0; |
1332 | | #endif |
1333 | | |
1334 | 2.88M | return node; |
1335 | 2.88M | } |
1336 | | |
1337 | | static Node* |
1338 | | node_new_enclose(int type) |
1339 | 1.54M | { |
1340 | 1.54M | Node* node = node_new(); |
1341 | 1.54M | CHECK_NULL_RETURN(node); |
1342 | | |
1343 | 1.54M | SET_NTYPE(node, NT_ENCLOSE); |
1344 | 1.54M | NENCLOSE(node)->type = type; |
1345 | 1.54M | NENCLOSE(node)->state = 0; |
1346 | 1.54M | NENCLOSE(node)->regnum = 0; |
1347 | 1.54M | NENCLOSE(node)->option = 0; |
1348 | 1.54M | NENCLOSE(node)->target = NULL; |
1349 | 1.54M | NENCLOSE(node)->call_addr = -1; |
1350 | 1.54M | NENCLOSE(node)->opt_count = 0; |
1351 | 1.54M | return node; |
1352 | 1.54M | } |
1353 | | |
1354 | | extern Node* |
1355 | | onig_node_new_enclose(int type) |
1356 | 647k | { |
1357 | 647k | return node_new_enclose(type); |
1358 | 647k | } |
1359 | | |
1360 | | static Node* |
1361 | | node_new_enclose_memory(OnigOptionType option, int is_named) |
1362 | 356k | { |
1363 | 356k | Node* node = node_new_enclose(ENCLOSE_MEMORY); |
1364 | 356k | CHECK_NULL_RETURN(node); |
1365 | 356k | if (is_named != 0) |
1366 | 246k | SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP); |
1367 | | |
1368 | 356k | #ifdef USE_SUBEXP_CALL |
1369 | 356k | NENCLOSE(node)->option = option; |
1370 | 356k | #endif |
1371 | 356k | return node; |
1372 | 356k | } |
1373 | | |
1374 | | static Node* |
1375 | | node_new_option(OnigOptionType option) |
1376 | 331k | { |
1377 | 331k | Node* node = node_new_enclose(ENCLOSE_OPTION); |
1378 | 331k | CHECK_NULL_RETURN(node); |
1379 | 331k | NENCLOSE(node)->option = option; |
1380 | 331k | return node; |
1381 | 331k | } |
1382 | | |
1383 | | extern int |
1384 | | onig_node_str_cat(Node* node, const UChar* s, const UChar* end) |
1385 | 13.7M | { |
1386 | 13.7M | ptrdiff_t addlen = end - s; |
1387 | | |
1388 | 13.7M | if (addlen > 0) { |
1389 | 13.6M | ptrdiff_t len = NSTR(node)->end - NSTR(node)->s; |
1390 | | |
1391 | 13.6M | if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) { |
1392 | 2.53M | UChar* p; |
1393 | 2.53M | ptrdiff_t capa = len + addlen + NODE_STR_MARGIN; |
1394 | | |
1395 | 2.53M | if (capa <= NSTR(node)->capa) { |
1396 | 0 | onig_strcpy(NSTR(node)->s + len, s, end); |
1397 | 0 | } |
1398 | 2.53M | else { |
1399 | 2.53M | if (NSTR(node)->s == NSTR(node)->buf) |
1400 | 133k | p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end, |
1401 | 133k | s, end, capa); |
1402 | 2.40M | else |
1403 | 2.40M | p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa); |
1404 | | |
1405 | 2.53M | CHECK_NULL_RETURN_MEMERR(p); |
1406 | 2.53M | NSTR(node)->s = p; |
1407 | 2.53M | NSTR(node)->capa = (int )capa; |
1408 | 2.53M | } |
1409 | 2.53M | } |
1410 | 11.0M | else { |
1411 | 11.0M | onig_strcpy(NSTR(node)->s + len, s, end); |
1412 | 11.0M | } |
1413 | 13.6M | NSTR(node)->end = NSTR(node)->s + len + addlen; |
1414 | 13.6M | } |
1415 | | |
1416 | 13.7M | return 0; |
1417 | 13.7M | } |
1418 | | |
1419 | | extern int |
1420 | | onig_node_str_set(Node* node, const UChar* s, const UChar* end) |
1421 | 35.9k | { |
1422 | 35.9k | onig_node_str_clear(node); |
1423 | 35.9k | return onig_node_str_cat(node, s, end); |
1424 | 35.9k | } |
1425 | | |
1426 | | static int |
1427 | | node_str_cat_char(Node* node, UChar c) |
1428 | 0 | { |
1429 | 0 | UChar s[1]; |
1430 | |
|
1431 | 0 | s[0] = c; |
1432 | 0 | return onig_node_str_cat(node, s, s + 1); |
1433 | 0 | } |
1434 | | |
1435 | | static int |
1436 | | node_str_cat_codepoint(Node* node, OnigEncoding enc, OnigCodePoint c) |
1437 | 2.76k | { |
1438 | 2.76k | UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; |
1439 | 2.76k | int num = ONIGENC_CODE_TO_MBC(enc, c, buf); |
1440 | 2.76k | if (num < 0) return num; |
1441 | 2.76k | return onig_node_str_cat(node, buf, buf + num); |
1442 | 2.76k | } |
1443 | | |
#if 0
/*
 * Compiled out by the surrounding `#if 0`.
 * As written, it retags `node` as NT_STR, stores `flag`, and points the
 * string at the node's empty inline buffer with capa 0.  It does not free
 * anything the node held before.
 */
extern void
onig_node_conv_to_str_node(Node* node, int flag)
{
  SET_NTYPE(node, NT_STR);
  NSTR(node)->flag = flag;
  NSTR(node)->capa = 0;
  NSTR(node)->s    = NSTR(node)->buf;
  NSTR(node)->end  = NSTR(node)->buf;
}
#endif
1455 | | |
1456 | | extern void |
1457 | | onig_node_str_clear(Node* node) |
1458 | 35.9k | { |
1459 | 35.9k | if (NSTR(node)->capa != 0 && |
1460 | 35.9k | IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) { |
1461 | 771 | xfree(NSTR(node)->s); |
1462 | 771 | } |
1463 | | |
1464 | 35.9k | NSTR(node)->capa = 0; |
1465 | 35.9k | NSTR(node)->flag = 0; |
1466 | 35.9k | NSTR(node)->s = NSTR(node)->buf; |
1467 | 35.9k | NSTR(node)->end = NSTR(node)->buf; |
1468 | 35.9k | } |
1469 | | |
1470 | | static Node* |
1471 | | node_new_str(const UChar* s, const UChar* end) |
1472 | 2.13M | { |
1473 | 2.13M | Node* node = node_new(); |
1474 | 2.13M | CHECK_NULL_RETURN(node); |
1475 | | |
1476 | 2.13M | SET_NTYPE(node, NT_STR); |
1477 | 2.13M | NSTR(node)->capa = 0; |
1478 | 2.13M | NSTR(node)->flag = 0; |
1479 | 2.13M | NSTR(node)->s = NSTR(node)->buf; |
1480 | 2.13M | NSTR(node)->end = NSTR(node)->buf; |
1481 | 2.13M | if (onig_node_str_cat(node, s, end)) { |
1482 | 0 | onig_node_free(node); |
1483 | 0 | return NULL; |
1484 | 0 | } |
1485 | 2.13M | return node; |
1486 | 2.13M | } |
1487 | | |
1488 | | extern Node* |
1489 | | onig_node_new_str(const UChar* s, const UChar* end) |
1490 | 96.6k | { |
1491 | 96.6k | return node_new_str(s, end); |
1492 | 96.6k | } |
1493 | | |
1494 | | static Node* |
1495 | | node_new_str_raw(UChar* s, UChar* end) |
1496 | 299k | { |
1497 | 299k | Node* node = node_new_str(s, end); |
1498 | 299k | if (IS_NOT_NULL(node)) |
1499 | 299k | NSTRING_SET_RAW(node); |
1500 | 299k | return node; |
1501 | 299k | } |
1502 | | |
1503 | | static Node* |
1504 | | node_new_empty(void) |
1505 | 64.0k | { |
1506 | 64.0k | return node_new_str(NULL, NULL); |
1507 | 64.0k | } |
1508 | | |
1509 | | static Node* |
1510 | | node_new_str_raw_char(UChar c) |
1511 | 4.19k | { |
1512 | 4.19k | UChar p[1]; |
1513 | | |
1514 | 4.19k | p[0] = c; |
1515 | 4.19k | return node_new_str_raw(p, p + 1); |
1516 | 4.19k | } |
1517 | | |
1518 | | static Node* |
1519 | | str_node_split_last_char(StrNode* sn, OnigEncoding enc) |
1520 | 36.2k | { |
1521 | 36.2k | const UChar *p; |
1522 | 36.2k | Node* n = NULL_NODE; |
1523 | | |
1524 | 36.2k | if (sn->end > sn->s) { |
1525 | 36.2k | p = onigenc_get_prev_char_head(enc, sn->s, sn->end, sn->end); |
1526 | 36.2k | if (p && p > sn->s) { /* can be split. */ |
1527 | 32.6k | n = node_new_str(p, sn->end); |
1528 | 32.6k | if (IS_NOT_NULL(n) && (sn->flag & NSTR_RAW) != 0) |
1529 | 0 | NSTRING_SET_RAW(n); |
1530 | 32.6k | sn->end = (UChar* )p; |
1531 | 32.6k | } |
1532 | 36.2k | } |
1533 | 36.2k | return n; |
1534 | 36.2k | } |
1535 | | |
1536 | | static int |
1537 | | str_node_can_be_split(StrNode* sn, OnigEncoding enc) |
1538 | 55.2k | { |
1539 | 55.2k | if (sn->end > sn->s) { |
1540 | 55.2k | return ((enclen(enc, sn->s, sn->end) < sn->end - sn->s) ? 1 : 0); |
1541 | 55.2k | } |
1542 | 0 | return 0; |
1543 | 55.2k | } |
1544 | | |
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
/*
 * Shift the bytes of `sn` right by `num` positions and fill the first `num`
 * bytes with `val`.  sn->end is advanced by `num`.
 *
 * Always returns 0.  The function is declared `int`, so it now returns a
 * defined value instead of falling off the end.
 *
 * NOTE(review): the contents are staged through a NODE_STR_BUF_SIZE-byte
 * local buffer and written back `num` bytes further on; this presumably
 * relies on callers passing only short inline strings - confirm.
 */
static int
node_str_head_pad(StrNode* sn, int num, UChar val)
{
  UChar buf[NODE_STR_BUF_SIZE];
  int i, len;

  len = (int )(sn->end - sn->s);
  onig_strcpy(buf, sn->s, sn->end);
  onig_strcpy(&(sn->s[num]), buf, buf + len);
  sn->end += num;

  for (i = 0; i < num; i++) {
    sn->s[i] = val;
  }

  return 0;
}
#endif
1562 | | |
1563 | | extern int |
1564 | | onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) |
1565 | 62.7k | { |
1566 | 62.7k | unsigned int num, val; |
1567 | 62.7k | OnigCodePoint c; |
1568 | 62.7k | UChar* p = *src; |
1569 | 62.7k | PFETCH_READY; |
1570 | | |
1571 | 62.7k | num = 0; |
1572 | 152k | while (!PEND) { |
1573 | 146k | PFETCH(c); |
1574 | 146k | if (ONIGENC_IS_CODE_DIGIT(enc, c)) { |
1575 | 90.7k | val = (unsigned int )DIGITVAL(c); |
1576 | 90.7k | if ((INT_MAX_LIMIT - val) / 10UL < num) |
1577 | 643 | return -1; /* overflow */ |
1578 | | |
1579 | 90.0k | num = num * 10 + val; |
1580 | 90.0k | } |
1581 | 55.8k | else { |
1582 | 55.8k | PUNFETCH; |
1583 | 55.8k | break; |
1584 | 55.8k | } |
1585 | 146k | } |
1586 | 62.0k | *src = p; |
1587 | 62.0k | return num; |
1588 | 62.7k | } |
1589 | | |
1590 | | static int |
1591 | | scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen, |
1592 | | int maxlen, OnigEncoding enc) |
1593 | 5.23k | { |
1594 | 5.23k | OnigCodePoint c; |
1595 | 5.23k | unsigned int num, val; |
1596 | 5.23k | int restlen; |
1597 | 5.23k | UChar* p = *src; |
1598 | 5.23k | PFETCH_READY; |
1599 | | |
1600 | 5.23k | restlen = maxlen - minlen; |
1601 | 5.23k | num = 0; |
1602 | 14.1k | while (!PEND && maxlen-- != 0) { |
1603 | 10.2k | PFETCH(c); |
1604 | 10.2k | if (ONIGENC_IS_CODE_XDIGIT(enc, c)) { |
1605 | 8.87k | val = (unsigned int )XDIGITVAL(enc,c); |
1606 | 8.87k | if ((INT_MAX_LIMIT - val) / 16UL < num) |
1607 | 0 | return -1; /* overflow */ |
1608 | | |
1609 | 8.87k | num = (num << 4) + XDIGITVAL(enc,c); |
1610 | 8.87k | } |
1611 | 1.34k | else { |
1612 | 1.34k | PUNFETCH; |
1613 | 1.34k | maxlen++; |
1614 | 1.34k | break; |
1615 | 1.34k | } |
1616 | 10.2k | } |
1617 | 5.23k | if (maxlen > restlen) |
1618 | 0 | return -2; /* not enough digits */ |
1619 | 5.23k | *src = p; |
1620 | 5.23k | return num; |
1621 | 5.23k | } |
1622 | | |
1623 | | static int |
1624 | | scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen, |
1625 | | OnigEncoding enc) |
1626 | 4.05k | { |
1627 | 4.05k | OnigCodePoint c; |
1628 | 4.05k | unsigned int num, val; |
1629 | 4.05k | UChar* p = *src; |
1630 | 4.05k | PFETCH_READY; |
1631 | | |
1632 | 4.05k | num = 0; |
1633 | 6.97k | while (!PEND && maxlen-- != 0) { |
1634 | 6.12k | PFETCH(c); |
1635 | 6.12k | if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') { |
1636 | 2.91k | val = ODIGITVAL(c); |
1637 | 2.91k | if ((INT_MAX_LIMIT - val) / 8UL < num) |
1638 | 0 | return -1; /* overflow */ |
1639 | | |
1640 | 2.91k | num = (num << 3) + val; |
1641 | 2.91k | } |
1642 | 3.20k | else { |
1643 | 3.20k | PUNFETCH; |
1644 | 3.20k | break; |
1645 | 3.20k | } |
1646 | 6.12k | } |
1647 | 4.05k | *src = p; |
1648 | 4.05k | return num; |
1649 | 4.05k | } |
1650 | | |
1651 | | |
1652 | | #define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \ |
1653 | 890M | BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT) |
1654 | | |
1655 | | /* data format: |
1656 | | [n][from-1][to-1][from-2][to-2] ... [from-n][to-n] |
1657 | | (all data size is OnigCodePoint) |
1658 | | */ |
1659 | | static int |
1660 | | new_code_range(BBuf** pbuf) |
1661 | 2.83M | { |
1662 | 2.83M | #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5) |
1663 | 2.83M | int r; |
1664 | 2.83M | OnigCodePoint n; |
1665 | 2.83M | BBuf* bbuf; |
1666 | | |
1667 | 2.83M | bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf)); |
1668 | 2.83M | CHECK_NULL_RETURN_MEMERR(*pbuf); |
1669 | 2.83M | r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE); |
1670 | 2.83M | if (r) return r; |
1671 | | |
1672 | 2.83M | n = 0; |
1673 | 2.83M | BBUF_WRITE_CODE_POINT(bbuf, 0, n); |
1674 | 2.83M | return 0; |
1675 | 2.83M | } |
1676 | | |
1677 | | static int |
1678 | | add_code_range_to_buf0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to, |
1679 | | int checkdup) |
1680 | 295M | { |
1681 | 295M | int r, inc_n, pos; |
1682 | 295M | OnigCodePoint low, high, bound, x; |
1683 | 295M | OnigCodePoint n, *data; |
1684 | 295M | BBuf* bbuf; |
1685 | | |
1686 | 295M | if (from > to) { |
1687 | 0 | n = from; from = to; to = n; |
1688 | 0 | } |
1689 | | |
1690 | 295M | if (IS_NULL(*pbuf)) { |
1691 | 2.83M | r = new_code_range(pbuf); |
1692 | 2.83M | if (r) return r; |
1693 | 2.83M | bbuf = *pbuf; |
1694 | 2.83M | n = 0; |
1695 | 2.83M | } |
1696 | 293M | else { |
1697 | 293M | bbuf = *pbuf; |
1698 | 293M | GET_CODE_POINT(n, bbuf->p); |
1699 | 293M | } |
1700 | 295M | data = (OnigCodePoint* )(bbuf->p); |
1701 | 295M | data++; |
1702 | | |
1703 | 295M | bound = (from == 0) ? 0 : n; |
1704 | 2.17G | for (low = 0; low < bound; ) { |
1705 | 1.88G | x = (low + bound) >> 1; |
1706 | 1.88G | if (from - 1 > data[x*2 + 1]) |
1707 | 1.77G | low = x + 1; |
1708 | 111M | else |
1709 | 111M | bound = x; |
1710 | 1.88G | } |
1711 | | |
1712 | 295M | high = (to == ONIG_LAST_CODE_POINT) ? n : low; |
1713 | 481M | for (bound = n; high < bound; ) { |
1714 | 186M | x = (high + bound) >> 1; |
1715 | 186M | if (to + 1 >= data[x*2]) |
1716 | 22.9M | high = x + 1; |
1717 | 163M | else |
1718 | 163M | bound = x; |
1719 | 186M | } |
1720 | | /* data[(low-1)*2+1] << from <= data[low*2] |
1721 | | * data[(high-1)*2+1] <= to << data[high*2] |
1722 | | */ |
1723 | | |
1724 | 295M | inc_n = low + 1 - high; |
1725 | 295M | if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM) |
1726 | 0 | return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES; |
1727 | | |
1728 | 295M | if (inc_n != 1) { |
1729 | 22.9M | if (checkdup && from <= data[low*2+1] |
1730 | 22.9M | && (data[low*2] <= from || data[low*2+1] <= to)) |
1731 | 32.4k | CC_DUP_WARN(env, from, to); |
1732 | 22.9M | if (from > data[low*2]) |
1733 | 16.6M | from = data[low*2]; |
1734 | 22.9M | if (to < data[(high - 1)*2 + 1]) |
1735 | 17.3M | to = data[(high - 1)*2 + 1]; |
1736 | 22.9M | } |
1737 | | |
1738 | 295M | if (inc_n != 0) { |
1739 | 282M | int from_pos = SIZE_CODE_POINT * (1 + high * 2); |
1740 | 282M | int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2); |
1741 | | |
1742 | 282M | if (inc_n > 0) { |
1743 | 272M | if (high < n) { |
1744 | 2.80M | int size = (n - high) * 2 * SIZE_CODE_POINT; |
1745 | 2.80M | BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size); |
1746 | 2.80M | } |
1747 | 272M | } |
1748 | 9.72M | else { |
1749 | 9.72M | BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos); |
1750 | 9.72M | } |
1751 | 282M | } |
1752 | | |
1753 | 295M | pos = SIZE_CODE_POINT * (1 + low * 2); |
1754 | 295M | BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2); |
1755 | 295M | BBUF_WRITE_CODE_POINT(bbuf, pos, from); |
1756 | 295M | BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to); |
1757 | 295M | n += inc_n; |
1758 | 295M | BBUF_WRITE_CODE_POINT(bbuf, 0, n); |
1759 | | |
1760 | 295M | return 0; |
1761 | 295M | } |
1762 | | |
1763 | | static int |
1764 | | add_code_range_to_buf(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) |
1765 | 294M | { |
1766 | 294M | return add_code_range_to_buf0(pbuf, env, from, to, 1); |
1767 | 294M | } |
1768 | | |
1769 | | static int |
1770 | | add_code_range0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to, int checkdup) |
1771 | 1.51M | { |
1772 | 1.51M | if (from > to) { |
1773 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) |
1774 | 0 | return 0; |
1775 | 0 | else |
1776 | 0 | return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; |
1777 | 0 | } |
1778 | | |
1779 | 1.51M | return add_code_range_to_buf0(pbuf, env, from, to, checkdup); |
1780 | 1.51M | } |
1781 | | |
1782 | | static int |
1783 | | add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) |
1784 | 176k | { |
1785 | 176k | return add_code_range0(pbuf, env, from, to, 1); |
1786 | 176k | } |
1787 | | |
1788 | | static int |
1789 | | not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf, ScanEnv* env) |
1790 | 2.02k | { |
1791 | 2.02k | int r, i, n; |
1792 | 2.02k | OnigCodePoint pre, from, *data, to = 0; |
1793 | | |
1794 | 2.02k | *pbuf = (BBuf* )NULL; |
1795 | 2.02k | if (IS_NULL(bbuf)) { |
1796 | 0 | set_all: |
1797 | 0 | return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); |
1798 | 0 | } |
1799 | | |
1800 | 2.02k | data = (OnigCodePoint* )(bbuf->p); |
1801 | 2.02k | GET_CODE_POINT(n, data); |
1802 | 2.02k | data++; |
1803 | 2.02k | if (n <= 0) goto set_all; |
1804 | | |
1805 | 2.02k | r = 0; |
1806 | 2.02k | pre = MBCODE_START_POS(enc); |
1807 | 3.79k | for (i = 0; i < n; i++) { |
1808 | 2.84k | from = data[i*2]; |
1809 | 2.84k | to = data[i*2+1]; |
1810 | 2.84k | if (pre <= from - 1) { |
1811 | 1.81k | r = add_code_range_to_buf(pbuf, env, pre, from - 1); |
1812 | 1.81k | if (r != 0) return r; |
1813 | 1.81k | } |
1814 | 2.84k | if (to == ONIG_LAST_CODE_POINT) break; |
1815 | 1.77k | pre = to + 1; |
1816 | 1.77k | } |
1817 | 2.02k | if (to < ONIG_LAST_CODE_POINT) { |
1818 | 951 | r = add_code_range_to_buf(pbuf, env, to + 1, ONIG_LAST_CODE_POINT); |
1819 | 951 | } |
1820 | 2.02k | return r; |
1821 | 2.02k | } |
1822 | | |
1823 | 1.54k | #define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\ |
1824 | 1.54k | BBuf *tbuf; \ |
1825 | 1.54k | int tnot; \ |
1826 | 1.54k | tnot = not1; not1 = not2; not2 = tnot; \ |
1827 | 1.54k | tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \ |
1828 | 1.54k | } while (0) |
1829 | | |
1830 | | static int |
1831 | | or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1, |
1832 | | BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env) |
1833 | 465k | { |
1834 | 465k | int r; |
1835 | 465k | OnigCodePoint i, n1, *data1; |
1836 | 465k | OnigCodePoint from, to; |
1837 | | |
1838 | 465k | *pbuf = (BBuf* )NULL; |
1839 | 465k | if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) { |
1840 | 460k | if (not1 != 0 || not2 != 0) |
1841 | 625 | return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); |
1842 | 460k | return 0; |
1843 | 460k | } |
1844 | | |
1845 | 4.46k | r = 0; |
1846 | 4.46k | if (IS_NULL(bbuf2)) |
1847 | 1.54k | SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); |
1848 | | |
1849 | 4.46k | if (IS_NULL(bbuf1)) { |
1850 | 3.21k | if (not1 != 0) { |
1851 | 61 | return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); |
1852 | 61 | } |
1853 | 3.15k | else { |
1854 | 3.15k | if (not2 == 0) { |
1855 | 1.81k | return bbuf_clone(pbuf, bbuf2); |
1856 | 1.81k | } |
1857 | 1.34k | else { |
1858 | 1.34k | return not_code_range_buf(enc, bbuf2, pbuf, env); |
1859 | 1.34k | } |
1860 | 3.15k | } |
1861 | 3.21k | } |
1862 | | |
1863 | 1.25k | if (not1 != 0) |
1864 | 0 | SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); |
1865 | | |
1866 | 1.25k | data1 = (OnigCodePoint* )(bbuf1->p); |
1867 | 1.25k | GET_CODE_POINT(n1, data1); |
1868 | 1.25k | data1++; |
1869 | | |
1870 | 1.25k | if (not2 == 0 && not1 == 0) { /* 1 OR 2 */ |
1871 | 567 | r = bbuf_clone(pbuf, bbuf2); |
1872 | 567 | } |
1873 | 686 | else if (not1 == 0) { /* 1 OR (not 2) */ |
1874 | 686 | r = not_code_range_buf(enc, bbuf2, pbuf, env); |
1875 | 686 | } |
1876 | 1.25k | if (r != 0) return r; |
1877 | | |
1878 | 3.65k | for (i = 0; i < n1; i++) { |
1879 | 2.40k | from = data1[i*2]; |
1880 | 2.40k | to = data1[i*2+1]; |
1881 | 2.40k | r = add_code_range_to_buf(pbuf, env, from, to); |
1882 | 2.40k | if (r != 0) return r; |
1883 | 2.40k | } |
1884 | 1.25k | return 0; |
1885 | 1.25k | } |
1886 | | |
1887 | | static int |
1888 | | and_code_range1(BBuf** pbuf, ScanEnv* env, OnigCodePoint from1, OnigCodePoint to1, |
1889 | | OnigCodePoint* data, int n) |
1890 | 0 | { |
1891 | 0 | int i, r; |
1892 | 0 | OnigCodePoint from2, to2; |
1893 | |
|
1894 | 0 | for (i = 0; i < n; i++) { |
1895 | 0 | from2 = data[i*2]; |
1896 | 0 | to2 = data[i*2+1]; |
1897 | 0 | if (from2 < from1) { |
1898 | 0 | if (to2 < from1) continue; |
1899 | 0 | else { |
1900 | 0 | from1 = to2 + 1; |
1901 | 0 | } |
1902 | 0 | } |
1903 | 0 | else if (from2 <= to1) { |
1904 | 0 | if (to2 < to1) { |
1905 | 0 | if (from1 <= from2 - 1) { |
1906 | 0 | r = add_code_range_to_buf(pbuf, env, from1, from2-1); |
1907 | 0 | if (r != 0) return r; |
1908 | 0 | } |
1909 | 0 | from1 = to2 + 1; |
1910 | 0 | } |
1911 | 0 | else { |
1912 | 0 | to1 = from2 - 1; |
1913 | 0 | } |
1914 | 0 | } |
1915 | 0 | else { |
1916 | 0 | from1 = from2; |
1917 | 0 | } |
1918 | 0 | if (from1 > to1) break; |
1919 | 0 | } |
1920 | 0 | if (from1 <= to1) { |
1921 | 0 | r = add_code_range_to_buf(pbuf, env, from1, to1); |
1922 | 0 | if (r != 0) return r; |
1923 | 0 | } |
1924 | 0 | return 0; |
1925 | 0 | } |
1926 | | |
1927 | | static int |
1928 | | and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env) |
1929 | 460k | { |
1930 | 460k | int r; |
1931 | 460k | OnigCodePoint i, j, n1, n2, *data1, *data2; |
1932 | 460k | OnigCodePoint from, to, from1, to1, from2, to2; |
1933 | | |
1934 | 460k | *pbuf = (BBuf* )NULL; |
1935 | 460k | if (IS_NULL(bbuf1)) { |
1936 | 1.43k | if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */ |
1937 | 0 | return bbuf_clone(pbuf, bbuf2); |
1938 | 1.43k | return 0; |
1939 | 1.43k | } |
1940 | 458k | else if (IS_NULL(bbuf2)) { |
1941 | 458k | if (not2 != 0) |
1942 | 0 | return bbuf_clone(pbuf, bbuf1); |
1943 | 458k | return 0; |
1944 | 458k | } |
1945 | | |
1946 | 0 | if (not1 != 0) |
1947 | 0 | SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); |
1948 | |
|
1949 | 0 | data1 = (OnigCodePoint* )(bbuf1->p); |
1950 | 0 | data2 = (OnigCodePoint* )(bbuf2->p); |
1951 | 0 | GET_CODE_POINT(n1, data1); |
1952 | 0 | GET_CODE_POINT(n2, data2); |
1953 | 0 | data1++; |
1954 | 0 | data2++; |
1955 | |
|
1956 | 0 | if (not2 == 0 && not1 == 0) { /* 1 AND 2 */ |
1957 | 0 | for (i = 0; i < n1; i++) { |
1958 | 0 | from1 = data1[i*2]; |
1959 | 0 | to1 = data1[i*2+1]; |
1960 | 0 | for (j = 0; j < n2; j++) { |
1961 | 0 | from2 = data2[j*2]; |
1962 | 0 | to2 = data2[j*2+1]; |
1963 | 0 | if (from2 > to1) break; |
1964 | 0 | if (to2 < from1) continue; |
1965 | 0 | from = MAX(from1, from2); |
1966 | 0 | to = MIN(to1, to2); |
1967 | 0 | r = add_code_range_to_buf(pbuf, env, from, to); |
1968 | 0 | if (r != 0) return r; |
1969 | 0 | } |
1970 | 0 | } |
1971 | 0 | } |
1972 | 0 | else if (not1 == 0) { /* 1 AND (not 2) */ |
1973 | 0 | for (i = 0; i < n1; i++) { |
1974 | 0 | from1 = data1[i*2]; |
1975 | 0 | to1 = data1[i*2+1]; |
1976 | 0 | r = and_code_range1(pbuf, env, from1, to1, data2, n2); |
1977 | 0 | if (r != 0) return r; |
1978 | 0 | } |
1979 | 0 | } |
1980 | | |
1981 | 0 | return 0; |
1982 | 0 | } |
1983 | | |
1984 | | static int |
1985 | | and_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env) |
1986 | 460k | { |
1987 | 460k | OnigEncoding enc = env->enc; |
1988 | 460k | int r, not1, not2; |
1989 | 460k | BBuf *buf1, *buf2, *pbuf = 0; |
1990 | 460k | BitSetRef bsr1, bsr2; |
1991 | 460k | BitSet bs1, bs2; |
1992 | | |
1993 | 460k | not1 = IS_NCCLASS_NOT(dest); |
1994 | 460k | bsr1 = dest->bs; |
1995 | 460k | buf1 = dest->mbuf; |
1996 | 460k | not2 = IS_NCCLASS_NOT(cc); |
1997 | 460k | bsr2 = cc->bs; |
1998 | 460k | buf2 = cc->mbuf; |
1999 | | |
2000 | 460k | if (not1 != 0) { |
2001 | 0 | bitset_invert_to(bsr1, bs1); |
2002 | 0 | bsr1 = bs1; |
2003 | 0 | } |
2004 | 460k | if (not2 != 0) { |
2005 | 0 | bitset_invert_to(bsr2, bs2); |
2006 | 0 | bsr2 = bs2; |
2007 | 0 | } |
2008 | 460k | bitset_and(bsr1, bsr2); |
2009 | 460k | if (bsr1 != dest->bs) { |
2010 | 0 | bitset_copy(dest->bs, bsr1); |
2011 | 0 | bsr1 = dest->bs; |
2012 | 0 | } |
2013 | 460k | if (not1 != 0) { |
2014 | 0 | bitset_invert(dest->bs); |
2015 | 0 | } |
2016 | | |
2017 | 460k | if (! ONIGENC_IS_SINGLEBYTE(enc)) { |
2018 | 460k | if (not1 != 0 && not2 != 0) { |
2019 | 0 | r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf, env); |
2020 | 0 | } |
2021 | 460k | else { |
2022 | 460k | r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf, env); |
2023 | 460k | if (r == 0 && not1 != 0) { |
2024 | 0 | BBuf *tbuf = 0; |
2025 | 0 | r = not_code_range_buf(enc, pbuf, &tbuf, env); |
2026 | 0 | bbuf_free(pbuf); |
2027 | 0 | pbuf = tbuf; |
2028 | 0 | } |
2029 | 460k | } |
2030 | 460k | if (r != 0) { |
2031 | 0 | bbuf_free(pbuf); |
2032 | 0 | return r; |
2033 | 0 | } |
2034 | | |
2035 | 460k | dest->mbuf = pbuf; |
2036 | 460k | bbuf_free(buf1); |
2037 | 460k | return r; |
2038 | 460k | } |
2039 | 0 | return 0; |
2040 | 460k | } |
2041 | | |
2042 | | static int |
2043 | | or_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env) |
2044 | 465k | { |
2045 | 465k | OnigEncoding enc = env->enc; |
2046 | 465k | int r, not1, not2; |
2047 | 465k | BBuf *buf1, *buf2, *pbuf = 0; |
2048 | 465k | BitSetRef bsr1, bsr2; |
2049 | 465k | BitSet bs1, bs2; |
2050 | | |
2051 | 465k | not1 = IS_NCCLASS_NOT(dest); |
2052 | 465k | bsr1 = dest->bs; |
2053 | 465k | buf1 = dest->mbuf; |
2054 | 465k | not2 = IS_NCCLASS_NOT(cc); |
2055 | 465k | bsr2 = cc->bs; |
2056 | 465k | buf2 = cc->mbuf; |
2057 | | |
2058 | 465k | if (not1 != 0) { |
2059 | 0 | bitset_invert_to(bsr1, bs1); |
2060 | 0 | bsr1 = bs1; |
2061 | 0 | } |
2062 | 465k | if (not2 != 0) { |
2063 | 2.71k | bitset_invert_to(bsr2, bs2); |
2064 | 2.71k | bsr2 = bs2; |
2065 | 2.71k | } |
2066 | 465k | bitset_or(bsr1, bsr2); |
2067 | 465k | if (bsr1 != dest->bs) { |
2068 | 0 | bitset_copy(dest->bs, bsr1); |
2069 | 0 | bsr1 = dest->bs; |
2070 | 0 | } |
2071 | 465k | if (not1 != 0) { |
2072 | 0 | bitset_invert(dest->bs); |
2073 | 0 | } |
2074 | | |
2075 | 465k | if (! ONIGENC_IS_SINGLEBYTE(enc)) { |
2076 | 465k | if (not1 != 0 && not2 != 0) { |
2077 | 0 | r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf, env); |
2078 | 0 | } |
2079 | 465k | else { |
2080 | 465k | r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf, env); |
2081 | 465k | if (r == 0 && not1 != 0) { |
2082 | 0 | BBuf *tbuf = 0; |
2083 | 0 | r = not_code_range_buf(enc, pbuf, &tbuf, env); |
2084 | 0 | bbuf_free(pbuf); |
2085 | 0 | pbuf = tbuf; |
2086 | 0 | } |
2087 | 465k | } |
2088 | 465k | if (r != 0) { |
2089 | 0 | bbuf_free(pbuf); |
2090 | 0 | return r; |
2091 | 0 | } |
2092 | | |
2093 | 465k | dest->mbuf = pbuf; |
2094 | 465k | bbuf_free(buf1); |
2095 | 465k | return r; |
2096 | 465k | } |
2097 | 0 | else |
2098 | 0 | return 0; |
2099 | 465k | } |
2100 | | |
2101 | | static void UNKNOWN_ESC_WARN(ScanEnv *env, int c); |
2102 | | |
2103 | | static OnigCodePoint |
2104 | | conv_backslash_value(OnigCodePoint c, ScanEnv* env) |
2105 | 600k | { |
2106 | 600k | if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) { |
2107 | 600k | switch (c) { |
2108 | 62.5k | case 'n': return '\n'; |
2109 | 247k | case 't': return '\t'; |
2110 | 62.3k | case 'r': return '\r'; |
2111 | 389 | case 'f': return '\f'; |
2112 | 310 | case 'a': return '\007'; |
2113 | 0 | case 'b': return '\010'; |
2114 | 138 | case 'e': return '\033'; |
2115 | 255 | case 'v': |
2116 | 255 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB)) |
2117 | 255 | return '\v'; |
2118 | 0 | break; |
2119 | | |
2120 | 226k | default: |
2121 | 226k | if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) |
2122 | 4.67k | UNKNOWN_ESC_WARN(env, c); |
2123 | 226k | break; |
2124 | 600k | } |
2125 | 600k | } |
2126 | 226k | return c; |
2127 | 600k | } |
2128 | | |
2129 | | #ifdef USE_NO_INVALID_QUANTIFIER |
2130 | 1.27M | # define is_invalid_quantifier_target(node) 0 |
2131 | | #else |
2132 | | static int |
2133 | | is_invalid_quantifier_target(Node* node) |
2134 | | { |
2135 | | switch (NTYPE(node)) { |
2136 | | case NT_ANCHOR: |
2137 | | return 1; |
2138 | | break; |
2139 | | |
2140 | | case NT_ENCLOSE: |
2141 | | /* allow enclosed elements */ |
2142 | | /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */ |
2143 | | break; |
2144 | | |
2145 | | case NT_LIST: |
2146 | | do { |
2147 | | if (! is_invalid_quantifier_target(NCAR(node))) return 0; |
2148 | | } while (IS_NOT_NULL(node = NCDR(node))); |
2149 | | return 0; |
2150 | | break; |
2151 | | |
2152 | | case NT_ALT: |
2153 | | do { |
2154 | | if (is_invalid_quantifier_target(NCAR(node))) return 1; |
2155 | | } while (IS_NOT_NULL(node = NCDR(node))); |
2156 | | break; |
2157 | | |
2158 | | default: |
2159 | | break; |
2160 | | } |
2161 | | return 0; |
2162 | | } |
2163 | | #endif |
2164 | | |
2165 | | /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */ |
2166 | | static int |
2167 | | popular_quantifier_num(QtfrNode* q) |
2168 | 101k | { |
2169 | 101k | if (q->greedy) { |
2170 | 74.6k | if (q->lower == 0) { |
2171 | 34.6k | if (q->upper == 1) return 0; |
2172 | 31.4k | else if (IS_REPEAT_INFINITE(q->upper)) return 1; |
2173 | 34.6k | } |
2174 | 39.9k | else if (q->lower == 1) { |
2175 | 11.8k | if (IS_REPEAT_INFINITE(q->upper)) return 2; |
2176 | 11.8k | } |
2177 | 74.6k | } |
2178 | 26.8k | else { |
2179 | 26.8k | if (q->lower == 0) { |
2180 | 20.2k | if (q->upper == 1) return 3; |
2181 | 9.39k | else if (IS_REPEAT_INFINITE(q->upper)) return 4; |
2182 | 20.2k | } |
2183 | 6.61k | else if (q->lower == 1) { |
2184 | 5.13k | if (IS_REPEAT_INFINITE(q->upper)) return 5; |
2185 | 5.13k | } |
2186 | 26.8k | } |
2187 | 32.1k | return -1; |
2188 | 101k | } |
2189 | | |
2190 | | |
/*
 * Actions onig_reduce_nested_quantifier() can take for a quantifier
 * (parent) applied directly to another quantifier (child):
 *   RQ_ASIS  parent keeps its bounds; the child becomes the parent's target
 *   RQ_DEL   the parent node is overwritten with the child, the child node
 *            is then freed
 *   RQ_A     parent becomes greedy {0,inf} over the child's target
 *   RQ_AQ    parent becomes non-greedy {0,inf} over the child's target
 *   RQ_QQ    parent becomes non-greedy {0,1} over the child's target
 *   RQ_P_QQ  child becomes greedy {1,inf}, parent non-greedy {0,1} over it
 *   RQ_PQ_Q  child becomes non-greedy {1,inf}, parent greedy {0,1} over it
 */
enum ReduceType {
  RQ_ASIS = 0, /* as is */
  RQ_DEL  = 1, /* delete parent */
  RQ_A,        /* to '*'    */
  RQ_AQ,       /* to '*?'   */
  RQ_QQ,       /* to '??'   */
  RQ_P_QQ,     /* to '+)??' */
  RQ_PQ_Q      /* to '+?)?' */
};
2200 | | |
/*
 * Reduction to apply to a pair of nested quantifiers.  Both indices are
 * values returned by popular_quantifier_num() (0..5); the table is read as
 * ReduceTypeTable[child][parent] by onig_reduce_nested_quantifier(), i.e.
 * each row belongs to a child form and each column to a parent form.
 */
static enum ReduceType const ReduceTypeTable[6][6] = {
/* '?',     '*',     '+',    '??',    '*?',    '+?'      p / c   */
  {RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS}, /* '?'  */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL},  /* '*'  */
  {RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},  /* '+'  */
  {RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ},   /* '??' */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL},  /* '*?' */
  {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */
};
2210 | | |
2211 | | extern void |
2212 | | onig_reduce_nested_quantifier(Node* pnode, Node* cnode) |
2213 | 17.1k | { |
2214 | 17.1k | int pnum, cnum; |
2215 | 17.1k | QtfrNode *p, *c; |
2216 | | |
2217 | 17.1k | p = NQTFR(pnode); |
2218 | 17.1k | c = NQTFR(cnode); |
2219 | 17.1k | pnum = popular_quantifier_num(p); |
2220 | 17.1k | cnum = popular_quantifier_num(c); |
2221 | 17.1k | if (pnum < 0 || cnum < 0) return ; |
2222 | | |
2223 | 15.9k | switch (ReduceTypeTable[cnum][pnum]) { |
2224 | 12.0k | case RQ_DEL: |
2225 | 12.0k | *pnode = *cnode; |
2226 | 12.0k | break; |
2227 | 2.77k | case RQ_A: |
2228 | 2.77k | p->target = c->target; |
2229 | 2.77k | p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1; |
2230 | 2.77k | break; |
2231 | 455 | case RQ_AQ: |
2232 | 455 | p->target = c->target; |
2233 | 455 | p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0; |
2234 | 455 | break; |
2235 | 131 | case RQ_QQ: |
2236 | 131 | p->target = c->target; |
2237 | 131 | p->lower = 0; p->upper = 1; p->greedy = 0; |
2238 | 131 | break; |
2239 | 26 | case RQ_P_QQ: |
2240 | 26 | p->target = cnode; |
2241 | 26 | p->lower = 0; p->upper = 1; p->greedy = 0; |
2242 | 26 | c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1; |
2243 | 26 | return ; |
2244 | 0 | break; |
2245 | 435 | case RQ_PQ_Q: |
2246 | 435 | p->target = cnode; |
2247 | 435 | p->lower = 0; p->upper = 1; p->greedy = 1; |
2248 | 435 | c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0; |
2249 | 435 | return ; |
2250 | 0 | break; |
2251 | 110 | case RQ_ASIS: |
2252 | 110 | p->target = cnode; |
2253 | 110 | return ; |
2254 | 0 | break; |
2255 | 15.9k | } |
2256 | | |
2257 | 15.4k | c->target = NULL_NODE; |
2258 | 15.4k | onig_node_free(cnode); |
2259 | 15.4k | } |
2260 | | |
2261 | | |
/*
 * Kinds of token; this is the type of the `type` field of OnigToken.
 * The entries after the "in cc" marker are, per that marker, the ones
 * specific to character classes.
 */
enum TokenSyms {
  TK_EOT      = 0,   /* end of token */
  TK_RAW_BYTE = 1,
  TK_CHAR,
  TK_STRING,
  TK_CODE_POINT,
  TK_ANYCHAR,
  TK_CHAR_TYPE,
  TK_BACKREF,
  TK_CALL,
  TK_ANCHOR,
  TK_OP_REPEAT,
  TK_INTERVAL,
  TK_ANYCHAR_ANYTIME,  /* SQL '%' == .* */
  TK_ALT,
  TK_SUBEXP_OPEN,
  TK_SUBEXP_CLOSE,
  TK_CC_OPEN,
  TK_QUOTE_OPEN,
  TK_CHAR_PROPERTY,    /* \p{...}, \P{...} */
  TK_LINEBREAK,
  TK_EXTENDED_GRAPHEME_CLUSTER,
  TK_KEEP,
  /* in cc */
  TK_CC_CLOSE,
  TK_CC_RANGE,
  TK_POSIX_BRACKET_OPEN,
  TK_CC_AND,             /* && */
  TK_CC_CC_OPEN          /* [ */
};
2292 | | |
/*
 * One token of a pattern.  `type` says what kind of token it is; the
 * union `u` carries the kind-specific payload (for example
 * fetch_range_quantifier() sets type to TK_INTERVAL and fills
 * u.repeat.lower / u.repeat.upper).
 * NOTE(review): which union member belongs to which token kind is decided
 * by the tokenizer functions -- confirm there before relying on a member.
 */
typedef struct {
  enum TokenSyms type;
  int escaped;
  int base;   /* is number: 8, 16 (used in [....]) */
  UChar* backp;
  union {
    UChar* s;
    int   c;
    OnigCodePoint code;
    struct {
      int subtype;
      int ascii_range;
    } anchor;
    struct {
      int lower;
      int upper;
      int greedy;
      int possessive;
    } repeat;
    struct {
      int  num;
      int  ref1;
      int* refs;
      int  by_name;
#ifdef USE_BACKREF_WITH_LEVEL
      int  exist_level;
      int  level;   /* \k<name+n> */
#endif
    } backref;
    struct {
      UChar* name;
      UChar* name_end;
      int    gnum;
      int    rel;
    } call;
    struct {
      int ctype;
      int not;
    } prop;
  } u;
} OnigToken;
2334 | | |
2335 | | |
/*
 * Try to read the body of an interval quantifier at *src: "n}", "n,}",
 * "n,m}" and -- when the syntax has ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV --
 * ",m}".  Presumably the caller has already consumed the opening '{'.
 * With ONIG_SYN_OP_ESC_BRACE_INTERVAL the closing brace must be preceded by
 * the syntax's escape character.
 *
 * On success tok->type is set to TK_INTERVAL, the bounds are stored in
 * tok->u.repeat.lower / .upper (upper is REPEAT_INFINITE for "n,}") and
 * *src is advanced past the closing brace.  In every other case *src and
 * tok are left untouched.
 *
 * Returns
 *   0  normal interval ({n,m}, {n,}, {,m})
 *   2  fixed interval ({n})
 *   1  the text is not an interval, but the syntax tolerates that
 *      (ONIG_SYN_ALLOW_INVALID_INTERVAL)
 *   an ONIGERR_* code otherwise.  Note that a bound that is too large, or
 *   an upper bound below the lower one, is an error even when invalid
 *   intervals are tolerated.
 */
static int
fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
{
  int low, up, syn_allow, non_low = 0;
  int r = 0;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar* p = *src;
  PFETCH_READY;

  syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);

  if (PEND) {
    if (syn_allow)
      return 1;  /* "....{" : OK! */
    else
      return ONIGERR_END_PATTERN_AT_LEFT_BRACE;  /* "....{" syntax error */
  }

  if (! syn_allow) {
    /* strict syntax: a brace directly followed by ')', '(' or '|' is an error */
    c = PPEEK;
    if (c == ')' || c == '(' || c == '|') {
      return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
    }
  }

  low = onig_scan_unsigned_number(&p, end, env->enc);
  if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
  if (low > ONIG_MAX_REPEAT_NUM)
    return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;

  if (p == *src) {  /* can't read low */
    if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
      /* allow {,n} as {0,n} */
      low = 0;
      non_low = 1;  /* remember that no lower bound was written */
    }
    else
      goto invalid;
  }

  if (PEND) goto invalid;
  PFETCH(c);
  if (c == ',') {
    UChar* prev = p;
    up = onig_scan_unsigned_number(&p, end, env->enc);
    if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
    if (up > ONIG_MAX_REPEAT_NUM)
      return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;

    if (p == prev) {
      /* no upper bound either: "{,}" is not an interval */
      if (non_low != 0)
        goto invalid;
      up = REPEAT_INFINITE;  /* {n,} : {n,infinite} */
    }
  }
  else {
    /* no comma, so a lower bound is mandatory */
    if (non_low != 0)
      goto invalid;

    /* put the character back; it is re-read below as the closing brace */
    PUNFETCH;
    up = low;  /* {n} : exact n times */
    r = 2;     /* fixed */
  }

  if (PEND) goto invalid;
  PFETCH(c);
  if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
    /* this syntax closes the interval with <escape>} instead of a bare } */
    if (c != MC_ESC(env->syntax)) goto invalid;
    if (PEND) goto invalid;
    PFETCH(c);
  }
  if (c != '}') goto invalid;

  if (!IS_REPEAT_INFINITE(up) && low > up) {
    return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
  }

  tok->type = TK_INTERVAL;
  tok->u.repeat.lower = low;
  tok->u.repeat.upper = up;
  *src = p;
  return r; /* 0: normal {n,m}, 2: fixed {n} */

 invalid:
  if (syn_allow)
    return 1;  /* OK */
  else
    return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
}
2426 | | |
/* \M-, \C-, \c, or \... */
/*
 * Decode the escape sequence whose body starts at *src (presumably the
 * escape character itself has already been consumed by the caller).
 *
 *   \M-x   (needs ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)     (x & 0xff) | 0x80
 *   \C-x   (needs ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)  same as \cx
 *   \cx    (needs ONIG_SYN_OP_ESC_C_CONTROL)  0177 for '?', else x & 0x9f
 *   other  conv_backslash_value()
 * In the meta/control forms x may itself be an escape sequence, which is
 * decoded recursively.  When the required syntax flag is missing, the
 * letter is handled like any other character.
 *
 * On success stores the value in *val, advances *src and returns 0.
 * On failure returns an ONIGERR_* code and leaves *src and *val untouched.
 */
static int
fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
{
  int v;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar* p = *src;

  if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;

  PFETCH_S(c);
  switch (c) {
  case 'M':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_META_CODE_SYNTAX;
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c == MC_ESC(env->syntax)) {
        /* nested escape, e.g. \M-\C-x */
        v = fetch_escaped_value(&p, end, env, &c);
        if (v < 0) return v;
      }
      c = ((c & 0xff) | 0x80);
    }
    else
      goto backslash;
    break;

  case 'C':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
      /* after "C-" the rest is parsed exactly like \c */
      goto control;
    }
    else
      goto backslash;

  case 'c':
    if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
    control:
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c == '?') {
        c = 0177;
      }
      else {
        if (c == MC_ESC(env->syntax)) {
          /* nested escape, e.g. \C-\M-x */
          v = fetch_escaped_value(&p, end, env, &c);
          if (v < 0) return v;
        }
        c &= 0x9f;
      }
      break;
    }
    /* fall through */

  default:
    {
    backslash:
      c = conv_backslash_value(c, env);
    }
    break;
  }

  /* commit the consumed input only once the whole sequence was decoded */
  *src = p;
  *val = c;
  return 0;
}
2498 | | |
2499 | | static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env); |
2500 | | |
2501 | | static OnigCodePoint |
2502 | | get_name_end_code_point(OnigCodePoint start) |
2503 | 255k | { |
2504 | 255k | switch (start) { |
2505 | 237k | case '<': return (OnigCodePoint )'>'; break; |
2506 | 17.6k | case '\'': return (OnigCodePoint )'\''; break; |
2507 | 32 | case '(': return (OnigCodePoint )')'; break; |
2508 | 0 | case '{': return (OnigCodePoint )'}'; break; |
2509 | 0 | default: |
2510 | 0 | break; |
2511 | 255k | } |
2512 | | |
2513 | 0 | return (OnigCodePoint )0; |
2514 | 255k | } |
2515 | | |
2516 | | #ifdef USE_NAMED_GROUP |
2517 | | # ifdef RUBY |
2518 | | # define ONIGENC_IS_CODE_NAME(enc, c) TRUE |
2519 | | # else |
2520 | 1.08M | # define ONIGENC_IS_CODE_NAME(enc, c) ONIGENC_IS_CODE_WORD(enc, c) |
2521 | | # endif |
2522 | | |
2523 | | # ifdef USE_BACKREF_WITH_LEVEL |
/*
   \k<name+n>, \k<name-n>
   \k<num+n>,  \k<num-n>
   \k<-num+n>, \k<-num-n>
*/
/*
 * Read a group reference that may carry a nest-level suffix, starting at
 * *src and ending at the closer that matches start_code (see
 * get_name_end_code_point()).
 *
 * On success:
 *   *rname_end  points just past the name / number part
 *   *rback_num  the signed group number for a numeric reference, 0 for a
 *               name
 *   *rlevel     the signed level; written only when a "+n" / "-n" suffix
 *               is present
 *   *src        advanced past the closing character
 * and the function returns 1 when a level suffix was present, 0 otherwise.
 *
 * On failure an ONIGERR_* code is returned and *src is not advanced; except
 * for the early "empty name" / "number too big" returns, the offending text
 * is also recorded through onig_scan_env_set_error_string().
 */
static int
fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
                      UChar** rname_end, ScanEnv* env,
                      int* rback_num, int* rlevel)
{
  int r, sign, is_num, exist_level;
  OnigCodePoint end_code;
  OnigCodePoint c = 0;
  OnigEncoding enc = env->enc;
  UChar *name_end;
  UChar *pnum_head;
  UChar *p = *src;
  PFETCH_READY;

  *rback_num = 0;
  /* is_num: 0 = name, 1 = number, 2 = only a leading '-' seen so far */
  is_num = exist_level = 0;
  sign = 1;
  pnum_head = *src;

  end_code = get_name_end_code_point(start_code);

  name_end = end;
  r = 0;
  if (PEND) {
    return ONIGERR_EMPTY_GROUP_NAME;
  }
  else {
    PFETCH(c);
    if (c == end_code)
      return ONIGERR_EMPTY_GROUP_NAME;

    if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
      is_num = 1;
    }
    else if (c == '-') {
      is_num = 2;
      sign = -1;
      pnum_head = p;  /* the digits start after the '-' */
    }
    else if (!ONIGENC_IS_CODE_NAME(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  /* scan the rest of the name / number up to a terminator or level sign */
  while (!PEND) {
    name_end = p;
    PFETCH(c);
    if (c == end_code || c == ')' || c == '+' || c == '-') {
      /* a lone '-' without digits is not a valid number */
      if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
      break;
    }

    if (is_num != 0) {
      if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
        is_num = 1;
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
        is_num = 0;
      }
    }
    else if (!ONIGENC_IS_CODE_NAME(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  if (r == 0 && c != end_code) {
    if (c == '+' || c == '-') {
      /* level suffix: sign, digits, then the closing character */
      int level;
      int flag = (c == '-' ? -1 : 1);

      if (PEND) {
        r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
        goto end;
      }
      PFETCH(c);
      if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
      PUNFETCH;
      level = onig_scan_unsigned_number(&p, end, enc);
      if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
      *rlevel = (level * flag);
      exist_level = 1;

      if (!PEND) {
        PFETCH(c);
        if (c == end_code)
          goto end;
      }
    }

    /* also reached by "goto err" from the number check after label end */
  err:
    r = ONIGERR_INVALID_GROUP_NAME;
    name_end = end;
  }

 end:
  if (r == 0) {
    if (is_num != 0) {
      *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
      if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
      else if (*rback_num == 0) goto err;  /* group number 0 is rejected */

      *rback_num *= sign;
    }

    *rname_end = name_end;
    *src = p;
    return (exist_level ? 1 : 0);
  }
  else {
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }
}
2643 | | # endif /* USE_BACKREF_WITH_LEVEL */ |
2644 | | |
/*
  ref: 0 -> define name    (don't allow number name)
       1 -> reference name (allow number name)
*/
/*
 * Read a group name starting at *src and ending at the closer that matches
 * start_code (see get_name_end_code_point()).
 *
 * On success returns 0 with
 *   *rname_end  pointing just past the name
 *   *rback_num  the signed group number when the name is numeric (only
 *               possible with ref == 1), 0 otherwise
 *   *src        advanced past the closing character.
 *
 * On failure returns an ONIGERR_* code without advancing *src; except for
 * the early "empty name" / "number too big" returns, the offending text is
 * also recorded through onig_scan_env_set_error_string().
 */
static int
fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
           UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
{
  int r, is_num, sign;
  OnigCodePoint end_code;
  OnigCodePoint c = 0;
  OnigEncoding enc = env->enc;
  UChar *name_end;
  UChar *pnum_head;
  UChar *p = *src;

  *rback_num = 0;

  end_code = get_name_end_code_point(start_code);

  name_end = end;
  pnum_head = *src;
  r = 0;
  /* is_num: 0 = name, 1 = number, 2 = only a leading '-' seen so far */
  is_num = 0;
  sign = 1;
  if (PEND) {
    return ONIGERR_EMPTY_GROUP_NAME;
  }
  else {
    PFETCH_S(c);
    if (c == end_code)
      return ONIGERR_EMPTY_GROUP_NAME;

    if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
      if (ref == 1)
        is_num = 1;
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
        is_num = 0;
      }
    }
    else if (c == '-') {
      if (ref == 1) {
        is_num = 2;
        sign = -1;
        pnum_head = p;  /* the digits start after the '-' */
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
        is_num = 0;
      }
    }
    else if (!ONIGENC_IS_CODE_NAME(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  if (r == 0) {
    while (!PEND) {
      name_end = p;
      PFETCH_S(c);
      if (c == end_code || c == ')') {
        /* a lone '-' without digits is not a valid number */
        if (is_num == 2) {
          r = ONIGERR_INVALID_GROUP_NAME;
          goto teardown;
        }
        break;
      }

      if (is_num != 0) {
        if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
          is_num = 1;
        }
        else {
          if (!ONIGENC_IS_CODE_WORD(enc, c))
            r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
          else
            r = ONIGERR_INVALID_GROUP_NAME;
          goto teardown;
        }
      }
      else {
        if (!ONIGENC_IS_CODE_NAME(enc, c)) {
          r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
          goto teardown;
        }
      }
    }

    /* input ran out, or ')' was hit, before the expected closer */
    if (c != end_code) {
      r = ONIGERR_INVALID_GROUP_NAME;
      name_end = end;
      goto err;
    }

    if (is_num != 0) {
      *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
      if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
      else if (*rback_num == 0) {
        /* group number 0 is rejected */
        r = ONIGERR_INVALID_GROUP_NAME;
        goto err;
      }

      *rback_num *= sign;
    }

    *rname_end = name_end;
    *src = p;
    return 0;
  }
  else {
  teardown:
    /* skip ahead to the closer (or ')') so the error string covers the
       whole name */
    while (!PEND) {
      name_end = p;
      PFETCH_S(c);
      if (c == end_code || c == ')')
        break;
    }
    if (PEND)
      name_end = end;

  err:
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }
}
2771 | | #else |
/*
 * Variant used when USE_NAMED_GROUP is not defined: only numeric group
 * references (optionally with a leading '-') are accepted.  The ref
 * parameter is not used by this variant.
 *
 * On success returns 0 with *rback_num set to the signed group number,
 * *rname_end pointing just past the number and *src advanced past the
 * closing character.  On failure returns an ONIGERR_* code without
 * advancing *src; except for the early "empty name" / "number too big"
 * returns, the offending text is also recorded through
 * onig_scan_env_set_error_string().
 */
static int
fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
           UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
{
  int r, is_num, sign;
  OnigCodePoint end_code;
  OnigCodePoint c = 0;
  UChar *name_end;
  OnigEncoding enc = env->enc;
  UChar *pnum_head;
  UChar *p = *src;
  PFETCH_READY;

  *rback_num = 0;

  end_code = get_name_end_code_point(start_code);

  *rname_end = name_end = end;
  r = 0;
  pnum_head = *src;
  is_num = 0;
  sign = 1;

  if (PEND) {
    return ONIGERR_EMPTY_GROUP_NAME;
  }
  else {
    PFETCH(c);
    if (c == end_code)
      return ONIGERR_EMPTY_GROUP_NAME;

    if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
      is_num = 1;
    }
    else if (c == '-') {
      is_num = 2;
      sign = -1;
      pnum_head = p;  /* the digits start after the '-' */
    }
    else {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  /* keep scanning to the closer even after an error so that name_end
     covers the whole name */
  while (!PEND) {
    name_end = p;

    PFETCH(c);
    if (c == end_code || c == ')') break;
    if (! ONIGENC_IS_CODE_DIGIT(enc, c))
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
  }
  if (r == 0 && c != end_code) {
    r = ONIGERR_INVALID_GROUP_NAME;
    name_end = end;
  }

  if (r == 0) {
    *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
    if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
    else if (*rback_num == 0) {
      /* group number 0 is rejected */
      r = ONIGERR_INVALID_GROUP_NAME;
      goto err;
    }
    *rback_num *= sign;

    *rname_end = name_end;
    *src = p;
    return 0;
  }
  else {
  err:
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }
}
2848 | | #endif /* USE_NAMED_GROUP */ |
2849 | | |
2850 | | |
2851 | | static void |
2852 | | onig_syntax_warn(ScanEnv *env, const char *fmt, ...) |
2853 | 5.70k | { |
2854 | 5.70k | va_list args; |
2855 | 5.70k | UChar buf[WARN_BUFSIZE]; |
2856 | 5.70k | va_start(args, fmt); |
2857 | 5.70k | onig_vsnprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, |
2858 | 5.70k | env->pattern, env->pattern_end, |
2859 | 5.70k | (const UChar *)fmt, args); |
2860 | 5.70k | va_end(args); |
2861 | | #ifdef RUBY |
2862 | | if (env->sourcefile == NULL) |
2863 | | rb_warn("%s", (char *)buf); |
2864 | | else |
2865 | | rb_compile_warn(env->sourcefile, env->sourceline, "%s", (char *)buf); |
2866 | | #else |
2867 | 5.70k | (*onig_warn)((char* )buf); |
2868 | 5.70k | #endif |
2869 | 5.70k | } |
2870 | | |
2871 | | static void |
2872 | | CC_ESC_WARN(ScanEnv *env, UChar *c) |
2873 | 5.17k | { |
2874 | 5.17k | if (onig_warn == onig_null_warn) return ; |
2875 | | |
2876 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) && |
2877 | 0 | IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) { |
2878 | 0 | onig_syntax_warn(env, "character class has '%s' without escape", c); |
2879 | 0 | } |
2880 | 0 | } |
2881 | | |
2882 | | static void |
2883 | | CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c) |
2884 | 9.75k | { |
2885 | 9.75k | if (onig_warn == onig_null_warn) return ; |
2886 | | |
2887 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) { |
2888 | 0 | onig_syntax_warn(env, "regular expression has '%s' without escape", c); |
2889 | 0 | } |
2890 | 0 | } |
2891 | | |
/*
 * Fallback for builds that do not already provide RTEST: the argument is
 * ignored and the test is always true, so the `!RTEST(ruby_verbose)` checks
 * in the warning helpers below never suppress a warning.
 */
#ifndef RTEST
# define RTEST(v)  1
#endif
2895 | | |
2896 | | static void |
2897 | | CC_DUP_WARN(ScanEnv *env, OnigCodePoint from ARG_UNUSED, OnigCodePoint to ARG_UNUSED) |
2898 | 212k | { |
2899 | 212k | if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ; |
2900 | | |
2901 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_DUP) && |
2902 | 0 | !(env->warnings_flag & ONIG_SYN_WARN_CC_DUP)) { |
2903 | | #ifdef WARN_ALL_CC_DUP |
2904 | | onig_syntax_warn(env, "character class has duplicated range: %04x-%04x", from, to); |
2905 | | #else |
2906 | 0 | env->warnings_flag |= ONIG_SYN_WARN_CC_DUP; |
2907 | 0 | onig_syntax_warn(env, "character class has duplicated range"); |
2908 | 0 | #endif |
2909 | 0 | } |
2910 | 0 | } |
2911 | | |
2912 | | static void |
2913 | | UNKNOWN_ESC_WARN(ScanEnv *env, int c) |
2914 | 4.67k | { |
2915 | 4.67k | if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ; |
2916 | 0 | onig_syntax_warn(env, "Unknown escape \\%c is ignored", c); |
2917 | 0 | } |
2918 | | |
2919 | | static UChar* |
2920 | | find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to, |
2921 | | UChar **next, OnigEncoding enc) |
2922 | 0 | { |
2923 | 0 | int i; |
2924 | 0 | OnigCodePoint x; |
2925 | 0 | UChar *q; |
2926 | 0 | UChar *p = from; |
2927 | |
|
2928 | 0 | while (p < to) { |
2929 | 0 | x = ONIGENC_MBC_TO_CODE(enc, p, to); |
2930 | 0 | q = p + enclen(enc, p, to); |
2931 | 0 | if (x == s[0]) { |
2932 | 0 | for (i = 1; i < n && q < to; i++) { |
2933 | 0 | x = ONIGENC_MBC_TO_CODE(enc, q, to); |
2934 | 0 | if (x != s[i]) break; |
2935 | 0 | q += enclen(enc, q, to); |
2936 | 0 | } |
2937 | 0 | if (i >= n) { |
2938 | 0 | if (IS_NOT_NULL(next)) |
2939 | 0 | *next = q; |
2940 | 0 | return p; |
2941 | 0 | } |
2942 | 0 | } |
2943 | 0 | p = q; |
2944 | 0 | } |
2945 | 0 | return NULL_UCHARP; |
2946 | 0 | } |
2947 | | |
2948 | | static int |
2949 | | str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, |
2950 | | OnigCodePoint bad, OnigEncoding enc, const OnigSyntaxType* syn) |
2951 | 5.19k | { |
2952 | 5.19k | int i, in_esc; |
2953 | 5.19k | OnigCodePoint x; |
2954 | 5.19k | UChar *q; |
2955 | 5.19k | UChar *p = from; |
2956 | | |
2957 | 5.19k | in_esc = 0; |
2958 | 17.4M | while (p < to) { |
2959 | 17.4M | if (in_esc) { |
2960 | 607k | in_esc = 0; |
2961 | 607k | p += enclen(enc, p, to); |
2962 | 607k | } |
2963 | 16.8M | else { |
2964 | 16.8M | x = ONIGENC_MBC_TO_CODE(enc, p, to); |
2965 | 16.8M | q = p + enclen(enc, p, to); |
2966 | 16.8M | if (x == s[0]) { |
2967 | 2.18M | for (i = 1; i < n && q < to; i++) { |
2968 | 2.18M | x = ONIGENC_MBC_TO_CODE(enc, q, to); |
2969 | 2.18M | if (x != s[i]) break; |
2970 | 4.53k | q += enclen(enc, q, to); |
2971 | 4.53k | } |
2972 | 2.18M | if (i >= n) return 1; |
2973 | 2.17M | p += enclen(enc, p, to); |
2974 | 2.17M | } |
2975 | 14.6M | else { |
2976 | 14.6M | x = ONIGENC_MBC_TO_CODE(enc, p, to); |
2977 | 14.6M | if (x == bad) return 0; |
2978 | 14.6M | else if (x == MC_ESC(syn)) in_esc = 1; |
2979 | 14.6M | p = q; |
2980 | 14.6M | } |
2981 | 16.8M | } |
2982 | 17.4M | } |
2983 | 0 | return 0; |
2984 | 5.19k | } |
2985 | | |
/*
 * Read the next token of a character class body starting at *src.
 *
 * tok - receives the token; type, base, u.c and escaped are reset for every
 *       character fetched and then refined by the branches below.
 * src - in: current parse position; out: position after the token.  It is
 *       written only at the `end:` label, so it is left untouched when
 *       TK_EOT or an ONIGERR_* code is returned early.
 * end - end of the pattern.
 * env - supplies the syntax and the encoding; also handed to the warning
 *       helpers and to fetch_escaped_value().
 *
 * Returns tok->type, or an ONIGERR_* code (including any negative value
 * reported by fetch_escaped_value()).
 */
2986 | | static int
2987 | | fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2988 | 2.15M | {
2989 | 2.15M | int num;
2990 | 2.15M | OnigCodePoint c, c2;
2991 | 2.15M | const OnigSyntaxType* syn = env->syntax;
2992 | 2.15M | OnigEncoding enc = env->enc;
2993 | 2.15M | UChar* prev;
2994 | 2.15M | UChar* p = *src;
2995 | 2.15M | PFETCH_READY;
2996 | | 
/* Nothing left to read: report end of text. */
2997 | 2.15M | if (PEND) {
2998 | 16 | tok->type = TK_EOT;
2999 | 16 | return tok->type;
3000 | 16 | }
3001 | | 
/* Default result: a plain, unescaped character. */
3002 | 2.15M | PFETCH(c);
3003 | 2.15M | tok->type = TK_CHAR;
3004 | 2.15M | tok->base = 0;
3005 | 2.15M | tok->u.c = c;
3006 | 2.15M | tok->escaped = 0;
3007 | | 
3008 | 2.15M | if (c == ']') {
3009 | 519k | tok->type = TK_CC_CLOSE;
3010 | 519k | }
3011 | 1.63M | else if (c == '-') {
3012 | 633 | tok->type = TK_CC_RANGE;
3013 | 633 | }
3014 | 1.63M | else if (c == MC_ESC(syn)) {
/* Without ONIG_SYN_BACKSLASH_ESCAPE_IN_CC the escape character itself is
 * returned as a plain TK_CHAR. */
3015 | 536k | if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
3016 | 0 | goto end;
3017 | | 
3018 | 536k | if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3019 | | 
3020 | 536k | PFETCH(c);
3021 | 536k | tok->escaped = 1;
3022 | 536k | tok->u.c = c;
/* Any case that merely breaks out leaves the token as an escaped TK_CHAR
 * holding the character that followed the escape. */
3023 | 536k | switch (c) {
/* Character-type escapes: ctype selects the class, `not` the negated form. */
3024 | 21 | case 'w':
3025 | 21 | tok->type = TK_CHAR_TYPE;
3026 | 21 | tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3027 | 21 | tok->u.prop.not = 0;
3028 | 21 | break;
3029 | 244 | case 'W':
3030 | 244 | tok->type = TK_CHAR_TYPE;
3031 | 244 | tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3032 | 244 | tok->u.prop.not = 1;
3033 | 244 | break;
3034 | 617 | case 'd':
3035 | 617 | tok->type = TK_CHAR_TYPE;
3036 | 617 | tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3037 | 617 | tok->u.prop.not = 0;
3038 | 617 | break;
3039 | 0 | case 'D':
3040 | 0 | tok->type = TK_CHAR_TYPE;
3041 | 0 | tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3042 | 0 | tok->u.prop.not = 1;
3043 | 0 | break;
3044 | 122k | case 's':
3045 | 122k | tok->type = TK_CHAR_TYPE;
3046 | 122k | tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3047 | 122k | tok->u.prop.not = 0;
3048 | 122k | break;
3049 | 510 | case 'S':
3050 | 510 | tok->type = TK_CHAR_TYPE;
3051 | 510 | tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3052 | 510 | tok->u.prop.not = 1;
3053 | 510 | break;
/* \h and \H are character types only when the syntax enables
 * ONIG_SYN_OP2_ESC_H_XDIGIT. */
3054 | 0 | case 'h':
3055 | 0 | if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3056 | 0 | tok->type = TK_CHAR_TYPE;
3057 | 0 | tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3058 | 0 | tok->u.prop.not = 0;
3059 | 0 | break;
3060 | 67 | case 'H':
3061 | 67 | if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3062 | 67 | tok->type = TK_CHAR_TYPE;
3063 | 67 | tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3064 | 67 | tok->u.prop.not = 1;
3065 | 67 | break;
3066 | | 
/* \p{ and \P{ : only the opening brace and an optional '^' are consumed
 * here; the property name itself is left unread at p. */
3067 | 59 | case 'p':
3068 | 392 | case 'P':
3069 | 392 | if (PEND) break;
3070 | | 
3071 | 392 | c2 = PPEEK;
3072 | 392 | if (c2 == '{' &&
3073 | 392 | IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3074 | 262 | PINC;
3075 | 262 | tok->type = TK_CHAR_PROPERTY;
3076 | 262 | tok->u.prop.not = (c == 'P' ? 1 : 0);
3077 | | 
3078 | 262 | if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3079 | 262 | PFETCH(c2);
/* a leading '^' inverts the negation chosen by p/P */
3080 | 262 | if (c2 == '^') {
3081 | 0 | tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3082 | 0 | }
3083 | 262 | else
3084 | 262 | PUNFETCH;
3085 | 262 | }
3086 | 262 | }
3087 | 130 | else {
/* warning only: the token stays an escaped TK_CHAR */
3088 | 130 | onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
3089 | 130 | }
3090 | 392 | break;
3091 | | 
/* \x{H...} yields a code point, \xHH a raw byte. */
3092 | 4.34k | case 'x':
3093 | 4.34k | if (PEND) break;
3094 | | 
/* prev marks the position right after 'x' */
3095 | 4.34k | prev = p;
3096 | 4.34k | if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3097 | 5 | PINC;
3098 | 5 | num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc);
3099 | 5 | if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
/* a further hex digit after the scan stopped means the value is too long */
3100 | 5 | if (!PEND) {
3101 | 5 | c2 = PPEEK;
3102 | 5 | if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
3103 | 0 | return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3104 | 5 | }
3105 | | 
/* prev still points at '{', so this demands that something was consumed
 * after the brace and that a closing '}' follows */
3106 | 5 | if (p > prev + enclen(enc, prev, end) && !PEND && (PPEEK_IS('}'))) {
3107 | 0 | PINC;
3108 | 0 | tok->type = TK_CODE_POINT;
3109 | 0 | tok->base = 16;
3110 | 0 | tok->u.code = (OnigCodePoint )num;
3111 | 0 | }
3112 | 5 | else {
3113 | | /* can't read nothing or invalid format */
/* rewind to just after 'x'; the token stays an escaped TK_CHAR 'x' */
3114 | 5 | p = prev;
3115 | 5 | }
3116 | 5 | }
3117 | 4.34k | else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3118 | 4.34k | num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc);
3119 | 4.34k | if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3120 | 4.34k | if (p == prev) { /* can't read nothing. */
3121 | 553 | num = 0; /* but, it's not error */
3122 | 553 | }
3123 | 4.34k | tok->type = TK_RAW_BYTE;
3124 | 4.34k | tok->base = 16;
3125 | 4.34k | tok->u.c = num;
3126 | 4.34k | }
3127 | 4.34k | break;
3128 | | 
/* \uHHHH yields a code point when ONIG_SYN_OP2_ESC_U_HEX4 is enabled. */
3129 | 4.34k | case 'u':
3130 | 4 | if (PEND) break;
3131 | | 
3132 | 4 | prev = p;
3133 | 4 | if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3134 | 4 | num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc);
/* results below -1 are reported as too few digits, -1 as too big */
3135 | 4 | if (num < -1) return ONIGERR_TOO_SHORT_DIGITS;
3136 | 4 | else if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3137 | 4 | if (p == prev) { /* can't read nothing. */
3138 | 0 | num = 0; /* but, it's not error */
3139 | 0 | }
3140 | 4 | tok->type = TK_CODE_POINT;
3141 | 4 | tok->base = 16;
3142 | 4 | tok->u.code = (OnigCodePoint )num;
3143 | 4 | }
3144 | 4 | break;
3145 | | 
/* \o{...}: octal code point; same shape as the \x{...} branch above. */
3146 | 31 | case 'o':
3147 | 31 | if (PEND) break;
3148 | | 
3149 | 31 | prev = p;
3150 | 31 | if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
3151 | 0 | PINC;
3152 | 0 | num = scan_unsigned_octal_number(&p, end, 11, enc);
3153 | 0 | if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3154 | 0 | if (!PEND) {
3155 | 0 | c2 = PPEEK;
3156 | 0 | if (ONIGENC_IS_CODE_DIGIT(enc, c2) && c2 < '8')
3157 | 0 | return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3158 | 0 | }
3159 | | 
3160 | 0 | if (p > prev + enclen(enc, prev, end) && !PEND && (PPEEK_IS('}'))) {
3161 | 0 | PINC;
3162 | 0 | tok->type = TK_CODE_POINT;
3163 | 0 | tok->base = 8;
3164 | 0 | tok->u.code = (OnigCodePoint )num;
3165 | 0 | }
3166 | 0 | else {
3167 | | /* can't read nothing or invalid format */
3168 | 0 | p = prev;
3169 | 0 | }
3170 | 0 | }
3171 | 31 | break;
3172 | | 
/* \NNN: octal raw byte (ONIG_SYN_OP_ESC_OCTAL3). */
3173 | 31 | case '0':
3174 | 294 | case '1': case '2': case '3': case '4': case '5': case '6': case '7':
3175 | 294 | if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
/* un-read the digit (presumably so the scan below sees it as part of the
 * number -- confirm against the PUNFETCH definition) */
3176 | 294 | PUNFETCH;
3177 | 294 | prev = p;
3178 | 294 | num = scan_unsigned_octal_number(&p, end, 3, enc);
/* values above 0xff are rejected for a raw byte */
3179 | 294 | if (num < 0 || 0xff < num) return ONIGERR_TOO_BIG_NUMBER;
3180 | 294 | if (p == prev) { /* can't read nothing. */
3181 | 0 | num = 0; /* but, it's not error */
3182 | 0 | }
3183 | 294 | tok->type = TK_RAW_BYTE;
3184 | 294 | tok->base = 8;
3185 | 294 | tok->u.c = num;
3186 | 294 | }
3187 | 294 | break;
3188 | | 
/* Every other escape is resolved by fetch_escaped_value(). */
3189 | 406k | default:
3190 | 406k | PUNFETCH;
3191 | 406k | num = fetch_escaped_value(&p, end, env, &c2);
3192 | 406k | if (num < 0) return num;
/* report the resolved value only when it differs from the raw character;
 * otherwise the token stays an escaped TK_CHAR */
3193 | 406k | if ((OnigCodePoint )tok->u.c != c2) {
3194 | 371k | tok->u.code = (OnigCodePoint )c2;
3195 | 371k | tok->type = TK_CODE_POINT;
3196 | 371k | }
3197 | 406k | break;
3198 | 536k | }
3199 | 536k | }
3200 | 1.09M | else if (c == '[') {
/* "[:" counts as a POSIX bracket only if ":]" is found before an unescaped
 * ']' (checked by str_exist_check_with_esc). */
3201 | 18.6k | if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
3202 | 5.19k | OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
3203 | 5.19k | tok->backp = p; /* point at '[' is read */
3204 | 5.19k | PINC;
3205 | 5.19k | if (str_exist_check_with_esc(send, 2, p, end,
3206 | 5.19k | (OnigCodePoint )']', enc, syn)) {
3207 | 4.53k | tok->type = TK_POSIX_BRACKET_OPEN;
3208 | 4.53k | }
3209 | 658 | else {
3210 | 658 | PUNFETCH;
3211 | 658 | goto cc_in_cc;
3212 | 658 | }
3213 | 5.19k | }
3214 | 13.5k | else {
/* Nested class: opens a sub-class when set operations are enabled;
 * otherwise '[' stays a plain character and CC_ESC_WARN is called. */
3215 | 14.1k | cc_in_cc:
3216 | 14.1k | if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
3217 | 14.1k | tok->type = TK_CC_CC_OPEN;
3218 | 14.1k | }
3219 | 0 | else {
3220 | 0 | CC_ESC_WARN(env, (UChar* )"[");
3221 | 0 | }
3222 | 14.1k | }
3223 | 18.6k | }
3224 | 1.08M | else if (c == '&') {
/* "&&" becomes TK_CC_AND when set operations are enabled; a single '&'
 * stays a plain character. */
3225 | 4.83k | if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
3226 | 4.83k | !PEND && (PPEEK_IS('&'))) {
3227 | 4.09k | PINC;
3228 | 4.09k | tok->type = TK_CC_AND;
3229 | 4.09k | }
3230 | 4.83k | }
3231 | | 
/* Successful exit: publish the new parse position. */
3232 | 2.15M | end:
3233 | 2.15M | *src = p;
3234 | 2.15M | return tok->type;
3235 | 2.15M | }
3236 | | |
#ifdef USE_NAMED_GROUP
/*
 * Parse the name or number of a back reference and fill in a TK_BACKREF
 * token.
 *
 * c   - the opening delimiter already consumed by the caller ('<', '\'' or
 *       '{' at the call sites in fetch_token()).
 * tok - token to fill: type, u.backref.by_name, u.backref.num and either
 *       u.backref.ref1 or u.backref.refs; with USE_BACKREF_WITH_LEVEL also
 *       u.backref.level / u.backref.exist_level.
 * src - in: position right after the delimiter; out: position after the
 *       reference.  Only updated on success.
 * end - end of the pattern.
 * env - scan environment (syntax, regex being built, group bookkeeping).
 *
 * Returns 0 on success, otherwise the negative value reported by the name
 * parser, ONIGERR_INVALID_BACKREF, or ONIGERR_UNDEFINED_NAME_REFERENCE
 * (in which case the offending name is also recorded in env).
 */
static int
fetch_named_backref_token(OnigCodePoint c, OnigToken* tok, UChar** src,
                          UChar* end, ScanEnv* env)
{
  int r, num;
  const OnigSyntaxType* syn = env->syntax;
  UChar* prev;
  UChar* p = *src;
  UChar* name_end;
  int* backs;
  int back_num;

  /* remember where the name starts for the lookup / error message below */
  prev = p;

# ifdef USE_BACKREF_WITH_LEVEL
  name_end = NULL_UCHARP; /* no need. escape gcc warning. */
  r = fetch_name_with_level(c, &p, end, &name_end,
                            env, &back_num, &tok->u.backref.level);
  if (r == 1) tok->u.backref.exist_level = 1;
  else        tok->u.backref.exist_level = 0;
# else
  /* fetch_name() takes the opening delimiter as its first argument, exactly
   * as at the \g<...> call site in fetch_token(); it was missing here. */
  r = fetch_name(c, &p, end, &name_end, env, &back_num, 1);
# endif
  if (r < 0) return r;

  if (back_num != 0) {
    /* numeric reference; a negative number is relative to the current group */
    if (back_num < 0) {
      back_num = BACKREF_REL_TO_ABS(back_num, env);
      if (back_num <= 0)
        return ONIGERR_INVALID_BACKREF;
    }

    if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
      /* strict mode: the referenced group must already exist */
      if (back_num > env->num_mem ||
          IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
        return ONIGERR_INVALID_BACKREF;
    }
    tok->type = TK_BACKREF;
    tok->u.backref.by_name = 0;
    tok->u.backref.num     = 1;
    tok->u.backref.ref1    = back_num;
  }
  else {
    /* named reference: resolve the name to its group number(s) */
    num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
    if (num <= 0) {
      onig_scan_env_set_error_string(env,
                     ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
      return ONIGERR_UNDEFINED_NAME_REFERENCE;
    }
    if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
      int i;
      for (i = 0; i < num; i++) {
        if (backs[i] > env->num_mem ||
            IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
          return ONIGERR_INVALID_BACKREF;
      }
    }

    tok->type = TK_BACKREF;
    tok->u.backref.by_name = 1;
    if (num == 1 || IS_SYNTAX_BV(syn, ONIG_SYN_USE_LEFT_MOST_NAMED_GROUP)) {
      /* a single target: store the first group number inline */
      tok->u.backref.num  = 1;
      tok->u.backref.ref1 = backs[0];
    }
    else {
      /* several groups share the name: keep the pointer to the number list
       * returned by the lookup (it is not copied) */
      tok->u.backref.num  = num;
      tok->u.backref.refs = backs;
    }
  }
  *src = p;
  return 0;
}
#endif
3311 | | |
3312 | | static int |
3313 | | fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) |
3314 | 18.5M | { |
3315 | 18.5M | int r, num; |
3316 | 18.5M | OnigCodePoint c; |
3317 | 18.5M | OnigEncoding enc = env->enc; |
3318 | 18.5M | const OnigSyntaxType* syn = env->syntax; |
3319 | 18.5M | UChar* prev; |
3320 | 18.5M | UChar* p = *src; |
3321 | 18.5M | PFETCH_READY; |
3322 | | |
3323 | 18.5M | start: |
3324 | 18.5M | if (PEND) { |
3325 | 836k | tok->type = TK_EOT; |
3326 | 836k | return tok->type; |
3327 | 836k | } |
3328 | | |
3329 | 17.7M | tok->type = TK_STRING; |
3330 | 17.7M | tok->base = 0; |
3331 | 17.7M | tok->backp = p; |
3332 | | |
3333 | 17.7M | PFETCH(c); |
3334 | 17.7M | if (p > end) return ONIGERR_PREMATURE_END_OF_CHAR_CLASS; |
3335 | 17.7M | if (IS_MC_ESC_CODE(c, syn)) { |
3336 | 841k | if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; |
3337 | | |
3338 | 841k | tok->backp = p; |
3339 | 841k | PFETCH(c); |
3340 | | |
3341 | 841k | tok->u.c = c; |
3342 | 841k | tok->escaped = 1; |
3343 | 841k | switch (c) { |
3344 | 607 | case '*': |
3345 | 607 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break; |
3346 | 0 | tok->type = TK_OP_REPEAT; |
3347 | 0 | tok->u.repeat.lower = 0; |
3348 | 0 | tok->u.repeat.upper = REPEAT_INFINITE; |
3349 | 0 | goto greedy_check; |
3350 | 0 | break; |
3351 | | |
3352 | 388 | case '+': |
3353 | 388 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break; |
3354 | 0 | tok->type = TK_OP_REPEAT; |
3355 | 0 | tok->u.repeat.lower = 1; |
3356 | 0 | tok->u.repeat.upper = REPEAT_INFINITE; |
3357 | 0 | goto greedy_check; |
3358 | 0 | break; |
3359 | | |
3360 | 550 | case '?': |
3361 | 550 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break; |
3362 | 0 | tok->type = TK_OP_REPEAT; |
3363 | 0 | tok->u.repeat.lower = 0; |
3364 | 0 | tok->u.repeat.upper = 1; |
3365 | 1.24M | greedy_check: |
3366 | 1.24M | if (!PEND && PPEEK_IS('?') && |
3367 | 1.24M | IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) { |
3368 | 61.6k | PFETCH(c); |
3369 | 61.6k | tok->u.repeat.greedy = 0; |
3370 | 61.6k | tok->u.repeat.possessive = 0; |
3371 | 61.6k | } |
3372 | 1.18M | else { |
3373 | 1.20M | possessive_check: |
3374 | 1.20M | if (!PEND && PPEEK_IS('+') && |
3375 | 1.20M | ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) && |
3376 | 63.2k | tok->type != TK_INTERVAL) || |
3377 | 63.2k | (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) && |
3378 | 62.0k | tok->type == TK_INTERVAL))) { |
3379 | 62.0k | PFETCH(c); |
3380 | 62.0k | tok->u.repeat.greedy = 1; |
3381 | 62.0k | tok->u.repeat.possessive = 1; |
3382 | 62.0k | } |
3383 | 1.14M | else { |
3384 | 1.14M | tok->u.repeat.greedy = 1; |
3385 | 1.14M | tok->u.repeat.possessive = 0; |
3386 | 1.14M | } |
3387 | 1.20M | } |
3388 | 1.27M | break; |
3389 | | |
3390 | 1.27M | case '{': |
3391 | 26 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break; |
3392 | 0 | r = fetch_range_quantifier(&p, end, tok, env); |
3393 | 0 | if (r < 0) return r; /* error */ |
3394 | 0 | if (r == 0) goto greedy_check; |
3395 | 0 | else if (r == 2) { /* {n} */ |
3396 | 0 | if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) |
3397 | 0 | goto possessive_check; |
3398 | | |
3399 | 0 | goto greedy_check; |
3400 | 0 | } |
3401 | | /* r == 1 : normal char */ |
3402 | 0 | break; |
3403 | | |
3404 | 510 | case '|': |
3405 | 510 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break; |
3406 | 0 | tok->type = TK_ALT; |
3407 | 0 | break; |
3408 | | |
3409 | 61.4k | case '(': |
3410 | 61.4k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break; |
3411 | 0 | tok->type = TK_SUBEXP_OPEN; |
3412 | 0 | break; |
3413 | | |
3414 | 30.7k | case ')': |
3415 | 30.7k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break; |
3416 | 0 | tok->type = TK_SUBEXP_CLOSE; |
3417 | 0 | break; |
3418 | | |
3419 | 508 | case 'w': |
3420 | 508 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; |
3421 | 508 | tok->type = TK_CHAR_TYPE; |
3422 | 508 | tok->u.prop.ctype = ONIGENC_CTYPE_WORD; |
3423 | 508 | tok->u.prop.not = 0; |
3424 | 508 | break; |
3425 | | |
3426 | 4.43k | case 'W': |
3427 | 4.43k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; |
3428 | 4.43k | tok->type = TK_CHAR_TYPE; |
3429 | 4.43k | tok->u.prop.ctype = ONIGENC_CTYPE_WORD; |
3430 | 4.43k | tok->u.prop.not = 1; |
3431 | 4.43k | break; |
3432 | | |
3433 | 31.2k | case 'b': |
3434 | 31.2k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; |
3435 | 31.2k | tok->type = TK_ANCHOR; |
3436 | 31.2k | tok->u.anchor.subtype = ANCHOR_WORD_BOUND; |
3437 | 31.2k | tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option) |
3438 | 31.2k | && ! IS_WORD_BOUND_ALL_RANGE(env->option); |
3439 | 31.2k | break; |
3440 | | |
3441 | 47 | case 'B': |
3442 | 47 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; |
3443 | 47 | tok->type = TK_ANCHOR; |
3444 | 47 | tok->u.anchor.subtype = ANCHOR_NOT_WORD_BOUND; |
3445 | 47 | tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option) |
3446 | 47 | && ! IS_WORD_BOUND_ALL_RANGE(env->option); |
3447 | 47 | break; |
3448 | | |
3449 | 0 | #ifdef USE_WORD_BEGIN_END |
3450 | 261 | case '<': |
3451 | 261 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; |
3452 | 0 | tok->type = TK_ANCHOR; |
3453 | 0 | tok->u.anchor.subtype = ANCHOR_WORD_BEGIN; |
3454 | 0 | tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option); |
3455 | 0 | break; |
3456 | | |
3457 | 201 | case '>': |
3458 | 201 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; |
3459 | 0 | tok->type = TK_ANCHOR; |
3460 | 0 | tok->u.anchor.subtype = ANCHOR_WORD_END; |
3461 | 0 | tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option); |
3462 | 0 | break; |
3463 | 0 | #endif |
3464 | | |
3465 | 164k | case 's': |
3466 | 164k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; |
3467 | 164k | tok->type = TK_CHAR_TYPE; |
3468 | 164k | tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; |
3469 | 164k | tok->u.prop.not = 0; |
3470 | 164k | break; |
3471 | | |
3472 | 5.40k | case 'S': |
3473 | 5.40k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; |
3474 | 5.40k | tok->type = TK_CHAR_TYPE; |
3475 | 5.40k | tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; |
3476 | 5.40k | tok->u.prop.not = 1; |
3477 | 5.40k | break; |
3478 | | |
3479 | 166k | case 'd': |
3480 | 166k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; |
3481 | 166k | tok->type = TK_CHAR_TYPE; |
3482 | 166k | tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; |
3483 | 166k | tok->u.prop.not = 0; |
3484 | 166k | break; |
3485 | | |
3486 | 411 | case 'D': |
3487 | 411 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; |
3488 | 411 | tok->type = TK_CHAR_TYPE; |
3489 | 411 | tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; |
3490 | 411 | tok->u.prop.not = 1; |
3491 | 411 | break; |
3492 | | |
3493 | 6 | case 'h': |
3494 | 6 | if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; |
3495 | 6 | tok->type = TK_CHAR_TYPE; |
3496 | 6 | tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; |
3497 | 6 | tok->u.prop.not = 0; |
3498 | 6 | break; |
3499 | | |
3500 | 1.42k | case 'H': |
3501 | 1.42k | if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; |
3502 | 1.42k | tok->type = TK_CHAR_TYPE; |
3503 | 1.42k | tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; |
3504 | 1.42k | tok->u.prop.not = 1; |
3505 | 1.42k | break; |
3506 | | |
3507 | 13 | case 'A': |
3508 | 13 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; |
3509 | 13 | begin_buf: |
3510 | 13 | tok->type = TK_ANCHOR; |
3511 | 13 | tok->u.anchor.subtype = ANCHOR_BEGIN_BUF; |
3512 | 13 | break; |
3513 | | |
3514 | 784 | case 'Z': |
3515 | 784 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; |
3516 | 784 | tok->type = TK_ANCHOR; |
3517 | 784 | tok->u.anchor.subtype = ANCHOR_SEMI_END_BUF; |
3518 | 784 | break; |
3519 | | |
3520 | 734 | case 'z': |
3521 | 734 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; |
3522 | 734 | end_buf: |
3523 | 734 | tok->type = TK_ANCHOR; |
3524 | 734 | tok->u.anchor.subtype = ANCHOR_END_BUF; |
3525 | 734 | break; |
3526 | | |
3527 | 384 | case 'G': |
3528 | 384 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break; |
3529 | 384 | tok->type = TK_ANCHOR; |
3530 | 384 | tok->u.anchor.subtype = ANCHOR_BEGIN_POSITION; |
3531 | 384 | break; |
3532 | | |
3533 | 586 | case '`': |
3534 | 586 | if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break; |
3535 | 0 | goto begin_buf; |
3536 | 0 | break; |
3537 | | |
3538 | 231 | case '\'': |
3539 | 231 | if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break; |
3540 | 0 | goto end_buf; |
3541 | 0 | break; |
3542 | | |
3543 | 1.01k | case 'x': |
3544 | 1.01k | if (PEND) break; |
3545 | | |
3546 | 751 | prev = p; |
3547 | 751 | if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { |
3548 | 316 | PINC; |
3549 | 316 | num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); |
3550 | 316 | if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; |
3551 | 316 | if (!PEND) { |
3552 | 188 | if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK)) |
3553 | 0 | return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; |
3554 | 188 | } |
3555 | | |
3556 | 316 | if ((p > prev + enclen(enc, prev, end)) && !PEND && PPEEK_IS('}')) { |
3557 | 167 | PINC; |
3558 | 167 | tok->type = TK_CODE_POINT; |
3559 | 167 | tok->u.code = (OnigCodePoint )num; |
3560 | 167 | } |
3561 | 149 | else { |
3562 | | /* can't read nothing or invalid format */ |
3563 | 149 | p = prev; |
3564 | 149 | } |
3565 | 316 | } |
3566 | 435 | else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { |
3567 | 435 | num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc); |
3568 | 435 | if (num < 0) return ONIGERR_TOO_BIG_NUMBER; |
3569 | 435 | if (p == prev) { /* can't read nothing. */ |
3570 | 400 | num = 0; /* but, it's not error */ |
3571 | 400 | } |
3572 | 435 | tok->type = TK_RAW_BYTE; |
3573 | 435 | tok->base = 16; |
3574 | 435 | tok->u.c = num; |
3575 | 435 | } |
3576 | 751 | break; |
3577 | | |
3578 | 751 | case 'u': |
3579 | 178 | if (PEND) break; |
3580 | | |
3581 | 131 | prev = p; |
3582 | 131 | if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { |
3583 | 131 | num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc); |
3584 | 131 | if (num < -1) return ONIGERR_TOO_SHORT_DIGITS; |
3585 | 131 | else if (num < 0) return ONIGERR_TOO_BIG_NUMBER; |
3586 | 131 | if (p == prev) { /* can't read nothing. */ |
3587 | 0 | num = 0; /* but, it's not error */ |
3588 | 0 | } |
3589 | 131 | tok->type = TK_CODE_POINT; |
3590 | 131 | tok->base = 16; |
3591 | 131 | tok->u.code = (OnigCodePoint )num; |
3592 | 131 | } |
3593 | 131 | break; |
3594 | | |
3595 | 131 | case 'o': |
3596 | 24 | if (PEND) break; |
3597 | | |
3598 | 16 | prev = p; |
3599 | 16 | if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) { |
3600 | 0 | PINC; |
3601 | 0 | num = scan_unsigned_octal_number(&p, end, 11, enc); |
3602 | 0 | if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; |
3603 | 0 | if (!PEND) { |
3604 | 0 | OnigCodePoint c = PPEEK; |
3605 | 0 | if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') |
3606 | 0 | return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; |
3607 | 0 | } |
3608 | | |
3609 | 0 | if ((p > prev + enclen(enc, prev, end)) && !PEND && PPEEK_IS('}')) { |
3610 | 0 | PINC; |
3611 | 0 | tok->type = TK_CODE_POINT; |
3612 | 0 | tok->u.code = (OnigCodePoint )num; |
3613 | 0 | } |
3614 | 0 | else { |
3615 | | /* can't read nothing or invalid format */ |
3616 | 0 | p = prev; |
3617 | 0 | } |
3618 | 0 | } |
3619 | 16 | break; |
3620 | | |
3621 | 6.20k | case '1': case '2': case '3': case '4': |
3622 | 7.73k | case '5': case '6': case '7': case '8': case '9': |
3623 | 7.73k | PUNFETCH; |
3624 | 7.73k | prev = p; |
3625 | 7.73k | num = onig_scan_unsigned_number(&p, end, enc); |
3626 | 7.73k | if (num < 0 || num > ONIG_MAX_BACKREF_NUM) { |
3627 | 762 | goto skip_backref; |
3628 | 762 | } |
3629 | | |
3630 | 6.96k | if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && |
3631 | 6.96k | (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */ |
3632 | 6.54k | if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { |
3633 | 0 | if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num])) |
3634 | 0 | return ONIGERR_INVALID_BACKREF; |
3635 | 0 | } |
3636 | | |
3637 | 6.54k | tok->type = TK_BACKREF; |
3638 | 6.54k | tok->u.backref.num = 1; |
3639 | 6.54k | tok->u.backref.ref1 = num; |
3640 | 6.54k | tok->u.backref.by_name = 0; |
3641 | 6.54k | #ifdef USE_BACKREF_WITH_LEVEL |
3642 | 6.54k | tok->u.backref.exist_level = 0; |
3643 | 6.54k | #endif |
3644 | 6.54k | break; |
3645 | 6.54k | } |
3646 | | |
3647 | 1.18k | skip_backref: |
3648 | 1.18k | if (c == '8' || c == '9') { |
3649 | | /* normal char */ |
3650 | 309 | p = prev; PINC; |
3651 | 309 | break; |
3652 | 309 | } |
3653 | | |
3654 | 879 | p = prev; |
3655 | | /* fall through */ |
3656 | 3.75k | case '0': |
3657 | 3.75k | if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { |
3658 | 3.75k | prev = p; |
3659 | 3.75k | num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc); |
3660 | 3.75k | if (num < 0 || 0xff < num) return ONIGERR_TOO_BIG_NUMBER; |
3661 | 3.75k | if (p == prev) { /* can't read nothing. */ |
3662 | 2.64k | num = 0; /* but, it's not error */ |
3663 | 2.64k | } |
3664 | 3.75k | tok->type = TK_RAW_BYTE; |
3665 | 3.75k | tok->base = 8; |
3666 | 3.75k | tok->u.c = num; |
3667 | 3.75k | } |
3668 | 0 | else if (c != '0') { |
3669 | 0 | PINC; |
3670 | 0 | } |
3671 | 3.75k | break; |
3672 | | |
3673 | 3.75k | #ifdef USE_NAMED_GROUP |
3674 | 6.23k | case 'k': |
3675 | 6.23k | if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) { |
3676 | 5.89k | PFETCH(c); |
3677 | 5.89k | if (c == '<' || c == '\'') { |
3678 | 3.13k | r = fetch_named_backref_token(c, tok, &p, end, env); |
3679 | 3.13k | if (r < 0) return r; |
3680 | 3.13k | } |
3681 | 2.75k | else { |
3682 | 2.75k | PUNFETCH; |
3683 | 2.75k | onig_syntax_warn(env, "invalid back reference"); |
3684 | 2.75k | } |
3685 | 5.89k | } |
3686 | 6.23k | break; |
3687 | 6.23k | #endif |
3688 | | |
3689 | 6.23k | #if defined(USE_SUBEXP_CALL) || defined(USE_NAMED_GROUP) |
3690 | 6.59k | case 'g': |
3691 | 6.59k | # ifdef USE_NAMED_GROUP |
3692 | 6.59k | if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_BRACE_BACKREF)) { |
3693 | 0 | PFETCH(c); |
3694 | 0 | if (c == '{') { |
3695 | 0 | r = fetch_named_backref_token(c, tok, &p, end, env); |
3696 | 0 | if (r < 0) return r; |
3697 | 0 | } |
3698 | 0 | else |
3699 | 0 | PUNFETCH; |
3700 | 0 | } |
3701 | 6.59k | # endif |
3702 | 6.59k | # ifdef USE_SUBEXP_CALL |
3703 | 6.59k | if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) { |
3704 | 6.23k | PFETCH(c); |
3705 | 6.23k | if (c == '<' || c == '\'') { |
3706 | 4.48k | int gnum = -1, rel = 0; |
3707 | 4.48k | UChar* name_end; |
3708 | 4.48k | OnigCodePoint cnext; |
3709 | | |
3710 | 4.48k | cnext = PPEEK; |
3711 | 4.48k | if (cnext == '0') { |
3712 | 945 | PINC; |
3713 | 945 | if (PPEEK_IS(get_name_end_code_point(c))) { /* \g<0>, \g'0' */ |
3714 | 152 | PINC; |
3715 | 152 | name_end = p; |
3716 | 152 | gnum = 0; |
3717 | 152 | } |
3718 | 945 | } |
3719 | 3.54k | else if (cnext == '+') { |
3720 | 707 | PINC; |
3721 | 707 | rel = 1; |
3722 | 707 | } |
3723 | 4.48k | prev = p; |
3724 | 4.48k | if (gnum < 0) { |
3725 | 4.33k | r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1); |
3726 | 4.33k | if (r < 0) return r; |
3727 | 4.33k | } |
3728 | | |
3729 | 4.48k | tok->type = TK_CALL; |
3730 | 4.48k | tok->u.call.name = prev; |
3731 | 4.48k | tok->u.call.name_end = name_end; |
3732 | 4.48k | tok->u.call.gnum = gnum; |
3733 | 4.48k | tok->u.call.rel = rel; |
3734 | 4.48k | } |
3735 | 1.74k | else { |
3736 | 1.74k | onig_syntax_warn(env, "invalid subexp call"); |
3737 | 1.74k | PUNFETCH; |
3738 | 1.74k | } |
3739 | 6.23k | } |
3740 | 6.58k | # endif |
3741 | 6.58k | break; |
3742 | 6.58k | #endif |
3743 | | |
3744 | 6.58k | case 'Q': |
3745 | 161 | if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) { |
3746 | 0 | tok->type = TK_QUOTE_OPEN; |
3747 | 0 | } |
3748 | 161 | break; |
3749 | | |
3750 | 163 | case 'p': |
3751 | 2.44k | case 'P': |
3752 | 2.44k | if (PPEEK_IS('{') && |
3753 | 2.44k | IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { |
3754 | 1.37k | PINC; |
3755 | 1.37k | tok->type = TK_CHAR_PROPERTY; |
3756 | 1.37k | tok->u.prop.not = (c == 'P' ? 1 : 0); |
3757 | | |
3758 | 1.37k | if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { |
3759 | 1.17k | PFETCH(c); |
3760 | 1.17k | if (c == '^') { |
3761 | 0 | tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); |
3762 | 0 | } |
3763 | 1.17k | else |
3764 | 1.17k | PUNFETCH; |
3765 | 1.17k | } |
3766 | 1.37k | } |
3767 | 1.07k | else { |
3768 | 1.07k | onig_syntax_warn(env, "invalid Unicode Property \\%c", c); |
3769 | 1.07k | } |
3770 | 2.44k | break; |
3771 | | |
3772 | 1.00k | case 'R': |
3773 | 1.00k | if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK)) { |
3774 | 1.00k | tok->type = TK_LINEBREAK; |
3775 | 1.00k | } |
3776 | 1.00k | break; |
3777 | | |
3778 | 147k | case 'X': |
3779 | 147k | if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER)) { |
3780 | 147k | tok->type = TK_EXTENDED_GRAPHEME_CLUSTER; |
3781 | 147k | } |
3782 | 147k | break; |
3783 | | |
3784 | 168 | case 'K': |
3785 | 168 | if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) { |
3786 | 168 | tok->type = TK_KEEP; |
3787 | 168 | } |
3788 | 168 | break; |
3789 | | |
3790 | 193k | default: |
3791 | 193k | { |
3792 | 193k | OnigCodePoint c2; |
3793 | | |
3794 | 193k | PUNFETCH; |
3795 | 193k | num = fetch_escaped_value(&p, end, env, &c2); |
3796 | 193k | if (num < 0) return num; |
3797 | | /* set_raw: */ |
3798 | 193k | if ((OnigCodePoint )tok->u.c != c2) { |
3799 | 1.98k | tok->type = TK_CODE_POINT; |
3800 | 1.98k | tok->u.code = (OnigCodePoint )c2; |
3801 | 1.98k | } |
3802 | 191k | else { /* string */ |
3803 | 191k | p = tok->backp + enclen(enc, tok->backp, end); |
3804 | 191k | } |
3805 | 193k | } |
3806 | 0 | break; |
3807 | 841k | } |
3808 | 841k | } |
3809 | 16.8M | else { |
3810 | 16.8M | tok->u.c = c; |
3811 | 16.8M | tok->escaped = 0; |
3812 | | |
3813 | 16.8M | #ifdef USE_VARIABLE_META_CHARS |
3814 | 16.8M | if ((c != ONIG_INEFFECTIVE_META_CHAR) && |
3815 | 16.8M | IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) { |
3816 | 0 | if (c == MC_ANYCHAR(syn)) |
3817 | 0 | goto any_char; |
3818 | 0 | else if (c == MC_ANYTIME(syn)) |
3819 | 0 | goto anytime; |
3820 | 0 | else if (c == MC_ZERO_OR_ONE_TIME(syn)) |
3821 | 0 | goto zero_or_one_time; |
3822 | 0 | else if (c == MC_ONE_OR_MORE_TIME(syn)) |
3823 | 0 | goto one_or_more_time; |
3824 | 0 | else if (c == MC_ANYCHAR_ANYTIME(syn)) { |
3825 | 0 | tok->type = TK_ANYCHAR_ANYTIME; |
3826 | 0 | goto out; |
3827 | 0 | } |
3828 | 0 | } |
3829 | 16.8M | #endif |
3830 | | |
3831 | 16.8M | switch (c) { |
3832 | 416k | case '.': |
3833 | 416k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break; |
3834 | 416k | #ifdef USE_VARIABLE_META_CHARS |
3835 | 416k | any_char: |
3836 | 416k | #endif |
3837 | 416k | tok->type = TK_ANYCHAR; |
3838 | 416k | break; |
3839 | | |
3840 | 465k | case '*': |
3841 | 465k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break; |
3842 | 465k | #ifdef USE_VARIABLE_META_CHARS |
3843 | 465k | anytime: |
3844 | 465k | #endif |
3845 | 465k | tok->type = TK_OP_REPEAT; |
3846 | 465k | tok->u.repeat.lower = 0; |
3847 | 465k | tok->u.repeat.upper = REPEAT_INFINITE; |
3848 | 465k | goto greedy_check; |
3849 | 0 | break; |
3850 | | |
3851 | 734k | case '+': |
3852 | 734k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break; |
3853 | 734k | #ifdef USE_VARIABLE_META_CHARS |
3854 | 734k | one_or_more_time: |
3855 | 734k | #endif |
3856 | 734k | tok->type = TK_OP_REPEAT; |
3857 | 734k | tok->u.repeat.lower = 1; |
3858 | 734k | tok->u.repeat.upper = REPEAT_INFINITE; |
3859 | 734k | goto greedy_check; |
3860 | 0 | break; |
3861 | | |
3862 | 45.6k | case '?': |
3863 | 45.6k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break; |
3864 | 45.6k | #ifdef USE_VARIABLE_META_CHARS |
3865 | 45.6k | zero_or_one_time: |
3866 | 45.6k | #endif |
3867 | 45.6k | tok->type = TK_OP_REPEAT; |
3868 | 45.6k | tok->u.repeat.lower = 0; |
3869 | 45.6k | tok->u.repeat.upper = 1; |
3870 | 45.6k | goto greedy_check; |
3871 | 0 | break; |
3872 | | |
3873 | 40.6k | case '{': |
3874 | 40.6k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break; |
3875 | 40.6k | r = fetch_range_quantifier(&p, end, tok, env); |
3876 | 40.6k | if (r < 0) return r; /* error */ |
3877 | 40.6k | if (r == 0) goto greedy_check; |
3878 | 35.9k | else if (r == 2) { /* {n} */ |
3879 | 21.5k | if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) |
3880 | 21.5k | goto possessive_check; |
3881 | | |
3882 | 0 | goto greedy_check; |
3883 | 21.5k | } |
3884 | | /* r == 1 : normal char */ |
3885 | 14.3k | break; |
3886 | | |
3887 | 278k | case '|': |
3888 | 278k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break; |
3889 | 278k | tok->type = TK_ALT; |
3890 | 278k | break; |
3891 | | |
3892 | 585k | case '(': |
3893 | 585k | if (PPEEK_IS('?') && |
3894 | 585k | IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { |
3895 | 475k | PINC; |
3896 | 475k | if (PPEEK_IS('#')) { |
3897 | 213 | PFETCH(c); |
3898 | 2.29k | while (1) { |
3899 | 2.29k | if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; |
3900 | 2.29k | PFETCH(c); |
3901 | 2.29k | if (c == MC_ESC(syn)) { |
3902 | 4 | if (!PEND) PFETCH(c); |
3903 | 4 | } |
3904 | 2.29k | else { |
3905 | 2.29k | if (c == ')') break; |
3906 | 2.29k | } |
3907 | 2.29k | } |
3908 | 213 | goto start; |
3909 | 213 | } |
3910 | 475k | #ifdef USE_PERL_SUBEXP_CALL |
3911 | | /* (?&name), (?n), (?R), (?0), (?+n), (?-n) */ |
3912 | 475k | c = PPEEK; |
3913 | 475k | if ((c == '&' || c == 'R' || ONIGENC_IS_CODE_DIGIT(enc, c)) && |
3914 | 475k | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) { |
3915 | | /* (?&name), (?n), (?R), (?0) */ |
3916 | 0 | int gnum; |
3917 | 0 | UChar *name; |
3918 | 0 | UChar *name_end; |
3919 | |
|
3920 | 0 | if (c == 'R' || c == '0') { |
3921 | 0 | PINC; /* skip 'R' / '0' */ |
3922 | 0 | if (!PPEEK_IS(')')) { |
3923 | 0 | r = ONIGERR_INVALID_GROUP_NAME; |
3924 | 0 | onig_scan_env_set_error_string(env, r, p - 1, p + 1); |
3925 | 0 | return r; |
3926 | 0 | } |
3927 | 0 | PINC; /* skip ')' */ |
3928 | 0 | name_end = name = p; |
3929 | 0 | gnum = 0; |
3930 | 0 | } |
3931 | 0 | else { |
3932 | 0 | int numref = 1; |
3933 | 0 | if (c == '&') { /* (?&name) */ |
3934 | 0 | PINC; |
3935 | 0 | numref = 0; /* don't allow number name */ |
3936 | 0 | } |
3937 | 0 | name = p; |
3938 | 0 | r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, numref); |
3939 | 0 | if (r < 0) return r; |
3940 | 0 | } |
3941 | | |
3942 | 0 | tok->type = TK_CALL; |
3943 | 0 | tok->u.call.name = name; |
3944 | 0 | tok->u.call.name_end = name_end; |
3945 | 0 | tok->u.call.gnum = gnum; |
3946 | 0 | tok->u.call.rel = 0; |
3947 | 0 | break; |
3948 | 0 | } |
3949 | 475k | else if ((c == '-' || c == '+') && |
3950 | 475k | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) { |
3951 | | /* (?+n), (?-n) */ |
3952 | 0 | int gnum; |
3953 | 0 | UChar *name; |
3954 | 0 | UChar *name_end; |
3955 | 0 | OnigCodePoint cnext; |
3956 | 0 | PFETCH_READY; |
3957 | |
|
3958 | 0 | PINC; /* skip '-' / '+' */ |
3959 | 0 | cnext = PPEEK; |
3960 | 0 | if (ONIGENC_IS_CODE_DIGIT(enc, cnext)) { |
3961 | 0 | if (c == '-') PUNFETCH; |
3962 | 0 | name = p; |
3963 | 0 | r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 1); |
3964 | 0 | if (r < 0) return r; |
3965 | | |
3966 | 0 | tok->type = TK_CALL; |
3967 | 0 | tok->u.call.name = name; |
3968 | 0 | tok->u.call.name_end = name_end; |
3969 | 0 | tok->u.call.gnum = gnum; |
3970 | 0 | tok->u.call.rel = 1; |
3971 | 0 | break; |
3972 | 0 | } |
3973 | 0 | } |
3974 | 475k | #endif /* USE_PERL_SUBEXP_CALL */ |
3975 | 475k | #ifdef USE_CAPITAL_P_NAMED_GROUP |
3976 | 475k | if (PPEEK_IS('P') && |
3977 | 475k | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) { |
3978 | 0 | int gnum; |
3979 | 0 | UChar *name; |
3980 | 0 | UChar *name_end; |
3981 | 0 | PFETCH_READY; |
3982 | |
|
3983 | 0 | PINC; /* skip 'P' */ |
3984 | 0 | if (PEND) return ONIGERR_UNDEFINED_GROUP_OPTION; |
3985 | 0 | PFETCH(c); |
3986 | 0 | if (c == '=') { /* (?P=name): backref */ |
3987 | 0 | r = fetch_named_backref_token((OnigCodePoint )'(', tok, &p, end, env); |
3988 | 0 | if (r < 0) return r; |
3989 | 0 | break; |
3990 | 0 | } |
3991 | 0 | else if (c == '>') { /* (?P>name): subexp call */ |
3992 | 0 | name = p; |
3993 | 0 | r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 0); |
3994 | 0 | if (r < 0) return r; |
3995 | | |
3996 | 0 | tok->type = TK_CALL; |
3997 | 0 | tok->u.call.name = name; |
3998 | 0 | tok->u.call.name_end = name_end; |
3999 | 0 | tok->u.call.gnum = gnum; |
4000 | 0 | tok->u.call.rel = 0; |
4001 | 0 | break; |
4002 | 0 | } |
4003 | 0 | } |
4004 | 475k | #endif /* USE_CAPITAL_P_NAMED_GROUP */ |
4005 | 475k | PUNFETCH; |
4006 | 475k | } |
4007 | | |
4008 | 585k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; |
4009 | 585k | tok->type = TK_SUBEXP_OPEN; |
4010 | 585k | break; |
4011 | | |
4012 | 551k | case ')': |
4013 | 551k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; |
4014 | 551k | tok->type = TK_SUBEXP_CLOSE; |
4015 | 551k | break; |
4016 | | |
4017 | 673k | case '^': |
4018 | 673k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; |
4019 | 673k | tok->type = TK_ANCHOR; |
4020 | 673k | tok->u.anchor.subtype = (IS_SINGLELINE(env->option) |
4021 | 673k | ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE); |
4022 | 673k | break; |
4023 | | |
4024 | 308k | case '$': |
4025 | 308k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; |
4026 | 308k | tok->type = TK_ANCHOR; |
4027 | 308k | tok->u.anchor.subtype = (IS_SINGLELINE(env->option) |
4028 | 308k | ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE); |
4029 | 308k | break; |
4030 | | |
4031 | 516k | case '[': |
4032 | 516k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break; |
4033 | 516k | tok->type = TK_CC_OPEN; |
4034 | 516k | break; |
4035 | | |
4036 | 9.90k | case ']': |
4037 | 9.90k | if (*src > env->pattern) /* /].../ is allowed. */ |
4038 | 9.75k | CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]"); |
4039 | 9.90k | break; |
4040 | | |
4041 | 4.07k | case '#': |
4042 | 4.07k | if (IS_EXTEND(env->option)) { |
4043 | 878 | while (!PEND) { |
4044 | 815 | PFETCH(c); |
4045 | 815 | if (ONIGENC_IS_CODE_NEWLINE(enc, c)) |
4046 | 0 | break; |
4047 | 815 | } |
4048 | 63 | goto start; |
4049 | 0 | break; |
4050 | 63 | } |
4051 | 4.01k | break; |
4052 | | |
4053 | 1.53M | case ' ': case '\t': case '\n': case '\r': case '\f': |
4054 | 1.53M | if (IS_EXTEND(env->option)) |
4055 | 653 | goto start; |
4056 | 1.53M | break; |
4057 | | |
4058 | 10.7M | default: |
4059 | | /* string */ |
4060 | 10.7M | break; |
4061 | 16.8M | } |
4062 | 16.8M | } |
4063 | | |
4064 | 17.7M | #ifdef USE_VARIABLE_META_CHARS |
4065 | 17.7M | out: |
4066 | 17.7M | #endif |
4067 | 17.7M | *src = p; |
4068 | 17.7M | return tok->type; |
4069 | 17.7M | } |
4070 | | |
/*
 * Add the code points described by the range table `mbr` (or, when `not`
 * is non-zero, the code points NOT covered by it) to the character class
 * `cc`.
 *
 *   cc     - destination class; code points below `sb_out` are set in the
 *            bitset cc->bs, ranges are appended to cc->mbuf.
 *   ctype  - unused here (marked ARG_UNUSED).
 *   not    - 0: add the listed ranges; non-zero: add their complement.
 *   env    - forwarded to add_code_range_to_buf().
 *   sb_out - threshold: values below it are recorded in the bitset.
 *   mbr    - range table read through ONIGENC_CODE_RANGE_NUM/FROM/TO.
 *
 * Returns 0 on success, or the first non-zero result returned by
 * add_code_range_to_buf().
 */
4071 | | static int |
4072 | | add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not, |
4073 | | ScanEnv* env, |
4074 | | OnigCodePoint sb_out, const OnigCodePoint mbr[]) |
4075 | 2.96M | { |
4076 | 2.96M | int i, r; |
4077 | 2.96M | OnigCodePoint j; |
4078 | |
4079 | 2.96M | int n = ONIGENC_CODE_RANGE_NUM(mbr); |
4080 | |
4081 | 2.96M | if (not == 0) { |
/* Phase 1: walk the ranges code point by code point, setting bitset bits
 * until a value reaches sb_out. */
4082 | 4.01M | for (i = 0; i < n; i++) { |
4083 | 4.01M | for (j = ONIGENC_CODE_RANGE_FROM(mbr, i); |
4084 | 12.0M | j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) { |
4085 | 10.8M | if (j >= sb_out) { |
/* Range i straddles sb_out: its lower part is already in the bitset, so
 * add only [j, TO] to the buffer and step past this range. */
4086 | 2.81M | if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) { |
4087 | 147k | r = add_code_range_to_buf(&(cc->mbuf), env, j, |
4088 | 147k | ONIGENC_CODE_RANGE_TO(mbr, i)); |
4089 | 147k | if (r != 0) return r; |
4090 | 147k | i++; |
4091 | 147k | } |
4092 | |
4093 | 2.81M | goto sb_end; |
4094 | 2.81M | } |
4095 | 8.04M | BITSET_SET_BIT_CHKDUP(cc->bs, j); |
4096 | 8.04M | } |
4097 | 4.01M | } |
4098 | |
/* Phase 2: `i` carries over from phase 1; every remaining range is added
 * to the buffer whole. If phase 1 consumed all ranges, i == n and this
 * loop does nothing. */
4099 | 2.81M | sb_end: |
4100 | 294M | for ( ; i < n; i++) { |
4101 | 291M | r = add_code_range_to_buf(&(cc->mbuf), env, |
4102 | 291M | ONIGENC_CODE_RANGE_FROM(mbr, i), |
4103 | 291M | ONIGENC_CODE_RANGE_TO(mbr, i)); |
4104 | 291M | if (r != 0) return r; |
4105 | 291M | } |
4106 | 2.81M | } |
4107 | 148k | else { |
/* Complement: `prev` tracks the first code point after the previous range,
 * so [prev, FROM(i) - 1] is the gap to add. */
4108 | 148k | OnigCodePoint prev = 0; |
4109 | |
/* Pass 1: set bitset bits for the gaps that lie below sb_out. */
4110 | 740k | for (i = 0; i < n; i++) { |
4111 | 740k | for (j = prev; |
4112 | 15.2M | j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) { |
4113 | 14.6M | if (j >= sb_out) { |
4114 | 148k | goto sb_end2; |
4115 | 148k | } |
4116 | 14.4M | BITSET_SET_BIT_CHKDUP(cc->bs, j); |
4117 | 14.4M | } |
4118 | 591k | prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1; |
4119 | 591k | } |
/* All ranges ended before sb_out was reached: fill the tail of the bitset
 * from the end of the last range up to sb_out. */
4120 | 2.86k | for (j = prev; j < sb_out; j++) { |
4121 | 2.75k | BITSET_SET_BIT_CHKDUP(cc->bs, j); |
4122 | 2.75k | } |
4123 | |
/* Pass 2: restart from sb_out and add each gap between consecutive ranges
 * to the buffer. */
4124 | 148k | sb_end2: |
4125 | 148k | prev = sb_out; |
4126 | |
4127 | 3.14M | for (i = 0; i < n; i++) { |
4128 | 2.99M | if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) { |
4129 | 2.84M | r = add_code_range_to_buf(&(cc->mbuf), env, prev, |
4130 | 2.84M | ONIGENC_CODE_RANGE_FROM(mbr, i) - 1); |
4131 | 2.84M | if (r != 0) return r; |
4132 | 2.84M | } |
4133 | 2.99M | prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1; |
4134 | 2.99M | } |
/* Final gap: everything after the last range, up to 0x7fffffff. */
4135 | 148k | if (prev < 0x7fffffff) { |
4136 | 148k | r = add_code_range_to_buf(&(cc->mbuf), env, prev, 0x7fffffff); |
4137 | 148k | if (r != 0) return r; |
4138 | 148k | } |
4139 | 148k | } |
4140 | |
4141 | 2.96M | return 0; |
4142 | 2.96M | } |
4143 | | |
/*
 * Add the character type `ctype` (or its complement when `not` is
 * non-zero) to the character class `cc`.
 *
 *   cc          - destination character class.
 *   ctype       - ONIGENC_CTYPE_* value.
 *   not         - non-zero to add the complement of the type.
 *   ascii_range - non-zero to limit the type itself to 0x00-0x7F.
 *   env         - scan environment; env->enc selects the encoding.
 *
 * If ONIGENC_GET_CTYPE_CODE_RANGE() returns 0, the encoding's range table
 * is used through add_ctype_to_cc_by_range(). With `ascii_range` the
 * result is built in a temporary class, then either intersected with
 * 0x00-0x7F (not == 0) or extended with 0x80..ONIG_LAST_CODE_POINT
 * (not != 0), and finally OR-ed into `cc`.
 * If it returns ONIG_NO_SUPPORT_CONFIG, the function falls back to testing
 * each single-byte code with ONIGENC_IS_CODE_CTYPE()/ONIGENC_IS_CODE_WORD().
 * Any other result is returned unchanged.
 *
 * Returns 0 on success, ONIGERR_PARSER_BUG for a ctype the fallback does
 * not handle, or an error code from a helper.
 */
4144 | | static int |
4145 | | add_ctype_to_cc(CClassNode* cc, int ctype, int not, int ascii_range, ScanEnv* env) |
4146 | 2.96M | { |
4147 | 2.96M | int maxcode; |
4148 | 2.96M | int c, r; |
4149 | 2.96M | const OnigCodePoint *ranges; |
4150 | 2.96M | OnigCodePoint sb_out; |
4151 | 2.96M | OnigEncoding enc = env->enc; |
4152 | |
4153 | 2.96M | r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges); |
4154 | 2.96M | if (r == 0) { |
4155 | 2.96M | if (ascii_range) { |
/* Build the type in a scratch class first so that the ASCII restriction
 * does not disturb what is already in `cc`. */
4156 | 460k | CClassNode ccwork; |
4157 | 460k | initialize_cclass(&ccwork); |
4158 | 460k | r = add_ctype_to_cc_by_range(&ccwork, ctype, not, env, sb_out, |
4159 | 460k | ranges); |
4160 | 460k | if (r == 0) { |
4161 | 460k | if (not) { |
4162 | 440 | r = add_code_range_to_buf0(&(ccwork.mbuf), env, 0x80, ONIG_LAST_CODE_POINT, FALSE); |
4163 | 440 | } |
4164 | 460k | else { |
4165 | 460k | CClassNode ccascii; |
4166 | 460k | initialize_cclass(&ccascii); |
/* Encodings whose minimum character length exceeds one byte keep the
 * ASCII mask in the range buffer instead of the bitset. */
4167 | 460k | if (ONIGENC_MBC_MINLEN(env->enc) > 1) { |
4168 | 0 | r = add_code_range(&(ccascii.mbuf), env, 0x00, 0x7F); |
4169 | 0 | } |
4170 | 460k | else { |
4171 | 460k | bitset_set_range(env, ccascii.bs, 0x00, 0x7F); |
4172 | 460k | r = 0; |
4173 | 460k | } |
4174 | 460k | if (r == 0) { |
4175 | 460k | r = and_cclass(&ccwork, &ccascii, env); |
4176 | 460k | } |
4177 | 460k | if (IS_NOT_NULL(ccascii.mbuf)) bbuf_free(ccascii.mbuf); |
4178 | 460k | } |
4179 | 460k | if (r == 0) { |
4180 | 460k | r = or_cclass(cc, &ccwork, env); |
4181 | 460k | } |
4182 | 460k | } |
/* Release the scratch buffer on every path, including the one where
 * add_ctype_to_cc_by_range() failed after it had already allocated
 * ccwork.mbuf; freeing only on success would leak it. */
4183 | 460k | if (IS_NOT_NULL(ccwork.mbuf)) bbuf_free(ccwork.mbuf); |
4184 | 460k | } |
4185 | 2.50M | else { |
4186 | 2.50M | r = add_ctype_to_cc_by_range(cc, ctype, not, env, sb_out, ranges); |
4187 | 2.50M | } |
4188 | 2.96M | return r; |
4189 | 2.96M | } |
4190 | 0 | else if (r != ONIG_NO_SUPPORT_CONFIG) { |
4191 | 0 | return r; |
4192 | 0 | } |
4193 | |
/* Fallback: no range table available, so classify each single-byte code
 * individually. `maxcode` bounds the codes the type may match. */
4194 | 0 | maxcode = ascii_range ? 0x80 : SINGLE_BYTE_SIZE; |
4195 | 0 | r = 0; |
4196 | 0 | switch (ctype) { |
4197 | 0 | case ONIGENC_CTYPE_ALPHA: |
4198 | 0 | case ONIGENC_CTYPE_BLANK: |
4199 | 0 | case ONIGENC_CTYPE_CNTRL: |
4200 | 0 | case ONIGENC_CTYPE_DIGIT: |
4201 | 0 | case ONIGENC_CTYPE_LOWER: |
4202 | 0 | case ONIGENC_CTYPE_PUNCT: |
4203 | 0 | case ONIGENC_CTYPE_SPACE: |
4204 | 0 | case ONIGENC_CTYPE_UPPER: |
4205 | 0 | case ONIGENC_CTYPE_XDIGIT: |
4206 | 0 | case ONIGENC_CTYPE_ASCII: |
4207 | 0 | case ONIGENC_CTYPE_ALNUM: |
4208 | 0 | if (not != 0) { |
4209 | 0 | for (c = 0; c < SINGLE_BYTE_SIZE; c++) { |
4210 | 0 | if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) |
4211 | 0 | BITSET_SET_BIT_CHKDUP(cc->bs, c); |
4212 | 0 | } |
4213 | 0 | ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); |
4214 | 0 | } |
4215 | 0 | else { |
4216 | 0 | for (c = 0; c < SINGLE_BYTE_SIZE; c++) { |
4217 | 0 | if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) |
4218 | 0 | BITSET_SET_BIT_CHKDUP(cc->bs, c); |
4219 | 0 | } |
4220 | 0 | } |
4221 | 0 | break; |
4222 | |
4223 | 0 | case ONIGENC_CTYPE_GRAPH: |
4224 | 0 | case ONIGENC_CTYPE_PRINT: |
4225 | 0 | if (not != 0) { |
4226 | 0 | for (c = 0; c < SINGLE_BYTE_SIZE; c++) { |
4227 | 0 | if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype) |
4228 | 0 | || c >= maxcode) |
4229 | 0 | BITSET_SET_BIT_CHKDUP(cc->bs, c); |
4230 | 0 | } |
4231 | 0 | if (ascii_range) |
4232 | 0 | ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); |
4233 | 0 | } |
4234 | 0 | else { |
4235 | 0 | for (c = 0; c < maxcode; c++) { |
4236 | 0 | if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) |
4237 | 0 | BITSET_SET_BIT_CHKDUP(cc->bs, c); |
4238 | 0 | } |
4239 | 0 | if (! ascii_range) |
4240 | 0 | ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); |
4241 | 0 | } |
4242 | 0 | break; |
4243 | |
4244 | 0 | case ONIGENC_CTYPE_WORD: |
4245 | 0 | if (not == 0) { |
4246 | 0 | for (c = 0; c < maxcode; c++) { |
4247 | 0 | if (ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT_CHKDUP(cc->bs, c); |
4248 | 0 | } |
4249 | 0 | if (! ascii_range) |
4250 | 0 | ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); |
4251 | 0 | } |
4252 | 0 | else { |
4253 | 0 | for (c = 0; c < SINGLE_BYTE_SIZE; c++) { |
4254 | 0 | if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */ |
4255 | 0 | && (! ONIGENC_IS_CODE_WORD(enc, c) || c >= maxcode)) |
4256 | 0 | BITSET_SET_BIT_CHKDUP(cc->bs, c); |
4257 | 0 | } |
4258 | 0 | if (ascii_range) |
4259 | 0 | ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); |
4260 | 0 | } |
4261 | 0 | break; |
4262 | |
4263 | 0 | default: |
4264 | 0 | return ONIGERR_PARSER_BUG; |
4265 | 0 | break; |
4266 | 0 | } |
4267 | |
4268 | 0 | return r; |
4269 | 0 | } |
4270 | | |
/*
 * Try to parse a POSIX bracket body of the form  [^]name:]  at *src and
 * add the named character type to `cc`.
 *
 *   cc     - class that receives the type.
 *   asc_cc - optional second class (may be NULL); it receives the type
 *            too, except for WORD and ASCII or when ascii_range is in
 *            effect.
 *   src    - in/out cursor; advanced past ":]" only on success.
 *   end    - end of the pattern text.
 *   env    - scan environment (encoding and options).
 *
 * Returns 0 when a bracket was recognised and added,
 *         1 when the text is not a POSIX bracket (no error, *src is left
 *           unchanged),
 *         ONIGERR_INVALID_POSIX_BRACKET_TYPE when a known name is not
 *           followed by ":]", or when an unknown name is followed by ":]"
 *           within the scan limit,
 *         or the non-zero result of add_ctype_to_cc().
 */
4271 | | static int |
4272 | | parse_posix_bracket(CClassNode* cc, CClassNode* asc_cc, |
4273 | | UChar** src, UChar* end, ScanEnv* env) |
4274 | 4.53k | { |
/* Maximum number of characters the fallback scan looks ahead for ':' or
 * ']' when the name is not recognised. */
4275 | 26.2k | #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20 |
4276 | 4.53k | #define POSIX_BRACKET_NAME_MIN_LEN 4 |
4277 | |
4278 | 4.53k | static const PosixBracketEntryType PBS[] = { |
4279 | 4.53k | POSIX_BRACKET_ENTRY_INIT("alnum", ONIGENC_CTYPE_ALNUM), |
4280 | 4.53k | POSIX_BRACKET_ENTRY_INIT("alpha", ONIGENC_CTYPE_ALPHA), |
4281 | 4.53k | POSIX_BRACKET_ENTRY_INIT("blank", ONIGENC_CTYPE_BLANK), |
4282 | 4.53k | POSIX_BRACKET_ENTRY_INIT("cntrl", ONIGENC_CTYPE_CNTRL), |
4283 | 4.53k | POSIX_BRACKET_ENTRY_INIT("digit", ONIGENC_CTYPE_DIGIT), |
4284 | 4.53k | POSIX_BRACKET_ENTRY_INIT("graph", ONIGENC_CTYPE_GRAPH), |
4285 | 4.53k | POSIX_BRACKET_ENTRY_INIT("lower", ONIGENC_CTYPE_LOWER), |
4286 | 4.53k | POSIX_BRACKET_ENTRY_INIT("print", ONIGENC_CTYPE_PRINT), |
4287 | 4.53k | POSIX_BRACKET_ENTRY_INIT("punct", ONIGENC_CTYPE_PUNCT), |
4288 | 4.53k | POSIX_BRACKET_ENTRY_INIT("space", ONIGENC_CTYPE_SPACE), |
4289 | 4.53k | POSIX_BRACKET_ENTRY_INIT("upper", ONIGENC_CTYPE_UPPER), |
4290 | 4.53k | POSIX_BRACKET_ENTRY_INIT("xdigit", ONIGENC_CTYPE_XDIGIT), |
4291 | 4.53k | POSIX_BRACKET_ENTRY_INIT("ascii", ONIGENC_CTYPE_ASCII), |
4292 | 4.53k | POSIX_BRACKET_ENTRY_INIT("word", ONIGENC_CTYPE_WORD), |
4293 | 4.53k | }; |
4294 | |
4295 | 4.53k | const PosixBracketEntryType *pb; |
4296 | 4.53k | int not, i, r; |
4297 | 4.53k | int ascii_range; |
4298 | 4.53k | OnigCodePoint c; |
4299 | 4.53k | OnigEncoding enc = env->enc; |
4300 | 4.53k | UChar *p = *src; |
4301 | |
/* A leading '^' negates the bracket. */
4302 | 4.53k | if (PPEEK_IS('^')) { |
4303 | 679 | PINC_S; |
4304 | 679 | not = 1; |
4305 | 679 | } |
4306 | 3.85k | else |
4307 | 3.85k | not = 0; |
4308 | |
/* Too little text left for even the shortest bracket: skip the name
 * lookup. NOTE(review): the "+ 3" presumably accounts for ":]" plus the
 * closing ']' of the enclosing class -- confirm. */
4309 | 4.53k | if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3) |
4310 | 5 | goto not_posix_bracket; |
4311 | |
4312 | 4.52k | ascii_range = IS_ASCII_RANGE(env->option) && |
4313 | 4.52k | ! IS_POSIX_BRACKET_ALL_RANGE(env->option); |
4314 | 67.9k | for (pb = PBS; pb < PBS + numberof(PBS); pb++) { |
4315 | 63.3k | if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) { |
4316 | 0 | p = (UChar* )onigenc_step(enc, p, end, pb->len); |
4317 | 0 | if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0) |
4318 | 0 | return ONIGERR_INVALID_POSIX_BRACKET_TYPE; |
4319 | |
4320 | 0 | r = add_ctype_to_cc(cc, pb->ctype, not, ascii_range, env); |
4321 | 0 | if (r != 0) return r; |
4322 | |
4323 | 0 | if (IS_NOT_NULL(asc_cc)) { |
4324 | 0 | if (pb->ctype != ONIGENC_CTYPE_WORD && |
4325 | 0 | pb->ctype != ONIGENC_CTYPE_ASCII && |
4326 | 0 | !ascii_range) |
4327 | 0 | r = add_ctype_to_cc(asc_cc, pb->ctype, not, ascii_range, env); |
4328 | 0 | if (r != 0) return r; |
4329 | 0 | } |
4330 | |
/* Step over the two characters of ":]" and publish the new position. */
4331 | 0 | PINC_S; PINC_S; |
4332 | 0 | *src = p; |
4333 | 0 | return 0; |
4334 | 0 | } |
4335 | 63.3k | } |
4336 | |
/* Unknown name: scan a bounded number of characters for ':' or ']'. If
 * the text turns out to contain ":]", report an invalid bracket type;
 * otherwise tell the caller this is not a POSIX bracket at all. */
4337 | 4.53k | not_posix_bracket: |
4338 | 4.53k | c = 0; |
4339 | 4.53k | i = 0; |
4340 | 30.1k | while (!PEND && ((c = PPEEK) != ':') && c != ']') { |
4341 | 26.2k | PINC_S; |
4342 | 26.2k | if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break; |
4343 | 26.2k | } |
4344 | 4.53k | if (c == ':' && ! PEND) { |
4345 | 2.94k | PINC_S; |
4346 | 2.94k | if (! PEND) { |
4347 | 2.94k | PFETCH_S(c); |
4348 | 2.94k | if (c == ']') |
4349 | 7 | return ONIGERR_INVALID_POSIX_BRACKET_TYPE; |
4350 | 2.94k | } |
4351 | 2.94k | } |
4352 | |
4353 | 4.52k | return 1; /* 1: is not POSIX bracket, but no error. */ |
4354 | 4.53k | } |
4355 | | |
/*
 * Read a character property name that ends with '}' starting at *src and
 * translate it to a ctype with ONIGENC_PROPERTY_NAME_TO_CTYPE().
 *
 *   src - in/out cursor; advanced past the closing '}' only on success.
 *   end - end of the pattern text.
 *   env - scan environment; env->enc selects the encoding, and the error
 *         string is recorded here on failure.
 *
 * Returns the ctype (>= 0) on success. On failure returns a negative
 * error code after calling onig_scan_env_set_error_string():
 *   - the error returned by the property lookup, or
 *   - ONIGERR_INVALID_CHAR_PROPERTY_NAME when the name contains one of
 *     '(' ')' '{' '|' or when the text ends before a closing '}'.
 */
4356 | | static int |
4357 | | fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) |
4358 | 1.63k | { |
4359 | 1.63k | int r; |
4360 | 1.63k | OnigCodePoint c; |
4361 | 1.63k | OnigEncoding enc = env->enc; |
4362 | 1.63k | UChar *prev, *start, *p = *src; |
4363 | |
/* Start from the error value: if the loop runs off the end of the text
 * without seeing '}', the name is unterminated and must be reported as an
 * error rather than returned as 0, which callers would take for a valid
 * ctype. */
4364 | 1.63k | r = ONIGERR_INVALID_CHAR_PROPERTY_NAME; |
4365 | 1.63k | start = prev = p; |
4366 | |
4367 | 6.48k | while (!PEND) { |
/* `prev` marks the start of the character about to be fetched, so on '}'
 * the name is exactly [start, prev). */
4368 | 5.85k | prev = p; |
4369 | 5.85k | PFETCH_S(c); |
4370 | 5.85k | if (c == '}') { |
4371 | 1.00k | r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev); |
4372 | 1.00k | if (r < 0) break; |
4373 | |
4374 | 1.00k | *src = p; |
4375 | 1.00k | return r; |
4376 | 1.00k | } |
4377 | 4.84k | else if (c == '(' || c == ')' || c == '{' || c == '|') { |
4378 | 1 | r = ONIGERR_INVALID_CHAR_PROPERTY_NAME; |
4379 | 1 | break; |
4380 | 1 | } |
4381 | 5.85k | } |
4382 | |
4383 | 632 | onig_scan_env_set_error_string(env, r, *src, prev); |
4384 | 632 | return r; |
4385 | 1.63k | } |
4386 | | |
/* Forward declaration: parse_char_property() below calls it. */
4387 | | static int cclass_case_fold(Node** np, CClassNode* cc, CClassNode* asc_cc, ScanEnv* env); |
4388 | |
/*
 * Build a character-class node for a character property token.
 *
 *   np  - out: receives the newly allocated class node.
 *   tok - current token; tok->u.prop.not selects a negated class.
 *   src - in/out cursor, passed to fetch_char_property_to_ctype() to read
 *         the property name.
 *   end - end of the pattern text.
 *   env - scan environment (encoding and options).
 *
 * Returns 0 on success, a negative error from the name lookup, a memory
 * error if the node cannot be allocated, or the non-zero result of
 * add_ctype_to_cc() / cclass_case_fold().
 * NOTE(review): when add_ctype_to_cc() fails, *np stays allocated;
 * presumably the caller frees it -- confirm.
 */
4389 | | static int |
4390 | | parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end, |
4391 | | ScanEnv* env) |
4392 | 1.37k | { |
4393 | 1.37k | int r, ctype; |
4394 | 1.37k | CClassNode* cc; |
4395 | |
4396 | 1.37k | ctype = fetch_char_property_to_ctype(src, end, env); |
4397 | 1.37k | if (ctype < 0) return ctype; |
4398 | |
4399 | 1.37k | *np = node_new_cclass(); |
4400 | 1.37k | CHECK_NULL_RETURN_MEMERR(*np); |
4401 | 1.37k | cc = NCCLASS(*np); |
/* The type is always added un-negated and without the ASCII restriction;
 * negation is applied afterwards as a flag on the whole class. */
4402 | 1.37k | r = add_ctype_to_cc(cc, ctype, 0, 0, env); |
4403 | 1.37k | if (r != 0) return r; |
4404 | 1.37k | if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); |
4405 | |
/* Under the ignore-case option every type except ASCII is case-folded;
 * the same class is passed as both cc and asc_cc. */
4406 | 1.37k | if (IS_IGNORECASE(env->option)) { |
4407 | 38 | if (ctype != ONIGENC_CTYPE_ASCII) |
4408 | 38 | r = cclass_case_fold(np, cc, cc, env); |
4409 | 38 | } |
4410 | 1.37k | return r; |
4411 | 1.37k | } |
4412 | | |
4413 | | |
/*
 * Parser state while reading the items of a character class; it is
 * advanced by next_state_val() and next_state_class().
 *
 *   CCS_VALUE    - one value or class has been read; a single value is
 *                  still pending and is flushed into the class when the
 *                  next item arrives.
 *   CCS_RANGE    - the next value completes a range that starts at the
 *                  pending value. NOTE(review): presumably entered when a
 *                  range operator is seen -- the code that sets it is not
 *                  in this part of the file.
 *   CCS_COMPLETE - a range has just been added.
 *   CCS_START    - initial state, nothing read yet.
 */
4414 | | enum CCSTATE { |
4415 | | CCS_VALUE, |
4416 | | CCS_RANGE, |
4417 | | CCS_COMPLETE, |
4418 | | CCS_START |
4419 | | }; |
4420 | | |
/*
 * Kind of the most recently read character-class item.
 *
 *   CCV_SB         - value that is stored in the class bitset.
 *   CCV_CODE_POINT - value that is stored in the class range buffer.
 *   CCV_CLASS      - a whole class (character type) rather than a single
 *                    value; set by next_state_class().
 */
4421 | | enum CCVALTYPE { |
4422 | | CCV_SB, |
4423 | | CCV_CODE_POINT, |
4424 | | CCV_CLASS |
4425 | | }; |
4426 | | |
/*
 * Advance the character-class state machine after a class item has been
 * added to `cc`.
 *
 *   cc, asc_cc - destination classes (asc_cc may be NULL).
 *   vs         - pending single value from the previous item, if any.
 *   type       - in/out: kind of the previous item; set to CCV_CLASS.
 *   state      - in/out: current state; set to CCS_VALUE.
 *   env        - scan environment, forwarded to the range helpers.
 *
 * A class cannot end a range, so CCS_RANGE yields
 * ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE. If a single value is still
 * pending (state CCS_VALUE and type not CCV_CLASS) it is flushed into the
 * classes first. Returns 0 on success or a negative error code.
 */
4427 | | static int |
4428 | | next_state_class(CClassNode* cc, CClassNode* asc_cc, |
4429 | | OnigCodePoint* vs, enum CCVALTYPE* type, |
4430 | | enum CCSTATE* state, ScanEnv* env) |
4431 | 124k | { |
4432 | 124k | int r; |
4433 | |
4434 | 124k | if (*state == CCS_RANGE) |
4435 | 0 | return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE; |
4436 | |
/* Flush the pending value: bitset for CCV_SB, range buffer for
 * CCV_CODE_POINT. */
4437 | 124k | if (*state == CCS_VALUE && *type != CCV_CLASS) { |
4438 | 1.22k | if (*type == CCV_SB) { |
4439 | 1.04k | BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*vs)); |
4440 | 1.04k | if (IS_NOT_NULL(asc_cc)) |
4441 | 1.04k | BITSET_SET_BIT(asc_cc->bs, (int )(*vs)); |
4442 | 1.04k | } |
4443 | 179 | else if (*type == CCV_CODE_POINT) { |
4444 | 179 | r = add_code_range(&(cc->mbuf), env, *vs, *vs); |
4445 | 179 | if (r < 0) return r; |
4446 | 179 | if (IS_NOT_NULL(asc_cc)) { |
4447 | 149 | r = add_code_range0(&(asc_cc->mbuf), env, *vs, *vs, 0); |
4448 | 149 | if (r < 0) return r; |
4449 | 149 | } |
4450 | 179 | } |
4451 | 1.22k | } |
4452 | |
4453 | 124k | *state = CCS_VALUE; |
4454 | 124k | *type = CCV_CLASS; |
4455 | 124k | return 0; |
4456 | 124k | } |
4457 | | |
/*
 * Advance the character-class state machine after a single value `to`
 * has been read.
 *
 *   cc, asc_cc  - destination classes (asc_cc may be NULL).
 *   from        - in/out: pending value from the previous item; replaced
 *                 by `to` on success.
 *   to          - the value just read.
 *   from_israw  - in/out: raw flag of the pending value; replaced by
 *                 to_israw on success.
 *   to_israw    - raw flag of `to`.
 *   intype      - kind of `to` (CCV_SB or CCV_CODE_POINT).
 *   type        - in/out: kind of the pending value; replaced by intype.
 *   state       - in/out: current state.
 *   env         - scan environment (syntax flags, range helpers).
 *
 * Per state:
 *   CCS_VALUE    - the pending value *from is flushed into the classes;
 *                  the state is left unchanged.
 *   CCS_RANGE    - the range [*from, to] is added and the state becomes
 *                  CCS_COMPLETE.
 *   CCS_COMPLETE,
 *   CCS_START    - nothing is added; the state becomes CCS_VALUE.
 *
 * Returns 0 on success, ONIGERR_INVALID_CODE_POINT_VALUE,
 * ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS, or a negative error from the range
 * helpers.
 */
4458 | | static int |
4459 | | next_state_val(CClassNode* cc, CClassNode* asc_cc, |
4460 | | OnigCodePoint *from, OnigCodePoint to, |
4461 | | int* from_israw, int to_israw, |
4462 | | enum CCVALTYPE intype, enum CCVALTYPE* type, |
4463 | | enum CCSTATE* state, ScanEnv* env) |
4464 | 1.77M | { |
4465 | 1.77M | int r; |
4466 | |
4467 | 1.77M | switch (*state) { |
4468 | 1.37M | case CCS_VALUE: |
4469 | 1.37M | if (*type == CCV_SB) { |
4470 | 1.21M | BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*from)); |
4471 | 1.21M | if (IS_NOT_NULL(asc_cc)) |
4472 | 1.21M | BITSET_SET_BIT(asc_cc->bs, (int )(*from)); |
4473 | 1.21M | } |
4474 | 150k | else if (*type == CCV_CODE_POINT) { |
4475 | 26.8k | r = add_code_range(&(cc->mbuf), env, *from, *from); |
4476 | 26.8k | if (r < 0) return r; |
4477 | 26.8k | if (IS_NOT_NULL(asc_cc)) { |
4478 | 11.8k | r = add_code_range0(&(asc_cc->mbuf), env, *from, *from, 0); |
4479 | 11.8k | if (r < 0) return r; |
4480 | 11.8k | } |
4481 | 26.8k | } |
4482 | 1.37M | break; |
4483 | |
4484 | 1.37M | case CCS_RANGE: |
/* Both ends have the same kind: the whole range goes either to the bitset
 * (CCV_SB) or to the range buffer. */
4485 | 273 | if (intype == *type) { |
4486 | 261 | if (intype == CCV_SB) { |
4487 | 31 | if (*from > 0xff || to > 0xff) |
4488 | 0 | return ONIGERR_INVALID_CODE_POINT_VALUE; |
4489 | |
/* A reversed range is either silently skipped or rejected, depending on
 * the syntax flag. */
4490 | 31 | if (*from > to) { |
4491 | 6 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) |
4492 | 0 | goto ccs_range_end; |
4493 | 6 | else |
4494 | 6 | return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; |
4495 | 6 | } |
4496 | 25 | bitset_set_range(env, cc->bs, (int )*from, (int )to); |
4497 | 25 | if (IS_NOT_NULL(asc_cc)) |
4498 | 19 | bitset_set_range(env, asc_cc->bs, (int )*from, (int )to); |
4499 | 25 | } |
4500 | 230 | else { |
4501 | 230 | r = add_code_range(&(cc->mbuf), env, *from, to); |
4502 | 230 | if (r < 0) return r; |
4503 | 230 | if (IS_NOT_NULL(asc_cc)) { |
4504 | 225 | r = add_code_range0(&(asc_cc->mbuf), env, *from, to, 0); |
4505 | 225 | if (r < 0) return r; |
4506 | 225 | } |
4507 | 230 | } |
4508 | 261 | } |
4509 | 12 | else { |
/* The two ends have different kinds: record the part up to 0xff in the
 * bitset and the full range in the range buffer. */
4510 | 12 | if (*from > to) { |
4511 | 2 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) |
4512 | 0 | goto ccs_range_end; |
4513 | 2 | else |
4514 | 2 | return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; |
4515 | 2 | } |
4516 | 10 | bitset_set_range(env, cc->bs, (int )*from, (int )(to < 0xff ? to : 0xff)); |
4517 | 10 | r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*from, to); |
4518 | 10 | if (r < 0) return r; |
4519 | 10 | if (IS_NOT_NULL(asc_cc)) { |
4520 | 9 | bitset_set_range(env, asc_cc->bs, (int )*from, (int )(to < 0xff ? to : 0xff)); |
4521 | 9 | r = add_code_range0(&(asc_cc->mbuf), env, (OnigCodePoint )*from, to, 0); |
4522 | 9 | if (r < 0) return r; |
4523 | 9 | } |
4524 | 10 | } |
4525 | 265 | ccs_range_end: |
4526 | 265 | *state = CCS_COMPLETE; |
4527 | 265 | break; |
4528 | |
4529 | 254 | case CCS_COMPLETE: |
4530 | 402k | case CCS_START: |
4531 | 402k | *state = CCS_VALUE; |
4532 | 402k | break; |
4533 | |
4534 | 0 | default: |
4535 | 0 | break; |
4536 | 1.77M | } |
4537 | |
/* The value just read becomes the pending value for the next item. */
4538 | 1.77M | *from_israw = to_israw; |
4539 | 1.77M | *from = to; |
4540 | 1.77M | *type = intype; |
4541 | 1.77M | return 0; |
4542 | 1.77M | } |
4543 | | |
/*
 * Report whether code point `c` occurs in the text [from, end).
 *
 *   c              - code point to look for.
 *   from, end      - text to scan.
 *   ignore_escaped - non-zero: a character that directly follows the
 *                    syntax escape character (MC_ESC) is skipped and
 *                    never compared against `c`.
 *   env            - scan environment (encoding and syntax).
 *
 * Returns 1 if `c` was found, 0 otherwise.
 */
4544 | | static int |
4545 | | code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, |
4546 | | ScanEnv* env) |
4547 | 641 | { |
4548 | 641 | int in_esc; |
4549 | 641 | OnigCodePoint code; |
4550 | 641 | OnigEncoding enc = env->enc; |
4551 | 641 | UChar* p = from; |
4552 | |
4553 | 641 | in_esc = 0; |
4554 | 8.68k | while (! PEND) { |
4555 | 8.68k | if (ignore_escaped && in_esc) { |
/* Consume the escaped character as well as clearing the flag; clearing
 * the flag alone would leave the character in place to be compared
 * against `c` on the next iteration, so escaping would have no effect. */
4556 | 545 | in_esc = 0; PINC_S; |
4557 | 545 | } |
4558 | 8.13k | else { |
4559 | 8.13k | PFETCH_S(code); |
4560 | 8.13k | if (code == c) return 1; |
4561 | 7.49k | if (code == MC_ESC(env->syntax)) in_esc = 1; |
4562 | 7.49k | } |
4563 | 8.68k | } |
4564 | 0 | return 0; |
4565 | 641 | } |
4566 | | |
4567 | | static int |
4568 | | parse_char_class(Node** np, Node** asc_np, OnigToken* tok, UChar** src, UChar* end, |
4569 | | ScanEnv* env) |
4570 | 530k | { |
4571 | 530k | int r, neg, len, fetched, and_start; |
4572 | 530k | OnigCodePoint v, vs; |
4573 | 530k | UChar *p; |
4574 | 530k | Node* node; |
4575 | 530k | Node* asc_node; |
4576 | 530k | CClassNode *cc, *prev_cc; |
4577 | 530k | CClassNode *asc_cc, *asc_prev_cc; |
4578 | 530k | CClassNode work_cc, asc_work_cc; |
4579 | | |
4580 | 530k | enum CCSTATE state; |
4581 | 530k | enum CCVALTYPE val_type, in_type; |
4582 | 530k | int val_israw, in_israw; |
4583 | | |
4584 | 530k | *np = *asc_np = NULL_NODE; |
4585 | 530k | env->parse_depth++; |
4586 | 530k | if (env->parse_depth > ParseDepthLimit) |
4587 | 2 | return ONIGERR_PARSE_DEPTH_LIMIT_OVER; |
4588 | 530k | prev_cc = asc_prev_cc = (CClassNode* )NULL; |
4589 | 530k | r = fetch_token_in_cc(tok, src, end, env); |
4590 | 530k | if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { |
4591 | 235k | neg = 1; |
4592 | 235k | r = fetch_token_in_cc(tok, src, end, env); |
4593 | 235k | } |
4594 | 295k | else { |
4595 | 295k | neg = 0; |
4596 | 295k | } |
4597 | | |
4598 | 530k | if (r < 0) return r; |
4599 | 530k | if (r == TK_CC_CLOSE) { |
4600 | 641 | if (! code_exist_check((OnigCodePoint )']', |
4601 | 641 | *src, env->pattern_end, 1, env)) |
4602 | 0 | return ONIGERR_EMPTY_CHAR_CLASS; |
4603 | | |
4604 | 641 | CC_ESC_WARN(env, (UChar* )"]"); |
4605 | 641 | r = tok->type = TK_CHAR; /* allow []...] */ |
4606 | 641 | } |
4607 | | |
4608 | 530k | *np = node = node_new_cclass(); |
4609 | 530k | CHECK_NULL_RETURN_MEMERR(node); |
4610 | 530k | cc = NCCLASS(node); |
4611 | | |
4612 | 530k | if (IS_IGNORECASE(env->option)) { |
4613 | 9.98k | *asc_np = asc_node = node_new_cclass(); |
4614 | 9.98k | CHECK_NULL_RETURN_MEMERR(asc_node); |
4615 | 9.98k | asc_cc = NCCLASS(asc_node); |
4616 | 9.98k | } |
4617 | 520k | else { |
4618 | 520k | asc_node = NULL_NODE; |
4619 | 520k | asc_cc = NULL; |
4620 | 520k | } |
4621 | | |
4622 | 530k | and_start = 0; |
4623 | 530k | state = CCS_START; |
4624 | 530k | p = *src; |
4625 | 1.91M | while (r != TK_CC_CLOSE) { |
4626 | 1.39M | fetched = 0; |
4627 | 1.39M | switch (r) { |
4628 | 877k | case TK_CHAR: |
4629 | 877k | if ((tok->u.code >= SINGLE_BYTE_SIZE) || |
4630 | 877k | (len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c)) > 1) { |
4631 | 31.5k | in_type = CCV_CODE_POINT; |
4632 | 31.5k | } |
4633 | 846k | else if (len < 0) { |
4634 | 0 | r = len; |
4635 | 0 | goto err; |
4636 | 0 | } |
4637 | 846k | else { |
4638 | 846k | sb_char: |
4639 | 846k | in_type = CCV_SB; |
4640 | 846k | } |
4641 | 877k | v = (OnigCodePoint )tok->u.c; |
4642 | 877k | in_israw = 0; |
4643 | 877k | goto val_entry2; |
4644 | 0 | break; |
4645 | | |
4646 | 1.38k | case TK_RAW_BYTE: |
4647 | | /* tok->base != 0 : octal or hexadec. */ |
4648 | 1.38k | if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { |
4649 | 1.38k | UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; |
4650 | 1.38k | UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN; |
4651 | 1.38k | UChar* psave = p; |
4652 | 1.38k | int i, base = tok->base; |
4653 | | |
4654 | 1.38k | buf[0] = (UChar )tok->u.c; |
4655 | 4.12k | for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { |
4656 | 3.32k | r = fetch_token_in_cc(tok, &p, end, env); |
4657 | 3.32k | if (r < 0) goto err; |
4658 | 3.32k | if (r != TK_RAW_BYTE || tok->base != base) { |
4659 | 578 | fetched = 1; |
4660 | 578 | break; |
4661 | 578 | } |
4662 | 2.74k | buf[i] = (UChar )tok->u.c; |
4663 | 2.74k | } |
4664 | | |
4665 | 1.38k | if (i < ONIGENC_MBC_MINLEN(env->enc)) { |
4666 | 0 | r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; |
4667 | 0 | goto err; |
4668 | 0 | } |
4669 | | |
4670 | 1.38k | if (env->enc == ONIG_ENCODING_EUC_JP || |
4671 | 1.38k | env->enc == ONIG_ENCODING_SJIS) { |
4672 | | /* Strict version of enclen does not handle invalid single code |
4673 | | * point for SJIS and EUC-JP...*/ |
4674 | 0 | len = enclen_approximate(env->enc, buf, buf + i); |
4675 | 0 | } |
4676 | 1.38k | else { |
4677 | 1.38k | len = enclen(env->enc, buf, buf + i); |
4678 | 1.38k | } |
4679 | 1.38k | if (i < len) { |
4680 | 0 | r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; |
4681 | 0 | goto err; |
4682 | 0 | } |
4683 | 1.38k | else if (i > len) { /* fetch back */ |
4684 | 1.01k | p = psave; |
4685 | 1.52k | for (i = 1; i < len; i++) { |
4686 | 507 | (void)fetch_token_in_cc(tok, &p, end, env); |
4687 | | /* no need to check the return value (already checked above) */ |
4688 | 507 | } |
4689 | 1.01k | fetched = 0; |
4690 | 1.01k | } |
4691 | | |
4692 | 1.38k | if (i == 1) { |
4693 | 984 | v = (OnigCodePoint )buf[0]; |
4694 | 984 | goto raw_single; |
4695 | 984 | } |
4696 | 398 | else { |
4697 | 398 | v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); |
4698 | 398 | in_type = CCV_CODE_POINT; |
4699 | 398 | } |
4700 | 1.38k | } |
4701 | 0 | else { |
4702 | 0 | v = (OnigCodePoint )tok->u.c; |
4703 | 984 | raw_single: |
4704 | 984 | in_type = CCV_SB; |
4705 | 984 | } |
4706 | 1.38k | in_israw = 1; |
4707 | 1.38k | goto val_entry2; |
4708 | 0 | break; |
4709 | | |
4710 | 371k | case TK_CODE_POINT: |
4711 | 371k | v = tok->u.code; |
4712 | 371k | in_israw = 1; |
4713 | 376k | val_entry: |
4714 | 376k | len = ONIGENC_CODE_TO_MBCLEN(env->enc, v); |
4715 | 376k | if (len < 0) { |
4716 | 0 | r = len; |
4717 | 0 | goto err; |
4718 | 0 | } |
4719 | 376k | in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT); |
4720 | 1.25M | val_entry2: |
4721 | 1.25M | r = next_state_val(cc, asc_cc, &vs, v, &val_israw, in_israw, in_type, &val_type, |
4722 | 1.25M | &state, env); |
4723 | 1.25M | if (r != 0) goto err; |
4724 | 1.25M | break; |
4725 | | |
4726 | 1.25M | case TK_POSIX_BRACKET_OPEN: |
4727 | 4.53k | r = parse_posix_bracket(cc, asc_cc, &p, end, env); |
4728 | 4.53k | if (r < 0) goto err; |
4729 | 4.52k | if (r == 1) { /* is not POSIX bracket */ |
4730 | 4.52k | CC_ESC_WARN(env, (UChar* )"["); |
4731 | 4.52k | p = tok->backp; |
4732 | 4.52k | v = (OnigCodePoint )tok->u.c; |
4733 | 4.52k | in_israw = 0; |
4734 | 4.52k | goto val_entry; |
4735 | 4.52k | } |
4736 | 0 | goto next_class; |
4737 | 0 | break; |
4738 | | |
4739 | 124k | case TK_CHAR_TYPE: |
4740 | 124k | r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, |
4741 | 124k | IS_ASCII_RANGE(env->option), env); |
4742 | 124k | if (r != 0) return r; |
4743 | 124k | if (IS_NOT_NULL(asc_cc)) { |
4744 | 1.26k | if (tok->u.prop.ctype != ONIGENC_CTYPE_WORD) |
4745 | 1.02k | r = add_ctype_to_cc(asc_cc, tok->u.prop.ctype, tok->u.prop.not, |
4746 | 1.02k | IS_ASCII_RANGE(env->option), env); |
4747 | 1.26k | if (r != 0) return r; |
4748 | 1.26k | } |
4749 | | |
4750 | 124k | next_class: |
4751 | 124k | r = next_state_class(cc, asc_cc, &vs, &val_type, &state, env); |
4752 | 124k | if (r != 0) goto err; |
4753 | 124k | break; |
4754 | | |
4755 | 124k | case TK_CHAR_PROPERTY: |
4756 | 262 | { |
4757 | 262 | int ctype; |
4758 | | |
4759 | 262 | ctype = fetch_char_property_to_ctype(&p, end, env); |
4760 | 262 | if (ctype < 0) return ctype; |
4761 | 262 | r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, 0, env); |
4762 | 262 | if (r != 0) return r; |
4763 | 262 | if (IS_NOT_NULL(asc_cc)) { |
4764 | 128 | if (ctype != ONIGENC_CTYPE_ASCII) |
4765 | 128 | r = add_ctype_to_cc(asc_cc, ctype, tok->u.prop.not, 0, env); |
4766 | 128 | if (r != 0) return r; |
4767 | 128 | } |
4768 | 262 | goto next_class; |
4769 | 262 | } |
4770 | 262 | break; |
4771 | | |
4772 | 615 | case TK_CC_RANGE: |
4773 | 615 | if (state == CCS_VALUE) { |
4774 | 332 | r = fetch_token_in_cc(tok, &p, end, env); |
4775 | 332 | if (r < 0) goto err; |
4776 | 332 | fetched = 1; |
4777 | 332 | if (r == TK_CC_CLOSE) { /* allow [x-] */ |
4778 | 62 | range_end_val: |
4779 | 62 | v = (OnigCodePoint )'-'; |
4780 | 62 | in_israw = 0; |
4781 | 62 | goto val_entry; |
4782 | 59 | } |
4783 | 273 | else if (r == TK_CC_AND) { |
4784 | 0 | CC_ESC_WARN(env, (UChar* )"-"); |
4785 | 0 | goto range_end_val; |
4786 | 0 | } |
4787 | | |
4788 | 273 | if (val_type == CCV_CLASS) { |
4789 | 0 | r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; |
4790 | 0 | goto err; |
4791 | 0 | } |
4792 | | |
4793 | 273 | state = CCS_RANGE; |
4794 | 273 | } |
4795 | 283 | else if (state == CCS_START) { |
4796 | | /* [-xa] is allowed */ |
4797 | 272 | v = (OnigCodePoint )tok->u.c; |
4798 | 272 | in_israw = 0; |
4799 | | |
4800 | 272 | r = fetch_token_in_cc(tok, &p, end, env); |
4801 | 272 | if (r < 0) goto err; |
4802 | 272 | fetched = 1; |
4803 | | /* [--x] or [a&&-x] is warned. */ |
4804 | 272 | if (r == TK_CC_RANGE || and_start != 0) |
4805 | 0 | CC_ESC_WARN(env, (UChar* )"-"); |
4806 | | |
4807 | 272 | goto val_entry; |
4808 | 272 | } |
4809 | 11 | else if (state == CCS_RANGE) { |
4810 | 8 | CC_ESC_WARN(env, (UChar* )"-"); |
4811 | 8 | goto sb_char; /* [!--x] is allowed */ |
4812 | 8 | } |
4813 | 3 | else { /* CCS_COMPLETE */ |
4814 | 3 | r = fetch_token_in_cc(tok, &p, end, env); |
4815 | 3 | if (r < 0) goto err; |
4816 | 3 | fetched = 1; |
4817 | 3 | if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */ |
4818 | 0 | else if (r == TK_CC_AND) { |
4819 | 0 | CC_ESC_WARN(env, (UChar* )"-"); |
4820 | 0 | goto range_end_val; |
4821 | 0 | } |
4822 | | |
4823 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) { |
4824 | 0 | CC_ESC_WARN(env, (UChar* )"-"); |
4825 | 0 | goto range_end_val; /* [0-9-a] is allowed as [0-9\-a] */ |
4826 | 0 | } |
4827 | 0 | r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; |
4828 | 0 | goto err; |
4829 | 0 | } |
4830 | 273 | break; |
4831 | | |
4832 | 14.1k | case TK_CC_CC_OPEN: /* [ */ |
4833 | 14.1k | { |
4834 | 14.1k | Node *anode, *aasc_node; |
4835 | 14.1k | CClassNode* acc; |
4836 | | |
4837 | 14.1k | r = parse_char_class(&anode, &aasc_node, tok, &p, end, env); |
4838 | 14.1k | if (r == 0) { |
4839 | 2.66k | acc = NCCLASS(anode); |
4840 | 2.66k | r = or_cclass(cc, acc, env); |
4841 | 2.66k | } |
4842 | 14.1k | if (r == 0 && IS_NOT_NULL(aasc_node)) { |
4843 | 2.09k | acc = NCCLASS(aasc_node); |
4844 | 2.09k | r = or_cclass(asc_cc, acc, env); |
4845 | 2.09k | } |
4846 | 14.1k | onig_node_free(anode); |
4847 | 14.1k | onig_node_free(aasc_node); |
4848 | 14.1k | if (r != 0) goto err; |
4849 | 14.1k | } |
4850 | 2.66k | break; |
4851 | | |
4852 | 4.09k | case TK_CC_AND: /* && */ |
4853 | 4.09k | { |
4854 | 4.09k | if (state == CCS_VALUE) { |
4855 | 4 | r = next_state_val(cc, asc_cc, &vs, 0, &val_israw, 0, val_type, |
4856 | 4 | &val_type, &state, env); |
4857 | 4 | if (r != 0) goto err; |
4858 | 4 | } |
4859 | | /* initialize local variables */ |
4860 | 4.09k | and_start = 1; |
4861 | 4.09k | state = CCS_START; |
4862 | | |
4863 | 4.09k | if (IS_NOT_NULL(prev_cc)) { |
4864 | 3 | r = and_cclass(prev_cc, cc, env); |
4865 | 3 | if (r != 0) goto err; |
4866 | 3 | bbuf_free(cc->mbuf); |
4867 | 3 | if (IS_NOT_NULL(asc_cc)) { |
4868 | 3 | r = and_cclass(asc_prev_cc, asc_cc, env); |
4869 | 3 | if (r != 0) goto err; |
4870 | 3 | bbuf_free(asc_cc->mbuf); |
4871 | 3 | } |
4872 | 3 | } |
4873 | 4.09k | else { |
4874 | 4.09k | prev_cc = cc; |
4875 | 4.09k | cc = &work_cc; |
4876 | 4.09k | if (IS_NOT_NULL(asc_cc)) { |
4877 | 4.09k | asc_prev_cc = asc_cc; |
4878 | 4.09k | asc_cc = &asc_work_cc; |
4879 | 4.09k | } |
4880 | 4.09k | } |
4881 | 4.09k | initialize_cclass(cc); |
4882 | 4.09k | if (IS_NOT_NULL(asc_cc)) |
4883 | 4.09k | initialize_cclass(asc_cc); |
4884 | 4.09k | } |
4885 | 0 | break; |
4886 | | |
4887 | 14 | case TK_EOT: |
4888 | 14 | r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS; |
4889 | 14 | goto err; |
4890 | 0 | break; |
4891 | 0 | default: |
4892 | 0 | r = ONIGERR_PARSER_BUG; |
4893 | 0 | goto err; |
4894 | 0 | break; |
4895 | 1.39M | } |
4896 | | |
4897 | 1.38M | if (fetched) |
4898 | 976 | r = tok->type; |
4899 | 1.38M | else { |
4900 | 1.38M | r = fetch_token_in_cc(tok, &p, end, env); |
4901 | 1.38M | if (r < 0) goto err; |
4902 | 1.38M | } |
4903 | 1.38M | } |
4904 | | |
4905 | 518k | if (state == CCS_VALUE) { |
4906 | 517k | r = next_state_val(cc, asc_cc, &vs, 0, &val_israw, 0, val_type, |
4907 | 517k | &val_type, &state, env); |
4908 | 517k | if (r != 0) goto err; |
4909 | 517k | } |
4910 | | |
4911 | 518k | if (IS_NOT_NULL(prev_cc)) { |
4912 | 1 | r = and_cclass(prev_cc, cc, env); |
4913 | 1 | if (r != 0) goto err; |
4914 | 1 | bbuf_free(cc->mbuf); |
4915 | 1 | cc = prev_cc; |
4916 | 1 | if (IS_NOT_NULL(asc_cc)) { |
4917 | 1 | r = and_cclass(asc_prev_cc, asc_cc, env); |
4918 | 1 | if (r != 0) goto err; |
4919 | 1 | bbuf_free(asc_cc->mbuf); |
4920 | 1 | asc_cc = asc_prev_cc; |
4921 | 1 | } |
4922 | 1 | } |
4923 | | |
4924 | 518k | if (neg != 0) { |
4925 | 234k | NCCLASS_SET_NOT(cc); |
4926 | 234k | if (IS_NOT_NULL(asc_cc)) |
4927 | 234k | NCCLASS_SET_NOT(asc_cc); |
4928 | 234k | } |
4929 | 283k | else { |
4930 | 283k | NCCLASS_CLEAR_NOT(cc); |
4931 | 283k | if (IS_NOT_NULL(asc_cc)) |
4932 | 283k | NCCLASS_CLEAR_NOT(asc_cc); |
4933 | 283k | } |
4934 | 518k | if (IS_NCCLASS_NOT(cc) && |
4935 | 518k | IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) { |
4936 | 0 | int is_empty; |
4937 | |
|
4938 | 0 | is_empty = (IS_NULL(cc->mbuf) ? 1 : 0); |
4939 | 0 | if (is_empty != 0) |
4940 | 0 | BITSET_IS_EMPTY(cc->bs, is_empty); |
4941 | |
|
4942 | 0 | if (is_empty == 0) { |
4943 | 0 | #define NEWLINE_CODE 0x0a |
4944 | |
|
4945 | 0 | if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) { |
4946 | 0 | if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1) |
4947 | 0 | BITSET_SET_BIT_CHKDUP(cc->bs, NEWLINE_CODE); |
4948 | 0 | else { |
4949 | 0 | r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE); |
4950 | 0 | if (r < 0) goto err; |
4951 | 0 | } |
4952 | 0 | } |
4953 | 0 | } |
4954 | 0 | } |
4955 | 518k | *src = p; |
4956 | 518k | env->parse_depth--; |
4957 | 518k | return 0; |
4958 | | |
4959 | 11.5k | err: |
4960 | 11.5k | if (cc != NCCLASS(*np)) |
4961 | 4.09k | bbuf_free(cc->mbuf); |
4962 | 11.5k | if (IS_NOT_NULL(asc_cc) && (asc_cc != NCCLASS(*asc_np))) |
4963 | 4.09k | bbuf_free(asc_cc->mbuf); |
4964 | 11.5k | return r; |
4965 | 518k | } |
4966 | | |
4967 | | static int parse_subexp(Node** top, OnigToken* tok, int term, |
4968 | | UChar** src, UChar* end, ScanEnv* env); |
4969 | | |
4970 | | static int |
4971 | | parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, |
4972 | | ScanEnv* env) |
4973 | 585k | { |
4974 | 585k | int r = 0, num; |
4975 | 585k | Node *target, *work1 = NULL, *work2 = NULL; |
4976 | 585k | OnigOptionType option; |
4977 | 585k | OnigCodePoint c; |
4978 | 585k | OnigEncoding enc = env->enc; |
4979 | | |
4980 | 585k | #ifdef USE_NAMED_GROUP |
4981 | 585k | int list_capture; |
4982 | 585k | #endif |
4983 | | |
4984 | 585k | UChar* p = *src; |
4985 | 585k | PFETCH_READY; |
4986 | | |
4987 | 585k | *np = NULL; |
4988 | 585k | if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; |
4989 | | |
4990 | 585k | option = env->option; |
4991 | 585k | if (PPEEK_IS('?') && |
4992 | 585k | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { |
4993 | 475k | PINC; |
4994 | 475k | if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; |
4995 | | |
4996 | 475k | PFETCH(c); |
4997 | 475k | switch (c) { |
4998 | 186k | case ':': /* (?:...) grouping only */ |
4999 | 186k | group: |
5000 | 186k | r = fetch_token(tok, &p, end, env); |
5001 | 186k | if (r < 0) return r; |
5002 | 186k | r = parse_subexp(np, tok, term, &p, end, env); |
5003 | 186k | if (r < 0) return r; |
5004 | 186k | *src = p; |
5005 | 186k | return 1; /* group */ |
5006 | 0 | break; |
5007 | | |
5008 | 478 | case '=': |
5009 | 478 | *np = onig_node_new_anchor(ANCHOR_PREC_READ); |
5010 | 478 | break; |
5011 | 434 | case '!': /* preceding read */ |
5012 | 434 | *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT); |
5013 | 434 | break; |
5014 | 669 | case '>': /* (?>...) stop backtrack */ |
5015 | 669 | *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK); |
5016 | 669 | break; |
5017 | 517 | case '~': /* (?~...) absent operator */ |
5018 | 517 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_TILDE_ABSENT)) { |
5019 | 517 | *np = node_new_enclose(ENCLOSE_ABSENT); |
5020 | 517 | } |
5021 | 0 | else { |
5022 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5023 | 0 | } |
5024 | 517 | break; |
5025 | | |
5026 | 517 | #ifdef USE_NAMED_GROUP |
5027 | 9.98k | case '\'': |
5028 | 9.98k | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { |
5029 | 9.98k | goto named_group1; |
5030 | 9.98k | } |
5031 | 0 | else |
5032 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5033 | 0 | break; |
5034 | | |
5035 | 0 | # ifdef USE_CAPITAL_P_NAMED_GROUP |
5036 | 0 | case 'P': /* (?P<name>...) */ |
5037 | 0 | if (!PEND && |
5038 | 0 | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) { |
5039 | 0 | PFETCH(c); |
5040 | 0 | if (c == '<') goto named_group1; |
5041 | 0 | } |
5042 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5043 | 0 | break; |
5044 | 0 | # endif |
5045 | 0 | #endif |
5046 | | |
5047 | 239k | case '<': /* look behind (?<=...), (?<!...) */ |
5048 | 239k | if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; |
5049 | 239k | PFETCH(c); |
5050 | 239k | if (c == '=') |
5051 | 599 | *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND); |
5052 | 239k | else if (c == '!') |
5053 | 2.95k | *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT); |
5054 | 236k | #ifdef USE_NAMED_GROUP |
5055 | 236k | else { /* (?<name>...) */ |
5056 | 236k | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { |
5057 | 236k | UChar *name; |
5058 | 236k | UChar *name_end; |
5059 | | |
5060 | 236k | PUNFETCH; |
5061 | 236k | c = '<'; |
5062 | | |
5063 | 246k | named_group1: |
5064 | 246k | list_capture = 0; |
5065 | | |
5066 | 246k | # ifdef USE_CAPTURE_HISTORY |
5067 | 246k | named_group2: |
5068 | 246k | # endif |
5069 | 246k | name = p; |
5070 | 246k | r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0); |
5071 | 246k | if (r < 0) return r; |
5072 | | |
5073 | 246k | num = scan_env_add_mem_entry(env); |
5074 | 246k | if (num < 0) return num; |
5075 | 246k | if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM) |
5076 | 0 | return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; |
5077 | | |
5078 | 246k | r = name_add(env->reg, name, name_end, num, env); |
5079 | 246k | if (r != 0) return r; |
5080 | 246k | *np = node_new_enclose_memory(env->option, 1); |
5081 | 246k | CHECK_NULL_RETURN_MEMERR(*np); |
5082 | 246k | NENCLOSE(*np)->regnum = num; |
5083 | 246k | if (list_capture != 0) |
5084 | 0 | BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num); |
5085 | 246k | env->num_named++; |
5086 | 246k | } |
5087 | 0 | else { |
5088 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5089 | 0 | } |
5090 | 236k | } |
5091 | | #else |
5092 | | else { |
5093 | | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5094 | | } |
5095 | | #endif |
5096 | 249k | break; |
5097 | | |
5098 | 249k | #ifdef USE_CAPTURE_HISTORY |
5099 | 249k | case '@': |
5100 | 0 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) { |
5101 | 0 | # ifdef USE_NAMED_GROUP |
5102 | 0 | if (!PEND && |
5103 | 0 | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { |
5104 | 0 | PFETCH(c); |
5105 | 0 | if (c == '<' || c == '\'') { |
5106 | 0 | list_capture = 1; |
5107 | 0 | goto named_group2; /* (?@<name>...) */ |
5108 | 0 | } |
5109 | 0 | PUNFETCH; |
5110 | 0 | } |
5111 | 0 | # endif |
5112 | 0 | *np = node_new_enclose_memory(env->option, 0); |
5113 | 0 | CHECK_NULL_RETURN_MEMERR(*np); |
5114 | 0 | num = scan_env_add_mem_entry(env); |
5115 | 0 | if (num < 0) return num; |
5116 | 0 | if (num >= (int )BIT_STATUS_BITS_NUM) |
5117 | 0 | return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; |
5118 | | |
5119 | 0 | NENCLOSE(*np)->regnum = num; |
5120 | 0 | BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num); |
5121 | 0 | } |
5122 | 0 | else { |
5123 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5124 | 0 | } |
5125 | 0 | break; |
5126 | 0 | #endif /* USE_CAPTURE_HISTORY */ |
5127 | | |
5128 | 781 | case '(': /* conditional expression: (?(cond)yes), (?(cond)yes|no) */ |
5129 | 781 | if (!PEND && |
5130 | 781 | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_CONDITION)) { |
5131 | 781 | UChar *name = NULL; |
5132 | 781 | UChar *name_end; |
5133 | 781 | PFETCH(c); |
5134 | 781 | if (ONIGENC_IS_CODE_DIGIT(enc, c)) { /* (n) */ |
5135 | 32 | PUNFETCH; |
5136 | 32 | r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &num, 1); |
5137 | 32 | if (r < 0) return r; |
5138 | | #if 0 |
5139 | | /* Relative number is not currently supported. (same as Perl) */ |
5140 | | if (num < 0) { |
5141 | | num = BACKREF_REL_TO_ABS(num, env); |
5142 | | if (num <= 0) |
5143 | | return ONIGERR_INVALID_BACKREF; |
5144 | | } |
5145 | | #endif |
5146 | 32 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { |
5147 | 0 | if (num > env->num_mem || |
5148 | 0 | IS_NULL(SCANENV_MEM_NODES(env)[num])) |
5149 | 0 | return ONIGERR_INVALID_BACKREF; |
5150 | 0 | } |
5151 | 32 | } |
5152 | 749 | #ifdef USE_NAMED_GROUP |
5153 | 749 | else if (c == '<' || c == '\'') { /* (<name>), ('name') */ |
5154 | 749 | name = p; |
5155 | 749 | r = fetch_named_backref_token(c, tok, &p, end, env); |
5156 | 749 | if (r < 0) return r; |
5157 | 748 | if (!PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION; |
5158 | 748 | PINC; |
5159 | | |
5160 | 748 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_USE_LEFT_MOST_NAMED_GROUP)) { |
5161 | 0 | num = tok->u.backref.ref1; |
5162 | 0 | } |
5163 | 748 | else { |
5164 | | /* FIXME: |
5165 | | * Use left most named group for now. This is the same as Perl. |
5166 | | * However this should use the same strategy as normal back- |
5167 | | * references on Ruby syntax; search right to left. */ |
5168 | 748 | int len = tok->u.backref.num; |
5169 | 748 | num = len > 1 ? tok->u.backref.refs[0] : tok->u.backref.ref1; |
5170 | 748 | } |
5171 | 748 | } |
5172 | 0 | #endif |
5173 | 0 | else |
5174 | 0 | return ONIGERR_INVALID_CONDITION_PATTERN; |
5175 | 780 | *np = node_new_enclose(ENCLOSE_CONDITION); |
5176 | 780 | CHECK_NULL_RETURN_MEMERR(*np); |
5177 | 780 | NENCLOSE(*np)->regnum = num; |
5178 | 780 | if (IS_NOT_NULL(name)) NENCLOSE(*np)->state |= NST_NAME_REF; |
5179 | 780 | } |
5180 | 0 | else |
5181 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5182 | 780 | break; |
5183 | | |
5184 | | #if 0 |
5185 | | case '|': /* branch reset: (?|...) */ |
5186 | | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_VBAR_BRANCH_RESET)) { |
5187 | | /* TODO */ |
5188 | | } |
5189 | | else |
5190 | | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5191 | | break; |
5192 | | #endif |
5193 | | |
5194 | 780 | case '^': /* loads default options */ |
5195 | 0 | if (!PEND && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { |
5196 | | /* d-imsx */ |
5197 | 0 | ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); |
5198 | 0 | ONOFF(option, ONIG_OPTION_IGNORECASE, 1); |
5199 | 0 | ONOFF(option, ONIG_OPTION_SINGLELINE, 0); |
5200 | 0 | ONOFF(option, ONIG_OPTION_MULTILINE, 1); |
5201 | 0 | ONOFF(option, ONIG_OPTION_EXTEND, 1); |
5202 | 0 | PFETCH(c); |
5203 | 0 | } |
5204 | | #if 0 |
5205 | | else if (!PEND && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) { |
5206 | | /* d-imx */ |
5207 | | ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); |
5208 | | ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0); |
5209 | | ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0); |
5210 | | ONOFF(option, ONIG_OPTION_IGNORECASE, 1); |
5211 | | ONOFF(option, ONIG_OPTION_MULTILINE, 1); |
5212 | | ONOFF(option, ONIG_OPTION_EXTEND, 1); |
5213 | | PFETCH(c); |
5214 | | } |
5215 | | #endif |
5216 | 0 | else { |
5217 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5218 | 0 | } |
5219 | | /* fall through */ |
5220 | | #ifdef USE_POSIXLINE_OPTION |
5221 | | case 'p': |
5222 | | #endif |
5223 | 36.7k | case '-': case 'i': case 'm': case 's': case 'x': |
5224 | 37.0k | case 'a': case 'd': case 'l': case 'u': |
5225 | 37.0k | { |
5226 | 37.0k | int neg = 0; |
5227 | | |
5228 | 77.0k | while (1) { |
5229 | 77.0k | switch (c) { |
5230 | 30.9k | case ':': |
5231 | 37.0k | case ')': |
5232 | 37.0k | break; |
5233 | | |
5234 | 245 | case '-': neg = 1; break; |
5235 | 36.2k | case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break; |
5236 | 3.22k | case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break; |
5237 | 0 | case 's': |
5238 | 0 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { |
5239 | 0 | ONOFF(option, ONIG_OPTION_MULTILINE, neg); |
5240 | 0 | } |
5241 | 0 | else |
5242 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5243 | 0 | break; |
5244 | | |
5245 | 129 | case 'm': |
5246 | 129 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { |
5247 | 0 | ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0)); |
5248 | 0 | } |
5249 | 129 | else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) { |
5250 | 129 | ONOFF(option, ONIG_OPTION_MULTILINE, neg); |
5251 | 129 | } |
5252 | 0 | else |
5253 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5254 | 129 | break; |
5255 | | #ifdef USE_POSIXLINE_OPTION |
5256 | | case 'p': |
5257 | | ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg); |
5258 | | break; |
5259 | | #endif |
5260 | | |
5261 | 248 | case 'a': /* limits \d, \s, \w and POSIX brackets to ASCII range */ |
5262 | 248 | if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) || |
5263 | 248 | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) && |
5264 | 248 | (neg == 0)) { |
5265 | 248 | ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); |
5266 | 248 | ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1); |
5267 | 248 | ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1); |
5268 | 248 | } |
5269 | 0 | else |
5270 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5271 | 248 | break; |
5272 | | |
5273 | 248 | case 'u': |
5274 | 12 | if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) || |
5275 | 12 | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) && |
5276 | 12 | (neg == 0)) { |
5277 | 12 | ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); |
5278 | 12 | ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1); |
5279 | 12 | ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1); |
5280 | 12 | } |
5281 | 0 | else |
5282 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5283 | 12 | break; |
5284 | | |
5285 | 12 | case 'd': |
5286 | 0 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && |
5287 | 0 | (neg == 0)) { |
5288 | 0 | ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); |
5289 | 0 | } |
5290 | 0 | else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY) && |
5291 | 0 | (neg == 0)) { |
5292 | 0 | ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); |
5293 | 0 | ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0); |
5294 | 0 | ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0); |
5295 | 0 | } |
5296 | 0 | else |
5297 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5298 | 0 | break; |
5299 | | |
5300 | 0 | case 'l': |
5301 | 0 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && (neg == 0)) { |
5302 | 0 | ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); |
5303 | 0 | } |
5304 | 0 | else |
5305 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5306 | 0 | break; |
5307 | | |
5308 | 1 | default: |
5309 | 1 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5310 | 77.0k | } |
5311 | | |
5312 | 77.0k | if (c == ')') { |
5313 | 6.07k | *np = node_new_option(option); |
5314 | 6.07k | CHECK_NULL_RETURN_MEMERR(*np); |
5315 | 6.07k | *src = p; |
5316 | 6.07k | return 2; /* option only */ |
5317 | 6.07k | } |
5318 | 71.0k | else if (c == ':') { |
5319 | 30.9k | OnigOptionType prev = env->option; |
5320 | | |
5321 | 30.9k | env->option = option; |
5322 | 30.9k | r = fetch_token(tok, &p, end, env); |
5323 | 30.9k | if (r < 0) { |
5324 | 0 | env->option = prev; |
5325 | 0 | return r; |
5326 | 0 | } |
5327 | 30.9k | r = parse_subexp(&target, tok, term, &p, end, env); |
5328 | 30.9k | env->option = prev; |
5329 | 30.9k | if (r < 0) return r; |
5330 | 30.9k | *np = node_new_option(option); |
5331 | 30.9k | CHECK_NULL_RETURN_MEMERR(*np); |
5332 | 30.9k | NENCLOSE(*np)->target = target; |
5333 | 30.9k | *src = p; |
5334 | 30.9k | return 0; |
5335 | 30.9k | } |
5336 | | |
5337 | 40.0k | if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; |
5338 | 40.0k | PFETCH(c); |
5339 | 40.0k | } |
5340 | 37.0k | } |
5341 | 0 | break; |
5342 | | |
5343 | 2 | default: |
5344 | 2 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5345 | 475k | } |
5346 | 475k | } |
5347 | 109k | else { |
5348 | 109k | if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP)) |
5349 | 0 | goto group; |
5350 | | |
5351 | 109k | *np = node_new_enclose_memory(env->option, 0); |
5352 | 109k | CHECK_NULL_RETURN_MEMERR(*np); |
5353 | 109k | num = scan_env_add_mem_entry(env); |
5354 | 109k | if (num < 0) return num; |
5355 | 109k | NENCLOSE(*np)->regnum = num; |
5356 | 109k | } |
5357 | | |
5358 | 361k | CHECK_NULL_RETURN_MEMERR(*np); |
5359 | 361k | r = fetch_token(tok, &p, end, env); |
5360 | 361k | if (r < 0) return r; |
5361 | 361k | r = parse_subexp(&target, tok, term, &p, end, env); |
5362 | 361k | if (r < 0) { |
5363 | 26.9k | onig_node_free(target); |
5364 | 26.9k | return r; |
5365 | 26.9k | } |
5366 | | |
5367 | 334k | if (NTYPE(*np) == NT_ANCHOR) |
5368 | 4.45k | NANCHOR(*np)->target = target; |
5369 | 330k | else { |
5370 | 330k | NENCLOSE(*np)->target = target; |
5371 | 330k | if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) { |
5372 | | /* Don't move this to previous of parse_subexp() */ |
5373 | 328k | r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np); |
5374 | 328k | if (r != 0) return r; |
5375 | 328k | } |
5376 | 1.77k | else if (NENCLOSE(*np)->type == ENCLOSE_CONDITION) { |
5377 | 614 | if (NTYPE(target) != NT_ALT) { |
5378 | | /* convert (?(cond)yes) to (?(cond)yes|empty) */ |
5379 | 347 | work1 = node_new_empty(); |
5380 | 347 | if (IS_NULL(work1)) goto err; |
5381 | 347 | work2 = onig_node_new_alt(work1, NULL_NODE); |
5382 | 347 | if (IS_NULL(work2)) goto err; |
5383 | 347 | work1 = onig_node_new_alt(target, work2); |
5384 | 347 | if (IS_NULL(work1)) goto err; |
5385 | 347 | NENCLOSE(*np)->target = work1; |
5386 | 347 | } |
5387 | 614 | } |
5388 | 330k | } |
5389 | | |
5390 | 334k | *src = p; |
5391 | 334k | return 0; |
5392 | | |
5393 | 0 | err: |
5394 | 0 | onig_node_free(work1); |
5395 | 0 | onig_node_free(work2); |
5396 | 0 | onig_node_free(*np); |
5397 | 0 | *np = NULL; |
5398 | 0 | return ONIGERR_MEMORY; |
5399 | 334k | } |
5400 | | |
/* Display text for each "popular" quantifier, used in nested-repeat
 * warnings; set_quantifier() indexes it with the value returned by
 * popular_quantifier_num(). */
static const char *const PopularQStr[] = {
  "?",    /* 0 */
  "*",    /* 1 */
  "+",    /* 2 */
  "??",   /* 3 */
  "*?",   /* 4 */
  "+?"    /* 5 */
};
5404 | | |
/* Display text for the result of merging two nested quantifiers;
 * set_quantifier() indexes it with an entry of ReduceTypeTable. */
static const char *const ReduceQStr[] = {
  "",           /* 0 */
  "",           /* 1 */
  "*",          /* 2 */
  "*?",         /* 3 */
  "??",         /* 4 */
  "+ and ??",   /* 5 */
  "+? and ?"    /* 6 */
};
5408 | | |
5409 | | static int |
5410 | | set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) |
5411 | 1.27M | { |
5412 | 1.27M | QtfrNode* qn; |
5413 | | |
5414 | 1.27M | qn = NQTFR(qnode); |
5415 | 1.27M | if (qn->lower == 1 && qn->upper == 1) { |
5416 | 257 | return 1; |
5417 | 257 | } |
5418 | | |
5419 | 1.27M | switch (NTYPE(target)) { |
5420 | 86.8k | case NT_STR: |
5421 | 86.8k | if (! group) { |
5422 | 55.2k | StrNode* sn = NSTR(target); |
5423 | 55.2k | if (str_node_can_be_split(sn, env->enc)) { |
5424 | 36.2k | Node* n = str_node_split_last_char(sn, env->enc); |
5425 | 36.2k | if (IS_NOT_NULL(n)) { |
5426 | 32.6k | qn->target = n; |
5427 | 32.6k | return 2; |
5428 | 32.6k | } |
5429 | 36.2k | } |
5430 | 55.2k | } |
5431 | 54.1k | break; |
5432 | | |
5433 | 54.1k | case NT_QTFR: |
5434 | 33.6k | { /* check redundant double repeat. */ |
5435 | | /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */ |
5436 | 33.6k | QtfrNode* qnt = NQTFR(target); |
5437 | 33.6k | int nestq_num = popular_quantifier_num(qn); |
5438 | 33.6k | int targetq_num = popular_quantifier_num(qnt); |
5439 | | |
5440 | 33.6k | #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR |
5441 | 33.6k | if (nestq_num >= 0 && targetq_num >= 0 && |
5442 | 33.6k | IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) { |
5443 | 15.9k | switch (ReduceTypeTable[targetq_num][nestq_num]) { |
5444 | 110 | case RQ_ASIS: |
5445 | 110 | break; |
5446 | | |
5447 | 12.0k | case RQ_DEL: |
5448 | 12.0k | if (onig_warn != onig_null_warn) { |
5449 | 0 | onig_syntax_warn(env, "regular expression has redundant nested repeat operator '%s'", |
5450 | 0 | PopularQStr[targetq_num]); |
5451 | 0 | } |
5452 | 12.0k | goto warn_exit; |
5453 | 0 | break; |
5454 | | |
5455 | 3.80k | default: |
5456 | 3.80k | if (onig_warn != onig_null_warn) { |
5457 | 0 | onig_syntax_warn(env, "nested repeat operator '%s' and '%s' was replaced with '%s' in regular expression", |
5458 | 0 | PopularQStr[targetq_num], PopularQStr[nestq_num], |
5459 | 0 | ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]); |
5460 | 0 | } |
5461 | 3.80k | goto warn_exit; |
5462 | 0 | break; |
5463 | 15.9k | } |
5464 | 15.9k | } |
5465 | | |
5466 | 33.6k | warn_exit: |
5467 | 33.6k | #endif |
5468 | 33.6k | if (targetq_num >= 0) { |
5469 | 18.9k | if (nestq_num >= 0) { |
5470 | 15.9k | onig_reduce_nested_quantifier(qnode, target); |
5471 | 15.9k | goto q_exit; |
5472 | 15.9k | } |
5473 | 2.96k | else if (targetq_num == 1 || targetq_num == 2) { /* * or + */ |
5474 | | /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */ |
5475 | 1.18k | if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) { |
5476 | 340 | qn->upper = (qn->lower == 0 ? 1 : qn->lower); |
5477 | 340 | } |
5478 | 1.18k | } |
5479 | 18.9k | } |
5480 | 33.6k | } |
5481 | 17.6k | break; |
5482 | | |
5483 | 1.15M | default: |
5484 | 1.15M | break; |
5485 | 1.27M | } |
5486 | | |
5487 | 1.22M | qn->target = target; |
5488 | 1.23M | q_exit: |
5489 | 1.23M | return 0; |
5490 | 1.22M | } |
5491 | | |
5492 | | |
5493 | | #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS |
5494 | | static int |
5495 | | clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) |
5496 | | { |
5497 | | BBuf *tbuf; |
5498 | | int r; |
5499 | | |
5500 | | if (IS_NCCLASS_NOT(cc)) { |
5501 | | bitset_invert(cc->bs); |
5502 | | |
5503 | | if (! ONIGENC_IS_SINGLEBYTE(enc)) { |
5504 | | r = not_code_range_buf(enc, cc->mbuf, &tbuf); |
5505 | | if (r != 0) return r; |
5506 | | |
5507 | | bbuf_free(cc->mbuf); |
5508 | | cc->mbuf = tbuf; |
5509 | | } |
5510 | | |
5511 | | NCCLASS_CLEAR_NOT(cc); |
5512 | | } |
5513 | | |
5514 | | return 0; |
5515 | | } |
5516 | | #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ |
5517 | | |
5518 | | typedef struct { |
5519 | | ScanEnv* env; |
5520 | | CClassNode* cc; |
5521 | | CClassNode* asc_cc; |
5522 | | Node* alt_root; |
5523 | | Node** ptail; |
5524 | | } IApplyCaseFoldArg; |
5525 | | |
5526 | | static int |
5527 | | i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], |
5528 | | int to_len, void* arg) |
5529 | 10.7M | { |
5530 | 10.7M | IApplyCaseFoldArg* iarg; |
5531 | 10.7M | ScanEnv* env; |
5532 | 10.7M | CClassNode* cc; |
5533 | 10.7M | CClassNode* asc_cc; |
5534 | 10.7M | BitSetRef bs; |
5535 | 10.7M | int add_flag, r; |
5536 | | |
5537 | 10.7M | iarg = (IApplyCaseFoldArg* )arg; |
5538 | 10.7M | env = iarg->env; |
5539 | 10.7M | cc = iarg->cc; |
5540 | 10.7M | asc_cc = iarg->asc_cc; |
5541 | 10.7M | bs = cc->bs; |
5542 | | |
5543 | 10.7M | if (IS_NULL(asc_cc)) { |
5544 | 0 | add_flag = 0; |
5545 | 0 | } |
5546 | 10.7M | else if (ONIGENC_IS_ASCII_CODE(from) == ONIGENC_IS_ASCII_CODE(*to)) { |
5547 | 10.6M | add_flag = 1; |
5548 | 10.6M | } |
5549 | 83.7k | else { |
5550 | 83.7k | add_flag = onig_is_code_in_cc(env->enc, from, asc_cc); |
5551 | 83.7k | if (IS_NCCLASS_NOT(asc_cc)) |
5552 | 20.8k | add_flag = !add_flag; |
5553 | 83.7k | } |
5554 | | |
5555 | 10.7M | if (to_len == 1) { |
5556 | 10.3M | int is_in = onig_is_code_in_cc(env->enc, from, cc); |
5557 | 10.3M | #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS |
5558 | 10.3M | if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) || |
5559 | 10.3M | (is_in == 0 && IS_NCCLASS_NOT(cc))) { |
5560 | 1.36M | if (add_flag) { |
5561 | 1.36M | if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { |
5562 | 1.32M | r = add_code_range0(&(cc->mbuf), env, *to, *to, 0); |
5563 | 1.32M | if (r < 0) return r; |
5564 | 1.32M | } |
5565 | 38.8k | else { |
5566 | 38.8k | BITSET_SET_BIT(bs, *to); |
5567 | 38.8k | } |
5568 | 1.36M | } |
5569 | 1.36M | } |
5570 | | #else |
5571 | | if (is_in != 0) { |
5572 | | if (add_flag) { |
5573 | | if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { |
5574 | | if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc); |
5575 | | r = add_code_range0(&(cc->mbuf), env, *to, *to, 0); |
5576 | | if (r < 0) return r; |
5577 | | } |
5578 | | else { |
5579 | | if (IS_NCCLASS_NOT(cc)) { |
5580 | | BITSET_CLEAR_BIT(bs, *to); |
5581 | | } |
5582 | | else { |
5583 | | BITSET_SET_BIT(bs, *to); |
5584 | | } |
5585 | | } |
5586 | | } |
5587 | | } |
5588 | | #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ |
5589 | 10.3M | } |
5590 | 362k | else { |
5591 | 362k | int r, i, len; |
5592 | 362k | UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; |
5593 | 362k | Node *snode = NULL_NODE; |
5594 | | |
5595 | 362k | if (onig_is_code_in_cc(env->enc, from, cc) |
5596 | 362k | #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS |
5597 | 362k | && !IS_NCCLASS_NOT(cc) |
5598 | 362k | #endif |
5599 | 362k | ) { |
5600 | 137k | for (i = 0; i < to_len; i++) { |
5601 | 93.8k | len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); |
5602 | 93.8k | if (i == 0) { |
5603 | 43.5k | snode = onig_node_new_str(buf, buf + len); |
5604 | 43.5k | CHECK_NULL_RETURN_MEMERR(snode); |
5605 | | |
5606 | | /* char-class expanded multi-char only |
5607 | | compare with string folded at match time. */ |
5608 | 43.5k | NSTRING_SET_AMBIG(snode); |
5609 | 43.5k | } |
5610 | 50.2k | else { |
5611 | 50.2k | r = onig_node_str_cat(snode, buf, buf + len); |
5612 | 50.2k | if (r < 0) { |
5613 | 0 | onig_node_free(snode); |
5614 | 0 | return r; |
5615 | 0 | } |
5616 | 50.2k | } |
5617 | 93.8k | } |
5618 | | |
5619 | 43.5k | *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE); |
5620 | 43.5k | CHECK_NULL_RETURN_MEMERR(*(iarg->ptail)); |
5621 | 43.5k | iarg->ptail = &(NCDR((*(iarg->ptail)))); |
5622 | 43.5k | } |
5623 | 362k | } |
5624 | | |
5625 | 10.7M | return 0; |
5626 | 10.7M | } |
5627 | | |
5628 | | static int |
5629 | | cclass_case_fold(Node** np, CClassNode* cc, CClassNode* asc_cc, ScanEnv* env) |
5630 | 3.48k | { |
5631 | 3.48k | int r; |
5632 | 3.48k | IApplyCaseFoldArg iarg; |
5633 | | |
5634 | 3.48k | iarg.env = env; |
5635 | 3.48k | iarg.cc = cc; |
5636 | 3.48k | iarg.asc_cc = asc_cc; |
5637 | 3.48k | iarg.alt_root = NULL_NODE; |
5638 | 3.48k | iarg.ptail = &(iarg.alt_root); |
5639 | | |
5640 | 3.48k | r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag, |
5641 | 3.48k | i_apply_case_fold, &iarg); |
5642 | 3.48k | if (r != 0) { |
5643 | 0 | onig_node_free(iarg.alt_root); |
5644 | 0 | return r; |
5645 | 0 | } |
5646 | 3.48k | if (IS_NOT_NULL(iarg.alt_root)) { |
5647 | 426 | Node* work = onig_node_new_alt(*np, iarg.alt_root); |
5648 | 426 | if (IS_NULL(work)) { |
5649 | 0 | onig_node_free(iarg.alt_root); |
5650 | 0 | return ONIGERR_MEMORY; |
5651 | 0 | } |
5652 | 426 | *np = work; |
5653 | 426 | } |
5654 | 3.48k | return r; |
5655 | 3.48k | } |
5656 | | |
5657 | | static int |
5658 | | node_linebreak(Node** np, ScanEnv* env) |
5659 | 1.00k | { |
5660 | | /* same as (?>\x0D\x0A|[\x0A-\x0D\x{85}\x{2028}\x{2029}]) */ |
5661 | 1.00k | Node* left = NULL; |
5662 | 1.00k | Node* right = NULL; |
5663 | 1.00k | Node* target1 = NULL; |
5664 | 1.00k | Node* target2 = NULL; |
5665 | 1.00k | CClassNode* cc; |
5666 | 1.00k | int num1, num2, r; |
5667 | 1.00k | UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2]; |
5668 | | |
5669 | | /* \x0D\x0A */ |
5670 | 1.00k | num1 = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf); |
5671 | 1.00k | if (num1 < 0) return num1; |
5672 | 1.00k | num2 = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1); |
5673 | 1.00k | if (num2 < 0) return num2; |
5674 | 1.00k | left = node_new_str_raw(buf, buf + num1 + num2); |
5675 | 1.00k | if (IS_NULL(left)) goto err; |
5676 | | |
5677 | | /* [\x0A-\x0D] or [\x0A-\x0D\x{85}\x{2028}\x{2029}] */ |
5678 | 1.00k | right = node_new_cclass(); |
5679 | 1.00k | if (IS_NULL(right)) goto err; |
5680 | 1.00k | cc = NCCLASS(right); |
5681 | 1.00k | if (ONIGENC_MBC_MINLEN(env->enc) > 1) { |
5682 | 0 | r = add_code_range(&(cc->mbuf), env, 0x0A, 0x0D); |
5683 | 0 | if (r != 0) goto err; |
5684 | 0 | } |
5685 | 1.00k | else { |
5686 | 1.00k | bitset_set_range(env, cc->bs, 0x0A, 0x0D); |
5687 | 1.00k | } |
5688 | | |
5689 | | /* TODO: move this block to enc/unicode.c */ |
5690 | 1.00k | if (ONIGENC_IS_UNICODE(env->enc)) { |
5691 | | /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */ |
5692 | 1.00k | r = add_code_range(&(cc->mbuf), env, 0x85, 0x85); |
5693 | 1.00k | if (r != 0) goto err; |
5694 | 1.00k | r = add_code_range(&(cc->mbuf), env, 0x2028, 0x2029); |
5695 | 1.00k | if (r != 0) goto err; |
5696 | 1.00k | } |
5697 | | |
5698 | | /* ...|... */ |
5699 | 1.00k | target1 = onig_node_new_alt(right, NULL_NODE); |
5700 | 1.00k | if (IS_NULL(target1)) goto err; |
5701 | 1.00k | right = NULL; |
5702 | 1.00k | target2 = onig_node_new_alt(left, target1); |
5703 | 1.00k | if (IS_NULL(target2)) goto err; |
5704 | 1.00k | left = NULL; |
5705 | 1.00k | target1 = NULL; |
5706 | | |
5707 | | /* (?>...) */ |
5708 | 1.00k | *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK); |
5709 | 1.00k | if (IS_NULL(*np)) goto err; |
5710 | 1.00k | NENCLOSE(*np)->target = target2; |
5711 | 1.00k | return ONIG_NORMAL; |
5712 | | |
5713 | 0 | err: |
5714 | 0 | onig_node_free(left); |
5715 | 0 | onig_node_free(right); |
5716 | 0 | onig_node_free(target1); |
5717 | 0 | onig_node_free(target2); |
5718 | 0 | return ONIGERR_MEMORY; |
5719 | 1.00k | } |
5720 | | |
5721 | | static int |
5722 | | propname2ctype(ScanEnv* env, const char* propname) |
5723 | 2.64M | { |
5724 | 2.64M | UChar* name = (UChar* )propname; |
5725 | 2.64M | UChar* name_end = name + strlen(propname); |
5726 | 2.64M | int ctype = env->enc->property_name_to_ctype(ONIG_ENCODING_ASCII, |
5727 | 2.64M | name, name_end); |
5728 | 2.64M | if (ctype < 0) { |
5729 | 0 | onig_scan_env_set_error_string(env, ctype, name, name_end); |
5730 | 0 | } |
5731 | 2.64M | return ctype; |
5732 | 2.64M | } |
5733 | | |
5734 | | static int |
5735 | | add_property_to_cc(CClassNode* cc, const char* propname, int not, ScanEnv* env) |
5736 | 2.50M | { |
5737 | 2.50M | int ctype = propname2ctype(env, propname); |
5738 | 2.50M | if (ctype < 0) return ctype; |
5739 | 2.50M | return add_ctype_to_cc(cc, ctype, not, 0, env); |
5740 | 2.50M | } |
5741 | | |
5742 | | /* |
5743 | | * helper methods for node_extended_grapheme_cluster (/\X/) |
5744 | | */ |
5745 | | static int |
5746 | | create_property_node(Node **np, ScanEnv* env, const char* propname) |
5747 | 2.05M | { |
5748 | 2.05M | int r; |
5749 | 2.05M | CClassNode* cc; |
5750 | | |
5751 | 2.05M | *np = node_new_cclass(); |
5752 | 2.05M | if (IS_NULL(*np)) return ONIGERR_MEMORY; |
5753 | 2.05M | cc = NCCLASS(*np); |
5754 | 2.05M | r = add_property_to_cc(cc, propname, 0, env); |
5755 | 2.05M | if (r != 0) |
5756 | 0 | onig_node_free(*np); |
5757 | 2.05M | return r; |
5758 | 2.05M | } |
5759 | | |
5760 | | static int |
5761 | | quantify_node(Node **np, int lower, int upper) |
5762 | 1.61M | { |
5763 | 1.61M | Node* tmp = node_new_quantifier(lower, upper, 0); |
5764 | 1.61M | if (IS_NULL(tmp)) return ONIGERR_MEMORY; |
5765 | 1.61M | NQTFR(tmp)->target = *np; |
5766 | 1.61M | *np = tmp; |
5767 | 1.61M | return 0; |
5768 | 1.61M | } |
5769 | | |
5770 | | static int |
5771 | | quantify_property_node(Node **np, ScanEnv* env, const char* propname, char repetitions) |
5772 | 1.32M | { |
5773 | 1.32M | int r; |
5774 | 1.32M | int lower = 0; |
5775 | 1.32M | int upper = REPEAT_INFINITE; |
5776 | | |
5777 | 1.32M | r = create_property_node(np, env, propname); |
5778 | 1.32M | if (r != 0) return r; |
5779 | 1.32M | switch (repetitions) { |
5780 | 0 | case '?': upper = 1; break; |
5781 | 441k | case '+': lower = 1; break; |
5782 | 735k | case '*': break; |
5783 | 147k | case '2': lower = upper = 2; break; |
5784 | 0 | default : return ONIGERR_PARSER_BUG; |
5785 | 1.32M | } |
5786 | 1.32M | return quantify_node(np, lower, upper); |
5787 | 1.32M | } |
5788 | | |
5789 | 3.82M | #define LIST 0 |
5790 | | #define ALT 1 |
5791 | | |
5792 | | /* IMPORTANT: Make sure node_array ends with NULL_NODE */ |
5793 | | static int |
5794 | | create_node_from_array(int kind, Node **np, Node **node_array) |
5795 | 1.17M | { |
5796 | 1.17M | Node* tmp = NULL_NODE; |
5797 | 1.17M | int i = 0; |
5798 | | |
5799 | 5.00M | while (node_array[i] != NULL_NODE) i++; |
5800 | 5.00M | while (--i >= 0) { |
5801 | 3.82M | *np = kind==LIST ? node_new_list(node_array[i], tmp) |
5802 | 3.82M | : onig_node_new_alt(node_array[i], tmp); |
5803 | 3.82M | if (IS_NULL(*np)) { |
5804 | 0 | while (i >= 0) { |
5805 | 0 | onig_node_free(node_array[i]); |
5806 | 0 | node_array[i--] = NULL_NODE; |
5807 | 0 | } |
5808 | 0 | onig_node_free(tmp); |
5809 | 0 | return ONIGERR_MEMORY; |
5810 | 0 | } |
5811 | 3.82M | else |
5812 | 3.82M | node_array[i] = NULL_NODE; |
5813 | 3.82M | tmp = *np; |
5814 | 3.82M | } |
5815 | 1.17M | return 0; |
5816 | 1.17M | } |
5817 | | |
5818 | 4.11M | #define R_ERR(call) r=(call);if(r!=0)goto err |
5819 | | |
5820 | | /* Memory layout for common node array: |
5821 | | * The main purpose is to be able to easily free all leftover nodes |
5822 | | * after an error. As a side effect, we share some memory. |
5823 | | * |
5824 | | * The layout is as shown below (each line corresponds to one call of |
5825 | | * create_node_from_array()). Because create_node_from_array sets all |
5826 | | * nodes of the source to NULL_NODE, we can overlap the target array |
5827 | | * as long as we do not override the actual target location. |
5828 | | * |
5829 | | * Target Array name Index |
5830 | | * |
5831 | | * node_array 0 1 2 3 4 5 6 7 8 9 A B C D E F |
5832 | | * top_alts alts[5] 0 1 2 3 4* |
5833 | | * alts+1 list[4] 0 1 2 3* |
5834 | | * list+1 core_alts[7] 0 1 2 3 4 5 6* |
5835 | | * core_alts+0 H_list[4] 0 1 2 3* |
5836 | | * H_list+1 H_alt2[4] 0 1 2 3* |
5837 | | * h_alt2+1 H_list2[3] 0 1 2* |
5838 | | * core_alts+4 XP_list[4] 0 1 2 3* |
5839 | | * XP_list+1 Ex_list[4] 0 1 2 3* |
5840 | | */ |
5841 | 2.50M | #define NODE_COMMON_SIZE 16 |
5842 | | |
5843 | | static int |
5844 | | node_extended_grapheme_cluster(Node** np, ScanEnv* env) |
5845 | 147k | { |
5846 | 147k | Node* tmp = NULL; |
5847 | 147k | Node* np1 = NULL; |
5848 | 147k | Node* top_alt = NULL; |
5849 | 147k | int r = 0; |
5850 | 147k | int num1; |
5851 | 147k | int i; |
5852 | 147k | int any_target_position; |
5853 | 147k | UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2]; |
5854 | 147k | OnigOptionType option; |
5855 | | /* node_common is function-global so that we can free all nodes |
5856 | | * in case of error. Unused slots are set to NULL_NODE at all times. */ |
5857 | 147k | Node *node_common[NODE_COMMON_SIZE]; |
5858 | 147k | Node **alts = node_common+0; /* size: 5 */ |
5859 | | |
5860 | 2.50M | for (i=0; i<NODE_COMMON_SIZE; i++) |
5861 | 2.35M | node_common[i] = NULL_NODE; |
5862 | | |
5863 | | /* CRLF, common for both Unicode and non-Unicode */ |
5864 | | /* \x0D\x0A */ |
5865 | 147k | r = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf); |
5866 | 147k | if (r < 0) goto err; |
5867 | 147k | num1 = r; |
5868 | 147k | r = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1); |
5869 | 147k | if (r < 0) goto err; |
5870 | 147k | alts[0] = node_new_str_raw(buf, buf + num1 + r); |
5871 | 147k | if (IS_NULL(alts[0])) goto err; |
5872 | | |
5873 | 147k | #ifdef USE_UNICODE_PROPERTIES |
5874 | 147k | if (ONIGENC_IS_UNICODE(env->enc)) { /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */ |
5875 | 147k | CClassNode* cc; |
5876 | | |
5877 | 147k | if (propname2ctype(env, "Grapheme_Cluster_Break=Extend") < 0) goto err; |
5878 | | /* Unicode 11.0.0 |
5879 | | * CRLF (already done) |
5880 | | * | [Control CR LF] |
5881 | | * | precore* core postcore* |
5882 | | * | . (to catch invalid stuff, because this seems to be spec for String#grapheme_clusters) */ |
5883 | | |
5884 | | /* [Control CR LF] (CR and LF are not in the spec, but this is a conformed fix) */ |
5885 | 147k | alts[1] = node_new_cclass(); |
5886 | 147k | if (IS_NULL(alts[1])) goto err; |
5887 | 147k | cc = NCCLASS(alts[1]); |
5888 | 147k | R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env)); |
5889 | 147k | if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */ |
5890 | 0 | R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */ |
5891 | 0 | R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */ |
5892 | 0 | } |
5893 | 147k | else { |
5894 | 147k | BITSET_SET_BIT(cc->bs, 0x0a); |
5895 | 147k | BITSET_SET_BIT(cc->bs, 0x0d); |
5896 | 147k | } |
5897 | | |
5898 | | /* precore* core postcore* */ |
5899 | 147k | { |
5900 | 147k | Node **list = alts + 3; /* size: 4 */ |
5901 | | |
5902 | | /* precore*; precore := Prepend */ |
5903 | 147k | R_ERR(quantify_property_node(list+0, env, "Grapheme_Cluster_Break=Prepend", '*')); |
5904 | | |
5905 | | /* core := hangul-syllable |
5906 | | * | ri-sequence |
5907 | | * | xpicto-sequence |
5908 | | * | [^Control CR LF] */ |
5909 | 147k | { |
5910 | 147k | Node **core_alts = list + 2; /* size: 7 */ |
5911 | | |
5912 | | /* hangul-syllable := |
5913 | | * L* (V+ | LV V* | LVT) T* |
5914 | | * | L+ |
5915 | | * | T+ */ |
5916 | | /* hangul-syllable is an alternative (would be called H_alt) |
5917 | | * inside an alternative, but we flatten it into core_alts */ |
5918 | | |
5919 | | /* L* (V+ | LV V* | LVT) T* */ |
5920 | 147k | { |
5921 | 147k | Node **H_list = core_alts + 1; /* size: 4 */ |
5922 | 147k | R_ERR(quantify_property_node(H_list+0, env, "Grapheme_Cluster_Break=L", '*')); |
5923 | | |
5924 | | /* V+ | LV V* | LVT */ |
5925 | 147k | { |
5926 | 147k | Node **H_alt2 = H_list + 2; /* size: 4 */ |
5927 | 147k | R_ERR(quantify_property_node(H_alt2+0, env, "Grapheme_Cluster_Break=V", '+')); |
5928 | | |
5929 | | /* LV V* */ |
5930 | 147k | { |
5931 | 147k | Node **H_list2 = H_alt2 + 2; /* size: 3 */ |
5932 | | |
5933 | 147k | R_ERR(create_property_node(H_list2+0, env, "Grapheme_Cluster_Break=LV")); |
5934 | 147k | R_ERR(quantify_property_node(H_list2+1, env, "Grapheme_Cluster_Break=V", '*')); |
5935 | 147k | R_ERR(create_node_from_array(LIST, H_alt2+1, H_list2)); |
5936 | 147k | } |
5937 | | |
5938 | 147k | R_ERR(create_property_node(H_alt2+2, env, "Grapheme_Cluster_Break=LVT")); |
5939 | 147k | R_ERR(create_node_from_array(ALT, H_list+1, H_alt2)); |
5940 | 147k | } |
5941 | | |
5942 | 147k | R_ERR(quantify_property_node(H_list+2, env, "Grapheme_Cluster_Break=T", '*')); |
5943 | 147k | R_ERR(create_node_from_array(LIST, core_alts+0, H_list)); |
5944 | 147k | } |
5945 | | |
5946 | 147k | R_ERR(quantify_property_node(core_alts+1, env, "Grapheme_Cluster_Break=L", '+')); |
5947 | 147k | R_ERR(quantify_property_node(core_alts+2, env, "Grapheme_Cluster_Break=T", '+')); |
5948 | | /* end of hangul-syllable */ |
5949 | | |
5950 | | /* ri-sequence := RI RI */ |
5951 | 147k | R_ERR(quantify_property_node(core_alts+3, env, "Regional_Indicator", '2')); |
5952 | | |
5953 | | /* xpicto-sequence := \p{Extended_Pictographic} (Extend* ZWJ \p{Extended_Pictographic})* */ |
5954 | 147k | { |
5955 | 147k | Node **XP_list = core_alts + 5; /* size: 3 */ |
5956 | 147k | R_ERR(create_property_node(XP_list+0, env, "Extended_Pictographic")); |
5957 | | |
5958 | | /* (Extend* ZWJ \p{Extended_Pictographic})* */ |
5959 | 147k | { |
5960 | 147k | Node **Ex_list = XP_list + 2; /* size: 4 */ |
5961 | | /* assert(Ex_list+4 == node_common+NODE_COMMON_SIZE); */ |
5962 | 147k | R_ERR(quantify_property_node(Ex_list+0, env, "Grapheme_Cluster_Break=Extend", '*')); |
5963 | | |
5964 | | /* ZWJ (ZERO WIDTH JOINER) */ |
5965 | 147k | r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); |
5966 | 147k | if (r < 0) goto err; |
5967 | 147k | Ex_list[1] = node_new_str_raw(buf, buf + r); |
5968 | 147k | if (IS_NULL(Ex_list[1])) goto err; |
5969 | | |
5970 | 147k | R_ERR(create_property_node(Ex_list+2, env, "Extended_Pictographic")); |
5971 | 147k | R_ERR(create_node_from_array(LIST, XP_list+1, Ex_list)); |
5972 | 147k | } |
5973 | 147k | R_ERR(quantify_node(XP_list+1, 0, REPEAT_INFINITE)); /* TODO: Check about node freeing */ |
5974 | | |
5975 | 147k | R_ERR(create_node_from_array(LIST, core_alts+4, XP_list)); |
5976 | 147k | } |
5977 | | |
5978 | | /* [^Control CR LF] */ |
5979 | 147k | core_alts[5] = node_new_cclass(); |
5980 | 147k | if (IS_NULL(core_alts[5])) goto err; |
5981 | 147k | cc = NCCLASS(core_alts[5]); |
5982 | 147k | if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */ |
5983 | 0 | BBuf *inverted_buf = NULL; |
5984 | | |
5985 | | /* TODO: fix false warning */ |
5986 | 0 | const int dup_not_warned = env->warnings_flag | ~ONIG_SYN_WARN_CC_DUP; |
5987 | 0 | env->warnings_flag |= ONIG_SYN_WARN_CC_DUP; |
5988 | | |
5989 | | /* Start with a positive buffer and invert at the end. |
5990 | | * Otherwise, adding single-character ranges work the wrong way. */ |
5991 | 0 | R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env)); |
5992 | 0 | R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */ |
5993 | 0 | R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */ |
5994 | 0 | R_ERR(not_code_range_buf(env->enc, cc->mbuf, &inverted_buf, env)); |
5995 | 0 | cc->mbuf = inverted_buf; /* TODO: check what to do with buffer before inversion */ |
5996 | |
|
5997 | 0 | env->warnings_flag &= dup_not_warned; /* TODO: fix false warning */ |
5998 | 0 | } |
5999 | 147k | else { |
6000 | 147k | R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 1, env)); |
6001 | 147k | BITSET_CLEAR_BIT(cc->bs, 0x0a); |
6002 | 147k | BITSET_CLEAR_BIT(cc->bs, 0x0d); |
6003 | 147k | } |
6004 | | |
6005 | 147k | R_ERR(create_node_from_array(ALT, list+1, core_alts)); |
6006 | 147k | } |
6007 | | |
6008 | | /* postcore*; postcore = [Extend ZWJ SpacingMark] */ |
6009 | 147k | R_ERR(create_property_node(list+2, env, "Grapheme_Cluster_Break=Extend")); |
6010 | 147k | cc = NCCLASS(list[2]); |
6011 | 147k | R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=SpacingMark", 0, env)); |
6012 | 147k | R_ERR(add_code_range(&(cc->mbuf), env, 0x200D, 0x200D)); |
6013 | 147k | R_ERR(quantify_node(list+2, 0, REPEAT_INFINITE)); |
6014 | | |
6015 | 147k | R_ERR(create_node_from_array(LIST, alts+2, list)); |
6016 | 147k | } |
6017 | | |
6018 | 147k | any_target_position = 3; |
6019 | 147k | } |
6020 | 0 | else |
6021 | 0 | #endif /* USE_UNICODE_PROPERTIES */ |
6022 | 0 | { |
6023 | 0 | any_target_position = 1; |
6024 | 0 | } |
6025 | | |
6026 | | /* PerlSyntax: (?s:.), RubySyntax: (?m:.), common for both Unicode and non-Unicode */ |
6027 | | /* Not in Unicode spec (UAX #29), but added to catch invalid stuff, |
6028 | | * because this is Ruby spec for String#grapheme_clusters. */ |
6029 | 147k | np1 = node_new_anychar(); |
6030 | 147k | if (IS_NULL(np1)) goto err; |
6031 | | |
6032 | 147k | option = env->option; |
6033 | 147k | ONOFF(option, ONIG_OPTION_MULTILINE, 0); |
6034 | 147k | tmp = node_new_option(option); |
6035 | 147k | if (IS_NULL(tmp)) goto err; |
6036 | 147k | NENCLOSE(tmp)->target = np1; |
6037 | 147k | alts[any_target_position] = tmp; |
6038 | 147k | np1 = NULL; |
6039 | | |
6040 | 147k | R_ERR(create_node_from_array(ALT, &top_alt, alts)); |
6041 | | |
6042 | | /* (?>): For efficiency, because there is no text piece |
6043 | | * that is not in a grapheme cluster, and there is only one way |
6044 | | * to split a string into grapheme clusters. */ |
6045 | 147k | tmp = node_new_enclose(ENCLOSE_STOP_BACKTRACK); |
6046 | 147k | if (IS_NULL(tmp)) goto err; |
6047 | 147k | NENCLOSE(tmp)->target = top_alt; |
6048 | 147k | np1 = tmp; |
6049 | | |
6050 | 147k | #ifdef USE_UNICODE_PROPERTIES |
6051 | 147k | if (ONIGENC_IS_UNICODE(env->enc)) { |
6052 | | /* Don't ignore case. */ |
6053 | 147k | option = env->option; |
6054 | 147k | ONOFF(option, ONIG_OPTION_IGNORECASE, 1); |
6055 | 147k | *np = node_new_option(option); |
6056 | 147k | if (IS_NULL(*np)) goto err; |
6057 | 147k | NENCLOSE(*np)->target = np1; |
6058 | 147k | } |
6059 | 0 | else |
6060 | 0 | #endif |
6061 | 0 | { |
6062 | 0 | *np = np1; |
6063 | 0 | } |
6064 | 147k | return ONIG_NORMAL; |
6065 | | |
6066 | 0 | err: |
6067 | 0 | onig_node_free(np1); |
6068 | 0 | for (i=0; i<NODE_COMMON_SIZE; i++) |
6069 | 0 | onig_node_free(node_common[i]); |
6070 | 0 | return (r == 0) ? ONIGERR_MEMORY : r; |
6071 | 147k | } |
6072 | | #undef R_ERR |
6073 | | |
/*
 * Return the number of set bits in the low 32 bits' worth of `bits`,
 * using the parallel pairwise-sum method: each pass adds adjacent fields
 * of width 1, 2, 4, 8 and 16 bits.
 */
static int
countbits(unsigned int bits)
{
  static const unsigned int field_mask[] = {
    0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff, 0x0000ffff
  };
  unsigned int width = 1;
  int pass;

  for (pass = 0; pass < 5; pass++) {
    bits = (bits & field_mask[pass]) + ((bits >> width) & field_mask[pass]);
    width <<= 1;
  }
  return bits;
}
6083 | | |
6084 | | static int |
6085 | | is_onechar_cclass(CClassNode* cc, OnigCodePoint* code) |
6086 | 516k | { |
6087 | 516k | const OnigCodePoint not_found = ONIG_LAST_CODE_POINT; |
6088 | 516k | OnigCodePoint c = not_found; |
6089 | 516k | int i; |
6090 | 516k | BBuf *bbuf = cc->mbuf; |
6091 | | |
6092 | 516k | if (IS_NCCLASS_NOT(cc)) return 0; |
6093 | | |
6094 | | /* check bbuf */ |
6095 | 282k | if (IS_NOT_NULL(bbuf)) { |
6096 | 1.37k | OnigCodePoint n, *data; |
6097 | 1.37k | GET_CODE_POINT(n, bbuf->p); |
6098 | 1.37k | data = (OnigCodePoint* )(bbuf->p) + 1; |
6099 | 1.37k | if ((n == 1) && (data[0] == data[1])) { |
6100 | | /* only one char found in the bbuf, save the code point. */ |
6101 | 798 | c = data[0]; |
6102 | 798 | if (((c < SINGLE_BYTE_SIZE) && BITSET_AT(cc->bs, c))) { |
6103 | | /* skip if c is included in the bitset */ |
6104 | 0 | c = not_found; |
6105 | 0 | } |
6106 | 798 | } |
6107 | 577 | else { |
6108 | 577 | return 0; /* the bbuf contains multiple chars */ |
6109 | 577 | } |
6110 | 1.37k | } |
6111 | | |
6112 | | /* check bitset */ |
6113 | 510k | for (i = 0; i < BITSET_SIZE; i++) { |
6114 | 510k | Bits b1 = cc->bs[i]; |
6115 | 510k | if (b1 != 0) { |
6116 | 500k | if (((b1 & (b1 - 1)) == 0) && (c == not_found)) { |
6117 | 218k | c = BITS_IN_ROOM * i + countbits(b1 - 1); |
6118 | 281k | } else { |
6119 | 281k | return 0; /* the character class contains multiple chars */ |
6120 | 281k | } |
6121 | 500k | } |
6122 | 510k | } |
6123 | | |
6124 | 654 | if (c != not_found) { |
6125 | 479 | *code = c; |
6126 | 479 | return 1; |
6127 | 479 | } |
6128 | | |
6129 | | /* the character class contains no char. */ |
6130 | 175 | return 0; |
6131 | 654 | } |
6132 | | |
6133 | | |
6134 | | static int |
6135 | | parse_exp(Node** np, OnigToken* tok, int term, |
6136 | | UChar** src, UChar* end, ScanEnv* env) |
6137 | 4.75M | { |
6138 | 4.75M | int r, len, group = 0; |
6139 | 4.75M | Node* qn; |
6140 | 4.75M | Node** targetp; |
6141 | 4.75M | unsigned int parse_depth; |
6142 | | |
6143 | 4.75M | *np = NULL; |
6144 | 4.75M | if (tok->type == (enum TokenSyms )term) |
6145 | 47.8k | goto end_of_token; |
6146 | | |
6147 | 4.70M | parse_depth = env->parse_depth; |
6148 | | |
6149 | 4.70M | switch (tok->type) { |
6150 | 14.8k | case TK_ALT: |
6151 | 14.8k | case TK_EOT: |
6152 | 62.7k | end_of_token: |
6153 | 62.7k | *np = node_new_empty(); |
6154 | 62.7k | return tok->type; |
6155 | 0 | break; |
6156 | | |
6157 | 585k | case TK_SUBEXP_OPEN: |
6158 | 585k | r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env); |
6159 | 585k | if (r < 0) return r; |
6160 | 558k | if (r == 1) group = 1; |
6161 | 371k | else if (r == 2) { /* option only */ |
6162 | 6.07k | Node* target; |
6163 | 6.07k | OnigOptionType prev = env->option; |
6164 | | |
6165 | 6.07k | env->option = NENCLOSE(*np)->option; |
6166 | 6.07k | r = fetch_token(tok, src, end, env); |
6167 | 6.07k | if (r < 0) { |
6168 | 0 | env->option = prev; |
6169 | 0 | return r; |
6170 | 0 | } |
6171 | 6.07k | r = parse_subexp(&target, tok, term, src, end, env); |
6172 | 6.07k | env->option = prev; |
6173 | 6.07k | if (r < 0) { |
6174 | 469 | onig_node_free(target); |
6175 | 469 | return r; |
6176 | 469 | } |
6177 | 5.60k | NENCLOSE(*np)->target = target; |
6178 | 5.60k | return tok->type; |
6179 | 6.07k | } |
6180 | 551k | break; |
6181 | | |
6182 | 551k | case TK_SUBEXP_CLOSE: |
6183 | 18 | if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP)) |
6184 | 18 | return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS; |
6185 | | |
6186 | 0 | if (tok->escaped) goto tk_raw_byte; |
6187 | 0 | else goto tk_byte; |
6188 | 0 | break; |
6189 | | |
6190 | 1.00k | case TK_LINEBREAK: |
6191 | 1.00k | r = node_linebreak(np, env); |
6192 | 1.00k | if (r < 0) return r; |
6193 | 1.00k | break; |
6194 | | |
6195 | 147k | case TK_EXTENDED_GRAPHEME_CLUSTER: |
6196 | 147k | r = node_extended_grapheme_cluster(np, env); |
6197 | 147k | if (r < 0) return r; |
6198 | 147k | break; |
6199 | | |
6200 | 147k | case TK_KEEP: |
6201 | 168 | *np = onig_node_new_anchor(ANCHOR_KEEP); |
6202 | 168 | CHECK_NULL_RETURN_MEMERR(*np); |
6203 | 168 | break; |
6204 | | |
6205 | 1.64M | case TK_STRING: |
6206 | 1.64M | tk_byte: |
6207 | 1.64M | { |
6208 | 1.64M | *np = node_new_str(tok->backp, *src); |
6209 | 1.64M | CHECK_NULL_RETURN_MEMERR(*np); |
6210 | | |
6211 | 1.64M | string_loop: |
6212 | 12.5M | while (1) { |
6213 | 12.5M | r = fetch_token(tok, src, end, env); |
6214 | 12.5M | if (r < 0) return r; |
6215 | 12.5M | if (r == TK_STRING) { |
6216 | 10.9M | r = onig_node_str_cat(*np, tok->backp, *src); |
6217 | 10.9M | } |
6218 | 1.64M | #ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG |
6219 | 1.64M | else if (r == TK_CODE_POINT) { |
6220 | 1.82k | r = node_str_cat_codepoint(*np, env->enc, tok->u.code); |
6221 | 1.82k | } |
6222 | 1.64M | #endif |
6223 | 1.64M | else { |
6224 | 1.64M | break; |
6225 | 1.64M | } |
6226 | 10.9M | if (r < 0) return r; |
6227 | 10.9M | } |
6228 | | |
6229 | 1.64M | string_end: |
6230 | 1.64M | targetp = np; |
6231 | 1.64M | goto repeat; |
6232 | 1.64M | } |
6233 | 0 | break; |
6234 | | |
6235 | 4.19k | case TK_RAW_BYTE: |
6236 | 4.19k | tk_raw_byte: |
6237 | 4.19k | { |
6238 | 4.19k | *np = node_new_str_raw_char((UChar )tok->u.c); |
6239 | 4.19k | CHECK_NULL_RETURN_MEMERR(*np); |
6240 | 4.19k | len = 1; |
6241 | 4.19k | while (1) { |
6242 | 4.19k | if (len >= ONIGENC_MBC_MINLEN(env->enc)) { |
6243 | 4.19k | if (len == enclen(env->enc, NSTR(*np)->s, NSTR(*np)->end)) { |
6244 | 4.19k | r = fetch_token(tok, src, end, env); |
6245 | 4.19k | NSTRING_CLEAR_RAW(*np); |
6246 | 4.19k | goto string_end; |
6247 | 4.19k | } |
6248 | 4.19k | } |
6249 | | |
6250 | 0 | r = fetch_token(tok, src, end, env); |
6251 | 0 | if (r < 0) return r; |
6252 | 0 | if (r != TK_RAW_BYTE) { |
6253 | | /* Don't use this, it is wrong for little endian encodings. */ |
6254 | | #ifdef USE_PAD_TO_SHORT_BYTE_CHAR |
6255 | | int rem; |
6256 | | if (len < ONIGENC_MBC_MINLEN(env->enc)) { |
6257 | | rem = ONIGENC_MBC_MINLEN(env->enc) - len; |
6258 | | (void )node_str_head_pad(NSTR(*np), rem, (UChar )0); |
6259 | | if (len + rem == enclen(env->enc, NSTR(*np)->s)) { |
6260 | | NSTRING_CLEAR_RAW(*np); |
6261 | | goto string_end; |
6262 | | } |
6263 | | } |
6264 | | #endif |
6265 | 0 | return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; |
6266 | 0 | } |
6267 | | |
6268 | 0 | r = node_str_cat_char(*np, (UChar )tok->u.c); |
6269 | 0 | if (r < 0) return r; |
6270 | | |
6271 | 0 | len++; |
6272 | 0 | } |
6273 | 4.19k | } |
6274 | 0 | break; |
6275 | | |
6276 | 462 | case TK_CODE_POINT: |
6277 | 462 | { |
6278 | 462 | *np = node_new_empty(); |
6279 | 462 | CHECK_NULL_RETURN_MEMERR(*np); |
6280 | 462 | r = node_str_cat_codepoint(*np, env->enc, tok->u.code); |
6281 | 462 | if (r != 0) return r; |
6282 | | #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG |
6283 | | NSTRING_SET_RAW(*np); |
6284 | | #else |
6285 | 462 | goto string_loop; |
6286 | 462 | #endif |
6287 | 462 | } |
6288 | 462 | break; |
6289 | | |
6290 | 462 | case TK_QUOTE_OPEN: |
6291 | 0 | { |
6292 | 0 | OnigCodePoint end_op[2]; |
6293 | 0 | UChar *qstart, *qend, *nextp; |
6294 | |
|
6295 | 0 | end_op[0] = (OnigCodePoint )MC_ESC(env->syntax); |
6296 | 0 | end_op[1] = (OnigCodePoint )'E'; |
6297 | 0 | qstart = *src; |
6298 | 0 | qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc); |
6299 | 0 | if (IS_NULL(qend)) { |
6300 | 0 | nextp = qend = end; |
6301 | 0 | } |
6302 | 0 | *np = node_new_str(qstart, qend); |
6303 | 0 | CHECK_NULL_RETURN_MEMERR(*np); |
6304 | 0 | *src = nextp; |
6305 | 0 | } |
6306 | 0 | break; |
6307 | | |
6308 | 343k | case TK_CHAR_TYPE: |
6309 | 343k | { |
6310 | 343k | switch (tok->u.prop.ctype) { |
6311 | 4.94k | case ONIGENC_CTYPE_WORD: |
6312 | 4.94k | *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not, |
6313 | 4.94k | IS_ASCII_RANGE(env->option)); |
6314 | 4.94k | CHECK_NULL_RETURN_MEMERR(*np); |
6315 | 4.94k | break; |
6316 | | |
6317 | 169k | case ONIGENC_CTYPE_SPACE: |
6318 | 337k | case ONIGENC_CTYPE_DIGIT: |
6319 | 338k | case ONIGENC_CTYPE_XDIGIT: |
6320 | 338k | { |
6321 | 338k | CClassNode* cc; |
6322 | | |
6323 | 338k | *np = node_new_cclass(); |
6324 | 338k | CHECK_NULL_RETURN_MEMERR(*np); |
6325 | 338k | cc = NCCLASS(*np); |
6326 | 338k | r = add_ctype_to_cc(cc, tok->u.prop.ctype, 0, |
6327 | 338k | IS_ASCII_RANGE(env->option), env); |
6328 | 338k | if (r != 0) return r; |
6329 | 338k | if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); |
6330 | 338k | } |
6331 | 0 | break; |
6332 | | |
6333 | 0 | default: |
6334 | 0 | return ONIGERR_PARSER_BUG; |
6335 | 0 | break; |
6336 | 343k | } |
6337 | 343k | } |
6338 | 343k | break; |
6339 | | |
6340 | 343k | case TK_CHAR_PROPERTY: |
6341 | 1.37k | r = parse_char_property(np, tok, src, end, env); |
6342 | 1.37k | if (r != 0) return r; |
6343 | 1.37k | break; |
6344 | | |
6345 | 516k | case TK_CC_OPEN: |
6346 | 516k | { |
6347 | 516k | Node *asc_node; |
6348 | 516k | CClassNode* cc; |
6349 | 516k | OnigCodePoint code; |
6350 | | |
6351 | 516k | r = parse_char_class(np, &asc_node, tok, src, end, env); |
6352 | 516k | if (r != 0) { |
6353 | 31 | onig_node_free(asc_node); |
6354 | 31 | return r; |
6355 | 31 | } |
6356 | | |
6357 | 516k | cc = NCCLASS(*np); |
6358 | 516k | if (is_onechar_cclass(cc, &code)) { |
6359 | 479 | onig_node_free(*np); |
6360 | 479 | onig_node_free(asc_node); |
6361 | 479 | *np = node_new_empty(); |
6362 | 479 | CHECK_NULL_RETURN_MEMERR(*np); |
6363 | 479 | r = node_str_cat_codepoint(*np, env->enc, code); |
6364 | 479 | if (r != 0) return r; |
6365 | 479 | goto string_loop; |
6366 | 479 | } |
6367 | 515k | if (IS_IGNORECASE(env->option)) { |
6368 | 3.45k | r = cclass_case_fold(np, cc, NCCLASS(asc_node), env); |
6369 | 3.45k | if (r != 0) { |
6370 | 0 | onig_node_free(asc_node); |
6371 | 0 | return r; |
6372 | 0 | } |
6373 | 3.45k | } |
6374 | 515k | onig_node_free(asc_node); |
6375 | 515k | } |
6376 | 0 | break; |
6377 | | |
6378 | 416k | case TK_ANYCHAR: |
6379 | 416k | *np = node_new_anychar(); |
6380 | 416k | CHECK_NULL_RETURN_MEMERR(*np); |
6381 | 416k | break; |
6382 | | |
6383 | 416k | case TK_ANYCHAR_ANYTIME: |
6384 | 0 | *np = node_new_anychar(); |
6385 | 0 | CHECK_NULL_RETURN_MEMERR(*np); |
6386 | 0 | qn = node_new_quantifier(0, REPEAT_INFINITE, 0); |
6387 | 0 | CHECK_NULL_RETURN_MEMERR(qn); |
6388 | 0 | NQTFR(qn)->target = *np; |
6389 | 0 | *np = qn; |
6390 | 0 | break; |
6391 | | |
6392 | 9.67k | case TK_BACKREF: |
6393 | 9.67k | len = tok->u.backref.num; |
6394 | 9.67k | *np = node_new_backref(len, |
6395 | 9.67k | (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)), |
6396 | 9.67k | tok->u.backref.by_name, |
6397 | 9.67k | #ifdef USE_BACKREF_WITH_LEVEL |
6398 | 9.67k | tok->u.backref.exist_level, |
6399 | 9.67k | tok->u.backref.level, |
6400 | 9.67k | #endif |
6401 | 9.67k | env); |
6402 | 9.67k | CHECK_NULL_RETURN_MEMERR(*np); |
6403 | 9.67k | break; |
6404 | | |
6405 | 9.67k | #ifdef USE_SUBEXP_CALL |
6406 | 9.67k | case TK_CALL: |
6407 | 4.48k | { |
6408 | 4.48k | int gnum = tok->u.call.gnum; |
6409 | | |
6410 | 4.48k | if (gnum < 0 || tok->u.call.rel != 0) { |
6411 | 2.41k | if (gnum > 0) gnum--; |
6412 | 2.41k | gnum = BACKREF_REL_TO_ABS(gnum, env); |
6413 | 2.41k | if (gnum <= 0) |
6414 | 1 | return ONIGERR_INVALID_BACKREF; |
6415 | 2.41k | } |
6416 | 4.48k | *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum); |
6417 | 4.48k | CHECK_NULL_RETURN_MEMERR(*np); |
6418 | 4.48k | env->num_call++; |
6419 | 4.48k | } |
6420 | 0 | break; |
6421 | 0 | #endif |
6422 | | |
6423 | 1.01M | case TK_ANCHOR: |
6424 | 1.01M | *np = onig_node_new_anchor(tok->u.anchor.subtype); |
6425 | 1.01M | CHECK_NULL_RETURN_MEMERR(*np); |
6426 | 1.01M | NANCHOR(*np)->ascii_range = tok->u.anchor.ascii_range; |
6427 | 1.01M | break; |
6428 | | |
6429 | 4 | case TK_OP_REPEAT: |
6430 | 4 | case TK_INTERVAL: |
6431 | 4 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) { |
6432 | 4 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS)) |
6433 | 4 | return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED; |
6434 | 0 | else |
6435 | 0 | *np = node_new_empty(); |
6436 | 4 | } |
6437 | 0 | else { |
6438 | 0 | goto tk_byte; |
6439 | 0 | } |
6440 | 0 | break; |
6441 | | |
6442 | 0 | default: |
6443 | 0 | return ONIGERR_PARSER_BUG; |
6444 | 0 | break; |
6445 | 4.70M | } |
6446 | | |
6447 | 3.00M | { |
6448 | 3.00M | targetp = np; |
6449 | | |
6450 | 4.27M | re_entry: |
6451 | 4.27M | r = fetch_token(tok, src, end, env); |
6452 | 4.27M | if (r < 0) return r; |
6453 | | |
6454 | 5.92M | repeat: |
6455 | 5.92M | if (r == TK_OP_REPEAT || r == TK_INTERVAL) { |
6456 | 1.27M | if (is_invalid_quantifier_target(*targetp)) |
6457 | 0 | return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; |
6458 | | |
6459 | 1.27M | parse_depth++; |
6460 | 1.27M | if (parse_depth > ParseDepthLimit) |
6461 | 1 | return ONIGERR_PARSE_DEPTH_LIMIT_OVER; |
6462 | | |
6463 | 1.27M | qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper, |
6464 | 1.27M | (r == TK_INTERVAL ? 1 : 0)); |
6465 | 1.27M | CHECK_NULL_RETURN_MEMERR(qn); |
6466 | 1.27M | NQTFR(qn)->greedy = tok->u.repeat.greedy; |
6467 | 1.27M | r = set_quantifier(qn, *targetp, group, env); |
6468 | 1.27M | if (r < 0) { |
6469 | 0 | onig_node_free(qn); |
6470 | 0 | return r; |
6471 | 0 | } |
6472 | | |
6473 | 1.27M | if (tok->u.repeat.possessive != 0) { |
6474 | 62.0k | Node* en; |
6475 | 62.0k | en = node_new_enclose(ENCLOSE_STOP_BACKTRACK); |
6476 | 62.0k | if (IS_NULL(en)) { |
6477 | 0 | onig_node_free(qn); |
6478 | 0 | return ONIGERR_MEMORY; |
6479 | 0 | } |
6480 | 62.0k | NENCLOSE(en)->target = qn; |
6481 | 62.0k | qn = en; |
6482 | 62.0k | } |
6483 | | |
6484 | 1.27M | if (r == 0) { |
6485 | 1.23M | *targetp = qn; |
6486 | 1.23M | } |
6487 | 32.9k | else if (r == 1) { |
6488 | 257 | onig_node_free(qn); |
6489 | 257 | } |
6490 | 32.6k | else if (r == 2) { /* split case: /abc+/ */ |
6491 | 32.6k | Node *tmp; |
6492 | | |
6493 | 32.6k | *targetp = node_new_list(*targetp, NULL); |
6494 | 32.6k | if (IS_NULL(*targetp)) { |
6495 | 0 | onig_node_free(qn); |
6496 | 0 | return ONIGERR_MEMORY; |
6497 | 0 | } |
6498 | 32.6k | tmp = NCDR(*targetp) = node_new_list(qn, NULL); |
6499 | 32.6k | if (IS_NULL(tmp)) { |
6500 | 0 | onig_node_free(qn); |
6501 | 0 | return ONIGERR_MEMORY; |
6502 | 0 | } |
6503 | 32.6k | targetp = &(NCAR(tmp)); |
6504 | 32.6k | } |
6505 | 1.27M | goto re_entry; |
6506 | 1.27M | } |
6507 | 5.92M | } |
6508 | | |
6509 | 4.65M | return r; |
6510 | 5.92M | } |
6511 | | |
6512 | | static int |
6513 | | parse_branch(Node** top, OnigToken* tok, int term, |
6514 | | UChar** src, UChar* end, ScanEnv* env) |
6515 | 1.69M | { |
6516 | 1.69M | int r; |
6517 | 1.69M | Node *node, **headp; |
6518 | | |
6519 | 1.69M | *top = NULL; |
6520 | 1.69M | r = parse_exp(&node, tok, term, src, end, env); |
6521 | 1.69M | if (r < 0) { |
6522 | 22.3k | onig_node_free(node); |
6523 | 22.3k | return r; |
6524 | 22.3k | } |
6525 | | |
6526 | 1.67M | if (r == TK_EOT || r == term || r == TK_ALT) { |
6527 | 818k | *top = node; |
6528 | 818k | } |
6529 | 858k | else { |
6530 | 858k | *top = node_new_list(node, NULL); |
6531 | 858k | headp = &(NCDR(*top)); |
6532 | 3.90M | while (r != TK_EOT && r != term && r != TK_ALT) { |
6533 | 3.05M | r = parse_exp(&node, tok, term, src, end, env); |
6534 | 3.05M | if (r < 0) { |
6535 | 5.21k | onig_node_free(node); |
6536 | 5.21k | return r; |
6537 | 5.21k | } |
6538 | | |
6539 | 3.04M | if (NTYPE(node) == NT_LIST) { |
6540 | 25.6k | *headp = node; |
6541 | 51.3k | while (IS_NOT_NULL(NCDR(node))) node = NCDR(node); |
6542 | 25.6k | headp = &(NCDR(node)); |
6543 | 25.6k | } |
6544 | 3.02M | else { |
6545 | 3.02M | *headp = node_new_list(node, NULL); |
6546 | 3.02M | headp = &(NCDR(*headp)); |
6547 | 3.02M | } |
6548 | 3.04M | } |
6549 | 858k | } |
6550 | | |
6551 | 1.67M | return r; |
6552 | 1.67M | } |
6553 | | |
6554 | | /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ |
6555 | | static int |
6556 | | parse_subexp(Node** top, OnigToken* tok, int term, |
6557 | | UChar** src, UChar* end, ScanEnv* env) |
6558 | 1.42M | { |
6559 | 1.42M | int r; |
6560 | 1.42M | Node *node, **headp; |
6561 | | |
6562 | 1.42M | *top = NULL; |
6563 | 1.42M | env->parse_depth++; |
6564 | 1.42M | if (env->parse_depth > ParseDepthLimit) |
6565 | 2 | return ONIGERR_PARSE_DEPTH_LIMIT_OVER; |
6566 | 1.42M | r = parse_branch(&node, tok, term, src, end, env); |
6567 | 1.42M | if (r < 0) { |
6568 | 27.0k | onig_node_free(node); |
6569 | 27.0k | return r; |
6570 | 27.0k | } |
6571 | | |
6572 | 1.39M | if (r == term) { |
6573 | 1.19M | *top = node; |
6574 | 1.19M | } |
6575 | 195k | else if (r == TK_ALT) { |
6576 | 195k | *top = onig_node_new_alt(node, NULL); |
6577 | 195k | headp = &(NCDR(*top)); |
6578 | 473k | while (r == TK_ALT) { |
6579 | 278k | r = fetch_token(tok, src, end, env); |
6580 | 278k | if (r < 0) return r; |
6581 | 278k | r = parse_branch(&node, tok, term, src, end, env); |
6582 | 278k | if (r < 0) { |
6583 | 526 | onig_node_free(node); |
6584 | 526 | return r; |
6585 | 526 | } |
6586 | | |
6587 | 277k | *headp = onig_node_new_alt(node, NULL); |
6588 | 277k | headp = &(NCDR(*headp)); |
6589 | 277k | } |
6590 | | |
6591 | 195k | if (tok->type != (enum TokenSyms )term) |
6592 | 2 | goto err; |
6593 | 195k | } |
6594 | 25 | else { |
6595 | 25 | onig_node_free(node); |
6596 | 27 | err: |
6597 | 27 | if (term == TK_SUBEXP_CLOSE) |
6598 | 27 | return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; |
6599 | 0 | else |
6600 | 0 | return ONIGERR_PARSER_BUG; |
6601 | 27 | } |
6602 | | |
6603 | 1.39M | env->parse_depth--; |
6604 | 1.39M | return r; |
6605 | 1.39M | } |
6606 | | |
6607 | | static int |
6608 | | parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) |
6609 | 836k | { |
6610 | 836k | int r; |
6611 | 836k | OnigToken tok; |
6612 | | |
6613 | 836k | r = fetch_token(&tok, src, end, env); |
6614 | 836k | if (r < 0) return r; |
6615 | 836k | r = parse_subexp(top, &tok, TK_EOT, src, end, env); |
6616 | 836k | if (r < 0) return r; |
6617 | | |
6618 | 836k | #ifdef USE_SUBEXP_CALL |
6619 | 836k | if (env->num_call > 0) { |
6620 | | /* Capture the pattern itself. It is used for (?R), (?0) and \g<0>. */ |
6621 | 1.24k | const int num = 0; |
6622 | 1.24k | Node* np; |
6623 | 1.24k | np = node_new_enclose_memory(env->option, 0); |
6624 | 1.24k | CHECK_NULL_RETURN_MEMERR(np); |
6625 | 1.24k | NENCLOSE(np)->regnum = num; |
6626 | 1.24k | NENCLOSE(np)->target = *top; |
6627 | 1.24k | r = scan_env_set_mem_node(env, num, np); |
6628 | 1.24k | if (r != 0) { |
6629 | 0 | onig_node_free(np); |
6630 | 0 | return r; |
6631 | 0 | } |
6632 | 1.24k | *top = np; |
6633 | 1.24k | } |
6634 | 836k | #endif |
6635 | 836k | return 0; |
6636 | 836k | } |
6637 | | |
6638 | | extern int |
6639 | | onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, |
6640 | | regex_t* reg, ScanEnv* env) |
6641 | 836k | { |
6642 | 836k | int r; |
6643 | 836k | UChar* p; |
6644 | | |
6645 | 836k | #ifdef USE_NAMED_GROUP |
6646 | 836k | names_clear(reg); |
6647 | 836k | #endif |
6648 | | |
6649 | 836k | scan_env_clear(env); |
6650 | 836k | env->option = reg->options; |
6651 | 836k | env->case_fold_flag = reg->case_fold_flag; |
6652 | 836k | env->enc = reg->enc; |
6653 | 836k | env->syntax = reg->syntax; |
6654 | 836k | env->pattern = (UChar* )pattern; |
6655 | 836k | env->pattern_end = (UChar* )end; |
6656 | 836k | env->reg = reg; |
6657 | | |
6658 | 836k | *root = NULL; |
6659 | 836k | p = (UChar* )pattern; |
6660 | 836k | r = parse_regexp(root, &p, (UChar* )end, env); |
6661 | 836k | reg->num_mem = env->num_mem; |
6662 | 836k | return r; |
6663 | 836k | } |
6664 | | |
6665 | | extern void |
6666 | | onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED, |
6667 | | UChar* arg, UChar* arg_end) |
6668 | 648 | { |
6669 | 648 | env->error = arg; |
6670 | 648 | env->error_end = arg_end; |
6671 | 648 | } |