/src/fluent-bit/lib/onigmo/regparse.c
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | regparse.c - Onigmo (Oniguruma-mod) (regular expression library) |
3 | | **********************************************************************/ |
4 | | /*- |
5 | | * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> |
6 | | * Copyright (c) 2011-2019 K.Takata <kentkt AT csc DOT jp> |
7 | | * All rights reserved. |
8 | | * |
9 | | * Redistribution and use in source and binary forms, with or without |
10 | | * modification, are permitted provided that the following conditions |
11 | | * are met: |
12 | | * 1. Redistributions of source code must retain the above copyright |
13 | | * notice, this list of conditions and the following disclaimer. |
14 | | * 2. Redistributions in binary form must reproduce the above copyright |
15 | | * notice, this list of conditions and the following disclaimer in the |
16 | | * documentation and/or other materials provided with the distribution. |
17 | | * |
18 | | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
19 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
20 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
21 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
22 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
23 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
24 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
25 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
26 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
27 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
28 | | * SUCH DAMAGE. |
29 | | */ |
30 | | |
31 | | #include "regparse.h" |
32 | | #include <stdarg.h> |
33 | | |
34 | 0 | #define WARN_BUFSIZE 256 |
35 | | |
36 | | #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS |
37 | | |
38 | | |
39 | | const OnigSyntaxType OnigSyntaxRuby = { |
40 | | (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | |
41 | | ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | |
42 | | ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS | |
43 | | ONIG_SYN_OP_ESC_C_CONTROL ) |
44 | | & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) |
45 | | , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT | |
46 | | ONIG_SYN_OP2_OPTION_RUBY | |
47 | | ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF | |
48 | | ONIG_SYN_OP2_ESC_G_SUBEXP_CALL | |
49 | | ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY | |
50 | | ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | |
51 | | ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | |
52 | | ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | |
53 | | ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB | |
54 | | ONIG_SYN_OP2_ESC_H_XDIGIT | |
55 | | #ifndef RUBY |
56 | | ONIG_SYN_OP2_ESC_U_HEX4 | |
57 | | #endif |
58 | | ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER | |
59 | | ONIG_SYN_OP2_QMARK_LPAREN_CONDITION | |
60 | | ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK | |
61 | | ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP | |
62 | | ONIG_SYN_OP2_QMARK_TILDE_ABSENT ) |
63 | | , ( SYN_GNU_REGEX_BV | |
64 | | ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | |
65 | | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | |
66 | | ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | |
67 | | ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | |
68 | | ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | |
69 | | ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | |
70 | | ONIG_SYN_WARN_CC_DUP | |
71 | | ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) |
72 | | , ( ONIG_OPTION_ASCII_RANGE | ONIG_OPTION_POSIX_BRACKET_ALL_RANGE | |
73 | | ONIG_OPTION_WORD_BOUND_ALL_RANGE ) |
74 | | , |
75 | | { |
76 | | (OnigCodePoint )'\\' /* esc */ |
77 | | , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ |
78 | | , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ |
79 | | , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ |
80 | | , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ |
81 | | , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ |
82 | | } |
83 | | }; |
84 | | |
85 | | const OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY; |
86 | | |
87 | 0 | extern void onig_null_warn(const char* s ARG_UNUSED) { } |
88 | | |
89 | | #ifdef DEFAULT_WARN_FUNCTION |
90 | | static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION; |
91 | | #else |
92 | | static OnigWarnFunc onig_warn = onig_null_warn; |
93 | | #endif |
94 | | |
95 | | #ifdef DEFAULT_VERB_WARN_FUNCTION |
96 | | static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION; |
97 | | #else |
98 | | static OnigWarnFunc onig_verb_warn = onig_null_warn; |
99 | | #endif |
100 | | |
101 | | extern void onig_set_warn_func(OnigWarnFunc f) |
102 | 0 | { |
103 | 0 | onig_warn = f; |
104 | 0 | } |
105 | | |
106 | | extern void onig_set_verb_warn_func(OnigWarnFunc f) |
107 | 0 | { |
108 | 0 | onig_verb_warn = f; |
109 | 0 | } |
110 | | |
111 | | static void CC_DUP_WARN(ScanEnv *env, OnigCodePoint from, OnigCodePoint to); |
112 | | |
113 | | |
114 | | static unsigned int ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT; |
115 | | |
116 | | extern unsigned int |
117 | | onig_get_parse_depth_limit(void) |
118 | 0 | { |
119 | 0 | return ParseDepthLimit; |
120 | 0 | } |
121 | | |
122 | | extern int |
123 | | onig_set_parse_depth_limit(unsigned int depth) |
124 | 0 | { |
125 | 0 | if (depth == 0) |
126 | 0 | ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT; |
127 | 0 | else |
128 | 0 | ParseDepthLimit = depth; |
129 | 0 | return 0; |
130 | 0 | } |
131 | | |
132 | | |
133 | | static void |
134 | | bbuf_free(BBuf* bbuf) |
135 | 76.1k | { |
136 | 76.1k | if (IS_NOT_NULL(bbuf)) { |
137 | 38.0k | if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p); |
138 | 38.0k | xfree(bbuf); |
139 | 38.0k | } |
140 | 76.1k | } |
141 | | |
142 | | static int |
143 | | bbuf_clone(BBuf** rto, BBuf* from) |
144 | 0 | { |
145 | 0 | int r; |
146 | 0 | BBuf *to; |
147 | |
|
148 | 0 | *rto = to = (BBuf* )xmalloc(sizeof(BBuf)); |
149 | 0 | CHECK_NULL_RETURN_MEMERR(to); |
150 | 0 | r = BBUF_INIT(to, from->alloc); |
151 | 0 | if (r != 0) return r; |
152 | 0 | to->used = from->used; |
153 | 0 | xmemcpy(to->p, from->p, from->used); |
154 | 0 | return 0; |
155 | 0 | } |
156 | | |
/* Convert a relative back reference number to an absolute group number. */
#define BACKREF_REL_TO_ABS(rel_no, env) \
  ((env)->num_mem + 1 + (rel_no))

/* Clear (negative != 0) or set the flag bits f in v.  The whole expansion is
   parenthesized so it stays a single operand inside larger expressions. */
#define ONOFF(v,f,negative)    ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))

/* First code point of the multi-byte range: 0 when the encoding's minimum
   character length exceeds one byte, 0x80 otherwise. */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)

/* Add the whole multi-byte range to *pbuf; expects `env` in scope. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, env, MBCODE_START_POS(enc), ONIG_LAST_CODE_POINT)

/* For non single-byte encodings, add the whole multi-byte range to mbuf.
   Expects `r` and `env` in scope and returns from the enclosing function
   when the add fails. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r) return r;\
  }\
} while (0)


/* Set bit `pos`, reporting through CC_DUP_WARN when it was already set.
   Expects `env` in scope. */
#define BITSET_SET_BIT_CHKDUP(bs, pos) do { \
  if (BITSET_AT(bs, pos)) CC_DUP_WARN(env, pos, pos); \
  BS_ROOM(bs, pos) |= BS_BIT(pos); \
} while (0)

/* Store 1 in `empty` when every word of bs is zero, 0 otherwise.  The loop
   counter has a macro-private name so that a `bs` argument mentioning the
   caller's own `i` is not captured. */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int bitset_is_empty_idx_;\
  empty = 1;\
  for (bitset_is_empty_idx_ = 0; bitset_is_empty_idx_ < BITSET_SIZE;\
       bitset_is_empty_idx_++) {\
    if ((bs)[bitset_is_empty_idx_] != 0) {\
      empty = 0; break;\
    }\
  }\
} while (0)
190 | | |
191 | | static void |
192 | | bitset_set_range(ScanEnv *env, BitSetRef bs, int from, int to) |
193 | 38.0k | { |
194 | 38.0k | int i; |
195 | 4.91M | for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) { |
196 | 4.87M | BITSET_SET_BIT_CHKDUP(bs, i); |
197 | 4.87M | } |
198 | 38.0k | } |
199 | | |
200 | | #if 0 |
201 | | static void |
202 | | bitset_set_all(BitSetRef bs) |
203 | | { |
204 | | int i; |
205 | | for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); } |
206 | | } |
207 | | #endif |
208 | | |
209 | | static void |
210 | | bitset_invert(BitSetRef bs) |
211 | 0 | { |
212 | 0 | int i; |
213 | 0 | for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~(bs[i]); } |
214 | 0 | } |
215 | | |
216 | | static void |
217 | | bitset_invert_to(BitSetRef from, BitSetRef to) |
218 | 0 | { |
219 | 0 | int i; |
220 | 0 | for (i = 0; i < BITSET_SIZE; i++) { to[i] = ~(from[i]); } |
221 | 0 | } |
222 | | |
223 | | static void |
224 | | bitset_and(BitSetRef dest, BitSetRef bs) |
225 | 38.0k | { |
226 | 38.0k | int i; |
227 | 342k | for (i = 0; i < BITSET_SIZE; i++) { dest[i] &= bs[i]; } |
228 | 38.0k | } |
229 | | |
230 | | static void |
231 | | bitset_or(BitSetRef dest, BitSetRef bs) |
232 | 38.0k | { |
233 | 38.0k | int i; |
234 | 342k | for (i = 0; i < BITSET_SIZE; i++) { dest[i] |= bs[i]; } |
235 | 38.0k | } |
236 | | |
237 | | static void |
238 | | bitset_copy(BitSetRef dest, BitSetRef bs) |
239 | 0 | { |
240 | 0 | int i; |
241 | 0 | for (i = 0; i < BITSET_SIZE; i++) { dest[i] = bs[i]; } |
242 | 0 | } |
243 | | |
244 | | #if defined(USE_NAMED_GROUP) && !defined(USE_ST_LIBRARY) |
245 | | extern int |
246 | | onig_strncmp(const UChar* s1, const UChar* s2, int n) |
247 | | { |
248 | | int x; |
249 | | |
250 | | while (n-- > 0) { |
251 | | x = *s2++ - *s1++; |
252 | | if (x) return x; |
253 | | } |
254 | | return 0; |
255 | | } |
256 | | #endif |
257 | | |
258 | | extern void |
259 | | onig_strcpy(UChar* dest, const UChar* src, const UChar* end) |
260 | 1.08M | { |
261 | 1.08M | ptrdiff_t len = end - src; |
262 | 1.08M | if (len > 0) { |
263 | 1.08M | xmemcpy(dest, src, len); |
264 | 1.08M | dest[len] = (UChar )0; |
265 | 1.08M | } |
266 | 1.08M | } |
267 | | |
268 | | #ifdef USE_NAMED_GROUP |
269 | | static UChar* |
270 | | strdup_with_null(OnigEncoding enc, UChar* s, UChar* end) |
271 | 11.7k | { |
272 | 11.7k | ptrdiff_t slen; |
273 | 11.7k | int term_len, i; |
274 | 11.7k | UChar *r; |
275 | | |
276 | 11.7k | slen = end - s; |
277 | 11.7k | term_len = ONIGENC_MBC_MINLEN(enc); |
278 | | |
279 | 11.7k | r = (UChar* )xmalloc(slen + term_len); |
280 | 11.7k | CHECK_NULL_RETURN(r); |
281 | 11.7k | xmemcpy(r, s, slen); |
282 | | |
283 | 23.4k | for (i = 0; i < term_len; i++) |
284 | 11.7k | r[slen + i] = (UChar )0; |
285 | | |
286 | 11.7k | return r; |
287 | 11.7k | } |
288 | | #endif |
289 | | |
/* scan pattern methods
 *
 * These macros expect the locals `p` (cursor), `end` (end of pattern) and
 * `enc` (encoding) in the using function; the non-_S forms also expect
 * `pfetch_prev`, which PFETCH_READY declares.
 */
#define PEND_VALUE     0   /* value PPEEK yields at end of input */

#ifdef __GNUC__
/* get rid of Wunused-but-set-variable and Wuninitialized */
# define PFETCH_READY  UChar* pfetch_prev = NULL; (void)pfetch_prev
#else
# define PFETCH_READY  UChar* pfetch_prev
#endif
/* 1 when the cursor has reached `end`, 0 otherwise */
#define PEND         (p < end ?  0 : 1)
/* step back to the position saved by the last PFETCH / PINC */
#define PUNFETCH     p = pfetch_prev
/* skip one character, remembering where it started */
#define PINC       do { \
  pfetch_prev = p; \
  p += enclen(enc, p, end); \
} while (0)
/* read one character into c and advance, remembering where it started */
#define PFETCH(c)  do { \
  c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \
  pfetch_prev = p; \
  p += enclen(enc, p, end); \
} while (0)

/* _S variants: same as PINC / PFETCH but pfetch_prev is not touched */
#define PINC_S     do { \
  p += enclen(enc, p, end); \
} while (0)
#define PFETCH_S(c) do { \
  c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \
  p += enclen(enc, p, end); \
} while (0)

/* character at the cursor without advancing, or PEND_VALUE at the end */
#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
/* argument is parenthesized so the cast covers the whole expression */
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )(c))
321 | | |
322 | | static UChar* |
323 | | strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end, |
324 | | size_t capa) |
325 | 163k | { |
326 | 163k | UChar* r; |
327 | | |
328 | 163k | if (dest) |
329 | 163k | r = (UChar* )xrealloc(dest, capa + 1); |
330 | 0 | else |
331 | 0 | r = (UChar* )xmalloc(capa + 1); |
332 | | |
333 | 163k | CHECK_NULL_RETURN(r); |
334 | 163k | onig_strcpy(r + (dest_end - dest), src, src_end); |
335 | 163k | return r; |
336 | 163k | } |
337 | | |
338 | | /* dest on static area */ |
339 | | static UChar* |
340 | | strcat_capa_from_static(UChar* dest, UChar* dest_end, |
341 | | const UChar* src, const UChar* src_end, size_t capa) |
342 | 11.7k | { |
343 | 11.7k | UChar* r; |
344 | | |
345 | 11.7k | r = (UChar* )xmalloc(capa + 1); |
346 | 11.7k | CHECK_NULL_RETURN(r); |
347 | 11.7k | onig_strcpy(r, dest, dest_end); |
348 | 11.7k | onig_strcpy(r + (dest_end - dest), src, src_end); |
349 | 11.7k | return r; |
350 | 11.7k | } |
351 | | |
352 | | |
353 | | #ifdef USE_ST_LIBRARY |
354 | | |
355 | | # ifdef RUBY |
356 | | # include "ruby/st.h" |
357 | | # else |
358 | | # include "st.h" |
359 | | # endif |
360 | | |
361 | | typedef struct { |
362 | | const UChar* s; |
363 | | const UChar* end; |
364 | | } st_str_end_key; |
365 | | |
366 | | static int |
367 | | str_end_cmp(st_data_t xp, st_data_t yp) |
368 | 61.0k | { |
369 | 61.0k | const st_str_end_key *x, *y; |
370 | 61.0k | const UChar *p, *q; |
371 | 61.0k | int c; |
372 | | |
373 | 61.0k | x = (const st_str_end_key *)xp; |
374 | 61.0k | y = (const st_str_end_key *)yp; |
375 | 61.0k | if ((x->end - x->s) != (y->end - y->s)) |
376 | 0 | return 1; |
377 | | |
378 | 61.0k | p = x->s; |
379 | 61.0k | q = y->s; |
380 | 289k | while (p < x->end) { |
381 | 228k | c = (int )*p - (int )*q; |
382 | 228k | if (c != 0) return c; |
383 | | |
384 | 228k | p++; q++; |
385 | 228k | } |
386 | | |
387 | 61.0k | return 0; |
388 | 61.0k | } |
389 | | |
390 | | static st_index_t |
391 | | str_end_hash(st_data_t xp) |
392 | 81.5k | { |
393 | 81.5k | const st_str_end_key *x = (const st_str_end_key *)xp; |
394 | 81.5k | const UChar *p; |
395 | 81.5k | st_index_t val = 0; |
396 | | |
397 | 81.5k | p = x->s; |
398 | 386k | while (p < x->end) { |
399 | 304k | val = val * 997 + (int )*p++; |
400 | 304k | } |
401 | | |
402 | 81.5k | return val + (val >> 5); |
403 | 81.5k | } |
404 | | |
405 | | extern hash_table_type* |
406 | | onig_st_init_strend_table_with_size(st_index_t size) |
407 | 2.92k | { |
408 | 2.92k | static const struct st_hash_type hashType = { |
409 | 2.92k | str_end_cmp, |
410 | 2.92k | str_end_hash, |
411 | 2.92k | }; |
412 | | |
413 | 2.92k | return (hash_table_type* ) |
414 | 2.92k | onig_st_init_table_with_size(&hashType, size); |
415 | 2.92k | } |
416 | | |
417 | | extern int |
418 | | onig_st_lookup_strend(hash_table_type* table, const UChar* str_key, |
419 | | const UChar* end_key, hash_data_type *value) |
420 | 69.7k | { |
421 | 69.7k | st_str_end_key key; |
422 | | |
423 | 69.7k | key.s = (UChar* )str_key; |
424 | 69.7k | key.end = (UChar* )end_key; |
425 | | |
426 | 69.7k | return onig_st_lookup(table, (st_data_t )(&key), value); |
427 | 69.7k | } |
428 | | |
429 | | extern int |
430 | | onig_st_insert_strend(hash_table_type* table, const UChar* str_key, |
431 | | const UChar* end_key, hash_data_type value) |
432 | 11.7k | { |
433 | 11.7k | st_str_end_key* key; |
434 | 11.7k | int result; |
435 | | |
436 | 11.7k | key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key)); |
437 | 11.7k | key->s = (UChar* )str_key; |
438 | 11.7k | key->end = (UChar* )end_key; |
439 | 11.7k | result = onig_st_insert(table, (st_data_t )key, value); |
440 | 11.7k | if (result) { |
441 | 0 | xfree(key); |
442 | 0 | } |
443 | 11.7k | return result; |
444 | 11.7k | } |
445 | | |
446 | | #endif /* USE_ST_LIBRARY */ |
447 | | |
448 | | |
449 | | #ifdef USE_NAMED_GROUP |
450 | | |
451 | 0 | # define INIT_NAME_BACKREFS_ALLOC_NUM 8 |
452 | | |
453 | | typedef struct { |
454 | | UChar* name; |
455 | | size_t name_len; /* byte length */ |
456 | | int back_num; /* number of backrefs */ |
457 | | int back_alloc; |
458 | | int back_ref1; |
459 | | int* back_refs; |
460 | | } NameEntry; |
461 | | |
462 | | # ifdef USE_ST_LIBRARY |
463 | | |
464 | | typedef st_table NameTable; |
465 | | typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */ |
466 | | |
467 | | # ifdef ONIG_DEBUG |
468 | | static int |
469 | | i_print_name_entry(UChar* key, NameEntry* e, void* arg) |
470 | | { |
471 | | int i; |
472 | | FILE* fp = (FILE* )arg; |
473 | | |
474 | | fprintf(fp, "%s: ", e->name); |
475 | | if (e->back_num == 0) |
476 | | fputs("-", fp); |
477 | | else if (e->back_num == 1) |
478 | | fprintf(fp, "%d", e->back_ref1); |
479 | | else { |
480 | | for (i = 0; i < e->back_num; i++) { |
481 | | if (i > 0) fprintf(fp, ", "); |
482 | | fprintf(fp, "%d", e->back_refs[i]); |
483 | | } |
484 | | } |
485 | | fputs("\n", fp); |
486 | | return ST_CONTINUE; |
487 | | } |
488 | | |
489 | | extern int |
490 | | onig_print_names(FILE* fp, regex_t* reg) |
491 | | { |
492 | | NameTable* t = (NameTable* )reg->name_table; |
493 | | |
494 | | if (IS_NOT_NULL(t)) { |
495 | | fprintf(fp, "name table\n"); |
496 | | onig_st_foreach(t, i_print_name_entry, (HashDataType )fp); |
497 | | fputs("\n", fp); |
498 | | } |
499 | | return 0; |
500 | | } |
501 | | # endif /* ONIG_DEBUG */ |
502 | | |
503 | | static int |
504 | | i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED) |
505 | 11.7k | { |
506 | 11.7k | xfree(e->name); |
507 | 11.7k | if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); |
508 | 11.7k | xfree(key); |
509 | 11.7k | xfree(e); |
510 | 11.7k | return ST_DELETE; |
511 | 11.7k | } |
512 | | |
513 | | static int |
514 | | names_clear(regex_t* reg) |
515 | 134k | { |
516 | 134k | NameTable* t = (NameTable* )reg->name_table; |
517 | | |
518 | 134k | if (IS_NOT_NULL(t)) { |
519 | 2.92k | onig_st_foreach(t, i_free_name_entry, 0); |
520 | 2.92k | } |
521 | 134k | return 0; |
522 | 134k | } |
523 | | |
524 | | extern int |
525 | | onig_names_free(regex_t* reg) |
526 | 67.3k | { |
527 | 67.3k | int r; |
528 | 67.3k | NameTable* t; |
529 | | |
530 | 67.3k | r = names_clear(reg); |
531 | 67.3k | if (r) return r; |
532 | | |
533 | 67.3k | t = (NameTable* )reg->name_table; |
534 | 67.3k | if (IS_NOT_NULL(t)) onig_st_free_table(t); |
535 | 67.3k | reg->name_table = (void* )NULL; |
536 | 67.3k | return 0; |
537 | 67.3k | } |
538 | | |
539 | | static NameEntry* |
540 | | name_find(regex_t* reg, const UChar* name, const UChar* name_end) |
541 | 72.7k | { |
542 | 72.7k | NameEntry* e; |
543 | 72.7k | NameTable* t = (NameTable* )reg->name_table; |
544 | | |
545 | 72.7k | e = (NameEntry* )NULL; |
546 | 72.7k | if (IS_NOT_NULL(t)) { |
547 | 69.7k | onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e))); |
548 | 69.7k | } |
549 | 72.7k | return e; |
550 | 72.7k | } |
551 | | |
552 | | typedef struct { |
553 | | int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*); |
554 | | regex_t* reg; |
555 | | void* arg; |
556 | | int ret; |
557 | | OnigEncoding enc; |
558 | | } INamesArg; |
559 | | |
560 | | static int |
561 | | i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg) |
562 | 61.0k | { |
563 | 61.0k | int r = (*(arg->func))(e->name, |
564 | 61.0k | e->name + e->name_len, |
565 | 61.0k | e->back_num, |
566 | 61.0k | (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), |
567 | 61.0k | arg->reg, arg->arg); |
568 | 61.0k | if (r != 0) { |
569 | 0 | arg->ret = r; |
570 | 0 | return ST_STOP; |
571 | 0 | } |
572 | 61.0k | return ST_CONTINUE; |
573 | 61.0k | } |
574 | | |
575 | | extern int |
576 | | onig_foreach_name(regex_t* reg, |
577 | | int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) |
578 | 15.2k | { |
579 | 15.2k | INamesArg narg; |
580 | 15.2k | NameTable* t = (NameTable* )reg->name_table; |
581 | | |
582 | 15.2k | narg.ret = 0; |
583 | 15.2k | if (IS_NOT_NULL(t)) { |
584 | 15.2k | narg.func = func; |
585 | 15.2k | narg.reg = reg; |
586 | 15.2k | narg.arg = arg; |
587 | 15.2k | narg.enc = reg->enc; /* should be pattern encoding. */ |
588 | 15.2k | onig_st_foreach(t, i_names, (HashDataType )&narg); |
589 | 15.2k | } |
590 | 15.2k | return narg.ret; |
591 | 15.2k | } |
592 | | |
593 | | static int |
594 | | i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map) |
595 | 0 | { |
596 | 0 | int i; |
597 | |
|
598 | 0 | if (e->back_num > 1) { |
599 | 0 | for (i = 0; i < e->back_num; i++) { |
600 | 0 | e->back_refs[i] = map[e->back_refs[i]].new_val; |
601 | 0 | } |
602 | 0 | } |
603 | 0 | else if (e->back_num == 1) { |
604 | 0 | e->back_ref1 = map[e->back_ref1].new_val; |
605 | 0 | } |
606 | |
|
607 | 0 | return ST_CONTINUE; |
608 | 0 | } |
609 | | |
610 | | extern int |
611 | | onig_renumber_name_table(regex_t* reg, GroupNumRemap* map) |
612 | 0 | { |
613 | 0 | NameTable* t = (NameTable* )reg->name_table; |
614 | |
|
615 | 0 | if (IS_NOT_NULL(t)) { |
616 | 0 | onig_st_foreach(t, i_renumber_name, (HashDataType )map); |
617 | 0 | } |
618 | 0 | return 0; |
619 | 0 | } |
620 | | |
621 | | |
622 | | extern int |
623 | | onig_number_of_names(const regex_t* reg) |
624 | 0 | { |
625 | 0 | NameTable* t = (NameTable* )reg->name_table; |
626 | |
|
627 | 0 | if (IS_NOT_NULL(t)) |
628 | 0 | return (int )t->num_entries; |
629 | 0 | else |
630 | 0 | return 0; |
631 | 0 | } |
632 | | |
633 | | # else /* USE_ST_LIBRARY */ |
634 | | |
635 | | # define INIT_NAMES_ALLOC_NUM 8 |
636 | | |
637 | | typedef struct { |
638 | | NameEntry* e; |
639 | | int num; |
640 | | int alloc; |
641 | | } NameTable; |
642 | | |
643 | | # ifdef ONIG_DEBUG |
644 | | extern int |
645 | | onig_print_names(FILE* fp, regex_t* reg) |
646 | | { |
647 | | int i, j; |
648 | | NameEntry* e; |
649 | | NameTable* t = (NameTable* )reg->name_table; |
650 | | |
651 | | if (IS_NOT_NULL(t) && t->num > 0) { |
652 | | fprintf(fp, "name table\n"); |
653 | | for (i = 0; i < t->num; i++) { |
654 | | e = &(t->e[i]); |
655 | | fprintf(fp, "%s: ", e->name); |
656 | | if (e->back_num == 0) { |
657 | | fputs("-", fp); |
658 | | } |
659 | | else if (e->back_num == 1) { |
660 | | fprintf(fp, "%d", e->back_ref1); |
661 | | } |
662 | | else { |
663 | | for (j = 0; j < e->back_num; j++) { |
664 | | if (j > 0) fprintf(fp, ", "); |
665 | | fprintf(fp, "%d", e->back_refs[j]); |
666 | | } |
667 | | } |
668 | | fputs("\n", fp); |
669 | | } |
670 | | fputs("\n", fp); |
671 | | } |
672 | | return 0; |
673 | | } |
674 | | # endif |
675 | | |
676 | | static int |
677 | | names_clear(regex_t* reg) |
678 | | { |
679 | | int i; |
680 | | NameEntry* e; |
681 | | NameTable* t = (NameTable* )reg->name_table; |
682 | | |
683 | | if (IS_NOT_NULL(t)) { |
684 | | for (i = 0; i < t->num; i++) { |
685 | | e = &(t->e[i]); |
686 | | if (IS_NOT_NULL(e->name)) { |
687 | | xfree(e->name); |
688 | | e->name = NULL; |
689 | | e->name_len = 0; |
690 | | e->back_num = 0; |
691 | | e->back_alloc = 0; |
692 | | if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); |
693 | | e->back_refs = (int* )NULL; |
694 | | } |
695 | | } |
696 | | if (IS_NOT_NULL(t->e)) { |
697 | | xfree(t->e); |
698 | | t->e = NULL; |
699 | | } |
700 | | t->num = 0; |
701 | | } |
702 | | return 0; |
703 | | } |
704 | | |
705 | | extern int |
706 | | onig_names_free(regex_t* reg) |
707 | | { |
708 | | int r; |
709 | | NameTable* t; |
710 | | |
711 | | r = names_clear(reg); |
712 | | if (r) return r; |
713 | | |
714 | | t = (NameTable* )reg->name_table; |
715 | | if (IS_NOT_NULL(t)) xfree(t); |
716 | | reg->name_table = NULL; |
717 | | return 0; |
718 | | } |
719 | | |
720 | | static NameEntry* |
721 | | name_find(regex_t* reg, const UChar* name, const UChar* name_end) |
722 | | { |
723 | | int i, len; |
724 | | NameEntry* e; |
725 | | NameTable* t = (NameTable* )reg->name_table; |
726 | | |
727 | | if (IS_NOT_NULL(t)) { |
728 | | len = name_end - name; |
729 | | for (i = 0; i < t->num; i++) { |
730 | | e = &(t->e[i]); |
731 | | if (len == e->name_len && onig_strncmp(name, e->name, len) == 0) |
732 | | return e; |
733 | | } |
734 | | } |
735 | | return (NameEntry* )NULL; |
736 | | } |
737 | | |
738 | | extern int |
739 | | onig_foreach_name(regex_t* reg, |
740 | | int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) |
741 | | { |
742 | | int i, r; |
743 | | NameEntry* e; |
744 | | NameTable* t = (NameTable* )reg->name_table; |
745 | | |
746 | | if (IS_NOT_NULL(t)) { |
747 | | for (i = 0; i < t->num; i++) { |
748 | | e = &(t->e[i]); |
749 | | r = (*func)(e->name, e->name + e->name_len, e->back_num, |
750 | | (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), |
751 | | reg, arg); |
752 | | if (r != 0) return r; |
753 | | } |
754 | | } |
755 | | return 0; |
756 | | } |
757 | | |
758 | | extern int |
759 | | onig_number_of_names(const regex_t* reg) |
760 | | { |
761 | | NameTable* t = (NameTable* )reg->name_table; |
762 | | |
763 | | if (IS_NOT_NULL(t)) |
764 | | return t->num; |
765 | | else |
766 | | return 0; |
767 | | } |
768 | | |
769 | | # endif /* else USE_ST_LIBRARY */ |
770 | | |
771 | | static int |
772 | | name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) |
773 | 11.7k | { |
774 | 11.7k | int alloc; |
775 | 11.7k | NameEntry* e; |
776 | 11.7k | NameTable* t = (NameTable* )reg->name_table; |
777 | | |
778 | 11.7k | if (name_end - name <= 0) |
779 | 0 | return ONIGERR_EMPTY_GROUP_NAME; |
780 | | |
781 | 11.7k | e = name_find(reg, name, name_end); |
782 | 11.7k | if (IS_NULL(e)) { |
783 | 11.7k | # ifdef USE_ST_LIBRARY |
784 | 11.7k | if (IS_NULL(t)) { |
785 | 2.92k | t = onig_st_init_strend_table_with_size(5); |
786 | 2.92k | reg->name_table = (void* )t; |
787 | 2.92k | } |
788 | 11.7k | e = (NameEntry* )xmalloc(sizeof(NameEntry)); |
789 | 11.7k | CHECK_NULL_RETURN_MEMERR(e); |
790 | | |
791 | 11.7k | e->name = strdup_with_null(reg->enc, name, name_end); |
792 | 11.7k | if (IS_NULL(e->name)) { |
793 | 0 | xfree(e); |
794 | 0 | return ONIGERR_MEMORY; |
795 | 0 | } |
796 | 11.7k | onig_st_insert_strend(t, e->name, (e->name + (name_end - name)), |
797 | 11.7k | (HashDataType )e); |
798 | | |
799 | 11.7k | e->name_len = name_end - name; |
800 | 11.7k | e->back_num = 0; |
801 | 11.7k | e->back_alloc = 0; |
802 | 11.7k | e->back_refs = (int* )NULL; |
803 | | |
804 | | # else |
805 | | |
806 | | if (IS_NULL(t)) { |
807 | | alloc = INIT_NAMES_ALLOC_NUM; |
808 | | t = (NameTable* )xmalloc(sizeof(NameTable)); |
809 | | CHECK_NULL_RETURN_MEMERR(t); |
810 | | t->e = NULL; |
811 | | t->alloc = 0; |
812 | | t->num = 0; |
813 | | |
814 | | t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc); |
815 | | if (IS_NULL(t->e)) { |
816 | | xfree(t); |
817 | | return ONIGERR_MEMORY; |
818 | | } |
819 | | t->alloc = alloc; |
820 | | reg->name_table = t; |
821 | | goto clear; |
822 | | } |
823 | | else if (t->num == t->alloc) { |
824 | | int i; |
825 | | NameEntry* p; |
826 | | |
827 | | alloc = t->alloc * 2; |
828 | | p = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc); |
829 | | CHECK_NULL_RETURN_MEMERR(p); |
830 | | t->e = p; |
831 | | t->alloc = alloc; |
832 | | |
833 | | clear: |
834 | | for (i = t->num; i < t->alloc; i++) { |
835 | | t->e[i].name = NULL; |
836 | | t->e[i].name_len = 0; |
837 | | t->e[i].back_num = 0; |
838 | | t->e[i].back_alloc = 0; |
839 | | t->e[i].back_refs = (int* )NULL; |
840 | | } |
841 | | } |
842 | | e = &(t->e[t->num]); |
843 | | t->num++; |
844 | | e->name = strdup_with_null(reg->enc, name, name_end); |
845 | | if (IS_NULL(e->name)) return ONIGERR_MEMORY; |
846 | | e->name_len = name_end - name; |
847 | | # endif |
848 | 11.7k | } |
849 | | |
850 | 11.7k | if (e->back_num >= 1 && |
851 | 11.7k | ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) { |
852 | 0 | onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME, |
853 | 0 | name, name_end); |
854 | 0 | return ONIGERR_MULTIPLEX_DEFINED_NAME; |
855 | 0 | } |
856 | | |
857 | 11.7k | e->back_num++; |
858 | 11.7k | if (e->back_num == 1) { |
859 | 11.7k | e->back_ref1 = backref; |
860 | 11.7k | } |
861 | 0 | else { |
862 | 0 | if (e->back_num == 2) { |
863 | 0 | alloc = INIT_NAME_BACKREFS_ALLOC_NUM; |
864 | 0 | e->back_refs = (int* )xmalloc(sizeof(int) * alloc); |
865 | 0 | CHECK_NULL_RETURN_MEMERR(e->back_refs); |
866 | 0 | e->back_alloc = alloc; |
867 | 0 | e->back_refs[0] = e->back_ref1; |
868 | 0 | e->back_refs[1] = backref; |
869 | 0 | } |
870 | 0 | else { |
871 | 0 | if (e->back_num > e->back_alloc) { |
872 | 0 | int* p; |
873 | 0 | alloc = e->back_alloc * 2; |
874 | 0 | p = (int* )xrealloc(e->back_refs, sizeof(int) * alloc); |
875 | 0 | CHECK_NULL_RETURN_MEMERR(p); |
876 | 0 | e->back_refs = p; |
877 | 0 | e->back_alloc = alloc; |
878 | 0 | } |
879 | 0 | e->back_refs[e->back_num - 1] = backref; |
880 | 0 | } |
881 | 0 | } |
882 | | |
883 | 11.7k | return 0; |
884 | 11.7k | } |
885 | | |
886 | | extern int |
887 | | onig_name_to_group_numbers(regex_t* reg, const UChar* name, |
888 | | const UChar* name_end, int** nums) |
889 | 61.0k | { |
890 | 61.0k | NameEntry* e = name_find(reg, name, name_end); |
891 | | |
892 | 61.0k | if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE; |
893 | | |
894 | 61.0k | switch (e->back_num) { |
895 | 0 | case 0: |
896 | 0 | *nums = 0; |
897 | 0 | break; |
898 | 61.0k | case 1: |
899 | 61.0k | *nums = &(e->back_ref1); |
900 | 61.0k | break; |
901 | 0 | default: |
902 | 0 | *nums = e->back_refs; |
903 | 0 | break; |
904 | 61.0k | } |
905 | 61.0k | return e->back_num; |
906 | 61.0k | } |
907 | | |
908 | | extern int |
909 | | onig_name_to_backref_number(regex_t* reg, const UChar* name, |
910 | | const UChar* name_end, const OnigRegion *region) |
911 | 61.0k | { |
912 | 61.0k | int i, n, *nums; |
913 | | |
914 | 61.0k | n = onig_name_to_group_numbers(reg, name, name_end, &nums); |
915 | 61.0k | if (n < 0) |
916 | 0 | return n; |
917 | 61.0k | else if (n == 0) |
918 | 0 | return ONIGERR_PARSER_BUG; |
919 | 61.0k | else if (n == 1) |
920 | 61.0k | return nums[0]; |
921 | 0 | else { |
922 | 0 | if (IS_NOT_NULL(region)) { |
923 | 0 | for (i = n - 1; i >= 0; i--) { |
924 | 0 | if (region->beg[nums[i]] != ONIG_REGION_NOTPOS) |
925 | 0 | return nums[i]; |
926 | 0 | } |
927 | 0 | } |
928 | 0 | return nums[n - 1]; |
929 | 0 | } |
930 | 61.0k | } |
931 | | |
932 | | #else /* USE_NAMED_GROUP */ |
933 | | |
934 | | extern int |
935 | | onig_name_to_group_numbers(regex_t* reg, const UChar* name, |
936 | | const UChar* name_end, int** nums) |
937 | | { |
938 | | return ONIG_NO_SUPPORT_CONFIG; |
939 | | } |
940 | | |
941 | | extern int |
942 | | onig_name_to_backref_number(regex_t* reg, const UChar* name, |
943 | | const UChar* name_end, const OnigRegion* region) |
944 | | { |
945 | | return ONIG_NO_SUPPORT_CONFIG; |
946 | | } |
947 | | |
948 | | extern int |
949 | | onig_foreach_name(regex_t* reg, |
950 | | int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) |
951 | | { |
952 | | return ONIG_NO_SUPPORT_CONFIG; |
953 | | } |
954 | | |
955 | | extern int |
956 | | onig_number_of_names(const regex_t* reg) |
957 | | { |
958 | | return 0; |
959 | | } |
960 | | #endif /* else USE_NAMED_GROUP */ |
961 | | |
962 | | extern int |
963 | | onig_noname_group_capture_is_active(const regex_t* reg) |
964 | 0 | { |
965 | 0 | if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP)) |
966 | 0 | return 0; |
967 | | |
968 | 0 | #ifdef USE_NAMED_GROUP |
969 | 0 | if (onig_number_of_names(reg) > 0 && |
970 | 0 | IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && |
971 | 0 | !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) { |
972 | 0 | return 0; |
973 | 0 | } |
974 | 0 | #endif |
975 | | |
976 | 0 | return 1; |
977 | 0 | } |
978 | | |
979 | | |
980 | 0 | #define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16 |
981 | | |
982 | | static void |
983 | | scan_env_clear(ScanEnv* env) |
984 | 67.3k | { |
985 | 67.3k | int i; |
986 | | |
987 | 67.3k | BIT_STATUS_CLEAR(env->capture_history); |
988 | 67.3k | BIT_STATUS_CLEAR(env->bt_mem_start); |
989 | 67.3k | BIT_STATUS_CLEAR(env->bt_mem_end); |
990 | 67.3k | BIT_STATUS_CLEAR(env->backrefed_mem); |
991 | 67.3k | env->error = (UChar* )NULL; |
992 | 67.3k | env->error_end = (UChar* )NULL; |
993 | 67.3k | env->num_call = 0; |
994 | 67.3k | env->num_mem = 0; |
995 | 67.3k | #ifdef USE_NAMED_GROUP |
996 | 67.3k | env->num_named = 0; |
997 | 67.3k | #endif |
998 | 67.3k | env->mem_alloc = 0; |
999 | 67.3k | env->mem_nodes_dynamic = (Node** )NULL; |
1000 | | |
1001 | 606k | for (i = 0; i < SCANENV_MEMNODES_SIZE; i++) |
1002 | 538k | env->mem_nodes_static[i] = NULL_NODE; |
1003 | | |
1004 | | #ifdef USE_COMBINATION_EXPLOSION_CHECK |
1005 | | env->num_comb_exp_check = 0; |
1006 | | env->comb_exp_max_regnum = 0; |
1007 | | env->curr_max_regnum = 0; |
1008 | | env->has_recursion = 0; |
1009 | | #endif |
1010 | 67.3k | env->parse_depth = 0; |
1011 | 67.3k | env->warnings_flag = 0; |
1012 | 67.3k | } |
1013 | | |
1014 | | static int |
1015 | | scan_env_add_mem_entry(ScanEnv* env) |
1016 | 14.6k | { |
1017 | 14.6k | int i, need, alloc; |
1018 | 14.6k | Node** p; |
1019 | | |
1020 | 14.6k | need = env->num_mem + 1; |
1021 | 14.6k | if (need > ONIG_MAX_CAPTURE_GROUP_NUM) |
1022 | 0 | return ONIGERR_TOO_MANY_CAPTURE_GROUPS; |
1023 | 14.6k | if (need >= SCANENV_MEMNODES_SIZE) { |
1024 | 0 | if (env->mem_alloc <= need) { |
1025 | 0 | if (IS_NULL(env->mem_nodes_dynamic)) { |
1026 | 0 | alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE; |
1027 | 0 | p = (Node** )xmalloc(sizeof(Node*) * alloc); |
1028 | 0 | CHECK_NULL_RETURN_MEMERR(p); |
1029 | 0 | xmemcpy(p, env->mem_nodes_static, |
1030 | 0 | sizeof(Node*) * SCANENV_MEMNODES_SIZE); |
1031 | 0 | } |
1032 | 0 | else { |
1033 | 0 | alloc = env->mem_alloc * 2; |
1034 | 0 | p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc); |
1035 | 0 | CHECK_NULL_RETURN_MEMERR(p); |
1036 | 0 | } |
1037 | | |
1038 | 0 | for (i = env->num_mem + 1; i < alloc; i++) |
1039 | 0 | p[i] = NULL_NODE; |
1040 | |
|
1041 | 0 | env->mem_nodes_dynamic = p; |
1042 | 0 | env->mem_alloc = alloc; |
1043 | 0 | } |
1044 | 0 | } |
1045 | | |
1046 | 14.6k | env->num_mem++; |
1047 | 14.6k | return env->num_mem; |
1048 | 14.6k | } |
1049 | | |
1050 | | static int |
1051 | | scan_env_set_mem_node(ScanEnv* env, int num, Node* node) |
1052 | 14.6k | { |
1053 | 14.6k | if (env->num_mem >= num) |
1054 | 14.6k | SCANENV_MEM_NODES(env)[num] = node; |
1055 | 0 | else |
1056 | 0 | return ONIGERR_PARSER_BUG; |
1057 | 14.6k | return 0; |
1058 | 14.6k | } |
1059 | | |
1060 | | |
1061 | | extern void |
1062 | | onig_node_free(Node* node) |
1063 | 585k | { |
1064 | 916k | start: |
1065 | 916k | if (IS_NULL(node)) return ; |
1066 | | |
1067 | 784k | switch (NTYPE(node)) { |
1068 | 125k | case NT_STR: |
1069 | 125k | if (NSTR(node)->capa != 0 && |
1070 | 125k | IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) { |
1071 | 11.7k | xfree(NSTR(node)->s); |
1072 | 11.7k | } |
1073 | 125k | break; |
1074 | | |
1075 | 289k | case NT_LIST: |
1076 | 330k | case NT_ALT: |
1077 | 330k | onig_node_free(NCAR(node)); |
1078 | 330k | { |
1079 | 330k | Node* next_node = NCDR(node); |
1080 | | |
1081 | 330k | xfree(node); |
1082 | 330k | node = next_node; |
1083 | 330k | goto start; |
1084 | 289k | } |
1085 | 0 | break; |
1086 | | |
1087 | 70.2k | case NT_CCLASS: |
1088 | 70.2k | { |
1089 | 70.2k | CClassNode* cc = NCCLASS(node); |
1090 | | |
1091 | 70.2k | if (cc->mbuf) |
1092 | 0 | bbuf_free(cc->mbuf); |
1093 | 70.2k | } |
1094 | 70.2k | break; |
1095 | | |
1096 | 87.8k | case NT_QTFR: |
1097 | 87.8k | if (NQTFR(node)->target) |
1098 | 87.8k | onig_node_free(NQTFR(node)->target); |
1099 | 87.8k | break; |
1100 | | |
1101 | 55.6k | case NT_ENCLOSE: |
1102 | 55.6k | if (NENCLOSE(node)->target) |
1103 | 55.6k | onig_node_free(NENCLOSE(node)->target); |
1104 | 55.6k | break; |
1105 | | |
1106 | 0 | case NT_BREF: |
1107 | 0 | if (IS_NOT_NULL(NBREF(node)->back_dynamic)) |
1108 | 0 | xfree(NBREF(node)->back_dynamic); |
1109 | 0 | break; |
1110 | | |
1111 | 84.9k | case NT_ANCHOR: |
1112 | 84.9k | if (NANCHOR(node)->target) |
1113 | 0 | onig_node_free(NANCHOR(node)->target); |
1114 | 84.9k | break; |
1115 | 784k | } |
1116 | | |
1117 | 453k | xfree(node); |
1118 | 453k | } |
1119 | | |
1120 | | static Node* |
1121 | | node_new(void) |
1122 | 784k | { |
1123 | 784k | Node* node; |
1124 | | |
1125 | 784k | node = (Node* )xmalloc(sizeof(Node)); |
1126 | | /* xmemset(node, 0, sizeof(Node)); */ |
1127 | 784k | return node; |
1128 | 784k | } |
1129 | | |
1130 | | static void |
1131 | | initialize_cclass(CClassNode* cc) |
1132 | 146k | { |
1133 | 146k | BITSET_CLEAR(cc->bs); |
1134 | | /* cc->base.flags = 0; */ |
1135 | 146k | cc->flags = 0; |
1136 | 146k | cc->mbuf = NULL; |
1137 | 146k | } |
1138 | | |
1139 | | static Node* |
1140 | | node_new_cclass(void) |
1141 | 70.2k | { |
1142 | 70.2k | Node* node = node_new(); |
1143 | 70.2k | CHECK_NULL_RETURN(node); |
1144 | | |
1145 | 70.2k | SET_NTYPE(node, NT_CCLASS); |
1146 | 70.2k | initialize_cclass(NCCLASS(node)); |
1147 | 70.2k | return node; |
1148 | 70.2k | } |
1149 | | |
1150 | | static Node* |
1151 | | node_new_ctype(int type, int not, int ascii_range) |
1152 | 0 | { |
1153 | 0 | Node* node = node_new(); |
1154 | 0 | CHECK_NULL_RETURN(node); |
1155 | | |
1156 | 0 | SET_NTYPE(node, NT_CTYPE); |
1157 | 0 | NCTYPE(node)->ctype = type; |
1158 | 0 | NCTYPE(node)->not = not; |
1159 | 0 | NCTYPE(node)->ascii_range = ascii_range; |
1160 | 0 | return node; |
1161 | 0 | } |
1162 | | |
1163 | | static Node* |
1164 | | node_new_anychar(void) |
1165 | 29.2k | { |
1166 | 29.2k | Node* node = node_new(); |
1167 | 29.2k | CHECK_NULL_RETURN(node); |
1168 | | |
1169 | 29.2k | SET_NTYPE(node, NT_CANY); |
1170 | 29.2k | return node; |
1171 | 29.2k | } |
1172 | | |
1173 | | static Node* |
1174 | | node_new_list(Node* left, Node* right) |
1175 | 289k | { |
1176 | 289k | Node* node = node_new(); |
1177 | 289k | CHECK_NULL_RETURN(node); |
1178 | | |
1179 | 289k | SET_NTYPE(node, NT_LIST); |
1180 | 289k | NCAR(node) = left; |
1181 | 289k | NCDR(node) = right; |
1182 | 289k | return node; |
1183 | 289k | } |
1184 | | |
1185 | | extern Node* |
1186 | | onig_node_new_list(Node* left, Node* right) |
1187 | 0 | { |
1188 | 0 | return node_new_list(left, right); |
1189 | 0 | } |
1190 | | |
1191 | | extern Node* |
1192 | | onig_node_list_add(Node* list, Node* x) |
1193 | 0 | { |
1194 | 0 | Node *n; |
1195 | |
|
1196 | 0 | n = onig_node_new_list(x, NULL); |
1197 | 0 | if (IS_NULL(n)) return NULL_NODE; |
1198 | | |
1199 | 0 | if (IS_NOT_NULL(list)) { |
1200 | 0 | while (IS_NOT_NULL(NCDR(list))) |
1201 | 0 | list = NCDR(list); |
1202 | |
|
1203 | 0 | NCDR(list) = n; |
1204 | 0 | } |
1205 | |
|
1206 | 0 | return n; |
1207 | 0 | } |
1208 | | |
1209 | | extern Node* |
1210 | | onig_node_new_alt(Node* left, Node* right) |
1211 | 40.9k | { |
1212 | 40.9k | Node* node = node_new(); |
1213 | 40.9k | CHECK_NULL_RETURN(node); |
1214 | | |
1215 | 40.9k | SET_NTYPE(node, NT_ALT); |
1216 | 40.9k | NCAR(node) = left; |
1217 | 40.9k | NCDR(node) = right; |
1218 | 40.9k | return node; |
1219 | 40.9k | } |
1220 | | |
1221 | | extern Node* |
1222 | | onig_node_new_anchor(int type) |
1223 | 84.9k | { |
1224 | 84.9k | Node* node = node_new(); |
1225 | 84.9k | CHECK_NULL_RETURN(node); |
1226 | | |
1227 | 84.9k | SET_NTYPE(node, NT_ANCHOR); |
1228 | 84.9k | NANCHOR(node)->type = type; |
1229 | 84.9k | NANCHOR(node)->target = NULL; |
1230 | 84.9k | NANCHOR(node)->char_len = -1; |
1231 | 84.9k | NANCHOR(node)->ascii_range = 0; |
1232 | 84.9k | return node; |
1233 | 84.9k | } |
1234 | | |
1235 | | static Node* |
1236 | | node_new_backref(int back_num, int* backrefs, int by_name, |
1237 | | #ifdef USE_BACKREF_WITH_LEVEL |
1238 | | int exist_level, int nest_level, |
1239 | | #endif |
1240 | | ScanEnv* env) |
1241 | 0 | { |
1242 | 0 | int i; |
1243 | 0 | Node* node = node_new(); |
1244 | |
|
1245 | 0 | CHECK_NULL_RETURN(node); |
1246 | | |
1247 | 0 | SET_NTYPE(node, NT_BREF); |
1248 | 0 | NBREF(node)->state = 0; |
1249 | 0 | NBREF(node)->back_num = back_num; |
1250 | 0 | NBREF(node)->back_dynamic = (int* )NULL; |
1251 | 0 | if (by_name != 0) |
1252 | 0 | NBREF(node)->state |= NST_NAME_REF; |
1253 | |
|
1254 | 0 | #ifdef USE_BACKREF_WITH_LEVEL |
1255 | 0 | if (exist_level != 0) { |
1256 | 0 | NBREF(node)->state |= NST_NEST_LEVEL; |
1257 | 0 | NBREF(node)->nest_level = nest_level; |
1258 | 0 | } |
1259 | 0 | #endif |
1260 | |
|
1261 | 0 | for (i = 0; i < back_num; i++) { |
1262 | 0 | if (backrefs[i] <= env->num_mem && |
1263 | 0 | IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) { |
1264 | 0 | NBREF(node)->state |= NST_RECURSION; /* /...(\1).../ */ |
1265 | 0 | break; |
1266 | 0 | } |
1267 | 0 | } |
1268 | |
|
1269 | 0 | if (back_num <= NODE_BACKREFS_SIZE) { |
1270 | 0 | for (i = 0; i < back_num; i++) |
1271 | 0 | NBREF(node)->back_static[i] = backrefs[i]; |
1272 | 0 | } |
1273 | 0 | else { |
1274 | 0 | int* p = (int* )xmalloc(sizeof(int) * back_num); |
1275 | 0 | if (IS_NULL(p)) { |
1276 | 0 | onig_node_free(node); |
1277 | 0 | return NULL; |
1278 | 0 | } |
1279 | 0 | NBREF(node)->back_dynamic = p; |
1280 | 0 | for (i = 0; i < back_num; i++) |
1281 | 0 | p[i] = backrefs[i]; |
1282 | 0 | } |
1283 | 0 | return node; |
1284 | 0 | } |
1285 | | |
1286 | | #ifdef USE_SUBEXP_CALL |
1287 | | static Node* |
1288 | | node_new_call(UChar* name, UChar* name_end, int gnum) |
1289 | 0 | { |
1290 | 0 | Node* node = node_new(); |
1291 | 0 | CHECK_NULL_RETURN(node); |
1292 | | |
1293 | 0 | SET_NTYPE(node, NT_CALL); |
1294 | 0 | NCALL(node)->state = 0; |
1295 | 0 | NCALL(node)->target = NULL_NODE; |
1296 | 0 | NCALL(node)->name = name; |
1297 | 0 | NCALL(node)->name_end = name_end; |
1298 | 0 | NCALL(node)->group_num = gnum; /* call by number if gnum != 0 */ |
1299 | 0 | return node; |
1300 | 0 | } |
1301 | | #endif |
1302 | | |
1303 | | static Node* |
1304 | | node_new_quantifier(int lower, int upper, int by_number) |
1305 | 87.8k | { |
1306 | 87.8k | Node* node = node_new(); |
1307 | 87.8k | CHECK_NULL_RETURN(node); |
1308 | | |
1309 | 87.8k | SET_NTYPE(node, NT_QTFR); |
1310 | 87.8k | NQTFR(node)->state = 0; |
1311 | 87.8k | NQTFR(node)->target = NULL; |
1312 | 87.8k | NQTFR(node)->lower = lower; |
1313 | 87.8k | NQTFR(node)->upper = upper; |
1314 | 87.8k | NQTFR(node)->greedy = 1; |
1315 | 87.8k | NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY; |
1316 | 87.8k | NQTFR(node)->head_exact = NULL_NODE; |
1317 | 87.8k | NQTFR(node)->next_head_exact = NULL_NODE; |
1318 | 87.8k | NQTFR(node)->is_referred = 0; |
1319 | 87.8k | if (by_number != 0) |
1320 | 0 | NQTFR(node)->state |= NST_BY_NUMBER; |
1321 | | |
1322 | | #ifdef USE_COMBINATION_EXPLOSION_CHECK |
1323 | | NQTFR(node)->comb_exp_check_num = 0; |
1324 | | #endif |
1325 | | |
1326 | 87.8k | return node; |
1327 | 87.8k | } |
1328 | | |
1329 | | static Node* |
1330 | | node_new_enclose(int type) |
1331 | 55.6k | { |
1332 | 55.6k | Node* node = node_new(); |
1333 | 55.6k | CHECK_NULL_RETURN(node); |
1334 | | |
1335 | 55.6k | SET_NTYPE(node, NT_ENCLOSE); |
1336 | 55.6k | NENCLOSE(node)->type = type; |
1337 | 55.6k | NENCLOSE(node)->state = 0; |
1338 | 55.6k | NENCLOSE(node)->regnum = 0; |
1339 | 55.6k | NENCLOSE(node)->option = 0; |
1340 | 55.6k | NENCLOSE(node)->target = NULL; |
1341 | 55.6k | NENCLOSE(node)->call_addr = -1; |
1342 | 55.6k | NENCLOSE(node)->opt_count = 0; |
1343 | 55.6k | return node; |
1344 | 55.6k | } |
1345 | | |
1346 | | extern Node* |
1347 | | onig_node_new_enclose(int type) |
1348 | 38.0k | { |
1349 | 38.0k | return node_new_enclose(type); |
1350 | 38.0k | } |
1351 | | |
1352 | | static Node* |
1353 | | node_new_enclose_memory(OnigOptionType option, int is_named) |
1354 | 14.6k | { |
1355 | 14.6k | Node* node = node_new_enclose(ENCLOSE_MEMORY); |
1356 | 14.6k | CHECK_NULL_RETURN(node); |
1357 | 14.6k | if (is_named != 0) |
1358 | 11.7k | SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP); |
1359 | | |
1360 | 14.6k | #ifdef USE_SUBEXP_CALL |
1361 | 14.6k | NENCLOSE(node)->option = option; |
1362 | 14.6k | #endif |
1363 | 14.6k | return node; |
1364 | 14.6k | } |
1365 | | |
1366 | | static Node* |
1367 | | node_new_option(OnigOptionType option) |
1368 | 2.92k | { |
1369 | 2.92k | Node* node = node_new_enclose(ENCLOSE_OPTION); |
1370 | 2.92k | CHECK_NULL_RETURN(node); |
1371 | 2.92k | NENCLOSE(node)->option = option; |
1372 | 2.92k | return node; |
1373 | 2.92k | } |
1374 | | |
1375 | | extern int |
1376 | | onig_node_str_cat(Node* node, const UChar* s, const UChar* end) |
1377 | 1.08M | { |
1378 | 1.08M | ptrdiff_t addlen = end - s; |
1379 | | |
1380 | 1.08M | if (addlen > 0) { |
1381 | 1.07M | ptrdiff_t len = NSTR(node)->end - NSTR(node)->s; |
1382 | | |
1383 | 1.07M | if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) { |
1384 | 175k | UChar* p; |
1385 | 175k | ptrdiff_t capa = len + addlen + NODE_STR_MARGIN; |
1386 | | |
1387 | 175k | if (capa <= NSTR(node)->capa) { |
1388 | 0 | onig_strcpy(NSTR(node)->s + len, s, end); |
1389 | 0 | } |
1390 | 175k | else { |
1391 | 175k | if (NSTR(node)->s == NSTR(node)->buf) |
1392 | 11.7k | p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end, |
1393 | 11.7k | s, end, capa); |
1394 | 163k | else |
1395 | 163k | p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa); |
1396 | | |
1397 | 175k | CHECK_NULL_RETURN_MEMERR(p); |
1398 | 175k | NSTR(node)->s = p; |
1399 | 175k | NSTR(node)->capa = (int )capa; |
1400 | 175k | } |
1401 | 175k | } |
1402 | 901k | else { |
1403 | 901k | onig_strcpy(NSTR(node)->s + len, s, end); |
1404 | 901k | } |
1405 | 1.07M | NSTR(node)->end = NSTR(node)->s + len + addlen; |
1406 | 1.07M | } |
1407 | | |
1408 | 1.08M | return 0; |
1409 | 1.08M | } |
1410 | | |
1411 | | extern int |
1412 | | onig_node_str_set(Node* node, const UChar* s, const UChar* end) |
1413 | 0 | { |
1414 | 0 | onig_node_str_clear(node); |
1415 | 0 | return onig_node_str_cat(node, s, end); |
1416 | 0 | } |
1417 | | |
1418 | | static int |
1419 | | node_str_cat_char(Node* node, UChar c) |
1420 | 0 | { |
1421 | 0 | UChar s[1]; |
1422 | |
|
1423 | 0 | s[0] = c; |
1424 | 0 | return onig_node_str_cat(node, s, s + 1); |
1425 | 0 | } |
1426 | | |
1427 | | static int |
1428 | | node_str_cat_codepoint(Node* node, OnigEncoding enc, OnigCodePoint c) |
1429 | 0 | { |
1430 | 0 | UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; |
1431 | 0 | int num = ONIGENC_CODE_TO_MBC(enc, c, buf); |
1432 | 0 | if (num < 0) return num; |
1433 | 0 | return onig_node_str_cat(node, buf, buf + num); |
1434 | 0 | } |
1435 | | |
#if 0
/* Disabled code (never compiled).
 * Would retype `node` as NT_STR with the given flag and an empty string
 * pointing at the embedded buffer, without releasing anything the node
 * previously owned. */
extern void
onig_node_conv_to_str_node(Node* node, int flag)
{
  SET_NTYPE(node, NT_STR);
  NSTR(node)->flag = flag;
  NSTR(node)->capa = 0;
  NSTR(node)->s    = NSTR(node)->buf;
  NSTR(node)->end  = NSTR(node)->buf;
}
#endif
1447 | | |
1448 | | extern void |
1449 | | onig_node_str_clear(Node* node) |
1450 | 0 | { |
1451 | 0 | if (NSTR(node)->capa != 0 && |
1452 | 0 | IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) { |
1453 | 0 | xfree(NSTR(node)->s); |
1454 | 0 | } |
1455 | |
|
1456 | 0 | NSTR(node)->capa = 0; |
1457 | 0 | NSTR(node)->flag = 0; |
1458 | 0 | NSTR(node)->s = NSTR(node)->buf; |
1459 | 0 | NSTR(node)->end = NSTR(node)->buf; |
1460 | 0 | } |
1461 | | |
1462 | | static Node* |
1463 | | node_new_str(const UChar* s, const UChar* end) |
1464 | 125k | { |
1465 | 125k | Node* node = node_new(); |
1466 | 125k | CHECK_NULL_RETURN(node); |
1467 | | |
1468 | 125k | SET_NTYPE(node, NT_STR); |
1469 | 125k | NSTR(node)->capa = 0; |
1470 | 125k | NSTR(node)->flag = 0; |
1471 | 125k | NSTR(node)->s = NSTR(node)->buf; |
1472 | 125k | NSTR(node)->end = NSTR(node)->buf; |
1473 | 125k | if (onig_node_str_cat(node, s, end)) { |
1474 | 0 | onig_node_free(node); |
1475 | 0 | return NULL; |
1476 | 0 | } |
1477 | 125k | return node; |
1478 | 125k | } |
1479 | | |
1480 | | extern Node* |
1481 | | onig_node_new_str(const UChar* s, const UChar* end) |
1482 | 0 | { |
1483 | 0 | return node_new_str(s, end); |
1484 | 0 | } |
1485 | | |
1486 | | static Node* |
1487 | | node_new_str_raw(UChar* s, UChar* end) |
1488 | 0 | { |
1489 | 0 | Node* node = node_new_str(s, end); |
1490 | 0 | if (IS_NOT_NULL(node)) |
1491 | 0 | NSTRING_SET_RAW(node); |
1492 | 0 | return node; |
1493 | 0 | } |
1494 | | |
1495 | | static Node* |
1496 | | node_new_empty(void) |
1497 | 2.92k | { |
1498 | 2.92k | return node_new_str(NULL, NULL); |
1499 | 2.92k | } |
1500 | | |
1501 | | static Node* |
1502 | | node_new_str_raw_char(UChar c) |
1503 | 0 | { |
1504 | 0 | UChar p[1]; |
1505 | |
|
1506 | 0 | p[0] = c; |
1507 | 0 | return node_new_str_raw(p, p + 1); |
1508 | 0 | } |
1509 | | |
1510 | | static Node* |
1511 | | str_node_split_last_char(StrNode* sn, OnigEncoding enc) |
1512 | 0 | { |
1513 | 0 | const UChar *p; |
1514 | 0 | Node* n = NULL_NODE; |
1515 | |
|
1516 | 0 | if (sn->end > sn->s) { |
1517 | 0 | p = onigenc_get_prev_char_head(enc, sn->s, sn->end, sn->end); |
1518 | 0 | if (p && p > sn->s) { /* can be split. */ |
1519 | 0 | n = node_new_str(p, sn->end); |
1520 | 0 | if (IS_NOT_NULL(n) && (sn->flag & NSTR_RAW) != 0) |
1521 | 0 | NSTRING_SET_RAW(n); |
1522 | 0 | sn->end = (UChar* )p; |
1523 | 0 | } |
1524 | 0 | } |
1525 | 0 | return n; |
1526 | 0 | } |
1527 | | |
1528 | | static int |
1529 | | str_node_can_be_split(StrNode* sn, OnigEncoding enc) |
1530 | 0 | { |
1531 | 0 | if (sn->end > sn->s) { |
1532 | 0 | return ((enclen(enc, sn->s, sn->end) < sn->end - sn->s) ? 1 : 0); |
1533 | 0 | } |
1534 | 0 | return 0; |
1535 | 0 | } |
1536 | | |
1537 | | #ifdef USE_PAD_TO_SHORT_BYTE_CHAR |
1538 | | static int |
1539 | | node_str_head_pad(StrNode* sn, int num, UChar val) |
1540 | | { |
1541 | | UChar buf[NODE_STR_BUF_SIZE]; |
1542 | | int i, len; |
1543 | | |
1544 | | len = sn->end - sn->s; |
1545 | | onig_strcpy(buf, sn->s, sn->end); |
1546 | | onig_strcpy(&(sn->s[num]), buf, buf + len); |
1547 | | sn->end += num; |
1548 | | |
1549 | | for (i = 0; i < num; i++) { |
1550 | | sn->s[i] = val; |
1551 | | } |
1552 | | } |
1553 | | #endif |
1554 | | |
1555 | | extern int |
1556 | | onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) |
1557 | 0 | { |
1558 | 0 | unsigned int num, val; |
1559 | 0 | OnigCodePoint c; |
1560 | 0 | UChar* p = *src; |
1561 | 0 | PFETCH_READY; |
1562 | |
|
1563 | 0 | num = 0; |
1564 | 0 | while (!PEND) { |
1565 | 0 | PFETCH(c); |
1566 | 0 | if (ONIGENC_IS_CODE_DIGIT(enc, c)) { |
1567 | 0 | val = (unsigned int )DIGITVAL(c); |
1568 | 0 | if ((INT_MAX_LIMIT - val) / 10UL < num) |
1569 | 0 | return -1; /* overflow */ |
1570 | | |
1571 | 0 | num = num * 10 + val; |
1572 | 0 | } |
1573 | 0 | else { |
1574 | 0 | PUNFETCH; |
1575 | 0 | break; |
1576 | 0 | } |
1577 | 0 | } |
1578 | 0 | *src = p; |
1579 | 0 | return num; |
1580 | 0 | } |
1581 | | |
1582 | | static int |
1583 | | scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen, |
1584 | | int maxlen, OnigEncoding enc) |
1585 | 0 | { |
1586 | 0 | OnigCodePoint c; |
1587 | 0 | unsigned int num, val; |
1588 | 0 | int restlen; |
1589 | 0 | UChar* p = *src; |
1590 | 0 | PFETCH_READY; |
1591 | |
|
1592 | 0 | restlen = maxlen - minlen; |
1593 | 0 | num = 0; |
1594 | 0 | while (!PEND && maxlen-- != 0) { |
1595 | 0 | PFETCH(c); |
1596 | 0 | if (ONIGENC_IS_CODE_XDIGIT(enc, c)) { |
1597 | 0 | val = (unsigned int )XDIGITVAL(enc,c); |
1598 | 0 | if ((INT_MAX_LIMIT - val) / 16UL < num) |
1599 | 0 | return -1; /* overflow */ |
1600 | | |
1601 | 0 | num = (num << 4) + XDIGITVAL(enc,c); |
1602 | 0 | } |
1603 | 0 | else { |
1604 | 0 | PUNFETCH; |
1605 | 0 | maxlen++; |
1606 | 0 | break; |
1607 | 0 | } |
1608 | 0 | } |
1609 | 0 | if (maxlen > restlen) |
1610 | 0 | return -2; /* not enough digits */ |
1611 | 0 | *src = p; |
1612 | 0 | return num; |
1613 | 0 | } |
1614 | | |
1615 | | static int |
1616 | | scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen, |
1617 | | OnigEncoding enc) |
1618 | 0 | { |
1619 | 0 | OnigCodePoint c; |
1620 | 0 | unsigned int num, val; |
1621 | 0 | UChar* p = *src; |
1622 | 0 | PFETCH_READY; |
1623 | |
|
1624 | 0 | num = 0; |
1625 | 0 | while (!PEND && maxlen-- != 0) { |
1626 | 0 | PFETCH(c); |
1627 | 0 | if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') { |
1628 | 0 | val = ODIGITVAL(c); |
1629 | 0 | if ((INT_MAX_LIMIT - val) / 8UL < num) |
1630 | 0 | return -1; /* overflow */ |
1631 | | |
1632 | 0 | num = (num << 3) + val; |
1633 | 0 | } |
1634 | 0 | else { |
1635 | 0 | PUNFETCH; |
1636 | 0 | break; |
1637 | 0 | } |
1638 | 0 | } |
1639 | 0 | *src = p; |
1640 | 0 | return num; |
1641 | 0 | } |
1642 | | |
1643 | | |
/* Write the SIZE_CODE_POINT bytes of `code` into `bbuf` at byte offset
 * `pos`. `code` must be an lvalue because its address is taken. */
#define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
    BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
1646 | | |
1647 | | /* data format: |
1648 | | [n][from-1][to-1][from-2][to-2] ... [from-n][to-n] |
1649 | | (all data size is OnigCodePoint) |
1650 | | */ |
1651 | | static int |
1652 | | new_code_range(BBuf** pbuf) |
1653 | 38.0k | { |
1654 | 38.0k | #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5) |
1655 | 38.0k | int r; |
1656 | 38.0k | OnigCodePoint n; |
1657 | 38.0k | BBuf* bbuf; |
1658 | | |
1659 | 38.0k | bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf)); |
1660 | 38.0k | CHECK_NULL_RETURN_MEMERR(*pbuf); |
1661 | 38.0k | r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE); |
1662 | 38.0k | if (r) return r; |
1663 | | |
1664 | 38.0k | n = 0; |
1665 | 38.0k | BBUF_WRITE_CODE_POINT(bbuf, 0, n); |
1666 | 38.0k | return 0; |
1667 | 38.0k | } |
1668 | | |
/*
 * Insert the code point range [from, to] into the range buffer *pbuf,
 * merging it with the existing ranges it overlaps or touches.
 *
 * pbuf:     in/out; a new buffer is created with new_code_range() when
 *           *pbuf is NULL.
 * env:      only handed to CC_DUP_WARN().
 * from, to: range bounds; swapped first when from > to.
 * checkdup: when non-zero, CC_DUP_WARN() is invoked if the new range
 *           overlaps the existing range at index `low`.
 *
 * Returns 0 on success, the error from new_code_range(), or
 * ONIGERR_TOO_MANY_MULTI_BYTE_RANGES when the range count would exceed
 * ONIG_MAX_MULTI_BYTE_RANGES_NUM.
 * NOTE(review): the BBUF_* macros presumably return from this function
 * on allocation failure -- confirm against their definitions.
 */
static int
add_code_range_to_buf0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to,
	int checkdup)
{
  int r, inc_n, pos;
  OnigCodePoint low, high, bound, x;
  OnigCodePoint n, *data;
  BBuf* bbuf;

  /* normalise so that from <= to */
  if (from > to) {
    n = from; from = to; to = n;
  }

  /* fetch (or create) the buffer and read its current range count n */
  if (IS_NULL(*pbuf)) {
    r = new_code_range(pbuf);
    if (r) return r;
    bbuf = *pbuf;
    n = 0;
  }
  else {
    bbuf = *pbuf;
    GET_CODE_POINT(n, bbuf->p);
  }
  /* data[] addresses the (from, to) pairs that follow the count */
  data = (OnigCodePoint* )(bbuf->p);
  data++;

  /* binary search: low = first range whose `to` is >= from - 1.
   * With from == 0 the search is skipped (from - 1 would wrap). */
  bound = (from == 0) ? 0 : n;
  for (low = 0; low < bound; ) {
    x = (low + bound) >> 1;
    if (from - 1 > data[x*2 + 1])
      low = x + 1;
    else
      bound = x;
  }

  /* binary search: high = first range at or after low whose `from` is
   * > to + 1. With to == ONIG_LAST_CODE_POINT it is n (to + 1 would
   * wrap). */
  high = (to == ONIG_LAST_CODE_POINT) ? n : low;
  for (bound = n; high < bound; ) {
    x = (high + bound) >> 1;
    if (to + 1 >= data[x*2])
      high = x + 1;
    else
      bound = x;
  }
  /* data[(low-1)*2+1] << from <= data[low*2]
   * data[(high-1)*2+1] <= to << data[high*2]
   */

  /* ranges low .. high-1 are replaced by one range: net count change */
  inc_n = low + 1 - high;
  if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
    return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;

  /* at least one existing range is absorbed: widen [from, to] to cover
   * the first and last absorbed ranges */
  if (inc_n != 1) {
    if (checkdup && from <= data[low*2+1]
	&& (data[low*2] <= from || data[low*2+1] <= to))
      CC_DUP_WARN(env, from, to);
    if (from > data[low*2])
      from = data[low*2];
    if (to < data[(high - 1)*2 + 1])
      to = data[(high - 1)*2 + 1];
  }

  /* the count changes: shift the ranges from index `high` onward so
   * they start right after slot `low` */
  if (inc_n != 0) {
    int from_pos = SIZE_CODE_POINT * (1 + high * 2);
    int to_pos   = SIZE_CODE_POINT * (1 + (low + 1) * 2);

    if (inc_n > 0) {
      if (high < n) {
	int size = (n - high) * 2 * SIZE_CODE_POINT;
	BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
      }
    }
    else {
      BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
    }
  }

  /* store the (possibly widened) range in slot `low`, then the count */
  pos = SIZE_CODE_POINT * (1 + low * 2);
  BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
  BBUF_WRITE_CODE_POINT(bbuf, pos, from);
  BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
  n += inc_n;
  BBUF_WRITE_CODE_POINT(bbuf, 0, n);

  return 0;
}
1754 | | |
1755 | | static int |
1756 | | add_code_range_to_buf(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) |
1757 | 948k | { |
1758 | 948k | return add_code_range_to_buf0(pbuf, env, from, to, 1); |
1759 | 948k | } |
1760 | | |
1761 | | static int |
1762 | | add_code_range0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to, int checkdup) |
1763 | 0 | { |
1764 | 0 | if (from > to) { |
1765 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) |
1766 | 0 | return 0; |
1767 | 0 | else |
1768 | 0 | return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; |
1769 | 0 | } |
1770 | | |
1771 | 0 | return add_code_range_to_buf0(pbuf, env, from, to, checkdup); |
1772 | 0 | } |
1773 | | |
1774 | | static int |
1775 | | add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) |
1776 | 0 | { |
1777 | 0 | return add_code_range0(pbuf, env, from, to, 1); |
1778 | 0 | } |
1779 | | |
1780 | | static int |
1781 | | not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf, ScanEnv* env) |
1782 | 0 | { |
1783 | 0 | int r, i, n; |
1784 | 0 | OnigCodePoint pre, from, *data, to = 0; |
1785 | |
|
1786 | 0 | *pbuf = (BBuf* )NULL; |
1787 | 0 | if (IS_NULL(bbuf)) { |
1788 | 0 | set_all: |
1789 | 0 | return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); |
1790 | 0 | } |
1791 | | |
1792 | 0 | data = (OnigCodePoint* )(bbuf->p); |
1793 | 0 | GET_CODE_POINT(n, data); |
1794 | 0 | data++; |
1795 | 0 | if (n <= 0) goto set_all; |
1796 | | |
1797 | 0 | r = 0; |
1798 | 0 | pre = MBCODE_START_POS(enc); |
1799 | 0 | for (i = 0; i < n; i++) { |
1800 | 0 | from = data[i*2]; |
1801 | 0 | to = data[i*2+1]; |
1802 | 0 | if (pre <= from - 1) { |
1803 | 0 | r = add_code_range_to_buf(pbuf, env, pre, from - 1); |
1804 | 0 | if (r != 0) return r; |
1805 | 0 | } |
1806 | 0 | if (to == ONIG_LAST_CODE_POINT) break; |
1807 | 0 | pre = to + 1; |
1808 | 0 | } |
1809 | 0 | if (to < ONIG_LAST_CODE_POINT) { |
1810 | 0 | r = add_code_range_to_buf(pbuf, env, to + 1, ONIG_LAST_CODE_POINT); |
1811 | 0 | } |
1812 | 0 | return r; |
1813 | 0 | } |
1814 | | |
/* Exchange two (buffer, negation flag) pairs in place.
 * All four arguments must be assignable lvalues; the body is wrapped in
 * do { } while (0) so it can be used as a single statement. */
#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *tbuf; \
  int  tnot; \
  tnot = not1;  not1  = not2;  not2  = tnot; \
  tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
} while (0)
1821 | | |
1822 | | static int |
1823 | | or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1, |
1824 | | BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env) |
1825 | 38.0k | { |
1826 | 38.0k | int r; |
1827 | 38.0k | OnigCodePoint i, n1, *data1; |
1828 | 38.0k | OnigCodePoint from, to; |
1829 | | |
1830 | 38.0k | *pbuf = (BBuf* )NULL; |
1831 | 38.0k | if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) { |
1832 | 38.0k | if (not1 != 0 || not2 != 0) |
1833 | 0 | return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); |
1834 | 38.0k | return 0; |
1835 | 38.0k | } |
1836 | | |
1837 | 0 | r = 0; |
1838 | 0 | if (IS_NULL(bbuf2)) |
1839 | 0 | SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); |
1840 | |
|
1841 | 0 | if (IS_NULL(bbuf1)) { |
1842 | 0 | if (not1 != 0) { |
1843 | 0 | return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); |
1844 | 0 | } |
1845 | 0 | else { |
1846 | 0 | if (not2 == 0) { |
1847 | 0 | return bbuf_clone(pbuf, bbuf2); |
1848 | 0 | } |
1849 | 0 | else { |
1850 | 0 | return not_code_range_buf(enc, bbuf2, pbuf, env); |
1851 | 0 | } |
1852 | 0 | } |
1853 | 0 | } |
1854 | | |
1855 | 0 | if (not1 != 0) |
1856 | 0 | SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); |
1857 | |
|
1858 | 0 | data1 = (OnigCodePoint* )(bbuf1->p); |
1859 | 0 | GET_CODE_POINT(n1, data1); |
1860 | 0 | data1++; |
1861 | |
|
1862 | 0 | if (not2 == 0 && not1 == 0) { /* 1 OR 2 */ |
1863 | 0 | r = bbuf_clone(pbuf, bbuf2); |
1864 | 0 | } |
1865 | 0 | else if (not1 == 0) { /* 1 OR (not 2) */ |
1866 | 0 | r = not_code_range_buf(enc, bbuf2, pbuf, env); |
1867 | 0 | } |
1868 | 0 | if (r != 0) return r; |
1869 | | |
1870 | 0 | for (i = 0; i < n1; i++) { |
1871 | 0 | from = data1[i*2]; |
1872 | 0 | to = data1[i*2+1]; |
1873 | 0 | r = add_code_range_to_buf(pbuf, env, from, to); |
1874 | 0 | if (r != 0) return r; |
1875 | 0 | } |
1876 | 0 | return 0; |
1877 | 0 | } |
1878 | | |
1879 | | static int |
1880 | | and_code_range1(BBuf** pbuf, ScanEnv* env, OnigCodePoint from1, OnigCodePoint to1, |
1881 | | OnigCodePoint* data, int n) |
1882 | 0 | { |
1883 | 0 | int i, r; |
1884 | 0 | OnigCodePoint from2, to2; |
1885 | |
|
1886 | 0 | for (i = 0; i < n; i++) { |
1887 | 0 | from2 = data[i*2]; |
1888 | 0 | to2 = data[i*2+1]; |
1889 | 0 | if (from2 < from1) { |
1890 | 0 | if (to2 < from1) continue; |
1891 | 0 | else { |
1892 | 0 | from1 = to2 + 1; |
1893 | 0 | } |
1894 | 0 | } |
1895 | 0 | else if (from2 <= to1) { |
1896 | 0 | if (to2 < to1) { |
1897 | 0 | if (from1 <= from2 - 1) { |
1898 | 0 | r = add_code_range_to_buf(pbuf, env, from1, from2-1); |
1899 | 0 | if (r != 0) return r; |
1900 | 0 | } |
1901 | 0 | from1 = to2 + 1; |
1902 | 0 | } |
1903 | 0 | else { |
1904 | 0 | to1 = from2 - 1; |
1905 | 0 | } |
1906 | 0 | } |
1907 | 0 | else { |
1908 | 0 | from1 = from2; |
1909 | 0 | } |
1910 | 0 | if (from1 > to1) break; |
1911 | 0 | } |
1912 | 0 | if (from1 <= to1) { |
1913 | 0 | r = add_code_range_to_buf(pbuf, env, from1, to1); |
1914 | 0 | if (r != 0) return r; |
1915 | 0 | } |
1916 | 0 | return 0; |
1917 | 0 | } |
1918 | | |
1919 | | static int |
1920 | | and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env) |
1921 | 38.0k | { |
1922 | 38.0k | int r; |
1923 | 38.0k | OnigCodePoint i, j, n1, n2, *data1, *data2; |
1924 | 38.0k | OnigCodePoint from, to, from1, to1, from2, to2; |
1925 | | |
1926 | 38.0k | *pbuf = (BBuf* )NULL; |
1927 | 38.0k | if (IS_NULL(bbuf1)) { |
1928 | 0 | if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */ |
1929 | 0 | return bbuf_clone(pbuf, bbuf2); |
1930 | 0 | return 0; |
1931 | 0 | } |
1932 | 38.0k | else if (IS_NULL(bbuf2)) { |
1933 | 38.0k | if (not2 != 0) |
1934 | 0 | return bbuf_clone(pbuf, bbuf1); |
1935 | 38.0k | return 0; |
1936 | 38.0k | } |
1937 | | |
1938 | 0 | if (not1 != 0) |
1939 | 0 | SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); |
1940 | |
|
1941 | 0 | data1 = (OnigCodePoint* )(bbuf1->p); |
1942 | 0 | data2 = (OnigCodePoint* )(bbuf2->p); |
1943 | 0 | GET_CODE_POINT(n1, data1); |
1944 | 0 | GET_CODE_POINT(n2, data2); |
1945 | 0 | data1++; |
1946 | 0 | data2++; |
1947 | |
|
1948 | 0 | if (not2 == 0 && not1 == 0) { /* 1 AND 2 */ |
1949 | 0 | for (i = 0; i < n1; i++) { |
1950 | 0 | from1 = data1[i*2]; |
1951 | 0 | to1 = data1[i*2+1]; |
1952 | 0 | for (j = 0; j < n2; j++) { |
1953 | 0 | from2 = data2[j*2]; |
1954 | 0 | to2 = data2[j*2+1]; |
1955 | 0 | if (from2 > to1) break; |
1956 | 0 | if (to2 < from1) continue; |
1957 | 0 | from = MAX(from1, from2); |
1958 | 0 | to = MIN(to1, to2); |
1959 | 0 | r = add_code_range_to_buf(pbuf, env, from, to); |
1960 | 0 | if (r != 0) return r; |
1961 | 0 | } |
1962 | 0 | } |
1963 | 0 | } |
1964 | 0 | else if (not1 == 0) { /* 1 AND (not 2) */ |
1965 | 0 | for (i = 0; i < n1; i++) { |
1966 | 0 | from1 = data1[i*2]; |
1967 | 0 | to1 = data1[i*2+1]; |
1968 | 0 | r = and_code_range1(pbuf, env, from1, to1, data2, n2); |
1969 | 0 | if (r != 0) return r; |
1970 | 0 | } |
1971 | 0 | } |
1972 | | |
1973 | 0 | return 0; |
1974 | 0 | } |
1975 | | |
1976 | | static int |
1977 | | and_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env) |
1978 | 38.0k | { |
1979 | 38.0k | OnigEncoding enc = env->enc; |
1980 | 38.0k | int r, not1, not2; |
1981 | 38.0k | BBuf *buf1, *buf2, *pbuf = 0; |
1982 | 38.0k | BitSetRef bsr1, bsr2; |
1983 | 38.0k | BitSet bs1, bs2; |
1984 | | |
1985 | 38.0k | not1 = IS_NCCLASS_NOT(dest); |
1986 | 38.0k | bsr1 = dest->bs; |
1987 | 38.0k | buf1 = dest->mbuf; |
1988 | 38.0k | not2 = IS_NCCLASS_NOT(cc); |
1989 | 38.0k | bsr2 = cc->bs; |
1990 | 38.0k | buf2 = cc->mbuf; |
1991 | | |
1992 | 38.0k | if (not1 != 0) { |
1993 | 0 | bitset_invert_to(bsr1, bs1); |
1994 | 0 | bsr1 = bs1; |
1995 | 0 | } |
1996 | 38.0k | if (not2 != 0) { |
1997 | 0 | bitset_invert_to(bsr2, bs2); |
1998 | 0 | bsr2 = bs2; |
1999 | 0 | } |
2000 | 38.0k | bitset_and(bsr1, bsr2); |
2001 | 38.0k | if (bsr1 != dest->bs) { |
2002 | 0 | bitset_copy(dest->bs, bsr1); |
2003 | 0 | bsr1 = dest->bs; |
2004 | 0 | } |
2005 | 38.0k | if (not1 != 0) { |
2006 | 0 | bitset_invert(dest->bs); |
2007 | 0 | } |
2008 | | |
2009 | 38.0k | if (! ONIGENC_IS_SINGLEBYTE(enc)) { |
2010 | 38.0k | if (not1 != 0 && not2 != 0) { |
2011 | 0 | r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf, env); |
2012 | 0 | } |
2013 | 38.0k | else { |
2014 | 38.0k | r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf, env); |
2015 | 38.0k | if (r == 0 && not1 != 0) { |
2016 | 0 | BBuf *tbuf = 0; |
2017 | 0 | r = not_code_range_buf(enc, pbuf, &tbuf, env); |
2018 | 0 | bbuf_free(pbuf); |
2019 | 0 | pbuf = tbuf; |
2020 | 0 | } |
2021 | 38.0k | } |
2022 | 38.0k | if (r != 0) { |
2023 | 0 | bbuf_free(pbuf); |
2024 | 0 | return r; |
2025 | 0 | } |
2026 | | |
2027 | 38.0k | dest->mbuf = pbuf; |
2028 | 38.0k | bbuf_free(buf1); |
2029 | 38.0k | return r; |
2030 | 38.0k | } |
2031 | 0 | return 0; |
2032 | 38.0k | } |
2033 | | |
2034 | | static int |
2035 | | or_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env) |
2036 | 38.0k | { |
2037 | 38.0k | OnigEncoding enc = env->enc; |
2038 | 38.0k | int r, not1, not2; |
2039 | 38.0k | BBuf *buf1, *buf2, *pbuf = 0; |
2040 | 38.0k | BitSetRef bsr1, bsr2; |
2041 | 38.0k | BitSet bs1, bs2; |
2042 | | |
2043 | 38.0k | not1 = IS_NCCLASS_NOT(dest); |
2044 | 38.0k | bsr1 = dest->bs; |
2045 | 38.0k | buf1 = dest->mbuf; |
2046 | 38.0k | not2 = IS_NCCLASS_NOT(cc); |
2047 | 38.0k | bsr2 = cc->bs; |
2048 | 38.0k | buf2 = cc->mbuf; |
2049 | | |
2050 | 38.0k | if (not1 != 0) { |
2051 | 0 | bitset_invert_to(bsr1, bs1); |
2052 | 0 | bsr1 = bs1; |
2053 | 0 | } |
2054 | 38.0k | if (not2 != 0) { |
2055 | 0 | bitset_invert_to(bsr2, bs2); |
2056 | 0 | bsr2 = bs2; |
2057 | 0 | } |
2058 | 38.0k | bitset_or(bsr1, bsr2); |
2059 | 38.0k | if (bsr1 != dest->bs) { |
2060 | 0 | bitset_copy(dest->bs, bsr1); |
2061 | 0 | bsr1 = dest->bs; |
2062 | 0 | } |
2063 | 38.0k | if (not1 != 0) { |
2064 | 0 | bitset_invert(dest->bs); |
2065 | 0 | } |
2066 | | |
2067 | 38.0k | if (! ONIGENC_IS_SINGLEBYTE(enc)) { |
2068 | 38.0k | if (not1 != 0 && not2 != 0) { |
2069 | 0 | r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf, env); |
2070 | 0 | } |
2071 | 38.0k | else { |
2072 | 38.0k | r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf, env); |
2073 | 38.0k | if (r == 0 && not1 != 0) { |
2074 | 0 | BBuf *tbuf = 0; |
2075 | 0 | r = not_code_range_buf(enc, pbuf, &tbuf, env); |
2076 | 0 | bbuf_free(pbuf); |
2077 | 0 | pbuf = tbuf; |
2078 | 0 | } |
2079 | 38.0k | } |
2080 | 38.0k | if (r != 0) { |
2081 | 0 | bbuf_free(pbuf); |
2082 | 0 | return r; |
2083 | 0 | } |
2084 | | |
2085 | 38.0k | dest->mbuf = pbuf; |
2086 | 38.0k | bbuf_free(buf1); |
2087 | 38.0k | return r; |
2088 | 38.0k | } |
2089 | 0 | else |
2090 | 0 | return 0; |
2091 | 38.0k | } |
2092 | | |
2093 | | static void UNKNOWN_ESC_WARN(ScanEnv *env, int c); |
2094 | | |
2095 | | static OnigCodePoint |
2096 | | conv_backslash_value(OnigCodePoint c, ScanEnv* env) |
2097 | 52.7k | { |
2098 | 52.7k | if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) { |
2099 | 52.7k | switch (c) { |
2100 | 5.85k | case 'n': return '\n'; |
2101 | 23.4k | case 't': return '\t'; |
2102 | 5.85k | case 'r': return '\r'; |
2103 | 0 | case 'f': return '\f'; |
2104 | 0 | case 'a': return '\007'; |
2105 | 0 | case 'b': return '\010'; |
2106 | 0 | case 'e': return '\033'; |
2107 | 0 | case 'v': |
2108 | 0 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB)) |
2109 | 0 | return '\v'; |
2110 | 0 | break; |
2111 | | |
2112 | 17.5k | default: |
2113 | 17.5k | if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) |
2114 | 0 | UNKNOWN_ESC_WARN(env, c); |
2115 | 17.5k | break; |
2116 | 52.7k | } |
2117 | 52.7k | } |
2118 | 17.5k | return c; |
2119 | 52.7k | } |
2120 | | |
2121 | | #ifdef USE_NO_INVALID_QUANTIFIER |
2122 | 87.8k | # define is_invalid_quantifier_target(node) 0 |
2123 | | #else |
2124 | | static int |
2125 | | is_invalid_quantifier_target(Node* node) |
2126 | | { |
2127 | | switch (NTYPE(node)) { |
2128 | | case NT_ANCHOR: |
2129 | | return 1; |
2130 | | break; |
2131 | | |
2132 | | case NT_ENCLOSE: |
2133 | | /* allow enclosed elements */ |
2134 | | /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */ |
2135 | | break; |
2136 | | |
2137 | | case NT_LIST: |
2138 | | do { |
2139 | | if (! is_invalid_quantifier_target(NCAR(node))) return 0; |
2140 | | } while (IS_NOT_NULL(node = NCDR(node))); |
2141 | | return 0; |
2142 | | break; |
2143 | | |
2144 | | case NT_ALT: |
2145 | | do { |
2146 | | if (is_invalid_quantifier_target(NCAR(node))) return 1; |
2147 | | } while (IS_NOT_NULL(node = NCDR(node))); |
2148 | | break; |
2149 | | |
2150 | | default: |
2151 | | break; |
2152 | | } |
2153 | | return 0; |
2154 | | } |
2155 | | #endif |
2156 | | |
2157 | | /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */ |
2158 | | static int |
2159 | | popular_quantifier_num(QtfrNode* q) |
2160 | 0 | { |
2161 | 0 | if (q->greedy) { |
2162 | 0 | if (q->lower == 0) { |
2163 | 0 | if (q->upper == 1) return 0; |
2164 | 0 | else if (IS_REPEAT_INFINITE(q->upper)) return 1; |
2165 | 0 | } |
2166 | 0 | else if (q->lower == 1) { |
2167 | 0 | if (IS_REPEAT_INFINITE(q->upper)) return 2; |
2168 | 0 | } |
2169 | 0 | } |
2170 | 0 | else { |
2171 | 0 | if (q->lower == 0) { |
2172 | 0 | if (q->upper == 1) return 3; |
2173 | 0 | else if (IS_REPEAT_INFINITE(q->upper)) return 4; |
2174 | 0 | } |
2175 | 0 | else if (q->lower == 1) { |
2176 | 0 | if (IS_REPEAT_INFINITE(q->upper)) return 5; |
2177 | 0 | } |
2178 | 0 | } |
2179 | 0 | return -1; |
2180 | 0 | } |
2181 | | |
2182 | | |
/* Rewrite to apply when one popular quantifier directly wraps another. */
enum ReduceType {
  RQ_ASIS = 0,  /* as is */
  RQ_DEL  = 1,  /* delete parent */
  RQ_A    = 2,  /* to '*' */
  RQ_AQ   = 3,  /* to '*?' */
  RQ_QQ   = 4,  /* to '??' */
  RQ_P_QQ = 5,  /* to '+)??' */
  RQ_PQ_Q = 6   /* to '+?)?' */
};
2192 | | |
2193 | | static enum ReduceType const ReduceTypeTable[6][6] = { |
2194 | | /* '?', '*', '+', '??', '*?', '+?' p / c */ |
2195 | | {RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* '?' */ |
2196 | | {RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* '*' */ |
2197 | | {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */ |
2198 | | {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */ |
2199 | | {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */ |
2200 | | {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */ |
2201 | | }; |
2202 | | |
2203 | | extern void |
2204 | | onig_reduce_nested_quantifier(Node* pnode, Node* cnode) |
2205 | 0 | { |
2206 | 0 | int pnum, cnum; |
2207 | 0 | QtfrNode *p, *c; |
2208 | |
|
2209 | 0 | p = NQTFR(pnode); |
2210 | 0 | c = NQTFR(cnode); |
2211 | 0 | pnum = popular_quantifier_num(p); |
2212 | 0 | cnum = popular_quantifier_num(c); |
2213 | 0 | if (pnum < 0 || cnum < 0) return ; |
2214 | | |
2215 | 0 | switch (ReduceTypeTable[cnum][pnum]) { |
2216 | 0 | case RQ_DEL: |
2217 | 0 | *pnode = *cnode; |
2218 | 0 | break; |
2219 | 0 | case RQ_A: |
2220 | 0 | p->target = c->target; |
2221 | 0 | p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1; |
2222 | 0 | break; |
2223 | 0 | case RQ_AQ: |
2224 | 0 | p->target = c->target; |
2225 | 0 | p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0; |
2226 | 0 | break; |
2227 | 0 | case RQ_QQ: |
2228 | 0 | p->target = c->target; |
2229 | 0 | p->lower = 0; p->upper = 1; p->greedy = 0; |
2230 | 0 | break; |
2231 | 0 | case RQ_P_QQ: |
2232 | 0 | p->target = cnode; |
2233 | 0 | p->lower = 0; p->upper = 1; p->greedy = 0; |
2234 | 0 | c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1; |
2235 | 0 | return ; |
2236 | 0 | break; |
2237 | 0 | case RQ_PQ_Q: |
2238 | 0 | p->target = cnode; |
2239 | 0 | p->lower = 0; p->upper = 1; p->greedy = 1; |
2240 | 0 | c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0; |
2241 | 0 | return ; |
2242 | 0 | break; |
2243 | 0 | case RQ_ASIS: |
2244 | 0 | p->target = cnode; |
2245 | 0 | return ; |
2246 | 0 | break; |
2247 | 0 | } |
2248 | | |
2249 | 0 | c->target = NULL_NODE; |
2250 | 0 | onig_node_free(cnode); |
2251 | 0 | } |
2252 | | |
2253 | | |
/* Token kinds stored in OnigToken.type. */
enum TokenSyms {
  TK_EOT      = 0,              /* end of token */
  TK_RAW_BYTE = 1,
  TK_CHAR,
  TK_STRING,
  TK_CODE_POINT,
  TK_ANYCHAR,
  TK_CHAR_TYPE,
  TK_BACKREF,
  TK_CALL,
  TK_ANCHOR,
  TK_OP_REPEAT,
  TK_INTERVAL,
  TK_ANYCHAR_ANYTIME,           /* SQL '%' == .* */
  TK_ALT,
  TK_SUBEXP_OPEN,
  TK_SUBEXP_CLOSE,
  TK_CC_OPEN,
  TK_QUOTE_OPEN,
  TK_CHAR_PROPERTY,             /* \p{...}, \P{...} */
  TK_LINEBREAK,
  TK_EXTENDED_GRAPHEME_CLUSTER,
  TK_KEEP,

  /* in cc */
  TK_CC_CLOSE,
  TK_CC_RANGE,
  TK_POSIX_BRACKET_OPEN,
  TK_CC_AND,                    /* && */
  TK_CC_CC_OPEN                 /* [ */
};
2284 | | |
2285 | | typedef struct { |
2286 | | enum TokenSyms type; |
2287 | | int escaped; |
2288 | | int base; /* is number: 8, 16 (used in [....]) */ |
2289 | | UChar* backp; |
2290 | | union { |
2291 | | UChar* s; |
2292 | | int c; |
2293 | | OnigCodePoint code; |
2294 | | struct { |
2295 | | int subtype; |
2296 | | int ascii_range; |
2297 | | } anchor; |
2298 | | struct { |
2299 | | int lower; |
2300 | | int upper; |
2301 | | int greedy; |
2302 | | int possessive; |
2303 | | } repeat; |
2304 | | struct { |
2305 | | int num; |
2306 | | int ref1; |
2307 | | int* refs; |
2308 | | int by_name; |
2309 | | #ifdef USE_BACKREF_WITH_LEVEL |
2310 | | int exist_level; |
2311 | | int level; /* \k<name+n> */ |
2312 | | #endif |
2313 | | } backref; |
2314 | | struct { |
2315 | | UChar* name; |
2316 | | UChar* name_end; |
2317 | | int gnum; |
2318 | | int rel; |
2319 | | } call; |
2320 | | struct { |
2321 | | int ctype; |
2322 | | int not; |
2323 | | } prop; |
2324 | | } u; |
2325 | | } OnigToken; |
2326 | | |
2327 | | |
2328 | | static int |
2329 | | fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) |
2330 | 0 | { |
2331 | 0 | int low, up, syn_allow, non_low = 0; |
2332 | 0 | int r = 0; |
2333 | 0 | OnigCodePoint c; |
2334 | 0 | OnigEncoding enc = env->enc; |
2335 | 0 | UChar* p = *src; |
2336 | 0 | PFETCH_READY; |
2337 | |
|
2338 | 0 | syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL); |
2339 | |
|
2340 | 0 | if (PEND) { |
2341 | 0 | if (syn_allow) |
2342 | 0 | return 1; /* "....{" : OK! */ |
2343 | 0 | else |
2344 | 0 | return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */ |
2345 | 0 | } |
2346 | | |
2347 | 0 | if (! syn_allow) { |
2348 | 0 | c = PPEEK; |
2349 | 0 | if (c == ')' || c == '(' || c == '|') { |
2350 | 0 | return ONIGERR_END_PATTERN_AT_LEFT_BRACE; |
2351 | 0 | } |
2352 | 0 | } |
2353 | | |
2354 | 0 | low = onig_scan_unsigned_number(&p, end, env->enc); |
2355 | 0 | if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; |
2356 | 0 | if (low > ONIG_MAX_REPEAT_NUM) |
2357 | 0 | return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; |
2358 | | |
2359 | 0 | if (p == *src) { /* can't read low */ |
2360 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) { |
2361 | | /* allow {,n} as {0,n} */ |
2362 | 0 | low = 0; |
2363 | 0 | non_low = 1; |
2364 | 0 | } |
2365 | 0 | else |
2366 | 0 | goto invalid; |
2367 | 0 | } |
2368 | | |
2369 | 0 | if (PEND) goto invalid; |
2370 | 0 | PFETCH(c); |
2371 | 0 | if (c == ',') { |
2372 | 0 | UChar* prev = p; |
2373 | 0 | up = onig_scan_unsigned_number(&p, end, env->enc); |
2374 | 0 | if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; |
2375 | 0 | if (up > ONIG_MAX_REPEAT_NUM) |
2376 | 0 | return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; |
2377 | | |
2378 | 0 | if (p == prev) { |
2379 | 0 | if (non_low != 0) |
2380 | 0 | goto invalid; |
2381 | 0 | up = REPEAT_INFINITE; /* {n,} : {n,infinite} */ |
2382 | 0 | } |
2383 | 0 | } |
2384 | 0 | else { |
2385 | 0 | if (non_low != 0) |
2386 | 0 | goto invalid; |
2387 | | |
2388 | 0 | PUNFETCH; |
2389 | 0 | up = low; /* {n} : exact n times */ |
2390 | 0 | r = 2; /* fixed */ |
2391 | 0 | } |
2392 | | |
2393 | 0 | if (PEND) goto invalid; |
2394 | 0 | PFETCH(c); |
2395 | 0 | if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) { |
2396 | 0 | if (c != MC_ESC(env->syntax)) goto invalid; |
2397 | 0 | if (PEND) goto invalid; |
2398 | 0 | PFETCH(c); |
2399 | 0 | } |
2400 | 0 | if (c != '}') goto invalid; |
2401 | | |
2402 | 0 | if (!IS_REPEAT_INFINITE(up) && low > up) { |
2403 | 0 | return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE; |
2404 | 0 | } |
2405 | | |
2406 | 0 | tok->type = TK_INTERVAL; |
2407 | 0 | tok->u.repeat.lower = low; |
2408 | 0 | tok->u.repeat.upper = up; |
2409 | 0 | *src = p; |
2410 | 0 | return r; /* 0: normal {n,m}, 2: fixed {n} */ |
2411 | | |
2412 | 0 | invalid: |
2413 | 0 | if (syn_allow) |
2414 | 0 | return 1; /* OK */ |
2415 | 0 | else |
2416 | 0 | return ONIGERR_INVALID_REPEAT_RANGE_PATTERN; |
2417 | 0 | } |
2418 | | |
2419 | | /* \M-, \C-, \c, or \... */ |
2420 | | static int |
2421 | | fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) |
2422 | 52.7k | { |
2423 | 52.7k | int v; |
2424 | 52.7k | OnigCodePoint c; |
2425 | 52.7k | OnigEncoding enc = env->enc; |
2426 | 52.7k | UChar* p = *src; |
2427 | | |
2428 | 52.7k | if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; |
2429 | | |
2430 | 52.7k | PFETCH_S(c); |
2431 | 52.7k | switch (c) { |
2432 | 0 | case 'M': |
2433 | 0 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) { |
2434 | 0 | if (PEND) return ONIGERR_END_PATTERN_AT_META; |
2435 | 0 | PFETCH_S(c); |
2436 | 0 | if (c != '-') return ONIGERR_META_CODE_SYNTAX; |
2437 | 0 | if (PEND) return ONIGERR_END_PATTERN_AT_META; |
2438 | 0 | PFETCH_S(c); |
2439 | 0 | if (c == MC_ESC(env->syntax)) { |
2440 | 0 | v = fetch_escaped_value(&p, end, env, &c); |
2441 | 0 | if (v < 0) return v; |
2442 | 0 | } |
2443 | 0 | c = ((c & 0xff) | 0x80); |
2444 | 0 | } |
2445 | 0 | else |
2446 | 0 | goto backslash; |
2447 | 0 | break; |
2448 | | |
2449 | 0 | case 'C': |
2450 | 0 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) { |
2451 | 0 | if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; |
2452 | 0 | PFETCH_S(c); |
2453 | 0 | if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX; |
2454 | 0 | goto control; |
2455 | 0 | } |
2456 | 0 | else |
2457 | 0 | goto backslash; |
2458 | | |
2459 | 0 | case 'c': |
2460 | 0 | if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) { |
2461 | 0 | control: |
2462 | 0 | if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; |
2463 | 0 | PFETCH_S(c); |
2464 | 0 | if (c == '?') { |
2465 | 0 | c = 0177; |
2466 | 0 | } |
2467 | 0 | else { |
2468 | 0 | if (c == MC_ESC(env->syntax)) { |
2469 | 0 | v = fetch_escaped_value(&p, end, env, &c); |
2470 | 0 | if (v < 0) return v; |
2471 | 0 | } |
2472 | 0 | c &= 0x9f; |
2473 | 0 | } |
2474 | 0 | break; |
2475 | 0 | } |
2476 | | /* fall through */ |
2477 | | |
2478 | 52.7k | default: |
2479 | 52.7k | { |
2480 | 52.7k | backslash: |
2481 | 52.7k | c = conv_backslash_value(c, env); |
2482 | 52.7k | } |
2483 | 52.7k | break; |
2484 | 52.7k | } |
2485 | | |
2486 | 52.7k | *src = p; |
2487 | 52.7k | *val = c; |
2488 | 52.7k | return 0; |
2489 | 52.7k | } |
2490 | | |
2491 | | static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env); |
2492 | | |
2493 | | static OnigCodePoint |
2494 | | get_name_end_code_point(OnigCodePoint start) |
2495 | 11.7k | { |
2496 | 11.7k | switch (start) { |
2497 | 11.7k | case '<': return (OnigCodePoint )'>'; break; |
2498 | 0 | case '\'': return (OnigCodePoint )'\''; break; |
2499 | 0 | case '(': return (OnigCodePoint )')'; break; |
2500 | 0 | case '{': return (OnigCodePoint )'}'; break; |
2501 | 0 | default: |
2502 | 0 | break; |
2503 | 11.7k | } |
2504 | | |
2505 | 0 | return (OnigCodePoint )0; |
2506 | 11.7k | } |
2507 | | |
2508 | | #ifdef USE_NAMED_GROUP |
/* Which code points may appear in a group name: word characters, or
   anything at all when built for Ruby. */
# ifndef RUBY
#  define ONIGENC_IS_CODE_NAME(enc, c)  ONIGENC_IS_CODE_WORD(enc, c)
# else
#  define ONIGENC_IS_CODE_NAME(enc, c)  TRUE
# endif
2514 | | |
# ifdef USE_BACKREF_WITH_LEVEL
/*
   \k<name+n>, \k<name-n>
   \k<num+n>,  \k<num-n>
   \k<-num+n>, \k<-num-n>

   Parse a back reference name or number with an optional "+n" / "-n"
   nesting level.  *src points just past the opening delimiter `start_code`.

   On success: *rname_end is set to the end of the name part, *src is moved
   past the closing delimiter, *rback_num receives the signed group number
   (0 for a named reference), and *rlevel is written only when a level was
   given.  Returns 1 if a level was present, 0 if not.

   On failure: returns a negative ONIGERR_* code.  Except for the
   empty-name and too-big-number cases, the offending text is also recorded
   with onig_scan_env_set_error_string().
*/
static int
fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
                      UChar** rname_end, ScanEnv* env,
                      int* rback_num, int* rlevel)
{
  int r = 0;
  int sign = 1;
  int is_num = 0;        /* 0: name, 1: digits, 2: '-' seen but no digit yet */
  int exist_level = 0;
  OnigCodePoint end_code;
  OnigCodePoint c = 0;
  OnigEncoding enc = env->enc;
  UChar *name_end = end;
  UChar *pnum_head = *src;
  UChar *p = *src;
  PFETCH_READY;

  *rback_num = 0;
  end_code = get_name_end_code_point(start_code);

  if (PEND)
    return ONIGERR_EMPTY_GROUP_NAME;

  /* first character decides between a name and a (signed) number */
  PFETCH(c);
  if (c == end_code)
    return ONIGERR_EMPTY_GROUP_NAME;

  if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
    is_num = 1;
  }
  else if (c == '-') {
    is_num = 2;
    sign = -1;
    pnum_head = p;
  }
  else if (!ONIGENC_IS_CODE_NAME(enc, c)) {
    r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
  }

  /* remaining characters, up to a delimiter or a level sign */
  while (!PEND) {
    name_end = p;
    PFETCH(c);
    if (c == end_code || c == ')' || c == '+' || c == '-') {
      if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
      break;
    }

    if (is_num == 0) {
      if (!ONIGENC_IS_CODE_NAME(enc, c))
        r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
    else if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
      is_num = 1;
    }
    else {
      r = ONIGERR_INVALID_GROUP_NAME;
      is_num = 0;
    }
  }

  if (r != 0 || c == end_code)
    goto end;

  if (c == '+' || c == '-') {
    int level;
    int flag = (c == '-' ? -1 : 1);

    if (PEND) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
      goto end;
    }
    PFETCH(c);
    if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
    PUNFETCH;
    level = onig_scan_unsigned_number(&p, end, enc);
    if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
    *rlevel = (level * flag);
    exist_level = 1;

    if (!PEND) {
      PFETCH(c);
      if (c == end_code)
        goto end;
    }
  }
  /* anything else that is not the closing delimiter is an error */

 err:
  r = ONIGERR_INVALID_GROUP_NAME;
  name_end = end;

 end:
  if (r != 0) {
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }

  if (is_num != 0) {
    *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
    if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
    if (*rback_num == 0) goto err;

    *rback_num *= sign;
  }

  *rname_end = name_end;
  *src = p;
  return (exist_level ? 1 : 0);
}
# endif /* USE_BACKREF_WITH_LEVEL */
2636 | | |
2637 | | /* |
2638 | | ref: 0 -> define name (don't allow number name) |
2639 | | 1 -> reference name (allow number name) |
2640 | | */ |
2641 | | static int |
2642 | | fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, |
2643 | | UChar** rname_end, ScanEnv* env, int* rback_num, int ref) |
2644 | 11.7k | { |
2645 | 11.7k | int r, is_num, sign; |
2646 | 11.7k | OnigCodePoint end_code; |
2647 | 11.7k | OnigCodePoint c = 0; |
2648 | 11.7k | OnigEncoding enc = env->enc; |
2649 | 11.7k | UChar *name_end; |
2650 | 11.7k | UChar *pnum_head; |
2651 | 11.7k | UChar *p = *src; |
2652 | | |
2653 | 11.7k | *rback_num = 0; |
2654 | | |
2655 | 11.7k | end_code = get_name_end_code_point(start_code); |
2656 | | |
2657 | 11.7k | name_end = end; |
2658 | 11.7k | pnum_head = *src; |
2659 | 11.7k | r = 0; |
2660 | 11.7k | is_num = 0; |
2661 | 11.7k | sign = 1; |
2662 | 11.7k | if (PEND) { |
2663 | 0 | return ONIGERR_EMPTY_GROUP_NAME; |
2664 | 0 | } |
2665 | 11.7k | else { |
2666 | 11.7k | PFETCH_S(c); |
2667 | 11.7k | if (c == end_code) |
2668 | 0 | return ONIGERR_EMPTY_GROUP_NAME; |
2669 | | |
2670 | 11.7k | if (ONIGENC_IS_CODE_DIGIT(enc, c)) { |
2671 | 0 | if (ref == 1) |
2672 | 0 | is_num = 1; |
2673 | 0 | else { |
2674 | 0 | r = ONIGERR_INVALID_GROUP_NAME; |
2675 | 0 | is_num = 0; |
2676 | 0 | } |
2677 | 0 | } |
2678 | 11.7k | else if (c == '-') { |
2679 | 0 | if (ref == 1) { |
2680 | 0 | is_num = 2; |
2681 | 0 | sign = -1; |
2682 | 0 | pnum_head = p; |
2683 | 0 | } |
2684 | 0 | else { |
2685 | 0 | r = ONIGERR_INVALID_GROUP_NAME; |
2686 | 0 | is_num = 0; |
2687 | 0 | } |
2688 | 0 | } |
2689 | 11.7k | else if (!ONIGENC_IS_CODE_NAME(enc, c)) { |
2690 | 0 | r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; |
2691 | 0 | } |
2692 | 11.7k | } |
2693 | | |
2694 | 11.7k | if (r == 0) { |
2695 | 43.9k | while (!PEND) { |
2696 | 43.9k | name_end = p; |
2697 | 43.9k | PFETCH_S(c); |
2698 | 43.9k | if (c == end_code || c == ')') { |
2699 | 11.7k | if (is_num == 2) { |
2700 | 0 | r = ONIGERR_INVALID_GROUP_NAME; |
2701 | 0 | goto teardown; |
2702 | 0 | } |
2703 | 11.7k | break; |
2704 | 11.7k | } |
2705 | | |
2706 | 32.2k | if (is_num != 0) { |
2707 | 0 | if (ONIGENC_IS_CODE_DIGIT(enc, c)) { |
2708 | 0 | is_num = 1; |
2709 | 0 | } |
2710 | 0 | else { |
2711 | 0 | if (!ONIGENC_IS_CODE_WORD(enc, c)) |
2712 | 0 | r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; |
2713 | 0 | else |
2714 | 0 | r = ONIGERR_INVALID_GROUP_NAME; |
2715 | 0 | goto teardown; |
2716 | 0 | } |
2717 | 0 | } |
2718 | 32.2k | else { |
2719 | 32.2k | if (!ONIGENC_IS_CODE_NAME(enc, c)) { |
2720 | 0 | r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; |
2721 | 0 | goto teardown; |
2722 | 0 | } |
2723 | 32.2k | } |
2724 | 32.2k | } |
2725 | | |
2726 | 11.7k | if (c != end_code) { |
2727 | 0 | r = ONIGERR_INVALID_GROUP_NAME; |
2728 | 0 | name_end = end; |
2729 | 0 | goto err; |
2730 | 0 | } |
2731 | | |
2732 | 11.7k | if (is_num != 0) { |
2733 | 0 | *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); |
2734 | 0 | if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; |
2735 | 0 | else if (*rback_num == 0) { |
2736 | 0 | r = ONIGERR_INVALID_GROUP_NAME; |
2737 | 0 | goto err; |
2738 | 0 | } |
2739 | | |
2740 | 0 | *rback_num *= sign; |
2741 | 0 | } |
2742 | | |
2743 | 11.7k | *rname_end = name_end; |
2744 | 11.7k | *src = p; |
2745 | 11.7k | return 0; |
2746 | 11.7k | } |
2747 | 0 | else { |
2748 | 0 | teardown: |
2749 | 0 | while (!PEND) { |
2750 | 0 | name_end = p; |
2751 | 0 | PFETCH_S(c); |
2752 | 0 | if (c == end_code || c == ')') |
2753 | 0 | break; |
2754 | 0 | } |
2755 | 0 | if (PEND) |
2756 | 0 | name_end = end; |
2757 | |
|
2758 | 0 | err: |
2759 | 0 | onig_scan_env_set_error_string(env, r, *src, name_end); |
2760 | 0 | return r; |
2761 | 0 | } |
2762 | 11.7k | } |
2763 | | #else |
2764 | | static int |
2765 | | fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, |
2766 | | UChar** rname_end, ScanEnv* env, int* rback_num, int ref) |
2767 | | { |
2768 | | int r, is_num, sign; |
2769 | | OnigCodePoint end_code; |
2770 | | OnigCodePoint c = 0; |
2771 | | UChar *name_end; |
2772 | | OnigEncoding enc = env->enc; |
2773 | | UChar *pnum_head; |
2774 | | UChar *p = *src; |
2775 | | PFETCH_READY; |
2776 | | |
2777 | | *rback_num = 0; |
2778 | | |
2779 | | end_code = get_name_end_code_point(start_code); |
2780 | | |
2781 | | *rname_end = name_end = end; |
2782 | | r = 0; |
2783 | | pnum_head = *src; |
2784 | | is_num = 0; |
2785 | | sign = 1; |
2786 | | |
2787 | | if (PEND) { |
2788 | | return ONIGERR_EMPTY_GROUP_NAME; |
2789 | | } |
2790 | | else { |
2791 | | PFETCH(c); |
2792 | | if (c == end_code) |
2793 | | return ONIGERR_EMPTY_GROUP_NAME; |
2794 | | |
2795 | | if (ONIGENC_IS_CODE_DIGIT(enc, c)) { |
2796 | | is_num = 1; |
2797 | | } |
2798 | | else if (c == '-') { |
2799 | | is_num = 2; |
2800 | | sign = -1; |
2801 | | pnum_head = p; |
2802 | | } |
2803 | | else { |
2804 | | r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; |
2805 | | } |
2806 | | } |
2807 | | |
2808 | | while (!PEND) { |
2809 | | name_end = p; |
2810 | | |
2811 | | PFETCH(c); |
2812 | | if (c == end_code || c == ')') break; |
2813 | | if (! ONIGENC_IS_CODE_DIGIT(enc, c)) |
2814 | | r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; |
2815 | | } |
2816 | | if (r == 0 && c != end_code) { |
2817 | | r = ONIGERR_INVALID_GROUP_NAME; |
2818 | | name_end = end; |
2819 | | } |
2820 | | |
2821 | | if (r == 0) { |
2822 | | *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); |
2823 | | if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; |
2824 | | else if (*rback_num == 0) { |
2825 | | r = ONIGERR_INVALID_GROUP_NAME; |
2826 | | goto err; |
2827 | | } |
2828 | | *rback_num *= sign; |
2829 | | |
2830 | | *rname_end = name_end; |
2831 | | *src = p; |
2832 | | return 0; |
2833 | | } |
2834 | | else { |
2835 | | err: |
2836 | | onig_scan_env_set_error_string(env, r, *src, name_end); |
2837 | | return r; |
2838 | | } |
2839 | | } |
2840 | | #endif /* USE_NAMED_GROUP */ |
2841 | | |
2842 | | |
2843 | | static void |
2844 | | onig_syntax_warn(ScanEnv *env, const char *fmt, ...) |
2845 | 0 | { |
2846 | 0 | va_list args; |
2847 | 0 | UChar buf[WARN_BUFSIZE]; |
2848 | 0 | va_start(args, fmt); |
2849 | 0 | onig_vsnprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, |
2850 | 0 | env->pattern, env->pattern_end, |
2851 | 0 | (const UChar *)fmt, args); |
2852 | 0 | va_end(args); |
2853 | | #ifdef RUBY |
2854 | | if (env->sourcefile == NULL) |
2855 | | rb_warn("%s", (char *)buf); |
2856 | | else |
2857 | | rb_compile_warn(env->sourcefile, env->sourceline, "%s", (char *)buf); |
2858 | | #else |
2859 | 0 | (*onig_warn)((char* )buf); |
2860 | 0 | #endif |
2861 | 0 | } |
2862 | | |
2863 | | static void |
2864 | | CC_ESC_WARN(ScanEnv *env, UChar *c) |
2865 | 0 | { |
2866 | 0 | if (onig_warn == onig_null_warn) return ; |
2867 | | |
2868 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) && |
2869 | 0 | IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) { |
2870 | 0 | onig_syntax_warn(env, "character class has '%s' without escape", c); |
2871 | 0 | } |
2872 | 0 | } |
2873 | | |
2874 | | static void |
2875 | | CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c) |
2876 | 0 | { |
2877 | 0 | if (onig_warn == onig_null_warn) return ; |
2878 | | |
2879 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) { |
2880 | 0 | onig_syntax_warn(env, "regular expression has '%s' without escape", c); |
2881 | 0 | } |
2882 | 0 | } |
2883 | | |
/* Fallback for builds where RTEST is not already defined: always true,
   and the argument is not evaluated. */
#if !defined(RTEST)
# define RTEST(v)  1
#endif
2887 | | |
2888 | | static void |
2889 | | CC_DUP_WARN(ScanEnv *env, OnigCodePoint from ARG_UNUSED, OnigCodePoint to ARG_UNUSED) |
2890 | 0 | { |
2891 | 0 | if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ; |
2892 | | |
2893 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_DUP) && |
2894 | 0 | !(env->warnings_flag & ONIG_SYN_WARN_CC_DUP)) { |
2895 | | #ifdef WARN_ALL_CC_DUP |
2896 | | onig_syntax_warn(env, "character class has duplicated range: %04x-%04x", from, to); |
2897 | | #else |
2898 | 0 | env->warnings_flag |= ONIG_SYN_WARN_CC_DUP; |
2899 | 0 | onig_syntax_warn(env, "character class has duplicated range"); |
2900 | 0 | #endif |
2901 | 0 | } |
2902 | 0 | } |
2903 | | |
2904 | | static void |
2905 | | UNKNOWN_ESC_WARN(ScanEnv *env, int c) |
2906 | 0 | { |
2907 | 0 | if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ; |
2908 | 0 | onig_syntax_warn(env, "Unknown escape \\%c is ignored", c); |
2909 | 0 | } |
2910 | | |
2911 | | static UChar* |
2912 | | find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to, |
2913 | | UChar **next, OnigEncoding enc) |
2914 | 0 | { |
2915 | 0 | int i; |
2916 | 0 | OnigCodePoint x; |
2917 | 0 | UChar *q; |
2918 | 0 | UChar *p = from; |
2919 | |
|
2920 | 0 | while (p < to) { |
2921 | 0 | x = ONIGENC_MBC_TO_CODE(enc, p, to); |
2922 | 0 | q = p + enclen(enc, p, to); |
2923 | 0 | if (x == s[0]) { |
2924 | 0 | for (i = 1; i < n && q < to; i++) { |
2925 | 0 | x = ONIGENC_MBC_TO_CODE(enc, q, to); |
2926 | 0 | if (x != s[i]) break; |
2927 | 0 | q += enclen(enc, q, to); |
2928 | 0 | } |
2929 | 0 | if (i >= n) { |
2930 | 0 | if (IS_NOT_NULL(next)) |
2931 | 0 | *next = q; |
2932 | 0 | return p; |
2933 | 0 | } |
2934 | 0 | } |
2935 | 0 | p = q; |
2936 | 0 | } |
2937 | 0 | return NULL_UCHARP; |
2938 | 0 | } |
2939 | | |
2940 | | static int |
2941 | | str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, |
2942 | | OnigCodePoint bad, OnigEncoding enc, const OnigSyntaxType* syn) |
2943 | 0 | { |
2944 | 0 | int i, in_esc; |
2945 | 0 | OnigCodePoint x; |
2946 | 0 | UChar *q; |
2947 | 0 | UChar *p = from; |
2948 | |
|
2949 | 0 | in_esc = 0; |
2950 | 0 | while (p < to) { |
2951 | 0 | if (in_esc) { |
2952 | 0 | in_esc = 0; |
2953 | 0 | p += enclen(enc, p, to); |
2954 | 0 | } |
2955 | 0 | else { |
2956 | 0 | x = ONIGENC_MBC_TO_CODE(enc, p, to); |
2957 | 0 | q = p + enclen(enc, p, to); |
2958 | 0 | if (x == s[0]) { |
2959 | 0 | for (i = 1; i < n && q < to; i++) { |
2960 | 0 | x = ONIGENC_MBC_TO_CODE(enc, q, to); |
2961 | 0 | if (x != s[i]) break; |
2962 | 0 | q += enclen(enc, q, to); |
2963 | 0 | } |
2964 | 0 | if (i >= n) return 1; |
2965 | 0 | p += enclen(enc, p, to); |
2966 | 0 | } |
2967 | 0 | else { |
2968 | 0 | x = ONIGENC_MBC_TO_CODE(enc, p, to); |
2969 | 0 | if (x == bad) return 0; |
2970 | 0 | else if (x == MC_ESC(syn)) in_esc = 1; |
2971 | 0 | p = q; |
2972 | 0 | } |
2973 | 0 | } |
2974 | 0 | } |
2975 | 0 | return 0; |
2976 | 0 | } |
2977 | | |
2978 | | static int |
2979 | | fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) |
2980 | 178k | { |
2981 | 178k | int num; |
2982 | 178k | OnigCodePoint c, c2; |
2983 | 178k | const OnigSyntaxType* syn = env->syntax; |
2984 | 178k | OnigEncoding enc = env->enc; |
2985 | 178k | UChar* prev; |
2986 | 178k | UChar* p = *src; |
2987 | 178k | PFETCH_READY; |
2988 | | |
2989 | 178k | if (PEND) { |
2990 | 0 | tok->type = TK_EOT; |
2991 | 0 | return tok->type; |
2992 | 0 | } |
2993 | | |
2994 | 178k | PFETCH(c); |
2995 | 178k | tok->type = TK_CHAR; |
2996 | 178k | tok->base = 0; |
2997 | 178k | tok->u.c = c; |
2998 | 178k | tok->escaped = 0; |
2999 | | |
3000 | 178k | if (c == ']') { |
3001 | 43.9k | tok->type = TK_CC_CLOSE; |
3002 | 43.9k | } |
3003 | 134k | else if (c == '-') { |
3004 | 0 | tok->type = TK_CC_RANGE; |
3005 | 0 | } |
3006 | 134k | else if (c == MC_ESC(syn)) { |
3007 | 49.7k | if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) |
3008 | 0 | goto end; |
3009 | | |
3010 | 49.7k | if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; |
3011 | | |
3012 | 49.7k | PFETCH(c); |
3013 | 49.7k | tok->escaped = 1; |
3014 | 49.7k | tok->u.c = c; |
3015 | 49.7k | switch (c) { |
3016 | 0 | case 'w': |
3017 | 0 | tok->type = TK_CHAR_TYPE; |
3018 | 0 | tok->u.prop.ctype = ONIGENC_CTYPE_WORD; |
3019 | 0 | tok->u.prop.not = 0; |
3020 | 0 | break; |
3021 | 0 | case 'W': |
3022 | 0 | tok->type = TK_CHAR_TYPE; |
3023 | 0 | tok->u.prop.ctype = ONIGENC_CTYPE_WORD; |
3024 | 0 | tok->u.prop.not = 1; |
3025 | 0 | break; |
3026 | 0 | case 'd': |
3027 | 0 | tok->type = TK_CHAR_TYPE; |
3028 | 0 | tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; |
3029 | 0 | tok->u.prop.not = 0; |
3030 | 0 | break; |
3031 | 0 | case 'D': |
3032 | 0 | tok->type = TK_CHAR_TYPE; |
3033 | 0 | tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; |
3034 | 0 | tok->u.prop.not = 1; |
3035 | 0 | break; |
3036 | 11.7k | case 's': |
3037 | 11.7k | tok->type = TK_CHAR_TYPE; |
3038 | 11.7k | tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; |
3039 | 11.7k | tok->u.prop.not = 0; |
3040 | 11.7k | break; |
3041 | 0 | case 'S': |
3042 | 0 | tok->type = TK_CHAR_TYPE; |
3043 | 0 | tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; |
3044 | 0 | tok->u.prop.not = 1; |
3045 | 0 | break; |
3046 | 0 | case 'h': |
3047 | 0 | if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; |
3048 | 0 | tok->type = TK_CHAR_TYPE; |
3049 | 0 | tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; |
3050 | 0 | tok->u.prop.not = 0; |
3051 | 0 | break; |
3052 | 0 | case 'H': |
3053 | 0 | if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; |
3054 | 0 | tok->type = TK_CHAR_TYPE; |
3055 | 0 | tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; |
3056 | 0 | tok->u.prop.not = 1; |
3057 | 0 | break; |
3058 | | |
3059 | 0 | case 'p': |
3060 | 0 | case 'P': |
3061 | 0 | if (PEND) break; |
3062 | | |
3063 | 0 | c2 = PPEEK; |
3064 | 0 | if (c2 == '{' && |
3065 | 0 | IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { |
3066 | 0 | PINC; |
3067 | 0 | tok->type = TK_CHAR_PROPERTY; |
3068 | 0 | tok->u.prop.not = (c == 'P' ? 1 : 0); |
3069 | |
|
3070 | 0 | if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { |
3071 | 0 | PFETCH(c2); |
3072 | 0 | if (c2 == '^') { |
3073 | 0 | tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); |
3074 | 0 | } |
3075 | 0 | else |
3076 | 0 | PUNFETCH; |
3077 | 0 | } |
3078 | 0 | } |
3079 | 0 | else { |
3080 | 0 | onig_syntax_warn(env, "invalid Unicode Property \\%c", c); |
3081 | 0 | } |
3082 | 0 | break; |
3083 | | |
3084 | 0 | case 'x': |
3085 | 0 | if (PEND) break; |
3086 | | |
3087 | 0 | prev = p; |
3088 | 0 | if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { |
3089 | 0 | PINC; |
3090 | 0 | num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); |
3091 | 0 | if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; |
3092 | 0 | if (!PEND) { |
3093 | 0 | c2 = PPEEK; |
3094 | 0 | if (ONIGENC_IS_CODE_XDIGIT(enc, c2)) |
3095 | 0 | return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; |
3096 | 0 | } |
3097 | | |
3098 | 0 | if (p > prev + enclen(enc, prev, end) && !PEND && (PPEEK_IS('}'))) { |
3099 | 0 | PINC; |
3100 | 0 | tok->type = TK_CODE_POINT; |
3101 | 0 | tok->base = 16; |
3102 | 0 | tok->u.code = (OnigCodePoint )num; |
3103 | 0 | } |
3104 | 0 | else { |
3105 | | /* can't read nothing or invalid format */ |
3106 | 0 | p = prev; |
3107 | 0 | } |
3108 | 0 | } |
3109 | 0 | else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { |
3110 | 0 | num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc); |
3111 | 0 | if (num < 0) return ONIGERR_TOO_BIG_NUMBER; |
3112 | 0 | if (p == prev) { /* can't read nothing. */ |
3113 | 0 | num = 0; /* but, it's not error */ |
3114 | 0 | } |
3115 | 0 | tok->type = TK_RAW_BYTE; |
3116 | 0 | tok->base = 16; |
3117 | 0 | tok->u.c = num; |
3118 | 0 | } |
3119 | 0 | break; |
3120 | | |
3121 | 0 | case 'u': |
3122 | 0 | if (PEND) break; |
3123 | | |
3124 | 0 | prev = p; |
3125 | 0 | if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { |
3126 | 0 | num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc); |
3127 | 0 | if (num < -1) return ONIGERR_TOO_SHORT_DIGITS; |
3128 | 0 | else if (num < 0) return ONIGERR_TOO_BIG_NUMBER; |
3129 | 0 | if (p == prev) { /* can't read nothing. */ |
3130 | 0 | num = 0; /* but, it's not error */ |
3131 | 0 | } |
3132 | 0 | tok->type = TK_CODE_POINT; |
3133 | 0 | tok->base = 16; |
3134 | 0 | tok->u.code = (OnigCodePoint )num; |
3135 | 0 | } |
3136 | 0 | break; |
3137 | | |
3138 | 0 | case 'o': |
3139 | 0 | if (PEND) break; |
3140 | | |
3141 | 0 | prev = p; |
3142 | 0 | if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) { |
3143 | 0 | PINC; |
3144 | 0 | num = scan_unsigned_octal_number(&p, end, 11, enc); |
3145 | 0 | if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; |
3146 | 0 | if (!PEND) { |
3147 | 0 | c2 = PPEEK; |
3148 | 0 | if (ONIGENC_IS_CODE_DIGIT(enc, c2) && c2 < '8') |
3149 | 0 | return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; |
3150 | 0 | } |
3151 | | |
3152 | 0 | if (p > prev + enclen(enc, prev, end) && !PEND && (PPEEK_IS('}'))) { |
3153 | 0 | PINC; |
3154 | 0 | tok->type = TK_CODE_POINT; |
3155 | 0 | tok->base = 8; |
3156 | 0 | tok->u.code = (OnigCodePoint )num; |
3157 | 0 | } |
3158 | 0 | else { |
3159 | | /* can't read nothing or invalid format */ |
3160 | 0 | p = prev; |
3161 | 0 | } |
3162 | 0 | } |
3163 | 0 | break; |
3164 | | |
3165 | 0 | case '0': |
3166 | 0 | case '1': case '2': case '3': case '4': case '5': case '6': case '7': |
3167 | 0 | if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { |
3168 | 0 | PUNFETCH; |
3169 | 0 | prev = p; |
3170 | 0 | num = scan_unsigned_octal_number(&p, end, 3, enc); |
3171 | 0 | if (num < 0 || 0xff < num) return ONIGERR_TOO_BIG_NUMBER; |
3172 | 0 | if (p == prev) { /* can't read nothing. */ |
3173 | 0 | num = 0; /* but, it's not error */ |
3174 | 0 | } |
3175 | 0 | tok->type = TK_RAW_BYTE; |
3176 | 0 | tok->base = 8; |
3177 | 0 | tok->u.c = num; |
3178 | 0 | } |
3179 | 0 | break; |
3180 | | |
3181 | 38.0k | default: |
3182 | 38.0k | PUNFETCH; |
3183 | 38.0k | num = fetch_escaped_value(&p, end, env, &c2); |
3184 | 38.0k | if (num < 0) return num; |
3185 | 38.0k | if ((OnigCodePoint )tok->u.c != c2) { |
3186 | 35.1k | tok->u.code = (OnigCodePoint )c2; |
3187 | 35.1k | tok->type = TK_CODE_POINT; |
3188 | 35.1k | } |
3189 | 38.0k | break; |
3190 | 49.7k | } |
3191 | 49.7k | } |
3192 | 84.9k | else if (c == '[') { |
3193 | 0 | if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) { |
3194 | 0 | OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' }; |
3195 | 0 | tok->backp = p; /* point at '[' is read */ |
3196 | 0 | PINC; |
3197 | 0 | if (str_exist_check_with_esc(send, 2, p, end, |
3198 | 0 | (OnigCodePoint )']', enc, syn)) { |
3199 | 0 | tok->type = TK_POSIX_BRACKET_OPEN; |
3200 | 0 | } |
3201 | 0 | else { |
3202 | 0 | PUNFETCH; |
3203 | 0 | goto cc_in_cc; |
3204 | 0 | } |
3205 | 0 | } |
3206 | 0 | else { |
3207 | 0 | cc_in_cc: |
3208 | 0 | if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) { |
3209 | 0 | tok->type = TK_CC_CC_OPEN; |
3210 | 0 | } |
3211 | 0 | else { |
3212 | 0 | CC_ESC_WARN(env, (UChar* )"["); |
3213 | 0 | } |
3214 | 0 | } |
3215 | 0 | } |
3216 | 84.9k | else if (c == '&') { |
3217 | 0 | if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) && |
3218 | 0 | !PEND && (PPEEK_IS('&'))) { |
3219 | 0 | PINC; |
3220 | 0 | tok->type = TK_CC_AND; |
3221 | 0 | } |
3222 | 0 | } |
3223 | | |
3224 | 178k | end: |
3225 | 178k | *src = p; |
3226 | 178k | return tok->type; |
3227 | 178k | } |
3228 | | |
3229 | | #ifdef USE_NAMED_GROUP |
3230 | | static int |
3231 | | fetch_named_backref_token(OnigCodePoint c, OnigToken* tok, UChar** src, |
3232 | | UChar* end, ScanEnv* env) |
3233 | 0 | { |
3234 | 0 | int r, num; |
3235 | 0 | const OnigSyntaxType* syn = env->syntax; |
3236 | 0 | UChar* prev; |
3237 | 0 | UChar* p = *src; |
3238 | 0 | UChar* name_end; |
3239 | 0 | int* backs; |
3240 | 0 | int back_num; |
3241 | |
|
3242 | 0 | prev = p; |
3243 | |
|
3244 | 0 | # ifdef USE_BACKREF_WITH_LEVEL |
3245 | 0 | name_end = NULL_UCHARP; /* no need. escape gcc warning. */ |
3246 | 0 | r = fetch_name_with_level(c, &p, end, &name_end, |
3247 | 0 | env, &back_num, &tok->u.backref.level); |
3248 | 0 | if (r == 1) tok->u.backref.exist_level = 1; |
3249 | 0 | else tok->u.backref.exist_level = 0; |
3250 | | # else |
3251 | | r = fetch_name(&p, end, &name_end, env, &back_num, 1); |
3252 | | # endif |
3253 | 0 | if (r < 0) return r; |
3254 | | |
3255 | 0 | if (back_num != 0) { |
3256 | 0 | if (back_num < 0) { |
3257 | 0 | back_num = BACKREF_REL_TO_ABS(back_num, env); |
3258 | 0 | if (back_num <= 0) |
3259 | 0 | return ONIGERR_INVALID_BACKREF; |
3260 | 0 | } |
3261 | | |
3262 | 0 | if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { |
3263 | 0 | if (back_num > env->num_mem || |
3264 | 0 | IS_NULL(SCANENV_MEM_NODES(env)[back_num])) |
3265 | 0 | return ONIGERR_INVALID_BACKREF; |
3266 | 0 | } |
3267 | 0 | tok->type = TK_BACKREF; |
3268 | 0 | tok->u.backref.by_name = 0; |
3269 | 0 | tok->u.backref.num = 1; |
3270 | 0 | tok->u.backref.ref1 = back_num; |
3271 | 0 | } |
3272 | 0 | else { |
3273 | 0 | num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); |
3274 | 0 | if (num <= 0) { |
3275 | 0 | onig_scan_env_set_error_string(env, |
3276 | 0 | ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); |
3277 | 0 | return ONIGERR_UNDEFINED_NAME_REFERENCE; |
3278 | 0 | } |
3279 | 0 | if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { |
3280 | 0 | int i; |
3281 | 0 | for (i = 0; i < num; i++) { |
3282 | 0 | if (backs[i] > env->num_mem || |
3283 | 0 | IS_NULL(SCANENV_MEM_NODES(env)[backs[i]])) |
3284 | 0 | return ONIGERR_INVALID_BACKREF; |
3285 | 0 | } |
3286 | 0 | } |
3287 | | |
3288 | 0 | tok->type = TK_BACKREF; |
3289 | 0 | tok->u.backref.by_name = 1; |
3290 | 0 | if (num == 1 || IS_SYNTAX_BV(syn, ONIG_SYN_USE_LEFT_MOST_NAMED_GROUP)) { |
3291 | 0 | tok->u.backref.num = 1; |
3292 | 0 | tok->u.backref.ref1 = backs[0]; |
3293 | 0 | } |
3294 | 0 | else { |
3295 | 0 | tok->u.backref.num = num; |
3296 | 0 | tok->u.backref.refs = backs; |
3297 | 0 | } |
3298 | 0 | } |
3299 | 0 | *src = p; |
3300 | 0 | return 0; |
3301 | 0 | } |
3302 | | #endif |
3303 | | |
3304 | | static int |
3305 | | fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) |
3306 | 1.51M | { |
3307 | 1.51M | int r, num; |
3308 | 1.51M | OnigCodePoint c; |
3309 | 1.51M | OnigEncoding enc = env->enc; |
3310 | 1.51M | const OnigSyntaxType* syn = env->syntax; |
3311 | 1.51M | UChar* prev; |
3312 | 1.51M | UChar* p = *src; |
3313 | 1.51M | PFETCH_READY; |
3314 | | |
3315 | 1.51M | start: |
3316 | 1.51M | if (PEND) { |
3317 | 67.3k | tok->type = TK_EOT; |
3318 | 67.3k | return tok->type; |
3319 | 67.3k | } |
3320 | | |
3321 | 1.44M | tok->type = TK_STRING; |
3322 | 1.44M | tok->base = 0; |
3323 | 1.44M | tok->backp = p; |
3324 | | |
3325 | 1.44M | PFETCH(c); |
3326 | 1.44M | if (p > end) return ONIGERR_PREMATURE_END_OF_CHAR_CLASS; |
3327 | 1.44M | if (IS_MC_ESC_CODE(c, syn)) { |
3328 | 52.7k | if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; |
3329 | | |
3330 | 52.7k | tok->backp = p; |
3331 | 52.7k | PFETCH(c); |
3332 | | |
3333 | 52.7k | tok->u.c = c; |
3334 | 52.7k | tok->escaped = 1; |
3335 | 52.7k | switch (c) { |
3336 | 0 | case '*': |
3337 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break; |
3338 | 0 | tok->type = TK_OP_REPEAT; |
3339 | 0 | tok->u.repeat.lower = 0; |
3340 | 0 | tok->u.repeat.upper = REPEAT_INFINITE; |
3341 | 0 | goto greedy_check; |
3342 | 0 | break; |
3343 | | |
3344 | 0 | case '+': |
3345 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break; |
3346 | 0 | tok->type = TK_OP_REPEAT; |
3347 | 0 | tok->u.repeat.lower = 1; |
3348 | 0 | tok->u.repeat.upper = REPEAT_INFINITE; |
3349 | 0 | goto greedy_check; |
3350 | 0 | break; |
3351 | | |
3352 | 0 | case '?': |
3353 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break; |
3354 | 0 | tok->type = TK_OP_REPEAT; |
3355 | 0 | tok->u.repeat.lower = 0; |
3356 | 0 | tok->u.repeat.upper = 1; |
3357 | 87.8k | greedy_check: |
3358 | 87.8k | if (!PEND && PPEEK_IS('?') && |
3359 | 87.8k | IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) { |
3360 | 2.92k | PFETCH(c); |
3361 | 2.92k | tok->u.repeat.greedy = 0; |
3362 | 2.92k | tok->u.repeat.possessive = 0; |
3363 | 2.92k | } |
3364 | 84.9k | else { |
3365 | 84.9k | possessive_check: |
3366 | 84.9k | if (!PEND && PPEEK_IS('+') && |
3367 | 84.9k | ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) && |
3368 | 0 | tok->type != TK_INTERVAL) || |
3369 | 0 | (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) && |
3370 | 0 | tok->type == TK_INTERVAL))) { |
3371 | 0 | PFETCH(c); |
3372 | 0 | tok->u.repeat.greedy = 1; |
3373 | 0 | tok->u.repeat.possessive = 1; |
3374 | 0 | } |
3375 | 84.9k | else { |
3376 | 84.9k | tok->u.repeat.greedy = 1; |
3377 | 84.9k | tok->u.repeat.possessive = 0; |
3378 | 84.9k | } |
3379 | 84.9k | } |
3380 | 87.8k | break; |
3381 | | |
3382 | 87.8k | case '{': |
3383 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break; |
3384 | 0 | r = fetch_range_quantifier(&p, end, tok, env); |
3385 | 0 | if (r < 0) return r; /* error */ |
3386 | 0 | if (r == 0) goto greedy_check; |
3387 | 0 | else if (r == 2) { /* {n} */ |
3388 | 0 | if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) |
3389 | 0 | goto possessive_check; |
3390 | | |
3391 | 0 | goto greedy_check; |
3392 | 0 | } |
3393 | | /* r == 1 : normal char */ |
3394 | 0 | break; |
3395 | | |
3396 | 0 | case '|': |
3397 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break; |
3398 | 0 | tok->type = TK_ALT; |
3399 | 0 | break; |
3400 | | |
3401 | 5.85k | case '(': |
3402 | 5.85k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break; |
3403 | 0 | tok->type = TK_SUBEXP_OPEN; |
3404 | 0 | break; |
3405 | | |
3406 | 2.92k | case ')': |
3407 | 2.92k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break; |
3408 | 0 | tok->type = TK_SUBEXP_CLOSE; |
3409 | 0 | break; |
3410 | | |
3411 | 0 | case 'w': |
3412 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; |
3413 | 0 | tok->type = TK_CHAR_TYPE; |
3414 | 0 | tok->u.prop.ctype = ONIGENC_CTYPE_WORD; |
3415 | 0 | tok->u.prop.not = 0; |
3416 | 0 | break; |
3417 | | |
3418 | 0 | case 'W': |
3419 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; |
3420 | 0 | tok->type = TK_CHAR_TYPE; |
3421 | 0 | tok->u.prop.ctype = ONIGENC_CTYPE_WORD; |
3422 | 0 | tok->u.prop.not = 1; |
3423 | 0 | break; |
3424 | | |
3425 | 2.92k | case 'b': |
3426 | 2.92k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; |
3427 | 2.92k | tok->type = TK_ANCHOR; |
3428 | 2.92k | tok->u.anchor.subtype = ANCHOR_WORD_BOUND; |
3429 | 2.92k | tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option) |
3430 | 2.92k | && ! IS_WORD_BOUND_ALL_RANGE(env->option); |
3431 | 2.92k | break; |
3432 | | |
3433 | 0 | case 'B': |
3434 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; |
3435 | 0 | tok->type = TK_ANCHOR; |
3436 | 0 | tok->u.anchor.subtype = ANCHOR_NOT_WORD_BOUND; |
3437 | 0 | tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option) |
3438 | 0 | && ! IS_WORD_BOUND_ALL_RANGE(env->option); |
3439 | 0 | break; |
3440 | | |
3441 | 0 | #ifdef USE_WORD_BEGIN_END |
3442 | 0 | case '<': |
3443 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; |
3444 | 0 | tok->type = TK_ANCHOR; |
3445 | 0 | tok->u.anchor.subtype = ANCHOR_WORD_BEGIN; |
3446 | 0 | tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option); |
3447 | 0 | break; |
3448 | | |
3449 | 0 | case '>': |
3450 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; |
3451 | 0 | tok->type = TK_ANCHOR; |
3452 | 0 | tok->u.anchor.subtype = ANCHOR_WORD_END; |
3453 | 0 | tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option); |
3454 | 0 | break; |
3455 | 0 | #endif |
3456 | | |
3457 | 14.6k | case 's': |
3458 | 14.6k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; |
3459 | 14.6k | tok->type = TK_CHAR_TYPE; |
3460 | 14.6k | tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; |
3461 | 14.6k | tok->u.prop.not = 0; |
3462 | 14.6k | break; |
3463 | | |
3464 | 0 | case 'S': |
3465 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; |
3466 | 0 | tok->type = TK_CHAR_TYPE; |
3467 | 0 | tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; |
3468 | 0 | tok->u.prop.not = 1; |
3469 | 0 | break; |
3470 | | |
3471 | 11.7k | case 'd': |
3472 | 11.7k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; |
3473 | 11.7k | tok->type = TK_CHAR_TYPE; |
3474 | 11.7k | tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; |
3475 | 11.7k | tok->u.prop.not = 0; |
3476 | 11.7k | break; |
3477 | | |
3478 | 0 | case 'D': |
3479 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; |
3480 | 0 | tok->type = TK_CHAR_TYPE; |
3481 | 0 | tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; |
3482 | 0 | tok->u.prop.not = 1; |
3483 | 0 | break; |
3484 | | |
3485 | 0 | case 'h': |
3486 | 0 | if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; |
3487 | 0 | tok->type = TK_CHAR_TYPE; |
3488 | 0 | tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; |
3489 | 0 | tok->u.prop.not = 0; |
3490 | 0 | break; |
3491 | | |
3492 | 0 | case 'H': |
3493 | 0 | if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; |
3494 | 0 | tok->type = TK_CHAR_TYPE; |
3495 | 0 | tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; |
3496 | 0 | tok->u.prop.not = 1; |
3497 | 0 | break; |
3498 | | |
3499 | 0 | case 'A': |
3500 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; |
3501 | 0 | begin_buf: |
3502 | 0 | tok->type = TK_ANCHOR; |
3503 | 0 | tok->u.anchor.subtype = ANCHOR_BEGIN_BUF; |
3504 | 0 | break; |
3505 | | |
3506 | 0 | case 'Z': |
3507 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; |
3508 | 0 | tok->type = TK_ANCHOR; |
3509 | 0 | tok->u.anchor.subtype = ANCHOR_SEMI_END_BUF; |
3510 | 0 | break; |
3511 | | |
3512 | 0 | case 'z': |
3513 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; |
3514 | 0 | end_buf: |
3515 | 0 | tok->type = TK_ANCHOR; |
3516 | 0 | tok->u.anchor.subtype = ANCHOR_END_BUF; |
3517 | 0 | break; |
3518 | | |
3519 | 0 | case 'G': |
3520 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break; |
3521 | 0 | tok->type = TK_ANCHOR; |
3522 | 0 | tok->u.anchor.subtype = ANCHOR_BEGIN_POSITION; |
3523 | 0 | break; |
3524 | | |
3525 | 0 | case '`': |
3526 | 0 | if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break; |
3527 | 0 | goto begin_buf; |
3528 | 0 | break; |
3529 | | |
3530 | 0 | case '\'': |
3531 | 0 | if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break; |
3532 | 0 | goto end_buf; |
3533 | 0 | break; |
3534 | | |
3535 | 0 | case 'x': |
3536 | 0 | if (PEND) break; |
3537 | | |
3538 | 0 | prev = p; |
3539 | 0 | if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { |
3540 | 0 | PINC; |
3541 | 0 | num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); |
3542 | 0 | if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; |
3543 | 0 | if (!PEND) { |
3544 | 0 | if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK)) |
3545 | 0 | return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; |
3546 | 0 | } |
3547 | | |
3548 | 0 | if ((p > prev + enclen(enc, prev, end)) && !PEND && PPEEK_IS('}')) { |
3549 | 0 | PINC; |
3550 | 0 | tok->type = TK_CODE_POINT; |
3551 | 0 | tok->u.code = (OnigCodePoint )num; |
3552 | 0 | } |
3553 | 0 | else { |
3554 | | /* can't read nothing or invalid format */ |
3555 | 0 | p = prev; |
3556 | 0 | } |
3557 | 0 | } |
3558 | 0 | else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { |
3559 | 0 | num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc); |
3560 | 0 | if (num < 0) return ONIGERR_TOO_BIG_NUMBER; |
3561 | 0 | if (p == prev) { /* can't read nothing. */ |
3562 | 0 | num = 0; /* but, it's not error */ |
3563 | 0 | } |
3564 | 0 | tok->type = TK_RAW_BYTE; |
3565 | 0 | tok->base = 16; |
3566 | 0 | tok->u.c = num; |
3567 | 0 | } |
3568 | 0 | break; |
3569 | | |
3570 | 0 | case 'u': |
3571 | 0 | if (PEND) break; |
3572 | | |
3573 | 0 | prev = p; |
3574 | 0 | if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { |
3575 | 0 | num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc); |
3576 | 0 | if (num < -1) return ONIGERR_TOO_SHORT_DIGITS; |
3577 | 0 | else if (num < 0) return ONIGERR_TOO_BIG_NUMBER; |
3578 | 0 | if (p == prev) { /* can't read nothing. */ |
3579 | 0 | num = 0; /* but, it's not error */ |
3580 | 0 | } |
3581 | 0 | tok->type = TK_CODE_POINT; |
3582 | 0 | tok->base = 16; |
3583 | 0 | tok->u.code = (OnigCodePoint )num; |
3584 | 0 | } |
3585 | 0 | break; |
3586 | | |
3587 | 0 | case 'o': |
3588 | 0 | if (PEND) break; |
3589 | | |
3590 | 0 | prev = p; |
3591 | 0 | if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) { |
3592 | 0 | PINC; |
3593 | 0 | num = scan_unsigned_octal_number(&p, end, 11, enc); |
3594 | 0 | if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; |
3595 | 0 | if (!PEND) { |
3596 | 0 | OnigCodePoint c = PPEEK; |
3597 | 0 | if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') |
3598 | 0 | return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; |
3599 | 0 | } |
3600 | | |
3601 | 0 | if ((p > prev + enclen(enc, prev, end)) && !PEND && PPEEK_IS('}')) { |
3602 | 0 | PINC; |
3603 | 0 | tok->type = TK_CODE_POINT; |
3604 | 0 | tok->u.code = (OnigCodePoint )num; |
3605 | 0 | } |
3606 | 0 | else { |
3607 | | /* can't read nothing or invalid format */ |
3608 | 0 | p = prev; |
3609 | 0 | } |
3610 | 0 | } |
3611 | 0 | break; |
3612 | | |
3613 | 0 | case '1': case '2': case '3': case '4': |
3614 | 0 | case '5': case '6': case '7': case '8': case '9': |
3615 | 0 | PUNFETCH; |
3616 | 0 | prev = p; |
3617 | 0 | num = onig_scan_unsigned_number(&p, end, enc); |
3618 | 0 | if (num < 0 || num > ONIG_MAX_BACKREF_NUM) { |
3619 | 0 | goto skip_backref; |
3620 | 0 | } |
3621 | | |
3622 | 0 | if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && |
3623 | 0 | (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */ |
3624 | 0 | if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { |
3625 | 0 | if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num])) |
3626 | 0 | return ONIGERR_INVALID_BACKREF; |
3627 | 0 | } |
3628 | | |
3629 | 0 | tok->type = TK_BACKREF; |
3630 | 0 | tok->u.backref.num = 1; |
3631 | 0 | tok->u.backref.ref1 = num; |
3632 | 0 | tok->u.backref.by_name = 0; |
3633 | 0 | #ifdef USE_BACKREF_WITH_LEVEL |
3634 | 0 | tok->u.backref.exist_level = 0; |
3635 | 0 | #endif |
3636 | 0 | break; |
3637 | 0 | } |
3638 | | |
3639 | 0 | skip_backref: |
3640 | 0 | if (c == '8' || c == '9') { |
3641 | | /* normal char */ |
3642 | 0 | p = prev; PINC; |
3643 | 0 | break; |
3644 | 0 | } |
3645 | | |
3646 | 0 | p = prev; |
3647 | | /* fall through */ |
3648 | 0 | case '0': |
3649 | 0 | if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { |
3650 | 0 | prev = p; |
3651 | 0 | num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc); |
3652 | 0 | if (num < 0 || 0xff < num) return ONIGERR_TOO_BIG_NUMBER; |
3653 | 0 | if (p == prev) { /* can't read nothing. */ |
3654 | 0 | num = 0; /* but, it's not error */ |
3655 | 0 | } |
3656 | 0 | tok->type = TK_RAW_BYTE; |
3657 | 0 | tok->base = 8; |
3658 | 0 | tok->u.c = num; |
3659 | 0 | } |
3660 | 0 | else if (c != '0') { |
3661 | 0 | PINC; |
3662 | 0 | } |
3663 | 0 | break; |
3664 | | |
3665 | 0 | #ifdef USE_NAMED_GROUP |
3666 | 0 | case 'k': |
3667 | 0 | if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) { |
3668 | 0 | PFETCH(c); |
3669 | 0 | if (c == '<' || c == '\'') { |
3670 | 0 | r = fetch_named_backref_token(c, tok, &p, end, env); |
3671 | 0 | if (r < 0) return r; |
3672 | 0 | } |
3673 | 0 | else { |
3674 | 0 | PUNFETCH; |
3675 | 0 | onig_syntax_warn(env, "invalid back reference"); |
3676 | 0 | } |
3677 | 0 | } |
3678 | 0 | break; |
3679 | 0 | #endif |
3680 | | |
3681 | 0 | #if defined(USE_SUBEXP_CALL) || defined(USE_NAMED_GROUP) |
3682 | 0 | case 'g': |
3683 | 0 | # ifdef USE_NAMED_GROUP |
3684 | 0 | if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_BRACE_BACKREF)) { |
3685 | 0 | PFETCH(c); |
3686 | 0 | if (c == '{') { |
3687 | 0 | r = fetch_named_backref_token(c, tok, &p, end, env); |
3688 | 0 | if (r < 0) return r; |
3689 | 0 | } |
3690 | 0 | else |
3691 | 0 | PUNFETCH; |
3692 | 0 | } |
3693 | 0 | # endif |
3694 | 0 | # ifdef USE_SUBEXP_CALL |
3695 | 0 | if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) { |
3696 | 0 | PFETCH(c); |
3697 | 0 | if (c == '<' || c == '\'') { |
3698 | 0 | int gnum = -1, rel = 0; |
3699 | 0 | UChar* name_end; |
3700 | 0 | OnigCodePoint cnext; |
3701 | |
|
3702 | 0 | cnext = PPEEK; |
3703 | 0 | if (cnext == '0') { |
3704 | 0 | PINC; |
3705 | 0 | if (PPEEK_IS(get_name_end_code_point(c))) { /* \g<0>, \g'0' */ |
3706 | 0 | PINC; |
3707 | 0 | name_end = p; |
3708 | 0 | gnum = 0; |
3709 | 0 | } |
3710 | 0 | } |
3711 | 0 | else if (cnext == '+') { |
3712 | 0 | PINC; |
3713 | 0 | rel = 1; |
3714 | 0 | } |
3715 | 0 | prev = p; |
3716 | 0 | if (gnum < 0) { |
3717 | 0 | r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1); |
3718 | 0 | if (r < 0) return r; |
3719 | 0 | } |
3720 | | |
3721 | 0 | tok->type = TK_CALL; |
3722 | 0 | tok->u.call.name = prev; |
3723 | 0 | tok->u.call.name_end = name_end; |
3724 | 0 | tok->u.call.gnum = gnum; |
3725 | 0 | tok->u.call.rel = rel; |
3726 | 0 | } |
3727 | 0 | else { |
3728 | 0 | onig_syntax_warn(env, "invalid subexp call"); |
3729 | 0 | PUNFETCH; |
3730 | 0 | } |
3731 | 0 | } |
3732 | 0 | # endif |
3733 | 0 | break; |
3734 | 0 | #endif |
3735 | | |
3736 | 0 | case 'Q': |
3737 | 0 | if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) { |
3738 | 0 | tok->type = TK_QUOTE_OPEN; |
3739 | 0 | } |
3740 | 0 | break; |
3741 | | |
3742 | 0 | case 'p': |
3743 | 0 | case 'P': |
3744 | 0 | if (PPEEK_IS('{') && |
3745 | 0 | IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { |
3746 | 0 | PINC; |
3747 | 0 | tok->type = TK_CHAR_PROPERTY; |
3748 | 0 | tok->u.prop.not = (c == 'P' ? 1 : 0); |
3749 | |
|
3750 | 0 | if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { |
3751 | 0 | PFETCH(c); |
3752 | 0 | if (c == '^') { |
3753 | 0 | tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); |
3754 | 0 | } |
3755 | 0 | else |
3756 | 0 | PUNFETCH; |
3757 | 0 | } |
3758 | 0 | } |
3759 | 0 | else { |
3760 | 0 | onig_syntax_warn(env, "invalid Unicode Property \\%c", c); |
3761 | 0 | } |
3762 | 0 | break; |
3763 | | |
3764 | 0 | case 'R': |
3765 | 0 | if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK)) { |
3766 | 0 | tok->type = TK_LINEBREAK; |
3767 | 0 | } |
3768 | 0 | break; |
3769 | | |
3770 | 0 | case 'X': |
3771 | 0 | if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER)) { |
3772 | 0 | tok->type = TK_EXTENDED_GRAPHEME_CLUSTER; |
3773 | 0 | } |
3774 | 0 | break; |
3775 | | |
3776 | 0 | case 'K': |
3777 | 0 | if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) { |
3778 | 0 | tok->type = TK_KEEP; |
3779 | 0 | } |
3780 | 0 | break; |
3781 | | |
3782 | 14.6k | default: |
3783 | 14.6k | { |
3784 | 14.6k | OnigCodePoint c2; |
3785 | | |
3786 | 14.6k | PUNFETCH; |
3787 | 14.6k | num = fetch_escaped_value(&p, end, env, &c2); |
3788 | 14.6k | if (num < 0) return num; |
3789 | | /* set_raw: */ |
3790 | 14.6k | if ((OnigCodePoint )tok->u.c != c2) { |
3791 | 0 | tok->type = TK_CODE_POINT; |
3792 | 0 | tok->u.code = (OnigCodePoint )c2; |
3793 | 0 | } |
3794 | 14.6k | else { /* string */ |
3795 | 14.6k | p = tok->backp + enclen(enc, tok->backp, end); |
3796 | 14.6k | } |
3797 | 14.6k | } |
3798 | 0 | break; |
3799 | 52.7k | } |
3800 | 52.7k | } |
3801 | 1.39M | else { |
3802 | 1.39M | tok->u.c = c; |
3803 | 1.39M | tok->escaped = 0; |
3804 | | |
3805 | 1.39M | #ifdef USE_VARIABLE_META_CHARS |
3806 | 1.39M | if ((c != ONIG_INEFFECTIVE_META_CHAR) && |
3807 | 1.39M | IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) { |
3808 | 0 | if (c == MC_ANYCHAR(syn)) |
3809 | 0 | goto any_char; |
3810 | 0 | else if (c == MC_ANYTIME(syn)) |
3811 | 0 | goto anytime; |
3812 | 0 | else if (c == MC_ZERO_OR_ONE_TIME(syn)) |
3813 | 0 | goto zero_or_one_time; |
3814 | 0 | else if (c == MC_ONE_OR_MORE_TIME(syn)) |
3815 | 0 | goto one_or_more_time; |
3816 | 0 | else if (c == MC_ANYCHAR_ANYTIME(syn)) { |
3817 | 0 | tok->type = TK_ANYCHAR_ANYTIME; |
3818 | 0 | goto out; |
3819 | 0 | } |
3820 | 0 | } |
3821 | 1.39M | #endif |
3822 | | |
3823 | 1.39M | switch (c) { |
3824 | 29.2k | case '.': |
3825 | 29.2k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break; |
3826 | 29.2k | #ifdef USE_VARIABLE_META_CHARS |
3827 | 29.2k | any_char: |
3828 | 29.2k | #endif |
3829 | 29.2k | tok->type = TK_ANYCHAR; |
3830 | 29.2k | break; |
3831 | | |
3832 | 32.2k | case '*': |
3833 | 32.2k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break; |
3834 | 32.2k | #ifdef USE_VARIABLE_META_CHARS |
3835 | 32.2k | anytime: |
3836 | 32.2k | #endif |
3837 | 32.2k | tok->type = TK_OP_REPEAT; |
3838 | 32.2k | tok->u.repeat.lower = 0; |
3839 | 32.2k | tok->u.repeat.upper = REPEAT_INFINITE; |
3840 | 32.2k | goto greedy_check; |
3841 | 0 | break; |
3842 | | |
3843 | 52.7k | case '+': |
3844 | 52.7k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break; |
3845 | 52.7k | #ifdef USE_VARIABLE_META_CHARS |
3846 | 52.7k | one_or_more_time: |
3847 | 52.7k | #endif |
3848 | 52.7k | tok->type = TK_OP_REPEAT; |
3849 | 52.7k | tok->u.repeat.lower = 1; |
3850 | 52.7k | tok->u.repeat.upper = REPEAT_INFINITE; |
3851 | 52.7k | goto greedy_check; |
3852 | 0 | break; |
3853 | | |
3854 | 2.92k | case '?': |
3855 | 2.92k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break; |
3856 | 2.92k | #ifdef USE_VARIABLE_META_CHARS |
3857 | 2.92k | zero_or_one_time: |
3858 | 2.92k | #endif |
3859 | 2.92k | tok->type = TK_OP_REPEAT; |
3860 | 2.92k | tok->u.repeat.lower = 0; |
3861 | 2.92k | tok->u.repeat.upper = 1; |
3862 | 2.92k | goto greedy_check; |
3863 | 0 | break; |
3864 | | |
3865 | 0 | case '{': |
3866 | 0 | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break; |
3867 | 0 | r = fetch_range_quantifier(&p, end, tok, env); |
3868 | 0 | if (r < 0) return r; /* error */ |
3869 | 0 | if (r == 0) goto greedy_check; |
3870 | 0 | else if (r == 2) { /* {n} */ |
3871 | 0 | if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) |
3872 | 0 | goto possessive_check; |
3873 | | |
3874 | 0 | goto greedy_check; |
3875 | 0 | } |
3876 | | /* r == 1 : normal char */ |
3877 | 0 | break; |
3878 | | |
3879 | 23.4k | case '|': |
3880 | 23.4k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break; |
3881 | 23.4k | tok->type = TK_ALT; |
3882 | 23.4k | break; |
3883 | | |
3884 | 35.1k | case '(': |
3885 | 35.1k | if (PPEEK_IS('?') && |
3886 | 35.1k | IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { |
3887 | 32.2k | PINC; |
3888 | 32.2k | if (PPEEK_IS('#')) { |
3889 | 0 | PFETCH(c); |
3890 | 0 | while (1) { |
3891 | 0 | if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; |
3892 | 0 | PFETCH(c); |
3893 | 0 | if (c == MC_ESC(syn)) { |
3894 | 0 | if (!PEND) PFETCH(c); |
3895 | 0 | } |
3896 | 0 | else { |
3897 | 0 | if (c == ')') break; |
3898 | 0 | } |
3899 | 0 | } |
3900 | 0 | goto start; |
3901 | 0 | } |
3902 | 32.2k | #ifdef USE_PERL_SUBEXP_CALL |
3903 | | /* (?&name), (?n), (?R), (?0), (?+n), (?-n) */ |
3904 | 32.2k | c = PPEEK; |
3905 | 32.2k | if ((c == '&' || c == 'R' || ONIGENC_IS_CODE_DIGIT(enc, c)) && |
3906 | 32.2k | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) { |
3907 | | /* (?&name), (?n), (?R), (?0) */ |
3908 | 0 | int gnum; |
3909 | 0 | UChar *name; |
3910 | 0 | UChar *name_end; |
3911 | |
|
3912 | 0 | if (c == 'R' || c == '0') { |
3913 | 0 | PINC; /* skip 'R' / '0' */ |
3914 | 0 | if (!PPEEK_IS(')')) { |
3915 | 0 | r = ONIGERR_INVALID_GROUP_NAME; |
3916 | 0 | onig_scan_env_set_error_string(env, r, p - 1, p + 1); |
3917 | 0 | return r; |
3918 | 0 | } |
3919 | 0 | PINC; /* skip ')' */ |
3920 | 0 | name_end = name = p; |
3921 | 0 | gnum = 0; |
3922 | 0 | } |
3923 | 0 | else { |
3924 | 0 | int numref = 1; |
3925 | 0 | if (c == '&') { /* (?&name) */ |
3926 | 0 | PINC; |
3927 | 0 | numref = 0; /* don't allow number name */ |
3928 | 0 | } |
3929 | 0 | name = p; |
3930 | 0 | r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, numref); |
3931 | 0 | if (r < 0) return r; |
3932 | 0 | } |
3933 | | |
3934 | 0 | tok->type = TK_CALL; |
3935 | 0 | tok->u.call.name = name; |
3936 | 0 | tok->u.call.name_end = name_end; |
3937 | 0 | tok->u.call.gnum = gnum; |
3938 | 0 | tok->u.call.rel = 0; |
3939 | 0 | break; |
3940 | 0 | } |
3941 | 32.2k | else if ((c == '-' || c == '+') && |
3942 | 32.2k | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) { |
3943 | | /* (?+n), (?-n) */ |
3944 | 0 | int gnum; |
3945 | 0 | UChar *name; |
3946 | 0 | UChar *name_end; |
3947 | 0 | OnigCodePoint cnext; |
3948 | 0 | PFETCH_READY; |
3949 | |
|
3950 | 0 | PINC; /* skip '-' / '+' */ |
3951 | 0 | cnext = PPEEK; |
3952 | 0 | if (ONIGENC_IS_CODE_DIGIT(enc, cnext)) { |
3953 | 0 | if (c == '-') PUNFETCH; |
3954 | 0 | name = p; |
3955 | 0 | r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 1); |
3956 | 0 | if (r < 0) return r; |
3957 | | |
3958 | 0 | tok->type = TK_CALL; |
3959 | 0 | tok->u.call.name = name; |
3960 | 0 | tok->u.call.name_end = name_end; |
3961 | 0 | tok->u.call.gnum = gnum; |
3962 | 0 | tok->u.call.rel = 1; |
3963 | 0 | break; |
3964 | 0 | } |
3965 | 0 | } |
3966 | 32.2k | #endif /* USE_PERL_SUBEXP_CALL */ |
3967 | 32.2k | #ifdef USE_CAPITAL_P_NAMED_GROUP |
3968 | 32.2k | if (PPEEK_IS('P') && |
3969 | 32.2k | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) { |
3970 | 0 | int gnum; |
3971 | 0 | UChar *name; |
3972 | 0 | UChar *name_end; |
3973 | 0 | PFETCH_READY; |
3974 | |
|
3975 | 0 | PINC; /* skip 'P' */ |
3976 | 0 | if (PEND) return ONIGERR_UNDEFINED_GROUP_OPTION; |
3977 | 0 | PFETCH(c); |
3978 | 0 | if (c == '=') { /* (?P=name): backref */ |
3979 | 0 | r = fetch_named_backref_token((OnigCodePoint )'(', tok, &p, end, env); |
3980 | 0 | if (r < 0) return r; |
3981 | 0 | break; |
3982 | 0 | } |
3983 | 0 | else if (c == '>') { /* (?P>name): subexp call */ |
3984 | 0 | name = p; |
3985 | 0 | r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 0); |
3986 | 0 | if (r < 0) return r; |
3987 | | |
3988 | 0 | tok->type = TK_CALL; |
3989 | 0 | tok->u.call.name = name; |
3990 | 0 | tok->u.call.name_end = name_end; |
3991 | 0 | tok->u.call.gnum = gnum; |
3992 | 0 | tok->u.call.rel = 0; |
3993 | 0 | break; |
3994 | 0 | } |
3995 | 0 | } |
3996 | 32.2k | #endif /* USE_CAPITAL_P_NAMED_GROUP */ |
3997 | 32.2k | PUNFETCH; |
3998 | 32.2k | } |
3999 | | |
4000 | 35.1k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; |
4001 | 35.1k | tok->type = TK_SUBEXP_OPEN; |
4002 | 35.1k | break; |
4003 | | |
4004 | 35.1k | case ')': |
4005 | 35.1k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; |
4006 | 35.1k | tok->type = TK_SUBEXP_CLOSE; |
4007 | 35.1k | break; |
4008 | | |
4009 | 58.5k | case '^': |
4010 | 58.5k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; |
4011 | 58.5k | tok->type = TK_ANCHOR; |
4012 | 58.5k | tok->u.anchor.subtype = (IS_SINGLELINE(env->option) |
4013 | 58.5k | ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE); |
4014 | 58.5k | break; |
4015 | | |
4016 | 23.4k | case '$': |
4017 | 23.4k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; |
4018 | 23.4k | tok->type = TK_ANCHOR; |
4019 | 23.4k | tok->u.anchor.subtype = (IS_SINGLELINE(env->option) |
4020 | 23.4k | ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE); |
4021 | 23.4k | break; |
4022 | | |
4023 | 43.9k | case '[': |
4024 | 43.9k | if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break; |
4025 | 43.9k | tok->type = TK_CC_OPEN; |
4026 | 43.9k | break; |
4027 | | |
4028 | 0 | case ']': |
4029 | 0 | if (*src > env->pattern) /* /].../ is allowed. */ |
4030 | 0 | CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]"); |
4031 | 0 | break; |
4032 | | |
4033 | 0 | case '#': |
4034 | 0 | if (IS_EXTEND(env->option)) { |
4035 | 0 | while (!PEND) { |
4036 | 0 | PFETCH(c); |
4037 | 0 | if (ONIGENC_IS_CODE_NEWLINE(enc, c)) |
4038 | 0 | break; |
4039 | 0 | } |
4040 | 0 | goto start; |
4041 | 0 | break; |
4042 | 0 | } |
4043 | 0 | break; |
4044 | | |
4045 | 137k | case ' ': case '\t': case '\n': case '\r': case '\f': |
4046 | 137k | if (IS_EXTEND(env->option)) |
4047 | 0 | goto start; |
4048 | 137k | break; |
4049 | | |
4050 | 916k | default: |
4051 | | /* string */ |
4052 | 916k | break; |
4053 | 1.39M | } |
4054 | 1.39M | } |
4055 | | |
4056 | 1.44M | #ifdef USE_VARIABLE_META_CHARS |
4057 | 1.44M | out: |
4058 | 1.44M | #endif |
4059 | 1.44M | *src = p; |
4060 | 1.44M | return tok->type; |
4061 | 1.44M | } |
4062 | | |
4063 | | static int |
4064 | | add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not, |
4065 | | ScanEnv* env, |
4066 | | OnigCodePoint sb_out, const OnigCodePoint mbr[]) |
4067 | 38.0k | { |
4068 | 38.0k | int i, r; |
4069 | 38.0k | OnigCodePoint j; |
4070 | | |
4071 | 38.0k | int n = ONIGENC_CODE_RANGE_NUM(mbr); |
4072 | | |
4073 | 38.0k | if (not == 0) { |
4074 | 102k | for (i = 0; i < n; i++) { |
4075 | 102k | for (j = ONIGENC_CODE_RANGE_FROM(mbr, i); |
4076 | 377k | j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) { |
4077 | 313k | if (j >= sb_out) { |
4078 | 38.0k | if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) { |
4079 | 0 | r = add_code_range_to_buf(&(cc->mbuf), env, j, |
4080 | 0 | ONIGENC_CODE_RANGE_TO(mbr, i)); |
4081 | 0 | if (r != 0) return r; |
4082 | 0 | i++; |
4083 | 0 | } |
4084 | | |
4085 | 38.0k | goto sb_end; |
4086 | 38.0k | } |
4087 | 275k | BITSET_SET_BIT_CHKDUP(cc->bs, j); |
4088 | 275k | } |
4089 | 102k | } |
4090 | | |
4091 | 38.0k | sb_end: |
4092 | 986k | for ( ; i < n; i++) { |
4093 | 948k | r = add_code_range_to_buf(&(cc->mbuf), env, |
4094 | 948k | ONIGENC_CODE_RANGE_FROM(mbr, i), |
4095 | 948k | ONIGENC_CODE_RANGE_TO(mbr, i)); |
4096 | 948k | if (r != 0) return r; |
4097 | 948k | } |
4098 | 38.0k | } |
4099 | 0 | else { |
4100 | 0 | OnigCodePoint prev = 0; |
4101 | |
|
4102 | 0 | for (i = 0; i < n; i++) { |
4103 | 0 | for (j = prev; |
4104 | 0 | j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) { |
4105 | 0 | if (j >= sb_out) { |
4106 | 0 | goto sb_end2; |
4107 | 0 | } |
4108 | 0 | BITSET_SET_BIT_CHKDUP(cc->bs, j); |
4109 | 0 | } |
4110 | 0 | prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1; |
4111 | 0 | } |
4112 | 0 | for (j = prev; j < sb_out; j++) { |
4113 | 0 | BITSET_SET_BIT_CHKDUP(cc->bs, j); |
4114 | 0 | } |
4115 | |
|
4116 | 0 | sb_end2: |
4117 | 0 | prev = sb_out; |
4118 | |
|
4119 | 0 | for (i = 0; i < n; i++) { |
4120 | 0 | if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) { |
4121 | 0 | r = add_code_range_to_buf(&(cc->mbuf), env, prev, |
4122 | 0 | ONIGENC_CODE_RANGE_FROM(mbr, i) - 1); |
4123 | 0 | if (r != 0) return r; |
4124 | 0 | } |
4125 | 0 | prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1; |
4126 | 0 | } |
4127 | 0 | if (prev < 0x7fffffff) { |
4128 | 0 | r = add_code_range_to_buf(&(cc->mbuf), env, prev, 0x7fffffff); |
4129 | 0 | if (r != 0) return r; |
4130 | 0 | } |
4131 | 0 | } |
4132 | | |
4133 | 38.0k | return 0; |
4134 | 38.0k | } |
4135 | | |
4136 | | static int |
4137 | | add_ctype_to_cc(CClassNode* cc, int ctype, int not, int ascii_range, ScanEnv* env) |
4138 | 38.0k | { |
4139 | 38.0k | int maxcode; |
4140 | 38.0k | int c, r; |
4141 | 38.0k | const OnigCodePoint *ranges; |
4142 | 38.0k | OnigCodePoint sb_out; |
4143 | 38.0k | OnigEncoding enc = env->enc; |
4144 | | |
4145 | 38.0k | r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges); |
4146 | 38.0k | if (r == 0) { |
4147 | 38.0k | if (ascii_range) { |
4148 | 38.0k | CClassNode ccwork; |
4149 | 38.0k | initialize_cclass(&ccwork); |
4150 | 38.0k | r = add_ctype_to_cc_by_range(&ccwork, ctype, not, env, sb_out, |
4151 | 38.0k | ranges); |
4152 | 38.0k | if (r == 0) { |
4153 | 38.0k | if (not) { |
4154 | 0 | r = add_code_range_to_buf0(&(ccwork.mbuf), env, 0x80, ONIG_LAST_CODE_POINT, FALSE); |
4155 | 0 | } |
4156 | 38.0k | else { |
4157 | 38.0k | CClassNode ccascii; |
4158 | 38.0k | initialize_cclass(&ccascii); |
4159 | 38.0k | if (ONIGENC_MBC_MINLEN(env->enc) > 1) { |
4160 | 0 | r = add_code_range(&(ccascii.mbuf), env, 0x00, 0x7F); |
4161 | 0 | } |
4162 | 38.0k | else { |
4163 | 38.0k | bitset_set_range(env, ccascii.bs, 0x00, 0x7F); |
4164 | 38.0k | r = 0; |
4165 | 38.0k | } |
4166 | 38.0k | if (r == 0) { |
4167 | 38.0k | r = and_cclass(&ccwork, &ccascii, env); |
4168 | 38.0k | } |
4169 | 38.0k | if (IS_NOT_NULL(ccascii.mbuf)) bbuf_free(ccascii.mbuf); |
4170 | 38.0k | } |
4171 | 38.0k | if (r == 0) { |
4172 | 38.0k | r = or_cclass(cc, &ccwork, env); |
4173 | 38.0k | } |
4174 | 38.0k | if (IS_NOT_NULL(ccwork.mbuf)) bbuf_free(ccwork.mbuf); |
4175 | 38.0k | } |
4176 | 38.0k | } |
4177 | 0 | else { |
4178 | 0 | r = add_ctype_to_cc_by_range(cc, ctype, not, env, sb_out, ranges); |
4179 | 0 | } |
4180 | 38.0k | return r; |
4181 | 38.0k | } |
4182 | 0 | else if (r != ONIG_NO_SUPPORT_CONFIG) { |
4183 | 0 | return r; |
4184 | 0 | } |
4185 | | |
4186 | 0 | maxcode = ascii_range ? 0x80 : SINGLE_BYTE_SIZE; |
4187 | 0 | r = 0; |
4188 | 0 | switch (ctype) { |
4189 | 0 | case ONIGENC_CTYPE_ALPHA: |
4190 | 0 | case ONIGENC_CTYPE_BLANK: |
4191 | 0 | case ONIGENC_CTYPE_CNTRL: |
4192 | 0 | case ONIGENC_CTYPE_DIGIT: |
4193 | 0 | case ONIGENC_CTYPE_LOWER: |
4194 | 0 | case ONIGENC_CTYPE_PUNCT: |
4195 | 0 | case ONIGENC_CTYPE_SPACE: |
4196 | 0 | case ONIGENC_CTYPE_UPPER: |
4197 | 0 | case ONIGENC_CTYPE_XDIGIT: |
4198 | 0 | case ONIGENC_CTYPE_ASCII: |
4199 | 0 | case ONIGENC_CTYPE_ALNUM: |
4200 | 0 | if (not != 0) { |
4201 | 0 | for (c = 0; c < SINGLE_BYTE_SIZE; c++) { |
4202 | 0 | if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) |
4203 | 0 | BITSET_SET_BIT_CHKDUP(cc->bs, c); |
4204 | 0 | } |
4205 | 0 | ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); |
4206 | 0 | } |
4207 | 0 | else { |
4208 | 0 | for (c = 0; c < SINGLE_BYTE_SIZE; c++) { |
4209 | 0 | if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) |
4210 | 0 | BITSET_SET_BIT_CHKDUP(cc->bs, c); |
4211 | 0 | } |
4212 | 0 | } |
4213 | 0 | break; |
4214 | | |
4215 | 0 | case ONIGENC_CTYPE_GRAPH: |
4216 | 0 | case ONIGENC_CTYPE_PRINT: |
4217 | 0 | if (not != 0) { |
4218 | 0 | for (c = 0; c < SINGLE_BYTE_SIZE; c++) { |
4219 | 0 | if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype) |
4220 | 0 | || c >= maxcode) |
4221 | 0 | BITSET_SET_BIT_CHKDUP(cc->bs, c); |
4222 | 0 | } |
4223 | 0 | if (ascii_range) |
4224 | 0 | ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); |
4225 | 0 | } |
4226 | 0 | else { |
4227 | 0 | for (c = 0; c < maxcode; c++) { |
4228 | 0 | if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) |
4229 | 0 | BITSET_SET_BIT_CHKDUP(cc->bs, c); |
4230 | 0 | } |
4231 | 0 | if (! ascii_range) |
4232 | 0 | ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); |
4233 | 0 | } |
4234 | 0 | break; |
4235 | | |
4236 | 0 | case ONIGENC_CTYPE_WORD: |
4237 | 0 | if (not == 0) { |
4238 | 0 | for (c = 0; c < maxcode; c++) { |
4239 | 0 | if (ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT_CHKDUP(cc->bs, c); |
4240 | 0 | } |
4241 | 0 | if (! ascii_range) |
4242 | 0 | ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); |
4243 | 0 | } |
4244 | 0 | else { |
4245 | 0 | for (c = 0; c < SINGLE_BYTE_SIZE; c++) { |
4246 | 0 | if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */ |
4247 | 0 | && (! ONIGENC_IS_CODE_WORD(enc, c) || c >= maxcode)) |
4248 | 0 | BITSET_SET_BIT_CHKDUP(cc->bs, c); |
4249 | 0 | } |
4250 | 0 | if (ascii_range) |
4251 | 0 | ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); |
4252 | 0 | } |
4253 | 0 | break; |
4254 | | |
4255 | 0 | default: |
4256 | 0 | return ONIGERR_PARSER_BUG; |
4257 | 0 | break; |
4258 | 0 | } |
4259 | | |
4260 | 0 | return r; |
4261 | 0 | } |
4262 | | |
4263 | | static int |
4264 | | parse_posix_bracket(CClassNode* cc, CClassNode* asc_cc, |
4265 | | UChar** src, UChar* end, ScanEnv* env) |
4266 | 0 | { |
4267 | 0 | #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20 |
4268 | 0 | #define POSIX_BRACKET_NAME_MIN_LEN 4 |
4269 | |
|
4270 | 0 | static const PosixBracketEntryType PBS[] = { |
4271 | 0 | POSIX_BRACKET_ENTRY_INIT("alnum", ONIGENC_CTYPE_ALNUM), |
4272 | 0 | POSIX_BRACKET_ENTRY_INIT("alpha", ONIGENC_CTYPE_ALPHA), |
4273 | 0 | POSIX_BRACKET_ENTRY_INIT("blank", ONIGENC_CTYPE_BLANK), |
4274 | 0 | POSIX_BRACKET_ENTRY_INIT("cntrl", ONIGENC_CTYPE_CNTRL), |
4275 | 0 | POSIX_BRACKET_ENTRY_INIT("digit", ONIGENC_CTYPE_DIGIT), |
4276 | 0 | POSIX_BRACKET_ENTRY_INIT("graph", ONIGENC_CTYPE_GRAPH), |
4277 | 0 | POSIX_BRACKET_ENTRY_INIT("lower", ONIGENC_CTYPE_LOWER), |
4278 | 0 | POSIX_BRACKET_ENTRY_INIT("print", ONIGENC_CTYPE_PRINT), |
4279 | 0 | POSIX_BRACKET_ENTRY_INIT("punct", ONIGENC_CTYPE_PUNCT), |
4280 | 0 | POSIX_BRACKET_ENTRY_INIT("space", ONIGENC_CTYPE_SPACE), |
4281 | 0 | POSIX_BRACKET_ENTRY_INIT("upper", ONIGENC_CTYPE_UPPER), |
4282 | 0 | POSIX_BRACKET_ENTRY_INIT("xdigit", ONIGENC_CTYPE_XDIGIT), |
4283 | 0 | POSIX_BRACKET_ENTRY_INIT("ascii", ONIGENC_CTYPE_ASCII), |
4284 | 0 | POSIX_BRACKET_ENTRY_INIT("word", ONIGENC_CTYPE_WORD), |
4285 | 0 | }; |
4286 | |
|
4287 | 0 | const PosixBracketEntryType *pb; |
4288 | 0 | int not, i, r; |
4289 | 0 | int ascii_range; |
4290 | 0 | OnigCodePoint c; |
4291 | 0 | OnigEncoding enc = env->enc; |
4292 | 0 | UChar *p = *src; |
4293 | |
|
4294 | 0 | if (PPEEK_IS('^')) { |
4295 | 0 | PINC_S; |
4296 | 0 | not = 1; |
4297 | 0 | } |
4298 | 0 | else |
4299 | 0 | not = 0; |
4300 | |
|
4301 | 0 | if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3) |
4302 | 0 | goto not_posix_bracket; |
4303 | | |
4304 | 0 | ascii_range = IS_ASCII_RANGE(env->option) && |
4305 | 0 | ! IS_POSIX_BRACKET_ALL_RANGE(env->option); |
4306 | 0 | for (pb = PBS; pb < PBS + numberof(PBS); pb++) { |
4307 | 0 | if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) { |
4308 | 0 | p = (UChar* )onigenc_step(enc, p, end, pb->len); |
4309 | 0 | if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0) |
4310 | 0 | return ONIGERR_INVALID_POSIX_BRACKET_TYPE; |
4311 | | |
4312 | 0 | r = add_ctype_to_cc(cc, pb->ctype, not, ascii_range, env); |
4313 | 0 | if (r != 0) return r; |
4314 | | |
4315 | 0 | if (IS_NOT_NULL(asc_cc)) { |
4316 | 0 | if (pb->ctype != ONIGENC_CTYPE_WORD && |
4317 | 0 | pb->ctype != ONIGENC_CTYPE_ASCII && |
4318 | 0 | !ascii_range) |
4319 | 0 | r = add_ctype_to_cc(asc_cc, pb->ctype, not, ascii_range, env); |
4320 | 0 | if (r != 0) return r; |
4321 | 0 | } |
4322 | | |
4323 | 0 | PINC_S; PINC_S; |
4324 | 0 | *src = p; |
4325 | 0 | return 0; |
4326 | 0 | } |
4327 | 0 | } |
4328 | | |
4329 | 0 | not_posix_bracket: |
4330 | 0 | c = 0; |
4331 | 0 | i = 0; |
4332 | 0 | while (!PEND && ((c = PPEEK) != ':') && c != ']') { |
4333 | 0 | PINC_S; |
4334 | 0 | if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break; |
4335 | 0 | } |
4336 | 0 | if (c == ':' && ! PEND) { |
4337 | 0 | PINC_S; |
4338 | 0 | if (! PEND) { |
4339 | 0 | PFETCH_S(c); |
4340 | 0 | if (c == ']') |
4341 | 0 | return ONIGERR_INVALID_POSIX_BRACKET_TYPE; |
4342 | 0 | } |
4343 | 0 | } |
4344 | | |
4345 | 0 | return 1; /* 1: is not POSIX bracket, but no error. */ |
4346 | 0 | } |
4347 | | |
4348 | | static int |
4349 | | fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) |
4350 | 0 | { |
4351 | 0 | int r; |
4352 | 0 | OnigCodePoint c; |
4353 | 0 | OnigEncoding enc = env->enc; |
4354 | 0 | UChar *prev, *start, *p = *src; |
4355 | |
|
4356 | 0 | r = 0; |
4357 | 0 | start = prev = p; |
4358 | |
|
4359 | 0 | while (!PEND) { |
4360 | 0 | prev = p; |
4361 | 0 | PFETCH_S(c); |
4362 | 0 | if (c == '}') { |
4363 | 0 | r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev); |
4364 | 0 | if (r < 0) break; |
4365 | | |
4366 | 0 | *src = p; |
4367 | 0 | return r; |
4368 | 0 | } |
4369 | 0 | else if (c == '(' || c == ')' || c == '{' || c == '|') { |
4370 | 0 | r = ONIGERR_INVALID_CHAR_PROPERTY_NAME; |
4371 | 0 | break; |
4372 | 0 | } |
4373 | 0 | } |
4374 | | |
4375 | 0 | onig_scan_env_set_error_string(env, r, *src, prev); |
4376 | 0 | return r; |
4377 | 0 | } |
4378 | | |
4379 | | static int cclass_case_fold(Node** np, CClassNode* cc, CClassNode* asc_cc, ScanEnv* env); |
4380 | | |
4381 | | static int |
4382 | | parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end, |
4383 | | ScanEnv* env) |
4384 | 0 | { |
4385 | 0 | int r, ctype; |
4386 | 0 | CClassNode* cc; |
4387 | |
|
4388 | 0 | ctype = fetch_char_property_to_ctype(src, end, env); |
4389 | 0 | if (ctype < 0) return ctype; |
4390 | | |
4391 | 0 | *np = node_new_cclass(); |
4392 | 0 | CHECK_NULL_RETURN_MEMERR(*np); |
4393 | 0 | cc = NCCLASS(*np); |
4394 | 0 | r = add_ctype_to_cc(cc, ctype, 0, 0, env); |
4395 | 0 | if (r != 0) return r; |
4396 | 0 | if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); |
4397 | |
|
4398 | 0 | if (IS_IGNORECASE(env->option)) { |
4399 | 0 | if (ctype != ONIGENC_CTYPE_ASCII) |
4400 | 0 | r = cclass_case_fold(np, cc, cc, env); |
4401 | 0 | } |
4402 | 0 | return r; |
4403 | 0 | } |
4404 | | |
4405 | | |
/* Parser state used while reading the items of a character class. */
enum CCSTATE {
  CCS_VALUE    = 0,  /* a value or class was read; a value is still pending */
  CCS_RANGE    = 1,  /* a range operator followed a value; end value expected */
  CCS_COMPLETE = 2,  /* a range has just been completed */
  CCS_START    = 3   /* nothing read yet (also the state after "&&") */
};
4412 | | |
/* Kind of the most recently read character-class item. */
enum CCVALTYPE {
  CCV_SB         = 0,  /* value that is stored in the class bitset */
  CCV_CODE_POINT = 1,  /* value that is stored in the code range buffer */
  CCV_CLASS      = 2   /* a whole class (ctype, property, POSIX bracket) */
};
4418 | | |
4419 | | static int |
4420 | | next_state_class(CClassNode* cc, CClassNode* asc_cc, |
4421 | | OnigCodePoint* vs, enum CCVALTYPE* type, |
4422 | | enum CCSTATE* state, ScanEnv* env) |
4423 | 11.7k | { |
4424 | 11.7k | int r; |
4425 | | |
4426 | 11.7k | if (*state == CCS_RANGE) |
4427 | 0 | return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE; |
4428 | | |
4429 | 11.7k | if (*state == CCS_VALUE && *type != CCV_CLASS) { |
4430 | 0 | if (*type == CCV_SB) { |
4431 | 0 | BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*vs)); |
4432 | 0 | if (IS_NOT_NULL(asc_cc)) |
4433 | 0 | BITSET_SET_BIT(asc_cc->bs, (int )(*vs)); |
4434 | 0 | } |
4435 | 0 | else if (*type == CCV_CODE_POINT) { |
4436 | 0 | r = add_code_range(&(cc->mbuf), env, *vs, *vs); |
4437 | 0 | if (r < 0) return r; |
4438 | 0 | if (IS_NOT_NULL(asc_cc)) { |
4439 | 0 | r = add_code_range0(&(asc_cc->mbuf), env, *vs, *vs, 0); |
4440 | 0 | if (r < 0) return r; |
4441 | 0 | } |
4442 | 0 | } |
4443 | 0 | } |
4444 | | |
4445 | 11.7k | *state = CCS_VALUE; |
4446 | 11.7k | *type = CCV_CLASS; |
4447 | 11.7k | return 0; |
4448 | 11.7k | } |
4449 | | |
4450 | | static int |
4451 | | next_state_val(CClassNode* cc, CClassNode* asc_cc, |
4452 | | OnigCodePoint *from, OnigCodePoint to, |
4453 | | int* from_israw, int to_israw, |
4454 | | enum CCVALTYPE intype, enum CCVALTYPE* type, |
4455 | | enum CCSTATE* state, ScanEnv* env) |
4456 | 149k | { |
4457 | 149k | int r; |
4458 | | |
4459 | 149k | switch (*state) { |
4460 | 117k | case CCS_VALUE: |
4461 | 117k | if (*type == CCV_SB) { |
4462 | 105k | BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*from)); |
4463 | 105k | if (IS_NOT_NULL(asc_cc)) |
4464 | 105k | BITSET_SET_BIT(asc_cc->bs, (int )(*from)); |
4465 | 105k | } |
4466 | 11.7k | else if (*type == CCV_CODE_POINT) { |
4467 | 0 | r = add_code_range(&(cc->mbuf), env, *from, *from); |
4468 | 0 | if (r < 0) return r; |
4469 | 0 | if (IS_NOT_NULL(asc_cc)) { |
4470 | 0 | r = add_code_range0(&(asc_cc->mbuf), env, *from, *from, 0); |
4471 | 0 | if (r < 0) return r; |
4472 | 0 | } |
4473 | 0 | } |
4474 | 117k | break; |
4475 | | |
4476 | 117k | case CCS_RANGE: |
4477 | 0 | if (intype == *type) { |
4478 | 0 | if (intype == CCV_SB) { |
4479 | 0 | if (*from > 0xff || to > 0xff) |
4480 | 0 | return ONIGERR_INVALID_CODE_POINT_VALUE; |
4481 | | |
4482 | 0 | if (*from > to) { |
4483 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) |
4484 | 0 | goto ccs_range_end; |
4485 | 0 | else |
4486 | 0 | return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; |
4487 | 0 | } |
4488 | 0 | bitset_set_range(env, cc->bs, (int )*from, (int )to); |
4489 | 0 | if (IS_NOT_NULL(asc_cc)) |
4490 | 0 | bitset_set_range(env, asc_cc->bs, (int )*from, (int )to); |
4491 | 0 | } |
4492 | 0 | else { |
4493 | 0 | r = add_code_range(&(cc->mbuf), env, *from, to); |
4494 | 0 | if (r < 0) return r; |
4495 | 0 | if (IS_NOT_NULL(asc_cc)) { |
4496 | 0 | r = add_code_range0(&(asc_cc->mbuf), env, *from, to, 0); |
4497 | 0 | if (r < 0) return r; |
4498 | 0 | } |
4499 | 0 | } |
4500 | 0 | } |
4501 | 0 | else { |
4502 | 0 | if (*from > to) { |
4503 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) |
4504 | 0 | goto ccs_range_end; |
4505 | 0 | else |
4506 | 0 | return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; |
4507 | 0 | } |
4508 | 0 | bitset_set_range(env, cc->bs, (int )*from, (int )(to < 0xff ? to : 0xff)); |
4509 | 0 | r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*from, to); |
4510 | 0 | if (r < 0) return r; |
4511 | 0 | if (IS_NOT_NULL(asc_cc)) { |
4512 | 0 | bitset_set_range(env, asc_cc->bs, (int )*from, (int )(to < 0xff ? to : 0xff)); |
4513 | 0 | r = add_code_range0(&(asc_cc->mbuf), env, (OnigCodePoint )*from, to, 0); |
4514 | 0 | if (r < 0) return r; |
4515 | 0 | } |
4516 | 0 | } |
4517 | 0 | ccs_range_end: |
4518 | 0 | *state = CCS_COMPLETE; |
4519 | 0 | break; |
4520 | | |
4521 | 0 | case CCS_COMPLETE: |
4522 | 32.2k | case CCS_START: |
4523 | 32.2k | *state = CCS_VALUE; |
4524 | 32.2k | break; |
4525 | | |
4526 | 0 | default: |
4527 | 0 | break; |
4528 | 149k | } |
4529 | | |
4530 | 149k | *from_israw = to_israw; |
4531 | 149k | *from = to; |
4532 | 149k | *type = intype; |
4533 | 149k | return 0; |
4534 | 149k | } |
4535 | | |
4536 | | static int |
4537 | | code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, |
4538 | | ScanEnv* env) |
4539 | 0 | { |
4540 | 0 | int in_esc; |
4541 | 0 | OnigCodePoint code; |
4542 | 0 | OnigEncoding enc = env->enc; |
4543 | 0 | UChar* p = from; |
4544 | |
|
4545 | 0 | in_esc = 0; |
4546 | 0 | while (! PEND) { |
4547 | 0 | if (ignore_escaped && in_esc) { |
4548 | 0 | in_esc = 0; |
4549 | 0 | } |
4550 | 0 | else { |
4551 | 0 | PFETCH_S(code); |
4552 | 0 | if (code == c) return 1; |
4553 | 0 | if (code == MC_ESC(env->syntax)) in_esc = 1; |
4554 | 0 | } |
4555 | 0 | } |
4556 | 0 | return 0; |
4557 | 0 | } |
4558 | | |
4559 | | static int |
4560 | | parse_char_class(Node** np, Node** asc_np, OnigToken* tok, UChar** src, UChar* end, |
4561 | | ScanEnv* env) |
4562 | 43.9k | { |
4563 | 43.9k | int r, neg, len, fetched, and_start; |
4564 | 43.9k | OnigCodePoint v, vs; |
4565 | 43.9k | UChar *p; |
4566 | 43.9k | Node* node; |
4567 | 43.9k | Node* asc_node; |
4568 | 43.9k | CClassNode *cc, *prev_cc; |
4569 | 43.9k | CClassNode *asc_cc, *asc_prev_cc; |
4570 | 43.9k | CClassNode work_cc, asc_work_cc; |
4571 | | |
4572 | 43.9k | enum CCSTATE state; |
4573 | 43.9k | enum CCVALTYPE val_type, in_type; |
4574 | 43.9k | int val_israw, in_israw; |
4575 | | |
4576 | 43.9k | *np = *asc_np = NULL_NODE; |
4577 | 43.9k | env->parse_depth++; |
4578 | 43.9k | if (env->parse_depth > ParseDepthLimit) |
4579 | 0 | return ONIGERR_PARSE_DEPTH_LIMIT_OVER; |
4580 | 43.9k | prev_cc = asc_prev_cc = (CClassNode* )NULL; |
4581 | 43.9k | r = fetch_token_in_cc(tok, src, end, env); |
4582 | 43.9k | if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { |
4583 | 17.5k | neg = 1; |
4584 | 17.5k | r = fetch_token_in_cc(tok, src, end, env); |
4585 | 17.5k | } |
4586 | 26.3k | else { |
4587 | 26.3k | neg = 0; |
4588 | 26.3k | } |
4589 | | |
4590 | 43.9k | if (r < 0) return r; |
4591 | 43.9k | if (r == TK_CC_CLOSE) { |
4592 | 0 | if (! code_exist_check((OnigCodePoint )']', |
4593 | 0 | *src, env->pattern_end, 1, env)) |
4594 | 0 | return ONIGERR_EMPTY_CHAR_CLASS; |
4595 | | |
4596 | 0 | CC_ESC_WARN(env, (UChar* )"]"); |
4597 | 0 | r = tok->type = TK_CHAR; /* allow []...] */ |
4598 | 0 | } |
4599 | | |
4600 | 43.9k | *np = node = node_new_cclass(); |
4601 | 43.9k | CHECK_NULL_RETURN_MEMERR(node); |
4602 | 43.9k | cc = NCCLASS(node); |
4603 | | |
4604 | 43.9k | if (IS_IGNORECASE(env->option)) { |
4605 | 0 | *asc_np = asc_node = node_new_cclass(); |
4606 | 0 | CHECK_NULL_RETURN_MEMERR(asc_node); |
4607 | 0 | asc_cc = NCCLASS(asc_node); |
4608 | 0 | } |
4609 | 43.9k | else { |
4610 | 43.9k | asc_node = NULL_NODE; |
4611 | 43.9k | asc_cc = NULL; |
4612 | 43.9k | } |
4613 | | |
4614 | 43.9k | and_start = 0; |
4615 | 43.9k | state = CCS_START; |
4616 | 43.9k | p = *src; |
4617 | 161k | while (r != TK_CC_CLOSE) { |
4618 | 117k | fetched = 0; |
4619 | 117k | switch (r) { |
4620 | 70.2k | case TK_CHAR: |
4621 | 70.2k | if ((tok->u.code >= SINGLE_BYTE_SIZE) || |
4622 | 70.2k | (len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c)) > 1) { |
4623 | 0 | in_type = CCV_CODE_POINT; |
4624 | 0 | } |
4625 | 70.2k | else if (len < 0) { |
4626 | 0 | r = len; |
4627 | 0 | goto err; |
4628 | 0 | } |
4629 | 70.2k | else { |
4630 | 70.2k | sb_char: |
4631 | 70.2k | in_type = CCV_SB; |
4632 | 70.2k | } |
4633 | 70.2k | v = (OnigCodePoint )tok->u.c; |
4634 | 70.2k | in_israw = 0; |
4635 | 70.2k | goto val_entry2; |
4636 | 0 | break; |
4637 | | |
4638 | 0 | case TK_RAW_BYTE: |
4639 | | /* tok->base != 0 : octal or hexadec. */ |
4640 | 0 | if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { |
4641 | 0 | UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; |
4642 | 0 | UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN; |
4643 | 0 | UChar* psave = p; |
4644 | 0 | int i, base = tok->base; |
4645 | |
|
4646 | 0 | buf[0] = (UChar )tok->u.c; |
4647 | 0 | for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { |
4648 | 0 | r = fetch_token_in_cc(tok, &p, end, env); |
4649 | 0 | if (r < 0) goto err; |
4650 | 0 | if (r != TK_RAW_BYTE || tok->base != base) { |
4651 | 0 | fetched = 1; |
4652 | 0 | break; |
4653 | 0 | } |
4654 | 0 | buf[i] = (UChar )tok->u.c; |
4655 | 0 | } |
4656 | | |
4657 | 0 | if (i < ONIGENC_MBC_MINLEN(env->enc)) { |
4658 | 0 | r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; |
4659 | 0 | goto err; |
4660 | 0 | } |
4661 | | |
4662 | 0 | if (env->enc == ONIG_ENCODING_EUC_JP || |
4663 | 0 | env->enc == ONIG_ENCODING_SJIS) { |
4664 | | /* Strict version of enclen does not handle invalid single code |
4665 | | * point for SJIS and EUC-JP...*/ |
4666 | 0 | len = enclen_approximate(env->enc, buf, buf + i); |
4667 | 0 | } |
4668 | 0 | else { |
4669 | 0 | len = enclen(env->enc, buf, buf + i); |
4670 | 0 | } |
4671 | 0 | if (i < len) { |
4672 | 0 | r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; |
4673 | 0 | goto err; |
4674 | 0 | } |
4675 | 0 | else if (i > len) { /* fetch back */ |
4676 | 0 | p = psave; |
4677 | 0 | for (i = 1; i < len; i++) { |
4678 | 0 | (void)fetch_token_in_cc(tok, &p, end, env); |
4679 | | /* no need to check the retun value (already checked above) */ |
4680 | 0 | } |
4681 | 0 | fetched = 0; |
4682 | 0 | } |
4683 | | |
4684 | 0 | if (i == 1) { |
4685 | 0 | v = (OnigCodePoint )buf[0]; |
4686 | 0 | goto raw_single; |
4687 | 0 | } |
4688 | 0 | else { |
4689 | 0 | v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); |
4690 | 0 | in_type = CCV_CODE_POINT; |
4691 | 0 | } |
4692 | 0 | } |
4693 | 0 | else { |
4694 | 0 | v = (OnigCodePoint )tok->u.c; |
4695 | 0 | raw_single: |
4696 | 0 | in_type = CCV_SB; |
4697 | 0 | } |
4698 | 0 | in_israw = 1; |
4699 | 0 | goto val_entry2; |
4700 | 0 | break; |
4701 | | |
4702 | 35.1k | case TK_CODE_POINT: |
4703 | 35.1k | v = tok->u.code; |
4704 | 35.1k | in_israw = 1; |
4705 | 35.1k | val_entry: |
4706 | 35.1k | len = ONIGENC_CODE_TO_MBCLEN(env->enc, v); |
4707 | 35.1k | if (len < 0) { |
4708 | 0 | r = len; |
4709 | 0 | goto err; |
4710 | 0 | } |
4711 | 35.1k | in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT); |
4712 | 105k | val_entry2: |
4713 | 105k | r = next_state_val(cc, asc_cc, &vs, v, &val_israw, in_israw, in_type, &val_type, |
4714 | 105k | &state, env); |
4715 | 105k | if (r != 0) goto err; |
4716 | 105k | break; |
4717 | | |
4718 | 105k | case TK_POSIX_BRACKET_OPEN: |
4719 | 0 | r = parse_posix_bracket(cc, asc_cc, &p, end, env); |
4720 | 0 | if (r < 0) goto err; |
4721 | 0 | if (r == 1) { /* is not POSIX bracket */ |
4722 | 0 | CC_ESC_WARN(env, (UChar* )"["); |
4723 | 0 | p = tok->backp; |
4724 | 0 | v = (OnigCodePoint )tok->u.c; |
4725 | 0 | in_israw = 0; |
4726 | 0 | goto val_entry; |
4727 | 0 | } |
4728 | 0 | goto next_class; |
4729 | 0 | break; |
4730 | | |
4731 | 11.7k | case TK_CHAR_TYPE: |
4732 | 11.7k | r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, |
4733 | 11.7k | IS_ASCII_RANGE(env->option), env); |
4734 | 11.7k | if (r != 0) return r; |
4735 | 11.7k | if (IS_NOT_NULL(asc_cc)) { |
4736 | 0 | if (tok->u.prop.ctype != ONIGENC_CTYPE_WORD) |
4737 | 0 | r = add_ctype_to_cc(asc_cc, tok->u.prop.ctype, tok->u.prop.not, |
4738 | 0 | IS_ASCII_RANGE(env->option), env); |
4739 | 0 | if (r != 0) return r; |
4740 | 0 | } |
4741 | | |
4742 | 11.7k | next_class: |
4743 | 11.7k | r = next_state_class(cc, asc_cc, &vs, &val_type, &state, env); |
4744 | 11.7k | if (r != 0) goto err; |
4745 | 11.7k | break; |
4746 | | |
4747 | 11.7k | case TK_CHAR_PROPERTY: |
4748 | 0 | { |
4749 | 0 | int ctype; |
4750 | |
|
4751 | 0 | ctype = fetch_char_property_to_ctype(&p, end, env); |
4752 | 0 | if (ctype < 0) return ctype; |
4753 | 0 | r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, 0, env); |
4754 | 0 | if (r != 0) return r; |
4755 | 0 | if (IS_NOT_NULL(asc_cc)) { |
4756 | 0 | if (ctype != ONIGENC_CTYPE_ASCII) |
4757 | 0 | r = add_ctype_to_cc(asc_cc, ctype, tok->u.prop.not, 0, env); |
4758 | 0 | if (r != 0) return r; |
4759 | 0 | } |
4760 | 0 | goto next_class; |
4761 | 0 | } |
4762 | 0 | break; |
4763 | | |
4764 | 0 | case TK_CC_RANGE: |
4765 | 0 | if (state == CCS_VALUE) { |
4766 | 0 | r = fetch_token_in_cc(tok, &p, end, env); |
4767 | 0 | if (r < 0) goto err; |
4768 | 0 | fetched = 1; |
4769 | 0 | if (r == TK_CC_CLOSE) { /* allow [x-] */ |
4770 | 0 | range_end_val: |
4771 | 0 | v = (OnigCodePoint )'-'; |
4772 | 0 | in_israw = 0; |
4773 | 0 | goto val_entry; |
4774 | 0 | } |
4775 | 0 | else if (r == TK_CC_AND) { |
4776 | 0 | CC_ESC_WARN(env, (UChar* )"-"); |
4777 | 0 | goto range_end_val; |
4778 | 0 | } |
4779 | | |
4780 | 0 | if (val_type == CCV_CLASS) { |
4781 | 0 | r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; |
4782 | 0 | goto err; |
4783 | 0 | } |
4784 | | |
4785 | 0 | state = CCS_RANGE; |
4786 | 0 | } |
4787 | 0 | else if (state == CCS_START) { |
4788 | | /* [-xa] is allowed */ |
4789 | 0 | v = (OnigCodePoint )tok->u.c; |
4790 | 0 | in_israw = 0; |
4791 | |
|
4792 | 0 | r = fetch_token_in_cc(tok, &p, end, env); |
4793 | 0 | if (r < 0) goto err; |
4794 | 0 | fetched = 1; |
4795 | | /* [--x] or [a&&-x] is warned. */ |
4796 | 0 | if (r == TK_CC_RANGE || and_start != 0) |
4797 | 0 | CC_ESC_WARN(env, (UChar* )"-"); |
4798 | |
|
4799 | 0 | goto val_entry; |
4800 | 0 | } |
4801 | 0 | else if (state == CCS_RANGE) { |
4802 | 0 | CC_ESC_WARN(env, (UChar* )"-"); |
4803 | 0 | goto sb_char; /* [!--x] is allowed */ |
4804 | 0 | } |
4805 | 0 | else { /* CCS_COMPLETE */ |
4806 | 0 | r = fetch_token_in_cc(tok, &p, end, env); |
4807 | 0 | if (r < 0) goto err; |
4808 | 0 | fetched = 1; |
4809 | 0 | if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */ |
4810 | 0 | else if (r == TK_CC_AND) { |
4811 | 0 | CC_ESC_WARN(env, (UChar* )"-"); |
4812 | 0 | goto range_end_val; |
4813 | 0 | } |
4814 | | |
4815 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) { |
4816 | 0 | CC_ESC_WARN(env, (UChar* )"-"); |
4817 | 0 | goto range_end_val; /* [0-9-a] is allowed as [0-9\-a] */ |
4818 | 0 | } |
4819 | 0 | r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; |
4820 | 0 | goto err; |
4821 | 0 | } |
4822 | 0 | break; |
4823 | | |
4824 | 0 | case TK_CC_CC_OPEN: /* [ */ |
4825 | 0 | { |
4826 | 0 | Node *anode, *aasc_node; |
4827 | 0 | CClassNode* acc; |
4828 | |
|
4829 | 0 | r = parse_char_class(&anode, &aasc_node, tok, &p, end, env); |
4830 | 0 | if (r == 0) { |
4831 | 0 | acc = NCCLASS(anode); |
4832 | 0 | r = or_cclass(cc, acc, env); |
4833 | 0 | } |
4834 | 0 | if (r == 0 && IS_NOT_NULL(aasc_node)) { |
4835 | 0 | acc = NCCLASS(aasc_node); |
4836 | 0 | r = or_cclass(asc_cc, acc, env); |
4837 | 0 | } |
4838 | 0 | onig_node_free(anode); |
4839 | 0 | onig_node_free(aasc_node); |
4840 | 0 | if (r != 0) goto err; |
4841 | 0 | } |
4842 | 0 | break; |
4843 | | |
4844 | 0 | case TK_CC_AND: /* && */ |
4845 | 0 | { |
4846 | 0 | if (state == CCS_VALUE) { |
4847 | 0 | r = next_state_val(cc, asc_cc, &vs, 0, &val_israw, 0, val_type, |
4848 | 0 | &val_type, &state, env); |
4849 | 0 | if (r != 0) goto err; |
4850 | 0 | } |
4851 | | /* initialize local variables */ |
4852 | 0 | and_start = 1; |
4853 | 0 | state = CCS_START; |
4854 | |
|
4855 | 0 | if (IS_NOT_NULL(prev_cc)) { |
4856 | 0 | r = and_cclass(prev_cc, cc, env); |
4857 | 0 | if (r != 0) goto err; |
4858 | 0 | bbuf_free(cc->mbuf); |
4859 | 0 | if (IS_NOT_NULL(asc_cc)) { |
4860 | 0 | r = and_cclass(asc_prev_cc, asc_cc, env); |
4861 | 0 | if (r != 0) goto err; |
4862 | 0 | bbuf_free(asc_cc->mbuf); |
4863 | 0 | } |
4864 | 0 | } |
4865 | 0 | else { |
4866 | 0 | prev_cc = cc; |
4867 | 0 | cc = &work_cc; |
4868 | 0 | if (IS_NOT_NULL(asc_cc)) { |
4869 | 0 | asc_prev_cc = asc_cc; |
4870 | 0 | asc_cc = &asc_work_cc; |
4871 | 0 | } |
4872 | 0 | } |
4873 | 0 | initialize_cclass(cc); |
4874 | 0 | if (IS_NOT_NULL(asc_cc)) |
4875 | 0 | initialize_cclass(asc_cc); |
4876 | 0 | } |
4877 | 0 | break; |
4878 | | |
4879 | 0 | case TK_EOT: |
4880 | 0 | r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS; |
4881 | 0 | goto err; |
4882 | 0 | break; |
4883 | 0 | default: |
4884 | 0 | r = ONIGERR_PARSER_BUG; |
4885 | 0 | goto err; |
4886 | 0 | break; |
4887 | 117k | } |
4888 | | |
4889 | 117k | if (fetched) |
4890 | 0 | r = tok->type; |
4891 | 117k | else { |
4892 | 117k | r = fetch_token_in_cc(tok, &p, end, env); |
4893 | 117k | if (r < 0) goto err; |
4894 | 117k | } |
4895 | 117k | } |
4896 | | |
4897 | 43.9k | if (state == CCS_VALUE) { |
4898 | 43.9k | r = next_state_val(cc, asc_cc, &vs, 0, &val_israw, 0, val_type, |
4899 | 43.9k | &val_type, &state, env); |
4900 | 43.9k | if (r != 0) goto err; |
4901 | 43.9k | } |
4902 | | |
4903 | 43.9k | if (IS_NOT_NULL(prev_cc)) { |
4904 | 0 | r = and_cclass(prev_cc, cc, env); |
4905 | 0 | if (r != 0) goto err; |
4906 | 0 | bbuf_free(cc->mbuf); |
4907 | 0 | cc = prev_cc; |
4908 | 0 | if (IS_NOT_NULL(asc_cc)) { |
4909 | 0 | r = and_cclass(asc_prev_cc, asc_cc, env); |
4910 | 0 | if (r != 0) goto err; |
4911 | 0 | bbuf_free(asc_cc->mbuf); |
4912 | 0 | asc_cc = asc_prev_cc; |
4913 | 0 | } |
4914 | 0 | } |
4915 | | |
4916 | 43.9k | if (neg != 0) { |
4917 | 17.5k | NCCLASS_SET_NOT(cc); |
4918 | 17.5k | if (IS_NOT_NULL(asc_cc)) |
4919 | 17.5k | NCCLASS_SET_NOT(asc_cc); |
4920 | 17.5k | } |
4921 | 26.3k | else { |
4922 | 26.3k | NCCLASS_CLEAR_NOT(cc); |
4923 | 26.3k | if (IS_NOT_NULL(asc_cc)) |
4924 | 26.3k | NCCLASS_CLEAR_NOT(asc_cc); |
4925 | 26.3k | } |
4926 | 43.9k | if (IS_NCCLASS_NOT(cc) && |
4927 | 43.9k | IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) { |
4928 | 0 | int is_empty; |
4929 | |
|
4930 | 0 | is_empty = (IS_NULL(cc->mbuf) ? 1 : 0); |
4931 | 0 | if (is_empty != 0) |
4932 | 0 | BITSET_IS_EMPTY(cc->bs, is_empty); |
4933 | |
|
4934 | 0 | if (is_empty == 0) { |
4935 | 0 | #define NEWLINE_CODE 0x0a |
4936 | |
|
4937 | 0 | if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) { |
4938 | 0 | if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1) |
4939 | 0 | BITSET_SET_BIT_CHKDUP(cc->bs, NEWLINE_CODE); |
4940 | 0 | else { |
4941 | 0 | r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE); |
4942 | 0 | if (r < 0) goto err; |
4943 | 0 | } |
4944 | 0 | } |
4945 | 0 | } |
4946 | 0 | } |
4947 | 43.9k | *src = p; |
4948 | 43.9k | env->parse_depth--; |
4949 | 43.9k | return 0; |
4950 | | |
4951 | 0 | err: |
4952 | 0 | if (cc != NCCLASS(*np)) |
4953 | 0 | bbuf_free(cc->mbuf); |
4954 | 0 | if (IS_NOT_NULL(asc_cc) && (asc_cc != NCCLASS(*asc_np))) |
4955 | 0 | bbuf_free(asc_cc->mbuf); |
4956 | 0 | return r; |
4957 | 43.9k | } |
4958 | | |
4959 | | static int parse_subexp(Node** top, OnigToken* tok, int term, |
4960 | | UChar** src, UChar* end, ScanEnv* env); |
4961 | | |
4962 | | static int |
4963 | | parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, |
4964 | | ScanEnv* env) |
4965 | 35.1k | { |
4966 | 35.1k | int r = 0, num; |
4967 | 35.1k | Node *target, *work1 = NULL, *work2 = NULL; |
4968 | 35.1k | OnigOptionType option; |
4969 | 35.1k | OnigCodePoint c; |
4970 | 35.1k | OnigEncoding enc = env->enc; |
4971 | | |
4972 | 35.1k | #ifdef USE_NAMED_GROUP |
4973 | 35.1k | int list_capture; |
4974 | 35.1k | #endif |
4975 | | |
4976 | 35.1k | UChar* p = *src; |
4977 | 35.1k | PFETCH_READY; |
4978 | | |
4979 | 35.1k | *np = NULL; |
4980 | 35.1k | if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; |
4981 | | |
4982 | 35.1k | option = env->option; |
4983 | 35.1k | if (PPEEK_IS('?') && |
4984 | 35.1k | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { |
4985 | 32.2k | PINC; |
4986 | 32.2k | if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; |
4987 | | |
4988 | 32.2k | PFETCH(c); |
4989 | 32.2k | switch (c) { |
4990 | 17.5k | case ':': /* (?:...) grouping only */ |
4991 | 17.5k | group: |
4992 | 17.5k | r = fetch_token(tok, &p, end, env); |
4993 | 17.5k | if (r < 0) return r; |
4994 | 17.5k | r = parse_subexp(np, tok, term, &p, end, env); |
4995 | 17.5k | if (r < 0) return r; |
4996 | 17.5k | *src = p; |
4997 | 17.5k | return 1; /* group */ |
4998 | 0 | break; |
4999 | | |
5000 | 0 | case '=': |
5001 | 0 | *np = onig_node_new_anchor(ANCHOR_PREC_READ); |
5002 | 0 | break; |
5003 | 0 | case '!': /* preceding read */ |
5004 | 0 | *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT); |
5005 | 0 | break; |
5006 | 0 | case '>': /* (?>...) stop backtrack */ |
5007 | 0 | *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK); |
5008 | 0 | break; |
5009 | 0 | case '~': /* (?~...) absent operator */ |
5010 | 0 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_TILDE_ABSENT)) { |
5011 | 0 | *np = node_new_enclose(ENCLOSE_ABSENT); |
5012 | 0 | } |
5013 | 0 | else { |
5014 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5015 | 0 | } |
5016 | 0 | break; |
5017 | | |
5018 | 0 | #ifdef USE_NAMED_GROUP |
5019 | 0 | case '\'': |
5020 | 0 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { |
5021 | 0 | goto named_group1; |
5022 | 0 | } |
5023 | 0 | else |
5024 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5025 | 0 | break; |
5026 | | |
5027 | 0 | # ifdef USE_CAPITAL_P_NAMED_GROUP |
5028 | 0 | case 'P': /* (?P<name>...) */ |
5029 | 0 | if (!PEND && |
5030 | 0 | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) { |
5031 | 0 | PFETCH(c); |
5032 | 0 | if (c == '<') goto named_group1; |
5033 | 0 | } |
5034 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5035 | 0 | break; |
5036 | 0 | # endif |
5037 | 0 | #endif |
5038 | | |
5039 | 11.7k | case '<': /* look behind (?<=...), (?<!...) */ |
5040 | 11.7k | if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; |
5041 | 11.7k | PFETCH(c); |
5042 | 11.7k | if (c == '=') |
5043 | 0 | *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND); |
5044 | 11.7k | else if (c == '!') |
5045 | 0 | *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT); |
5046 | 11.7k | #ifdef USE_NAMED_GROUP |
5047 | 11.7k | else { /* (?<name>...) */ |
5048 | 11.7k | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { |
5049 | 11.7k | UChar *name; |
5050 | 11.7k | UChar *name_end; |
5051 | | |
5052 | 11.7k | PUNFETCH; |
5053 | 11.7k | c = '<'; |
5054 | | |
5055 | 11.7k | named_group1: |
5056 | 11.7k | list_capture = 0; |
5057 | | |
5058 | 11.7k | # ifdef USE_CAPTURE_HISTORY |
5059 | 11.7k | named_group2: |
5060 | 11.7k | # endif |
5061 | 11.7k | name = p; |
5062 | 11.7k | r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0); |
5063 | 11.7k | if (r < 0) return r; |
5064 | | |
5065 | 11.7k | num = scan_env_add_mem_entry(env); |
5066 | 11.7k | if (num < 0) return num; |
5067 | 11.7k | if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM) |
5068 | 0 | return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; |
5069 | | |
5070 | 11.7k | r = name_add(env->reg, name, name_end, num, env); |
5071 | 11.7k | if (r != 0) return r; |
5072 | 11.7k | *np = node_new_enclose_memory(env->option, 1); |
5073 | 11.7k | CHECK_NULL_RETURN_MEMERR(*np); |
5074 | 11.7k | NENCLOSE(*np)->regnum = num; |
5075 | 11.7k | if (list_capture != 0) |
5076 | 0 | BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num); |
5077 | 11.7k | env->num_named++; |
5078 | 11.7k | } |
5079 | 0 | else { |
5080 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5081 | 0 | } |
5082 | 11.7k | } |
5083 | | #else |
5084 | | else { |
5085 | | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5086 | | } |
5087 | | #endif |
5088 | 11.7k | break; |
5089 | | |
5090 | 11.7k | #ifdef USE_CAPTURE_HISTORY |
5091 | 11.7k | case '@': |
5092 | 0 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) { |
5093 | 0 | # ifdef USE_NAMED_GROUP |
5094 | 0 | if (!PEND && |
5095 | 0 | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { |
5096 | 0 | PFETCH(c); |
5097 | 0 | if (c == '<' || c == '\'') { |
5098 | 0 | list_capture = 1; |
5099 | 0 | goto named_group2; /* (?@<name>...) */ |
5100 | 0 | } |
5101 | 0 | PUNFETCH; |
5102 | 0 | } |
5103 | 0 | # endif |
5104 | 0 | *np = node_new_enclose_memory(env->option, 0); |
5105 | 0 | CHECK_NULL_RETURN_MEMERR(*np); |
5106 | 0 | num = scan_env_add_mem_entry(env); |
5107 | 0 | if (num < 0) return num; |
5108 | 0 | if (num >= (int )BIT_STATUS_BITS_NUM) |
5109 | 0 | return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; |
5110 | | |
5111 | 0 | NENCLOSE(*np)->regnum = num; |
5112 | 0 | BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num); |
5113 | 0 | } |
5114 | 0 | else { |
5115 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5116 | 0 | } |
5117 | 0 | break; |
5118 | 0 | #endif /* USE_CAPTURE_HISTORY */ |
5119 | | |
5120 | 0 | case '(': /* conditional expression: (?(cond)yes), (?(cond)yes|no) */ |
5121 | 0 | if (!PEND && |
5122 | 0 | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_CONDITION)) { |
5123 | 0 | UChar *name = NULL; |
5124 | 0 | UChar *name_end; |
5125 | 0 | PFETCH(c); |
5126 | 0 | if (ONIGENC_IS_CODE_DIGIT(enc, c)) { /* (n) */ |
5127 | 0 | PUNFETCH; |
5128 | 0 | r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &num, 1); |
5129 | 0 | if (r < 0) return r; |
5130 | | #if 0 |
5131 | | /* Relative number is not currently supported. (same as Perl) */ |
5132 | | if (num < 0) { |
5133 | | num = BACKREF_REL_TO_ABS(num, env); |
5134 | | if (num <= 0) |
5135 | | return ONIGERR_INVALID_BACKREF; |
5136 | | } |
5137 | | #endif |
5138 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { |
5139 | 0 | if (num > env->num_mem || |
5140 | 0 | IS_NULL(SCANENV_MEM_NODES(env)[num])) |
5141 | 0 | return ONIGERR_INVALID_BACKREF; |
5142 | 0 | } |
5143 | 0 | } |
5144 | 0 | #ifdef USE_NAMED_GROUP |
5145 | 0 | else if (c == '<' || c == '\'') { /* (<name>), ('name') */ |
5146 | 0 | name = p; |
5147 | 0 | r = fetch_named_backref_token(c, tok, &p, end, env); |
5148 | 0 | if (r < 0) return r; |
5149 | 0 | if (!PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION; |
5150 | 0 | PINC; |
5151 | |
|
5152 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_USE_LEFT_MOST_NAMED_GROUP)) { |
5153 | 0 | num = tok->u.backref.ref1; |
5154 | 0 | } |
5155 | 0 | else { |
5156 | | /* FIXME: |
5157 | | * Use left most named group for now. This is the same as Perl. |
5158 | | * However this should use the same strategy as normal back- |
5159 | | * references on Ruby syntax; search right to left. */ |
5160 | 0 | int len = tok->u.backref.num; |
5161 | 0 | num = len > 1 ? tok->u.backref.refs[0] : tok->u.backref.ref1; |
5162 | 0 | } |
5163 | 0 | } |
5164 | 0 | #endif |
5165 | 0 | else |
5166 | 0 | return ONIGERR_INVALID_CONDITION_PATTERN; |
5167 | 0 | *np = node_new_enclose(ENCLOSE_CONDITION); |
5168 | 0 | CHECK_NULL_RETURN_MEMERR(*np); |
5169 | 0 | NENCLOSE(*np)->regnum = num; |
5170 | 0 | if (IS_NOT_NULL(name)) NENCLOSE(*np)->state |= NST_NAME_REF; |
5171 | 0 | } |
5172 | 0 | else |
5173 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5174 | 0 | break; |
5175 | | |
5176 | | #if 0 |
5177 | | case '|': /* branch reset: (?|...) */ |
5178 | | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_VBAR_BRANCH_RESET)) { |
5179 | | /* TODO */ |
5180 | | } |
5181 | | else |
5182 | | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5183 | | break; |
5184 | | #endif |
5185 | | |
5186 | 0 | case '^': /* loads default options */ |
5187 | 0 | if (!PEND && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { |
5188 | | /* d-imsx */ |
5189 | 0 | ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); |
5190 | 0 | ONOFF(option, ONIG_OPTION_IGNORECASE, 1); |
5191 | 0 | ONOFF(option, ONIG_OPTION_SINGLELINE, 0); |
5192 | 0 | ONOFF(option, ONIG_OPTION_MULTILINE, 1); |
5193 | 0 | ONOFF(option, ONIG_OPTION_EXTEND, 1); |
5194 | 0 | PFETCH(c); |
5195 | 0 | } |
5196 | | #if 0 |
5197 | | else if (!PEND && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) { |
5198 | | /* d-imx */ |
5199 | | ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); |
5200 | | ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0); |
5201 | | ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0); |
5202 | | ONOFF(option, ONIG_OPTION_IGNORECASE, 1); |
5203 | | ONOFF(option, ONIG_OPTION_MULTILINE, 1); |
5204 | | ONOFF(option, ONIG_OPTION_EXTEND, 1); |
5205 | | PFETCH(c); |
5206 | | } |
5207 | | #endif |
5208 | 0 | else { |
5209 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5210 | 0 | } |
5211 | | /* fall through */ |
5212 | | #ifdef USE_POSIXLINE_OPTION |
5213 | | case 'p': |
5214 | | #endif |
5215 | 2.92k | case '-': case 'i': case 'm': case 's': case 'x': |
5216 | 2.92k | case 'a': case 'd': case 'l': case 'u': |
5217 | 2.92k | { |
5218 | 2.92k | int neg = 0; |
5219 | | |
5220 | 5.85k | while (1) { |
5221 | 5.85k | switch (c) { |
5222 | 2.92k | case ':': |
5223 | 2.92k | case ')': |
5224 | 2.92k | break; |
5225 | | |
5226 | 0 | case '-': neg = 1; break; |
5227 | 2.92k | case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break; |
5228 | 0 | case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break; |
5229 | 0 | case 's': |
5230 | 0 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { |
5231 | 0 | ONOFF(option, ONIG_OPTION_MULTILINE, neg); |
5232 | 0 | } |
5233 | 0 | else |
5234 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5235 | 0 | break; |
5236 | | |
5237 | 0 | case 'm': |
5238 | 0 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { |
5239 | 0 | ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0)); |
5240 | 0 | } |
5241 | 0 | else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) { |
5242 | 0 | ONOFF(option, ONIG_OPTION_MULTILINE, neg); |
5243 | 0 | } |
5244 | 0 | else |
5245 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5246 | 0 | break; |
5247 | | #ifdef USE_POSIXLINE_OPTION |
5248 | | case 'p': |
5249 | | ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg); |
5250 | | break; |
5251 | | #endif |
5252 | | |
5253 | 0 | case 'a': /* limits \d, \s, \w and POSIX brackets to ASCII range */ |
5254 | 0 | if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) || |
5255 | 0 | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) && |
5256 | 0 | (neg == 0)) { |
5257 | 0 | ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); |
5258 | 0 | ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1); |
5259 | 0 | ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1); |
5260 | 0 | } |
5261 | 0 | else |
5262 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5263 | 0 | break; |
5264 | | |
5265 | 0 | case 'u': |
5266 | 0 | if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) || |
5267 | 0 | IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) && |
5268 | 0 | (neg == 0)) { |
5269 | 0 | ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); |
5270 | 0 | ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1); |
5271 | 0 | ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1); |
5272 | 0 | } |
5273 | 0 | else |
5274 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5275 | 0 | break; |
5276 | | |
5277 | 0 | case 'd': |
5278 | 0 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && |
5279 | 0 | (neg == 0)) { |
5280 | 0 | ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); |
5281 | 0 | } |
5282 | 0 | else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY) && |
5283 | 0 | (neg == 0)) { |
5284 | 0 | ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); |
5285 | 0 | ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0); |
5286 | 0 | ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0); |
5287 | 0 | } |
5288 | 0 | else |
5289 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5290 | 0 | break; |
5291 | | |
5292 | 0 | case 'l': |
5293 | 0 | if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && (neg == 0)) { |
5294 | 0 | ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); |
5295 | 0 | } |
5296 | 0 | else |
5297 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5298 | 0 | break; |
5299 | | |
5300 | 0 | default: |
5301 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5302 | 5.85k | } |
5303 | | |
5304 | 5.85k | if (c == ')') { |
5305 | 0 | *np = node_new_option(option); |
5306 | 0 | CHECK_NULL_RETURN_MEMERR(*np); |
5307 | 0 | *src = p; |
5308 | 0 | return 2; /* option only */ |
5309 | 0 | } |
5310 | 5.85k | else if (c == ':') { |
5311 | 2.92k | OnigOptionType prev = env->option; |
5312 | | |
5313 | 2.92k | env->option = option; |
5314 | 2.92k | r = fetch_token(tok, &p, end, env); |
5315 | 2.92k | if (r < 0) { |
5316 | 0 | env->option = prev; |
5317 | 0 | return r; |
5318 | 0 | } |
5319 | 2.92k | r = parse_subexp(&target, tok, term, &p, end, env); |
5320 | 2.92k | env->option = prev; |
5321 | 2.92k | if (r < 0) return r; |
5322 | 2.92k | *np = node_new_option(option); |
5323 | 2.92k | CHECK_NULL_RETURN_MEMERR(*np); |
5324 | 2.92k | NENCLOSE(*np)->target = target; |
5325 | 2.92k | *src = p; |
5326 | 2.92k | return 0; |
5327 | 2.92k | } |
5328 | | |
5329 | 2.92k | if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; |
5330 | 2.92k | PFETCH(c); |
5331 | 2.92k | } |
5332 | 2.92k | } |
5333 | 0 | break; |
5334 | | |
5335 | 0 | default: |
5336 | 0 | return ONIGERR_UNDEFINED_GROUP_OPTION; |
5337 | 32.2k | } |
5338 | 32.2k | } |
5339 | 2.92k | else { |
5340 | 2.92k | if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP)) |
5341 | 0 | goto group; |
5342 | | |
5343 | 2.92k | *np = node_new_enclose_memory(env->option, 0); |
5344 | 2.92k | CHECK_NULL_RETURN_MEMERR(*np); |
5345 | 2.92k | num = scan_env_add_mem_entry(env); |
5346 | 2.92k | if (num < 0) return num; |
5347 | 2.92k | NENCLOSE(*np)->regnum = num; |
5348 | 2.92k | } |
5349 | | |
5350 | 14.6k | CHECK_NULL_RETURN_MEMERR(*np); |
5351 | 14.6k | r = fetch_token(tok, &p, end, env); |
5352 | 14.6k | if (r < 0) return r; |
5353 | 14.6k | r = parse_subexp(&target, tok, term, &p, end, env); |
5354 | 14.6k | if (r < 0) { |
5355 | 0 | onig_node_free(target); |
5356 | 0 | return r; |
5357 | 0 | } |
5358 | | |
5359 | 14.6k | if (NTYPE(*np) == NT_ANCHOR) |
5360 | 0 | NANCHOR(*np)->target = target; |
5361 | 14.6k | else { |
5362 | 14.6k | NENCLOSE(*np)->target = target; |
5363 | 14.6k | if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) { |
5364 | | /* Don't move this to previous of parse_subexp() */ |
5365 | 14.6k | r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np); |
5366 | 14.6k | if (r != 0) return r; |
5367 | 14.6k | } |
5368 | 0 | else if (NENCLOSE(*np)->type == ENCLOSE_CONDITION) { |
5369 | 0 | if (NTYPE(target) != NT_ALT) { |
5370 | | /* convert (?(cond)yes) to (?(cond)yes|empty) */ |
5371 | 0 | work1 = node_new_empty(); |
5372 | 0 | if (IS_NULL(work1)) goto err; |
5373 | 0 | work2 = onig_node_new_alt(work1, NULL_NODE); |
5374 | 0 | if (IS_NULL(work2)) goto err; |
5375 | 0 | work1 = onig_node_new_alt(target, work2); |
5376 | 0 | if (IS_NULL(work1)) goto err; |
5377 | 0 | NENCLOSE(*np)->target = work1; |
5378 | 0 | } |
5379 | 0 | } |
5380 | 14.6k | } |
5381 | | |
5382 | 14.6k | *src = p; |
5383 | 14.6k | return 0; |
5384 | | |
5385 | 0 | err: |
5386 | 0 | onig_node_free(work1); |
5387 | 0 | onig_node_free(work2); |
5388 | 0 | onig_node_free(*np); |
5389 | 0 | *np = NULL; |
5390 | 0 | return ONIGERR_MEMORY; |
5391 | 14.6k | } |
5392 | | |
5393 | | static const char* const PopularQStr[] = { |
5394 | | "?", "*", "+", "??", "*?", "+?" |
5395 | | }; |
5396 | | |
5397 | | static const char* const ReduceQStr[] = { |
5398 | | "", "", "*", "*?", "??", "+ and ??", "+? and ?" |
5399 | | }; |
5400 | | |
5401 | | static int |
5402 | | set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) |
5403 | 87.8k | { |
5404 | 87.8k | QtfrNode* qn; |
5405 | | |
5406 | 87.8k | qn = NQTFR(qnode); |
5407 | 87.8k | if (qn->lower == 1 && qn->upper == 1) { |
5408 | 0 | return 1; |
5409 | 0 | } |
5410 | | |
5411 | 87.8k | switch (NTYPE(target)) { |
5412 | 2.92k | case NT_STR: |
5413 | 2.92k | if (! group) { |
5414 | 0 | StrNode* sn = NSTR(target); |
5415 | 0 | if (str_node_can_be_split(sn, env->enc)) { |
5416 | 0 | Node* n = str_node_split_last_char(sn, env->enc); |
5417 | 0 | if (IS_NOT_NULL(n)) { |
5418 | 0 | qn->target = n; |
5419 | 0 | return 2; |
5420 | 0 | } |
5421 | 0 | } |
5422 | 0 | } |
5423 | 2.92k | break; |
5424 | | |
5425 | 2.92k | case NT_QTFR: |
5426 | 0 | { /* check redundant double repeat. */ |
5427 | | /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */ |
5428 | 0 | QtfrNode* qnt = NQTFR(target); |
5429 | 0 | int nestq_num = popular_quantifier_num(qn); |
5430 | 0 | int targetq_num = popular_quantifier_num(qnt); |
5431 | |
|
5432 | 0 | #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR |
5433 | 0 | if (nestq_num >= 0 && targetq_num >= 0 && |
5434 | 0 | IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) { |
5435 | 0 | switch (ReduceTypeTable[targetq_num][nestq_num]) { |
5436 | 0 | case RQ_ASIS: |
5437 | 0 | break; |
5438 | | |
5439 | 0 | case RQ_DEL: |
5440 | 0 | if (onig_warn != onig_null_warn) { |
5441 | 0 | onig_syntax_warn(env, "regular expression has redundant nested repeat operator '%s'", |
5442 | 0 | PopularQStr[targetq_num]); |
5443 | 0 | } |
5444 | 0 | goto warn_exit; |
5445 | 0 | break; |
5446 | | |
5447 | 0 | default: |
5448 | 0 | if (onig_warn != onig_null_warn) { |
5449 | 0 | onig_syntax_warn(env, "nested repeat operator '%s' and '%s' was replaced with '%s' in regular expression", |
5450 | 0 | PopularQStr[targetq_num], PopularQStr[nestq_num], |
5451 | 0 | ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]); |
5452 | 0 | } |
5453 | 0 | goto warn_exit; |
5454 | 0 | break; |
5455 | 0 | } |
5456 | 0 | } |
5457 | | |
5458 | 0 | warn_exit: |
5459 | 0 | #endif |
5460 | 0 | if (targetq_num >= 0) { |
5461 | 0 | if (nestq_num >= 0) { |
5462 | 0 | onig_reduce_nested_quantifier(qnode, target); |
5463 | 0 | goto q_exit; |
5464 | 0 | } |
5465 | 0 | else if (targetq_num == 1 || targetq_num == 2) { /* * or + */ |
5466 | | /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */ |
5467 | 0 | if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) { |
5468 | 0 | qn->upper = (qn->lower == 0 ? 1 : qn->lower); |
5469 | 0 | } |
5470 | 0 | } |
5471 | 0 | } |
5472 | 0 | } |
5473 | 0 | break; |
5474 | | |
5475 | 84.9k | default: |
5476 | 84.9k | break; |
5477 | 87.8k | } |
5478 | | |
5479 | 87.8k | qn->target = target; |
5480 | 87.8k | q_exit: |
5481 | 87.8k | return 0; |
5482 | 87.8k | } |
5483 | | |
5484 | | |
5485 | | #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS |
5486 | | static int |
5487 | | clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) |
5488 | | { |
5489 | | BBuf *tbuf; |
5490 | | int r; |
5491 | | |
5492 | | if (IS_NCCLASS_NOT(cc)) { |
5493 | | bitset_invert(cc->bs); |
5494 | | |
5495 | | if (! ONIGENC_IS_SINGLEBYTE(enc)) { |
5496 | | r = not_code_range_buf(enc, cc->mbuf, &tbuf); |
5497 | | if (r != 0) return r; |
5498 | | |
5499 | | bbuf_free(cc->mbuf); |
5500 | | cc->mbuf = tbuf; |
5501 | | } |
5502 | | |
5503 | | NCCLASS_CLEAR_NOT(cc); |
5504 | | } |
5505 | | |
5506 | | return 0; |
5507 | | } |
5508 | | #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ |
5509 | | |
5510 | | typedef struct { |
5511 | | ScanEnv* env; |
5512 | | CClassNode* cc; |
5513 | | CClassNode* asc_cc; |
5514 | | Node* alt_root; |
5515 | | Node** ptail; |
5516 | | } IApplyCaseFoldArg; |
5517 | | |
5518 | | static int |
5519 | | i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], |
5520 | | int to_len, void* arg) |
5521 | 0 | { |
5522 | 0 | IApplyCaseFoldArg* iarg; |
5523 | 0 | ScanEnv* env; |
5524 | 0 | CClassNode* cc; |
5525 | 0 | CClassNode* asc_cc; |
5526 | 0 | BitSetRef bs; |
5527 | 0 | int add_flag, r; |
5528 | |
|
5529 | 0 | iarg = (IApplyCaseFoldArg* )arg; |
5530 | 0 | env = iarg->env; |
5531 | 0 | cc = iarg->cc; |
5532 | 0 | asc_cc = iarg->asc_cc; |
5533 | 0 | bs = cc->bs; |
5534 | |
|
5535 | 0 | if (IS_NULL(asc_cc)) { |
5536 | 0 | add_flag = 0; |
5537 | 0 | } |
5538 | 0 | else if (ONIGENC_IS_ASCII_CODE(from) == ONIGENC_IS_ASCII_CODE(*to)) { |
5539 | 0 | add_flag = 1; |
5540 | 0 | } |
5541 | 0 | else { |
5542 | 0 | add_flag = onig_is_code_in_cc(env->enc, from, asc_cc); |
5543 | 0 | if (IS_NCCLASS_NOT(asc_cc)) |
5544 | 0 | add_flag = !add_flag; |
5545 | 0 | } |
5546 | |
|
5547 | 0 | if (to_len == 1) { |
5548 | 0 | int is_in = onig_is_code_in_cc(env->enc, from, cc); |
5549 | 0 | #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS |
5550 | 0 | if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) || |
5551 | 0 | (is_in == 0 && IS_NCCLASS_NOT(cc))) { |
5552 | 0 | if (add_flag) { |
5553 | 0 | if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { |
5554 | 0 | r = add_code_range0(&(cc->mbuf), env, *to, *to, 0); |
5555 | 0 | if (r < 0) return r; |
5556 | 0 | } |
5557 | 0 | else { |
5558 | 0 | BITSET_SET_BIT(bs, *to); |
5559 | 0 | } |
5560 | 0 | } |
5561 | 0 | } |
5562 | | #else |
5563 | | if (is_in != 0) { |
5564 | | if (add_flag) { |
5565 | | if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { |
5566 | | if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc); |
5567 | | r = add_code_range0(&(cc->mbuf), env, *to, *to, 0); |
5568 | | if (r < 0) return r; |
5569 | | } |
5570 | | else { |
5571 | | if (IS_NCCLASS_NOT(cc)) { |
5572 | | BITSET_CLEAR_BIT(bs, *to); |
5573 | | } |
5574 | | else { |
5575 | | BITSET_SET_BIT(bs, *to); |
5576 | | } |
5577 | | } |
5578 | | } |
5579 | | } |
5580 | | #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ |
5581 | 0 | } |
5582 | 0 | else { |
5583 | 0 | int r, i, len; |
5584 | 0 | UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; |
5585 | 0 | Node *snode = NULL_NODE; |
5586 | |
|
5587 | 0 | if (onig_is_code_in_cc(env->enc, from, cc) |
5588 | 0 | #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS |
5589 | 0 | && !IS_NCCLASS_NOT(cc) |
5590 | 0 | #endif |
5591 | 0 | ) { |
5592 | 0 | for (i = 0; i < to_len; i++) { |
5593 | 0 | len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); |
5594 | 0 | if (i == 0) { |
5595 | 0 | snode = onig_node_new_str(buf, buf + len); |
5596 | 0 | CHECK_NULL_RETURN_MEMERR(snode); |
5597 | | |
5598 | | /* char-class expanded multi-char only |
5599 | | compare with string folded at match time. */ |
5600 | 0 | NSTRING_SET_AMBIG(snode); |
5601 | 0 | } |
5602 | 0 | else { |
5603 | 0 | r = onig_node_str_cat(snode, buf, buf + len); |
5604 | 0 | if (r < 0) { |
5605 | 0 | onig_node_free(snode); |
5606 | 0 | return r; |
5607 | 0 | } |
5608 | 0 | } |
5609 | 0 | } |
5610 | | |
5611 | 0 | *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE); |
5612 | 0 | CHECK_NULL_RETURN_MEMERR(*(iarg->ptail)); |
5613 | 0 | iarg->ptail = &(NCDR((*(iarg->ptail)))); |
5614 | 0 | } |
5615 | 0 | } |
5616 | | |
5617 | 0 | return 0; |
5618 | 0 | } |
5619 | | |
5620 | | static int |
5621 | | cclass_case_fold(Node** np, CClassNode* cc, CClassNode* asc_cc, ScanEnv* env) |
5622 | 0 | { |
5623 | 0 | int r; |
5624 | 0 | IApplyCaseFoldArg iarg; |
5625 | |
|
5626 | 0 | iarg.env = env; |
5627 | 0 | iarg.cc = cc; |
5628 | 0 | iarg.asc_cc = asc_cc; |
5629 | 0 | iarg.alt_root = NULL_NODE; |
5630 | 0 | iarg.ptail = &(iarg.alt_root); |
5631 | |
|
5632 | 0 | r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag, |
5633 | 0 | i_apply_case_fold, &iarg); |
5634 | 0 | if (r != 0) { |
5635 | 0 | onig_node_free(iarg.alt_root); |
5636 | 0 | return r; |
5637 | 0 | } |
5638 | 0 | if (IS_NOT_NULL(iarg.alt_root)) { |
5639 | 0 | Node* work = onig_node_new_alt(*np, iarg.alt_root); |
5640 | 0 | if (IS_NULL(work)) { |
5641 | 0 | onig_node_free(iarg.alt_root); |
5642 | 0 | return ONIGERR_MEMORY; |
5643 | 0 | } |
5644 | 0 | *np = work; |
5645 | 0 | } |
5646 | 0 | return r; |
5647 | 0 | } |
5648 | | |
5649 | | static int |
5650 | | node_linebreak(Node** np, ScanEnv* env) |
5651 | 0 | { |
5652 | | /* same as (?>\x0D\x0A|[\x0A-\x0D\x{85}\x{2028}\x{2029}]) */ |
5653 | 0 | Node* left = NULL; |
5654 | 0 | Node* right = NULL; |
5655 | 0 | Node* target1 = NULL; |
5656 | 0 | Node* target2 = NULL; |
5657 | 0 | CClassNode* cc; |
5658 | 0 | int num1, num2, r; |
5659 | 0 | UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2]; |
5660 | | |
5661 | | /* \x0D\x0A */ |
5662 | 0 | num1 = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf); |
5663 | 0 | if (num1 < 0) return num1; |
5664 | 0 | num2 = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1); |
5665 | 0 | if (num2 < 0) return num2; |
5666 | 0 | left = node_new_str_raw(buf, buf + num1 + num2); |
5667 | 0 | if (IS_NULL(left)) goto err; |
5668 | | |
5669 | | /* [\x0A-\x0D] or [\x0A-\x0D\x{85}\x{2028}\x{2029}] */ |
5670 | 0 | right = node_new_cclass(); |
5671 | 0 | if (IS_NULL(right)) goto err; |
5672 | 0 | cc = NCCLASS(right); |
5673 | 0 | if (ONIGENC_MBC_MINLEN(env->enc) > 1) { |
5674 | 0 | r = add_code_range(&(cc->mbuf), env, 0x0A, 0x0D); |
5675 | 0 | if (r != 0) goto err; |
5676 | 0 | } |
5677 | 0 | else { |
5678 | 0 | bitset_set_range(env, cc->bs, 0x0A, 0x0D); |
5679 | 0 | } |
5680 | | |
5681 | | /* TODO: move this block to enc/unicode.c */ |
5682 | 0 | if (ONIGENC_IS_UNICODE(env->enc)) { |
5683 | | /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */ |
5684 | 0 | r = add_code_range(&(cc->mbuf), env, 0x85, 0x85); |
5685 | 0 | if (r != 0) goto err; |
5686 | 0 | r = add_code_range(&(cc->mbuf), env, 0x2028, 0x2029); |
5687 | 0 | if (r != 0) goto err; |
5688 | 0 | } |
5689 | | |
5690 | | /* ...|... */ |
5691 | 0 | target1 = onig_node_new_alt(right, NULL_NODE); |
5692 | 0 | if (IS_NULL(target1)) goto err; |
5693 | 0 | right = NULL; |
5694 | 0 | target2 = onig_node_new_alt(left, target1); |
5695 | 0 | if (IS_NULL(target2)) goto err; |
5696 | 0 | left = NULL; |
5697 | 0 | target1 = NULL; |
5698 | | |
5699 | | /* (?>...) */ |
5700 | 0 | *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK); |
5701 | 0 | if (IS_NULL(*np)) goto err; |
5702 | 0 | NENCLOSE(*np)->target = target2; |
5703 | 0 | return ONIG_NORMAL; |
5704 | | |
5705 | 0 | err: |
5706 | 0 | onig_node_free(left); |
5707 | 0 | onig_node_free(right); |
5708 | 0 | onig_node_free(target1); |
5709 | 0 | onig_node_free(target2); |
5710 | 0 | return ONIGERR_MEMORY; |
5711 | 0 | } |
5712 | | |
5713 | | static int |
5714 | | propname2ctype(ScanEnv* env, const char* propname) |
5715 | 0 | { |
5716 | 0 | UChar* name = (UChar* )propname; |
5717 | 0 | UChar* name_end = name + strlen(propname); |
5718 | 0 | int ctype = env->enc->property_name_to_ctype(ONIG_ENCODING_ASCII, |
5719 | 0 | name, name_end); |
5720 | 0 | if (ctype < 0) { |
5721 | 0 | onig_scan_env_set_error_string(env, ctype, name, name_end); |
5722 | 0 | } |
5723 | 0 | return ctype; |
5724 | 0 | } |
5725 | | |
5726 | | static int |
5727 | | add_property_to_cc(CClassNode* cc, const char* propname, int not, ScanEnv* env) |
5728 | 0 | { |
5729 | 0 | int ctype = propname2ctype(env, propname); |
5730 | 0 | if (ctype < 0) return ctype; |
5731 | 0 | return add_ctype_to_cc(cc, ctype, not, 0, env); |
5732 | 0 | } |
5733 | | |
5734 | | /* |
5735 | | * helper methods for node_extended_grapheme_cluster (/\X/) |
5736 | | */ |
5737 | | static int |
5738 | | create_property_node(Node **np, ScanEnv* env, const char* propname) |
5739 | 0 | { |
5740 | 0 | int r; |
5741 | 0 | CClassNode* cc; |
5742 | |
|
5743 | 0 | *np = node_new_cclass(); |
5744 | 0 | if (IS_NULL(*np)) return ONIGERR_MEMORY; |
5745 | 0 | cc = NCCLASS(*np); |
5746 | 0 | r = add_property_to_cc(cc, propname, 0, env); |
5747 | 0 | if (r != 0) |
5748 | 0 | onig_node_free(*np); |
5749 | 0 | return r; |
5750 | 0 | } |
5751 | | |
5752 | | static int |
5753 | | quantify_node(Node **np, int lower, int upper) |
5754 | 0 | { |
5755 | 0 | Node* tmp = node_new_quantifier(lower, upper, 0); |
5756 | 0 | if (IS_NULL(tmp)) return ONIGERR_MEMORY; |
5757 | 0 | NQTFR(tmp)->target = *np; |
5758 | 0 | *np = tmp; |
5759 | 0 | return 0; |
5760 | 0 | } |
5761 | | |
5762 | | static int |
5763 | | quantify_property_node(Node **np, ScanEnv* env, const char* propname, char repetitions) |
5764 | 0 | { |
5765 | 0 | int r; |
5766 | 0 | int lower = 0; |
5767 | 0 | int upper = REPEAT_INFINITE; |
5768 | |
|
5769 | 0 | r = create_property_node(np, env, propname); |
5770 | 0 | if (r != 0) return r; |
5771 | 0 | switch (repetitions) { |
5772 | 0 | case '?': upper = 1; break; |
5773 | 0 | case '+': lower = 1; break; |
5774 | 0 | case '*': break; |
5775 | 0 | case '2': lower = upper = 2; break; |
5776 | 0 | default : return ONIGERR_PARSER_BUG; |
5777 | 0 | } |
5778 | 0 | return quantify_node(np, lower, upper); |
5779 | 0 | } |
5780 | | |
5781 | 0 | #define LIST 0 |
5782 | | #define ALT 1 |
5783 | | |
5784 | | /* IMPORTANT: Make sure node_array ends with NULL_NODE */ |
5785 | | static int |
5786 | | create_node_from_array(int kind, Node **np, Node **node_array) |
5787 | 0 | { |
5788 | 0 | Node* tmp = NULL_NODE; |
5789 | 0 | int i = 0; |
5790 | |
|
5791 | 0 | while (node_array[i] != NULL_NODE) i++; |
5792 | 0 | while (--i >= 0) { |
5793 | 0 | *np = kind==LIST ? node_new_list(node_array[i], tmp) |
5794 | 0 | : onig_node_new_alt(node_array[i], tmp); |
5795 | 0 | if (IS_NULL(*np)) { |
5796 | 0 | while (i >= 0) { |
5797 | 0 | onig_node_free(node_array[i]); |
5798 | 0 | node_array[i--] = NULL_NODE; |
5799 | 0 | } |
5800 | 0 | onig_node_free(tmp); |
5801 | 0 | return ONIGERR_MEMORY; |
5802 | 0 | } |
5803 | 0 | else |
5804 | 0 | node_array[i] = NULL_NODE; |
5805 | 0 | tmp = *np; |
5806 | 0 | } |
5807 | 0 | return 0; |
5808 | 0 | } |
5809 | | |
5810 | 0 | #define R_ERR(call) r=(call);if(r!=0)goto err |
5811 | | |
/* Memory layout for common node array:
 * The main purpose is to be able to easily free all leftover nodes
 * after an error. As a side effect, we share some memory.
 *
 * The layout is as shown below (each line corresponds to one call of
 * create_node_from_array()). Because create_node_from_array() sets all
 * nodes of the source to NULL_NODE, we can overlap the target array
 * as long as we do not overwrite the actual target location.
 * A '*' marks the slot that serves as the NULL_NODE terminator of the array.
 *
 *  Target       Array name    Index
 *
 *               node_common   0 1 2 3 4 5 6 7 8 9 A B C D E F
 *  top_alt      alts[5]       0 1 2 3 4*
 *  alts+2       list[4]             0 1 2 3*
 *  list+1       core_alts[7]            0 1 2 3 4 5 6*
 *  core_alts+0  H_list[4]                 0 1 2 3*
 *  H_list+1     H_alt2[4]                     0 1 2 3*
 *  H_alt2+1     H_list2[3]                        0 1 2*
 *  core_alts+4  XP_list[3]                        0 1 2*
 *  XP_list+1    Ex_list[4]                            0 1 2 3*
 */
5833 | 0 | #define NODE_COMMON_SIZE 16 |
5834 | | |
5835 | | static int |
5836 | | node_extended_grapheme_cluster(Node** np, ScanEnv* env) |
5837 | 0 | { |
5838 | 0 | Node* tmp = NULL; |
5839 | 0 | Node* np1 = NULL; |
5840 | 0 | Node* top_alt = NULL; |
5841 | 0 | int r = 0; |
5842 | 0 | int num1; |
5843 | 0 | int i; |
5844 | 0 | int any_target_position; |
5845 | 0 | UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2]; |
5846 | 0 | OnigOptionType option; |
5847 | | /* node_common is function-global so that we can free all nodes |
5848 | | * in case of error. Unused slots are set to NULL_NODE at all times. */ |
5849 | 0 | Node *node_common[NODE_COMMON_SIZE]; |
5850 | 0 | Node **alts = node_common+0; /* size: 5 */ |
5851 | |
|
5852 | 0 | for (i=0; i<NODE_COMMON_SIZE; i++) |
5853 | 0 | node_common[i] = NULL_NODE; |
5854 | | |
5855 | | /* CRLF, common for both Unicode and non-Unicode */ |
5856 | | /* \x0D\x0A */ |
5857 | 0 | r = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf); |
5858 | 0 | if (r < 0) goto err; |
5859 | 0 | num1 = r; |
5860 | 0 | r = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1); |
5861 | 0 | if (r < 0) goto err; |
5862 | 0 | alts[0] = node_new_str_raw(buf, buf + num1 + r); |
5863 | 0 | if (IS_NULL(alts[0])) goto err; |
5864 | | |
5865 | 0 | #ifdef USE_UNICODE_PROPERTIES |
5866 | 0 | if (ONIGENC_IS_UNICODE(env->enc)) { /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */ |
5867 | 0 | CClassNode* cc; |
5868 | |
|
5869 | 0 | if (propname2ctype(env, "Grapheme_Cluster_Break=Extend") < 0) goto err; |
5870 | | /* Unicode 11.0.0 |
5871 | | * CRLF (already done) |
5872 | | * | [Control CR LF] |
5873 | | * | precore* core postcore* |
5874 | | * | . (to catch invalid stuff, because this seems to be spec for String#grapheme_clusters) */ |
5875 | | |
5876 | | /* [Control CR LF] (CR and LF are not in the spec, but this is a conformed fix) */ |
5877 | 0 | alts[1] = node_new_cclass(); |
5878 | 0 | if (IS_NULL(alts[1])) goto err; |
5879 | 0 | cc = NCCLASS(alts[1]); |
5880 | 0 | R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env)); |
5881 | 0 | if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */ |
5882 | 0 | R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */ |
5883 | 0 | R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */ |
5884 | 0 | } |
5885 | 0 | else { |
5886 | 0 | BITSET_SET_BIT(cc->bs, 0x0a); |
5887 | 0 | BITSET_SET_BIT(cc->bs, 0x0d); |
5888 | 0 | } |
5889 | | |
5890 | | /* precore* core postcore* */ |
5891 | 0 | { |
5892 | 0 | Node **list = alts + 3; /* size: 4 */ |
5893 | | |
5894 | | /* precore*; precore := Prepend */ |
5895 | 0 | R_ERR(quantify_property_node(list+0, env, "Grapheme_Cluster_Break=Prepend", '*')); |
5896 | | |
5897 | | /* core := hangul-syllable |
5898 | | * | ri-sequence |
5899 | | * | xpicto-sequence |
5900 | | * | [^Control CR LF] */ |
5901 | 0 | { |
5902 | 0 | Node **core_alts = list + 2; /* size: 7 */ |
5903 | | |
5904 | | /* hangul-syllable := |
5905 | | * L* (V+ | LV V* | LVT) T* |
5906 | | * | L+ |
5907 | | * | T+ */ |
5908 | | /* hangul-syllable is an alternative (would be called H_alt) |
5909 | | * inside an alternative, but we flatten it into core_alts */ |
5910 | | |
5911 | | /* L* (V+ | LV V* | LVT) T* */ |
5912 | 0 | { |
5913 | 0 | Node **H_list = core_alts + 1; /* size: 4 */ |
5914 | 0 | R_ERR(quantify_property_node(H_list+0, env, "Grapheme_Cluster_Break=L", '*')); |
5915 | | |
5916 | | /* V+ | LV V* | LVT */ |
5917 | 0 | { |
5918 | 0 | Node **H_alt2 = H_list + 2; /* size: 4 */ |
5919 | 0 | R_ERR(quantify_property_node(H_alt2+0, env, "Grapheme_Cluster_Break=V", '+')); |
5920 | | |
5921 | | /* LV V* */ |
5922 | 0 | { |
5923 | 0 | Node **H_list2 = H_alt2 + 2; /* size: 3 */ |
5924 | |
|
5925 | 0 | R_ERR(create_property_node(H_list2+0, env, "Grapheme_Cluster_Break=LV")); |
5926 | 0 | R_ERR(quantify_property_node(H_list2+1, env, "Grapheme_Cluster_Break=V", '*')); |
5927 | 0 | R_ERR(create_node_from_array(LIST, H_alt2+1, H_list2)); |
5928 | 0 | } |
5929 | | |
5930 | 0 | R_ERR(create_property_node(H_alt2+2, env, "Grapheme_Cluster_Break=LVT")); |
5931 | 0 | R_ERR(create_node_from_array(ALT, H_list+1, H_alt2)); |
5932 | 0 | } |
5933 | | |
5934 | 0 | R_ERR(quantify_property_node(H_list+2, env, "Grapheme_Cluster_Break=T", '*')); |
5935 | 0 | R_ERR(create_node_from_array(LIST, core_alts+0, H_list)); |
5936 | 0 | } |
5937 | | |
5938 | 0 | R_ERR(quantify_property_node(core_alts+1, env, "Grapheme_Cluster_Break=L", '+')); |
5939 | 0 | R_ERR(quantify_property_node(core_alts+2, env, "Grapheme_Cluster_Break=T", '+')); |
5940 | | /* end of hangul-syllable */ |
5941 | | |
5942 | | /* ri-sequence := RI RI */ |
5943 | 0 | R_ERR(quantify_property_node(core_alts+3, env, "Regional_Indicator", '2')); |
5944 | | |
5945 | | /* xpicto-sequence := \p{Extended_Pictographic} (Extend* ZWJ \p{Extended_Pictographic})* */ |
5946 | 0 | { |
5947 | 0 | Node **XP_list = core_alts + 5; /* size: 3 */ |
5948 | 0 | R_ERR(create_property_node(XP_list+0, env, "Extended_Pictographic")); |
5949 | | |
5950 | | /* (Extend* ZWJ \p{Extended_Pictographic})* */ |
5951 | 0 | { |
5952 | 0 | Node **Ex_list = XP_list + 2; /* size: 4 */ |
5953 | | /* assert(Ex_list+4 == node_common+NODE_COMMON_SIZE); */ |
5954 | 0 | R_ERR(quantify_property_node(Ex_list+0, env, "Grapheme_Cluster_Break=Extend", '*')); |
5955 | | |
5956 | | /* ZWJ (ZERO WIDTH JOINER) */ |
5957 | 0 | r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); |
5958 | 0 | if (r < 0) goto err; |
5959 | 0 | Ex_list[1] = node_new_str_raw(buf, buf + r); |
5960 | 0 | if (IS_NULL(Ex_list[1])) goto err; |
5961 | | |
5962 | 0 | R_ERR(create_property_node(Ex_list+2, env, "Extended_Pictographic")); |
5963 | 0 | R_ERR(create_node_from_array(LIST, XP_list+1, Ex_list)); |
5964 | 0 | } |
5965 | 0 | R_ERR(quantify_node(XP_list+1, 0, REPEAT_INFINITE)); /* TODO: Check about node freeing */ |
5966 | | |
5967 | 0 | R_ERR(create_node_from_array(LIST, core_alts+4, XP_list)); |
5968 | 0 | } |
5969 | | |
5970 | | /* [^Control CR LF] */ |
5971 | 0 | core_alts[5] = node_new_cclass(); |
5972 | 0 | if (IS_NULL(core_alts[5])) goto err; |
5973 | 0 | cc = NCCLASS(core_alts[5]); |
5974 | 0 | if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */ |
5975 | 0 | BBuf *inverted_buf = NULL; |
5976 | | |
5977 | | /* TODO: fix false warning */ |
5978 | 0 | const int dup_not_warned = env->warnings_flag | ~ONIG_SYN_WARN_CC_DUP; |
5979 | 0 | env->warnings_flag |= ONIG_SYN_WARN_CC_DUP; |
5980 | | |
5981 | | /* Start with a positive buffer and invert at the end. |
5982 | | * Otherwise, adding single-character ranges work the wrong way. */ |
5983 | 0 | R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env)); |
5984 | 0 | R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */ |
5985 | 0 | R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */ |
5986 | 0 | R_ERR(not_code_range_buf(env->enc, cc->mbuf, &inverted_buf, env)); |
5987 | 0 | cc->mbuf = inverted_buf; /* TODO: check what to do with buffer before inversion */ |
5988 | |
|
5989 | 0 | env->warnings_flag &= dup_not_warned; /* TODO: fix false warning */ |
5990 | 0 | } |
5991 | 0 | else { |
5992 | 0 | R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 1, env)); |
5993 | 0 | BITSET_CLEAR_BIT(cc->bs, 0x0a); |
5994 | 0 | BITSET_CLEAR_BIT(cc->bs, 0x0d); |
5995 | 0 | } |
5996 | | |
5997 | 0 | R_ERR(create_node_from_array(ALT, list+1, core_alts)); |
5998 | 0 | } |
5999 | | |
6000 | | /* postcore*; postcore = [Extend ZWJ SpacingMark] */ |
6001 | 0 | R_ERR(create_property_node(list+2, env, "Grapheme_Cluster_Break=Extend")); |
6002 | 0 | cc = NCCLASS(list[2]); |
6003 | 0 | R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=SpacingMark", 0, env)); |
6004 | 0 | R_ERR(add_code_range(&(cc->mbuf), env, 0x200D, 0x200D)); |
6005 | 0 | R_ERR(quantify_node(list+2, 0, REPEAT_INFINITE)); |
6006 | | |
6007 | 0 | R_ERR(create_node_from_array(LIST, alts+2, list)); |
6008 | 0 | } |
6009 | | |
6010 | 0 | any_target_position = 3; |
6011 | 0 | } |
6012 | 0 | else |
6013 | 0 | #endif /* USE_UNICODE_PROPERTIES */ |
6014 | 0 | { |
6015 | 0 | any_target_position = 1; |
6016 | 0 | } |
6017 | | |
6018 | | /* PerlSyntax: (?s:.), RubySyntax: (?m:.), common for both Unicode and non-Unicode */ |
6019 | | /* Not in Unicode spec (UAX #29), but added to catch invalid stuff, |
6020 | | * because this is Ruby spec for String#grapheme_clusters. */ |
6021 | 0 | np1 = node_new_anychar(); |
6022 | 0 | if (IS_NULL(np1)) goto err; |
6023 | | |
6024 | 0 | option = env->option; |
6025 | 0 | ONOFF(option, ONIG_OPTION_MULTILINE, 0); |
6026 | 0 | tmp = node_new_option(option); |
6027 | 0 | if (IS_NULL(tmp)) goto err; |
6028 | 0 | NENCLOSE(tmp)->target = np1; |
6029 | 0 | alts[any_target_position] = tmp; |
6030 | 0 | np1 = NULL; |
6031 | |
|
6032 | 0 | R_ERR(create_node_from_array(ALT, &top_alt, alts)); |
6033 | | |
6034 | | /* (?>): For efficiency, because there is no text piece |
6035 | | * that is not in a grapheme cluster, and there is only one way |
6036 | | * to split a string into grapheme clusters. */ |
6037 | 0 | tmp = node_new_enclose(ENCLOSE_STOP_BACKTRACK); |
6038 | 0 | if (IS_NULL(tmp)) goto err; |
6039 | 0 | NENCLOSE(tmp)->target = top_alt; |
6040 | 0 | np1 = tmp; |
6041 | |
|
6042 | 0 | #ifdef USE_UNICODE_PROPERTIES |
6043 | 0 | if (ONIGENC_IS_UNICODE(env->enc)) { |
6044 | | /* Don't ignore case. */ |
6045 | 0 | option = env->option; |
6046 | 0 | ONOFF(option, ONIG_OPTION_IGNORECASE, 1); |
6047 | 0 | *np = node_new_option(option); |
6048 | 0 | if (IS_NULL(*np)) goto err; |
6049 | 0 | NENCLOSE(*np)->target = np1; |
6050 | 0 | } |
6051 | 0 | else |
6052 | 0 | #endif |
6053 | 0 | { |
6054 | 0 | *np = np1; |
6055 | 0 | } |
6056 | 0 | return ONIG_NORMAL; |
6057 | | |
6058 | 0 | err: |
6059 | 0 | onig_node_free(np1); |
6060 | 0 | for (i=0; i<NODE_COMMON_SIZE; i++) |
6061 | 0 | onig_node_free(node_common[i]); |
6062 | 0 | return (r == 0) ? ONIGERR_MEMORY : r; |
6063 | 0 | } |
6064 | | #undef R_ERR |
6065 | | |
/* Count the set bits in the low 32 bits of `bits`.
 *
 * Adjacent bit fields are summed in parallel, doubling the field width in
 * each round (1, 2, 4, 8 and finally 16 bits). */
static int
countbits(unsigned int bits)
{
  static const unsigned int field_mask[5] = {
    0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff, 0x0000ffff
  };
  unsigned int width = 1;
  int round;

  for (round = 0; round < 5; round++) {
    bits = (bits & field_mask[round]) + ((bits >> width) & field_mask[round]);
    width <<= 1;
  }
  return (int )bits;
}
6075 | | |
6076 | | static int |
6077 | | is_onechar_cclass(CClassNode* cc, OnigCodePoint* code) |
6078 | 43.9k | { |
6079 | 43.9k | const OnigCodePoint not_found = ONIG_LAST_CODE_POINT; |
6080 | 43.9k | OnigCodePoint c = not_found; |
6081 | 43.9k | int i; |
6082 | 43.9k | BBuf *bbuf = cc->mbuf; |
6083 | | |
6084 | 43.9k | if (IS_NCCLASS_NOT(cc)) return 0; |
6085 | | |
6086 | | /* check bbuf */ |
6087 | 26.3k | if (IS_NOT_NULL(bbuf)) { |
6088 | 0 | OnigCodePoint n, *data; |
6089 | 0 | GET_CODE_POINT(n, bbuf->p); |
6090 | 0 | data = (OnigCodePoint* )(bbuf->p) + 1; |
6091 | 0 | if ((n == 1) && (data[0] == data[1])) { |
6092 | | /* only one char found in the bbuf, save the code point. */ |
6093 | 0 | c = data[0]; |
6094 | 0 | if (((c < SINGLE_BYTE_SIZE) && BITSET_AT(cc->bs, c))) { |
6095 | | /* skip if c is included in the bitset */ |
6096 | 0 | c = not_found; |
6097 | 0 | } |
6098 | 0 | } |
6099 | 0 | else { |
6100 | 0 | return 0; /* the bbuf contains multiple chars */ |
6101 | 0 | } |
6102 | 0 | } |
6103 | | |
6104 | | /* check bitset */ |
6105 | 46.8k | for (i = 0; i < BITSET_SIZE; i++) { |
6106 | 46.8k | Bits b1 = cc->bs[i]; |
6107 | 46.8k | if (b1 != 0) { |
6108 | 46.8k | if (((b1 & (b1 - 1)) == 0) && (c == not_found)) { |
6109 | 20.4k | c = BITS_IN_ROOM * i + countbits(b1 - 1); |
6110 | 26.3k | } else { |
6111 | 26.3k | return 0; /* the character class contains multiple chars */ |
6112 | 26.3k | } |
6113 | 46.8k | } |
6114 | 46.8k | } |
6115 | | |
6116 | 0 | if (c != not_found) { |
6117 | 0 | *code = c; |
6118 | 0 | return 1; |
6119 | 0 | } |
6120 | | |
6121 | | /* the character class contains no char. */ |
6122 | 0 | return 0; |
6123 | 0 | } |
6124 | | |
6125 | | |
6126 | | static int |
6127 | | parse_exp(Node** np, OnigToken* tok, int term, |
6128 | | UChar** src, UChar* end, ScanEnv* env) |
6129 | 345k | { |
6130 | 345k | int r, len, group = 0; |
6131 | 345k | Node* qn; |
6132 | 345k | Node** targetp; |
6133 | 345k | unsigned int parse_depth; |
6134 | | |
6135 | 345k | *np = NULL; |
6136 | 345k | if (tok->type == (enum TokenSyms )term) |
6137 | 2.92k | goto end_of_token; |
6138 | | |
6139 | 342k | parse_depth = env->parse_depth; |
6140 | | |
6141 | 342k | switch (tok->type) { |
6142 | 0 | case TK_ALT: |
6143 | 0 | case TK_EOT: |
6144 | 2.92k | end_of_token: |
6145 | 2.92k | *np = node_new_empty(); |
6146 | 2.92k | return tok->type; |
6147 | 0 | break; |
6148 | | |
6149 | 35.1k | case TK_SUBEXP_OPEN: |
6150 | 35.1k | r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env); |
6151 | 35.1k | if (r < 0) return r; |
6152 | 35.1k | if (r == 1) group = 1; |
6153 | 17.5k | else if (r == 2) { /* option only */ |
6154 | 0 | Node* target; |
6155 | 0 | OnigOptionType prev = env->option; |
6156 | |
|
6157 | 0 | env->option = NENCLOSE(*np)->option; |
6158 | 0 | r = fetch_token(tok, src, end, env); |
6159 | 0 | if (r < 0) { |
6160 | 0 | env->option = prev; |
6161 | 0 | return r; |
6162 | 0 | } |
6163 | 0 | r = parse_subexp(&target, tok, term, src, end, env); |
6164 | 0 | env->option = prev; |
6165 | 0 | if (r < 0) { |
6166 | 0 | onig_node_free(target); |
6167 | 0 | return r; |
6168 | 0 | } |
6169 | 0 | NENCLOSE(*np)->target = target; |
6170 | 0 | return tok->type; |
6171 | 0 | } |
6172 | 35.1k | break; |
6173 | | |
6174 | 35.1k | case TK_SUBEXP_CLOSE: |
6175 | 0 | if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP)) |
6176 | 0 | return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS; |
6177 | | |
6178 | 0 | if (tok->escaped) goto tk_raw_byte; |
6179 | 0 | else goto tk_byte; |
6180 | 0 | break; |
6181 | | |
6182 | 0 | case TK_LINEBREAK: |
6183 | 0 | r = node_linebreak(np, env); |
6184 | 0 | if (r < 0) return r; |
6185 | 0 | break; |
6186 | | |
6187 | 0 | case TK_EXTENDED_GRAPHEME_CLUSTER: |
6188 | 0 | r = node_extended_grapheme_cluster(np, env); |
6189 | 0 | if (r < 0) return r; |
6190 | 0 | break; |
6191 | | |
6192 | 0 | case TK_KEEP: |
6193 | 0 | *np = onig_node_new_anchor(ANCHOR_KEEP); |
6194 | 0 | CHECK_NULL_RETURN_MEMERR(*np); |
6195 | 0 | break; |
6196 | | |
6197 | 122k | case TK_STRING: |
6198 | 122k | tk_byte: |
6199 | 122k | { |
6200 | 122k | *np = node_new_str(tok->backp, *src); |
6201 | 122k | CHECK_NULL_RETURN_MEMERR(*np); |
6202 | | |
6203 | 122k | string_loop: |
6204 | 1.07M | while (1) { |
6205 | 1.07M | r = fetch_token(tok, src, end, env); |
6206 | 1.07M | if (r < 0) return r; |
6207 | 1.07M | if (r == TK_STRING) { |
6208 | 954k | r = onig_node_str_cat(*np, tok->backp, *src); |
6209 | 954k | } |
6210 | 122k | #ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG |
6211 | 122k | else if (r == TK_CODE_POINT) { |
6212 | 0 | r = node_str_cat_codepoint(*np, env->enc, tok->u.code); |
6213 | 0 | } |
6214 | 122k | #endif |
6215 | 122k | else { |
6216 | 122k | break; |
6217 | 122k | } |
6218 | 954k | if (r < 0) return r; |
6219 | 954k | } |
6220 | | |
6221 | 122k | string_end: |
6222 | 122k | targetp = np; |
6223 | 122k | goto repeat; |
6224 | 122k | } |
6225 | 0 | break; |
6226 | | |
6227 | 0 | case TK_RAW_BYTE: |
6228 | 0 | tk_raw_byte: |
6229 | 0 | { |
6230 | 0 | *np = node_new_str_raw_char((UChar )tok->u.c); |
6231 | 0 | CHECK_NULL_RETURN_MEMERR(*np); |
6232 | 0 | len = 1; |
6233 | 0 | while (1) { |
6234 | 0 | if (len >= ONIGENC_MBC_MINLEN(env->enc)) { |
6235 | 0 | if (len == enclen(env->enc, NSTR(*np)->s, NSTR(*np)->end)) { |
6236 | 0 | r = fetch_token(tok, src, end, env); |
6237 | 0 | NSTRING_CLEAR_RAW(*np); |
6238 | 0 | goto string_end; |
6239 | 0 | } |
6240 | 0 | } |
6241 | | |
6242 | 0 | r = fetch_token(tok, src, end, env); |
6243 | 0 | if (r < 0) return r; |
6244 | 0 | if (r != TK_RAW_BYTE) { |
6245 | | /* Don't use this, it is wrong for little endian encodings. */ |
6246 | | #ifdef USE_PAD_TO_SHORT_BYTE_CHAR |
6247 | | int rem; |
6248 | | if (len < ONIGENC_MBC_MINLEN(env->enc)) { |
6249 | | rem = ONIGENC_MBC_MINLEN(env->enc) - len; |
6250 | | (void )node_str_head_pad(NSTR(*np), rem, (UChar )0); |
6251 | | if (len + rem == enclen(env->enc, NSTR(*np)->s)) { |
6252 | | NSTRING_CLEAR_RAW(*np); |
6253 | | goto string_end; |
6254 | | } |
6255 | | } |
6256 | | #endif |
6257 | 0 | return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; |
6258 | 0 | } |
6259 | | |
6260 | 0 | r = node_str_cat_char(*np, (UChar )tok->u.c); |
6261 | 0 | if (r < 0) return r; |
6262 | | |
6263 | 0 | len++; |
6264 | 0 | } |
6265 | 0 | } |
6266 | 0 | break; |
6267 | | |
6268 | 0 | case TK_CODE_POINT: |
6269 | 0 | { |
6270 | 0 | *np = node_new_empty(); |
6271 | 0 | CHECK_NULL_RETURN_MEMERR(*np); |
6272 | 0 | r = node_str_cat_codepoint(*np, env->enc, tok->u.code); |
6273 | 0 | if (r != 0) return r; |
6274 | | #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG |
6275 | | NSTRING_SET_RAW(*np); |
6276 | | #else |
6277 | 0 | goto string_loop; |
6278 | 0 | #endif |
6279 | 0 | } |
6280 | 0 | break; |
6281 | | |
6282 | 0 | case TK_QUOTE_OPEN: |
6283 | 0 | { |
6284 | 0 | OnigCodePoint end_op[2]; |
6285 | 0 | UChar *qstart, *qend, *nextp; |
6286 | |
|
6287 | 0 | end_op[0] = (OnigCodePoint )MC_ESC(env->syntax); |
6288 | 0 | end_op[1] = (OnigCodePoint )'E'; |
6289 | 0 | qstart = *src; |
6290 | 0 | qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc); |
6291 | 0 | if (IS_NULL(qend)) { |
6292 | 0 | nextp = qend = end; |
6293 | 0 | } |
6294 | 0 | *np = node_new_str(qstart, qend); |
6295 | 0 | CHECK_NULL_RETURN_MEMERR(*np); |
6296 | 0 | *src = nextp; |
6297 | 0 | } |
6298 | 0 | break; |
6299 | | |
6300 | 26.3k | case TK_CHAR_TYPE: |
6301 | 26.3k | { |
6302 | 26.3k | switch (tok->u.prop.ctype) { |
6303 | 0 | case ONIGENC_CTYPE_WORD: |
6304 | 0 | *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not, |
6305 | 0 | IS_ASCII_RANGE(env->option)); |
6306 | 0 | CHECK_NULL_RETURN_MEMERR(*np); |
6307 | 0 | break; |
6308 | | |
6309 | 14.6k | case ONIGENC_CTYPE_SPACE: |
6310 | 26.3k | case ONIGENC_CTYPE_DIGIT: |
6311 | 26.3k | case ONIGENC_CTYPE_XDIGIT: |
6312 | 26.3k | { |
6313 | 26.3k | CClassNode* cc; |
6314 | | |
6315 | 26.3k | *np = node_new_cclass(); |
6316 | 26.3k | CHECK_NULL_RETURN_MEMERR(*np); |
6317 | 26.3k | cc = NCCLASS(*np); |
6318 | 26.3k | r = add_ctype_to_cc(cc, tok->u.prop.ctype, 0, |
6319 | 26.3k | IS_ASCII_RANGE(env->option), env); |
6320 | 26.3k | if (r != 0) return r; |
6321 | 26.3k | if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); |
6322 | 26.3k | } |
6323 | 0 | break; |
6324 | | |
6325 | 0 | default: |
6326 | 0 | return ONIGERR_PARSER_BUG; |
6327 | 0 | break; |
6328 | 26.3k | } |
6329 | 26.3k | } |
6330 | 26.3k | break; |
6331 | | |
6332 | 26.3k | case TK_CHAR_PROPERTY: |
6333 | 0 | r = parse_char_property(np, tok, src, end, env); |
6334 | 0 | if (r != 0) return r; |
6335 | 0 | break; |
6336 | | |
6337 | 43.9k | case TK_CC_OPEN: |
6338 | 43.9k | { |
6339 | 43.9k | Node *asc_node; |
6340 | 43.9k | CClassNode* cc; |
6341 | 43.9k | OnigCodePoint code; |
6342 | | |
6343 | 43.9k | r = parse_char_class(np, &asc_node, tok, src, end, env); |
6344 | 43.9k | if (r != 0) { |
6345 | 0 | onig_node_free(asc_node); |
6346 | 0 | return r; |
6347 | 0 | } |
6348 | | |
6349 | 43.9k | cc = NCCLASS(*np); |
6350 | 43.9k | if (is_onechar_cclass(cc, &code)) { |
6351 | 0 | onig_node_free(*np); |
6352 | 0 | onig_node_free(asc_node); |
6353 | 0 | *np = node_new_empty(); |
6354 | 0 | CHECK_NULL_RETURN_MEMERR(*np); |
6355 | 0 | r = node_str_cat_codepoint(*np, env->enc, code); |
6356 | 0 | if (r != 0) return r; |
6357 | 0 | goto string_loop; |
6358 | 0 | } |
6359 | 43.9k | if (IS_IGNORECASE(env->option)) { |
6360 | 0 | r = cclass_case_fold(np, cc, NCCLASS(asc_node), env); |
6361 | 0 | if (r != 0) { |
6362 | 0 | onig_node_free(asc_node); |
6363 | 0 | return r; |
6364 | 0 | } |
6365 | 0 | } |
6366 | 43.9k | onig_node_free(asc_node); |
6367 | 43.9k | } |
6368 | 0 | break; |
6369 | | |
6370 | 29.2k | case TK_ANYCHAR: |
6371 | 29.2k | *np = node_new_anychar(); |
6372 | 29.2k | CHECK_NULL_RETURN_MEMERR(*np); |
6373 | 29.2k | break; |
6374 | | |
6375 | 29.2k | case TK_ANYCHAR_ANYTIME: |
6376 | 0 | *np = node_new_anychar(); |
6377 | 0 | CHECK_NULL_RETURN_MEMERR(*np); |
6378 | 0 | qn = node_new_quantifier(0, REPEAT_INFINITE, 0); |
6379 | 0 | CHECK_NULL_RETURN_MEMERR(qn); |
6380 | 0 | NQTFR(qn)->target = *np; |
6381 | 0 | *np = qn; |
6382 | 0 | break; |
6383 | | |
6384 | 0 | case TK_BACKREF: |
6385 | 0 | len = tok->u.backref.num; |
6386 | 0 | *np = node_new_backref(len, |
6387 | 0 | (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)), |
6388 | 0 | tok->u.backref.by_name, |
6389 | 0 | #ifdef USE_BACKREF_WITH_LEVEL |
6390 | 0 | tok->u.backref.exist_level, |
6391 | 0 | tok->u.backref.level, |
6392 | 0 | #endif |
6393 | 0 | env); |
6394 | 0 | CHECK_NULL_RETURN_MEMERR(*np); |
6395 | 0 | break; |
6396 | | |
6397 | 0 | #ifdef USE_SUBEXP_CALL |
6398 | 0 | case TK_CALL: |
6399 | 0 | { |
6400 | 0 | int gnum = tok->u.call.gnum; |
6401 | |
|
6402 | 0 | if (gnum < 0 || tok->u.call.rel != 0) { |
6403 | 0 | if (gnum > 0) gnum--; |
6404 | 0 | gnum = BACKREF_REL_TO_ABS(gnum, env); |
6405 | 0 | if (gnum <= 0) |
6406 | 0 | return ONIGERR_INVALID_BACKREF; |
6407 | 0 | } |
6408 | 0 | *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum); |
6409 | 0 | CHECK_NULL_RETURN_MEMERR(*np); |
6410 | 0 | env->num_call++; |
6411 | 0 | } |
6412 | 0 | break; |
6413 | 0 | #endif |
6414 | | |
6415 | 84.9k | case TK_ANCHOR: |
6416 | 84.9k | *np = onig_node_new_anchor(tok->u.anchor.subtype); |
6417 | 84.9k | CHECK_NULL_RETURN_MEMERR(*np); |
6418 | 84.9k | NANCHOR(*np)->ascii_range = tok->u.anchor.ascii_range; |
6419 | 84.9k | break; |
6420 | | |
6421 | 0 | case TK_OP_REPEAT: |
6422 | 0 | case TK_INTERVAL: |
6423 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) { |
6424 | 0 | if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS)) |
6425 | 0 | return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED; |
6426 | 0 | else |
6427 | 0 | *np = node_new_empty(); |
6428 | 0 | } |
6429 | 0 | else { |
6430 | 0 | goto tk_byte; |
6431 | 0 | } |
6432 | 0 | break; |
6433 | | |
6434 | 0 | default: |
6435 | 0 | return ONIGERR_PARSER_BUG; |
6436 | 0 | break; |
6437 | 342k | } |
6438 | | |
6439 | 219k | { |
6440 | 219k | targetp = np; |
6441 | | |
6442 | 307k | re_entry: |
6443 | 307k | r = fetch_token(tok, src, end, env); |
6444 | 307k | if (r < 0) return r; |
6445 | | |
6446 | 430k | repeat: |
6447 | 430k | if (r == TK_OP_REPEAT || r == TK_INTERVAL) { |
6448 | 87.8k | if (is_invalid_quantifier_target(*targetp)) |
6449 | 0 | return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; |
6450 | | |
6451 | 87.8k | parse_depth++; |
6452 | 87.8k | if (parse_depth > ParseDepthLimit) |
6453 | 0 | return ONIGERR_PARSE_DEPTH_LIMIT_OVER; |
6454 | | |
6455 | 87.8k | qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper, |
6456 | 87.8k | (r == TK_INTERVAL ? 1 : 0)); |
6457 | 87.8k | CHECK_NULL_RETURN_MEMERR(qn); |
6458 | 87.8k | NQTFR(qn)->greedy = tok->u.repeat.greedy; |
6459 | 87.8k | r = set_quantifier(qn, *targetp, group, env); |
6460 | 87.8k | if (r < 0) { |
6461 | 0 | onig_node_free(qn); |
6462 | 0 | return r; |
6463 | 0 | } |
6464 | | |
6465 | 87.8k | if (tok->u.repeat.possessive != 0) { |
6466 | 0 | Node* en; |
6467 | 0 | en = node_new_enclose(ENCLOSE_STOP_BACKTRACK); |
6468 | 0 | if (IS_NULL(en)) { |
6469 | 0 | onig_node_free(qn); |
6470 | 0 | return ONIGERR_MEMORY; |
6471 | 0 | } |
6472 | 0 | NENCLOSE(en)->target = qn; |
6473 | 0 | qn = en; |
6474 | 0 | } |
6475 | | |
6476 | 87.8k | if (r == 0) { |
6477 | 87.8k | *targetp = qn; |
6478 | 87.8k | } |
6479 | 0 | else if (r == 1) { |
6480 | 0 | onig_node_free(qn); |
6481 | 0 | } |
6482 | 0 | else if (r == 2) { /* split case: /abc+/ */ |
6483 | 0 | Node *tmp; |
6484 | |
|
6485 | 0 | *targetp = node_new_list(*targetp, NULL); |
6486 | 0 | if (IS_NULL(*targetp)) { |
6487 | 0 | onig_node_free(qn); |
6488 | 0 | return ONIGERR_MEMORY; |
6489 | 0 | } |
6490 | 0 | tmp = NCDR(*targetp) = node_new_list(qn, NULL); |
6491 | 0 | if (IS_NULL(tmp)) { |
6492 | 0 | onig_node_free(qn); |
6493 | 0 | return ONIGERR_MEMORY; |
6494 | 0 | } |
6495 | 0 | targetp = &(NCAR(tmp)); |
6496 | 0 | } |
6497 | 87.8k | goto re_entry; |
6498 | 87.8k | } |
6499 | 430k | } |
6500 | | |
6501 | 342k | return r; |
6502 | 430k | } |
6503 | | |
6504 | | static int |
6505 | | parse_branch(Node** top, OnigToken* tok, int term, |
6506 | | UChar** src, UChar* end, ScanEnv* env) |
6507 | 125k | { |
6508 | 125k | int r; |
6509 | 125k | Node *node, **headp; |
6510 | | |
6511 | 125k | *top = NULL; |
6512 | 125k | r = parse_exp(&node, tok, term, src, end, env); |
6513 | 125k | if (r < 0) { |
6514 | 0 | onig_node_free(node); |
6515 | 0 | return r; |
6516 | 0 | } |
6517 | | |
6518 | 125k | if (r == TK_EOT || r == term || r == TK_ALT) { |
6519 | 55.6k | *top = node; |
6520 | 55.6k | } |
6521 | 70.2k | else { |
6522 | 70.2k | *top = node_new_list(node, NULL); |
6523 | 70.2k | headp = &(NCDR(*top)); |
6524 | 289k | while (r != TK_EOT && r != term && r != TK_ALT) { |
6525 | 219k | r = parse_exp(&node, tok, term, src, end, env); |
6526 | 219k | if (r < 0) { |
6527 | 0 | onig_node_free(node); |
6528 | 0 | return r; |
6529 | 0 | } |
6530 | | |
6531 | 219k | if (NTYPE(node) == NT_LIST) { |
6532 | 0 | *headp = node; |
6533 | 0 | while (IS_NOT_NULL(NCDR(node))) node = NCDR(node); |
6534 | 0 | headp = &(NCDR(node)); |
6535 | 0 | } |
6536 | 219k | else { |
6537 | 219k | *headp = node_new_list(node, NULL); |
6538 | 219k | headp = &(NCDR(*headp)); |
6539 | 219k | } |
6540 | 219k | } |
6541 | 70.2k | } |
6542 | | |
6543 | 125k | return r; |
6544 | 125k | } |
6545 | | |
6546 | | /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ |
6547 | | static int |
6548 | | parse_subexp(Node** top, OnigToken* tok, int term, |
6549 | | UChar** src, UChar* end, ScanEnv* env) |
6550 | 102k | { |
6551 | 102k | int r; |
6552 | 102k | Node *node, **headp; |
6553 | | |
6554 | 102k | *top = NULL; |
6555 | 102k | env->parse_depth++; |
6556 | 102k | if (env->parse_depth > ParseDepthLimit) |
6557 | 0 | return ONIGERR_PARSE_DEPTH_LIMIT_OVER; |
6558 | 102k | r = parse_branch(&node, tok, term, src, end, env); |
6559 | 102k | if (r < 0) { |
6560 | 0 | onig_node_free(node); |
6561 | 0 | return r; |
6562 | 0 | } |
6563 | | |
6564 | 102k | if (r == term) { |
6565 | 84.9k | *top = node; |
6566 | 84.9k | } |
6567 | 17.5k | else if (r == TK_ALT) { |
6568 | 17.5k | *top = onig_node_new_alt(node, NULL); |
6569 | 17.5k | headp = &(NCDR(*top)); |
6570 | 40.9k | while (r == TK_ALT) { |
6571 | 23.4k | r = fetch_token(tok, src, end, env); |
6572 | 23.4k | if (r < 0) return r; |
6573 | 23.4k | r = parse_branch(&node, tok, term, src, end, env); |
6574 | 23.4k | if (r < 0) { |
6575 | 0 | onig_node_free(node); |
6576 | 0 | return r; |
6577 | 0 | } |
6578 | | |
6579 | 23.4k | *headp = onig_node_new_alt(node, NULL); |
6580 | 23.4k | headp = &(NCDR(*headp)); |
6581 | 23.4k | } |
6582 | | |
6583 | 17.5k | if (tok->type != (enum TokenSyms )term) |
6584 | 0 | goto err; |
6585 | 17.5k | } |
6586 | 0 | else { |
6587 | 0 | onig_node_free(node); |
6588 | 0 | err: |
6589 | 0 | if (term == TK_SUBEXP_CLOSE) |
6590 | 0 | return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; |
6591 | 0 | else |
6592 | 0 | return ONIGERR_PARSER_BUG; |
6593 | 0 | } |
6594 | | |
6595 | 102k | env->parse_depth--; |
6596 | 102k | return r; |
6597 | 102k | } |
6598 | | |
6599 | | static int |
6600 | | parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) |
6601 | 67.3k | { |
6602 | 67.3k | int r; |
6603 | 67.3k | OnigToken tok; |
6604 | | |
6605 | 67.3k | r = fetch_token(&tok, src, end, env); |
6606 | 67.3k | if (r < 0) return r; |
6607 | 67.3k | r = parse_subexp(top, &tok, TK_EOT, src, end, env); |
6608 | 67.3k | if (r < 0) return r; |
6609 | | |
6610 | 67.3k | #ifdef USE_SUBEXP_CALL |
6611 | 67.3k | if (env->num_call > 0) { |
6612 | | /* Capture the pattern itself. It is used for (?R), (?0) and \g<0>. */ |
6613 | 0 | const int num = 0; |
6614 | 0 | Node* np; |
6615 | 0 | np = node_new_enclose_memory(env->option, 0); |
6616 | 0 | CHECK_NULL_RETURN_MEMERR(np); |
6617 | 0 | NENCLOSE(np)->regnum = num; |
6618 | 0 | NENCLOSE(np)->target = *top; |
6619 | 0 | r = scan_env_set_mem_node(env, num, np); |
6620 | 0 | if (r != 0) { |
6621 | 0 | onig_node_free(np); |
6622 | 0 | return r; |
6623 | 0 | } |
6624 | 0 | *top = np; |
6625 | 0 | } |
6626 | 67.3k | #endif |
6627 | 67.3k | return 0; |
6628 | 67.3k | } |
6629 | | |
6630 | | extern int |
6631 | | onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, |
6632 | | regex_t* reg, ScanEnv* env) |
6633 | 67.3k | { |
6634 | 67.3k | int r; |
6635 | 67.3k | UChar* p; |
6636 | | |
6637 | 67.3k | #ifdef USE_NAMED_GROUP |
6638 | 67.3k | names_clear(reg); |
6639 | 67.3k | #endif |
6640 | | |
6641 | 67.3k | scan_env_clear(env); |
6642 | 67.3k | env->option = reg->options; |
6643 | 67.3k | env->case_fold_flag = reg->case_fold_flag; |
6644 | 67.3k | env->enc = reg->enc; |
6645 | 67.3k | env->syntax = reg->syntax; |
6646 | 67.3k | env->pattern = (UChar* )pattern; |
6647 | 67.3k | env->pattern_end = (UChar* )end; |
6648 | 67.3k | env->reg = reg; |
6649 | | |
6650 | 67.3k | *root = NULL; |
6651 | 67.3k | p = (UChar* )pattern; |
6652 | 67.3k | r = parse_regexp(root, &p, (UChar* )end, env); |
6653 | 67.3k | reg->num_mem = env->num_mem; |
6654 | 67.3k | return r; |
6655 | 67.3k | } |
6656 | | |
6657 | | extern void |
6658 | | onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED, |
6659 | | UChar* arg, UChar* arg_end) |
6660 | 0 | { |
6661 | 0 | env->error = arg; |
6662 | 0 | env->error_end = arg_end; |
6663 | 0 | } |