Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | Copyright (c) 2013. The YARA Authors. All Rights Reserved. |
3 | | |
4 | | Redistribution and use in source and binary forms, with or without modification, |
5 | | are permitted provided that the following conditions are met: |
6 | | |
7 | | 1. Redistributions of source code must retain the above copyright notice, this |
8 | | list of conditions and the following disclaimer. |
9 | | |
10 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
11 | | this list of conditions and the following disclaimer in the documentation and/or |
12 | | other materials provided with the distribution. |
13 | | |
14 | | 3. Neither the name of the copyright holder nor the names of its contributors |
15 | | may be used to endorse or promote products derived from this software without |
16 | | specific prior written permission. |
17 | | |
18 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
19 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
20 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
21 | | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR |
22 | | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
23 | | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
24 | | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON |
25 | | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
26 | | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
27 | | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
28 | | */ |
29 | | |
30 | | /* |
31 | | |
32 | | This module implements a regular expressions engine based on Thompson's |
33 | | algorithm as described by Russ Cox in http://swtch.com/~rsc/regexp/regexp2.html. |
34 | | |
35 | | What the article names a "thread" has been named a "fiber" in this code, in |
36 | | order to avoid confusion with operating system threads. |
37 | | |
38 | | */ |
39 | | |
40 | | #include <assert.h> |
41 | | #include <string.h> |
42 | | #include <yara/compiler.h> |
43 | | #include <yara/error.h> |
44 | | #include <yara/globals.h> |
45 | | #include <yara/hex_lexer.h> |
46 | | #include <yara/limits.h> |
47 | | #include <yara/mem.h> |
48 | | #include <yara/re.h> |
49 | | #include <yara/re_lexer.h> |
50 | | #include <yara/strutils.h> |
51 | | #include <yara/threading.h> |
52 | | #include <yara/unaligned.h> |
53 | | #include <yara/utils.h> |
54 | | |
55 | 0 | #define EMIT_BACKWARDS 0x01 |
56 | 0 | #define EMIT_DONT_SET_FORWARDS_CODE 0x02 |
57 | 0 | #define EMIT_DONT_SET_BACKWARDS_CODE 0x04 |
58 | | |
59 | | #ifndef INT16_MAX |
60 | | #define INT16_MAX (32767) |
61 | | #endif |
62 | | |
63 | | typedef uint8_t RE_SPLIT_ID_TYPE; |
64 | | |
65 | | // RE_REPEAT_ARGS and RE_REPEAT_ANY_ARGS are structures that are embedded in |
66 | | // the regexp's instruction stream. As such, they are not always aligned to |
67 | | // 8-byte boundaries, so they need to be "packed" structures in order to prevent |
68 | | // issues due to unaligned memory accesses. |
69 | | #pragma pack(push) |
70 | | #pragma pack(1) |
71 | | |
72 | | typedef struct _RE_REPEAT_ARGS |
73 | | { |
74 | | uint16_t min; |
75 | | uint16_t max; |
76 | | int32_t offset; |
77 | | |
78 | | } RE_REPEAT_ARGS; |
79 | | |
80 | | typedef struct _RE_REPEAT_ANY_ARGS |
81 | | { |
82 | | uint16_t min; |
83 | | uint16_t max; |
84 | | |
85 | | } RE_REPEAT_ANY_ARGS; |
86 | | |
87 | | #pragma pack(pop) |
88 | | |
89 | | typedef struct _RE_EMIT_CONTEXT |
90 | | { |
91 | | YR_ARENA* arena; |
92 | | RE_SPLIT_ID_TYPE next_split_id; |
93 | | |
94 | | } RE_EMIT_CONTEXT; |
95 | | |
96 | 0 | #define CHAR_IN_CLASS(cls, chr) ((cls)[(chr) / 8] & 1 << ((chr) % 8)) |
97 | | |
98 | | static bool _yr_re_is_char_in_class( |
99 | | RE_CLASS* re_class, |
100 | | uint8_t chr, |
101 | | int case_insensitive) |
102 | 0 | { |
103 | 0 | int result = CHAR_IN_CLASS(re_class->bitmap, chr); |
104 | |
|
105 | 0 | if (case_insensitive) |
106 | 0 | result |= CHAR_IN_CLASS(re_class->bitmap, yr_altercase[chr]); |
107 | |
|
108 | 0 | if (re_class->negated) |
109 | 0 | result = !result; |
110 | |
|
111 | 0 | return result; |
112 | 0 | } |
113 | | |
114 | | static bool _yr_re_is_word_char(const uint8_t* input, uint8_t character_size) |
115 | 0 | { |
116 | 0 | int result = ((yr_isalnum(input) || (*input) == '_')); |
117 | |
|
118 | 0 | if (character_size == 2) |
119 | 0 | result = result && (*(input + 1) == 0); |
120 | |
|
121 | 0 | return result; |
122 | 0 | } |
123 | | |
124 | | RE_NODE* yr_re_node_create(int type) |
125 | 0 | { |
126 | 0 | RE_NODE* result = (RE_NODE*) yr_malloc(sizeof(RE_NODE)); |
127 | |
|
128 | 0 | if (result != NULL) |
129 | 0 | { |
130 | 0 | result->type = type; |
131 | 0 | result->children_head = NULL; |
132 | 0 | result->children_tail = NULL; |
133 | 0 | result->prev_sibling = NULL; |
134 | 0 | result->next_sibling = NULL; |
135 | 0 | result->greedy = true; |
136 | 0 | result->forward_code_ref = YR_ARENA_NULL_REF; |
137 | 0 | result->backward_code_ref = YR_ARENA_NULL_REF; |
138 | 0 | } |
139 | |
|
140 | 0 | return result; |
141 | 0 | } |
142 | | |
143 | | void yr_re_node_destroy(RE_NODE* node) |
144 | 0 | { |
145 | 0 | RE_NODE* child = node->children_head; |
146 | 0 | RE_NODE* next_child; |
147 | |
|
148 | 0 | while (child != NULL) |
149 | 0 | { |
150 | 0 | next_child = child->next_sibling; |
151 | 0 | yr_re_node_destroy(child); |
152 | 0 | child = next_child; |
153 | 0 | } |
154 | |
|
155 | 0 | if (node->type == RE_NODE_CLASS) |
156 | 0 | yr_free(node->re_class); |
157 | |
|
158 | 0 | yr_free(node); |
159 | 0 | } |
160 | | |
161 | | //////////////////////////////////////////////////////////////////////////////// |
162 | | // Appends a node to the end of the children list. |
163 | | // |
164 | | void yr_re_node_append_child(RE_NODE* node, RE_NODE* child) |
165 | 0 | { |
166 | 0 | if (node->children_head == NULL) |
167 | 0 | node->children_head = child; |
168 | |
|
169 | 0 | if (node->children_tail != NULL) |
170 | 0 | node->children_tail->next_sibling = child; |
171 | |
|
172 | 0 | child->prev_sibling = node->children_tail; |
173 | 0 | node->children_tail = child; |
174 | 0 | } |
175 | | |
176 | | //////////////////////////////////////////////////////////////////////////////// |
177 | | // Appends a node to the beginning of the children list. |
178 | | // |
179 | | void yr_re_node_prepend_child(RE_NODE* node, RE_NODE* child) |
180 | 0 | { |
181 | 0 | child->next_sibling = node->children_head; |
182 | |
|
183 | 0 | if (node->children_head != NULL) |
184 | 0 | node->children_head->prev_sibling = child; |
185 | |
|
186 | 0 | node->children_head = child; |
187 | |
|
188 | 0 | if (node->children_tail == NULL) |
189 | 0 | node->children_tail = child; |
190 | 0 | } |
191 | | |
192 | | int yr_re_ast_create(RE_AST** re_ast) |
193 | 0 | { |
194 | 0 | *re_ast = (RE_AST*) yr_malloc(sizeof(RE_AST)); |
195 | |
|
196 | 0 | if (*re_ast == NULL) |
197 | 0 | return ERROR_INSUFFICIENT_MEMORY; |
198 | | |
199 | 0 | (*re_ast)->flags = 0; |
200 | 0 | (*re_ast)->root_node = NULL; |
201 | |
|
202 | 0 | return ERROR_SUCCESS; |
203 | 0 | } |
204 | | |
205 | | void yr_re_ast_destroy(RE_AST* re_ast) |
206 | 0 | { |
207 | 0 | if (re_ast->root_node != NULL) |
208 | 0 | yr_re_node_destroy(re_ast->root_node); |
209 | |
|
210 | 0 | yr_free(re_ast); |
211 | 0 | } |
212 | | |
213 | | //////////////////////////////////////////////////////////////////////////////// |
214 | | // Parses a regexp but don't emit its code. A further call to |
215 | | // yr_re_ast_emit_code is required to get the code. |
216 | | // |
217 | | int yr_re_parse( |
218 | | const char* re_string, |
219 | | RE_AST** re_ast, |
220 | | RE_ERROR* error, |
221 | | int flags) |
222 | 0 | { |
223 | 0 | return yr_parse_re_string(re_string, re_ast, error, flags); |
224 | 0 | } |
225 | | |
226 | | //////////////////////////////////////////////////////////////////////////////// |
227 | | // Parses a hex string but don't emit its code. A further call to |
228 | | // yr_re_ast_emit_code is required to get the code. |
229 | | // |
230 | | int yr_re_parse_hex(const char* hex_string, RE_AST** re_ast, RE_ERROR* error) |
231 | 0 | { |
232 | 0 | return yr_parse_hex_string(hex_string, re_ast, error); |
233 | 0 | } |
234 | | |
235 | | //////////////////////////////////////////////////////////////////////////////// |
236 | | // Parses the regexp and emit its code to the provided to the |
237 | | // YR_RE_CODE_SECTION in the specified arena. |
238 | | // |
239 | | int yr_re_compile( |
240 | | const char* re_string, |
241 | | int flags, |
242 | | int parser_flags, |
243 | | YR_ARENA* arena, |
244 | | YR_ARENA_REF* ref, |
245 | | RE_ERROR* error) |
246 | 0 | { |
247 | 0 | RE_AST* re_ast; |
248 | 0 | RE _re; |
249 | 0 | int result; |
250 | |
|
251 | 0 | result = yr_re_parse(re_string, &re_ast, error, parser_flags); |
252 | 0 | if (result != ERROR_UNKNOWN_ESCAPE_SEQUENCE) |
253 | 0 | FAIL_ON_ERROR(result); |
254 | |
|
255 | 0 | _re.flags = flags; |
256 | |
|
257 | 0 | FAIL_ON_ERROR_WITH_CLEANUP( |
258 | 0 | yr_arena_write_data(arena, YR_RE_CODE_SECTION, &_re, sizeof(_re), ref), |
259 | 0 | yr_re_ast_destroy(re_ast)); |
260 | |
|
261 | 0 | FAIL_ON_ERROR_WITH_CLEANUP( |
262 | 0 | yr_re_ast_emit_code(re_ast, arena, false), yr_re_ast_destroy(re_ast)); |
263 | |
|
264 | 0 | yr_re_ast_destroy(re_ast); |
265 | |
|
266 | 0 | return result; |
267 | 0 | } |
268 | | |
269 | | //////////////////////////////////////////////////////////////////////////////// |
270 | | // Verifies if the target string matches the pattern |
271 | | // |
272 | | // Args: |
273 | | // context: Scan context |
274 | | // re: A pointer to a compiled regexp |
275 | | // target: Target string |
276 | | // |
277 | | // Returns: |
278 | | // See return codes for yr_re_exec |
279 | | // |
280 | | int yr_re_match(YR_SCAN_CONTEXT* context, RE* re, const char* target) |
281 | 0 | { |
282 | 0 | int result; |
283 | |
|
284 | 0 | yr_re_exec( |
285 | 0 | context, |
286 | 0 | re->code, |
287 | 0 | (uint8_t*) target, |
288 | 0 | strlen(target), |
289 | 0 | 0, |
290 | 0 | re->flags | RE_FLAGS_SCAN, |
291 | 0 | NULL, |
292 | 0 | NULL, |
293 | 0 | &result); |
294 | |
|
295 | 0 | return result; |
296 | 0 | } |
297 | | |
298 | | //////////////////////////////////////////////////////////////////////////////// |
299 | | // Verifies if the provided regular expression is just a literal string |
300 | | // like "abc", "12345", without any wildcard, operator, etc. In that case |
301 | | // returns the string as a SIZED_STRING, or returns NULL if otherwise. |
302 | | // |
303 | | // The caller is responsible for deallocating the returned SIZED_STRING by |
304 | | // calling yr_free. |
305 | | // |
306 | | SIZED_STRING* yr_re_ast_extract_literal(RE_AST* re_ast) |
307 | 0 | { |
308 | 0 | SIZED_STRING* string; |
309 | 0 | RE_NODE* child; |
310 | |
|
311 | 0 | int length = 0; |
312 | |
|
313 | 0 | if (re_ast->root_node->type == RE_NODE_LITERAL) |
314 | 0 | { |
315 | 0 | length = 1; |
316 | 0 | } |
317 | 0 | else if (re_ast->root_node->type == RE_NODE_CONCAT) |
318 | 0 | { |
319 | 0 | child = re_ast->root_node->children_tail; |
320 | |
|
321 | 0 | while (child != NULL && child->type == RE_NODE_LITERAL) |
322 | 0 | { |
323 | 0 | length++; |
324 | 0 | child = child->prev_sibling; |
325 | 0 | } |
326 | |
|
327 | 0 | if (child != NULL) |
328 | 0 | return NULL; |
329 | 0 | } |
330 | 0 | else |
331 | 0 | { |
332 | 0 | return NULL; |
333 | 0 | } |
334 | | |
335 | 0 | string = (SIZED_STRING*) yr_malloc(sizeof(SIZED_STRING) + length); |
336 | |
|
337 | 0 | if (string == NULL) |
338 | 0 | return NULL; |
339 | | |
340 | 0 | string->length = length; |
341 | 0 | string->flags = 0; |
342 | |
|
343 | 0 | if (re_ast->root_node->type == RE_NODE_LITERAL) |
344 | 0 | { |
345 | 0 | string->c_string[0] = re_ast->root_node->value; |
346 | 0 | } |
347 | 0 | else |
348 | 0 | { |
349 | 0 | child = re_ast->root_node->children_tail; |
350 | |
|
351 | 0 | while (child != NULL) |
352 | 0 | { |
353 | 0 | string->c_string[--length] = child->value; |
354 | 0 | child = child->prev_sibling; |
355 | 0 | } |
356 | 0 | } |
357 | |
|
358 | 0 | string->c_string[string->length] = '\0'; |
359 | |
|
360 | 0 | return string; |
361 | 0 | } |
362 | | |
363 | | int _yr_re_node_has_unbounded_quantifier_for_dot(RE_NODE* re_node) |
364 | 0 | { |
365 | 0 | RE_NODE* child; |
366 | |
|
367 | 0 | if ((re_node->type == RE_NODE_STAR || re_node->type == RE_NODE_PLUS) && |
368 | 0 | re_node->children_head->type == RE_NODE_ANY) |
369 | 0 | return true; |
370 | | |
371 | 0 | if (re_node->type == RE_NODE_RANGE_ANY && re_node->end == RE_MAX_RANGE) |
372 | 0 | return true; |
373 | | |
374 | 0 | if (re_node->type == RE_NODE_CONCAT) |
375 | 0 | { |
376 | 0 | child = re_node->children_tail; |
377 | |
|
378 | 0 | while (child != NULL) |
379 | 0 | { |
380 | 0 | if (_yr_re_node_has_unbounded_quantifier_for_dot(child)) |
381 | 0 | return true; |
382 | | |
383 | 0 | child = child->prev_sibling; |
384 | 0 | } |
385 | 0 | } |
386 | | |
387 | 0 | return false; |
388 | 0 | } |
389 | | |
390 | | //////////////////////////////////////////////////////////////////////////////// |
391 | | // Detects the use of .*, .+ or .{x,} in a regexp. The use of wildcards with |
392 | | // quantifiers that don't have a reasonably small upper bound causes a |
393 | | // performance penalty. This function detects such cases in order to warn the |
394 | | // user about this. |
395 | | // |
396 | | int yr_re_ast_has_unbounded_quantifier_for_dot(RE_AST* re_ast) |
397 | 0 | { |
398 | 0 | return _yr_re_node_has_unbounded_quantifier_for_dot(re_ast->root_node); |
399 | 0 | } |
400 | | |
401 | | //////////////////////////////////////////////////////////////////////////////// |
402 | | // In some cases splitting a regular expression (or hex string) in two parts is |
403 | | // convenient for increasing performance. This happens when the pattern contains |
404 | | // a large gap (a.k.a jump), for example: { 01 02 03 [0-999] 04 05 06 } |
405 | | // In this case the string is splitted in { 01 02 03 } and { 04 05 06 } where |
406 | | // the latter is chained to the former. This means that { 01 02 03 } and |
407 | | // { 04 05 06 } are handled as individual strings, and when both of them are |
408 | | // found, YARA verifies if the distance between the matches complies with the |
409 | | // [0-999] restriction. |
410 | | // |
411 | | // This function traverses a regexp's AST looking for nodes where it should be |
412 | | // splitted. It must be noticed that this only applies to two-level ASTs (i.e. |
413 | | // an AST consisting in a RE_NODE_CONCAT at the root where all the children are |
414 | | // leaves). |
415 | | // |
416 | | // For example, { 01 02 03 [0-1000] 04 05 06 [500-2000] 07 08 09 } has the |
417 | | // following AST: |
418 | | // |
419 | | // RE_NODE_CONCAT |
420 | | // | |
421 | | // |- RE_NODE_LITERAL (01) |
422 | | // |- RE_NODE_LITERAL (02) |
423 | | // |- RE_NODE_LITERAL (03) |
424 | | // |- RE_NODE_RANGE_ANY (start=0, end=1000) |
425 | | // |- RE_NODE_LITERAL (04) |
426 | | // |- RE_NODE_LITERAL (05) |
427 | | // |- RE_NODE_LITERAL (06) |
428 | | // |- RE_NODE_RANGE_ANY (start=500, end=2000) |
429 | | // |- RE_NODE_LITERAL (07) |
430 | | // |- RE_NODE_LITERAL (08) |
431 | | // |- RE_NODE_LITERAL (09) |
432 | | // |
433 | | // If the AST above is passed in the re_ast argument, it will be trimmed to: |
434 | | // |
435 | | // RE_NODE_CONCAT |
436 | | // | |
437 | | // |- RE_NODE_LITERAL (01) |
438 | | // |- RE_NODE_LITERAL (02) |
439 | | // |- RE_NODE_LITERAL (03) |
440 | | // |
441 | | // While remainder_re_ast will be: |
442 | | // |
443 | | // RE_NODE_CONCAT |
444 | | // | |
445 | | // |- RE_NODE_LITERAL (04) |
446 | | // |- RE_NODE_LITERAL (05) |
447 | | // |- RE_NODE_LITERAL (06) |
448 | | // |- RE_NODE_RANGE_ANY (start=500, end=2000) |
449 | | // |- RE_NODE_LITERAL (07) |
450 | | // |- RE_NODE_LITERAL (08) |
451 | | // |- RE_NODE_LITERAL (09) |
452 | | // |
453 | | // The caller is responsible for freeing the new AST in remainder_re_ast by |
454 | | // calling yr_re_ast_destroy. |
455 | | // |
456 | | // The integers pointed to by min_gap and max_gap will be filled with the |
457 | | // minimum and maximum gap size between the sub-strings represented by the |
458 | | // two ASTs. |
459 | | // |
460 | | int yr_re_ast_split_at_chaining_point( |
461 | | RE_AST* re_ast, |
462 | | RE_AST** remainder_re_ast, |
463 | | int32_t* min_gap, |
464 | | int32_t* max_gap) |
465 | 0 | { |
466 | 0 | RE_NODE* child; |
467 | 0 | RE_NODE* concat; |
468 | |
|
469 | 0 | int result; |
470 | |
|
471 | 0 | *remainder_re_ast = NULL; |
472 | 0 | *min_gap = 0; |
473 | 0 | *max_gap = 0; |
474 | |
|
475 | 0 | if (re_ast->root_node->type != RE_NODE_CONCAT) |
476 | 0 | return ERROR_SUCCESS; |
477 | | |
478 | 0 | child = re_ast->root_node->children_head; |
479 | |
|
480 | 0 | while (child != NULL) |
481 | 0 | { |
482 | 0 | if (!child->greedy && child->type == RE_NODE_RANGE_ANY && |
483 | 0 | child->prev_sibling != NULL && child->next_sibling != NULL && |
484 | 0 | (child->start > YR_STRING_CHAINING_THRESHOLD || |
485 | 0 | child->end > YR_STRING_CHAINING_THRESHOLD)) |
486 | 0 | { |
487 | 0 | result = yr_re_ast_create(remainder_re_ast); |
488 | |
|
489 | 0 | if (result != ERROR_SUCCESS) |
490 | 0 | return result; |
491 | | |
492 | 0 | concat = yr_re_node_create(RE_NODE_CONCAT); |
493 | |
|
494 | 0 | if (concat == NULL) |
495 | 0 | return ERROR_INSUFFICIENT_MEMORY; |
496 | | |
497 | 0 | concat->children_head = child->next_sibling; |
498 | 0 | concat->children_tail = re_ast->root_node->children_tail; |
499 | |
|
500 | 0 | re_ast->root_node->children_tail = child->prev_sibling; |
501 | |
|
502 | 0 | child->prev_sibling->next_sibling = NULL; |
503 | 0 | child->next_sibling->prev_sibling = NULL; |
504 | |
|
505 | 0 | *min_gap = child->start; |
506 | 0 | *max_gap = child->end; |
507 | |
|
508 | 0 | (*remainder_re_ast)->root_node = concat; |
509 | 0 | (*remainder_re_ast)->flags = re_ast->flags; |
510 | |
|
511 | 0 | yr_re_node_destroy(child); |
512 | |
|
513 | 0 | return ERROR_SUCCESS; |
514 | 0 | } |
515 | | |
516 | 0 | child = child->next_sibling; |
517 | 0 | } |
518 | | |
519 | 0 | return ERROR_SUCCESS; |
520 | 0 | } |
521 | | |
522 | | int _yr_emit_inst( |
523 | | RE_EMIT_CONTEXT* emit_context, |
524 | | uint8_t opcode, |
525 | | YR_ARENA_REF* instruction_ref) |
526 | 0 | { |
527 | 0 | FAIL_ON_ERROR(yr_arena_write_data( |
528 | 0 | emit_context->arena, |
529 | 0 | YR_RE_CODE_SECTION, |
530 | 0 | &opcode, |
531 | 0 | sizeof(uint8_t), |
532 | 0 | instruction_ref)); |
533 | |
|
534 | 0 | return ERROR_SUCCESS; |
535 | 0 | } |
536 | | |
537 | | int _yr_emit_inst_arg_uint8( |
538 | | RE_EMIT_CONTEXT* emit_context, |
539 | | uint8_t opcode, |
540 | | uint8_t argument, |
541 | | YR_ARENA_REF* instruction_ref, |
542 | | YR_ARENA_REF* argument_ref) |
543 | 0 | { |
544 | 0 | FAIL_ON_ERROR(yr_arena_write_data( |
545 | 0 | emit_context->arena, |
546 | 0 | YR_RE_CODE_SECTION, |
547 | 0 | &opcode, |
548 | 0 | sizeof(uint8_t), |
549 | 0 | instruction_ref)); |
550 | |
|
551 | 0 | FAIL_ON_ERROR(yr_arena_write_data( |
552 | 0 | emit_context->arena, |
553 | 0 | YR_RE_CODE_SECTION, |
554 | 0 | &argument, |
555 | 0 | sizeof(uint8_t), |
556 | 0 | argument_ref)); |
557 | |
|
558 | 0 | return ERROR_SUCCESS; |
559 | 0 | } |
560 | | |
561 | | int _yr_emit_inst_arg_uint16( |
562 | | RE_EMIT_CONTEXT* emit_context, |
563 | | uint8_t opcode, |
564 | | uint16_t argument, |
565 | | YR_ARENA_REF* instruction_ref, |
566 | | YR_ARENA_REF* argument_ref) |
567 | 0 | { |
568 | 0 | FAIL_ON_ERROR(yr_arena_write_data( |
569 | 0 | emit_context->arena, |
570 | 0 | YR_RE_CODE_SECTION, |
571 | 0 | &opcode, |
572 | 0 | sizeof(uint8_t), |
573 | 0 | instruction_ref)); |
574 | |
|
575 | 0 | FAIL_ON_ERROR(yr_arena_write_data( |
576 | 0 | emit_context->arena, |
577 | 0 | YR_RE_CODE_SECTION, |
578 | 0 | &argument, |
579 | 0 | sizeof(uint16_t), |
580 | 0 | argument_ref)); |
581 | |
|
582 | 0 | return ERROR_SUCCESS; |
583 | 0 | } |
584 | | |
585 | | int _yr_emit_inst_arg_uint32( |
586 | | RE_EMIT_CONTEXT* emit_context, |
587 | | uint8_t opcode, |
588 | | uint32_t argument, |
589 | | YR_ARENA_REF* instruction_ref, |
590 | | YR_ARENA_REF* argument_ref) |
591 | 0 | { |
592 | 0 | FAIL_ON_ERROR(yr_arena_write_data( |
593 | 0 | emit_context->arena, |
594 | 0 | YR_RE_CODE_SECTION, |
595 | 0 | &opcode, |
596 | 0 | sizeof(uint8_t), |
597 | 0 | instruction_ref)); |
598 | |
|
599 | 0 | FAIL_ON_ERROR(yr_arena_write_data( |
600 | 0 | emit_context->arena, |
601 | 0 | YR_RE_CODE_SECTION, |
602 | 0 | &argument, |
603 | 0 | sizeof(uint32_t), |
604 | 0 | argument_ref)); |
605 | |
|
606 | 0 | return ERROR_SUCCESS; |
607 | 0 | } |
608 | | |
609 | | int _yr_emit_inst_arg_int16( |
610 | | RE_EMIT_CONTEXT* emit_context, |
611 | | uint8_t opcode, |
612 | | int16_t argument, |
613 | | YR_ARENA_REF* instruction_ref, |
614 | | YR_ARENA_REF* argument_ref) |
615 | 0 | { |
616 | 0 | FAIL_ON_ERROR(yr_arena_write_data( |
617 | 0 | emit_context->arena, |
618 | 0 | YR_RE_CODE_SECTION, |
619 | 0 | &opcode, |
620 | 0 | sizeof(uint8_t), |
621 | 0 | instruction_ref)); |
622 | |
|
623 | 0 | FAIL_ON_ERROR(yr_arena_write_data( |
624 | 0 | emit_context->arena, |
625 | 0 | YR_RE_CODE_SECTION, |
626 | 0 | &argument, |
627 | 0 | sizeof(int16_t), |
628 | 0 | argument_ref)); |
629 | |
|
630 | 0 | return ERROR_SUCCESS; |
631 | 0 | } |
632 | | |
633 | | int _yr_emit_inst_arg_struct( |
634 | | RE_EMIT_CONTEXT* emit_context, |
635 | | uint8_t opcode, |
636 | | void* structure, |
637 | | size_t structure_size, |
638 | | YR_ARENA_REF* instruction_ref, |
639 | | YR_ARENA_REF* argument_ref) |
640 | 0 | { |
641 | 0 | FAIL_ON_ERROR(yr_arena_write_data( |
642 | 0 | emit_context->arena, |
643 | 0 | YR_RE_CODE_SECTION, |
644 | 0 | &opcode, |
645 | 0 | sizeof(uint8_t), |
646 | 0 | instruction_ref)); |
647 | |
|
648 | 0 | FAIL_ON_ERROR(yr_arena_write_data( |
649 | 0 | emit_context->arena, |
650 | 0 | YR_RE_CODE_SECTION, |
651 | 0 | structure, |
652 | 0 | structure_size, |
653 | 0 | argument_ref)); |
654 | |
|
655 | 0 | return ERROR_SUCCESS; |
656 | 0 | } |
657 | | |
658 | | int _yr_emit_split( |
659 | | RE_EMIT_CONTEXT* emit_context, |
660 | | uint8_t opcode, |
661 | | int16_t argument, |
662 | | YR_ARENA_REF* instruction_ref, |
663 | | YR_ARENA_REF* argument_ref) |
664 | 0 | { |
665 | 0 | assert(opcode == RE_OPCODE_SPLIT_A || opcode == RE_OPCODE_SPLIT_B); |
666 | | |
667 | 0 | if (emit_context->next_split_id == RE_MAX_SPLIT_ID) |
668 | 0 | return ERROR_REGULAR_EXPRESSION_TOO_COMPLEX; |
669 | | |
670 | 0 | FAIL_ON_ERROR(yr_arena_write_data( |
671 | 0 | emit_context->arena, |
672 | 0 | YR_RE_CODE_SECTION, |
673 | 0 | &opcode, |
674 | 0 | sizeof(uint8_t), |
675 | 0 | instruction_ref)); |
676 | |
|
677 | 0 | FAIL_ON_ERROR(yr_arena_write_data( |
678 | 0 | emit_context->arena, |
679 | 0 | YR_RE_CODE_SECTION, |
680 | 0 | &emit_context->next_split_id, |
681 | 0 | sizeof(RE_SPLIT_ID_TYPE), |
682 | 0 | NULL)); |
683 | |
|
684 | 0 | emit_context->next_split_id++; |
685 | |
|
686 | 0 | FAIL_ON_ERROR(yr_arena_write_data( |
687 | 0 | emit_context->arena, |
688 | 0 | YR_RE_CODE_SECTION, |
689 | 0 | &argument, |
690 | 0 | sizeof(int16_t), |
691 | 0 | argument_ref)); |
692 | |
|
693 | 0 | return ERROR_SUCCESS; |
694 | 0 | } |
695 | | |
696 | | static int _yr_re_emit( |
697 | | RE_EMIT_CONTEXT* emit_context, |
698 | | RE_NODE* re_node, |
699 | | int flags, |
700 | | YR_ARENA_REF* code_ref) |
701 | 0 | { |
702 | 0 | int16_t jmp_offset; |
703 | |
|
704 | 0 | yr_arena_off_t bookmark_1 = 0; |
705 | 0 | yr_arena_off_t bookmark_2 = 0; |
706 | 0 | yr_arena_off_t bookmark_3 = 0; |
707 | 0 | yr_arena_off_t bookmark_4 = 0; |
708 | |
|
709 | 0 | bool emit_split; |
710 | 0 | bool emit_repeat; |
711 | 0 | bool emit_prolog; |
712 | 0 | bool emit_epilog; |
713 | |
|
714 | 0 | RE_REPEAT_ARGS repeat_args; |
715 | 0 | RE_REPEAT_ARGS* repeat_start_args_addr; |
716 | 0 | RE_REPEAT_ANY_ARGS repeat_any_args; |
717 | |
|
718 | 0 | RE_NODE* child; |
719 | |
|
720 | 0 | int16_t* split_offset_addr = NULL; |
721 | 0 | int16_t* jmp_offset_addr = NULL; |
722 | |
|
723 | 0 | YR_ARENA_REF instruction_ref = YR_ARENA_NULL_REF; |
724 | 0 | YR_ARENA_REF split_offset_ref; |
725 | 0 | YR_ARENA_REF jmp_instruction_ref; |
726 | 0 | YR_ARENA_REF jmp_offset_ref; |
727 | 0 | YR_ARENA_REF repeat_start_args_ref; |
728 | |
|
729 | 0 | switch (re_node->type) |
730 | 0 | { |
731 | 0 | case RE_NODE_LITERAL: |
732 | 0 | FAIL_ON_ERROR(_yr_emit_inst_arg_uint8( |
733 | 0 | emit_context, |
734 | 0 | RE_OPCODE_LITERAL, |
735 | 0 | re_node->value, |
736 | 0 | &instruction_ref, |
737 | 0 | NULL)); |
738 | 0 | break; |
739 | | |
740 | 0 | case RE_NODE_NOT_LITERAL: |
741 | 0 | FAIL_ON_ERROR(_yr_emit_inst_arg_uint8( |
742 | 0 | emit_context, |
743 | 0 | RE_OPCODE_NOT_LITERAL, |
744 | 0 | re_node->value, |
745 | 0 | &instruction_ref, |
746 | 0 | NULL)); |
747 | 0 | break; |
748 | | |
749 | 0 | case RE_NODE_MASKED_LITERAL: |
750 | 0 | FAIL_ON_ERROR(_yr_emit_inst_arg_uint16( |
751 | 0 | emit_context, |
752 | 0 | RE_OPCODE_MASKED_LITERAL, |
753 | 0 | re_node->mask << 8 | re_node->value, |
754 | 0 | &instruction_ref, |
755 | 0 | NULL)); |
756 | 0 | break; |
757 | | |
758 | 0 | case RE_NODE_MASKED_NOT_LITERAL: |
759 | 0 | FAIL_ON_ERROR(_yr_emit_inst_arg_uint16( |
760 | 0 | emit_context, |
761 | 0 | RE_OPCODE_MASKED_NOT_LITERAL, |
762 | 0 | re_node->mask << 8 | re_node->value, |
763 | 0 | &instruction_ref, |
764 | 0 | NULL)); |
765 | 0 | break; |
766 | | |
767 | 0 | case RE_NODE_WORD_CHAR: |
768 | 0 | FAIL_ON_ERROR( |
769 | 0 | _yr_emit_inst(emit_context, RE_OPCODE_WORD_CHAR, &instruction_ref)); |
770 | 0 | break; |
771 | | |
772 | 0 | case RE_NODE_NON_WORD_CHAR: |
773 | 0 | FAIL_ON_ERROR( |
774 | 0 | _yr_emit_inst(emit_context, RE_OPCODE_NON_WORD_CHAR, &instruction_ref)); |
775 | 0 | break; |
776 | | |
777 | 0 | case RE_NODE_WORD_BOUNDARY: |
778 | 0 | FAIL_ON_ERROR( |
779 | 0 | _yr_emit_inst(emit_context, RE_OPCODE_WORD_BOUNDARY, &instruction_ref)); |
780 | 0 | break; |
781 | | |
782 | 0 | case RE_NODE_NON_WORD_BOUNDARY: |
783 | 0 | FAIL_ON_ERROR(_yr_emit_inst( |
784 | 0 | emit_context, RE_OPCODE_NON_WORD_BOUNDARY, &instruction_ref)); |
785 | 0 | break; |
786 | | |
787 | 0 | case RE_NODE_SPACE: |
788 | 0 | FAIL_ON_ERROR( |
789 | 0 | _yr_emit_inst(emit_context, RE_OPCODE_SPACE, &instruction_ref)); |
790 | 0 | break; |
791 | | |
792 | 0 | case RE_NODE_NON_SPACE: |
793 | 0 | FAIL_ON_ERROR( |
794 | 0 | _yr_emit_inst(emit_context, RE_OPCODE_NON_SPACE, &instruction_ref)); |
795 | 0 | break; |
796 | | |
797 | 0 | case RE_NODE_DIGIT: |
798 | 0 | FAIL_ON_ERROR( |
799 | 0 | _yr_emit_inst(emit_context, RE_OPCODE_DIGIT, &instruction_ref)); |
800 | 0 | break; |
801 | | |
802 | 0 | case RE_NODE_NON_DIGIT: |
803 | 0 | FAIL_ON_ERROR( |
804 | 0 | _yr_emit_inst(emit_context, RE_OPCODE_NON_DIGIT, &instruction_ref)); |
805 | 0 | break; |
806 | | |
807 | 0 | case RE_NODE_ANY: |
808 | 0 | FAIL_ON_ERROR(_yr_emit_inst(emit_context, RE_OPCODE_ANY, &instruction_ref)); |
809 | 0 | break; |
810 | | |
811 | 0 | case RE_NODE_CLASS: |
812 | 0 | FAIL_ON_ERROR( |
813 | 0 | _yr_emit_inst(emit_context, RE_OPCODE_CLASS, &instruction_ref)); |
814 | |
|
815 | 0 | FAIL_ON_ERROR(yr_arena_write_data( |
816 | 0 | emit_context->arena, |
817 | 0 | YR_RE_CODE_SECTION, |
818 | 0 | re_node->re_class, |
819 | 0 | sizeof(*re_node->re_class), |
820 | 0 | NULL)); |
821 | 0 | break; |
822 | | |
823 | 0 | case RE_NODE_ANCHOR_START: |
824 | 0 | FAIL_ON_ERROR(_yr_emit_inst( |
825 | 0 | emit_context, RE_OPCODE_MATCH_AT_START, &instruction_ref)); |
826 | 0 | break; |
827 | | |
828 | 0 | case RE_NODE_ANCHOR_END: |
829 | 0 | FAIL_ON_ERROR( |
830 | 0 | _yr_emit_inst(emit_context, RE_OPCODE_MATCH_AT_END, &instruction_ref)); |
831 | 0 | break; |
832 | | |
833 | 0 | case RE_NODE_CONCAT: |
834 | 0 | FAIL_ON_ERROR(_yr_re_emit( |
835 | 0 | emit_context, |
836 | 0 | (flags & EMIT_BACKWARDS) ? re_node->children_tail |
837 | 0 | : re_node->children_head, |
838 | 0 | flags, |
839 | 0 | &instruction_ref)); |
840 | |
|
841 | 0 | if (flags & EMIT_BACKWARDS) |
842 | 0 | child = re_node->children_tail->prev_sibling; |
843 | 0 | else |
844 | 0 | child = re_node->children_head->next_sibling; |
845 | |
|
846 | 0 | while (child != NULL) |
847 | 0 | { |
848 | 0 | FAIL_ON_ERROR(_yr_re_emit(emit_context, child, flags, NULL)); |
849 | |
|
850 | 0 | child = (flags & EMIT_BACKWARDS) ? child->prev_sibling |
851 | 0 | : child->next_sibling; |
852 | 0 | } |
853 | 0 | break; |
854 | | |
855 | 0 | case RE_NODE_PLUS: |
856 | | // Code for e+ looks like: |
857 | | // |
858 | | // L1: code for e |
859 | | // split L1, L2 |
860 | | // L2: |
861 | | // |
862 | 0 | FAIL_ON_ERROR(_yr_re_emit( |
863 | 0 | emit_context, re_node->children_head, flags, &instruction_ref)); |
864 | |
|
865 | 0 | bookmark_1 = yr_arena_get_current_offset( |
866 | 0 | emit_context->arena, YR_RE_CODE_SECTION); |
867 | |
|
868 | 0 | if (instruction_ref.offset - bookmark_1 < INT16_MIN) |
869 | 0 | return ERROR_REGULAR_EXPRESSION_TOO_LARGE; |
870 | | |
871 | 0 | jmp_offset = (int16_t) (instruction_ref.offset - bookmark_1); |
872 | |
|
873 | 0 | FAIL_ON_ERROR(_yr_emit_split( |
874 | 0 | emit_context, |
875 | 0 | re_node->greedy ? RE_OPCODE_SPLIT_B : RE_OPCODE_SPLIT_A, |
876 | 0 | jmp_offset, |
877 | 0 | NULL, |
878 | 0 | NULL)); |
879 | |
|
880 | 0 | break; |
881 | | |
882 | 0 | case RE_NODE_STAR: |
883 | | // Code for e* looks like: |
884 | | // |
885 | | // L1: split L1, L2 |
886 | | // code for e |
887 | | // jmp L1 |
888 | | // L2: |
889 | 0 | FAIL_ON_ERROR(_yr_emit_split( |
890 | 0 | emit_context, |
891 | 0 | re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B, |
892 | 0 | 0, |
893 | 0 | &instruction_ref, |
894 | 0 | &split_offset_ref)); |
895 | |
|
896 | 0 | FAIL_ON_ERROR( |
897 | 0 | _yr_re_emit(emit_context, re_node->children_head, flags, NULL)); |
898 | |
|
899 | 0 | bookmark_1 = yr_arena_get_current_offset( |
900 | 0 | emit_context->arena, YR_RE_CODE_SECTION); |
901 | |
|
902 | 0 | if (instruction_ref.offset - bookmark_1 < INT16_MIN) |
903 | 0 | return ERROR_REGULAR_EXPRESSION_TOO_LARGE; |
904 | | |
905 | 0 | jmp_offset = (int16_t) (instruction_ref.offset - bookmark_1); |
906 | | |
907 | | // Emit jump with offset set to 0. |
908 | |
|
909 | 0 | FAIL_ON_ERROR(_yr_emit_inst_arg_int16( |
910 | 0 | emit_context, RE_OPCODE_JUMP, jmp_offset, NULL, NULL)); |
911 | |
|
912 | 0 | bookmark_1 = yr_arena_get_current_offset( |
913 | 0 | emit_context->arena, YR_RE_CODE_SECTION); |
914 | |
|
915 | 0 | if (bookmark_1 - instruction_ref.offset > INT16_MAX) |
916 | 0 | return ERROR_REGULAR_EXPRESSION_TOO_LARGE; |
917 | | |
918 | 0 | jmp_offset = (int16_t) (bookmark_1 - instruction_ref.offset); |
919 | | |
920 | | // Update split offset. |
921 | 0 | split_offset_addr = (int16_t*) yr_arena_ref_to_ptr( |
922 | 0 | emit_context->arena, &split_offset_ref); |
923 | |
|
924 | 0 | memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset)); |
925 | 0 | break; |
926 | | |
927 | 0 | case RE_NODE_ALT: |
928 | | // Code for e1|e2 looks like: |
929 | | // |
930 | | // split L1, L2 |
931 | | // L1: code for e1 |
932 | | // jmp L3 |
933 | | // L2: code for e2 |
934 | | // L3: |
935 | | |
936 | | // Emit a split instruction with offset set to 0 temporarily. Offset |
937 | | // will be updated after we know the size of the code generated for |
938 | | // the left node (e1). |
939 | |
|
940 | 0 | FAIL_ON_ERROR(_yr_emit_split( |
941 | 0 | emit_context, |
942 | 0 | RE_OPCODE_SPLIT_A, |
943 | 0 | 0, |
944 | 0 | &instruction_ref, |
945 | 0 | &split_offset_ref)); |
946 | |
|
947 | 0 | FAIL_ON_ERROR( |
948 | 0 | _yr_re_emit(emit_context, re_node->children_head, flags, NULL)); |
949 | | |
950 | | // Emit jump with offset set to 0. |
951 | |
|
952 | 0 | FAIL_ON_ERROR(_yr_emit_inst_arg_int16( |
953 | 0 | emit_context, |
954 | 0 | RE_OPCODE_JUMP, |
955 | 0 | 0, |
956 | 0 | &jmp_instruction_ref, |
957 | 0 | &jmp_offset_ref)); |
958 | |
|
959 | 0 | bookmark_1 = yr_arena_get_current_offset( |
960 | 0 | emit_context->arena, YR_RE_CODE_SECTION); |
961 | |
|
962 | 0 | if (bookmark_1 - instruction_ref.offset > INT16_MAX) |
963 | 0 | return ERROR_REGULAR_EXPRESSION_TOO_LARGE; |
964 | | |
965 | 0 | jmp_offset = (int16_t) (bookmark_1 - instruction_ref.offset); |
966 | | |
967 | | // Update split offset. |
968 | 0 | split_offset_addr = (int16_t*) yr_arena_ref_to_ptr( |
969 | 0 | emit_context->arena, &split_offset_ref); |
970 | |
|
971 | 0 | memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset)); |
972 | |
|
973 | 0 | FAIL_ON_ERROR( |
974 | 0 | _yr_re_emit(emit_context, re_node->children_tail, flags, NULL)); |
975 | |
|
976 | 0 | bookmark_1 = yr_arena_get_current_offset( |
977 | 0 | emit_context->arena, YR_RE_CODE_SECTION); |
978 | |
|
979 | 0 | if (bookmark_1 - jmp_instruction_ref.offset > INT16_MAX) |
980 | 0 | return ERROR_REGULAR_EXPRESSION_TOO_LARGE; |
981 | | |
982 | 0 | jmp_offset = (int16_t) (bookmark_1 - jmp_instruction_ref.offset); |
983 | | |
984 | | // Update offset for jmp instruction. |
985 | 0 | jmp_offset_addr = (int16_t*) yr_arena_ref_to_ptr( |
986 | 0 | emit_context->arena, &jmp_offset_ref); |
987 | |
|
988 | 0 | memcpy(jmp_offset_addr, &jmp_offset, sizeof(jmp_offset)); |
989 | 0 | break; |
990 | | |
991 | 0 | case RE_NODE_RANGE_ANY: |
992 | 0 | repeat_any_args.min = re_node->start; |
993 | 0 | repeat_any_args.max = re_node->end; |
994 | |
|
995 | 0 | FAIL_ON_ERROR(_yr_emit_inst_arg_struct( |
996 | 0 | emit_context, |
997 | 0 | re_node->greedy ? RE_OPCODE_REPEAT_ANY_GREEDY |
998 | 0 | : RE_OPCODE_REPEAT_ANY_UNGREEDY, |
999 | 0 | &repeat_any_args, |
1000 | 0 | sizeof(repeat_any_args), |
1001 | 0 | &instruction_ref, |
1002 | 0 | NULL)); |
1003 | |
|
1004 | 0 | break; |
1005 | | |
1006 | 0 | case RE_NODE_RANGE: |
1007 | | // Code for e{n,m} looks like: |
1008 | | // |
1009 | | // code for e --- prolog |
1010 | | // repeat_start n, m, L1 --+ |
1011 | | // L0: code for e | repeat |
1012 | | // repeat_end n, m, L0 --+ |
1013 | | // L1: split L2, L3 --- split |
1014 | | // L2: code for e --- epilog |
1015 | | // L3: |
1016 | | // |
1017 | | // Not all sections (prolog, repeat, split and epilog) are generated in all |
1018 | | // cases, it depends on the values of n and m. The following table shows |
1019 | | // which sections are generated for the first few values of n and m. |
1020 | | // |
1021 | | // n,m prolog repeat split epilog |
1022 | | // (min,max) |
1023 | | // --------------------------------------- |
1024 | | // 0,0 - - - - |
1025 | | // 0,1 - - X X |
1026 | | // 0,2 - 0,1 X X |
1027 | | // 0,3 - 0,2 X X |
1028 | | // 0,M - 0,M-1 X X |
1029 | | // |
1030 | | // 1,1 X - - - |
1031 | | // 1,2 X - X X |
1032 | | // 1,3 X 0,1 X X |
1033 | | // 1,4 X 1,2 X X |
1034 | | // 1,M X 1,M-2 X X |
1035 | | // |
1036 | | // 2,2 X - - X |
1037 | | // 2,3 X 1,1 X X |
1038 | | // 2,4 X 1,2 X X |
1039 | | // 2,M X 1,M-2 X X |
1040 | | // |
1041 | | // 3,3 X 1,1 - X |
1042 | | // 3,4 X 2,2 X X |
1043 | | // 3,M X 2,M-2 X X |
1044 | | // |
1045 | | // 4,4 X 2,2 - X |
1046 | | // 4,5 X 3,3 X X |
1047 | | // 4,M X 3,M-2 X X |
1048 | | // |
1049 | | // The code can't consists simply in the repeat section, the prolog and |
1050 | | // epilog are required because we can't have atoms pointing to code inside |
1051 | | // the repeat loop. Atoms' forwards_code will point to code in the prolog |
1052 | | // and backwards_code will point to code in the epilog (or in prolog if |
1053 | | // epilog wasn't generated, like in n=1,m=1) |
1054 | |
|
1055 | 0 | emit_prolog = re_node->start > 0; |
1056 | 0 | emit_repeat = re_node->end > re_node->start + 1 || re_node->end > 2; |
1057 | 0 | emit_split = re_node->end > re_node->start; |
1058 | 0 | emit_epilog = re_node->end > re_node->start || re_node->end > 1; |
1059 | |
|
1060 | 0 | if (emit_prolog) |
1061 | 0 | { |
1062 | 0 | FAIL_ON_ERROR(_yr_re_emit( |
1063 | 0 | emit_context, re_node->children_head, flags, &instruction_ref)); |
1064 | 0 | } |
1065 | | |
1066 | 0 | if (emit_repeat) |
1067 | 0 | { |
1068 | 0 | repeat_args.min = re_node->start; |
1069 | 0 | repeat_args.max = re_node->end; |
1070 | |
|
1071 | 0 | if (emit_prolog) |
1072 | 0 | { |
1073 | 0 | repeat_args.max--; |
1074 | 0 | repeat_args.min--; |
1075 | 0 | } |
1076 | |
|
1077 | 0 | if (emit_split) |
1078 | 0 | { |
1079 | 0 | repeat_args.max--; |
1080 | 0 | } |
1081 | 0 | else |
1082 | 0 | { |
1083 | 0 | repeat_args.min--; |
1084 | 0 | repeat_args.max--; |
1085 | 0 | } |
1086 | |
|
1087 | 0 | repeat_args.offset = 0; |
1088 | |
|
1089 | 0 | bookmark_1 = yr_arena_get_current_offset( |
1090 | 0 | emit_context->arena, YR_RE_CODE_SECTION); |
1091 | |
|
1092 | 0 | FAIL_ON_ERROR(_yr_emit_inst_arg_struct( |
1093 | 0 | emit_context, |
1094 | 0 | re_node->greedy ? RE_OPCODE_REPEAT_START_GREEDY |
1095 | 0 | : RE_OPCODE_REPEAT_START_UNGREEDY, |
1096 | 0 | &repeat_args, |
1097 | 0 | sizeof(repeat_args), |
1098 | 0 | emit_prolog ? NULL : &instruction_ref, |
1099 | 0 | &repeat_start_args_ref)); |
1100 | |
|
1101 | 0 | bookmark_2 = yr_arena_get_current_offset( |
1102 | 0 | emit_context->arena, YR_RE_CODE_SECTION); |
1103 | |
|
1104 | 0 | FAIL_ON_ERROR(_yr_re_emit( |
1105 | 0 | emit_context, |
1106 | 0 | re_node->children_head, |
1107 | 0 | flags | EMIT_DONT_SET_FORWARDS_CODE | EMIT_DONT_SET_BACKWARDS_CODE, |
1108 | 0 | NULL)); |
1109 | |
|
1110 | 0 | bookmark_3 = yr_arena_get_current_offset( |
1111 | 0 | emit_context->arena, YR_RE_CODE_SECTION); |
1112 | |
|
1113 | 0 | if (bookmark_2 - bookmark_3 < INT32_MIN) |
1114 | 0 | return ERROR_REGULAR_EXPRESSION_TOO_LARGE; |
1115 | | |
1116 | 0 | repeat_args.offset = (int32_t) (bookmark_2 - bookmark_3); |
1117 | |
|
1118 | 0 | FAIL_ON_ERROR(_yr_emit_inst_arg_struct( |
1119 | 0 | emit_context, |
1120 | 0 | re_node->greedy ? RE_OPCODE_REPEAT_END_GREEDY |
1121 | 0 | : RE_OPCODE_REPEAT_END_UNGREEDY, |
1122 | 0 | &repeat_args, |
1123 | 0 | sizeof(repeat_args), |
1124 | 0 | NULL, |
1125 | 0 | NULL)); |
1126 | |
|
1127 | 0 | bookmark_4 = yr_arena_get_current_offset( |
1128 | 0 | emit_context->arena, YR_RE_CODE_SECTION); |
1129 | |
|
1130 | 0 | repeat_start_args_addr = (RE_REPEAT_ARGS*) yr_arena_ref_to_ptr( |
1131 | 0 | emit_context->arena, &repeat_start_args_ref); |
1132 | |
|
1133 | 0 | if (bookmark_4 - bookmark_1 > INT32_MAX) |
1134 | 0 | return ERROR_REGULAR_EXPRESSION_TOO_LARGE; |
1135 | | |
1136 | 0 | repeat_start_args_addr->offset = (int32_t) (bookmark_4 - bookmark_1); |
1137 | 0 | } |
1138 | | |
1139 | 0 | if (emit_split) |
1140 | 0 | { |
1141 | 0 | bookmark_1 = yr_arena_get_current_offset( |
1142 | 0 | emit_context->arena, YR_RE_CODE_SECTION); |
1143 | |
|
1144 | 0 | FAIL_ON_ERROR(_yr_emit_split( |
1145 | 0 | emit_context, |
1146 | 0 | re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B, |
1147 | 0 | 0, |
1148 | 0 | NULL, |
1149 | 0 | &split_offset_ref)); |
1150 | 0 | } |
1151 | | |
1152 | 0 | if (emit_epilog) |
1153 | 0 | { |
1154 | 0 | FAIL_ON_ERROR(_yr_re_emit( |
1155 | 0 | emit_context, |
1156 | 0 | re_node->children_head, |
1157 | 0 | emit_prolog ? flags | EMIT_DONT_SET_FORWARDS_CODE : flags, |
1158 | 0 | emit_prolog || emit_repeat ? NULL : &instruction_ref)); |
1159 | 0 | } |
1160 | | |
1161 | 0 | if (emit_split) |
1162 | 0 | { |
1163 | 0 | bookmark_2 = yr_arena_get_current_offset( |
1164 | 0 | emit_context->arena, YR_RE_CODE_SECTION); |
1165 | |
|
1166 | 0 | if (bookmark_2 - bookmark_1 > INT16_MAX) |
1167 | 0 | return ERROR_REGULAR_EXPRESSION_TOO_LARGE; |
1168 | | |
1169 | 0 | split_offset_addr = (int16_t*) yr_arena_ref_to_ptr( |
1170 | 0 | emit_context->arena, &split_offset_ref); |
1171 | |
|
1172 | 0 | jmp_offset = (int16_t) (bookmark_2 - bookmark_1); |
1173 | |
|
1174 | 0 | memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset)); |
1175 | 0 | } |
1176 | | |
1177 | 0 | break; |
1178 | 0 | } |
1179 | | |
1180 | 0 | if (flags & EMIT_BACKWARDS) |
1181 | 0 | { |
1182 | 0 | if (!(flags & EMIT_DONT_SET_BACKWARDS_CODE)) |
1183 | 0 | { |
1184 | 0 | re_node->backward_code_ref.buffer_id = YR_RE_CODE_SECTION; |
1185 | 0 | re_node->backward_code_ref.offset = yr_arena_get_current_offset( |
1186 | 0 | emit_context->arena, YR_RE_CODE_SECTION); |
1187 | 0 | } |
1188 | 0 | } |
1189 | 0 | else |
1190 | 0 | { |
1191 | 0 | if (!(flags & EMIT_DONT_SET_FORWARDS_CODE)) |
1192 | 0 | { |
1193 | 0 | re_node->forward_code_ref = instruction_ref; |
1194 | 0 | } |
1195 | 0 | } |
1196 | |
|
1197 | 0 | if (code_ref != NULL) |
1198 | 0 | *code_ref = instruction_ref; |
1199 | |
|
1200 | 0 | return ERROR_SUCCESS; |
1201 | 0 | } |
1202 | | |
1203 | | int yr_re_ast_emit_code(RE_AST* re_ast, YR_ARENA* arena, int backwards_code) |
1204 | 0 | { |
1205 | 0 | RE_EMIT_CONTEXT emit_context; |
1206 | | |
1207 | | // Emit code for matching the regular expressions forwards. |
1208 | 0 | emit_context.arena = arena; |
1209 | 0 | emit_context.next_split_id = 0; |
1210 | |
|
1211 | 0 | FAIL_ON_ERROR(_yr_re_emit( |
1212 | 0 | &emit_context, |
1213 | 0 | re_ast->root_node, |
1214 | 0 | backwards_code ? EMIT_BACKWARDS : 0, |
1215 | 0 | NULL)); |
1216 | |
|
1217 | 0 | FAIL_ON_ERROR(_yr_emit_inst(&emit_context, RE_OPCODE_MATCH, NULL)); |
1218 | |
|
1219 | 0 | return ERROR_SUCCESS; |
1220 | 0 | } |
1221 | | |
1222 | | static int _yr_re_fiber_create(RE_FIBER_POOL* fiber_pool, RE_FIBER** new_fiber) |
1223 | 0 | { |
1224 | 0 | RE_FIBER* fiber; |
1225 | |
|
1226 | 0 | if (fiber_pool->fibers.head != NULL) |
1227 | 0 | { |
1228 | 0 | fiber = fiber_pool->fibers.head; |
1229 | 0 | fiber_pool->fibers.head = fiber->next; |
1230 | |
|
1231 | 0 | if (fiber_pool->fibers.tail == fiber) |
1232 | 0 | fiber_pool->fibers.tail = NULL; |
1233 | 0 | } |
1234 | 0 | else |
1235 | 0 | { |
1236 | 0 | if (fiber_pool->fiber_count == RE_MAX_FIBERS) |
1237 | 0 | return ERROR_TOO_MANY_RE_FIBERS; |
1238 | | |
1239 | 0 | fiber = (RE_FIBER*) yr_malloc(sizeof(RE_FIBER)); |
1240 | |
|
1241 | 0 | if (fiber == NULL) |
1242 | 0 | return ERROR_INSUFFICIENT_MEMORY; |
1243 | | |
1244 | 0 | fiber_pool->fiber_count++; |
1245 | 0 | } |
1246 | | |
1247 | 0 | fiber->ip = NULL; |
1248 | 0 | fiber->sp = -1; |
1249 | 0 | fiber->rc = -1; |
1250 | 0 | fiber->next = NULL; |
1251 | 0 | fiber->prev = NULL; |
1252 | |
|
1253 | 0 | *new_fiber = fiber; |
1254 | |
|
1255 | 0 | return ERROR_SUCCESS; |
1256 | 0 | } |
1257 | | |
1258 | | //////////////////////////////////////////////////////////////////////////////// |
1259 | | // Appends 'fiber' to 'fiber_list' |
1260 | | // |
1261 | | static void _yr_re_fiber_append(RE_FIBER_LIST* fiber_list, RE_FIBER* fiber) |
1262 | 0 | { |
1263 | 0 | assert(fiber->prev == NULL); |
1264 | 0 | assert(fiber->next == NULL); |
1265 | | |
1266 | 0 | fiber->prev = fiber_list->tail; |
1267 | |
|
1268 | 0 | if (fiber_list->tail != NULL) |
1269 | 0 | fiber_list->tail->next = fiber; |
1270 | |
|
1271 | 0 | fiber_list->tail = fiber; |
1272 | |
|
1273 | 0 | if (fiber_list->head == NULL) |
1274 | 0 | fiber_list->head = fiber; |
1275 | |
|
1276 | 0 | assert(fiber_list->tail->next == NULL); |
1277 | 0 | assert(fiber_list->head->prev == NULL); |
1278 | 0 | } |
1279 | | |
1280 | | //////////////////////////////////////////////////////////////////////////////// |
1281 | | // Verifies if a fiber with the same properties (ip, rc, sp, and stack values) |
1282 | | // than 'target_fiber' exists in 'fiber_list'. The list is iterated from |
1283 | | // the start until 'last_fiber' (inclusive). Fibers past 'last_fiber' are not |
1284 | | // taken into account. |
1285 | | // |
1286 | | static int _yr_re_fiber_exists( |
1287 | | RE_FIBER_LIST* fiber_list, |
1288 | | RE_FIBER* target_fiber, |
1289 | | RE_FIBER* last_fiber) |
1290 | 0 | { |
1291 | 0 | RE_FIBER* fiber = fiber_list->head; |
1292 | |
|
1293 | 0 | int equal_stacks; |
1294 | 0 | int i; |
1295 | |
|
1296 | 0 | if (last_fiber == NULL) |
1297 | 0 | return false; |
1298 | | |
1299 | 0 | while (fiber != last_fiber->next) |
1300 | 0 | { |
1301 | 0 | if (fiber->ip == target_fiber->ip && fiber->sp == target_fiber->sp && |
1302 | 0 | fiber->rc == target_fiber->rc) |
1303 | 0 | { |
1304 | 0 | equal_stacks = true; |
1305 | |
|
1306 | 0 | for (i = 0; i <= fiber->sp; i++) |
1307 | 0 | { |
1308 | 0 | if (fiber->stack[i] != target_fiber->stack[i]) |
1309 | 0 | { |
1310 | 0 | equal_stacks = false; |
1311 | 0 | break; |
1312 | 0 | } |
1313 | 0 | } |
1314 | |
|
1315 | 0 | if (equal_stacks) |
1316 | 0 | return true; |
1317 | 0 | } |
1318 | | |
1319 | 0 | fiber = fiber->next; |
1320 | 0 | } |
1321 | | |
1322 | 0 | return false; |
1323 | 0 | } |
1324 | | |
1325 | | //////////////////////////////////////////////////////////////////////////////// |
1326 | | // Clones a fiber in fiber_list and inserts the cloned fiber just after. |
1327 | | // the original one. If fiber_list is: |
1328 | | // |
1329 | | // f1 -> f2 -> f3 -> f4 |
1330 | | // |
1331 | | // Splitting f2 will result in: |
1332 | | // |
1333 | | // f1 -> f2 -> cloned f2 -> f3 -> f4 |
1334 | | // |
1335 | | static int _yr_re_fiber_split( |
1336 | | RE_FIBER_LIST* fiber_list, |
1337 | | RE_FIBER_POOL* fiber_pool, |
1338 | | RE_FIBER* fiber, |
1339 | | RE_FIBER** new_fiber) |
1340 | 0 | { |
1341 | 0 | int32_t i; |
1342 | |
|
1343 | 0 | FAIL_ON_ERROR(_yr_re_fiber_create(fiber_pool, new_fiber)); |
1344 | |
|
1345 | 0 | (*new_fiber)->sp = fiber->sp; |
1346 | 0 | (*new_fiber)->ip = fiber->ip; |
1347 | 0 | (*new_fiber)->rc = fiber->rc; |
1348 | |
|
1349 | 0 | for (i = 0; i <= fiber->sp; i++) (*new_fiber)->stack[i] = fiber->stack[i]; |
1350 | |
|
1351 | 0 | (*new_fiber)->next = fiber->next; |
1352 | 0 | (*new_fiber)->prev = fiber; |
1353 | |
|
1354 | 0 | if (fiber->next != NULL) |
1355 | 0 | fiber->next->prev = *new_fiber; |
1356 | |
|
1357 | 0 | fiber->next = *new_fiber; |
1358 | |
|
1359 | 0 | if (fiber_list->tail == fiber) |
1360 | 0 | fiber_list->tail = *new_fiber; |
1361 | |
|
1362 | 0 | assert(fiber_list->tail->next == NULL); |
1363 | 0 | assert(fiber_list->head->prev == NULL); |
1364 | | |
1365 | 0 | return ERROR_SUCCESS; |
1366 | 0 | } |
1367 | | |
1368 | | //////////////////////////////////////////////////////////////////////////////// |
1369 | | // Kills a given fiber by removing it from the fiber list and putting it in the |
1370 | | // fiber pool. |
1371 | | // |
1372 | | static RE_FIBER* _yr_re_fiber_kill( |
1373 | | RE_FIBER_LIST* fiber_list, |
1374 | | RE_FIBER_POOL* fiber_pool, |
1375 | | RE_FIBER* fiber) |
1376 | 0 | { |
1377 | 0 | RE_FIBER* next_fiber = fiber->next; |
1378 | |
|
1379 | 0 | if (fiber->prev != NULL) |
1380 | 0 | fiber->prev->next = next_fiber; |
1381 | |
|
1382 | 0 | if (next_fiber != NULL) |
1383 | 0 | next_fiber->prev = fiber->prev; |
1384 | |
|
1385 | 0 | if (fiber_pool->fibers.tail != NULL) |
1386 | 0 | fiber_pool->fibers.tail->next = fiber; |
1387 | |
|
1388 | 0 | if (fiber_list->tail == fiber) |
1389 | 0 | fiber_list->tail = fiber->prev; |
1390 | |
|
1391 | 0 | if (fiber_list->head == fiber) |
1392 | 0 | fiber_list->head = next_fiber; |
1393 | |
|
1394 | 0 | fiber->next = NULL; |
1395 | 0 | fiber->prev = fiber_pool->fibers.tail; |
1396 | 0 | fiber_pool->fibers.tail = fiber; |
1397 | |
|
1398 | 0 | if (fiber_pool->fibers.head == NULL) |
1399 | 0 | fiber_pool->fibers.head = fiber; |
1400 | |
|
1401 | 0 | return next_fiber; |
1402 | 0 | } |
1403 | | |
1404 | | //////////////////////////////////////////////////////////////////////////////// |
1405 | | // Kills all fibers from the given one up to the end of the fiber list. |
1406 | | // |
1407 | | static void _yr_re_fiber_kill_tail( |
1408 | | RE_FIBER_LIST* fiber_list, |
1409 | | RE_FIBER_POOL* fiber_pool, |
1410 | | RE_FIBER* fiber) |
1411 | 0 | { |
1412 | 0 | RE_FIBER* prev_fiber = fiber->prev; |
1413 | |
|
1414 | 0 | if (prev_fiber != NULL) |
1415 | 0 | prev_fiber->next = NULL; |
1416 | |
|
1417 | 0 | fiber->prev = fiber_pool->fibers.tail; |
1418 | |
|
1419 | 0 | if (fiber_pool->fibers.tail != NULL) |
1420 | 0 | fiber_pool->fibers.tail->next = fiber; |
1421 | |
|
1422 | 0 | fiber_pool->fibers.tail = fiber_list->tail; |
1423 | 0 | fiber_list->tail = prev_fiber; |
1424 | |
|
1425 | 0 | if (fiber_list->head == fiber) |
1426 | 0 | fiber_list->head = NULL; |
1427 | |
|
1428 | 0 | if (fiber_pool->fibers.head == NULL) |
1429 | 0 | fiber_pool->fibers.head = fiber; |
1430 | 0 | } |
1431 | | |
1432 | | //////////////////////////////////////////////////////////////////////////////// |
1433 | | // Kills all fibers in the fiber list. |
1434 | | // |
1435 | | static void _yr_re_fiber_kill_all( |
1436 | | RE_FIBER_LIST* fiber_list, |
1437 | | RE_FIBER_POOL* fiber_pool) |
1438 | 0 | { |
1439 | 0 | if (fiber_list->head != NULL) |
1440 | 0 | _yr_re_fiber_kill_tail(fiber_list, fiber_pool, fiber_list->head); |
1441 | 0 | } |
1442 | | |
1443 | | //////////////////////////////////////////////////////////////////////////////// |
1444 | | // Executes a fiber until reaching an "matching" instruction. A "matching" |
1445 | | // instruction is one that actually reads a byte from the input and performs |
1446 | | // some matching. If the fiber reaches a split instruction, the new fiber is |
1447 | | // also synced. |
1448 | | // |
1449 | | static int _yr_re_fiber_sync( |
1450 | | RE_FIBER_LIST* fiber_list, |
1451 | | RE_FIBER_POOL* fiber_pool, |
1452 | | RE_FIBER* fiber_to_sync) |
1453 | 0 | { |
1454 | | // A array for keeping track of which split instructions has been already |
1455 | | // executed. Each split instruction within a regexp has an associated ID |
1456 | | // between 0 and RE_MAX_SPLIT_ID-1. Keeping track of executed splits is |
1457 | | // required to avoid infinite loops in regexps like (a*)* or (a|)* |
1458 | |
|
1459 | 0 | RE_SPLIT_ID_TYPE splits_executed[RE_MAX_SPLIT_ID]; |
1460 | 0 | RE_SPLIT_ID_TYPE splits_executed_count = 0; |
1461 | 0 | RE_SPLIT_ID_TYPE split_id, splits_executed_idx; |
1462 | |
|
1463 | 0 | int split_already_executed; |
1464 | |
|
1465 | 0 | RE_REPEAT_ARGS* repeat_args; |
1466 | 0 | RE_REPEAT_ANY_ARGS* repeat_any_args; |
1467 | |
|
1468 | 0 | RE_FIBER* fiber; |
1469 | 0 | RE_FIBER* last; |
1470 | 0 | RE_FIBER* next; |
1471 | 0 | RE_FIBER* branch_a; |
1472 | 0 | RE_FIBER* branch_b; |
1473 | |
|
1474 | 0 | fiber = fiber_to_sync; |
1475 | 0 | last = fiber_to_sync->next; |
1476 | |
|
1477 | 0 | while (fiber != last) |
1478 | 0 | { |
1479 | 0 | int16_t jmp_len; |
1480 | 0 | uint8_t opcode = *fiber->ip; |
1481 | |
|
1482 | 0 | switch (opcode) |
1483 | 0 | { |
1484 | 0 | case RE_OPCODE_SPLIT_A: |
1485 | 0 | case RE_OPCODE_SPLIT_B: |
1486 | |
|
1487 | 0 | split_id = *(RE_SPLIT_ID_TYPE*) (fiber->ip + 1); |
1488 | 0 | split_already_executed = false; |
1489 | |
|
1490 | 0 | for (splits_executed_idx = 0; splits_executed_idx < splits_executed_count; |
1491 | 0 | splits_executed_idx++) |
1492 | 0 | { |
1493 | 0 | if (split_id == splits_executed[splits_executed_idx]) |
1494 | 0 | { |
1495 | 0 | split_already_executed = true; |
1496 | 0 | break; |
1497 | 0 | } |
1498 | 0 | } |
1499 | |
|
1500 | 0 | if (split_already_executed) |
1501 | 0 | { |
1502 | 0 | fiber = _yr_re_fiber_kill(fiber_list, fiber_pool, fiber); |
1503 | 0 | } |
1504 | 0 | else |
1505 | 0 | { |
1506 | 0 | branch_a = fiber; |
1507 | |
|
1508 | 0 | FAIL_ON_ERROR( |
1509 | 0 | _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b)); |
1510 | | |
1511 | | // With RE_OPCODE_SPLIT_A the current fiber continues at the next |
1512 | | // instruction in the stream (branch A), while the newly created |
1513 | | // fiber starts at the address indicated by the instruction (branch B) |
1514 | | // RE_OPCODE_SPLIT_B has the opposite behavior. |
1515 | |
|
1516 | 0 | if (opcode == RE_OPCODE_SPLIT_B) |
1517 | 0 | yr_swap(branch_a, branch_b, RE_FIBER*); |
1518 | | |
1519 | | // Branch A continues at the next instruction |
1520 | 0 | branch_a->ip += (sizeof(RE_SPLIT_ID_TYPE) + 3); |
1521 | | |
1522 | | // Branch B adds the offset indicated by the split instruction. |
1523 | 0 | jmp_len = yr_unaligned_i16(branch_b->ip + 1 + sizeof(RE_SPLIT_ID_TYPE)); |
1524 | |
|
1525 | 0 | branch_b->ip += jmp_len; |
1526 | |
|
1527 | | #ifdef YR_PARANOID_MODE |
1528 | | // In normal conditions this should never happen. But with compiled |
1529 | | // rules that has been hand-crafted by a malicious actor this could |
1530 | | // happen. |
1531 | | if (splits_executed_count >= RE_MAX_SPLIT_ID) |
1532 | | return ERROR_INTERNAL_FATAL_ERROR; |
1533 | | #endif |
1534 | |
|
1535 | 0 | splits_executed[splits_executed_count] = split_id; |
1536 | 0 | splits_executed_count++; |
1537 | 0 | } |
1538 | | |
1539 | 0 | break; |
1540 | | |
1541 | 0 | case RE_OPCODE_REPEAT_START_GREEDY: |
1542 | 0 | case RE_OPCODE_REPEAT_START_UNGREEDY: |
1543 | |
|
1544 | 0 | repeat_args = (RE_REPEAT_ARGS*) (fiber->ip + 1); |
1545 | 0 | assert(repeat_args->max > 0); |
1546 | 0 | branch_a = fiber; |
1547 | |
|
1548 | 0 | if (repeat_args->min == 0) |
1549 | 0 | { |
1550 | 0 | FAIL_ON_ERROR( |
1551 | 0 | _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b)); |
1552 | |
|
1553 | 0 | if (opcode == RE_OPCODE_REPEAT_START_UNGREEDY) |
1554 | 0 | yr_swap(branch_a, branch_b, RE_FIBER*); |
1555 | |
|
1556 | 0 | branch_b->ip += repeat_args->offset; |
1557 | 0 | } |
1558 | | |
1559 | 0 | branch_a->stack[++branch_a->sp] = 0; |
1560 | 0 | branch_a->ip += (1 + sizeof(RE_REPEAT_ARGS)); |
1561 | 0 | break; |
1562 | | |
1563 | 0 | case RE_OPCODE_REPEAT_END_GREEDY: |
1564 | 0 | case RE_OPCODE_REPEAT_END_UNGREEDY: |
1565 | |
|
1566 | 0 | repeat_args = (RE_REPEAT_ARGS*) (fiber->ip + 1); |
1567 | 0 | fiber->stack[fiber->sp]++; |
1568 | |
|
1569 | 0 | if (fiber->stack[fiber->sp] < repeat_args->min) |
1570 | 0 | { |
1571 | 0 | fiber->ip += repeat_args->offset; |
1572 | 0 | break; |
1573 | 0 | } |
1574 | | |
1575 | 0 | branch_a = fiber; |
1576 | |
|
1577 | 0 | if (fiber->stack[fiber->sp] < repeat_args->max) |
1578 | 0 | { |
1579 | 0 | FAIL_ON_ERROR( |
1580 | 0 | _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b)); |
1581 | |
|
1582 | 0 | if (opcode == RE_OPCODE_REPEAT_END_GREEDY) |
1583 | 0 | yr_swap(branch_a, branch_b, RE_FIBER*); |
1584 | |
|
1585 | 0 | branch_b->ip += repeat_args->offset; |
1586 | 0 | } |
1587 | | |
1588 | 0 | branch_a->sp--; |
1589 | 0 | branch_a->ip += (1 + sizeof(RE_REPEAT_ARGS)); |
1590 | 0 | break; |
1591 | | |
1592 | 0 | case RE_OPCODE_REPEAT_ANY_GREEDY: |
1593 | 0 | case RE_OPCODE_REPEAT_ANY_UNGREEDY: |
1594 | |
|
1595 | 0 | repeat_any_args = (RE_REPEAT_ANY_ARGS*) (fiber->ip + 1); |
1596 | | |
1597 | | // If repetition counter (rc) is -1 it means that we are reaching this |
1598 | | // instruction from the previous one in the instructions stream. In |
1599 | | // this case let's initialize the counter to 0 and start looping. |
1600 | |
|
1601 | 0 | if (fiber->rc == -1) |
1602 | 0 | fiber->rc = 0; |
1603 | |
|
1604 | 0 | if (fiber->rc < repeat_any_args->min) |
1605 | 0 | { |
1606 | | // Increase repetition counter and continue with next fiber. The |
1607 | | // instruction pointer for this fiber is not incremented yet, this |
1608 | | // fiber spins in this same instruction until reaching the minimum |
1609 | | // number of repetitions. |
1610 | |
|
1611 | 0 | fiber->rc++; |
1612 | 0 | fiber = fiber->next; |
1613 | 0 | } |
1614 | 0 | else if (fiber->rc < repeat_any_args->max) |
1615 | 0 | { |
1616 | | // Once the minimum number of repetitions are matched one fiber |
1617 | | // remains spinning in this instruction until reaching the maximum |
1618 | | // number of repetitions while new fibers are created. New fibers |
1619 | | // start executing at the next instruction. |
1620 | |
|
1621 | 0 | next = fiber->next; |
1622 | 0 | branch_a = fiber; |
1623 | |
|
1624 | 0 | FAIL_ON_ERROR( |
1625 | 0 | _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b)); |
1626 | |
|
1627 | 0 | if (opcode == RE_OPCODE_REPEAT_ANY_UNGREEDY) |
1628 | 0 | yr_swap(branch_a, branch_b, RE_FIBER*); |
1629 | |
|
1630 | 0 | branch_a->rc++; |
1631 | 0 | branch_b->ip += (1 + sizeof(RE_REPEAT_ANY_ARGS)); |
1632 | 0 | branch_b->rc = -1; |
1633 | |
|
1634 | 0 | FAIL_ON_ERROR(_yr_re_fiber_sync(fiber_list, fiber_pool, branch_b)); |
1635 | |
|
1636 | 0 | fiber = next; |
1637 | 0 | } |
1638 | 0 | else |
1639 | 0 | { |
1640 | | // When the maximum number of repetitions is reached the fiber keeps |
1641 | | // executing at the next instruction. The repetition counter is set |
1642 | | // to -1 indicating that we are not spinning in a repeat instruction |
1643 | | // anymore. |
1644 | |
|
1645 | 0 | fiber->ip += (1 + sizeof(RE_REPEAT_ANY_ARGS)); |
1646 | 0 | fiber->rc = -1; |
1647 | 0 | } |
1648 | | |
1649 | 0 | break; |
1650 | | |
1651 | 0 | case RE_OPCODE_JUMP: |
1652 | 0 | jmp_len = yr_unaligned_i16(fiber->ip + 1); |
1653 | 0 | fiber->ip += jmp_len; |
1654 | 0 | break; |
1655 | | |
1656 | 0 | default: |
1657 | 0 | fiber = fiber->next; |
1658 | 0 | } |
1659 | 0 | } |
1660 | | |
1661 | 0 | return ERROR_SUCCESS; |
1662 | 0 | } |
1663 | | |
1664 | | //////////////////////////////////////////////////////////////////////////////// |
1665 | | // Executes a regular expression. The specified regular expression will try to |
1666 | | // match the data starting at the address specified by "input". The "input" |
1667 | | // pointer can point to any address inside a memory buffer. Arguments |
1668 | | // "input_forwards_size" and "input_backwards_size" indicate how many bytes |
1669 | | // can be accessible starting at "input" and going forwards and backwards |
1670 | | // respectively. |
1671 | | // |
1672 | | // <--- input_backwards_size -->|<----------- input_forwards_size --------> |
1673 | | // |-------- memory buffer -----------------------------------------------| |
1674 | | // ^ |
1675 | | // input |
1676 | | // |
1677 | | // Args: |
1678 | | // YR_SCAN_CONTEXT *context - Scan context. |
1679 | | // const uint8_t* code - Regexp code be executed |
1680 | | // const uint8_t* input - Pointer to input data |
1681 | | // size_t input_forwards_size - Number of accessible bytes starting at |
1682 | | // "input" and going forwards. |
1683 | | // size_t input_backwards_size - Number of accessible bytes starting at |
1684 | | // "input" and going backwards |
1685 | | // int flags - Flags: |
1686 | | // RE_FLAGS_SCAN |
1687 | | // RE_FLAGS_BACKWARDS |
1688 | | // RE_FLAGS_EXHAUSTIVE |
1689 | | // RE_FLAGS_WIDE |
1690 | | // RE_FLAGS_NO_CASE |
1691 | | // RE_FLAGS_DOT_ALL |
1692 | | // RE_MATCH_CALLBACK_FUNC callback - Callback function |
1693 | | // void* callback_args - Callback argument |
1694 | | // int* matches - Pointer to an integer receiving the |
1695 | | // number of matching bytes. Notice that |
1696 | | // 0 means a zero-length match, while no |
1697 | | // matches is -1. |
1698 | | // Returns: |
1699 | | // ERROR_SUCCESS or any other error code. |
1700 | | |
1701 | | int yr_re_exec( |
1702 | | YR_SCAN_CONTEXT* context, |
1703 | | const uint8_t* code, |
1704 | | const uint8_t* input_data, |
1705 | | size_t input_forwards_size, |
1706 | | size_t input_backwards_size, |
1707 | | int flags, |
1708 | | RE_MATCH_CALLBACK_FUNC callback, |
1709 | | void* callback_args, |
1710 | | int* matches) |
1711 | 0 | { |
1712 | 0 | const uint8_t* input; |
1713 | 0 | const uint8_t* ip; |
1714 | |
|
1715 | 0 | uint16_t opcode_args; |
1716 | 0 | uint8_t mask; |
1717 | 0 | uint8_t value; |
1718 | 0 | uint8_t character_size; |
1719 | |
|
1720 | 0 | RE_FIBER_LIST fibers; |
1721 | 0 | RE_FIBER* fiber; |
1722 | 0 | RE_FIBER* next_fiber; |
1723 | |
|
1724 | 0 | int bytes_matched; |
1725 | 0 | int max_bytes_matched; |
1726 | |
|
1727 | 0 | int match; |
1728 | 0 | int input_incr; |
1729 | 0 | int kill; |
1730 | 0 | int action; |
1731 | |
|
1732 | 0 | bool prev_is_word_char = false; |
1733 | 0 | bool input_is_word_char = false; |
1734 | |
|
1735 | 0 | #define ACTION_NONE 0 |
1736 | 0 | #define ACTION_CONTINUE 1 |
1737 | 0 | #define ACTION_KILL 2 |
1738 | 0 | #define ACTION_KILL_TAIL 3 |
1739 | |
|
1740 | 0 | #define prolog \ |
1741 | 0 | { \ |
1742 | 0 | if ((bytes_matched >= max_bytes_matched) || \ |
1743 | 0 | (character_size == 2 && *(input + 1) != 0)) \ |
1744 | 0 | { \ |
1745 | 0 | action = ACTION_KILL; \ |
1746 | 0 | break; \ |
1747 | 0 | } \ |
1748 | 0 | } |
1749 | |
|
1750 | 0 | if (matches != NULL) |
1751 | 0 | *matches = -1; |
1752 | |
|
1753 | 0 | if (flags & RE_FLAGS_WIDE) |
1754 | 0 | character_size = 2; |
1755 | 0 | else |
1756 | 0 | character_size = 1; |
1757 | |
|
1758 | 0 | input = input_data; |
1759 | 0 | input_incr = character_size; |
1760 | |
|
1761 | 0 | if (flags & RE_FLAGS_BACKWARDS) |
1762 | 0 | { |
1763 | | // Signedness conversion is sound as long as YR_RE_SCAN_LIMIT <= INT_MAX |
1764 | 0 | max_bytes_matched = (int) yr_min(input_backwards_size, YR_RE_SCAN_LIMIT); |
1765 | 0 | input -= character_size; |
1766 | 0 | input_incr = -input_incr; |
1767 | 0 | } |
1768 | 0 | else |
1769 | 0 | { |
1770 | | // Signedness conversion is sound as long as YR_RE_SCAN_LIMIT <= INT_MAX |
1771 | 0 | max_bytes_matched = (int) yr_min(input_forwards_size, YR_RE_SCAN_LIMIT); |
1772 | 0 | } |
1773 | | |
1774 | | // Round down max_bytes_matched to a multiple of character_size, this way if |
1775 | | // character_size is 2 and max_bytes_matched is odd we are ignoring the |
1776 | | // extra byte which can't match anyways. |
1777 | |
|
1778 | 0 | max_bytes_matched = max_bytes_matched - max_bytes_matched % character_size; |
1779 | 0 | bytes_matched = 0; |
1780 | |
|
1781 | 0 | FAIL_ON_ERROR(_yr_re_fiber_create(&context->re_fiber_pool, &fiber)); |
1782 | |
|
1783 | 0 | fiber->ip = code; |
1784 | 0 | fibers.head = fiber; |
1785 | 0 | fibers.tail = fiber; |
1786 | |
|
1787 | 0 | FAIL_ON_ERROR_WITH_CLEANUP( |
1788 | 0 | _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber), |
1789 | 0 | _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool)); |
1790 | |
|
1791 | 0 | while (fibers.head != NULL) |
1792 | 0 | { |
1793 | 0 | fiber = fibers.head; |
1794 | |
|
1795 | 0 | while (fiber != NULL) |
1796 | 0 | { |
1797 | 0 | next_fiber = fiber->next; |
1798 | |
|
1799 | 0 | if (_yr_re_fiber_exists(&fibers, fiber, fiber->prev)) |
1800 | 0 | _yr_re_fiber_kill(&fibers, &context->re_fiber_pool, fiber); |
1801 | |
|
1802 | 0 | fiber = next_fiber; |
1803 | 0 | } |
1804 | |
|
1805 | 0 | fiber = fibers.head; |
1806 | |
|
1807 | 0 | while (fiber != NULL) |
1808 | 0 | { |
1809 | 0 | ip = fiber->ip; |
1810 | 0 | action = ACTION_NONE; |
1811 | |
|
1812 | 0 | switch (*ip) |
1813 | 0 | { |
1814 | 0 | case RE_OPCODE_ANY: |
1815 | 0 | prolog; |
1816 | 0 | match = (flags & RE_FLAGS_DOT_ALL) || (*input != 0x0A); |
1817 | 0 | action = match ? ACTION_NONE : ACTION_KILL; |
1818 | 0 | fiber->ip += 1; |
1819 | 0 | break; |
1820 | | |
1821 | 0 | case RE_OPCODE_REPEAT_ANY_GREEDY: |
1822 | 0 | case RE_OPCODE_REPEAT_ANY_UNGREEDY: |
1823 | 0 | prolog; |
1824 | 0 | match = (flags & RE_FLAGS_DOT_ALL) || (*input != 0x0A); |
1825 | 0 | action = match ? ACTION_NONE : ACTION_KILL; |
1826 | | |
1827 | | // The instruction pointer is not incremented here. The current fiber |
1828 | | // spins in this instruction until reaching the required number of |
1829 | | // repetitions. The code controlling the number of repetitions is in |
1830 | | // _yr_re_fiber_sync. |
1831 | |
|
1832 | 0 | break; |
1833 | | |
1834 | 0 | case RE_OPCODE_LITERAL: |
1835 | 0 | prolog; |
1836 | 0 | if (flags & RE_FLAGS_NO_CASE) |
1837 | 0 | match = yr_lowercase[*input] == yr_lowercase[*(ip + 1)]; |
1838 | 0 | else |
1839 | 0 | match = (*input == *(ip + 1)); |
1840 | 0 | action = match ? ACTION_NONE : ACTION_KILL; |
1841 | 0 | fiber->ip += 2; |
1842 | 0 | break; |
1843 | | |
1844 | 0 | case RE_OPCODE_NOT_LITERAL: |
1845 | 0 | prolog; |
1846 | | |
1847 | | // We don't need to take into account the case-insensitive |
1848 | | // case because this opcode is only used with hex strings, |
1849 | | // which can't be case-insensitive. |
1850 | |
|
1851 | 0 | match = (*input != *(ip + 1)); |
1852 | 0 | action = match ? ACTION_NONE : ACTION_KILL; |
1853 | 0 | fiber->ip += 2; |
1854 | 0 | break; |
1855 | | |
1856 | 0 | case RE_OPCODE_MASKED_LITERAL: |
1857 | 0 | prolog; |
1858 | 0 | opcode_args = yr_unaligned_u16(ip + 1); |
1859 | 0 | mask = opcode_args >> 8; |
1860 | 0 | value = opcode_args & 0xFF; |
1861 | | |
1862 | | // We don't need to take into account the case-insensitive |
1863 | | // case because this opcode is only used with hex strings, |
1864 | | // which can't be case-insensitive. |
1865 | |
|
1866 | 0 | match = ((*input & mask) == value); |
1867 | 0 | action = match ? ACTION_NONE : ACTION_KILL; |
1868 | 0 | fiber->ip += 3; |
1869 | 0 | break; |
1870 | | |
1871 | 0 | case RE_OPCODE_MASKED_NOT_LITERAL: |
1872 | 0 | prolog; |
1873 | 0 | opcode_args = yr_unaligned_u16(ip + 1); |
1874 | 0 | mask = opcode_args >> 8; |
1875 | 0 | value = opcode_args & 0xFF; |
1876 | | |
1877 | | // We don't need to take into account the case-insensitive |
1878 | | // case because this opcode is only used with hex strings, |
1879 | | // which can't be case-insensitive. |
1880 | |
|
1881 | 0 | match = ((*input & mask) != value); |
1882 | 0 | action = match ? ACTION_NONE : ACTION_KILL; |
1883 | 0 | fiber->ip += 3; |
1884 | 0 | break; |
1885 | | |
1886 | 0 | case RE_OPCODE_CLASS: |
1887 | 0 | prolog; |
1888 | 0 | match = _yr_re_is_char_in_class( |
1889 | 0 | (RE_CLASS*) (ip + 1), *input, flags & RE_FLAGS_NO_CASE); |
1890 | 0 | action = match ? ACTION_NONE : ACTION_KILL; |
1891 | 0 | fiber->ip += (sizeof(RE_CLASS) + 1); |
1892 | 0 | break; |
1893 | | |
1894 | 0 | case RE_OPCODE_WORD_CHAR: |
1895 | 0 | prolog; |
1896 | 0 | match = _yr_re_is_word_char(input, character_size); |
1897 | 0 | action = match ? ACTION_NONE : ACTION_KILL; |
1898 | 0 | fiber->ip += 1; |
1899 | 0 | break; |
1900 | | |
1901 | 0 | case RE_OPCODE_NON_WORD_CHAR: |
1902 | 0 | prolog; |
1903 | 0 | match = !_yr_re_is_word_char(input, character_size); |
1904 | 0 | action = match ? ACTION_NONE : ACTION_KILL; |
1905 | 0 | fiber->ip += 1; |
1906 | 0 | break; |
1907 | | |
1908 | 0 | case RE_OPCODE_SPACE: |
1909 | 0 | case RE_OPCODE_NON_SPACE: |
1910 | |
|
1911 | 0 | prolog; |
1912 | |
|
1913 | 0 | switch (*input) |
1914 | 0 | { |
1915 | 0 | case ' ': |
1916 | 0 | case '\t': |
1917 | 0 | case '\r': |
1918 | 0 | case '\n': |
1919 | 0 | case '\v': |
1920 | 0 | case '\f': |
1921 | 0 | match = true; |
1922 | 0 | break; |
1923 | 0 | default: |
1924 | 0 | match = false; |
1925 | 0 | } |
1926 | | |
1927 | 0 | if (*ip == RE_OPCODE_NON_SPACE) |
1928 | 0 | match = !match; |
1929 | |
|
1930 | 0 | action = match ? ACTION_NONE : ACTION_KILL; |
1931 | 0 | fiber->ip += 1; |
1932 | 0 | break; |
1933 | | |
1934 | 0 | case RE_OPCODE_DIGIT: |
1935 | 0 | prolog; |
1936 | 0 | match = isdigit(*input); |
1937 | 0 | action = match ? ACTION_NONE : ACTION_KILL; |
1938 | 0 | fiber->ip += 1; |
1939 | 0 | break; |
1940 | | |
1941 | 0 | case RE_OPCODE_NON_DIGIT: |
1942 | 0 | prolog; |
1943 | 0 | match = !isdigit(*input); |
1944 | 0 | action = match ? ACTION_NONE : ACTION_KILL; |
1945 | 0 | fiber->ip += 1; |
1946 | 0 | break; |
1947 | | |
1948 | 0 | case RE_OPCODE_WORD_BOUNDARY: |
1949 | 0 | case RE_OPCODE_NON_WORD_BOUNDARY: |
1950 | 0 | if (input - input_incr + character_size <= |
1951 | 0 | input_data + input_forwards_size && |
1952 | 0 | input - input_incr >= input_data - input_backwards_size) |
1953 | 0 | { |
1954 | 0 | prev_is_word_char = _yr_re_is_word_char( |
1955 | 0 | input - input_incr, character_size); |
1956 | 0 | } |
1957 | 0 | else |
1958 | 0 | { |
1959 | 0 | prev_is_word_char = false; |
1960 | 0 | } |
1961 | |
|
1962 | 0 | if (input + character_size <= input_data + input_forwards_size && |
1963 | 0 | input >= input_data - input_backwards_size) |
1964 | 0 | { |
1965 | 0 | input_is_word_char = _yr_re_is_word_char(input, character_size); |
1966 | 0 | } |
1967 | 0 | else |
1968 | 0 | { |
1969 | 0 | input_is_word_char = false; |
1970 | 0 | } |
1971 | |
|
1972 | 0 | match = (prev_is_word_char && !input_is_word_char) || |
1973 | 0 | (!prev_is_word_char && input_is_word_char); |
1974 | |
|
1975 | 0 | if (*ip == RE_OPCODE_NON_WORD_BOUNDARY) |
1976 | 0 | match = !match; |
1977 | |
|
1978 | 0 | action = match ? ACTION_CONTINUE : ACTION_KILL; |
1979 | 0 | fiber->ip += 1; |
1980 | 0 | break; |
1981 | | |
1982 | 0 | case RE_OPCODE_MATCH_AT_START: |
1983 | 0 | if (flags & RE_FLAGS_BACKWARDS) |
1984 | 0 | kill = input_backwards_size > (size_t) bytes_matched; |
1985 | 0 | else |
1986 | 0 | kill = input_backwards_size > 0 || (bytes_matched != 0); |
1987 | 0 | action = kill ? ACTION_KILL : ACTION_CONTINUE; |
1988 | 0 | fiber->ip += 1; |
1989 | 0 | break; |
1990 | | |
1991 | 0 | case RE_OPCODE_MATCH_AT_END: |
1992 | 0 | kill = flags & RE_FLAGS_BACKWARDS || |
1993 | 0 | input_forwards_size > (size_t) bytes_matched; |
1994 | 0 | action = kill ? ACTION_KILL : ACTION_CONTINUE; |
1995 | 0 | fiber->ip += 1; |
1996 | 0 | break; |
1997 | | |
1998 | 0 | case RE_OPCODE_MATCH: |
1999 | |
|
2000 | 0 | if (matches != NULL) |
2001 | 0 | *matches = bytes_matched; |
2002 | |
|
2003 | 0 | if (flags & RE_FLAGS_EXHAUSTIVE) |
2004 | 0 | { |
2005 | 0 | if (callback != NULL) |
2006 | 0 | { |
2007 | 0 | if (flags & RE_FLAGS_BACKWARDS) |
2008 | 0 | { |
2009 | 0 | FAIL_ON_ERROR_WITH_CLEANUP( |
2010 | 0 | callback( |
2011 | 0 | input + character_size, |
2012 | 0 | bytes_matched, |
2013 | 0 | flags, |
2014 | 0 | callback_args), |
2015 | 0 | _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool)); |
2016 | 0 | } |
2017 | 0 | else |
2018 | 0 | { |
2019 | 0 | FAIL_ON_ERROR_WITH_CLEANUP( |
2020 | 0 | callback(input_data, bytes_matched, flags, callback_args), |
2021 | 0 | _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool)); |
2022 | 0 | } |
2023 | 0 | } |
2024 | | |
2025 | 0 | action = ACTION_KILL; |
2026 | 0 | } |
2027 | 0 | else |
2028 | 0 | { |
2029 | 0 | action = ACTION_KILL_TAIL; |
2030 | 0 | } |
2031 | | |
2032 | 0 | break; |
2033 | | |
2034 | 0 | default: |
2035 | 0 | assert(false); |
2036 | 0 | } |
2037 | | |
2038 | 0 | switch (action) |
2039 | 0 | { |
2040 | 0 | case ACTION_KILL: |
2041 | 0 | fiber = _yr_re_fiber_kill(&fibers, &context->re_fiber_pool, fiber); |
2042 | 0 | break; |
2043 | | |
2044 | 0 | case ACTION_KILL_TAIL: |
2045 | 0 | _yr_re_fiber_kill_tail(&fibers, &context->re_fiber_pool, fiber); |
2046 | 0 | fiber = NULL; |
2047 | 0 | break; |
2048 | | |
2049 | 0 | case ACTION_CONTINUE: |
2050 | 0 | FAIL_ON_ERROR_WITH_CLEANUP( |
2051 | 0 | _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber), |
2052 | 0 | _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool)); |
2053 | 0 | break; |
2054 | | |
2055 | 0 | default: |
2056 | 0 | next_fiber = fiber->next; |
2057 | 0 | FAIL_ON_ERROR_WITH_CLEANUP( |
2058 | 0 | _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber), |
2059 | 0 | _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool)); |
2060 | 0 | fiber = next_fiber; |
2061 | 0 | } |
2062 | 0 | } |
2063 | | |
2064 | 0 | input += input_incr; |
2065 | 0 | bytes_matched += character_size; |
2066 | |
|
2067 | 0 | if (flags & RE_FLAGS_SCAN && bytes_matched < max_bytes_matched) |
2068 | 0 | { |
2069 | 0 | FAIL_ON_ERROR_WITH_CLEANUP( |
2070 | 0 | _yr_re_fiber_create(&context->re_fiber_pool, &fiber), |
2071 | 0 | _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool)); |
2072 | |
|
2073 | 0 | fiber->ip = code; |
2074 | 0 | _yr_re_fiber_append(&fibers, fiber); |
2075 | |
|
2076 | 0 | FAIL_ON_ERROR_WITH_CLEANUP( |
2077 | 0 | _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber), |
2078 | 0 | _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool)); |
2079 | 0 | } |
2080 | 0 | } |
2081 | | |
2082 | 0 | return ERROR_SUCCESS; |
2083 | 0 | } |
2084 | | |
2085 | | //////////////////////////////////////////////////////////////////////////////// |
2086 | | // Helper function that creates a RE_FAST_EXEC_POSITION by either allocating it |
2087 | | // or reusing a previously allocated one from a pool. |
2088 | | // |
2089 | | static int _yr_re_fast_exec_position_create( |
2090 | | RE_FAST_EXEC_POSITION_POOL* pool, |
2091 | | RE_FAST_EXEC_POSITION** new_position) |
2092 | 0 | { |
2093 | 0 | RE_FAST_EXEC_POSITION* position; |
2094 | |
|
2095 | 0 | if (pool->head != NULL) |
2096 | 0 | { |
2097 | 0 | position = pool->head; |
2098 | 0 | pool->head = position->next; |
2099 | 0 | } |
2100 | 0 | else |
2101 | 0 | { |
2102 | 0 | position = (RE_FAST_EXEC_POSITION*) yr_malloc( |
2103 | 0 | sizeof(RE_FAST_EXEC_POSITION)); |
2104 | |
|
2105 | 0 | if (position == NULL) |
2106 | 0 | return ERROR_INSUFFICIENT_MEMORY; |
2107 | 0 | } |
2108 | | |
2109 | 0 | position->input = NULL; |
2110 | 0 | position->round = 0; |
2111 | 0 | position->next = NULL; |
2112 | 0 | position->prev = NULL; |
2113 | |
|
2114 | 0 | *new_position = position; |
2115 | |
|
2116 | 0 | return ERROR_SUCCESS; |
2117 | 0 | } |
2118 | | |
2119 | | //////////////////////////////////////////////////////////////////////////////// |
2120 | | // Helper function that moves a list of RE_FAST_EXEC_POSITION structures to a |
2121 | | // pool represented by a RE_FAST_EXEC_POSITION_POOL. Receives pointers to the |
2122 | | // pool, the first item, and last item in the list. |
2123 | | // |
2124 | | static void _yr_re_fast_exec_destroy_position_list( |
2125 | | RE_FAST_EXEC_POSITION_POOL* pool, |
2126 | | RE_FAST_EXEC_POSITION* first, |
2127 | | RE_FAST_EXEC_POSITION* last) |
2128 | 0 | { |
2129 | 0 | last->next = pool->head; |
2130 | |
|
2131 | 0 | if (pool->head != NULL) |
2132 | 0 | pool->head->prev = last; |
2133 | |
|
2134 | 0 | pool->head = first; |
2135 | 0 | } |
2136 | | |
2137 | | //////////////////////////////////////////////////////////////////////////////// |
2138 | | // This function replaces yr_re_exec for regular expressions marked with flag |
2139 | | // RE_FLAGS_FAST_REGEXP. These regular expressions are derived from hex strings |
2140 | | // that don't contain alternatives, like for example: |
2141 | | // |
2142 | | // { 01 02 03 04 [0-2] 04 05 06 07 } |
2143 | | // |
2144 | | // The regexp's code produced by such strings can be matched by a faster, less |
2145 | | // general algorithm, and it contains only the following opcodes: |
2146 | | // |
2147 | | // * RE_OPCODE_LITERAL |
2148 | | // * RE_OPCODE_MASKED_LITERAL, |
2149 | | // * RE_OPCODE_NOT_LITERAL |
2150 | | // * RE_OPCODE_MASKED_NOT_LITERAL |
2151 | | // * RE_OPCODE_ANY |
2152 | | // * RE_OPCODE_REPEAT_ANY_UNGREEDY |
2153 | | // * RE_OPCODE_MATCH. |
2154 | | // |
2155 | | // The regexp's code is executed one instruction at time, and the function |
2156 | | // maintains a list of positions within the input for tracking different |
2157 | | // matching alternatives, these positions are described by an instance of |
2158 | | // RE_FAST_EXEC_POSITION. With each instruction the list of positions is |
2159 | | // updated, removing those where the input data doesn't match the current |
2160 | | // instruction, or adding new positions if the instruction is |
2161 | | // RE_OPCODE_REPEAT_ANY_UNGREEDY. The list of positions is maintained sorted |
2162 | | // by the value of the pointer they hold to the input, and it doesn't contain |
2163 | | // duplicated pointer values. |
2164 | | // |
2165 | | int yr_re_fast_exec( |
2166 | | YR_SCAN_CONTEXT* context, |
2167 | | const uint8_t* code, |
2168 | | const uint8_t* input_data, |
2169 | | size_t input_forwards_size, |
2170 | | size_t input_backwards_size, |
2171 | | int flags, |
2172 | | RE_MATCH_CALLBACK_FUNC callback, |
2173 | | void* callback_args, |
2174 | | int* matches) |
2175 | 0 | { |
2176 | 0 | RE_REPEAT_ANY_ARGS* repeat_any_args; |
2177 | | |
2178 | | // Pointers to the first and last position in the list. |
2179 | 0 | RE_FAST_EXEC_POSITION* first; |
2180 | 0 | RE_FAST_EXEC_POSITION* last; |
2181 | |
|
2182 | 0 | int input_incr = flags & RE_FLAGS_BACKWARDS ? -1 : 1; |
2183 | 0 | int bytes_matched; |
2184 | 0 | int max_bytes_matched; |
2185 | |
|
2186 | 0 | if (flags & RE_FLAGS_BACKWARDS) |
2187 | 0 | max_bytes_matched = (int) yr_min(input_backwards_size, YR_RE_SCAN_LIMIT); |
2188 | 0 | else |
2189 | 0 | max_bytes_matched = (int) yr_min(input_forwards_size, YR_RE_SCAN_LIMIT); |
2190 | |
|
2191 | 0 | const uint8_t* ip = code; |
2192 | | |
2193 | | // Create the first position in the list, which points to the start of the |
2194 | | // input data. Intially this is the only position, more positions will be |
2195 | | // created every time RE_OPCODE_REPEAT_ANY_UNGREEDY is executed. |
2196 | 0 | FAIL_ON_ERROR(_yr_re_fast_exec_position_create( |
2197 | 0 | &context->re_fast_exec_position_pool, &first)); |
2198 | |
|
2199 | 0 | first->round = 0; |
2200 | 0 | first->input = input_data; |
2201 | 0 | first->prev = NULL; |
2202 | 0 | first->next = NULL; |
2203 | |
|
2204 | 0 | if (flags & RE_FLAGS_BACKWARDS) |
2205 | 0 | first->input--; |
2206 | | |
2207 | | // As we are starting with a single position, the last one and the first one |
2208 | | // are the same. |
2209 | 0 | last = first; |
2210 | | |
2211 | | // Round is incremented with every regxp instruction. |
2212 | 0 | int round = 0; |
2213 | | |
2214 | | // Keep in the loop until the list of positions gets empty. Positions are |
2215 | | // removed from the list when they don't match the current instruction in |
2216 | | // the code. |
2217 | 0 | while (first != NULL) |
2218 | 0 | { |
2219 | 0 | RE_FAST_EXEC_POSITION* current = first; |
2220 | | |
2221 | | // Iterate over all the positions, executing the current instruction at |
2222 | | // all of them. |
2223 | 0 | while (current != NULL) |
2224 | 0 | { |
2225 | 0 | RE_FAST_EXEC_POSITION* next = current->next; |
2226 | | |
2227 | | // Ignore any position in the list whose round number is different from |
2228 | | // the current round. This prevents new positions added to the list in |
2229 | | // this round from being taken into account in this same round. |
2230 | 0 | if (current->round != round) |
2231 | 0 | { |
2232 | 0 | current = next; |
2233 | 0 | continue; |
2234 | 0 | } |
2235 | | |
2236 | 0 | bytes_matched = flags & RE_FLAGS_BACKWARDS |
2237 | 0 | ? (int) (input_data - current->input - 1) |
2238 | 0 | : (int) (current->input - input_data); |
2239 | |
|
2240 | 0 | uint16_t opcode_args; |
2241 | 0 | uint8_t mask; |
2242 | 0 | uint8_t value; |
2243 | |
|
2244 | 0 | bool match = false; |
2245 | |
|
2246 | 0 | switch (*ip) |
2247 | 0 | { |
2248 | 0 | case RE_OPCODE_ANY: |
2249 | 0 | if (bytes_matched >= max_bytes_matched) |
2250 | 0 | break; |
2251 | | |
2252 | 0 | match = true; |
2253 | 0 | current->input += input_incr; |
2254 | 0 | break; |
2255 | | |
2256 | 0 | case RE_OPCODE_LITERAL: |
2257 | 0 | if (bytes_matched >= max_bytes_matched) |
2258 | 0 | break; |
2259 | | |
2260 | 0 | if (*current->input == *(ip + 1)) |
2261 | 0 | { |
2262 | 0 | match = true; |
2263 | 0 | current->input += input_incr; |
2264 | 0 | } |
2265 | 0 | break; |
2266 | | |
2267 | 0 | case RE_OPCODE_NOT_LITERAL: |
2268 | 0 | if (bytes_matched >= max_bytes_matched) |
2269 | 0 | break; |
2270 | | |
2271 | 0 | if (*current->input != *(ip + 1)) |
2272 | 0 | { |
2273 | 0 | match = true; |
2274 | 0 | current->input += input_incr; |
2275 | 0 | } |
2276 | 0 | break; |
2277 | | |
2278 | 0 | case RE_OPCODE_MASKED_LITERAL: |
2279 | 0 | if (bytes_matched >= max_bytes_matched) |
2280 | 0 | break; |
2281 | | |
2282 | 0 | opcode_args = yr_unaligned_u16(ip + 1); |
2283 | 0 | mask = opcode_args >> 8; |
2284 | 0 | value = opcode_args & 0xFF; |
2285 | |
|
2286 | 0 | if ((*current->input & mask) == value) |
2287 | 0 | { |
2288 | 0 | match = true; |
2289 | 0 | current->input += input_incr; |
2290 | 0 | } |
2291 | 0 | break; |
2292 | | |
2293 | 0 | case RE_OPCODE_MASKED_NOT_LITERAL: |
2294 | 0 | if (bytes_matched >= max_bytes_matched) |
2295 | 0 | break; |
2296 | | |
2297 | 0 | opcode_args = yr_unaligned_u16(ip + 1); |
2298 | 0 | mask = opcode_args >> 8; |
2299 | 0 | value = opcode_args & 0xFF; |
2300 | |
|
2301 | 0 | if ((*current->input & mask) != value) |
2302 | 0 | { |
2303 | 0 | match = true; |
2304 | 0 | current->input += input_incr; |
2305 | 0 | } |
2306 | 0 | break; |
2307 | | |
2308 | 0 | case RE_OPCODE_REPEAT_ANY_UNGREEDY: |
2309 | 0 | repeat_any_args = (RE_REPEAT_ANY_ARGS*) (ip + 1); |
2310 | |
|
2311 | 0 | if (bytes_matched + repeat_any_args->min >= max_bytes_matched) |
2312 | 0 | break; |
2313 | | |
2314 | 0 | match = true; |
2315 | |
|
2316 | 0 | const uint8_t* next_opcode = ip + 1 + sizeof(RE_REPEAT_ANY_ARGS); |
2317 | | |
2318 | | // insertion_point indicates the item in the list of inputs after which |
2319 | | // the newly created inputs will be inserted. |
2320 | 0 | RE_FAST_EXEC_POSITION* insertion_point = current; |
2321 | |
|
2322 | 0 | for (int j = repeat_any_args->min + 1; j <= repeat_any_args->max; j++) |
2323 | 0 | { |
2324 | | // bytes_matched is the number of bytes matched so far, and j is the |
2325 | | // minimum number of bytes that are still pending to match. If these |
2326 | | // two numbers sum more than the maximum number of bytes that can be |
2327 | | // matched the loop can be aborted. The position that we were about |
2328 | | // to create can't lead to a match without exceeding |
2329 | | // max_bytes_matched. |
2330 | 0 | if (bytes_matched + j >= max_bytes_matched) |
2331 | 0 | break; |
2332 | | |
2333 | 0 | const uint8_t* next_input = current->input + j * input_incr; |
2334 | | |
2335 | | // Find the point where next_input should be inserted. The list is |
2336 | | // kept sorted by pointer in increasing order, the insertion point is |
2337 | | // an item that has a pointer lower or equal than next_input, but |
2338 | | // whose next item have a pointer that is larger. |
2339 | 0 | while (insertion_point->next != NULL && |
2340 | 0 | insertion_point->next->input <= next_input) |
2341 | 0 | { |
2342 | 0 | insertion_point = insertion_point->next; |
2343 | 0 | } |
2344 | | |
2345 | | // If the input already exists for the next round, we don't need to |
2346 | | // insert it. |
2347 | 0 | if (insertion_point->round == round + 1 && |
2348 | 0 | insertion_point->input == next_input) |
2349 | 0 | continue; |
2350 | | |
2351 | | // The next opcode is RE_OPCODE_LITERAL, but the literal doesn't |
2352 | | // match with the byte at next_input, we don't need to add this |
2353 | | // input to the list as we already know that it won't match in the |
2354 | | // next round and will be deleted from the list anyways. |
2355 | 0 | if (*(next_opcode) == RE_OPCODE_LITERAL && |
2356 | 0 | *(next_opcode + 1) != *next_input) |
2357 | 0 | continue; |
2358 | | |
2359 | 0 | RE_FAST_EXEC_POSITION* new_input; |
2360 | |
|
2361 | 0 | FAIL_ON_ERROR_WITH_CLEANUP( |
2362 | 0 | _yr_re_fast_exec_position_create( |
2363 | 0 | &context->re_fast_exec_position_pool, &new_input), |
2364 | | // Cleanup |
2365 | 0 | _yr_re_fast_exec_destroy_position_list( |
2366 | 0 | &context->re_fast_exec_position_pool, first, last)); |
2367 | |
|
2368 | 0 | new_input->input = next_input; |
2369 | 0 | new_input->round = round + 1; |
2370 | 0 | new_input->prev = insertion_point; |
2371 | 0 | new_input->next = insertion_point->next; |
2372 | 0 | insertion_point->next = new_input; |
2373 | |
|
2374 | 0 | if (new_input->next != NULL) |
2375 | 0 | new_input->next->prev = new_input; |
2376 | |
|
2377 | 0 | if (insertion_point == last) |
2378 | 0 | last = new_input; |
2379 | 0 | } |
2380 | | |
2381 | 0 | current->input += input_incr * repeat_any_args->min; |
2382 | 0 | break; |
2383 | | |
2384 | 0 | case RE_OPCODE_MATCH: |
2385 | |
|
2386 | 0 | if (flags & RE_FLAGS_EXHAUSTIVE) |
2387 | 0 | { |
2388 | 0 | FAIL_ON_ERROR_WITH_CLEANUP( |
2389 | 0 | callback( |
2390 | | // When matching forwards the matching data always starts at |
2391 | | // input_data, when matching backwards it starts at input + 1 |
2392 | | // or input_data - input_backwards_size if input + 1 is |
2393 | | // outside the buffer. |
2394 | 0 | flags & RE_FLAGS_BACKWARDS |
2395 | 0 | ? yr_max( |
2396 | 0 | current->input + 1, |
2397 | 0 | input_data - input_backwards_size) |
2398 | 0 | : input_data, |
2399 | | // The number of matched bytes should not be larger than |
2400 | | // max_bytes_matched. |
2401 | 0 | yr_min(bytes_matched, max_bytes_matched), |
2402 | 0 | flags, |
2403 | 0 | callback_args), |
2404 | | // Cleanup |
2405 | 0 | _yr_re_fast_exec_destroy_position_list( |
2406 | 0 | &context->re_fast_exec_position_pool, first, last)); |
2407 | 0 | } |
2408 | 0 | else |
2409 | 0 | { |
2410 | 0 | if (matches != NULL) |
2411 | 0 | *matches = bytes_matched; |
2412 | |
|
2413 | 0 | _yr_re_fast_exec_destroy_position_list( |
2414 | 0 | &context->re_fast_exec_position_pool, first, last); |
2415 | |
|
2416 | 0 | return ERROR_SUCCESS; |
2417 | 0 | } |
2418 | 0 | break; |
2419 | | |
2420 | 0 | default: |
2421 | 0 | printf("non-supported opcode: %d\n", *ip); |
2422 | 0 | assert(false); |
2423 | 0 | } |
2424 | | |
2425 | 0 | if (match) |
2426 | 0 | { |
2427 | 0 | current->round = round + 1; |
2428 | 0 | } |
2429 | 0 | else |
2430 | 0 | { |
2431 | | // Remove current from the list. If the item being removed is the first |
2432 | | // one or the last one, the first and last pointers should be updated. |
2433 | | // The removed item is put into the pool for later reuse. |
2434 | |
|
2435 | 0 | if (current == first) |
2436 | 0 | first = current->next; |
2437 | |
|
2438 | 0 | if (current == last) |
2439 | 0 | last = current->prev; |
2440 | |
|
2441 | 0 | if (current->prev != NULL) |
2442 | 0 | current->prev->next = current->next; |
2443 | |
|
2444 | 0 | if (current->next != NULL) |
2445 | 0 | current->next->prev = current->prev; |
2446 | |
|
2447 | 0 | current->prev = NULL; |
2448 | 0 | current->next = context->re_fast_exec_position_pool.head; |
2449 | |
|
2450 | 0 | if (current->next != NULL) |
2451 | 0 | current->next->prev = current; |
2452 | |
|
2453 | 0 | context->re_fast_exec_position_pool.head = current; |
2454 | 0 | } |
2455 | |
|
2456 | 0 | current = next; |
2457 | |
|
2458 | 0 | } // while (current != NULL) |
2459 | | |
2460 | | // Move instruction pointer (ip) to next instruction. Each opcode has |
2461 | | // a different size. |
2462 | 0 | switch (*ip) |
2463 | 0 | { |
2464 | 0 | case RE_OPCODE_ANY: |
2465 | 0 | ip += 1; |
2466 | 0 | break; |
2467 | 0 | case RE_OPCODE_LITERAL: |
2468 | 0 | case RE_OPCODE_NOT_LITERAL: |
2469 | 0 | ip += 2; |
2470 | 0 | break; |
2471 | 0 | case RE_OPCODE_MASKED_LITERAL: |
2472 | 0 | case RE_OPCODE_MASKED_NOT_LITERAL: |
2473 | 0 | ip += 3; |
2474 | 0 | break; |
2475 | 0 | case RE_OPCODE_REPEAT_ANY_UNGREEDY: |
2476 | 0 | ip += 1 + sizeof(RE_REPEAT_ANY_ARGS); |
2477 | 0 | break; |
2478 | 0 | case RE_OPCODE_MATCH: |
2479 | 0 | break; |
2480 | 0 | default: |
2481 | 0 | assert(false); |
2482 | 0 | } |
2483 | | |
2484 | 0 | round++; |
2485 | 0 | } |
2486 | | |
2487 | | // If this point is reached no matches were found. |
2488 | | |
2489 | 0 | if (matches != NULL) |
2490 | 0 | *matches = -1; |
2491 | |
|
2492 | 0 | return ERROR_SUCCESS; |
2493 | 0 | } |
2494 | | |
2495 | | static void _yr_re_print_node(RE_NODE* re_node, uint32_t indent) |
2496 | 0 | { |
2497 | 0 | RE_NODE* child; |
2498 | 0 | int i; |
2499 | |
|
2500 | 0 | if (re_node == NULL) |
2501 | 0 | return; |
2502 | | |
2503 | 0 | if (indent > 0) |
2504 | 0 | printf("\n%*s", indent, " "); |
2505 | 0 | switch (re_node->type) |
2506 | 0 | { |
2507 | 0 | case RE_NODE_ALT: |
2508 | 0 | printf("Alt("); |
2509 | 0 | _yr_re_print_node(re_node->children_head, indent + 4); |
2510 | 0 | printf(","); |
2511 | 0 | _yr_re_print_node(re_node->children_tail, indent + 4); |
2512 | 0 | printf("\n%*s%s", indent, " ", ")"); |
2513 | 0 | break; |
2514 | | |
2515 | 0 | case RE_NODE_CONCAT: |
2516 | 0 | printf("Cat("); |
2517 | 0 | child = re_node->children_head; |
2518 | 0 | while (child != NULL) |
2519 | 0 | { |
2520 | 0 | _yr_re_print_node(child, indent + 4); |
2521 | 0 | printf(","); |
2522 | 0 | child = child->next_sibling; |
2523 | 0 | } |
2524 | 0 | printf("\n%*s%s", indent, " ", ")"); |
2525 | 0 | break; |
2526 | | |
2527 | 0 | case RE_NODE_STAR: |
2528 | 0 | printf("Star("); |
2529 | 0 | _yr_re_print_node(re_node->children_head, indent + 4); |
2530 | 0 | printf(")"); |
2531 | 0 | break; |
2532 | | |
2533 | 0 | case RE_NODE_PLUS: |
2534 | 0 | printf("Plus("); |
2535 | 0 | _yr_re_print_node(re_node->children_head, indent + 4); |
2536 | 0 | printf(")"); |
2537 | 0 | break; |
2538 | | |
2539 | 0 | case RE_NODE_LITERAL: |
2540 | 0 | printf("Lit(%c)", re_node->value); |
2541 | 0 | break; |
2542 | | |
2543 | 0 | case RE_NODE_NOT_LITERAL: |
2544 | 0 | printf("NotLit(%c)", re_node->value); |
2545 | 0 | break; |
2546 | | |
2547 | 0 | case RE_NODE_MASKED_LITERAL: |
2548 | 0 | printf("MaskedLit(%02X,%02X)", re_node->value, re_node->mask); |
2549 | 0 | break; |
2550 | | |
2551 | 0 | case RE_NODE_WORD_CHAR: |
2552 | 0 | printf("WordChar"); |
2553 | 0 | break; |
2554 | | |
2555 | 0 | case RE_NODE_NON_WORD_CHAR: |
2556 | 0 | printf("NonWordChar"); |
2557 | 0 | break; |
2558 | | |
2559 | 0 | case RE_NODE_SPACE: |
2560 | 0 | printf("Space"); |
2561 | 0 | break; |
2562 | | |
2563 | 0 | case RE_NODE_NON_SPACE: |
2564 | 0 | printf("NonSpace"); |
2565 | 0 | break; |
2566 | | |
2567 | 0 | case RE_NODE_DIGIT: |
2568 | 0 | printf("Digit"); |
2569 | 0 | break; |
2570 | | |
2571 | 0 | case RE_NODE_NON_DIGIT: |
2572 | 0 | printf("NonDigit"); |
2573 | 0 | break; |
2574 | | |
2575 | 0 | case RE_NODE_ANY: |
2576 | 0 | printf("Any"); |
2577 | 0 | break; |
2578 | | |
2579 | 0 | case RE_NODE_RANGE: |
2580 | 0 | printf("Range(%d-%d, ", re_node->start, re_node->end); |
2581 | 0 | _yr_re_print_node(re_node->children_head, indent + 4); |
2582 | 0 | printf("\n%*s%s", indent, " ", ")"); |
2583 | 0 | break; |
2584 | | |
2585 | 0 | case RE_NODE_CLASS: |
2586 | 0 | printf("Class("); |
2587 | 0 | for (i = 0; i < 256; i++) |
2588 | 0 | if (_yr_re_is_char_in_class(re_node->re_class, i, false)) |
2589 | 0 | printf("%02X,", i); |
2590 | 0 | printf(")"); |
2591 | 0 | break; |
2592 | | |
2593 | 0 | case RE_NODE_EMPTY: |
2594 | 0 | printf("Empty"); |
2595 | 0 | break; |
2596 | | |
2597 | 0 | case RE_NODE_ANCHOR_START: |
2598 | 0 | printf("AnchorStart"); |
2599 | 0 | break; |
2600 | | |
2601 | 0 | case RE_NODE_ANCHOR_END: |
2602 | 0 | printf("AnchorEnd"); |
2603 | 0 | break; |
2604 | | |
2605 | 0 | case RE_NODE_WORD_BOUNDARY: |
2606 | 0 | printf("WordBoundary"); |
2607 | 0 | break; |
2608 | | |
2609 | 0 | case RE_NODE_NON_WORD_BOUNDARY: |
2610 | 0 | printf("NonWordBoundary"); |
2611 | 0 | break; |
2612 | | |
2613 | 0 | case RE_NODE_RANGE_ANY: |
2614 | 0 | printf("RangeAny"); |
2615 | 0 | break; |
2616 | | |
2617 | 0 | default: |
2618 | 0 | printf("???"); |
2619 | 0 | break; |
2620 | 0 | } |
2621 | 0 | } |
2622 | | |
2623 | | void yr_re_print(RE_AST* re_ast) |
2624 | 0 | { |
2625 | 0 | _yr_re_print_node(re_ast->root_node, 0); |
2626 | 0 | } |