Coverage Report

Created: 2023-06-07 07:18

/src/yara/libyara/re.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2013. The YARA Authors. All Rights Reserved.
3
4
Redistribution and use in source and binary forms, with or without modification,
5
are permitted provided that the following conditions are met:
6
7
1. Redistributions of source code must retain the above copyright notice, this
8
list of conditions and the following disclaimer.
9
10
2. Redistributions in binary form must reproduce the above copyright notice,
11
this list of conditions and the following disclaimer in the documentation and/or
12
other materials provided with the distribution.
13
14
3. Neither the name of the copyright holder nor the names of its contributors
15
may be used to endorse or promote products derived from this software without
16
specific prior written permission.
17
18
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
*/
29
30
/*
31
32
This module implements a regular expressions engine based on Thompson's
33
algorithm as described by Russ Cox in http://swtch.com/~rsc/regexp/regexp2.html.
34
35
What the article names a "thread" has been named a "fiber" in this code, in
36
order to avoid confusion with operating system threads.
37
38
*/
39
40
#include <assert.h>
41
#include <string.h>
42
#include <yara/compiler.h>
43
#include <yara/error.h>
44
#include <yara/globals.h>
45
#include <yara/hex_lexer.h>
46
#include <yara/limits.h>
47
#include <yara/mem.h>
48
#include <yara/re.h>
49
#include <yara/re_lexer.h>
50
#include <yara/strutils.h>
51
#include <yara/threading.h>
52
#include <yara/unaligned.h>
53
#include <yara/utils.h>
54
55
0
#define EMIT_BACKWARDS               0x01
56
0
#define EMIT_DONT_SET_FORWARDS_CODE  0x02
57
0
#define EMIT_DONT_SET_BACKWARDS_CODE 0x04
58
59
#ifndef INT16_MAX
60
#define INT16_MAX (32767)
61
#endif
62
63
typedef uint8_t RE_SPLIT_ID_TYPE;
64
65
// RE_REPEAT_ARGS and RE_REPEAT_ANY_ARGS are structures that are embedded in
66
// the regexp's instruction stream. As such, they are not always aligned to
67
// 8-byte boundaries, so they need to be "packed" structures in order to prevent
68
// issues due to unaligned memory accesses.
69
#pragma pack(push)
70
#pragma pack(1)
71
72
typedef struct _RE_REPEAT_ARGS
73
{
74
  uint16_t min;
75
  uint16_t max;
76
  int32_t offset;
77
78
} RE_REPEAT_ARGS;
79
80
typedef struct _RE_REPEAT_ANY_ARGS
81
{
82
  uint16_t min;
83
  uint16_t max;
84
85
} RE_REPEAT_ANY_ARGS;
86
87
#pragma pack(pop)
88
89
typedef struct _RE_EMIT_CONTEXT
90
{
91
  YR_ARENA* arena;
92
  RE_SPLIT_ID_TYPE next_split_id;
93
94
} RE_EMIT_CONTEXT;
95
96
0
#define CHAR_IN_CLASS(cls, chr) ((cls)[(chr) / 8] & 1 << ((chr) % 8))
97
98
static bool _yr_re_is_char_in_class(
99
    RE_CLASS* re_class,
100
    uint8_t chr,
101
    int case_insensitive)
102
0
{
103
0
  int result = CHAR_IN_CLASS(re_class->bitmap, chr);
104
105
0
  if (case_insensitive)
106
0
    result |= CHAR_IN_CLASS(re_class->bitmap, yr_altercase[chr]);
107
108
0
  if (re_class->negated)
109
0
    result = !result;
110
111
0
  return result;
112
0
}
113
114
static bool _yr_re_is_word_char(const uint8_t* input, uint8_t character_size)
115
0
{
116
0
  int result = ((yr_isalnum(input) || (*input) == '_'));
117
118
0
  if (character_size == 2)
119
0
    result = result && (*(input + 1) == 0);
120
121
0
  return result;
122
0
}
123
124
RE_NODE* yr_re_node_create(int type)
125
0
{
126
0
  RE_NODE* result = (RE_NODE*) yr_malloc(sizeof(RE_NODE));
127
128
0
  if (result != NULL)
129
0
  {
130
0
    result->type = type;
131
0
    result->children_head = NULL;
132
0
    result->children_tail = NULL;
133
0
    result->prev_sibling = NULL;
134
0
    result->next_sibling = NULL;
135
0
    result->greedy = true;
136
0
    result->forward_code_ref = YR_ARENA_NULL_REF;
137
0
    result->backward_code_ref = YR_ARENA_NULL_REF;
138
0
  }
139
140
0
  return result;
141
0
}
142
143
void yr_re_node_destroy(RE_NODE* node)
144
0
{
145
0
  RE_NODE* child = node->children_head;
146
0
  RE_NODE* next_child;
147
148
0
  while (child != NULL)
149
0
  {
150
0
    next_child = child->next_sibling;
151
0
    yr_re_node_destroy(child);
152
0
    child = next_child;
153
0
  }
154
155
0
  if (node->type == RE_NODE_CLASS)
156
0
    yr_free(node->re_class);
157
158
0
  yr_free(node);
159
0
}
160
161
////////////////////////////////////////////////////////////////////////////////
162
// Appends a node to the end of the children list.
163
//
164
void yr_re_node_append_child(RE_NODE* node, RE_NODE* child)
165
0
{
166
0
  if (node->children_head == NULL)
167
0
    node->children_head = child;
168
169
0
  if (node->children_tail != NULL)
170
0
    node->children_tail->next_sibling = child;
171
172
0
  child->prev_sibling = node->children_tail;
173
0
  node->children_tail = child;
174
0
}
175
176
////////////////////////////////////////////////////////////////////////////////
177
// Appends a node to the beginning of the children list.
178
//
179
void yr_re_node_prepend_child(RE_NODE* node, RE_NODE* child)
180
0
{
181
0
  child->next_sibling = node->children_head;
182
183
0
  if (node->children_head != NULL)
184
0
    node->children_head->prev_sibling = child;
185
186
0
  node->children_head = child;
187
188
0
  if (node->children_tail == NULL)
189
0
    node->children_tail = child;
190
0
}
191
192
int yr_re_ast_create(RE_AST** re_ast)
193
0
{
194
0
  *re_ast = (RE_AST*) yr_malloc(sizeof(RE_AST));
195
196
0
  if (*re_ast == NULL)
197
0
    return ERROR_INSUFFICIENT_MEMORY;
198
199
0
  (*re_ast)->flags = 0;
200
0
  (*re_ast)->root_node = NULL;
201
202
0
  return ERROR_SUCCESS;
203
0
}
204
205
void yr_re_ast_destroy(RE_AST* re_ast)
206
0
{
207
0
  if (re_ast->root_node != NULL)
208
0
    yr_re_node_destroy(re_ast->root_node);
209
210
0
  yr_free(re_ast);
211
0
}
212
213
////////////////////////////////////////////////////////////////////////////////
214
// Parses a regexp but don't emit its code. A further call to
215
// yr_re_ast_emit_code is required to get the code.
216
//
217
int yr_re_parse(const char* re_string, RE_AST** re_ast, RE_ERROR* error)
218
0
{
219
0
  return yr_parse_re_string(re_string, re_ast, error);
220
0
}
221
222
////////////////////////////////////////////////////////////////////////////////
223
// Parses a hex string but don't emit its code. A further call to
224
// yr_re_ast_emit_code is required to get the code.
225
//
226
int yr_re_parse_hex(const char* hex_string, RE_AST** re_ast, RE_ERROR* error)
227
0
{
228
0
  return yr_parse_hex_string(hex_string, re_ast, error);
229
0
}
230
231
////////////////////////////////////////////////////////////////////////////////
232
// Parses the regexp and emit its code to the provided to the
233
// YR_RE_CODE_SECTION in the specified arena.
234
//
235
int yr_re_compile(
236
    const char* re_string,
237
    int flags,
238
    YR_ARENA* arena,
239
    YR_ARENA_REF* ref,
240
    RE_ERROR* error)
241
0
{
242
0
  RE_AST* re_ast;
243
0
  RE _re;
244
245
0
  FAIL_ON_ERROR(yr_re_parse(re_string, &re_ast, error));
246
247
0
  _re.flags = flags;
248
249
0
  FAIL_ON_ERROR_WITH_CLEANUP(
250
0
      yr_arena_write_data(arena, YR_RE_CODE_SECTION, &_re, sizeof(_re), ref),
251
0
      yr_re_ast_destroy(re_ast));
252
253
0
  FAIL_ON_ERROR_WITH_CLEANUP(
254
0
      yr_re_ast_emit_code(re_ast, arena, false), yr_re_ast_destroy(re_ast));
255
256
0
  yr_re_ast_destroy(re_ast);
257
258
0
  return ERROR_SUCCESS;
259
0
}
260
261
////////////////////////////////////////////////////////////////////////////////
262
// Verifies if the target string matches the pattern
263
//
264
// Args:
265
//    context: Scan context
266
//    re: A pointer to a compiled regexp
267
//    target: Target string
268
//
269
// Returns:
270
//    See return codes for yr_re_exec
271
//
272
int yr_re_match(YR_SCAN_CONTEXT* context, RE* re, const char* target)
273
0
{
274
0
  int result;
275
276
0
  yr_re_exec(
277
0
      context,
278
0
      re->code,
279
0
      (uint8_t*) target,
280
0
      strlen(target),
281
0
      0,
282
0
      re->flags | RE_FLAGS_SCAN,
283
0
      NULL,
284
0
      NULL,
285
0
      &result);
286
287
0
  return result;
288
0
}
289
290
////////////////////////////////////////////////////////////////////////////////
291
// Verifies if the provided regular expression is just a literal string
292
// like "abc", "12345", without any wildcard, operator, etc. In that case
293
// returns the string as a SIZED_STRING, or returns NULL if otherwise.
294
//
295
// The caller is responsible for deallocating the returned SIZED_STRING by
296
// calling yr_free.
297
//
298
SIZED_STRING* yr_re_ast_extract_literal(RE_AST* re_ast)
299
0
{
300
0
  SIZED_STRING* string;
301
0
  RE_NODE* child;
302
303
0
  int length = 0;
304
305
0
  if (re_ast->root_node->type == RE_NODE_LITERAL)
306
0
  {
307
0
    length = 1;
308
0
  }
309
0
  else if (re_ast->root_node->type == RE_NODE_CONCAT)
310
0
  {
311
0
    child = re_ast->root_node->children_tail;
312
313
0
    while (child != NULL && child->type == RE_NODE_LITERAL)
314
0
    {
315
0
      length++;
316
0
      child = child->prev_sibling;
317
0
    }
318
319
0
    if (child != NULL)
320
0
      return NULL;
321
0
  }
322
0
  else
323
0
  {
324
0
    return NULL;
325
0
  }
326
327
0
  string = (SIZED_STRING*) yr_malloc(sizeof(SIZED_STRING) + length);
328
329
0
  if (string == NULL)
330
0
    return NULL;
331
332
0
  string->length = length;
333
0
  string->flags = 0;
334
335
0
  if (re_ast->root_node->type == RE_NODE_LITERAL)
336
0
  {
337
0
    string->c_string[0] = re_ast->root_node->value;
338
0
  }
339
0
  else
340
0
  {
341
0
    child = re_ast->root_node->children_tail;
342
343
0
    while (child != NULL)
344
0
    {
345
0
      string->c_string[--length] = child->value;
346
0
      child = child->prev_sibling;
347
0
    }
348
0
  }
349
350
0
  string->c_string[string->length] = '\0';
351
352
0
  return string;
353
0
}
354
355
int _yr_re_node_has_unbounded_quantifier_for_dot(RE_NODE* re_node)
356
0
{
357
0
  RE_NODE* child;
358
359
0
  if ((re_node->type == RE_NODE_STAR || re_node->type == RE_NODE_PLUS) &&
360
0
      re_node->children_head->type == RE_NODE_ANY)
361
0
    return true;
362
363
0
  if (re_node->type == RE_NODE_RANGE_ANY && re_node->end == RE_MAX_RANGE)
364
0
    return true;
365
366
0
  if (re_node->type == RE_NODE_CONCAT)
367
0
  {
368
0
    child = re_node->children_tail;
369
370
0
    while (child != NULL)
371
0
    {
372
0
      if (_yr_re_node_has_unbounded_quantifier_for_dot(child))
373
0
        return true;
374
375
0
      child = child->prev_sibling;
376
0
    }
377
0
  }
378
379
0
  return false;
380
0
}
381
382
////////////////////////////////////////////////////////////////////////////////
383
// Detects the use of .*, .+ or .{x,} in a regexp. The use of wildcards with
384
// quantifiers that don't have a reasonably small upper bound causes a
385
// performance penalty. This function detects such cases in order to warn the
386
// user about this.
387
//
388
int yr_re_ast_has_unbounded_quantifier_for_dot(RE_AST* re_ast)
389
0
{
390
0
  return _yr_re_node_has_unbounded_quantifier_for_dot(re_ast->root_node);
391
0
}
392
393
////////////////////////////////////////////////////////////////////////////////
394
// In some cases splitting a regular expression (or hex string) in two parts is
395
// convenient for increasing performance. This happens when the pattern contains
396
// a large gap (a.k.a jump), for example: { 01 02 03 [0-999] 04 05 06 }
397
// In this case the string is splitted in { 01 02 03 } and { 04 05 06 } where
398
// the latter is chained to the former. This means that { 01 02 03 } and
399
// { 04 05 06 } are handled as individual strings, and when both of them are
400
// found, YARA verifies if the distance between the matches complies with the
401
// [0-999] restriction.
402
//
403
// This function traverses a regexp's AST looking for nodes where it should be
404
// splitted. It must be noticed that this only applies to two-level ASTs (i.e.
405
// an AST consisting in a RE_NODE_CONCAT at the root where all the children are
406
// leaves).
407
//
408
// For example, { 01 02 03 [0-1000] 04 05 06 [500-2000] 07 08 09 } has the
409
// following AST:
410
//
411
// RE_NODE_CONCAT
412
// |
413
// |- RE_NODE_LITERAL (01)
414
// |- RE_NODE_LITERAL (02)
415
// |- RE_NODE_LITERAL (03)
416
// |- RE_NODE_RANGE_ANY (start=0, end=1000)
417
// |- RE_NODE_LITERAL (04)
418
// |- RE_NODE_LITERAL (05)
419
// |- RE_NODE_LITERAL (06)
420
// |- RE_NODE_RANGE_ANY (start=500, end=2000)
421
// |- RE_NODE_LITERAL (07)
422
// |- RE_NODE_LITERAL (08)
423
// |- RE_NODE_LITERAL (09)
424
//
425
// If the AST above is passed in the re_ast argument, it will be trimmed to:
426
//
427
// RE_NODE_CONCAT
428
// |
429
// |- RE_NODE_LITERAL (01)
430
// |- RE_NODE_LITERAL (02)
431
// |- RE_NODE_LITERAL (03)
432
//
433
// While remainder_re_ast will be:
434
//
435
// RE_NODE_CONCAT
436
// |
437
// |- RE_NODE_LITERAL (04)
438
// |- RE_NODE_LITERAL (05)
439
// |- RE_NODE_LITERAL (06)
440
// |- RE_NODE_RANGE_ANY (start=500, end=2000)
441
// |- RE_NODE_LITERAL (07)
442
// |- RE_NODE_LITERAL (08)
443
// |- RE_NODE_LITERAL (09)
444
//
445
// The caller is responsible for freeing the new AST in remainder_re_ast by
446
// calling yr_re_ast_destroy.
447
//
448
// The integers pointed to by min_gap and max_gap will be filled with the
449
// minimum and maximum gap size between the sub-strings represented by the
450
// two ASTs.
451
//
452
int yr_re_ast_split_at_chaining_point(
453
    RE_AST* re_ast,
454
    RE_AST** remainder_re_ast,
455
    int32_t* min_gap,
456
    int32_t* max_gap)
457
0
{
458
0
  RE_NODE* child;
459
0
  RE_NODE* concat;
460
461
0
  int result;
462
463
0
  *remainder_re_ast = NULL;
464
0
  *min_gap = 0;
465
0
  *max_gap = 0;
466
467
0
  if (re_ast->root_node->type != RE_NODE_CONCAT)
468
0
    return ERROR_SUCCESS;
469
470
0
  child = re_ast->root_node->children_head;
471
472
0
  while (child != NULL)
473
0
  {
474
0
    if (!child->greedy && child->type == RE_NODE_RANGE_ANY &&
475
0
        child->prev_sibling != NULL && child->next_sibling != NULL &&
476
0
        (child->start > YR_STRING_CHAINING_THRESHOLD ||
477
0
         child->end > YR_STRING_CHAINING_THRESHOLD))
478
0
    {
479
0
      result = yr_re_ast_create(remainder_re_ast);
480
481
0
      if (result != ERROR_SUCCESS)
482
0
        return result;
483
484
0
      concat = yr_re_node_create(RE_NODE_CONCAT);
485
486
0
      if (concat == NULL)
487
0
        return ERROR_INSUFFICIENT_MEMORY;
488
489
0
      concat->children_head = child->next_sibling;
490
0
      concat->children_tail = re_ast->root_node->children_tail;
491
492
0
      re_ast->root_node->children_tail = child->prev_sibling;
493
494
0
      child->prev_sibling->next_sibling = NULL;
495
0
      child->next_sibling->prev_sibling = NULL;
496
497
0
      *min_gap = child->start;
498
0
      *max_gap = child->end;
499
500
0
      (*remainder_re_ast)->root_node = concat;
501
0
      (*remainder_re_ast)->flags = re_ast->flags;
502
503
0
      yr_re_node_destroy(child);
504
505
0
      return ERROR_SUCCESS;
506
0
    }
507
508
0
    child = child->next_sibling;
509
0
  }
510
511
0
  return ERROR_SUCCESS;
512
0
}
513
514
int _yr_emit_inst(
515
    RE_EMIT_CONTEXT* emit_context,
516
    uint8_t opcode,
517
    YR_ARENA_REF* instruction_ref)
518
0
{
519
0
  FAIL_ON_ERROR(yr_arena_write_data(
520
0
      emit_context->arena,
521
0
      YR_RE_CODE_SECTION,
522
0
      &opcode,
523
0
      sizeof(uint8_t),
524
0
      instruction_ref));
525
526
0
  return ERROR_SUCCESS;
527
0
}
528
529
int _yr_emit_inst_arg_uint8(
530
    RE_EMIT_CONTEXT* emit_context,
531
    uint8_t opcode,
532
    uint8_t argument,
533
    YR_ARENA_REF* instruction_ref,
534
    YR_ARENA_REF* argument_ref)
535
0
{
536
0
  FAIL_ON_ERROR(yr_arena_write_data(
537
0
      emit_context->arena,
538
0
      YR_RE_CODE_SECTION,
539
0
      &opcode,
540
0
      sizeof(uint8_t),
541
0
      instruction_ref));
542
543
0
  FAIL_ON_ERROR(yr_arena_write_data(
544
0
      emit_context->arena,
545
0
      YR_RE_CODE_SECTION,
546
0
      &argument,
547
0
      sizeof(uint8_t),
548
0
      argument_ref));
549
550
0
  return ERROR_SUCCESS;
551
0
}
552
553
int _yr_emit_inst_arg_uint16(
554
    RE_EMIT_CONTEXT* emit_context,
555
    uint8_t opcode,
556
    uint16_t argument,
557
    YR_ARENA_REF* instruction_ref,
558
    YR_ARENA_REF* argument_ref)
559
0
{
560
0
  FAIL_ON_ERROR(yr_arena_write_data(
561
0
      emit_context->arena,
562
0
      YR_RE_CODE_SECTION,
563
0
      &opcode,
564
0
      sizeof(uint8_t),
565
0
      instruction_ref));
566
567
0
  FAIL_ON_ERROR(yr_arena_write_data(
568
0
      emit_context->arena,
569
0
      YR_RE_CODE_SECTION,
570
0
      &argument,
571
0
      sizeof(uint16_t),
572
0
      argument_ref));
573
574
0
  return ERROR_SUCCESS;
575
0
}
576
577
int _yr_emit_inst_arg_uint32(
578
    RE_EMIT_CONTEXT* emit_context,
579
    uint8_t opcode,
580
    uint32_t argument,
581
    YR_ARENA_REF* instruction_ref,
582
    YR_ARENA_REF* argument_ref)
583
0
{
584
0
  FAIL_ON_ERROR(yr_arena_write_data(
585
0
      emit_context->arena,
586
0
      YR_RE_CODE_SECTION,
587
0
      &opcode,
588
0
      sizeof(uint8_t),
589
0
      instruction_ref));
590
591
0
  FAIL_ON_ERROR(yr_arena_write_data(
592
0
      emit_context->arena,
593
0
      YR_RE_CODE_SECTION,
594
0
      &argument,
595
0
      sizeof(uint32_t),
596
0
      argument_ref));
597
598
0
  return ERROR_SUCCESS;
599
0
}
600
601
int _yr_emit_inst_arg_int16(
602
    RE_EMIT_CONTEXT* emit_context,
603
    uint8_t opcode,
604
    int16_t argument,
605
    YR_ARENA_REF* instruction_ref,
606
    YR_ARENA_REF* argument_ref)
607
0
{
608
0
  FAIL_ON_ERROR(yr_arena_write_data(
609
0
      emit_context->arena,
610
0
      YR_RE_CODE_SECTION,
611
0
      &opcode,
612
0
      sizeof(uint8_t),
613
0
      instruction_ref));
614
615
0
  FAIL_ON_ERROR(yr_arena_write_data(
616
0
      emit_context->arena,
617
0
      YR_RE_CODE_SECTION,
618
0
      &argument,
619
0
      sizeof(int16_t),
620
0
      argument_ref));
621
622
0
  return ERROR_SUCCESS;
623
0
}
624
625
int _yr_emit_inst_arg_struct(
626
    RE_EMIT_CONTEXT* emit_context,
627
    uint8_t opcode,
628
    void* structure,
629
    size_t structure_size,
630
    YR_ARENA_REF* instruction_ref,
631
    YR_ARENA_REF* argument_ref)
632
0
{
633
0
  FAIL_ON_ERROR(yr_arena_write_data(
634
0
      emit_context->arena,
635
0
      YR_RE_CODE_SECTION,
636
0
      &opcode,
637
0
      sizeof(uint8_t),
638
0
      instruction_ref));
639
640
0
  FAIL_ON_ERROR(yr_arena_write_data(
641
0
      emit_context->arena,
642
0
      YR_RE_CODE_SECTION,
643
0
      structure,
644
0
      structure_size,
645
0
      argument_ref));
646
647
0
  return ERROR_SUCCESS;
648
0
}
649
650
int _yr_emit_split(
651
    RE_EMIT_CONTEXT* emit_context,
652
    uint8_t opcode,
653
    int16_t argument,
654
    YR_ARENA_REF* instruction_ref,
655
    YR_ARENA_REF* argument_ref)
656
0
{
657
0
  assert(opcode == RE_OPCODE_SPLIT_A || opcode == RE_OPCODE_SPLIT_B);
658
659
0
  if (emit_context->next_split_id == RE_MAX_SPLIT_ID)
660
0
    return ERROR_REGULAR_EXPRESSION_TOO_COMPLEX;
661
662
0
  FAIL_ON_ERROR(yr_arena_write_data(
663
0
      emit_context->arena,
664
0
      YR_RE_CODE_SECTION,
665
0
      &opcode,
666
0
      sizeof(uint8_t),
667
0
      instruction_ref));
668
669
0
  FAIL_ON_ERROR(yr_arena_write_data(
670
0
      emit_context->arena,
671
0
      YR_RE_CODE_SECTION,
672
0
      &emit_context->next_split_id,
673
0
      sizeof(RE_SPLIT_ID_TYPE),
674
0
      NULL));
675
676
0
  emit_context->next_split_id++;
677
678
0
  FAIL_ON_ERROR(yr_arena_write_data(
679
0
      emit_context->arena,
680
0
      YR_RE_CODE_SECTION,
681
0
      &argument,
682
0
      sizeof(int16_t),
683
0
      argument_ref));
684
685
0
  return ERROR_SUCCESS;
686
0
}
687
688
static int _yr_re_emit(
689
    RE_EMIT_CONTEXT* emit_context,
690
    RE_NODE* re_node,
691
    int flags,
692
    YR_ARENA_REF* code_ref)
693
0
{
694
0
  int16_t jmp_offset;
695
696
0
  yr_arena_off_t bookmark_1 = 0;
697
0
  yr_arena_off_t bookmark_2 = 0;
698
0
  yr_arena_off_t bookmark_3 = 0;
699
0
  yr_arena_off_t bookmark_4 = 0;
700
701
0
  bool emit_split;
702
0
  bool emit_repeat;
703
0
  bool emit_prolog;
704
0
  bool emit_epilog;
705
706
0
  RE_REPEAT_ARGS repeat_args;
707
0
  RE_REPEAT_ARGS* repeat_start_args_addr;
708
0
  RE_REPEAT_ANY_ARGS repeat_any_args;
709
710
0
  RE_NODE* child;
711
712
0
  int16_t* split_offset_addr = NULL;
713
0
  int16_t* jmp_offset_addr = NULL;
714
715
0
  YR_ARENA_REF instruction_ref = YR_ARENA_NULL_REF;
716
0
  YR_ARENA_REF split_offset_ref;
717
0
  YR_ARENA_REF jmp_instruction_ref;
718
0
  YR_ARENA_REF jmp_offset_ref;
719
0
  YR_ARENA_REF repeat_start_args_ref;
720
721
0
  switch (re_node->type)
722
0
  {
723
0
  case RE_NODE_LITERAL:
724
0
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint8(
725
0
        emit_context,
726
0
        RE_OPCODE_LITERAL,
727
0
        re_node->value,
728
0
        &instruction_ref,
729
0
        NULL));
730
0
    break;
731
732
0
  case RE_NODE_NOT_LITERAL:
733
0
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint8(
734
0
        emit_context,
735
0
        RE_OPCODE_NOT_LITERAL,
736
0
        re_node->value,
737
0
        &instruction_ref,
738
0
        NULL));
739
0
    break;
740
741
0
  case RE_NODE_MASKED_LITERAL:
742
0
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
743
0
        emit_context,
744
0
        RE_OPCODE_MASKED_LITERAL,
745
0
        re_node->mask << 8 | re_node->value,
746
0
        &instruction_ref,
747
0
        NULL));
748
0
    break;
749
750
0
  case RE_NODE_MASKED_NOT_LITERAL:
751
0
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
752
0
        emit_context,
753
0
        RE_OPCODE_MASKED_NOT_LITERAL,
754
0
        re_node->mask << 8 | re_node->value,
755
0
        &instruction_ref,
756
0
        NULL));
757
0
    break;
758
759
0
  case RE_NODE_WORD_CHAR:
760
0
    FAIL_ON_ERROR(
761
0
        _yr_emit_inst(emit_context, RE_OPCODE_WORD_CHAR, &instruction_ref));
762
0
    break;
763
764
0
  case RE_NODE_NON_WORD_CHAR:
765
0
    FAIL_ON_ERROR(
766
0
        _yr_emit_inst(emit_context, RE_OPCODE_NON_WORD_CHAR, &instruction_ref));
767
0
    break;
768
769
0
  case RE_NODE_WORD_BOUNDARY:
770
0
    FAIL_ON_ERROR(
771
0
        _yr_emit_inst(emit_context, RE_OPCODE_WORD_BOUNDARY, &instruction_ref));
772
0
    break;
773
774
0
  case RE_NODE_NON_WORD_BOUNDARY:
775
0
    FAIL_ON_ERROR(_yr_emit_inst(
776
0
        emit_context, RE_OPCODE_NON_WORD_BOUNDARY, &instruction_ref));
777
0
    break;
778
779
0
  case RE_NODE_SPACE:
780
0
    FAIL_ON_ERROR(
781
0
        _yr_emit_inst(emit_context, RE_OPCODE_SPACE, &instruction_ref));
782
0
    break;
783
784
0
  case RE_NODE_NON_SPACE:
785
0
    FAIL_ON_ERROR(
786
0
        _yr_emit_inst(emit_context, RE_OPCODE_NON_SPACE, &instruction_ref));
787
0
    break;
788
789
0
  case RE_NODE_DIGIT:
790
0
    FAIL_ON_ERROR(
791
0
        _yr_emit_inst(emit_context, RE_OPCODE_DIGIT, &instruction_ref));
792
0
    break;
793
794
0
  case RE_NODE_NON_DIGIT:
795
0
    FAIL_ON_ERROR(
796
0
        _yr_emit_inst(emit_context, RE_OPCODE_NON_DIGIT, &instruction_ref));
797
0
    break;
798
799
0
  case RE_NODE_ANY:
800
0
    FAIL_ON_ERROR(_yr_emit_inst(emit_context, RE_OPCODE_ANY, &instruction_ref));
801
0
    break;
802
803
0
  case RE_NODE_CLASS:
804
0
    FAIL_ON_ERROR(
805
0
        _yr_emit_inst(emit_context, RE_OPCODE_CLASS, &instruction_ref));
806
807
0
    FAIL_ON_ERROR(yr_arena_write_data(
808
0
        emit_context->arena,
809
0
        YR_RE_CODE_SECTION,
810
0
        re_node->re_class,
811
0
        sizeof(*re_node->re_class),
812
0
        NULL));
813
0
    break;
814
815
0
  case RE_NODE_ANCHOR_START:
816
0
    FAIL_ON_ERROR(_yr_emit_inst(
817
0
        emit_context, RE_OPCODE_MATCH_AT_START, &instruction_ref));
818
0
    break;
819
820
0
  case RE_NODE_ANCHOR_END:
821
0
    FAIL_ON_ERROR(
822
0
        _yr_emit_inst(emit_context, RE_OPCODE_MATCH_AT_END, &instruction_ref));
823
0
    break;
824
825
0
  case RE_NODE_CONCAT:
826
0
    FAIL_ON_ERROR(_yr_re_emit(
827
0
        emit_context,
828
0
        (flags & EMIT_BACKWARDS) ? re_node->children_tail
829
0
                                 : re_node->children_head,
830
0
        flags,
831
0
        &instruction_ref));
832
833
0
    if (flags & EMIT_BACKWARDS)
834
0
      child = re_node->children_tail->prev_sibling;
835
0
    else
836
0
      child = re_node->children_head->next_sibling;
837
838
0
    while (child != NULL)
839
0
    {
840
0
      FAIL_ON_ERROR(_yr_re_emit(emit_context, child, flags, NULL));
841
842
0
      child = (flags & EMIT_BACKWARDS) ? child->prev_sibling
843
0
                                       : child->next_sibling;
844
0
    }
845
0
    break;
846
847
0
  case RE_NODE_PLUS:
848
    // Code for e+ looks like:
849
    //
850
    //          L1: code for e
851
    //              split L1, L2
852
    //          L2:
853
    //
854
0
    FAIL_ON_ERROR(_yr_re_emit(
855
0
        emit_context, re_node->children_head, flags, &instruction_ref));
856
857
0
    bookmark_1 = yr_arena_get_current_offset(
858
0
        emit_context->arena, YR_RE_CODE_SECTION);
859
860
0
    if (instruction_ref.offset - bookmark_1 < INT16_MIN)
861
0
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
862
863
0
    jmp_offset = (int16_t) (instruction_ref.offset - bookmark_1);
864
865
0
    FAIL_ON_ERROR(_yr_emit_split(
866
0
        emit_context,
867
0
        re_node->greedy ? RE_OPCODE_SPLIT_B : RE_OPCODE_SPLIT_A,
868
0
        jmp_offset,
869
0
        NULL,
870
0
        NULL));
871
872
0
    break;
873
874
0
  case RE_NODE_STAR:
875
    // Code for e* looks like:
876
    //
877
    //          L1: split L1, L2
878
    //              code for e
879
    //              jmp L1
880
    //          L2:
881
0
    FAIL_ON_ERROR(_yr_emit_split(
882
0
        emit_context,
883
0
        re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
884
0
        0,
885
0
        &instruction_ref,
886
0
        &split_offset_ref));
887
888
0
    FAIL_ON_ERROR(
889
0
        _yr_re_emit(emit_context, re_node->children_head, flags, NULL));
890
891
0
    bookmark_1 = yr_arena_get_current_offset(
892
0
        emit_context->arena, YR_RE_CODE_SECTION);
893
894
0
    if (instruction_ref.offset - bookmark_1 < INT16_MIN)
895
0
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
896
897
0
    jmp_offset = (int16_t) (instruction_ref.offset - bookmark_1);
898
899
    // Emit jump with offset set to 0.
900
901
0
    FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
902
0
        emit_context, RE_OPCODE_JUMP, jmp_offset, NULL, NULL));
903
904
0
    bookmark_1 = yr_arena_get_current_offset(
905
0
        emit_context->arena, YR_RE_CODE_SECTION);
906
907
0
    if (bookmark_1 - instruction_ref.offset > INT16_MAX)
908
0
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
909
910
0
    jmp_offset = (int16_t) (bookmark_1 - instruction_ref.offset);
911
912
    // Update split offset.
913
0
    split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
914
0
        emit_context->arena, &split_offset_ref);
915
916
0
    memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset));
917
0
    break;
918
919
0
  case RE_NODE_ALT:
920
    // Code for e1|e2 looks like:
921
    //
922
    //              split L1, L2
923
    //          L1: code for e1
924
    //              jmp L3
925
    //          L2: code for e2
926
    //          L3:
927
928
    // Emit a split instruction with offset set to 0 temporarily. Offset
929
    // will be updated after we know the size of the code generated for
930
    // the left node (e1).
931
932
0
    FAIL_ON_ERROR(_yr_emit_split(
933
0
        emit_context,
934
0
        RE_OPCODE_SPLIT_A,
935
0
        0,
936
0
        &instruction_ref,
937
0
        &split_offset_ref));
938
939
0
    FAIL_ON_ERROR(
940
0
        _yr_re_emit(emit_context, re_node->children_head, flags, NULL));
941
942
    // Emit jump with offset set to 0.
943
944
0
    FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
945
0
        emit_context,
946
0
        RE_OPCODE_JUMP,
947
0
        0,
948
0
        &jmp_instruction_ref,
949
0
        &jmp_offset_ref));
950
951
0
    bookmark_1 = yr_arena_get_current_offset(
952
0
        emit_context->arena, YR_RE_CODE_SECTION);
953
954
0
    if (bookmark_1 - instruction_ref.offset > INT16_MAX)
955
0
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
956
957
0
    jmp_offset = (int16_t) (bookmark_1 - instruction_ref.offset);
958
959
    // Update split offset.
960
0
    split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
961
0
        emit_context->arena, &split_offset_ref);
962
963
0
    memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset));
964
965
0
    FAIL_ON_ERROR(
966
0
        _yr_re_emit(emit_context, re_node->children_tail, flags, NULL));
967
968
0
    bookmark_1 = yr_arena_get_current_offset(
969
0
        emit_context->arena, YR_RE_CODE_SECTION);
970
971
0
    if (bookmark_1 - jmp_instruction_ref.offset > INT16_MAX)
972
0
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
973
974
0
    jmp_offset = (int16_t) (bookmark_1 - jmp_instruction_ref.offset);
975
976
    // Update offset for jmp instruction.
977
0
    jmp_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
978
0
        emit_context->arena, &jmp_offset_ref);
979
980
0
    memcpy(jmp_offset_addr, &jmp_offset, sizeof(jmp_offset));
981
0
    break;
982
983
0
  case RE_NODE_RANGE_ANY:
984
0
    repeat_any_args.min = re_node->start;
985
0
    repeat_any_args.max = re_node->end;
986
987
0
    FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
988
0
        emit_context,
989
0
        re_node->greedy ? RE_OPCODE_REPEAT_ANY_GREEDY
990
0
                        : RE_OPCODE_REPEAT_ANY_UNGREEDY,
991
0
        &repeat_any_args,
992
0
        sizeof(repeat_any_args),
993
0
        &instruction_ref,
994
0
        NULL));
995
996
0
    break;
997
998
0
  case RE_NODE_RANGE:
999
    // Code for e{n,m} looks like:
1000
    //
1001
    //            code for e              ---   prolog
1002
    //            repeat_start n, m, L1   --+
1003
    //        L0: code for e                |   repeat
1004
    //            repeat_end n, m, L0     --+
1005
    //        L1: split L2, L3            ---   split
1006
    //        L2: code for e              ---   epilog
1007
    //        L3:
1008
    //
1009
    // Not all sections (prolog, repeat, split and epilog) are generated in all
1010
    // cases, it depends on the values of n and m. The following table shows
1011
    // which sections are generated for the first few values of n and m.
1012
    //
1013
    //        n,m   prolog  repeat      split  epilog
1014
    //                      (min,max)
1015
    //        ---------------------------------------
1016
    //        0,0     -       -           -      -
1017
    //        0,1     -       -           X      X
1018
    //        0,2     -       0,1         X      X
1019
    //        0,3     -       0,2         X      X
1020
    //        0,M     -       0,M-1       X      X
1021
    //
1022
    //        1,1     X       -           -      -
1023
    //        1,2     X       -           X      X
1024
    //        1,3     X       0,1         X      X
1025
    //        1,4     X       1,2         X      X
1026
    //        1,M     X       1,M-2       X      X
1027
    //
1028
    //        2,2     X       -           -      X
1029
    //        2,3     X       1,1         X      X
1030
    //        2,4     X       1,2         X      X
1031
    //        2,M     X       1,M-2       X      X
1032
    //
1033
    //        3,3     X       1,1         -      X
1034
    //        3,4     X       2,2         X      X
1035
    //        3,M     X       2,M-2       X      X
1036
    //
1037
    //        4,4     X       2,2         -      X
1038
    //        4,5     X       3,3         X      X
1039
    //        4,M     X       3,M-2       X      X
1040
    //
1041
    // The code can't consists simply in the repeat section, the prolog and
1042
    // epilog are required because we can't have atoms pointing to code inside
1043
    // the repeat loop. Atoms' forwards_code will point to code in the prolog
1044
    // and backwards_code will point to code in the epilog (or in prolog if
1045
    // epilog wasn't generated, like in n=1,m=1)
1046
1047
0
    emit_prolog = re_node->start > 0;
1048
0
    emit_repeat = re_node->end > re_node->start + 1 || re_node->end > 2;
1049
0
    emit_split = re_node->end > re_node->start;
1050
0
    emit_epilog = re_node->end > re_node->start || re_node->end > 1;
1051
1052
0
    if (emit_prolog)
1053
0
    {
1054
0
      FAIL_ON_ERROR(_yr_re_emit(
1055
0
          emit_context, re_node->children_head, flags, &instruction_ref));
1056
0
    }
1057
1058
0
    if (emit_repeat)
1059
0
    {
1060
0
      repeat_args.min = re_node->start;
1061
0
      repeat_args.max = re_node->end;
1062
1063
0
      if (emit_prolog)
1064
0
      {
1065
0
        repeat_args.max--;
1066
0
        repeat_args.min--;
1067
0
      }
1068
1069
0
      if (emit_split)
1070
0
      {
1071
0
        repeat_args.max--;
1072
0
      }
1073
0
      else
1074
0
      {
1075
0
        repeat_args.min--;
1076
0
        repeat_args.max--;
1077
0
      }
1078
1079
0
      repeat_args.offset = 0;
1080
1081
0
      bookmark_1 = yr_arena_get_current_offset(
1082
0
          emit_context->arena, YR_RE_CODE_SECTION);
1083
1084
0
      FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
1085
0
          emit_context,
1086
0
          re_node->greedy ? RE_OPCODE_REPEAT_START_GREEDY
1087
0
                          : RE_OPCODE_REPEAT_START_UNGREEDY,
1088
0
          &repeat_args,
1089
0
          sizeof(repeat_args),
1090
0
          emit_prolog ? NULL : &instruction_ref,
1091
0
          &repeat_start_args_ref));
1092
1093
0
      bookmark_2 = yr_arena_get_current_offset(
1094
0
          emit_context->arena, YR_RE_CODE_SECTION);
1095
1096
0
      FAIL_ON_ERROR(_yr_re_emit(
1097
0
          emit_context,
1098
0
          re_node->children_head,
1099
0
          flags | EMIT_DONT_SET_FORWARDS_CODE | EMIT_DONT_SET_BACKWARDS_CODE,
1100
0
          NULL));
1101
1102
0
      bookmark_3 = yr_arena_get_current_offset(
1103
0
          emit_context->arena, YR_RE_CODE_SECTION);
1104
1105
0
      if (bookmark_2 - bookmark_3 < INT32_MIN)
1106
0
        return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1107
1108
0
      repeat_args.offset = (int32_t) (bookmark_2 - bookmark_3);
1109
1110
0
      FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
1111
0
          emit_context,
1112
0
          re_node->greedy ? RE_OPCODE_REPEAT_END_GREEDY
1113
0
                          : RE_OPCODE_REPEAT_END_UNGREEDY,
1114
0
          &repeat_args,
1115
0
          sizeof(repeat_args),
1116
0
          NULL,
1117
0
          NULL));
1118
1119
0
      bookmark_4 = yr_arena_get_current_offset(
1120
0
          emit_context->arena, YR_RE_CODE_SECTION);
1121
1122
0
      repeat_start_args_addr = (RE_REPEAT_ARGS*) yr_arena_ref_to_ptr(
1123
0
          emit_context->arena, &repeat_start_args_ref);
1124
1125
0
      if (bookmark_4 - bookmark_1 > INT32_MAX)
1126
0
        return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1127
1128
0
      repeat_start_args_addr->offset = (int32_t) (bookmark_4 - bookmark_1);
1129
0
    }
1130
1131
0
    if (emit_split)
1132
0
    {
1133
0
      bookmark_1 = yr_arena_get_current_offset(
1134
0
          emit_context->arena, YR_RE_CODE_SECTION);
1135
1136
0
      FAIL_ON_ERROR(_yr_emit_split(
1137
0
          emit_context,
1138
0
          re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
1139
0
          0,
1140
0
          NULL,
1141
0
          &split_offset_ref));
1142
0
    }
1143
1144
0
    if (emit_epilog)
1145
0
    {
1146
0
      FAIL_ON_ERROR(_yr_re_emit(
1147
0
          emit_context,
1148
0
          re_node->children_head,
1149
0
          emit_prolog ? flags | EMIT_DONT_SET_FORWARDS_CODE : flags,
1150
0
          emit_prolog || emit_repeat ? NULL : &instruction_ref));
1151
0
    }
1152
1153
0
    if (emit_split)
1154
0
    {
1155
0
      bookmark_2 = yr_arena_get_current_offset(
1156
0
          emit_context->arena, YR_RE_CODE_SECTION);
1157
1158
0
      if (bookmark_2 - bookmark_1 > INT16_MAX)
1159
0
        return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1160
1161
0
      split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
1162
0
          emit_context->arena, &split_offset_ref);
1163
1164
0
      jmp_offset = (int16_t) (bookmark_2 - bookmark_1);
1165
1166
0
      memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset));
1167
0
    }
1168
1169
0
    break;
1170
0
  }
1171
1172
0
  if (flags & EMIT_BACKWARDS)
1173
0
  {
1174
0
    if (!(flags & EMIT_DONT_SET_BACKWARDS_CODE))
1175
0
    {
1176
0
      re_node->backward_code_ref.buffer_id = YR_RE_CODE_SECTION;
1177
0
      re_node->backward_code_ref.offset = yr_arena_get_current_offset(
1178
0
          emit_context->arena, YR_RE_CODE_SECTION);
1179
0
    }
1180
0
  }
1181
0
  else
1182
0
  {
1183
0
    if (!(flags & EMIT_DONT_SET_FORWARDS_CODE))
1184
0
    {
1185
0
      re_node->forward_code_ref = instruction_ref;
1186
0
    }
1187
0
  }
1188
1189
0
  if (code_ref != NULL)
1190
0
    *code_ref = instruction_ref;
1191
1192
0
  return ERROR_SUCCESS;
1193
0
}
1194
1195
int yr_re_ast_emit_code(RE_AST* re_ast, YR_ARENA* arena, int backwards_code)
1196
0
{
1197
0
  RE_EMIT_CONTEXT emit_context;
1198
1199
  // Emit code for matching the regular expressions forwards.
1200
0
  emit_context.arena = arena;
1201
0
  emit_context.next_split_id = 0;
1202
1203
0
  FAIL_ON_ERROR(_yr_re_emit(
1204
0
      &emit_context,
1205
0
      re_ast->root_node,
1206
0
      backwards_code ? EMIT_BACKWARDS : 0,
1207
0
      NULL));
1208
1209
0
  FAIL_ON_ERROR(_yr_emit_inst(&emit_context, RE_OPCODE_MATCH, NULL));
1210
1211
0
  return ERROR_SUCCESS;
1212
0
}
1213
1214
static int _yr_re_fiber_create(RE_FIBER_POOL* fiber_pool, RE_FIBER** new_fiber)
1215
0
{
1216
0
  RE_FIBER* fiber;
1217
1218
0
  if (fiber_pool->fibers.head != NULL)
1219
0
  {
1220
0
    fiber = fiber_pool->fibers.head;
1221
0
    fiber_pool->fibers.head = fiber->next;
1222
1223
0
    if (fiber_pool->fibers.tail == fiber)
1224
0
      fiber_pool->fibers.tail = NULL;
1225
0
  }
1226
0
  else
1227
0
  {
1228
0
    if (fiber_pool->fiber_count == RE_MAX_FIBERS)
1229
0
      return ERROR_TOO_MANY_RE_FIBERS;
1230
1231
0
    fiber = (RE_FIBER*) yr_malloc(sizeof(RE_FIBER));
1232
1233
0
    if (fiber == NULL)
1234
0
      return ERROR_INSUFFICIENT_MEMORY;
1235
1236
0
    fiber_pool->fiber_count++;
1237
0
  }
1238
1239
0
  fiber->ip = NULL;
1240
0
  fiber->sp = -1;
1241
0
  fiber->rc = -1;
1242
0
  fiber->next = NULL;
1243
0
  fiber->prev = NULL;
1244
1245
0
  *new_fiber = fiber;
1246
1247
0
  return ERROR_SUCCESS;
1248
0
}
1249
1250
////////////////////////////////////////////////////////////////////////////////
1251
// Appends 'fiber' to 'fiber_list'
1252
//
1253
static void _yr_re_fiber_append(RE_FIBER_LIST* fiber_list, RE_FIBER* fiber)
1254
0
{
1255
0
  assert(fiber->prev == NULL);
1256
0
  assert(fiber->next == NULL);
1257
1258
0
  fiber->prev = fiber_list->tail;
1259
1260
0
  if (fiber_list->tail != NULL)
1261
0
    fiber_list->tail->next = fiber;
1262
1263
0
  fiber_list->tail = fiber;
1264
1265
0
  if (fiber_list->head == NULL)
1266
0
    fiber_list->head = fiber;
1267
1268
0
  assert(fiber_list->tail->next == NULL);
1269
0
  assert(fiber_list->head->prev == NULL);
1270
0
}
1271
1272
////////////////////////////////////////////////////////////////////////////////
1273
// Verifies if a fiber with the same properties (ip, rc, sp, and stack values)
1274
// than 'target_fiber' exists in 'fiber_list'. The list is iterated from
1275
// the start until 'last_fiber' (inclusive). Fibers past 'last_fiber' are not
1276
// taken into account.
1277
//
1278
static int _yr_re_fiber_exists(
1279
    RE_FIBER_LIST* fiber_list,
1280
    RE_FIBER* target_fiber,
1281
    RE_FIBER* last_fiber)
1282
0
{
1283
0
  RE_FIBER* fiber = fiber_list->head;
1284
1285
0
  int equal_stacks;
1286
0
  int i;
1287
1288
0
  if (last_fiber == NULL)
1289
0
    return false;
1290
1291
0
  while (fiber != last_fiber->next)
1292
0
  {
1293
0
    if (fiber->ip == target_fiber->ip && fiber->sp == target_fiber->sp &&
1294
0
        fiber->rc == target_fiber->rc)
1295
0
    {
1296
0
      equal_stacks = true;
1297
1298
0
      for (i = 0; i <= fiber->sp; i++)
1299
0
      {
1300
0
        if (fiber->stack[i] != target_fiber->stack[i])
1301
0
        {
1302
0
          equal_stacks = false;
1303
0
          break;
1304
0
        }
1305
0
      }
1306
1307
0
      if (equal_stacks)
1308
0
        return true;
1309
0
    }
1310
1311
0
    fiber = fiber->next;
1312
0
  }
1313
1314
0
  return false;
1315
0
}
1316
1317
////////////////////////////////////////////////////////////////////////////////
1318
// Clones a fiber in fiber_list and inserts the cloned fiber just after.
1319
// the original one. If fiber_list is:
1320
//
1321
//   f1 -> f2 -> f3 -> f4
1322
//
1323
// Splitting f2 will result in:
1324
//
1325
//   f1 -> f2 -> cloned f2 -> f3 -> f4
1326
//
1327
static int _yr_re_fiber_split(
1328
    RE_FIBER_LIST* fiber_list,
1329
    RE_FIBER_POOL* fiber_pool,
1330
    RE_FIBER* fiber,
1331
    RE_FIBER** new_fiber)
1332
0
{
1333
0
  int32_t i;
1334
1335
0
  FAIL_ON_ERROR(_yr_re_fiber_create(fiber_pool, new_fiber));
1336
1337
0
  (*new_fiber)->sp = fiber->sp;
1338
0
  (*new_fiber)->ip = fiber->ip;
1339
0
  (*new_fiber)->rc = fiber->rc;
1340
1341
0
  for (i = 0; i <= fiber->sp; i++) (*new_fiber)->stack[i] = fiber->stack[i];
1342
1343
0
  (*new_fiber)->next = fiber->next;
1344
0
  (*new_fiber)->prev = fiber;
1345
1346
0
  if (fiber->next != NULL)
1347
0
    fiber->next->prev = *new_fiber;
1348
1349
0
  fiber->next = *new_fiber;
1350
1351
0
  if (fiber_list->tail == fiber)
1352
0
    fiber_list->tail = *new_fiber;
1353
1354
0
  assert(fiber_list->tail->next == NULL);
1355
0
  assert(fiber_list->head->prev == NULL);
1356
1357
0
  return ERROR_SUCCESS;
1358
0
}
1359
1360
////////////////////////////////////////////////////////////////////////////////
1361
// Kills a given fiber by removing it from the fiber list and putting it in the
1362
// fiber pool.
1363
//
1364
static RE_FIBER* _yr_re_fiber_kill(
1365
    RE_FIBER_LIST* fiber_list,
1366
    RE_FIBER_POOL* fiber_pool,
1367
    RE_FIBER* fiber)
1368
0
{
1369
0
  RE_FIBER* next_fiber = fiber->next;
1370
1371
0
  if (fiber->prev != NULL)
1372
0
    fiber->prev->next = next_fiber;
1373
1374
0
  if (next_fiber != NULL)
1375
0
    next_fiber->prev = fiber->prev;
1376
1377
0
  if (fiber_pool->fibers.tail != NULL)
1378
0
    fiber_pool->fibers.tail->next = fiber;
1379
1380
0
  if (fiber_list->tail == fiber)
1381
0
    fiber_list->tail = fiber->prev;
1382
1383
0
  if (fiber_list->head == fiber)
1384
0
    fiber_list->head = next_fiber;
1385
1386
0
  fiber->next = NULL;
1387
0
  fiber->prev = fiber_pool->fibers.tail;
1388
0
  fiber_pool->fibers.tail = fiber;
1389
1390
0
  if (fiber_pool->fibers.head == NULL)
1391
0
    fiber_pool->fibers.head = fiber;
1392
1393
0
  return next_fiber;
1394
0
}
1395
1396
////////////////////////////////////////////////////////////////////////////////
1397
// Kills all fibers from the given one up to the end of the fiber list.
1398
//
1399
static void _yr_re_fiber_kill_tail(
1400
    RE_FIBER_LIST* fiber_list,
1401
    RE_FIBER_POOL* fiber_pool,
1402
    RE_FIBER* fiber)
1403
0
{
1404
0
  RE_FIBER* prev_fiber = fiber->prev;
1405
1406
0
  if (prev_fiber != NULL)
1407
0
    prev_fiber->next = NULL;
1408
1409
0
  fiber->prev = fiber_pool->fibers.tail;
1410
1411
0
  if (fiber_pool->fibers.tail != NULL)
1412
0
    fiber_pool->fibers.tail->next = fiber;
1413
1414
0
  fiber_pool->fibers.tail = fiber_list->tail;
1415
0
  fiber_list->tail = prev_fiber;
1416
1417
0
  if (fiber_list->head == fiber)
1418
0
    fiber_list->head = NULL;
1419
1420
0
  if (fiber_pool->fibers.head == NULL)
1421
0
    fiber_pool->fibers.head = fiber;
1422
0
}
1423
1424
////////////////////////////////////////////////////////////////////////////////
1425
// Kills all fibers in the fiber list.
1426
//
1427
static void _yr_re_fiber_kill_all(
1428
    RE_FIBER_LIST* fiber_list,
1429
    RE_FIBER_POOL* fiber_pool)
1430
0
{
1431
0
  if (fiber_list->head != NULL)
1432
0
    _yr_re_fiber_kill_tail(fiber_list, fiber_pool, fiber_list->head);
1433
0
}
1434
1435
////////////////////////////////////////////////////////////////////////////////
1436
// Executes a fiber until reaching an "matching" instruction. A "matching"
1437
// instruction is one that actually reads a byte from the input and performs
1438
// some matching. If the fiber reaches a split instruction, the new fiber is
1439
// also synced.
1440
//
1441
static int _yr_re_fiber_sync(
1442
    RE_FIBER_LIST* fiber_list,
1443
    RE_FIBER_POOL* fiber_pool,
1444
    RE_FIBER* fiber_to_sync)
1445
0
{
1446
  // A array for keeping track of which split instructions has been already
1447
  // executed. Each split instruction within a regexp has an associated ID
1448
  // between 0 and RE_MAX_SPLIT_ID-1. Keeping track of executed splits is
1449
  // required to avoid infinite loops in regexps like (a*)* or (a|)*
1450
1451
0
  RE_SPLIT_ID_TYPE splits_executed[RE_MAX_SPLIT_ID];
1452
0
  RE_SPLIT_ID_TYPE splits_executed_count = 0;
1453
0
  RE_SPLIT_ID_TYPE split_id, splits_executed_idx;
1454
1455
0
  int split_already_executed;
1456
1457
0
  RE_REPEAT_ARGS* repeat_args;
1458
0
  RE_REPEAT_ANY_ARGS* repeat_any_args;
1459
1460
0
  RE_FIBER* fiber;
1461
0
  RE_FIBER* last;
1462
0
  RE_FIBER* next;
1463
0
  RE_FIBER* branch_a;
1464
0
  RE_FIBER* branch_b;
1465
1466
0
  fiber = fiber_to_sync;
1467
0
  last = fiber_to_sync->next;
1468
1469
0
  while (fiber != last)
1470
0
  {
1471
0
    int16_t jmp_len;
1472
0
    uint8_t opcode = *fiber->ip;
1473
1474
0
    switch (opcode)
1475
0
    {
1476
0
    case RE_OPCODE_SPLIT_A:
1477
0
    case RE_OPCODE_SPLIT_B:
1478
1479
0
      split_id = *(RE_SPLIT_ID_TYPE*) (fiber->ip + 1);
1480
0
      split_already_executed = false;
1481
1482
0
      for (splits_executed_idx = 0; splits_executed_idx < splits_executed_count;
1483
0
           splits_executed_idx++)
1484
0
      {
1485
0
        if (split_id == splits_executed[splits_executed_idx])
1486
0
        {
1487
0
          split_already_executed = true;
1488
0
          break;
1489
0
        }
1490
0
      }
1491
1492
0
      if (split_already_executed)
1493
0
      {
1494
0
        fiber = _yr_re_fiber_kill(fiber_list, fiber_pool, fiber);
1495
0
      }
1496
0
      else
1497
0
      {
1498
0
        branch_a = fiber;
1499
1500
0
        FAIL_ON_ERROR(
1501
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1502
1503
        // With RE_OPCODE_SPLIT_A the current fiber continues at the next
1504
        // instruction in the stream (branch A), while the newly created
1505
        // fiber starts at the address indicated by the instruction (branch B)
1506
        // RE_OPCODE_SPLIT_B has the opposite behavior.
1507
1508
0
        if (opcode == RE_OPCODE_SPLIT_B)
1509
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1510
1511
        // Branch A continues at the next instruction
1512
0
        branch_a->ip += (sizeof(RE_SPLIT_ID_TYPE) + 3);
1513
1514
        // Branch B adds the offset indicated by the split instruction.
1515
0
        jmp_len = yr_unaligned_i16(branch_b->ip + 1 + sizeof(RE_SPLIT_ID_TYPE));
1516
1517
0
        branch_b->ip += jmp_len;
1518
1519
#ifdef YR_PARANOID_MODE
1520
        // In normal conditions this should never happen. But with compiled
1521
        // rules that has been hand-crafted by a malicious actor this could
1522
        // happen.
1523
        if (splits_executed_count >= RE_MAX_SPLIT_ID)
1524
          return ERROR_INTERNAL_FATAL_ERROR;
1525
#endif
1526
1527
0
        splits_executed[splits_executed_count] = split_id;
1528
0
        splits_executed_count++;
1529
0
      }
1530
1531
0
      break;
1532
1533
0
    case RE_OPCODE_REPEAT_START_GREEDY:
1534
0
    case RE_OPCODE_REPEAT_START_UNGREEDY:
1535
1536
0
      repeat_args = (RE_REPEAT_ARGS*) (fiber->ip + 1);
1537
0
      assert(repeat_args->max > 0);
1538
0
      branch_a = fiber;
1539
1540
0
      if (repeat_args->min == 0)
1541
0
      {
1542
0
        FAIL_ON_ERROR(
1543
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1544
1545
0
        if (opcode == RE_OPCODE_REPEAT_START_UNGREEDY)
1546
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1547
1548
0
        branch_b->ip += repeat_args->offset;
1549
0
      }
1550
1551
0
      branch_a->stack[++branch_a->sp] = 0;
1552
0
      branch_a->ip += (1 + sizeof(RE_REPEAT_ARGS));
1553
0
      break;
1554
1555
0
    case RE_OPCODE_REPEAT_END_GREEDY:
1556
0
    case RE_OPCODE_REPEAT_END_UNGREEDY:
1557
1558
0
      repeat_args = (RE_REPEAT_ARGS*) (fiber->ip + 1);
1559
0
      fiber->stack[fiber->sp]++;
1560
1561
0
      if (fiber->stack[fiber->sp] < repeat_args->min)
1562
0
      {
1563
0
        fiber->ip += repeat_args->offset;
1564
0
        break;
1565
0
      }
1566
1567
0
      branch_a = fiber;
1568
1569
0
      if (fiber->stack[fiber->sp] < repeat_args->max)
1570
0
      {
1571
0
        FAIL_ON_ERROR(
1572
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1573
1574
0
        if (opcode == RE_OPCODE_REPEAT_END_GREEDY)
1575
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1576
1577
0
        branch_b->ip += repeat_args->offset;
1578
0
      }
1579
1580
0
      branch_a->sp--;
1581
0
      branch_a->ip += (1 + sizeof(RE_REPEAT_ARGS));
1582
0
      break;
1583
1584
0
    case RE_OPCODE_REPEAT_ANY_GREEDY:
1585
0
    case RE_OPCODE_REPEAT_ANY_UNGREEDY:
1586
1587
0
      repeat_any_args = (RE_REPEAT_ANY_ARGS*) (fiber->ip + 1);
1588
1589
      // If repetition counter (rc) is -1 it means that we are reaching this
1590
      // instruction from the previous one in the instructions stream. In
1591
      // this case let's initialize the counter to 0 and start looping.
1592
1593
0
      if (fiber->rc == -1)
1594
0
        fiber->rc = 0;
1595
1596
0
      if (fiber->rc < repeat_any_args->min)
1597
0
      {
1598
        // Increase repetition counter and continue with next fiber. The
1599
        // instruction pointer for this fiber is not incremented yet, this
1600
        // fiber spins in this same instruction until reaching the minimum
1601
        // number of repetitions.
1602
1603
0
        fiber->rc++;
1604
0
        fiber = fiber->next;
1605
0
      }
1606
0
      else if (fiber->rc < repeat_any_args->max)
1607
0
      {
1608
        // Once the minimum number of repetitions are matched one fiber
1609
        // remains spinning in this instruction until reaching the maximum
1610
        // number of repetitions while new fibers are created. New fibers
1611
        // start executing at the next instruction.
1612
1613
0
        next = fiber->next;
1614
0
        branch_a = fiber;
1615
1616
0
        FAIL_ON_ERROR(
1617
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1618
1619
0
        if (opcode == RE_OPCODE_REPEAT_ANY_UNGREEDY)
1620
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1621
1622
0
        branch_a->rc++;
1623
0
        branch_b->ip += (1 + sizeof(RE_REPEAT_ANY_ARGS));
1624
0
        branch_b->rc = -1;
1625
1626
0
        FAIL_ON_ERROR(_yr_re_fiber_sync(fiber_list, fiber_pool, branch_b));
1627
1628
0
        fiber = next;
1629
0
      }
1630
0
      else
1631
0
      {
1632
        // When the maximum number of repetitions is reached the fiber keeps
1633
        // executing at the next instruction. The repetition counter is set
1634
        // to -1 indicating that we are not spinning in a repeat instruction
1635
        // anymore.
1636
1637
0
        fiber->ip += (1 + sizeof(RE_REPEAT_ANY_ARGS));
1638
0
        fiber->rc = -1;
1639
0
      }
1640
1641
0
      break;
1642
1643
0
    case RE_OPCODE_JUMP:
1644
0
      jmp_len = yr_unaligned_i16(fiber->ip + 1);
1645
0
      fiber->ip += jmp_len;
1646
0
      break;
1647
1648
0
    default:
1649
0
      fiber = fiber->next;
1650
0
    }
1651
0
  }
1652
1653
0
  return ERROR_SUCCESS;
1654
0
}
1655
1656
////////////////////////////////////////////////////////////////////////////////
1657
// Executes a regular expression. The specified regular expression will try to
1658
// match the data starting at the address specified by "input". The "input"
1659
// pointer can point to any address inside a memory buffer. Arguments
1660
// "input_forwards_size" and "input_backwards_size" indicate how many bytes
1661
// can be accessible starting at "input" and going forwards and backwards
1662
// respectively.
1663
//
1664
//   <--- input_backwards_size -->|<----------- input_forwards_size -------->
1665
//  |--------  memory buffer  -----------------------------------------------|
1666
//                                ^
1667
//                              input
1668
//
1669
// Args:
1670
//   YR_SCAN_CONTEXT *context         - Scan context.
1671
//   const uint8_t* code              - Regexp code be executed
1672
//   const uint8_t* input             - Pointer to input data
1673
//   size_t input_forwards_size       - Number of accessible bytes starting at
1674
//                                      "input" and going forwards.
1675
//   size_t input_backwards_size      - Number of accessible bytes starting at
1676
//                                      "input" and going backwards
1677
//   int flags                        - Flags:
1678
//      RE_FLAGS_SCAN
1679
//      RE_FLAGS_BACKWARDS
1680
//      RE_FLAGS_EXHAUSTIVE
1681
//      RE_FLAGS_WIDE
1682
//      RE_FLAGS_NO_CASE
1683
//      RE_FLAGS_DOT_ALL
1684
//   RE_MATCH_CALLBACK_FUNC callback  - Callback function
1685
//   void* callback_args              - Callback argument
1686
//   int*  matches                    - Pointer to an integer receiving the
1687
//                                      number of matching bytes. Notice that
1688
//                                      0 means a zero-length match, while no
1689
//                                      matches is -1.
1690
// Returns:
1691
//    ERROR_SUCCESS or any other error code.
1692
1693
int yr_re_exec(
1694
    YR_SCAN_CONTEXT* context,
1695
    const uint8_t* code,
1696
    const uint8_t* input_data,
1697
    size_t input_forwards_size,
1698
    size_t input_backwards_size,
1699
    int flags,
1700
    RE_MATCH_CALLBACK_FUNC callback,
1701
    void* callback_args,
1702
    int* matches)
1703
0
{
1704
0
  const uint8_t* input;
1705
0
  const uint8_t* ip;
1706
1707
0
  uint16_t opcode_args;
1708
0
  uint8_t mask;
1709
0
  uint8_t value;
1710
0
  uint8_t character_size;
1711
1712
0
  RE_FIBER_LIST fibers;
1713
0
  RE_FIBER* fiber;
1714
0
  RE_FIBER* next_fiber;
1715
1716
0
  int bytes_matched;
1717
0
  int max_bytes_matched;
1718
1719
0
  int match;
1720
0
  int input_incr;
1721
0
  int kill;
1722
0
  int action;
1723
1724
0
#define ACTION_NONE      0
1725
0
#define ACTION_CONTINUE  1
1726
0
#define ACTION_KILL      2
1727
0
#define ACTION_KILL_TAIL 3
1728
1729
0
#define prolog                                      \
1730
0
  {                                                 \
1731
0
    if ((bytes_matched >= max_bytes_matched) ||     \
1732
0
        (character_size == 2 && *(input + 1) != 0)) \
1733
0
    {                                               \
1734
0
      action = ACTION_KILL;                         \
1735
0
      break;                                        \
1736
0
    }                                               \
1737
0
  }
1738
1739
0
  if (matches != NULL)
1740
0
    *matches = -1;
1741
1742
0
  if (flags & RE_FLAGS_WIDE)
1743
0
    character_size = 2;
1744
0
  else
1745
0
    character_size = 1;
1746
1747
0
  input = input_data;
1748
0
  input_incr = character_size;
1749
1750
0
  if (flags & RE_FLAGS_BACKWARDS)
1751
0
  {
1752
    // Signedness conversion is sound as long as YR_RE_SCAN_LIMIT <= INT_MAX
1753
0
    max_bytes_matched = (int) yr_min(input_backwards_size, YR_RE_SCAN_LIMIT);
1754
0
    input -= character_size;
1755
0
    input_incr = -input_incr;
1756
0
  }
1757
0
  else
1758
0
  {
1759
    // Signedness conversion is sound as long as YR_RE_SCAN_LIMIT <= INT_MAX
1760
0
    max_bytes_matched = (int) yr_min(input_forwards_size, YR_RE_SCAN_LIMIT);
1761
0
  }
1762
1763
  // Round down max_bytes_matched to a multiple of character_size, this way if
1764
  // character_size is 2 and max_bytes_matched is odd we are ignoring the
1765
  // extra byte which can't match anyways.
1766
1767
0
  max_bytes_matched = max_bytes_matched - max_bytes_matched % character_size;
1768
0
  bytes_matched = 0;
1769
1770
0
  FAIL_ON_ERROR(_yr_re_fiber_create(&context->re_fiber_pool, &fiber));
1771
1772
0
  fiber->ip = code;
1773
0
  fibers.head = fiber;
1774
0
  fibers.tail = fiber;
1775
1776
0
  FAIL_ON_ERROR_WITH_CLEANUP(
1777
0
      _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
1778
0
      _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1779
1780
0
  while (fibers.head != NULL)
1781
0
  {
1782
0
    fiber = fibers.head;
1783
1784
0
    while (fiber != NULL)
1785
0
    {
1786
0
      next_fiber = fiber->next;
1787
1788
0
      if (_yr_re_fiber_exists(&fibers, fiber, fiber->prev))
1789
0
        _yr_re_fiber_kill(&fibers, &context->re_fiber_pool, fiber);
1790
1791
0
      fiber = next_fiber;
1792
0
    }
1793
1794
0
    fiber = fibers.head;
1795
1796
0
    while (fiber != NULL)
1797
0
    {
1798
0
      ip = fiber->ip;
1799
0
      action = ACTION_NONE;
1800
1801
0
      switch (*ip)
1802
0
      {
1803
0
      case RE_OPCODE_ANY:
1804
0
        prolog;
1805
0
        match = (flags & RE_FLAGS_DOT_ALL) || (*input != 0x0A);
1806
0
        action = match ? ACTION_NONE : ACTION_KILL;
1807
0
        fiber->ip += 1;
1808
0
        break;
1809
1810
0
      case RE_OPCODE_REPEAT_ANY_GREEDY:
1811
0
      case RE_OPCODE_REPEAT_ANY_UNGREEDY:
1812
0
        prolog;
1813
0
        match = (flags & RE_FLAGS_DOT_ALL) || (*input != 0x0A);
1814
0
        action = match ? ACTION_NONE : ACTION_KILL;
1815
1816
        // The instruction pointer is not incremented here. The current fiber
1817
        // spins in this instruction until reaching the required number of
1818
        // repetitions. The code controlling the number of repetitions is in
1819
        // _yr_re_fiber_sync.
1820
1821
0
        break;
1822
1823
0
      case RE_OPCODE_LITERAL:
1824
0
        prolog;
1825
0
        if (flags & RE_FLAGS_NO_CASE)
1826
0
          match = yr_lowercase[*input] == yr_lowercase[*(ip + 1)];
1827
0
        else
1828
0
          match = (*input == *(ip + 1));
1829
0
        action = match ? ACTION_NONE : ACTION_KILL;
1830
0
        fiber->ip += 2;
1831
0
        break;
1832
1833
0
      case RE_OPCODE_NOT_LITERAL:
1834
0
        prolog;
1835
1836
        // We don't need to take into account the case-insensitive
1837
        // case because this opcode is only used with hex strings,
1838
        // which can't be case-insensitive.
1839
1840
0
        match = (*input != *(ip + 1));
1841
0
        action = match ? ACTION_NONE : ACTION_KILL;
1842
0
        fiber->ip += 2;
1843
0
        break;
1844
1845
0
      case RE_OPCODE_MASKED_LITERAL:
1846
0
        prolog;
1847
0
        opcode_args = yr_unaligned_u16(ip + 1);
1848
0
        mask = opcode_args >> 8;
1849
0
        value = opcode_args & 0xFF;
1850
1851
        // We don't need to take into account the case-insensitive
1852
        // case because this opcode is only used with hex strings,
1853
        // which can't be case-insensitive.
1854
1855
0
        match = ((*input & mask) == value);
1856
0
        action = match ? ACTION_NONE : ACTION_KILL;
1857
0
        fiber->ip += 3;
1858
0
        break;
1859
1860
0
      case RE_OPCODE_MASKED_NOT_LITERAL:
1861
0
        prolog;
1862
0
        opcode_args = yr_unaligned_u16(ip + 1);
1863
0
        mask = opcode_args >> 8;
1864
0
        value = opcode_args & 0xFF;
1865
1866
        // We don't need to take into account the case-insensitive
1867
        // case because this opcode is only used with hex strings,
1868
        // which can't be case-insensitive.
1869
1870
0
        match = ((*input & mask) != value);
1871
0
        action = match ? ACTION_NONE : ACTION_KILL;
1872
0
        fiber->ip += 3;
1873
0
        break;
1874
1875
0
      case RE_OPCODE_CLASS:
1876
0
        prolog;
1877
0
        match = _yr_re_is_char_in_class(
1878
0
            (RE_CLASS*) (ip + 1), *input, flags & RE_FLAGS_NO_CASE);
1879
0
        action = match ? ACTION_NONE : ACTION_KILL;
1880
0
        fiber->ip += (sizeof(RE_CLASS) + 1);
1881
0
        break;
1882
1883
0
      case RE_OPCODE_WORD_CHAR:
1884
0
        prolog;
1885
0
        match = _yr_re_is_word_char(input, character_size);
1886
0
        action = match ? ACTION_NONE : ACTION_KILL;
1887
0
        fiber->ip += 1;
1888
0
        break;
1889
1890
0
      case RE_OPCODE_NON_WORD_CHAR:
1891
0
        prolog;
1892
0
        match = !_yr_re_is_word_char(input, character_size);
1893
0
        action = match ? ACTION_NONE : ACTION_KILL;
1894
0
        fiber->ip += 1;
1895
0
        break;
1896
1897
0
      case RE_OPCODE_SPACE:
1898
0
      case RE_OPCODE_NON_SPACE:
1899
1900
0
        prolog;
1901
1902
0
        switch (*input)
1903
0
        {
1904
0
        case ' ':
1905
0
        case '\t':
1906
0
        case '\r':
1907
0
        case '\n':
1908
0
        case '\v':
1909
0
        case '\f':
1910
0
          match = true;
1911
0
          break;
1912
0
        default:
1913
0
          match = false;
1914
0
        }
1915
1916
0
        if (*ip == RE_OPCODE_NON_SPACE)
1917
0
          match = !match;
1918
1919
0
        action = match ? ACTION_NONE : ACTION_KILL;
1920
0
        fiber->ip += 1;
1921
0
        break;
1922
1923
0
      case RE_OPCODE_DIGIT:
1924
0
        prolog;
1925
0
        match = isdigit(*input);
1926
0
        action = match ? ACTION_NONE : ACTION_KILL;
1927
0
        fiber->ip += 1;
1928
0
        break;
1929
1930
0
      case RE_OPCODE_NON_DIGIT:
1931
0
        prolog;
1932
0
        match = !isdigit(*input);
1933
0
        action = match ? ACTION_NONE : ACTION_KILL;
1934
0
        fiber->ip += 1;
1935
0
        break;
1936
1937
0
      case RE_OPCODE_WORD_BOUNDARY:
1938
0
      case RE_OPCODE_NON_WORD_BOUNDARY:
1939
1940
0
        if (bytes_matched == 0 && input_backwards_size < character_size)
1941
0
        {
1942
0
          match = true;
1943
0
        }
1944
0
        else if (bytes_matched >= max_bytes_matched)
1945
0
        {
1946
0
          match = true;
1947
0
        }
1948
0
        else
1949
0
        {
1950
0
          assert(input < input_data + input_forwards_size);
1951
0
          assert(input >= input_data - input_backwards_size);
1952
1953
0
          assert(input - input_incr < input_data + input_forwards_size);
1954
0
          assert(input - input_incr >= input_data - input_backwards_size);
1955
1956
0
          match = _yr_re_is_word_char(input, character_size) !=
1957
0
                  _yr_re_is_word_char(input - input_incr, character_size);
1958
0
        }
1959
1960
0
        if (*ip == RE_OPCODE_NON_WORD_BOUNDARY)
1961
0
          match = !match;
1962
1963
0
        action = match ? ACTION_CONTINUE : ACTION_KILL;
1964
0
        fiber->ip += 1;
1965
0
        break;
1966
1967
0
      case RE_OPCODE_MATCH_AT_START:
1968
0
        if (flags & RE_FLAGS_BACKWARDS)
1969
0
          kill = input_backwards_size > (size_t) bytes_matched;
1970
0
        else
1971
0
          kill = input_backwards_size > 0 || (bytes_matched != 0);
1972
0
        action = kill ? ACTION_KILL : ACTION_CONTINUE;
1973
0
        fiber->ip += 1;
1974
0
        break;
1975
1976
0
      case RE_OPCODE_MATCH_AT_END:
1977
0
        kill = flags & RE_FLAGS_BACKWARDS ||
1978
0
               input_forwards_size > (size_t) bytes_matched;
1979
0
        action = kill ? ACTION_KILL : ACTION_CONTINUE;
1980
0
        fiber->ip += 1;
1981
0
        break;
1982
1983
0
      case RE_OPCODE_MATCH:
1984
1985
0
        if (matches != NULL)
1986
0
          *matches = bytes_matched;
1987
1988
0
        if (flags & RE_FLAGS_EXHAUSTIVE)
1989
0
        {
1990
0
          if (callback != NULL)
1991
0
          {
1992
0
            if (flags & RE_FLAGS_BACKWARDS)
1993
0
            {
1994
0
              FAIL_ON_ERROR_WITH_CLEANUP(
1995
0
                  callback(
1996
0
                      input + character_size,
1997
0
                      bytes_matched,
1998
0
                      flags,
1999
0
                      callback_args),
2000
0
                  _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2001
0
            }
2002
0
            else
2003
0
            {
2004
0
              FAIL_ON_ERROR_WITH_CLEANUP(
2005
0
                  callback(input_data, bytes_matched, flags, callback_args),
2006
0
                  _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2007
0
            }
2008
0
          }
2009
2010
0
          action = ACTION_KILL;
2011
0
        }
2012
0
        else
2013
0
        {
2014
0
          action = ACTION_KILL_TAIL;
2015
0
        }
2016
2017
0
        break;
2018
2019
0
      default:
2020
0
        assert(false);
2021
0
      }
2022
2023
0
      switch (action)
2024
0
      {
2025
0
      case ACTION_KILL:
2026
0
        fiber = _yr_re_fiber_kill(&fibers, &context->re_fiber_pool, fiber);
2027
0
        break;
2028
2029
0
      case ACTION_KILL_TAIL:
2030
0
        _yr_re_fiber_kill_tail(&fibers, &context->re_fiber_pool, fiber);
2031
0
        fiber = NULL;
2032
0
        break;
2033
2034
0
      case ACTION_CONTINUE:
2035
0
        FAIL_ON_ERROR_WITH_CLEANUP(
2036
0
            _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
2037
0
            _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2038
0
        break;
2039
2040
0
      default:
2041
0
        next_fiber = fiber->next;
2042
0
        FAIL_ON_ERROR_WITH_CLEANUP(
2043
0
            _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
2044
0
            _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2045
0
        fiber = next_fiber;
2046
0
      }
2047
0
    }
2048
2049
0
    input += input_incr;
2050
0
    bytes_matched += character_size;
2051
2052
0
    if (flags & RE_FLAGS_SCAN && bytes_matched < max_bytes_matched)
2053
0
    {
2054
0
      FAIL_ON_ERROR_WITH_CLEANUP(
2055
0
          _yr_re_fiber_create(&context->re_fiber_pool, &fiber),
2056
0
          _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2057
2058
0
      fiber->ip = code;
2059
0
      _yr_re_fiber_append(&fibers, fiber);
2060
2061
0
      FAIL_ON_ERROR_WITH_CLEANUP(
2062
0
          _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
2063
0
          _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2064
0
    }
2065
0
  }
2066
2067
0
  return ERROR_SUCCESS;
2068
0
}
2069
2070
////////////////////////////////////////////////////////////////////////////////
2071
// Helper function that creates a RE_FAST_EXEC_POSITION by either allocating it
2072
// or reusing a previously allocated one from a pool.
2073
//
2074
static int _yr_re_fast_exec_position_create(
2075
    RE_FAST_EXEC_POSITION_POOL* pool,
2076
    RE_FAST_EXEC_POSITION** new_position)
2077
0
{
2078
0
  RE_FAST_EXEC_POSITION* position;
2079
2080
0
  if (pool->head != NULL)
2081
0
  {
2082
0
    position = pool->head;
2083
0
    pool->head = position->next;
2084
0
  }
2085
0
  else
2086
0
  {
2087
0
    position = (RE_FAST_EXEC_POSITION*) yr_malloc(
2088
0
        sizeof(RE_FAST_EXEC_POSITION));
2089
2090
0
    if (position == NULL)
2091
0
      return ERROR_INSUFFICIENT_MEMORY;
2092
0
  }
2093
2094
0
  position->input = NULL;
2095
0
  position->round = 0;
2096
0
  position->next = NULL;
2097
0
  position->prev = NULL;
2098
2099
0
  *new_position = position;
2100
2101
0
  return ERROR_SUCCESS;
2102
0
}
2103
2104
////////////////////////////////////////////////////////////////////////////////
2105
// Helper function that moves a list of RE_FAST_EXEC_POSITION structures to a
2106
// pool represented by a RE_FAST_EXEC_POSITION_POOL. Receives pointers to the
2107
// pool, the first item, and last item in the list.
2108
//
2109
static void _yr_re_fast_exec_destroy_position_list(
2110
    RE_FAST_EXEC_POSITION_POOL* pool,
2111
    RE_FAST_EXEC_POSITION* first,
2112
    RE_FAST_EXEC_POSITION* last)
2113
0
{
2114
0
  last->next = pool->head;
2115
2116
0
  if (pool->head != NULL)
2117
0
    pool->head->prev = last;
2118
2119
0
  pool->head = first;
2120
0
}
2121
2122
////////////////////////////////////////////////////////////////////////////////
2123
// This function replaces yr_re_exec for regular expressions marked with flag
2124
// RE_FLAGS_FAST_REGEXP. These regular expressions are derived from hex strings
2125
// that don't contain alternatives, like for example:
2126
//
2127
//   { 01 02 03 04 [0-2] 04 05 06 07 }
2128
//
2129
// The regexp's code produced by such strings can be matched by a faster, less
2130
// general algorithm, and it contains only the following opcodes:
2131
//
2132
//   * RE_OPCODE_LITERAL
2133
//   * RE_OPCODE_MASKED_LITERAL,
2134
//   * RE_OPCODE_NOT_LITERAL
2135
//   * RE_OPCODE_MASKED_NOT_LITERAL
2136
//   * RE_OPCODE_ANY
2137
//   * RE_OPCODE_REPEAT_ANY_UNGREEDY
2138
//   * RE_OPCODE_MATCH.
2139
//
2140
// The regexp's code is executed one instruction at time, and the function
2141
// maintains a list of positions within the input for tracking different
2142
// matching alternatives, these positions are described by an instance of
2143
// RE_FAST_EXEC_POSITION. With each instruction the list of positions is
2144
// updated, removing those where the input data doesn't match the current
2145
// instruction, or adding new positions if the instruction is
2146
// RE_OPCODE_REPEAT_ANY_UNGREEDY. The list of positions is maintained sorted
2147
// by the value of the pointer they hold to the input, and it doesn't contain
2148
// duplicated pointer values.
2149
//
2150
int yr_re_fast_exec(
2151
    YR_SCAN_CONTEXT* context,
2152
    const uint8_t* code,
2153
    const uint8_t* input_data,
2154
    size_t input_forwards_size,
2155
    size_t input_backwards_size,
2156
    int flags,
2157
    RE_MATCH_CALLBACK_FUNC callback,
2158
    void* callback_args,
2159
    int* matches)
2160
0
{
2161
0
  RE_REPEAT_ANY_ARGS* repeat_any_args;
2162
2163
  // Pointers to the first and last position in the list.
2164
0
  RE_FAST_EXEC_POSITION* first;
2165
0
  RE_FAST_EXEC_POSITION* last;
2166
2167
0
  int input_incr = flags & RE_FLAGS_BACKWARDS ? -1 : 1;
2168
0
  int bytes_matched;
2169
0
  int max_bytes_matched;
2170
2171
0
  if (flags & RE_FLAGS_BACKWARDS)
2172
0
    max_bytes_matched = (int) yr_min(input_backwards_size, YR_RE_SCAN_LIMIT);
2173
0
  else
2174
0
    max_bytes_matched = (int) yr_min(input_forwards_size, YR_RE_SCAN_LIMIT);
2175
2176
0
  const uint8_t* ip = code;
2177
2178
  // Create the first position in the list, which points to the start of the
2179
  // input data. Intially this is the only position, more positions will be
2180
  // created every time RE_OPCODE_REPEAT_ANY_UNGREEDY is executed.
2181
0
  FAIL_ON_ERROR(_yr_re_fast_exec_position_create(
2182
0
      &context->re_fast_exec_position_pool, &first));
2183
2184
0
  first->round = 0;
2185
0
  first->input = input_data;
2186
0
  first->prev = NULL;
2187
0
  first->next = NULL;
2188
2189
0
  if (flags & RE_FLAGS_BACKWARDS)
2190
0
    first->input--;
2191
2192
  // As we are starting with a single position, the last one and the first one
2193
  // are the same.
2194
0
  last = first;
2195
2196
  // Round is incremented with every regxp instruction.
2197
0
  int round = 0;
2198
2199
  // Keep in the loop until the list of positions gets empty. Positions are
2200
  // removed from the list when they don't match the current instruction in
2201
  // the code.
2202
0
  while (first != NULL)
2203
0
  {
2204
0
    RE_FAST_EXEC_POSITION* current = first;
2205
2206
    // Iterate over all the positions, executing the current instruction at
2207
    // all of them.
2208
0
    while (current != NULL)
2209
0
    {
2210
0
      RE_FAST_EXEC_POSITION* next = current->next;
2211
2212
      // Ignore any position in the list whose round number is different from
2213
      // the current round. This prevents new positions added to the list in
2214
      // this round from being taken into account in this same round.
2215
0
      if (current->round != round)
2216
0
      {
2217
0
        current = next;
2218
0
        continue;
2219
0
      }
2220
2221
0
      bytes_matched = flags & RE_FLAGS_BACKWARDS
2222
0
                          ? (int) (input_data - current->input - 1)
2223
0
                          : (int) (current->input - input_data);
2224
2225
0
      uint16_t opcode_args;
2226
0
      uint8_t mask;
2227
0
      uint8_t value;
2228
2229
0
      bool match = false;
2230
2231
0
      switch (*ip)
2232
0
      {
2233
0
      case RE_OPCODE_ANY:
2234
0
        if (bytes_matched >= max_bytes_matched)
2235
0
          break;
2236
2237
0
        match = true;
2238
0
        current->input += input_incr;
2239
0
        break;
2240
2241
0
      case RE_OPCODE_LITERAL:
2242
0
        if (bytes_matched >= max_bytes_matched)
2243
0
          break;
2244
2245
0
        if (*current->input == *(ip + 1))
2246
0
        {
2247
0
          match = true;
2248
0
          current->input += input_incr;
2249
0
        }
2250
0
        break;
2251
2252
0
      case RE_OPCODE_NOT_LITERAL:
2253
0
        if (bytes_matched >= max_bytes_matched)
2254
0
          break;
2255
2256
0
        if (*current->input != *(ip + 1))
2257
0
        {
2258
0
          match = true;
2259
0
          current->input += input_incr;
2260
0
        }
2261
0
        break;
2262
2263
0
      case RE_OPCODE_MASKED_LITERAL:
2264
0
        if (bytes_matched >= max_bytes_matched)
2265
0
          break;
2266
2267
0
        opcode_args = yr_unaligned_u16(ip + 1);
2268
0
        mask = opcode_args >> 8;
2269
0
        value = opcode_args & 0xFF;
2270
2271
0
        if ((*current->input & mask) == value)
2272
0
        {
2273
0
          match = true;
2274
0
          current->input += input_incr;
2275
0
        }
2276
0
        break;
2277
2278
0
      case RE_OPCODE_MASKED_NOT_LITERAL:
2279
0
        if (bytes_matched >= max_bytes_matched)
2280
0
          break;
2281
2282
0
        opcode_args = yr_unaligned_u16(ip + 1);
2283
0
        mask = opcode_args >> 8;
2284
0
        value = opcode_args & 0xFF;
2285
2286
0
        if ((*current->input & mask) != value)
2287
0
        {
2288
0
          match = true;
2289
0
          current->input += input_incr;
2290
0
        }
2291
0
        break;
2292
2293
0
      case RE_OPCODE_REPEAT_ANY_UNGREEDY:
2294
0
        if (bytes_matched >= max_bytes_matched)
2295
0
          break;
2296
2297
0
        repeat_any_args = (RE_REPEAT_ANY_ARGS*) (ip + 1);
2298
0
        match = true;
2299
2300
0
        const uint8_t* next_opcode = ip + 1 + sizeof(RE_REPEAT_ANY_ARGS);
2301
2302
        // insertion_point indicates the item in the list of inputs after which
2303
        // the newly created inputs will be inserted.
2304
0
        RE_FAST_EXEC_POSITION* insertion_point = current;
2305
2306
0
        for (int j = repeat_any_args->min + 1; j <= repeat_any_args->max; j++)
2307
0
        {
2308
          // bytes_matched is the number of bytes matched so far, and j is the
2309
          // minimum number of bytes that are still pending to match. If these
2310
          // two numbers sum more than the maximum number of bytes that can be
2311
          // matched the loop can be aborted. The position that we were about
2312
          // to create can't lead to a match without exceeding
2313
          // max_bytes_matched.
2314
0
          if (bytes_matched + j >= max_bytes_matched)
2315
0
            break;
2316
2317
0
          const uint8_t* next_input = current->input + j * input_incr;
2318
2319
          // Find the point where next_input should be inserted. The list is
2320
          // kept sorted by pointer in increasing order, the insertion point is
2321
          // an item that has a pointer lower or equal than next_input, but
2322
          // whose next item have a pointer that is larger.
2323
0
          while (insertion_point->next != NULL &&
2324
0
                 next_input >= insertion_point->next->input)
2325
0
          {
2326
0
            insertion_point = insertion_point->next;
2327
0
          }
2328
2329
          // If the pointer at the insertion point is equal to next_input we
2330
          // don't need to insert next_input in the list as this input already
2331
          // exists.
2332
0
          if (next_input == insertion_point->input)
2333
0
            continue;
2334
2335
          // The next opcode is RE_OPCODE_LITERAL, but the literal doesn't
2336
          // match with the byte at next_input, we don't need to add this
2337
          // input to the list as we already know that it won't match in the
2338
          // next round and will be deleted from the list anyways.
2339
0
          if (*(next_opcode) == RE_OPCODE_LITERAL &&
2340
0
              *(next_opcode + 1) != *next_input)
2341
0
            continue;
2342
2343
0
          RE_FAST_EXEC_POSITION* new_input;
2344
2345
0
          FAIL_ON_ERROR_WITH_CLEANUP(
2346
0
              _yr_re_fast_exec_position_create(
2347
0
                  &context->re_fast_exec_position_pool, &new_input),
2348
              // Cleanup
2349
0
              _yr_re_fast_exec_destroy_position_list(
2350
0
                  &context->re_fast_exec_position_pool, first, last));
2351
2352
0
          new_input->input = next_input;
2353
0
          new_input->round = round + 1;
2354
0
          new_input->prev = insertion_point;
2355
0
          new_input->next = insertion_point->next;
2356
0
          insertion_point->next = new_input;
2357
2358
0
          if (new_input->next != NULL)
2359
0
            new_input->next->prev = new_input;
2360
2361
0
          if (insertion_point == last)
2362
0
            last = new_input;
2363
0
        }
2364
2365
0
        current->input += input_incr * repeat_any_args->min;
2366
0
        break;
2367
2368
0
      case RE_OPCODE_MATCH:
2369
2370
0
        if (flags & RE_FLAGS_EXHAUSTIVE)
2371
0
        {
2372
0
          FAIL_ON_ERROR_WITH_CLEANUP(
2373
0
              callback(
2374
                  // When matching forwards the matching data always starts at
2375
                  // input_data, when matching backwards it starts at input + 1
2376
                  // or input_data - input_backwards_size if input + 1 is
2377
                  // outside the buffer.
2378
0
                  flags & RE_FLAGS_BACKWARDS
2379
0
                      ? yr_max(
2380
0
                            current->input + 1,
2381
0
                            input_data - input_backwards_size)
2382
0
                      : input_data,
2383
                  // The number of matched bytes should not be larger than
2384
                  // max_bytes_matched.
2385
0
                  yr_min(bytes_matched, max_bytes_matched),
2386
0
                  flags,
2387
0
                  callback_args),
2388
              // Cleanup
2389
0
              _yr_re_fast_exec_destroy_position_list(
2390
0
                  &context->re_fast_exec_position_pool, first, last));
2391
0
        }
2392
0
        else
2393
0
        {
2394
0
          if (matches != NULL)
2395
0
            *matches = bytes_matched;
2396
2397
0
          _yr_re_fast_exec_destroy_position_list(
2398
0
              &context->re_fast_exec_position_pool, first, last);
2399
2400
0
          return ERROR_SUCCESS;
2401
0
        }
2402
0
        break;
2403
2404
0
      default:
2405
0
        printf("non-supported opcode: %d\n", *ip);
2406
0
        assert(false);
2407
0
      }
2408
2409
0
      if (match)
2410
0
      {
2411
0
        current->round = round + 1;
2412
0
      }
2413
0
      else
2414
0
      {
2415
        // Remove current from the list. If the item being removed is the first
2416
        // one or the last one, the first and last pointers should be updated.
2417
        // The removed item is put into the pool for later reuse.
2418
2419
0
        if (current == first)
2420
0
          first = current->next;
2421
2422
0
        if (current == last)
2423
0
          last = current->prev;
2424
2425
0
        if (current->prev != NULL)
2426
0
          current->prev->next = current->next;
2427
2428
0
        if (current->next != NULL)
2429
0
          current->next->prev = current->prev;
2430
2431
0
        current->prev = NULL;
2432
0
        current->next = context->re_fast_exec_position_pool.head;
2433
2434
0
        if (current->next != NULL)
2435
0
          current->next->prev = current;
2436
2437
0
        context->re_fast_exec_position_pool.head = current;
2438
0
      }
2439
2440
0
      current = next;
2441
2442
0
    }  // while (current != NULL)
2443
2444
    // Move instruction pointer (ip) to next instruction. Each opcode has
2445
    // a different size.
2446
0
    switch (*ip)
2447
0
    {
2448
0
    case RE_OPCODE_ANY:
2449
0
      ip += 1;
2450
0
      break;
2451
0
    case RE_OPCODE_LITERAL:
2452
0
    case RE_OPCODE_NOT_LITERAL:
2453
0
      ip += 2;
2454
0
      break;
2455
0
    case RE_OPCODE_MASKED_LITERAL:
2456
0
    case RE_OPCODE_MASKED_NOT_LITERAL:
2457
0
      ip += 3;
2458
0
      break;
2459
0
    case RE_OPCODE_REPEAT_ANY_UNGREEDY:
2460
0
      ip += 1 + sizeof(RE_REPEAT_ANY_ARGS);
2461
0
      break;
2462
0
    case RE_OPCODE_MATCH:
2463
0
      break;
2464
0
    default:
2465
0
      assert(false);
2466
0
    }
2467
2468
0
    round++;
2469
0
  }
2470
2471
  // If this point is reached no matches were found.
2472
2473
0
  if (matches != NULL)
2474
0
    *matches = -1;
2475
2476
0
  return ERROR_SUCCESS;
2477
0
}
2478
2479
static void _yr_re_print_node(RE_NODE* re_node, uint32_t indent)
2480
0
{
2481
0
  RE_NODE* child;
2482
0
  int i;
2483
2484
0
  if (re_node == NULL)
2485
0
    return;
2486
2487
0
  if (indent > 0)
2488
0
    printf("\n%*s", indent, " ");
2489
0
  switch (re_node->type)
2490
0
  {
2491
0
  case RE_NODE_ALT:
2492
0
    printf("Alt(");
2493
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2494
0
    printf(",");
2495
0
    _yr_re_print_node(re_node->children_tail, indent + 4);
2496
0
    printf("\n%*s%s", indent, " ", ")");
2497
0
    break;
2498
2499
0
  case RE_NODE_CONCAT:
2500
0
    printf("Cat(");
2501
0
    child = re_node->children_head;
2502
0
    while (child != NULL)
2503
0
    {
2504
0
      _yr_re_print_node(child, indent + 4);
2505
0
      printf(",");
2506
0
      child = child->next_sibling;
2507
0
    }
2508
0
    printf("\n%*s%s", indent, " ", ")");
2509
0
    break;
2510
2511
0
  case RE_NODE_STAR:
2512
0
    printf("Star(");
2513
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2514
0
    printf(")");
2515
0
    break;
2516
2517
0
  case RE_NODE_PLUS:
2518
0
    printf("Plus(");
2519
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2520
0
    printf(")");
2521
0
    break;
2522
2523
0
  case RE_NODE_LITERAL:
2524
0
    printf("Lit(%c)", re_node->value);
2525
0
    break;
2526
2527
0
  case RE_NODE_NOT_LITERAL:
2528
0
    printf("NotLit(%c)", re_node->value);
2529
0
    break;
2530
2531
0
  case RE_NODE_MASKED_LITERAL:
2532
0
    printf("MaskedLit(%02X,%02X)", re_node->value, re_node->mask);
2533
0
    break;
2534
2535
0
  case RE_NODE_WORD_CHAR:
2536
0
    printf("WordChar");
2537
0
    break;
2538
2539
0
  case RE_NODE_NON_WORD_CHAR:
2540
0
    printf("NonWordChar");
2541
0
    break;
2542
2543
0
  case RE_NODE_SPACE:
2544
0
    printf("Space");
2545
0
    break;
2546
2547
0
  case RE_NODE_NON_SPACE:
2548
0
    printf("NonSpace");
2549
0
    break;
2550
2551
0
  case RE_NODE_DIGIT:
2552
0
    printf("Digit");
2553
0
    break;
2554
2555
0
  case RE_NODE_NON_DIGIT:
2556
0
    printf("NonDigit");
2557
0
    break;
2558
2559
0
  case RE_NODE_ANY:
2560
0
    printf("Any");
2561
0
    break;
2562
2563
0
  case RE_NODE_RANGE:
2564
0
    printf("Range(%d-%d, ", re_node->start, re_node->end);
2565
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2566
0
    printf("\n%*s%s", indent, " ", ")");
2567
0
    break;
2568
2569
0
  case RE_NODE_CLASS:
2570
0
    printf("Class(");
2571
0
    for (i = 0; i < 256; i++)
2572
0
      if (_yr_re_is_char_in_class(re_node->re_class, i, false))
2573
0
        printf("%02X,", i);
2574
0
    printf(")");
2575
0
    break;
2576
2577
0
  case RE_NODE_EMPTY:
2578
0
    printf("Empty");
2579
0
    break;
2580
2581
0
  case RE_NODE_ANCHOR_START:
2582
0
    printf("AnchorStart");
2583
0
    break;
2584
2585
0
  case RE_NODE_ANCHOR_END:
2586
0
    printf("AnchorEnd");
2587
0
    break;
2588
2589
0
  case RE_NODE_WORD_BOUNDARY:
2590
0
    printf("WordBoundary");
2591
0
    break;
2592
2593
0
  case RE_NODE_NON_WORD_BOUNDARY:
2594
0
    printf("NonWordBoundary");
2595
0
    break;
2596
2597
0
  case RE_NODE_RANGE_ANY:
2598
0
    printf("RangeAny");
2599
0
    break;
2600
2601
0
  default:
2602
0
    printf("???");
2603
0
    break;
2604
0
  }
2605
0
}
2606
2607
void yr_re_print(RE_AST* re_ast)
2608
0
{
2609
0
  _yr_re_print_node(re_ast->root_node, 0);
2610
0
}