Coverage Report

Created: 2025-07-11 06:08

/src/yara/libyara/re.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2013. The YARA Authors. All Rights Reserved.
3
4
Redistribution and use in source and binary forms, with or without modification,
5
are permitted provided that the following conditions are met:
6
7
1. Redistributions of source code must retain the above copyright notice, this
8
list of conditions and the following disclaimer.
9
10
2. Redistributions in binary form must reproduce the above copyright notice,
11
this list of conditions and the following disclaimer in the documentation and/or
12
other materials provided with the distribution.
13
14
3. Neither the name of the copyright holder nor the names of its contributors
15
may be used to endorse or promote products derived from this software without
16
specific prior written permission.
17
18
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
*/
29
30
/*
31
32
This module implements a regular expressions engine based on Thompson's
33
algorithm as described by Russ Cox in http://swtch.com/~rsc/regexp/regexp2.html.
34
35
What the article names a "thread" has been named a "fiber" in this code, in
36
order to avoid confusion with operating system threads.
37
38
*/
39
40
#include <assert.h>
41
#include <string.h>
42
#include <yara/compiler.h>
43
#include <yara/error.h>
44
#include <yara/globals.h>
45
#include <yara/hex_lexer.h>
46
#include <yara/limits.h>
47
#include <yara/mem.h>
48
#include <yara/re.h>
49
#include <yara/re_lexer.h>
50
#include <yara/strutils.h>
51
#include <yara/threading.h>
52
#include <yara/unaligned.h>
53
#include <yara/utils.h>
54
55
0
#define EMIT_BACKWARDS               0x01
56
0
#define EMIT_DONT_SET_FORWARDS_CODE  0x02
57
0
#define EMIT_DONT_SET_BACKWARDS_CODE 0x04
58
59
#ifndef INT16_MAX
60
#define INT16_MAX (32767)
61
#endif
62
63
typedef uint8_t RE_SPLIT_ID_TYPE;
64
65
// RE_REPEAT_ARGS and RE_REPEAT_ANY_ARGS are structures that are embedded in
66
// the regexp's instruction stream. As such, they are not always aligned to
67
// 8-byte boundaries, so they need to be "packed" structures in order to prevent
68
// issues due to unaligned memory accesses.
69
#pragma pack(push)
70
#pragma pack(1)
71
72
typedef struct _RE_REPEAT_ARGS
73
{
74
  uint16_t min;
75
  uint16_t max;
76
  int32_t offset;
77
78
} RE_REPEAT_ARGS;
79
80
typedef struct _RE_REPEAT_ANY_ARGS
81
{
82
  uint16_t min;
83
  uint16_t max;
84
85
} RE_REPEAT_ANY_ARGS;
86
87
#pragma pack(pop)
88
89
typedef struct _RE_EMIT_CONTEXT
90
{
91
  YR_ARENA* arena;
92
  RE_SPLIT_ID_TYPE next_split_id;
93
94
} RE_EMIT_CONTEXT;
95
96
0
#define CHAR_IN_CLASS(cls, chr) ((cls)[(chr) / 8] & 1 << ((chr) % 8))
97
98
static bool _yr_re_is_char_in_class(
99
    RE_CLASS* re_class,
100
    uint8_t chr,
101
    int case_insensitive)
102
0
{
103
0
  int result = CHAR_IN_CLASS(re_class->bitmap, chr);
104
105
0
  if (case_insensitive)
106
0
    result |= CHAR_IN_CLASS(re_class->bitmap, yr_altercase[chr]);
107
108
0
  if (re_class->negated)
109
0
    result = !result;
110
111
0
  return result;
112
0
}
113
114
static bool _yr_re_is_word_char(const uint8_t* input, uint8_t character_size)
115
0
{
116
0
  int result = ((yr_isalnum(input) || (*input) == '_'));
117
118
0
  if (character_size == 2)
119
0
    result = result && (*(input + 1) == 0);
120
121
0
  return result;
122
0
}
123
124
RE_NODE* yr_re_node_create(int type)
125
0
{
126
0
  RE_NODE* result = (RE_NODE*) yr_malloc(sizeof(RE_NODE));
127
128
0
  if (result != NULL)
129
0
  {
130
0
    result->type = type;
131
0
    result->children_head = NULL;
132
0
    result->children_tail = NULL;
133
0
    result->prev_sibling = NULL;
134
0
    result->next_sibling = NULL;
135
0
    result->greedy = true;
136
0
    result->forward_code_ref = YR_ARENA_NULL_REF;
137
0
    result->backward_code_ref = YR_ARENA_NULL_REF;
138
0
  }
139
140
0
  return result;
141
0
}
142
143
void yr_re_node_destroy(RE_NODE* node)
144
0
{
145
0
  RE_NODE* child = node->children_head;
146
0
  RE_NODE* next_child;
147
148
0
  while (child != NULL)
149
0
  {
150
0
    next_child = child->next_sibling;
151
0
    yr_re_node_destroy(child);
152
0
    child = next_child;
153
0
  }
154
155
0
  if (node->type == RE_NODE_CLASS)
156
0
    yr_free(node->re_class);
157
158
0
  yr_free(node);
159
0
}
160
161
////////////////////////////////////////////////////////////////////////////////
162
// Appends a node to the end of the children list.
163
//
164
void yr_re_node_append_child(RE_NODE* node, RE_NODE* child)
165
0
{
166
0
  if (node->children_head == NULL)
167
0
    node->children_head = child;
168
169
0
  if (node->children_tail != NULL)
170
0
    node->children_tail->next_sibling = child;
171
172
0
  child->prev_sibling = node->children_tail;
173
0
  node->children_tail = child;
174
0
}
175
176
////////////////////////////////////////////////////////////////////////////////
177
// Appends a node to the beginning of the children list.
178
//
179
void yr_re_node_prepend_child(RE_NODE* node, RE_NODE* child)
180
0
{
181
0
  child->next_sibling = node->children_head;
182
183
0
  if (node->children_head != NULL)
184
0
    node->children_head->prev_sibling = child;
185
186
0
  node->children_head = child;
187
188
0
  if (node->children_tail == NULL)
189
0
    node->children_tail = child;
190
0
}
191
192
int yr_re_ast_create(RE_AST** re_ast)
193
0
{
194
0
  *re_ast = (RE_AST*) yr_malloc(sizeof(RE_AST));
195
196
0
  if (*re_ast == NULL)
197
0
    return ERROR_INSUFFICIENT_MEMORY;
198
199
0
  (*re_ast)->flags = 0;
200
0
  (*re_ast)->root_node = NULL;
201
202
0
  return ERROR_SUCCESS;
203
0
}
204
205
void yr_re_ast_destroy(RE_AST* re_ast)
206
0
{
207
0
  if (re_ast->root_node != NULL)
208
0
    yr_re_node_destroy(re_ast->root_node);
209
210
0
  yr_free(re_ast);
211
0
}
212
213
////////////////////////////////////////////////////////////////////////////////
214
// Parses a regexp but don't emit its code. A further call to
215
// yr_re_ast_emit_code is required to get the code.
216
//
217
int yr_re_parse(
218
    const char* re_string,
219
    RE_AST** re_ast,
220
    RE_ERROR* error,
221
    int flags)
222
0
{
223
0
  return yr_parse_re_string(re_string, re_ast, error, flags);
224
0
}
225
226
////////////////////////////////////////////////////////////////////////////////
227
// Parses a hex string but don't emit its code. A further call to
228
// yr_re_ast_emit_code is required to get the code.
229
//
230
int yr_re_parse_hex(const char* hex_string, RE_AST** re_ast, RE_ERROR* error)
231
0
{
232
0
  return yr_parse_hex_string(hex_string, re_ast, error);
233
0
}
234
235
////////////////////////////////////////////////////////////////////////////////
236
// Parses the regexp and emit its code to the provided to the
237
// YR_RE_CODE_SECTION in the specified arena.
238
//
239
int yr_re_compile(
240
    const char* re_string,
241
    int flags,
242
    int parser_flags,
243
    YR_ARENA* arena,
244
    YR_ARENA_REF* ref,
245
    RE_ERROR* error)
246
0
{
247
0
  RE_AST* re_ast;
248
0
  RE _re;
249
0
  int result;
250
251
0
  result = yr_re_parse(re_string, &re_ast, error, parser_flags);
252
0
  if (result != ERROR_UNKNOWN_ESCAPE_SEQUENCE)
253
0
    FAIL_ON_ERROR(result);
254
255
0
  _re.flags = flags;
256
257
0
  FAIL_ON_ERROR_WITH_CLEANUP(
258
0
      yr_arena_write_data(arena, YR_RE_CODE_SECTION, &_re, sizeof(_re), ref),
259
0
      yr_re_ast_destroy(re_ast));
260
261
0
  FAIL_ON_ERROR_WITH_CLEANUP(
262
0
      yr_re_ast_emit_code(re_ast, arena, false), yr_re_ast_destroy(re_ast));
263
264
0
  yr_re_ast_destroy(re_ast);
265
266
0
  return result;
267
0
}
268
269
////////////////////////////////////////////////////////////////////////////////
270
// Verifies if the target string matches the pattern
271
//
272
// Args:
273
//    context: Scan context
274
//    re: A pointer to a compiled regexp
275
//    target: Target string
276
//
277
// Returns:
278
//    See return codes for yr_re_exec
279
//
280
int yr_re_match(YR_SCAN_CONTEXT* context, RE* re, const char* target)
281
0
{
282
0
  int result;
283
284
0
  yr_re_exec(
285
0
      context,
286
0
      re->code,
287
0
      (uint8_t*) target,
288
0
      strlen(target),
289
0
      0,
290
0
      re->flags | RE_FLAGS_SCAN,
291
0
      NULL,
292
0
      NULL,
293
0
      &result);
294
295
0
  return result;
296
0
}
297
298
////////////////////////////////////////////////////////////////////////////////
299
// Verifies if the provided regular expression is just a literal string
300
// like "abc", "12345", without any wildcard, operator, etc. In that case
301
// returns the string as a SIZED_STRING, or returns NULL if otherwise.
302
//
303
// The caller is responsible for deallocating the returned SIZED_STRING by
304
// calling yr_free.
305
//
306
SIZED_STRING* yr_re_ast_extract_literal(RE_AST* re_ast)
307
0
{
308
0
  SIZED_STRING* string;
309
0
  RE_NODE* child;
310
311
0
  int length = 0;
312
313
0
  if (re_ast->root_node->type == RE_NODE_LITERAL)
314
0
  {
315
0
    length = 1;
316
0
  }
317
0
  else if (re_ast->root_node->type == RE_NODE_CONCAT)
318
0
  {
319
0
    child = re_ast->root_node->children_tail;
320
321
0
    while (child != NULL && child->type == RE_NODE_LITERAL)
322
0
    {
323
0
      length++;
324
0
      child = child->prev_sibling;
325
0
    }
326
327
0
    if (child != NULL)
328
0
      return NULL;
329
0
  }
330
0
  else
331
0
  {
332
0
    return NULL;
333
0
  }
334
335
0
  string = (SIZED_STRING*) yr_malloc(sizeof(SIZED_STRING) + length);
336
337
0
  if (string == NULL)
338
0
    return NULL;
339
340
0
  string->length = length;
341
0
  string->flags = 0;
342
343
0
  if (re_ast->root_node->type == RE_NODE_LITERAL)
344
0
  {
345
0
    string->c_string[0] = re_ast->root_node->value;
346
0
  }
347
0
  else
348
0
  {
349
0
    child = re_ast->root_node->children_tail;
350
351
0
    while (child != NULL)
352
0
    {
353
0
      string->c_string[--length] = child->value;
354
0
      child = child->prev_sibling;
355
0
    }
356
0
  }
357
358
0
  string->c_string[string->length] = '\0';
359
360
0
  return string;
361
0
}
362
363
int _yr_re_node_has_unbounded_quantifier_for_dot(RE_NODE* re_node)
364
0
{
365
0
  RE_NODE* child;
366
367
0
  if ((re_node->type == RE_NODE_STAR || re_node->type == RE_NODE_PLUS) &&
368
0
      re_node->children_head->type == RE_NODE_ANY)
369
0
    return true;
370
371
0
  if (re_node->type == RE_NODE_RANGE_ANY && re_node->end == RE_MAX_RANGE)
372
0
    return true;
373
374
0
  if (re_node->type == RE_NODE_CONCAT)
375
0
  {
376
0
    child = re_node->children_tail;
377
378
0
    while (child != NULL)
379
0
    {
380
0
      if (_yr_re_node_has_unbounded_quantifier_for_dot(child))
381
0
        return true;
382
383
0
      child = child->prev_sibling;
384
0
    }
385
0
  }
386
387
0
  return false;
388
0
}
389
390
////////////////////////////////////////////////////////////////////////////////
391
// Detects the use of .*, .+ or .{x,} in a regexp. The use of wildcards with
392
// quantifiers that don't have a reasonably small upper bound causes a
393
// performance penalty. This function detects such cases in order to warn the
394
// user about this.
395
//
396
int yr_re_ast_has_unbounded_quantifier_for_dot(RE_AST* re_ast)
397
0
{
398
0
  return _yr_re_node_has_unbounded_quantifier_for_dot(re_ast->root_node);
399
0
}
400
401
////////////////////////////////////////////////////////////////////////////////
402
// In some cases splitting a regular expression (or hex string) in two parts is
403
// convenient for increasing performance. This happens when the pattern contains
404
// a large gap (a.k.a jump), for example: { 01 02 03 [0-999] 04 05 06 }
405
// In this case the string is splitted in { 01 02 03 } and { 04 05 06 } where
406
// the latter is chained to the former. This means that { 01 02 03 } and
407
// { 04 05 06 } are handled as individual strings, and when both of them are
408
// found, YARA verifies if the distance between the matches complies with the
409
// [0-999] restriction.
410
//
411
// This function traverses a regexp's AST looking for nodes where it should be
412
// splitted. It must be noticed that this only applies to two-level ASTs (i.e.
413
// an AST consisting in a RE_NODE_CONCAT at the root where all the children are
414
// leaves).
415
//
416
// For example, { 01 02 03 [0-1000] 04 05 06 [500-2000] 07 08 09 } has the
417
// following AST:
418
//
419
// RE_NODE_CONCAT
420
// |
421
// |- RE_NODE_LITERAL (01)
422
// |- RE_NODE_LITERAL (02)
423
// |- RE_NODE_LITERAL (03)
424
// |- RE_NODE_RANGE_ANY (start=0, end=1000)
425
// |- RE_NODE_LITERAL (04)
426
// |- RE_NODE_LITERAL (05)
427
// |- RE_NODE_LITERAL (06)
428
// |- RE_NODE_RANGE_ANY (start=500, end=2000)
429
// |- RE_NODE_LITERAL (07)
430
// |- RE_NODE_LITERAL (08)
431
// |- RE_NODE_LITERAL (09)
432
//
433
// If the AST above is passed in the re_ast argument, it will be trimmed to:
434
//
435
// RE_NODE_CONCAT
436
// |
437
// |- RE_NODE_LITERAL (01)
438
// |- RE_NODE_LITERAL (02)
439
// |- RE_NODE_LITERAL (03)
440
//
441
// While remainder_re_ast will be:
442
//
443
// RE_NODE_CONCAT
444
// |
445
// |- RE_NODE_LITERAL (04)
446
// |- RE_NODE_LITERAL (05)
447
// |- RE_NODE_LITERAL (06)
448
// |- RE_NODE_RANGE_ANY (start=500, end=2000)
449
// |- RE_NODE_LITERAL (07)
450
// |- RE_NODE_LITERAL (08)
451
// |- RE_NODE_LITERAL (09)
452
//
453
// The caller is responsible for freeing the new AST in remainder_re_ast by
454
// calling yr_re_ast_destroy.
455
//
456
// The integers pointed to by min_gap and max_gap will be filled with the
457
// minimum and maximum gap size between the sub-strings represented by the
458
// two ASTs.
459
//
460
int yr_re_ast_split_at_chaining_point(
461
    RE_AST* re_ast,
462
    RE_AST** remainder_re_ast,
463
    int32_t* min_gap,
464
    int32_t* max_gap)
465
0
{
466
0
  RE_NODE* child;
467
0
  RE_NODE* concat;
468
469
0
  int result;
470
471
0
  *remainder_re_ast = NULL;
472
0
  *min_gap = 0;
473
0
  *max_gap = 0;
474
475
0
  if (re_ast->root_node->type != RE_NODE_CONCAT)
476
0
    return ERROR_SUCCESS;
477
478
0
  child = re_ast->root_node->children_head;
479
480
0
  while (child != NULL)
481
0
  {
482
0
    if (!child->greedy && child->type == RE_NODE_RANGE_ANY &&
483
0
        child->prev_sibling != NULL && child->next_sibling != NULL &&
484
0
        (child->start > YR_STRING_CHAINING_THRESHOLD ||
485
0
         child->end > YR_STRING_CHAINING_THRESHOLD))
486
0
    {
487
0
      result = yr_re_ast_create(remainder_re_ast);
488
489
0
      if (result != ERROR_SUCCESS)
490
0
        return result;
491
492
0
      concat = yr_re_node_create(RE_NODE_CONCAT);
493
494
0
      if (concat == NULL)
495
0
        return ERROR_INSUFFICIENT_MEMORY;
496
497
0
      concat->children_head = child->next_sibling;
498
0
      concat->children_tail = re_ast->root_node->children_tail;
499
500
0
      re_ast->root_node->children_tail = child->prev_sibling;
501
502
0
      child->prev_sibling->next_sibling = NULL;
503
0
      child->next_sibling->prev_sibling = NULL;
504
505
0
      *min_gap = child->start;
506
0
      *max_gap = child->end;
507
508
0
      (*remainder_re_ast)->root_node = concat;
509
0
      (*remainder_re_ast)->flags = re_ast->flags;
510
511
0
      yr_re_node_destroy(child);
512
513
0
      return ERROR_SUCCESS;
514
0
    }
515
516
0
    child = child->next_sibling;
517
0
  }
518
519
0
  return ERROR_SUCCESS;
520
0
}
521
522
int _yr_emit_inst(
523
    RE_EMIT_CONTEXT* emit_context,
524
    uint8_t opcode,
525
    YR_ARENA_REF* instruction_ref)
526
0
{
527
0
  FAIL_ON_ERROR(yr_arena_write_data(
528
0
      emit_context->arena,
529
0
      YR_RE_CODE_SECTION,
530
0
      &opcode,
531
0
      sizeof(uint8_t),
532
0
      instruction_ref));
533
534
0
  return ERROR_SUCCESS;
535
0
}
536
537
int _yr_emit_inst_arg_uint8(
538
    RE_EMIT_CONTEXT* emit_context,
539
    uint8_t opcode,
540
    uint8_t argument,
541
    YR_ARENA_REF* instruction_ref,
542
    YR_ARENA_REF* argument_ref)
543
0
{
544
0
  FAIL_ON_ERROR(yr_arena_write_data(
545
0
      emit_context->arena,
546
0
      YR_RE_CODE_SECTION,
547
0
      &opcode,
548
0
      sizeof(uint8_t),
549
0
      instruction_ref));
550
551
0
  FAIL_ON_ERROR(yr_arena_write_data(
552
0
      emit_context->arena,
553
0
      YR_RE_CODE_SECTION,
554
0
      &argument,
555
0
      sizeof(uint8_t),
556
0
      argument_ref));
557
558
0
  return ERROR_SUCCESS;
559
0
}
560
561
int _yr_emit_inst_arg_uint16(
562
    RE_EMIT_CONTEXT* emit_context,
563
    uint8_t opcode,
564
    uint16_t argument,
565
    YR_ARENA_REF* instruction_ref,
566
    YR_ARENA_REF* argument_ref)
567
0
{
568
0
  FAIL_ON_ERROR(yr_arena_write_data(
569
0
      emit_context->arena,
570
0
      YR_RE_CODE_SECTION,
571
0
      &opcode,
572
0
      sizeof(uint8_t),
573
0
      instruction_ref));
574
575
0
  FAIL_ON_ERROR(yr_arena_write_data(
576
0
      emit_context->arena,
577
0
      YR_RE_CODE_SECTION,
578
0
      &argument,
579
0
      sizeof(uint16_t),
580
0
      argument_ref));
581
582
0
  return ERROR_SUCCESS;
583
0
}
584
585
int _yr_emit_inst_arg_uint32(
586
    RE_EMIT_CONTEXT* emit_context,
587
    uint8_t opcode,
588
    uint32_t argument,
589
    YR_ARENA_REF* instruction_ref,
590
    YR_ARENA_REF* argument_ref)
591
0
{
592
0
  FAIL_ON_ERROR(yr_arena_write_data(
593
0
      emit_context->arena,
594
0
      YR_RE_CODE_SECTION,
595
0
      &opcode,
596
0
      sizeof(uint8_t),
597
0
      instruction_ref));
598
599
0
  FAIL_ON_ERROR(yr_arena_write_data(
600
0
      emit_context->arena,
601
0
      YR_RE_CODE_SECTION,
602
0
      &argument,
603
0
      sizeof(uint32_t),
604
0
      argument_ref));
605
606
0
  return ERROR_SUCCESS;
607
0
}
608
609
int _yr_emit_inst_arg_int16(
610
    RE_EMIT_CONTEXT* emit_context,
611
    uint8_t opcode,
612
    int16_t argument,
613
    YR_ARENA_REF* instruction_ref,
614
    YR_ARENA_REF* argument_ref)
615
0
{
616
0
  FAIL_ON_ERROR(yr_arena_write_data(
617
0
      emit_context->arena,
618
0
      YR_RE_CODE_SECTION,
619
0
      &opcode,
620
0
      sizeof(uint8_t),
621
0
      instruction_ref));
622
623
0
  FAIL_ON_ERROR(yr_arena_write_data(
624
0
      emit_context->arena,
625
0
      YR_RE_CODE_SECTION,
626
0
      &argument,
627
0
      sizeof(int16_t),
628
0
      argument_ref));
629
630
0
  return ERROR_SUCCESS;
631
0
}
632
633
int _yr_emit_inst_arg_struct(
634
    RE_EMIT_CONTEXT* emit_context,
635
    uint8_t opcode,
636
    void* structure,
637
    size_t structure_size,
638
    YR_ARENA_REF* instruction_ref,
639
    YR_ARENA_REF* argument_ref)
640
0
{
641
0
  FAIL_ON_ERROR(yr_arena_write_data(
642
0
      emit_context->arena,
643
0
      YR_RE_CODE_SECTION,
644
0
      &opcode,
645
0
      sizeof(uint8_t),
646
0
      instruction_ref));
647
648
0
  FAIL_ON_ERROR(yr_arena_write_data(
649
0
      emit_context->arena,
650
0
      YR_RE_CODE_SECTION,
651
0
      structure,
652
0
      structure_size,
653
0
      argument_ref));
654
655
0
  return ERROR_SUCCESS;
656
0
}
657
658
int _yr_emit_split(
659
    RE_EMIT_CONTEXT* emit_context,
660
    uint8_t opcode,
661
    int16_t argument,
662
    YR_ARENA_REF* instruction_ref,
663
    YR_ARENA_REF* argument_ref)
664
0
{
665
0
  assert(opcode == RE_OPCODE_SPLIT_A || opcode == RE_OPCODE_SPLIT_B);
666
667
0
  if (emit_context->next_split_id == RE_MAX_SPLIT_ID)
668
0
    return ERROR_REGULAR_EXPRESSION_TOO_COMPLEX;
669
670
0
  FAIL_ON_ERROR(yr_arena_write_data(
671
0
      emit_context->arena,
672
0
      YR_RE_CODE_SECTION,
673
0
      &opcode,
674
0
      sizeof(uint8_t),
675
0
      instruction_ref));
676
677
0
  FAIL_ON_ERROR(yr_arena_write_data(
678
0
      emit_context->arena,
679
0
      YR_RE_CODE_SECTION,
680
0
      &emit_context->next_split_id,
681
0
      sizeof(RE_SPLIT_ID_TYPE),
682
0
      NULL));
683
684
0
  emit_context->next_split_id++;
685
686
0
  FAIL_ON_ERROR(yr_arena_write_data(
687
0
      emit_context->arena,
688
0
      YR_RE_CODE_SECTION,
689
0
      &argument,
690
0
      sizeof(int16_t),
691
0
      argument_ref));
692
693
0
  return ERROR_SUCCESS;
694
0
}
695
696
static int _yr_re_emit(
697
    RE_EMIT_CONTEXT* emit_context,
698
    RE_NODE* re_node,
699
    int flags,
700
    YR_ARENA_REF* code_ref)
701
0
{
702
0
  int16_t jmp_offset;
703
704
0
  yr_arena_off_t bookmark_1 = 0;
705
0
  yr_arena_off_t bookmark_2 = 0;
706
0
  yr_arena_off_t bookmark_3 = 0;
707
0
  yr_arena_off_t bookmark_4 = 0;
708
709
0
  bool emit_split;
710
0
  bool emit_repeat;
711
0
  bool emit_prolog;
712
0
  bool emit_epilog;
713
714
0
  RE_REPEAT_ARGS repeat_args;
715
0
  RE_REPEAT_ARGS* repeat_start_args_addr;
716
0
  RE_REPEAT_ANY_ARGS repeat_any_args;
717
718
0
  RE_NODE* child;
719
720
0
  int16_t* split_offset_addr = NULL;
721
0
  int16_t* jmp_offset_addr = NULL;
722
723
0
  YR_ARENA_REF instruction_ref = YR_ARENA_NULL_REF;
724
0
  YR_ARENA_REF split_offset_ref;
725
0
  YR_ARENA_REF jmp_instruction_ref;
726
0
  YR_ARENA_REF jmp_offset_ref;
727
0
  YR_ARENA_REF repeat_start_args_ref;
728
729
0
  switch (re_node->type)
730
0
  {
731
0
  case RE_NODE_LITERAL:
732
0
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint8(
733
0
        emit_context,
734
0
        RE_OPCODE_LITERAL,
735
0
        re_node->value,
736
0
        &instruction_ref,
737
0
        NULL));
738
0
    break;
739
740
0
  case RE_NODE_NOT_LITERAL:
741
0
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint8(
742
0
        emit_context,
743
0
        RE_OPCODE_NOT_LITERAL,
744
0
        re_node->value,
745
0
        &instruction_ref,
746
0
        NULL));
747
0
    break;
748
749
0
  case RE_NODE_MASKED_LITERAL:
750
0
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
751
0
        emit_context,
752
0
        RE_OPCODE_MASKED_LITERAL,
753
0
        re_node->mask << 8 | re_node->value,
754
0
        &instruction_ref,
755
0
        NULL));
756
0
    break;
757
758
0
  case RE_NODE_MASKED_NOT_LITERAL:
759
0
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
760
0
        emit_context,
761
0
        RE_OPCODE_MASKED_NOT_LITERAL,
762
0
        re_node->mask << 8 | re_node->value,
763
0
        &instruction_ref,
764
0
        NULL));
765
0
    break;
766
767
0
  case RE_NODE_WORD_CHAR:
768
0
    FAIL_ON_ERROR(
769
0
        _yr_emit_inst(emit_context, RE_OPCODE_WORD_CHAR, &instruction_ref));
770
0
    break;
771
772
0
  case RE_NODE_NON_WORD_CHAR:
773
0
    FAIL_ON_ERROR(
774
0
        _yr_emit_inst(emit_context, RE_OPCODE_NON_WORD_CHAR, &instruction_ref));
775
0
    break;
776
777
0
  case RE_NODE_WORD_BOUNDARY:
778
0
    FAIL_ON_ERROR(
779
0
        _yr_emit_inst(emit_context, RE_OPCODE_WORD_BOUNDARY, &instruction_ref));
780
0
    break;
781
782
0
  case RE_NODE_NON_WORD_BOUNDARY:
783
0
    FAIL_ON_ERROR(_yr_emit_inst(
784
0
        emit_context, RE_OPCODE_NON_WORD_BOUNDARY, &instruction_ref));
785
0
    break;
786
787
0
  case RE_NODE_SPACE:
788
0
    FAIL_ON_ERROR(
789
0
        _yr_emit_inst(emit_context, RE_OPCODE_SPACE, &instruction_ref));
790
0
    break;
791
792
0
  case RE_NODE_NON_SPACE:
793
0
    FAIL_ON_ERROR(
794
0
        _yr_emit_inst(emit_context, RE_OPCODE_NON_SPACE, &instruction_ref));
795
0
    break;
796
797
0
  case RE_NODE_DIGIT:
798
0
    FAIL_ON_ERROR(
799
0
        _yr_emit_inst(emit_context, RE_OPCODE_DIGIT, &instruction_ref));
800
0
    break;
801
802
0
  case RE_NODE_NON_DIGIT:
803
0
    FAIL_ON_ERROR(
804
0
        _yr_emit_inst(emit_context, RE_OPCODE_NON_DIGIT, &instruction_ref));
805
0
    break;
806
807
0
  case RE_NODE_ANY:
808
0
    FAIL_ON_ERROR(_yr_emit_inst(emit_context, RE_OPCODE_ANY, &instruction_ref));
809
0
    break;
810
811
0
  case RE_NODE_CLASS:
812
0
    FAIL_ON_ERROR(
813
0
        _yr_emit_inst(emit_context, RE_OPCODE_CLASS, &instruction_ref));
814
815
0
    FAIL_ON_ERROR(yr_arena_write_data(
816
0
        emit_context->arena,
817
0
        YR_RE_CODE_SECTION,
818
0
        re_node->re_class,
819
0
        sizeof(*re_node->re_class),
820
0
        NULL));
821
0
    break;
822
823
0
  case RE_NODE_ANCHOR_START:
824
0
    FAIL_ON_ERROR(_yr_emit_inst(
825
0
        emit_context, RE_OPCODE_MATCH_AT_START, &instruction_ref));
826
0
    break;
827
828
0
  case RE_NODE_ANCHOR_END:
829
0
    FAIL_ON_ERROR(
830
0
        _yr_emit_inst(emit_context, RE_OPCODE_MATCH_AT_END, &instruction_ref));
831
0
    break;
832
833
0
  case RE_NODE_CONCAT:
834
0
    FAIL_ON_ERROR(_yr_re_emit(
835
0
        emit_context,
836
0
        (flags & EMIT_BACKWARDS) ? re_node->children_tail
837
0
                                 : re_node->children_head,
838
0
        flags,
839
0
        &instruction_ref));
840
841
0
    if (flags & EMIT_BACKWARDS)
842
0
      child = re_node->children_tail->prev_sibling;
843
0
    else
844
0
      child = re_node->children_head->next_sibling;
845
846
0
    while (child != NULL)
847
0
    {
848
0
      FAIL_ON_ERROR(_yr_re_emit(emit_context, child, flags, NULL));
849
850
0
      child = (flags & EMIT_BACKWARDS) ? child->prev_sibling
851
0
                                       : child->next_sibling;
852
0
    }
853
0
    break;
854
855
0
  case RE_NODE_PLUS:
856
    // Code for e+ looks like:
857
    //
858
    //          L1: code for e
859
    //              split L1, L2
860
    //          L2:
861
    //
862
0
    FAIL_ON_ERROR(_yr_re_emit(
863
0
        emit_context, re_node->children_head, flags, &instruction_ref));
864
865
0
    bookmark_1 = yr_arena_get_current_offset(
866
0
        emit_context->arena, YR_RE_CODE_SECTION);
867
868
0
    if (instruction_ref.offset - bookmark_1 < INT16_MIN)
869
0
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
870
871
0
    jmp_offset = (int16_t) (instruction_ref.offset - bookmark_1);
872
873
0
    FAIL_ON_ERROR(_yr_emit_split(
874
0
        emit_context,
875
0
        re_node->greedy ? RE_OPCODE_SPLIT_B : RE_OPCODE_SPLIT_A,
876
0
        jmp_offset,
877
0
        NULL,
878
0
        NULL));
879
880
0
    break;
881
882
0
  case RE_NODE_STAR:
883
    // Code for e* looks like:
884
    //
885
    //          L1: split L1, L2
886
    //              code for e
887
    //              jmp L1
888
    //          L2:
889
0
    FAIL_ON_ERROR(_yr_emit_split(
890
0
        emit_context,
891
0
        re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
892
0
        0,
893
0
        &instruction_ref,
894
0
        &split_offset_ref));
895
896
0
    FAIL_ON_ERROR(
897
0
        _yr_re_emit(emit_context, re_node->children_head, flags, NULL));
898
899
0
    bookmark_1 = yr_arena_get_current_offset(
900
0
        emit_context->arena, YR_RE_CODE_SECTION);
901
902
0
    if (instruction_ref.offset - bookmark_1 < INT16_MIN)
903
0
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
904
905
0
    jmp_offset = (int16_t) (instruction_ref.offset - bookmark_1);
906
907
    // Emit jump with offset set to 0.
908
909
0
    FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
910
0
        emit_context, RE_OPCODE_JUMP, jmp_offset, NULL, NULL));
911
912
0
    bookmark_1 = yr_arena_get_current_offset(
913
0
        emit_context->arena, YR_RE_CODE_SECTION);
914
915
0
    if (bookmark_1 - instruction_ref.offset > INT16_MAX)
916
0
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
917
918
0
    jmp_offset = (int16_t) (bookmark_1 - instruction_ref.offset);
919
920
    // Update split offset.
921
0
    split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
922
0
        emit_context->arena, &split_offset_ref);
923
924
0
    memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset));
925
0
    break;
926
927
0
  case RE_NODE_ALT:
928
    // Code for e1|e2 looks like:
929
    //
930
    //              split L1, L2
931
    //          L1: code for e1
932
    //              jmp L3
933
    //          L2: code for e2
934
    //          L3:
935
936
    // Emit a split instruction with offset set to 0 temporarily. Offset
937
    // will be updated after we know the size of the code generated for
938
    // the left node (e1).
939
940
0
    FAIL_ON_ERROR(_yr_emit_split(
941
0
        emit_context,
942
0
        RE_OPCODE_SPLIT_A,
943
0
        0,
944
0
        &instruction_ref,
945
0
        &split_offset_ref));
946
947
0
    FAIL_ON_ERROR(
948
0
        _yr_re_emit(emit_context, re_node->children_head, flags, NULL));
949
950
    // Emit jump with offset set to 0.
951
952
0
    FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
953
0
        emit_context,
954
0
        RE_OPCODE_JUMP,
955
0
        0,
956
0
        &jmp_instruction_ref,
957
0
        &jmp_offset_ref));
958
959
0
    bookmark_1 = yr_arena_get_current_offset(
960
0
        emit_context->arena, YR_RE_CODE_SECTION);
961
962
0
    if (bookmark_1 - instruction_ref.offset > INT16_MAX)
963
0
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
964
965
0
    jmp_offset = (int16_t) (bookmark_1 - instruction_ref.offset);
966
967
    // Update split offset.
968
0
    split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
969
0
        emit_context->arena, &split_offset_ref);
970
971
0
    memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset));
972
973
0
    FAIL_ON_ERROR(
974
0
        _yr_re_emit(emit_context, re_node->children_tail, flags, NULL));
975
976
0
    bookmark_1 = yr_arena_get_current_offset(
977
0
        emit_context->arena, YR_RE_CODE_SECTION);
978
979
0
    if (bookmark_1 - jmp_instruction_ref.offset > INT16_MAX)
980
0
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
981
982
0
    jmp_offset = (int16_t) (bookmark_1 - jmp_instruction_ref.offset);
983
984
    // Update offset for jmp instruction.
985
0
    jmp_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
986
0
        emit_context->arena, &jmp_offset_ref);
987
988
0
    memcpy(jmp_offset_addr, &jmp_offset, sizeof(jmp_offset));
989
0
    break;
990
991
0
  case RE_NODE_RANGE_ANY:
992
0
    repeat_any_args.min = re_node->start;
993
0
    repeat_any_args.max = re_node->end;
994
995
0
    FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
996
0
        emit_context,
997
0
        re_node->greedy ? RE_OPCODE_REPEAT_ANY_GREEDY
998
0
                        : RE_OPCODE_REPEAT_ANY_UNGREEDY,
999
0
        &repeat_any_args,
1000
0
        sizeof(repeat_any_args),
1001
0
        &instruction_ref,
1002
0
        NULL));
1003
1004
0
    break;
1005
1006
0
  case RE_NODE_RANGE:
1007
    // Code for e{n,m} looks like:
1008
    //
1009
    //            code for e              ---   prolog
1010
    //            repeat_start n, m, L1   --+
1011
    //        L0: code for e                |   repeat
1012
    //            repeat_end n, m, L0     --+
1013
    //        L1: split L2, L3            ---   split
1014
    //        L2: code for e              ---   epilog
1015
    //        L3:
1016
    //
1017
    // Not all sections (prolog, repeat, split and epilog) are generated in all
1018
    // cases, it depends on the values of n and m. The following table shows
1019
    // which sections are generated for the first few values of n and m.
1020
    //
1021
    //        n,m   prolog  repeat      split  epilog
1022
    //                      (min,max)
1023
    //        ---------------------------------------
1024
    //        0,0     -       -           -      -
1025
    //        0,1     -       -           X      X
1026
    //        0,2     -       0,1         X      X
1027
    //        0,3     -       0,2         X      X
1028
    //        0,M     -       0,M-1       X      X
1029
    //
1030
    //        1,1     X       -           -      -
1031
    //        1,2     X       -           X      X
1032
    //        1,3     X       0,1         X      X
1033
    //        1,4     X       1,2         X      X
1034
    //        1,M     X       1,M-2       X      X
1035
    //
1036
    //        2,2     X       -           -      X
1037
    //        2,3     X       1,1         X      X
1038
    //        2,4     X       1,2         X      X
1039
    //        2,M     X       1,M-2       X      X
1040
    //
1041
    //        3,3     X       1,1         -      X
1042
    //        3,4     X       2,2         X      X
1043
    //        3,M     X       2,M-2       X      X
1044
    //
1045
    //        4,4     X       2,2         -      X
1046
    //        4,5     X       3,3         X      X
1047
    //        4,M     X       3,M-2       X      X
1048
    //
1049
    // The code can't consists simply in the repeat section, the prolog and
1050
    // epilog are required because we can't have atoms pointing to code inside
1051
    // the repeat loop. Atoms' forwards_code will point to code in the prolog
1052
    // and backwards_code will point to code in the epilog (or in prolog if
1053
    // epilog wasn't generated, like in n=1,m=1)
1054
1055
0
    emit_prolog = re_node->start > 0;
1056
0
    emit_repeat = re_node->end > re_node->start + 1 || re_node->end > 2;
1057
0
    emit_split = re_node->end > re_node->start;
1058
0
    emit_epilog = re_node->end > re_node->start || re_node->end > 1;
1059
1060
0
    if (emit_prolog)
1061
0
    {
1062
0
      FAIL_ON_ERROR(_yr_re_emit(
1063
0
          emit_context, re_node->children_head, flags, &instruction_ref));
1064
0
    }
1065
1066
0
    if (emit_repeat)
1067
0
    {
1068
0
      repeat_args.min = re_node->start;
1069
0
      repeat_args.max = re_node->end;
1070
1071
0
      if (emit_prolog)
1072
0
      {
1073
0
        repeat_args.max--;
1074
0
        repeat_args.min--;
1075
0
      }
1076
1077
0
      if (emit_split)
1078
0
      {
1079
0
        repeat_args.max--;
1080
0
      }
1081
0
      else
1082
0
      {
1083
0
        repeat_args.min--;
1084
0
        repeat_args.max--;
1085
0
      }
1086
1087
0
      repeat_args.offset = 0;
1088
1089
0
      bookmark_1 = yr_arena_get_current_offset(
1090
0
          emit_context->arena, YR_RE_CODE_SECTION);
1091
1092
0
      FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
1093
0
          emit_context,
1094
0
          re_node->greedy ? RE_OPCODE_REPEAT_START_GREEDY
1095
0
                          : RE_OPCODE_REPEAT_START_UNGREEDY,
1096
0
          &repeat_args,
1097
0
          sizeof(repeat_args),
1098
0
          emit_prolog ? NULL : &instruction_ref,
1099
0
          &repeat_start_args_ref));
1100
1101
0
      bookmark_2 = yr_arena_get_current_offset(
1102
0
          emit_context->arena, YR_RE_CODE_SECTION);
1103
1104
0
      FAIL_ON_ERROR(_yr_re_emit(
1105
0
          emit_context,
1106
0
          re_node->children_head,
1107
0
          flags | EMIT_DONT_SET_FORWARDS_CODE | EMIT_DONT_SET_BACKWARDS_CODE,
1108
0
          NULL));
1109
1110
0
      bookmark_3 = yr_arena_get_current_offset(
1111
0
          emit_context->arena, YR_RE_CODE_SECTION);
1112
1113
0
      if (bookmark_2 - bookmark_3 < INT32_MIN)
1114
0
        return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1115
1116
0
      repeat_args.offset = (int32_t) (bookmark_2 - bookmark_3);
1117
1118
0
      FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
1119
0
          emit_context,
1120
0
          re_node->greedy ? RE_OPCODE_REPEAT_END_GREEDY
1121
0
                          : RE_OPCODE_REPEAT_END_UNGREEDY,
1122
0
          &repeat_args,
1123
0
          sizeof(repeat_args),
1124
0
          NULL,
1125
0
          NULL));
1126
1127
0
      bookmark_4 = yr_arena_get_current_offset(
1128
0
          emit_context->arena, YR_RE_CODE_SECTION);
1129
1130
0
      repeat_start_args_addr = (RE_REPEAT_ARGS*) yr_arena_ref_to_ptr(
1131
0
          emit_context->arena, &repeat_start_args_ref);
1132
1133
0
      if (bookmark_4 - bookmark_1 > INT32_MAX)
1134
0
        return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1135
1136
0
      repeat_start_args_addr->offset = (int32_t) (bookmark_4 - bookmark_1);
1137
0
    }
1138
1139
0
    if (emit_split)
1140
0
    {
1141
0
      bookmark_1 = yr_arena_get_current_offset(
1142
0
          emit_context->arena, YR_RE_CODE_SECTION);
1143
1144
0
      FAIL_ON_ERROR(_yr_emit_split(
1145
0
          emit_context,
1146
0
          re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
1147
0
          0,
1148
0
          NULL,
1149
0
          &split_offset_ref));
1150
0
    }
1151
1152
0
    if (emit_epilog)
1153
0
    {
1154
0
      FAIL_ON_ERROR(_yr_re_emit(
1155
0
          emit_context,
1156
0
          re_node->children_head,
1157
0
          emit_prolog ? flags | EMIT_DONT_SET_FORWARDS_CODE : flags,
1158
0
          emit_prolog || emit_repeat ? NULL : &instruction_ref));
1159
0
    }
1160
1161
0
    if (emit_split)
1162
0
    {
1163
0
      bookmark_2 = yr_arena_get_current_offset(
1164
0
          emit_context->arena, YR_RE_CODE_SECTION);
1165
1166
0
      if (bookmark_2 - bookmark_1 > INT16_MAX)
1167
0
        return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1168
1169
0
      split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
1170
0
          emit_context->arena, &split_offset_ref);
1171
1172
0
      jmp_offset = (int16_t) (bookmark_2 - bookmark_1);
1173
1174
0
      memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset));
1175
0
    }
1176
1177
0
    break;
1178
0
  }
1179
1180
0
  if (flags & EMIT_BACKWARDS)
1181
0
  {
1182
0
    if (!(flags & EMIT_DONT_SET_BACKWARDS_CODE))
1183
0
    {
1184
0
      re_node->backward_code_ref.buffer_id = YR_RE_CODE_SECTION;
1185
0
      re_node->backward_code_ref.offset = yr_arena_get_current_offset(
1186
0
          emit_context->arena, YR_RE_CODE_SECTION);
1187
0
    }
1188
0
  }
1189
0
  else
1190
0
  {
1191
0
    if (!(flags & EMIT_DONT_SET_FORWARDS_CODE))
1192
0
    {
1193
0
      re_node->forward_code_ref = instruction_ref;
1194
0
    }
1195
0
  }
1196
1197
0
  if (code_ref != NULL)
1198
0
    *code_ref = instruction_ref;
1199
1200
0
  return ERROR_SUCCESS;
1201
0
}
1202
1203
int yr_re_ast_emit_code(RE_AST* re_ast, YR_ARENA* arena, int backwards_code)
1204
0
{
1205
0
  RE_EMIT_CONTEXT emit_context;
1206
1207
  // Emit code for matching the regular expressions forwards.
1208
0
  emit_context.arena = arena;
1209
0
  emit_context.next_split_id = 0;
1210
1211
0
  FAIL_ON_ERROR(_yr_re_emit(
1212
0
      &emit_context,
1213
0
      re_ast->root_node,
1214
0
      backwards_code ? EMIT_BACKWARDS : 0,
1215
0
      NULL));
1216
1217
0
  FAIL_ON_ERROR(_yr_emit_inst(&emit_context, RE_OPCODE_MATCH, NULL));
1218
1219
0
  return ERROR_SUCCESS;
1220
0
}
1221
1222
static int _yr_re_fiber_create(RE_FIBER_POOL* fiber_pool, RE_FIBER** new_fiber)
1223
0
{
1224
0
  RE_FIBER* fiber;
1225
1226
0
  if (fiber_pool->fibers.head != NULL)
1227
0
  {
1228
0
    fiber = fiber_pool->fibers.head;
1229
0
    fiber_pool->fibers.head = fiber->next;
1230
1231
0
    if (fiber_pool->fibers.tail == fiber)
1232
0
      fiber_pool->fibers.tail = NULL;
1233
0
  }
1234
0
  else
1235
0
  {
1236
0
    if (fiber_pool->fiber_count == RE_MAX_FIBERS)
1237
0
      return ERROR_TOO_MANY_RE_FIBERS;
1238
1239
0
    fiber = (RE_FIBER*) yr_malloc(sizeof(RE_FIBER));
1240
1241
0
    if (fiber == NULL)
1242
0
      return ERROR_INSUFFICIENT_MEMORY;
1243
1244
0
    fiber_pool->fiber_count++;
1245
0
  }
1246
1247
0
  fiber->ip = NULL;
1248
0
  fiber->sp = -1;
1249
0
  fiber->rc = -1;
1250
0
  fiber->next = NULL;
1251
0
  fiber->prev = NULL;
1252
1253
0
  *new_fiber = fiber;
1254
1255
0
  return ERROR_SUCCESS;
1256
0
}
1257
1258
////////////////////////////////////////////////////////////////////////////////
1259
// Appends 'fiber' to 'fiber_list'
1260
//
1261
static void _yr_re_fiber_append(RE_FIBER_LIST* fiber_list, RE_FIBER* fiber)
1262
0
{
1263
0
  assert(fiber->prev == NULL);
1264
0
  assert(fiber->next == NULL);
1265
1266
0
  fiber->prev = fiber_list->tail;
1267
1268
0
  if (fiber_list->tail != NULL)
1269
0
    fiber_list->tail->next = fiber;
1270
1271
0
  fiber_list->tail = fiber;
1272
1273
0
  if (fiber_list->head == NULL)
1274
0
    fiber_list->head = fiber;
1275
1276
0
  assert(fiber_list->tail->next == NULL);
1277
0
  assert(fiber_list->head->prev == NULL);
1278
0
}
1279
1280
////////////////////////////////////////////////////////////////////////////////
1281
// Verifies if a fiber with the same properties (ip, rc, sp, and stack values)
1282
// than 'target_fiber' exists in 'fiber_list'. The list is iterated from
1283
// the start until 'last_fiber' (inclusive). Fibers past 'last_fiber' are not
1284
// taken into account.
1285
//
1286
static int _yr_re_fiber_exists(
1287
    RE_FIBER_LIST* fiber_list,
1288
    RE_FIBER* target_fiber,
1289
    RE_FIBER* last_fiber)
1290
0
{
1291
0
  RE_FIBER* fiber = fiber_list->head;
1292
1293
0
  int equal_stacks;
1294
0
  int i;
1295
1296
0
  if (last_fiber == NULL)
1297
0
    return false;
1298
1299
0
  while (fiber != last_fiber->next)
1300
0
  {
1301
0
    if (fiber->ip == target_fiber->ip && fiber->sp == target_fiber->sp &&
1302
0
        fiber->rc == target_fiber->rc)
1303
0
    {
1304
0
      equal_stacks = true;
1305
1306
0
      for (i = 0; i <= fiber->sp; i++)
1307
0
      {
1308
0
        if (fiber->stack[i] != target_fiber->stack[i])
1309
0
        {
1310
0
          equal_stacks = false;
1311
0
          break;
1312
0
        }
1313
0
      }
1314
1315
0
      if (equal_stacks)
1316
0
        return true;
1317
0
    }
1318
1319
0
    fiber = fiber->next;
1320
0
  }
1321
1322
0
  return false;
1323
0
}
1324
1325
////////////////////////////////////////////////////////////////////////////////
1326
// Clones a fiber in fiber_list and inserts the cloned fiber just after.
1327
// the original one. If fiber_list is:
1328
//
1329
//   f1 -> f2 -> f3 -> f4
1330
//
1331
// Splitting f2 will result in:
1332
//
1333
//   f1 -> f2 -> cloned f2 -> f3 -> f4
1334
//
1335
static int _yr_re_fiber_split(
1336
    RE_FIBER_LIST* fiber_list,
1337
    RE_FIBER_POOL* fiber_pool,
1338
    RE_FIBER* fiber,
1339
    RE_FIBER** new_fiber)
1340
0
{
1341
0
  int32_t i;
1342
1343
0
  FAIL_ON_ERROR(_yr_re_fiber_create(fiber_pool, new_fiber));
1344
1345
0
  (*new_fiber)->sp = fiber->sp;
1346
0
  (*new_fiber)->ip = fiber->ip;
1347
0
  (*new_fiber)->rc = fiber->rc;
1348
1349
0
  for (i = 0; i <= fiber->sp; i++) (*new_fiber)->stack[i] = fiber->stack[i];
1350
1351
0
  (*new_fiber)->next = fiber->next;
1352
0
  (*new_fiber)->prev = fiber;
1353
1354
0
  if (fiber->next != NULL)
1355
0
    fiber->next->prev = *new_fiber;
1356
1357
0
  fiber->next = *new_fiber;
1358
1359
0
  if (fiber_list->tail == fiber)
1360
0
    fiber_list->tail = *new_fiber;
1361
1362
0
  assert(fiber_list->tail->next == NULL);
1363
0
  assert(fiber_list->head->prev == NULL);
1364
1365
0
  return ERROR_SUCCESS;
1366
0
}
1367
1368
////////////////////////////////////////////////////////////////////////////////
1369
// Kills a given fiber by removing it from the fiber list and putting it in the
1370
// fiber pool.
1371
//
1372
static RE_FIBER* _yr_re_fiber_kill(
1373
    RE_FIBER_LIST* fiber_list,
1374
    RE_FIBER_POOL* fiber_pool,
1375
    RE_FIBER* fiber)
1376
0
{
1377
0
  RE_FIBER* next_fiber = fiber->next;
1378
1379
0
  if (fiber->prev != NULL)
1380
0
    fiber->prev->next = next_fiber;
1381
1382
0
  if (next_fiber != NULL)
1383
0
    next_fiber->prev = fiber->prev;
1384
1385
0
  if (fiber_pool->fibers.tail != NULL)
1386
0
    fiber_pool->fibers.tail->next = fiber;
1387
1388
0
  if (fiber_list->tail == fiber)
1389
0
    fiber_list->tail = fiber->prev;
1390
1391
0
  if (fiber_list->head == fiber)
1392
0
    fiber_list->head = next_fiber;
1393
1394
0
  fiber->next = NULL;
1395
0
  fiber->prev = fiber_pool->fibers.tail;
1396
0
  fiber_pool->fibers.tail = fiber;
1397
1398
0
  if (fiber_pool->fibers.head == NULL)
1399
0
    fiber_pool->fibers.head = fiber;
1400
1401
0
  return next_fiber;
1402
0
}
1403
1404
////////////////////////////////////////////////////////////////////////////////
1405
// Kills all fibers from the given one up to the end of the fiber list.
1406
//
1407
static void _yr_re_fiber_kill_tail(
1408
    RE_FIBER_LIST* fiber_list,
1409
    RE_FIBER_POOL* fiber_pool,
1410
    RE_FIBER* fiber)
1411
0
{
1412
0
  RE_FIBER* prev_fiber = fiber->prev;
1413
1414
0
  if (prev_fiber != NULL)
1415
0
    prev_fiber->next = NULL;
1416
1417
0
  fiber->prev = fiber_pool->fibers.tail;
1418
1419
0
  if (fiber_pool->fibers.tail != NULL)
1420
0
    fiber_pool->fibers.tail->next = fiber;
1421
1422
0
  fiber_pool->fibers.tail = fiber_list->tail;
1423
0
  fiber_list->tail = prev_fiber;
1424
1425
0
  if (fiber_list->head == fiber)
1426
0
    fiber_list->head = NULL;
1427
1428
0
  if (fiber_pool->fibers.head == NULL)
1429
0
    fiber_pool->fibers.head = fiber;
1430
0
}
1431
1432
////////////////////////////////////////////////////////////////////////////////
1433
// Kills all fibers in the fiber list.
1434
//
1435
static void _yr_re_fiber_kill_all(
1436
    RE_FIBER_LIST* fiber_list,
1437
    RE_FIBER_POOL* fiber_pool)
1438
0
{
1439
0
  if (fiber_list->head != NULL)
1440
0
    _yr_re_fiber_kill_tail(fiber_list, fiber_pool, fiber_list->head);
1441
0
}
1442
1443
////////////////////////////////////////////////////////////////////////////////
1444
// Executes a fiber until reaching an "matching" instruction. A "matching"
1445
// instruction is one that actually reads a byte from the input and performs
1446
// some matching. If the fiber reaches a split instruction, the new fiber is
1447
// also synced.
1448
//
1449
static int _yr_re_fiber_sync(
1450
    RE_FIBER_LIST* fiber_list,
1451
    RE_FIBER_POOL* fiber_pool,
1452
    RE_FIBER* fiber_to_sync)
1453
0
{
1454
  // A array for keeping track of which split instructions has been already
1455
  // executed. Each split instruction within a regexp has an associated ID
1456
  // between 0 and RE_MAX_SPLIT_ID-1. Keeping track of executed splits is
1457
  // required to avoid infinite loops in regexps like (a*)* or (a|)*
1458
1459
0
  RE_SPLIT_ID_TYPE splits_executed[RE_MAX_SPLIT_ID];
1460
0
  RE_SPLIT_ID_TYPE splits_executed_count = 0;
1461
0
  RE_SPLIT_ID_TYPE split_id, splits_executed_idx;
1462
1463
0
  int split_already_executed;
1464
1465
0
  RE_REPEAT_ARGS* repeat_args;
1466
0
  RE_REPEAT_ANY_ARGS* repeat_any_args;
1467
1468
0
  RE_FIBER* fiber;
1469
0
  RE_FIBER* last;
1470
0
  RE_FIBER* next;
1471
0
  RE_FIBER* branch_a;
1472
0
  RE_FIBER* branch_b;
1473
1474
0
  fiber = fiber_to_sync;
1475
0
  last = fiber_to_sync->next;
1476
1477
0
  while (fiber != last)
1478
0
  {
1479
0
    int16_t jmp_len;
1480
0
    uint8_t opcode = *fiber->ip;
1481
1482
0
    switch (opcode)
1483
0
    {
1484
0
    case RE_OPCODE_SPLIT_A:
1485
0
    case RE_OPCODE_SPLIT_B:
1486
1487
0
      split_id = *(RE_SPLIT_ID_TYPE*) (fiber->ip + 1);
1488
0
      split_already_executed = false;
1489
1490
0
      for (splits_executed_idx = 0; splits_executed_idx < splits_executed_count;
1491
0
           splits_executed_idx++)
1492
0
      {
1493
0
        if (split_id == splits_executed[splits_executed_idx])
1494
0
        {
1495
0
          split_already_executed = true;
1496
0
          break;
1497
0
        }
1498
0
      }
1499
1500
0
      if (split_already_executed)
1501
0
      {
1502
0
        fiber = _yr_re_fiber_kill(fiber_list, fiber_pool, fiber);
1503
0
      }
1504
0
      else
1505
0
      {
1506
0
        branch_a = fiber;
1507
1508
0
        FAIL_ON_ERROR(
1509
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1510
1511
        // With RE_OPCODE_SPLIT_A the current fiber continues at the next
1512
        // instruction in the stream (branch A), while the newly created
1513
        // fiber starts at the address indicated by the instruction (branch B)
1514
        // RE_OPCODE_SPLIT_B has the opposite behavior.
1515
1516
0
        if (opcode == RE_OPCODE_SPLIT_B)
1517
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1518
1519
        // Branch A continues at the next instruction
1520
0
        branch_a->ip += (sizeof(RE_SPLIT_ID_TYPE) + 3);
1521
1522
        // Branch B adds the offset indicated by the split instruction.
1523
0
        jmp_len = yr_unaligned_i16(branch_b->ip + 1 + sizeof(RE_SPLIT_ID_TYPE));
1524
1525
0
        branch_b->ip += jmp_len;
1526
1527
#ifdef YR_PARANOID_MODE
1528
        // In normal conditions this should never happen. But with compiled
1529
        // rules that has been hand-crafted by a malicious actor this could
1530
        // happen.
1531
        if (splits_executed_count >= RE_MAX_SPLIT_ID)
1532
          return ERROR_INTERNAL_FATAL_ERROR;
1533
#endif
1534
1535
0
        splits_executed[splits_executed_count] = split_id;
1536
0
        splits_executed_count++;
1537
0
      }
1538
1539
0
      break;
1540
1541
0
    case RE_OPCODE_REPEAT_START_GREEDY:
1542
0
    case RE_OPCODE_REPEAT_START_UNGREEDY:
1543
1544
0
      repeat_args = (RE_REPEAT_ARGS*) (fiber->ip + 1);
1545
0
      assert(repeat_args->max > 0);
1546
0
      branch_a = fiber;
1547
1548
0
      if (repeat_args->min == 0)
1549
0
      {
1550
0
        FAIL_ON_ERROR(
1551
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1552
1553
0
        if (opcode == RE_OPCODE_REPEAT_START_UNGREEDY)
1554
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1555
1556
0
        branch_b->ip += repeat_args->offset;
1557
0
      }
1558
1559
0
      branch_a->stack[++branch_a->sp] = 0;
1560
0
      branch_a->ip += (1 + sizeof(RE_REPEAT_ARGS));
1561
0
      break;
1562
1563
0
    case RE_OPCODE_REPEAT_END_GREEDY:
1564
0
    case RE_OPCODE_REPEAT_END_UNGREEDY:
1565
1566
0
      repeat_args = (RE_REPEAT_ARGS*) (fiber->ip + 1);
1567
0
      fiber->stack[fiber->sp]++;
1568
1569
0
      if (fiber->stack[fiber->sp] < repeat_args->min)
1570
0
      {
1571
0
        fiber->ip += repeat_args->offset;
1572
0
        break;
1573
0
      }
1574
1575
0
      branch_a = fiber;
1576
1577
0
      if (fiber->stack[fiber->sp] < repeat_args->max)
1578
0
      {
1579
0
        FAIL_ON_ERROR(
1580
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1581
1582
0
        if (opcode == RE_OPCODE_REPEAT_END_GREEDY)
1583
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1584
1585
0
        branch_b->ip += repeat_args->offset;
1586
0
      }
1587
1588
0
      branch_a->sp--;
1589
0
      branch_a->ip += (1 + sizeof(RE_REPEAT_ARGS));
1590
0
      break;
1591
1592
0
    case RE_OPCODE_REPEAT_ANY_GREEDY:
1593
0
    case RE_OPCODE_REPEAT_ANY_UNGREEDY:
1594
1595
0
      repeat_any_args = (RE_REPEAT_ANY_ARGS*) (fiber->ip + 1);
1596
1597
      // If repetition counter (rc) is -1 it means that we are reaching this
1598
      // instruction from the previous one in the instructions stream. In
1599
      // this case let's initialize the counter to 0 and start looping.
1600
1601
0
      if (fiber->rc == -1)
1602
0
        fiber->rc = 0;
1603
1604
0
      if (fiber->rc < repeat_any_args->min)
1605
0
      {
1606
        // Increase repetition counter and continue with next fiber. The
1607
        // instruction pointer for this fiber is not incremented yet, this
1608
        // fiber spins in this same instruction until reaching the minimum
1609
        // number of repetitions.
1610
1611
0
        fiber->rc++;
1612
0
        fiber = fiber->next;
1613
0
      }
1614
0
      else if (fiber->rc < repeat_any_args->max)
1615
0
      {
1616
        // Once the minimum number of repetitions are matched one fiber
1617
        // remains spinning in this instruction until reaching the maximum
1618
        // number of repetitions while new fibers are created. New fibers
1619
        // start executing at the next instruction.
1620
1621
0
        next = fiber->next;
1622
0
        branch_a = fiber;
1623
1624
0
        FAIL_ON_ERROR(
1625
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1626
1627
0
        if (opcode == RE_OPCODE_REPEAT_ANY_UNGREEDY)
1628
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1629
1630
0
        branch_a->rc++;
1631
0
        branch_b->ip += (1 + sizeof(RE_REPEAT_ANY_ARGS));
1632
0
        branch_b->rc = -1;
1633
1634
0
        FAIL_ON_ERROR(_yr_re_fiber_sync(fiber_list, fiber_pool, branch_b));
1635
1636
0
        fiber = next;
1637
0
      }
1638
0
      else
1639
0
      {
1640
        // When the maximum number of repetitions is reached the fiber keeps
1641
        // executing at the next instruction. The repetition counter is set
1642
        // to -1 indicating that we are not spinning in a repeat instruction
1643
        // anymore.
1644
1645
0
        fiber->ip += (1 + sizeof(RE_REPEAT_ANY_ARGS));
1646
0
        fiber->rc = -1;
1647
0
      }
1648
1649
0
      break;
1650
1651
0
    case RE_OPCODE_JUMP:
1652
0
      jmp_len = yr_unaligned_i16(fiber->ip + 1);
1653
0
      fiber->ip += jmp_len;
1654
0
      break;
1655
1656
0
    default:
1657
0
      fiber = fiber->next;
1658
0
    }
1659
0
  }
1660
1661
0
  return ERROR_SUCCESS;
1662
0
}
1663
1664
////////////////////////////////////////////////////////////////////////////////
1665
// Executes a regular expression. The specified regular expression will try to
1666
// match the data starting at the address specified by "input". The "input"
1667
// pointer can point to any address inside a memory buffer. Arguments
1668
// "input_forwards_size" and "input_backwards_size" indicate how many bytes
1669
// can be accessible starting at "input" and going forwards and backwards
1670
// respectively.
1671
//
1672
//   <--- input_backwards_size -->|<----------- input_forwards_size -------->
1673
//  |--------  memory buffer  -----------------------------------------------|
1674
//                                ^
1675
//                              input
1676
//
1677
// Args:
1678
//   YR_SCAN_CONTEXT *context         - Scan context.
1679
//   const uint8_t* code              - Regexp code be executed
1680
//   const uint8_t* input             - Pointer to input data
1681
//   size_t input_forwards_size       - Number of accessible bytes starting at
1682
//                                      "input" and going forwards.
1683
//   size_t input_backwards_size      - Number of accessible bytes starting at
1684
//                                      "input" and going backwards
1685
//   int flags                        - Flags:
1686
//      RE_FLAGS_SCAN
1687
//      RE_FLAGS_BACKWARDS
1688
//      RE_FLAGS_EXHAUSTIVE
1689
//      RE_FLAGS_WIDE
1690
//      RE_FLAGS_NO_CASE
1691
//      RE_FLAGS_DOT_ALL
1692
//   RE_MATCH_CALLBACK_FUNC callback  - Callback function
1693
//   void* callback_args              - Callback argument
1694
//   int*  matches                    - Pointer to an integer receiving the
1695
//                                      number of matching bytes. Notice that
1696
//                                      0 means a zero-length match, while no
1697
//                                      matches is -1.
1698
// Returns:
1699
//    ERROR_SUCCESS or any other error code.
1700
1701
int yr_re_exec(
1702
    YR_SCAN_CONTEXT* context,
1703
    const uint8_t* code,
1704
    const uint8_t* input_data,
1705
    size_t input_forwards_size,
1706
    size_t input_backwards_size,
1707
    int flags,
1708
    RE_MATCH_CALLBACK_FUNC callback,
1709
    void* callback_args,
1710
    int* matches)
1711
0
{
1712
0
  const uint8_t* input;
1713
0
  const uint8_t* ip;
1714
1715
0
  uint16_t opcode_args;
1716
0
  uint8_t mask;
1717
0
  uint8_t value;
1718
0
  uint8_t character_size;
1719
1720
0
  RE_FIBER_LIST fibers;
1721
0
  RE_FIBER* fiber;
1722
0
  RE_FIBER* next_fiber;
1723
1724
0
  int bytes_matched;
1725
0
  int max_bytes_matched;
1726
1727
0
  int match;
1728
0
  int input_incr;
1729
0
  int kill;
1730
0
  int action;
1731
1732
0
  bool prev_is_word_char = false;
1733
0
  bool input_is_word_char = false;
1734
1735
0
#define ACTION_NONE      0
1736
0
#define ACTION_CONTINUE  1
1737
0
#define ACTION_KILL      2
1738
0
#define ACTION_KILL_TAIL 3
1739
1740
0
#define prolog                                      \
1741
0
  {                                                 \
1742
0
    if ((bytes_matched >= max_bytes_matched) ||     \
1743
0
        (character_size == 2 && *(input + 1) != 0)) \
1744
0
    {                                               \
1745
0
      action = ACTION_KILL;                         \
1746
0
      break;                                        \
1747
0
    }                                               \
1748
0
  }
1749
1750
0
  if (matches != NULL)
1751
0
    *matches = -1;
1752
1753
0
  if (flags & RE_FLAGS_WIDE)
1754
0
    character_size = 2;
1755
0
  else
1756
0
    character_size = 1;
1757
1758
0
  input = input_data;
1759
0
  input_incr = character_size;
1760
1761
0
  if (flags & RE_FLAGS_BACKWARDS)
1762
0
  {
1763
    // Signedness conversion is sound as long as YR_RE_SCAN_LIMIT <= INT_MAX
1764
0
    max_bytes_matched = (int) yr_min(input_backwards_size, YR_RE_SCAN_LIMIT);
1765
0
    input -= character_size;
1766
0
    input_incr = -input_incr;
1767
0
  }
1768
0
  else
1769
0
  {
1770
    // Signedness conversion is sound as long as YR_RE_SCAN_LIMIT <= INT_MAX
1771
0
    max_bytes_matched = (int) yr_min(input_forwards_size, YR_RE_SCAN_LIMIT);
1772
0
  }
1773
1774
  // Round down max_bytes_matched to a multiple of character_size, this way if
1775
  // character_size is 2 and max_bytes_matched is odd we are ignoring the
1776
  // extra byte which can't match anyways.
1777
1778
0
  max_bytes_matched = max_bytes_matched - max_bytes_matched % character_size;
1779
0
  bytes_matched = 0;
1780
1781
0
  FAIL_ON_ERROR(_yr_re_fiber_create(&context->re_fiber_pool, &fiber));
1782
1783
0
  fiber->ip = code;
1784
0
  fibers.head = fiber;
1785
0
  fibers.tail = fiber;
1786
1787
0
  FAIL_ON_ERROR_WITH_CLEANUP(
1788
0
      _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
1789
0
      _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1790
1791
0
  while (fibers.head != NULL)
1792
0
  {
1793
0
    fiber = fibers.head;
1794
1795
0
    while (fiber != NULL)
1796
0
    {
1797
0
      next_fiber = fiber->next;
1798
1799
0
      if (_yr_re_fiber_exists(&fibers, fiber, fiber->prev))
1800
0
        _yr_re_fiber_kill(&fibers, &context->re_fiber_pool, fiber);
1801
1802
0
      fiber = next_fiber;
1803
0
    }
1804
1805
0
    fiber = fibers.head;
1806
1807
0
    while (fiber != NULL)
1808
0
    {
1809
0
      ip = fiber->ip;
1810
0
      action = ACTION_NONE;
1811
1812
0
      switch (*ip)
1813
0
      {
1814
0
      case RE_OPCODE_ANY:
1815
0
        prolog;
1816
0
        match = (flags & RE_FLAGS_DOT_ALL) || (*input != 0x0A);
1817
0
        action = match ? ACTION_NONE : ACTION_KILL;
1818
0
        fiber->ip += 1;
1819
0
        break;
1820
1821
0
      case RE_OPCODE_REPEAT_ANY_GREEDY:
1822
0
      case RE_OPCODE_REPEAT_ANY_UNGREEDY:
1823
0
        prolog;
1824
0
        match = (flags & RE_FLAGS_DOT_ALL) || (*input != 0x0A);
1825
0
        action = match ? ACTION_NONE : ACTION_KILL;
1826
1827
        // The instruction pointer is not incremented here. The current fiber
1828
        // spins in this instruction until reaching the required number of
1829
        // repetitions. The code controlling the number of repetitions is in
1830
        // _yr_re_fiber_sync.
1831
1832
0
        break;
1833
1834
0
      case RE_OPCODE_LITERAL:
1835
0
        prolog;
1836
0
        if (flags & RE_FLAGS_NO_CASE)
1837
0
          match = yr_lowercase[*input] == yr_lowercase[*(ip + 1)];
1838
0
        else
1839
0
          match = (*input == *(ip + 1));
1840
0
        action = match ? ACTION_NONE : ACTION_KILL;
1841
0
        fiber->ip += 2;
1842
0
        break;
1843
1844
0
      case RE_OPCODE_NOT_LITERAL:
1845
0
        prolog;
1846
1847
        // We don't need to take into account the case-insensitive
1848
        // case because this opcode is only used with hex strings,
1849
        // which can't be case-insensitive.
1850
1851
0
        match = (*input != *(ip + 1));
1852
0
        action = match ? ACTION_NONE : ACTION_KILL;
1853
0
        fiber->ip += 2;
1854
0
        break;
1855
1856
0
      case RE_OPCODE_MASKED_LITERAL:
1857
0
        prolog;
1858
0
        opcode_args = yr_unaligned_u16(ip + 1);
1859
0
        mask = opcode_args >> 8;
1860
0
        value = opcode_args & 0xFF;
1861
1862
        // We don't need to take into account the case-insensitive
1863
        // case because this opcode is only used with hex strings,
1864
        // which can't be case-insensitive.
1865
1866
0
        match = ((*input & mask) == value);
1867
0
        action = match ? ACTION_NONE : ACTION_KILL;
1868
0
        fiber->ip += 3;
1869
0
        break;
1870
1871
0
      case RE_OPCODE_MASKED_NOT_LITERAL:
1872
0
        prolog;
1873
0
        opcode_args = yr_unaligned_u16(ip + 1);
1874
0
        mask = opcode_args >> 8;
1875
0
        value = opcode_args & 0xFF;
1876
1877
        // We don't need to take into account the case-insensitive
1878
        // case because this opcode is only used with hex strings,
1879
        // which can't be case-insensitive.
1880
1881
0
        match = ((*input & mask) != value);
1882
0
        action = match ? ACTION_NONE : ACTION_KILL;
1883
0
        fiber->ip += 3;
1884
0
        break;
1885
1886
0
      case RE_OPCODE_CLASS:
1887
0
        prolog;
1888
0
        match = _yr_re_is_char_in_class(
1889
0
            (RE_CLASS*) (ip + 1), *input, flags & RE_FLAGS_NO_CASE);
1890
0
        action = match ? ACTION_NONE : ACTION_KILL;
1891
0
        fiber->ip += (sizeof(RE_CLASS) + 1);
1892
0
        break;
1893
1894
0
      case RE_OPCODE_WORD_CHAR:
1895
0
        prolog;
1896
0
        match = _yr_re_is_word_char(input, character_size);
1897
0
        action = match ? ACTION_NONE : ACTION_KILL;
1898
0
        fiber->ip += 1;
1899
0
        break;
1900
1901
0
      case RE_OPCODE_NON_WORD_CHAR:
1902
0
        prolog;
1903
0
        match = !_yr_re_is_word_char(input, character_size);
1904
0
        action = match ? ACTION_NONE : ACTION_KILL;
1905
0
        fiber->ip += 1;
1906
0
        break;
1907
1908
0
      case RE_OPCODE_SPACE:
1909
0
      case RE_OPCODE_NON_SPACE:
1910
1911
0
        prolog;
1912
1913
0
        switch (*input)
1914
0
        {
1915
0
        case ' ':
1916
0
        case '\t':
1917
0
        case '\r':
1918
0
        case '\n':
1919
0
        case '\v':
1920
0
        case '\f':
1921
0
          match = true;
1922
0
          break;
1923
0
        default:
1924
0
          match = false;
1925
0
        }
1926
1927
0
        if (*ip == RE_OPCODE_NON_SPACE)
1928
0
          match = !match;
1929
1930
0
        action = match ? ACTION_NONE : ACTION_KILL;
1931
0
        fiber->ip += 1;
1932
0
        break;
1933
1934
0
      case RE_OPCODE_DIGIT:
1935
0
        prolog;
1936
0
        match = isdigit(*input);
1937
0
        action = match ? ACTION_NONE : ACTION_KILL;
1938
0
        fiber->ip += 1;
1939
0
        break;
1940
1941
0
      case RE_OPCODE_NON_DIGIT:
1942
0
        prolog;
1943
0
        match = !isdigit(*input);
1944
0
        action = match ? ACTION_NONE : ACTION_KILL;
1945
0
        fiber->ip += 1;
1946
0
        break;
1947
1948
0
      case RE_OPCODE_WORD_BOUNDARY:
1949
0
      case RE_OPCODE_NON_WORD_BOUNDARY:
1950
0
        if (input - input_incr + character_size <=
1951
0
                input_data + input_forwards_size &&
1952
0
            input - input_incr >= input_data - input_backwards_size)
1953
0
        {
1954
0
          prev_is_word_char = _yr_re_is_word_char(
1955
0
              input - input_incr, character_size);
1956
0
        }
1957
0
        else
1958
0
        {
1959
0
          prev_is_word_char = false;
1960
0
        }
1961
1962
0
        if (input + character_size <= input_data + input_forwards_size &&
1963
0
            input >= input_data - input_backwards_size)
1964
0
        {
1965
0
          input_is_word_char = _yr_re_is_word_char(input, character_size);
1966
0
        }
1967
0
        else
1968
0
        {
1969
0
          input_is_word_char = false;
1970
0
        }
1971
1972
0
        match = (prev_is_word_char && !input_is_word_char) ||
1973
0
                (!prev_is_word_char && input_is_word_char);
1974
1975
0
        if (*ip == RE_OPCODE_NON_WORD_BOUNDARY)
1976
0
          match = !match;
1977
1978
0
        action = match ? ACTION_CONTINUE : ACTION_KILL;
1979
0
        fiber->ip += 1;
1980
0
        break;
1981
1982
0
      case RE_OPCODE_MATCH_AT_START:
1983
0
        if (flags & RE_FLAGS_BACKWARDS)
1984
0
          kill = input_backwards_size > (size_t) bytes_matched;
1985
0
        else
1986
0
          kill = input_backwards_size > 0 || (bytes_matched != 0);
1987
0
        action = kill ? ACTION_KILL : ACTION_CONTINUE;
1988
0
        fiber->ip += 1;
1989
0
        break;
1990
1991
0
      case RE_OPCODE_MATCH_AT_END:
1992
0
        kill = flags & RE_FLAGS_BACKWARDS ||
1993
0
               input_forwards_size > (size_t) bytes_matched;
1994
0
        action = kill ? ACTION_KILL : ACTION_CONTINUE;
1995
0
        fiber->ip += 1;
1996
0
        break;
1997
1998
0
      case RE_OPCODE_MATCH:
1999
2000
0
        if (matches != NULL)
2001
0
          *matches = bytes_matched;
2002
2003
0
        if (flags & RE_FLAGS_EXHAUSTIVE)
2004
0
        {
2005
0
          if (callback != NULL)
2006
0
          {
2007
0
            if (flags & RE_FLAGS_BACKWARDS)
2008
0
            {
2009
0
              FAIL_ON_ERROR_WITH_CLEANUP(
2010
0
                  callback(
2011
0
                      input + character_size,
2012
0
                      bytes_matched,
2013
0
                      flags,
2014
0
                      callback_args),
2015
0
                  _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2016
0
            }
2017
0
            else
2018
0
            {
2019
0
              FAIL_ON_ERROR_WITH_CLEANUP(
2020
0
                  callback(input_data, bytes_matched, flags, callback_args),
2021
0
                  _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2022
0
            }
2023
0
          }
2024
2025
0
          action = ACTION_KILL;
2026
0
        }
2027
0
        else
2028
0
        {
2029
0
          action = ACTION_KILL_TAIL;
2030
0
        }
2031
2032
0
        break;
2033
2034
0
      default:
2035
0
        assert(false);
2036
0
      }
2037
2038
0
      switch (action)
2039
0
      {
2040
0
      case ACTION_KILL:
2041
0
        fiber = _yr_re_fiber_kill(&fibers, &context->re_fiber_pool, fiber);
2042
0
        break;
2043
2044
0
      case ACTION_KILL_TAIL:
2045
0
        _yr_re_fiber_kill_tail(&fibers, &context->re_fiber_pool, fiber);
2046
0
        fiber = NULL;
2047
0
        break;
2048
2049
0
      case ACTION_CONTINUE:
2050
0
        FAIL_ON_ERROR_WITH_CLEANUP(
2051
0
            _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
2052
0
            _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2053
0
        break;
2054
2055
0
      default:
2056
0
        next_fiber = fiber->next;
2057
0
        FAIL_ON_ERROR_WITH_CLEANUP(
2058
0
            _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
2059
0
            _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2060
0
        fiber = next_fiber;
2061
0
      }
2062
0
    }
2063
2064
0
    input += input_incr;
2065
0
    bytes_matched += character_size;
2066
2067
0
    if (flags & RE_FLAGS_SCAN && bytes_matched < max_bytes_matched)
2068
0
    {
2069
0
      FAIL_ON_ERROR_WITH_CLEANUP(
2070
0
          _yr_re_fiber_create(&context->re_fiber_pool, &fiber),
2071
0
          _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2072
2073
0
      fiber->ip = code;
2074
0
      _yr_re_fiber_append(&fibers, fiber);
2075
2076
0
      FAIL_ON_ERROR_WITH_CLEANUP(
2077
0
          _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
2078
0
          _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2079
0
    }
2080
0
  }
2081
2082
0
  return ERROR_SUCCESS;
2083
0
}
2084
2085
////////////////////////////////////////////////////////////////////////////////
2086
// Helper function that creates a RE_FAST_EXEC_POSITION by either allocating it
2087
// or reusing a previously allocated one from a pool.
2088
//
2089
static int _yr_re_fast_exec_position_create(
2090
    RE_FAST_EXEC_POSITION_POOL* pool,
2091
    RE_FAST_EXEC_POSITION** new_position)
2092
0
{
2093
0
  RE_FAST_EXEC_POSITION* position;
2094
2095
0
  if (pool->head != NULL)
2096
0
  {
2097
0
    position = pool->head;
2098
0
    pool->head = position->next;
2099
0
  }
2100
0
  else
2101
0
  {
2102
0
    position = (RE_FAST_EXEC_POSITION*) yr_malloc(
2103
0
        sizeof(RE_FAST_EXEC_POSITION));
2104
2105
0
    if (position == NULL)
2106
0
      return ERROR_INSUFFICIENT_MEMORY;
2107
0
  }
2108
2109
0
  position->input = NULL;
2110
0
  position->round = 0;
2111
0
  position->next = NULL;
2112
0
  position->prev = NULL;
2113
2114
0
  *new_position = position;
2115
2116
0
  return ERROR_SUCCESS;
2117
0
}
2118
2119
////////////////////////////////////////////////////////////////////////////////
2120
// Helper function that moves a list of RE_FAST_EXEC_POSITION structures to a
2121
// pool represented by a RE_FAST_EXEC_POSITION_POOL. Receives pointers to the
2122
// pool, the first item, and last item in the list.
2123
//
2124
static void _yr_re_fast_exec_destroy_position_list(
2125
    RE_FAST_EXEC_POSITION_POOL* pool,
2126
    RE_FAST_EXEC_POSITION* first,
2127
    RE_FAST_EXEC_POSITION* last)
2128
0
{
2129
0
  last->next = pool->head;
2130
2131
0
  if (pool->head != NULL)
2132
0
    pool->head->prev = last;
2133
2134
0
  pool->head = first;
2135
0
}
2136
2137
////////////////////////////////////////////////////////////////////////////////
2138
// This function replaces yr_re_exec for regular expressions marked with flag
2139
// RE_FLAGS_FAST_REGEXP. These regular expressions are derived from hex strings
2140
// that don't contain alternatives, like for example:
2141
//
2142
//   { 01 02 03 04 [0-2] 04 05 06 07 }
2143
//
2144
// The regexp's code produced by such strings can be matched by a faster, less
2145
// general algorithm, and it contains only the following opcodes:
2146
//
2147
//   * RE_OPCODE_LITERAL
2148
//   * RE_OPCODE_MASKED_LITERAL,
2149
//   * RE_OPCODE_NOT_LITERAL
2150
//   * RE_OPCODE_MASKED_NOT_LITERAL
2151
//   * RE_OPCODE_ANY
2152
//   * RE_OPCODE_REPEAT_ANY_UNGREEDY
2153
//   * RE_OPCODE_MATCH.
2154
//
2155
// The regexp's code is executed one instruction at time, and the function
2156
// maintains a list of positions within the input for tracking different
2157
// matching alternatives, these positions are described by an instance of
2158
// RE_FAST_EXEC_POSITION. With each instruction the list of positions is
2159
// updated, removing those where the input data doesn't match the current
2160
// instruction, or adding new positions if the instruction is
2161
// RE_OPCODE_REPEAT_ANY_UNGREEDY. The list of positions is maintained sorted
2162
// by the value of the pointer they hold to the input, and it doesn't contain
2163
// duplicated pointer values.
2164
//
2165
int yr_re_fast_exec(
2166
    YR_SCAN_CONTEXT* context,
2167
    const uint8_t* code,
2168
    const uint8_t* input_data,
2169
    size_t input_forwards_size,
2170
    size_t input_backwards_size,
2171
    int flags,
2172
    RE_MATCH_CALLBACK_FUNC callback,
2173
    void* callback_args,
2174
    int* matches)
2175
0
{
2176
0
  RE_REPEAT_ANY_ARGS* repeat_any_args;
2177
2178
  // Pointers to the first and last position in the list.
2179
0
  RE_FAST_EXEC_POSITION* first;
2180
0
  RE_FAST_EXEC_POSITION* last;
2181
2182
0
  int input_incr = flags & RE_FLAGS_BACKWARDS ? -1 : 1;
2183
0
  int bytes_matched;
2184
0
  int max_bytes_matched;
2185
2186
0
  if (flags & RE_FLAGS_BACKWARDS)
2187
0
    max_bytes_matched = (int) yr_min(input_backwards_size, YR_RE_SCAN_LIMIT);
2188
0
  else
2189
0
    max_bytes_matched = (int) yr_min(input_forwards_size, YR_RE_SCAN_LIMIT);
2190
2191
0
  const uint8_t* ip = code;
2192
2193
  // Create the first position in the list, which points to the start of the
2194
  // input data. Intially this is the only position, more positions will be
2195
  // created every time RE_OPCODE_REPEAT_ANY_UNGREEDY is executed.
2196
0
  FAIL_ON_ERROR(_yr_re_fast_exec_position_create(
2197
0
      &context->re_fast_exec_position_pool, &first));
2198
2199
0
  first->round = 0;
2200
0
  first->input = input_data;
2201
0
  first->prev = NULL;
2202
0
  first->next = NULL;
2203
2204
0
  if (flags & RE_FLAGS_BACKWARDS)
2205
0
    first->input--;
2206
2207
  // As we are starting with a single position, the last one and the first one
2208
  // are the same.
2209
0
  last = first;
2210
2211
  // Round is incremented with every regxp instruction.
2212
0
  int round = 0;
2213
2214
  // Keep in the loop until the list of positions gets empty. Positions are
2215
  // removed from the list when they don't match the current instruction in
2216
  // the code.
2217
0
  while (first != NULL)
2218
0
  {
2219
0
    RE_FAST_EXEC_POSITION* current = first;
2220
2221
    // Iterate over all the positions, executing the current instruction at
2222
    // all of them.
2223
0
    while (current != NULL)
2224
0
    {
2225
0
      RE_FAST_EXEC_POSITION* next = current->next;
2226
2227
      // Ignore any position in the list whose round number is different from
2228
      // the current round. This prevents new positions added to the list in
2229
      // this round from being taken into account in this same round.
2230
0
      if (current->round != round)
2231
0
      {
2232
0
        current = next;
2233
0
        continue;
2234
0
      }
2235
2236
0
      bytes_matched = flags & RE_FLAGS_BACKWARDS
2237
0
                          ? (int) (input_data - current->input - 1)
2238
0
                          : (int) (current->input - input_data);
2239
2240
0
      uint16_t opcode_args;
2241
0
      uint8_t mask;
2242
0
      uint8_t value;
2243
2244
0
      bool match = false;
2245
2246
0
      switch (*ip)
2247
0
      {
2248
0
      case RE_OPCODE_ANY:
2249
0
        if (bytes_matched >= max_bytes_matched)
2250
0
          break;
2251
2252
0
        match = true;
2253
0
        current->input += input_incr;
2254
0
        break;
2255
2256
0
      case RE_OPCODE_LITERAL:
2257
0
        if (bytes_matched >= max_bytes_matched)
2258
0
          break;
2259
2260
0
        if (*current->input == *(ip + 1))
2261
0
        {
2262
0
          match = true;
2263
0
          current->input += input_incr;
2264
0
        }
2265
0
        break;
2266
2267
0
      case RE_OPCODE_NOT_LITERAL:
2268
0
        if (bytes_matched >= max_bytes_matched)
2269
0
          break;
2270
2271
0
        if (*current->input != *(ip + 1))
2272
0
        {
2273
0
          match = true;
2274
0
          current->input += input_incr;
2275
0
        }
2276
0
        break;
2277
2278
0
      case RE_OPCODE_MASKED_LITERAL:
2279
0
        if (bytes_matched >= max_bytes_matched)
2280
0
          break;
2281
2282
0
        opcode_args = yr_unaligned_u16(ip + 1);
2283
0
        mask = opcode_args >> 8;
2284
0
        value = opcode_args & 0xFF;
2285
2286
0
        if ((*current->input & mask) == value)
2287
0
        {
2288
0
          match = true;
2289
0
          current->input += input_incr;
2290
0
        }
2291
0
        break;
2292
2293
0
      case RE_OPCODE_MASKED_NOT_LITERAL:
2294
0
        if (bytes_matched >= max_bytes_matched)
2295
0
          break;
2296
2297
0
        opcode_args = yr_unaligned_u16(ip + 1);
2298
0
        mask = opcode_args >> 8;
2299
0
        value = opcode_args & 0xFF;
2300
2301
0
        if ((*current->input & mask) != value)
2302
0
        {
2303
0
          match = true;
2304
0
          current->input += input_incr;
2305
0
        }
2306
0
        break;
2307
2308
0
      case RE_OPCODE_REPEAT_ANY_UNGREEDY:
2309
0
        repeat_any_args = (RE_REPEAT_ANY_ARGS*) (ip + 1);
2310
2311
0
        if (bytes_matched + repeat_any_args->min >= max_bytes_matched)
2312
0
          break;
2313
2314
0
        match = true;
2315
2316
0
        const uint8_t* next_opcode = ip + 1 + sizeof(RE_REPEAT_ANY_ARGS);
2317
2318
        // insertion_point indicates the item in the list of inputs after which
2319
        // the newly created inputs will be inserted.
2320
0
        RE_FAST_EXEC_POSITION* insertion_point = current;
2321
2322
0
        for (int j = repeat_any_args->min + 1; j <= repeat_any_args->max; j++)
2323
0
        {
2324
          // bytes_matched is the number of bytes matched so far, and j is the
2325
          // minimum number of bytes that are still pending to match. If these
2326
          // two numbers sum more than the maximum number of bytes that can be
2327
          // matched the loop can be aborted. The position that we were about
2328
          // to create can't lead to a match without exceeding
2329
          // max_bytes_matched.
2330
0
          if (bytes_matched + j >= max_bytes_matched)
2331
0
            break;
2332
2333
0
          const uint8_t* next_input = current->input + j * input_incr;
2334
2335
          // Find the point where next_input should be inserted. The list is
2336
          // kept sorted by pointer in increasing order, the insertion point is
2337
          // an item that has a pointer lower or equal than next_input, but
2338
          // whose next item have a pointer that is larger.
2339
0
          while (insertion_point->next != NULL &&
2340
0
                 insertion_point->next->input <= next_input)
2341
0
          {
2342
0
            insertion_point = insertion_point->next;
2343
0
          }
2344
2345
          // If the input already exists for the next round, we don't need to
2346
          // insert it.
2347
0
          if (insertion_point->round == round + 1 &&
2348
0
              insertion_point->input == next_input)
2349
0
            continue;
2350
2351
          // The next opcode is RE_OPCODE_LITERAL, but the literal doesn't
2352
          // match with the byte at next_input, we don't need to add this
2353
          // input to the list as we already know that it won't match in the
2354
          // next round and will be deleted from the list anyways.
2355
0
          if (*(next_opcode) == RE_OPCODE_LITERAL &&
2356
0
              *(next_opcode + 1) != *next_input)
2357
0
            continue;
2358
2359
0
          RE_FAST_EXEC_POSITION* new_input;
2360
2361
0
          FAIL_ON_ERROR_WITH_CLEANUP(
2362
0
              _yr_re_fast_exec_position_create(
2363
0
                  &context->re_fast_exec_position_pool, &new_input),
2364
              // Cleanup
2365
0
              _yr_re_fast_exec_destroy_position_list(
2366
0
                  &context->re_fast_exec_position_pool, first, last));
2367
2368
0
          new_input->input = next_input;
2369
0
          new_input->round = round + 1;
2370
0
          new_input->prev = insertion_point;
2371
0
          new_input->next = insertion_point->next;
2372
0
          insertion_point->next = new_input;
2373
2374
0
          if (new_input->next != NULL)
2375
0
            new_input->next->prev = new_input;
2376
2377
0
          if (insertion_point == last)
2378
0
            last = new_input;
2379
0
        }
2380
2381
0
        current->input += input_incr * repeat_any_args->min;
2382
0
        break;
2383
2384
0
      case RE_OPCODE_MATCH:
2385
2386
0
        if (flags & RE_FLAGS_EXHAUSTIVE)
2387
0
        {
2388
0
          FAIL_ON_ERROR_WITH_CLEANUP(
2389
0
              callback(
2390
                  // When matching forwards the matching data always starts at
2391
                  // input_data, when matching backwards it starts at input + 1
2392
                  // or input_data - input_backwards_size if input + 1 is
2393
                  // outside the buffer.
2394
0
                  flags & RE_FLAGS_BACKWARDS
2395
0
                      ? yr_max(
2396
0
                            current->input + 1,
2397
0
                            input_data - input_backwards_size)
2398
0
                      : input_data,
2399
                  // The number of matched bytes should not be larger than
2400
                  // max_bytes_matched.
2401
0
                  yr_min(bytes_matched, max_bytes_matched),
2402
0
                  flags,
2403
0
                  callback_args),
2404
              // Cleanup
2405
0
              _yr_re_fast_exec_destroy_position_list(
2406
0
                  &context->re_fast_exec_position_pool, first, last));
2407
0
        }
2408
0
        else
2409
0
        {
2410
0
          if (matches != NULL)
2411
0
            *matches = bytes_matched;
2412
2413
0
          _yr_re_fast_exec_destroy_position_list(
2414
0
              &context->re_fast_exec_position_pool, first, last);
2415
2416
0
          return ERROR_SUCCESS;
2417
0
        }
2418
0
        break;
2419
2420
0
      default:
2421
0
        printf("non-supported opcode: %d\n", *ip);
2422
0
        assert(false);
2423
0
      }
2424
2425
0
      if (match)
2426
0
      {
2427
0
        current->round = round + 1;
2428
0
      }
2429
0
      else
2430
0
      {
2431
        // Remove current from the list. If the item being removed is the first
2432
        // one or the last one, the first and last pointers should be updated.
2433
        // The removed item is put into the pool for later reuse.
2434
2435
0
        if (current == first)
2436
0
          first = current->next;
2437
2438
0
        if (current == last)
2439
0
          last = current->prev;
2440
2441
0
        if (current->prev != NULL)
2442
0
          current->prev->next = current->next;
2443
2444
0
        if (current->next != NULL)
2445
0
          current->next->prev = current->prev;
2446
2447
0
        current->prev = NULL;
2448
0
        current->next = context->re_fast_exec_position_pool.head;
2449
2450
0
        if (current->next != NULL)
2451
0
          current->next->prev = current;
2452
2453
0
        context->re_fast_exec_position_pool.head = current;
2454
0
      }
2455
2456
0
      current = next;
2457
2458
0
    }  // while (current != NULL)
2459
2460
    // Move instruction pointer (ip) to next instruction. Each opcode has
2461
    // a different size.
2462
0
    switch (*ip)
2463
0
    {
2464
0
    case RE_OPCODE_ANY:
2465
0
      ip += 1;
2466
0
      break;
2467
0
    case RE_OPCODE_LITERAL:
2468
0
    case RE_OPCODE_NOT_LITERAL:
2469
0
      ip += 2;
2470
0
      break;
2471
0
    case RE_OPCODE_MASKED_LITERAL:
2472
0
    case RE_OPCODE_MASKED_NOT_LITERAL:
2473
0
      ip += 3;
2474
0
      break;
2475
0
    case RE_OPCODE_REPEAT_ANY_UNGREEDY:
2476
0
      ip += 1 + sizeof(RE_REPEAT_ANY_ARGS);
2477
0
      break;
2478
0
    case RE_OPCODE_MATCH:
2479
0
      break;
2480
0
    default:
2481
0
      assert(false);
2482
0
    }
2483
2484
0
    round++;
2485
0
  }
2486
2487
  // If this point is reached no matches were found.
2488
2489
0
  if (matches != NULL)
2490
0
    *matches = -1;
2491
2492
0
  return ERROR_SUCCESS;
2493
0
}
2494
2495
static void _yr_re_print_node(RE_NODE* re_node, uint32_t indent)
2496
0
{
2497
0
  RE_NODE* child;
2498
0
  int i;
2499
2500
0
  if (re_node == NULL)
2501
0
    return;
2502
2503
0
  if (indent > 0)
2504
0
    printf("\n%*s", indent, " ");
2505
0
  switch (re_node->type)
2506
0
  {
2507
0
  case RE_NODE_ALT:
2508
0
    printf("Alt(");
2509
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2510
0
    printf(",");
2511
0
    _yr_re_print_node(re_node->children_tail, indent + 4);
2512
0
    printf("\n%*s%s", indent, " ", ")");
2513
0
    break;
2514
2515
0
  case RE_NODE_CONCAT:
2516
0
    printf("Cat(");
2517
0
    child = re_node->children_head;
2518
0
    while (child != NULL)
2519
0
    {
2520
0
      _yr_re_print_node(child, indent + 4);
2521
0
      printf(",");
2522
0
      child = child->next_sibling;
2523
0
    }
2524
0
    printf("\n%*s%s", indent, " ", ")");
2525
0
    break;
2526
2527
0
  case RE_NODE_STAR:
2528
0
    printf("Star(");
2529
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2530
0
    printf(")");
2531
0
    break;
2532
2533
0
  case RE_NODE_PLUS:
2534
0
    printf("Plus(");
2535
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2536
0
    printf(")");
2537
0
    break;
2538
2539
0
  case RE_NODE_LITERAL:
2540
0
    printf("Lit(%c)", re_node->value);
2541
0
    break;
2542
2543
0
  case RE_NODE_NOT_LITERAL:
2544
0
    printf("NotLit(%c)", re_node->value);
2545
0
    break;
2546
2547
0
  case RE_NODE_MASKED_LITERAL:
2548
0
    printf("MaskedLit(%02X,%02X)", re_node->value, re_node->mask);
2549
0
    break;
2550
2551
0
  case RE_NODE_WORD_CHAR:
2552
0
    printf("WordChar");
2553
0
    break;
2554
2555
0
  case RE_NODE_NON_WORD_CHAR:
2556
0
    printf("NonWordChar");
2557
0
    break;
2558
2559
0
  case RE_NODE_SPACE:
2560
0
    printf("Space");
2561
0
    break;
2562
2563
0
  case RE_NODE_NON_SPACE:
2564
0
    printf("NonSpace");
2565
0
    break;
2566
2567
0
  case RE_NODE_DIGIT:
2568
0
    printf("Digit");
2569
0
    break;
2570
2571
0
  case RE_NODE_NON_DIGIT:
2572
0
    printf("NonDigit");
2573
0
    break;
2574
2575
0
  case RE_NODE_ANY:
2576
0
    printf("Any");
2577
0
    break;
2578
2579
0
  case RE_NODE_RANGE:
2580
0
    printf("Range(%d-%d, ", re_node->start, re_node->end);
2581
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2582
0
    printf("\n%*s%s", indent, " ", ")");
2583
0
    break;
2584
2585
0
  case RE_NODE_CLASS:
2586
0
    printf("Class(");
2587
0
    for (i = 0; i < 256; i++)
2588
0
      if (_yr_re_is_char_in_class(re_node->re_class, i, false))
2589
0
        printf("%02X,", i);
2590
0
    printf(")");
2591
0
    break;
2592
2593
0
  case RE_NODE_EMPTY:
2594
0
    printf("Empty");
2595
0
    break;
2596
2597
0
  case RE_NODE_ANCHOR_START:
2598
0
    printf("AnchorStart");
2599
0
    break;
2600
2601
0
  case RE_NODE_ANCHOR_END:
2602
0
    printf("AnchorEnd");
2603
0
    break;
2604
2605
0
  case RE_NODE_WORD_BOUNDARY:
2606
0
    printf("WordBoundary");
2607
0
    break;
2608
2609
0
  case RE_NODE_NON_WORD_BOUNDARY:
2610
0
    printf("NonWordBoundary");
2611
0
    break;
2612
2613
0
  case RE_NODE_RANGE_ANY:
2614
0
    printf("RangeAny");
2615
0
    break;
2616
2617
0
  default:
2618
0
    printf("???");
2619
0
    break;
2620
0
  }
2621
0
}
2622
2623
void yr_re_print(RE_AST* re_ast)
2624
0
{
2625
0
  _yr_re_print_node(re_ast->root_node, 0);
2626
0
}