Coverage Report

Created: 2025-10-10 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/yara/libyara/re.c
Line
Count
Source
1
/*
2
Copyright (c) 2013. The YARA Authors. All Rights Reserved.
3
4
Redistribution and use in source and binary forms, with or without modification,
5
are permitted provided that the following conditions are met:
6
7
1. Redistributions of source code must retain the above copyright notice, this
8
list of conditions and the following disclaimer.
9
10
2. Redistributions in binary form must reproduce the above copyright notice,
11
this list of conditions and the following disclaimer in the documentation and/or
12
other materials provided with the distribution.
13
14
3. Neither the name of the copyright holder nor the names of its contributors
15
may be used to endorse or promote products derived from this software without
16
specific prior written permission.
17
18
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
*/
29
30
/*
31
32
This module implements a regular expressions engine based on Thompson's
33
algorithm as described by Russ Cox in http://swtch.com/~rsc/regexp/regexp2.html.
34
35
What the article names a "thread" has been named a "fiber" in this code, in
36
order to avoid confusion with operating system threads.
37
38
*/
39
40
#include <assert.h>
41
#include <string.h>
42
#include <yara/compiler.h>
43
#include <yara/error.h>
44
#include <yara/globals.h>
45
#include <yara/hex_lexer.h>
46
#include <yara/limits.h>
47
#include <yara/mem.h>
48
#include <yara/re.h>
49
#include <yara/re_lexer.h>
50
#include <yara/strutils.h>
51
#include <yara/threading.h>
52
#include <yara/unaligned.h>
53
#include <yara/utils.h>
54
55
408M
#define EMIT_BACKWARDS               0x01
56
235M
#define EMIT_DONT_SET_FORWARDS_CODE  0x02
57
5.51M
#define EMIT_DONT_SET_BACKWARDS_CODE 0x04
58
59
#ifndef INT16_MAX
60
#define INT16_MAX (32767)
61
#endif
62
63
typedef uint8_t RE_SPLIT_ID_TYPE;
64
65
// RE_REPEAT_ARGS and RE_REPEAT_ANY_ARGS are structures that are embedded in
66
// the regexp's instruction stream. As such, they are not always aligned to
67
// 8-byte boundaries, so they need to be "packed" structures in order to prevent
68
// issues due to unaligned memory accesses.
69
#pragma pack(push)
70
#pragma pack(1)
71
72
typedef struct _RE_REPEAT_ARGS
73
{
74
  uint16_t min;
75
  uint16_t max;
76
  int32_t offset;
77
78
} RE_REPEAT_ARGS;
79
80
typedef struct _RE_REPEAT_ANY_ARGS
81
{
82
  uint16_t min;
83
  uint16_t max;
84
85
} RE_REPEAT_ANY_ARGS;
86
87
#pragma pack(pop)
88
89
typedef struct _RE_EMIT_CONTEXT
90
{
91
  YR_ARENA* arena;
92
  RE_SPLIT_ID_TYPE next_split_id;
93
94
} RE_EMIT_CONTEXT;
95
96
0
#define CHAR_IN_CLASS(cls, chr) ((cls)[(chr) / 8] & 1 << ((chr) % 8))
97
98
static bool _yr_re_is_char_in_class(
99
    RE_CLASS* re_class,
100
    uint8_t chr,
101
    int case_insensitive)
102
0
{
103
0
  int result = CHAR_IN_CLASS(re_class->bitmap, chr);
104
105
0
  if (case_insensitive)
106
0
    result |= CHAR_IN_CLASS(re_class->bitmap, yr_altercase[chr]);
107
108
0
  if (re_class->negated)
109
0
    result = !result;
110
111
0
  return result;
112
0
}
113
114
static bool _yr_re_is_word_char(const uint8_t* input, uint8_t character_size)
115
0
{
116
0
  int result = ((yr_isalnum(input) || (*input) == '_'));
117
118
0
  if (character_size == 2)
119
0
    result = result && (*(input + 1) == 0);
120
121
0
  return result;
122
0
}
123
124
RE_NODE* yr_re_node_create(int type)
125
2.49M
{
126
2.49M
  RE_NODE* result = (RE_NODE*) yr_malloc(sizeof(RE_NODE));
127
128
2.49M
  if (result != NULL)
129
2.49M
  {
130
2.49M
    result->type = type;
131
2.49M
    result->children_head = NULL;
132
2.49M
    result->children_tail = NULL;
133
2.49M
    result->prev_sibling = NULL;
134
2.49M
    result->next_sibling = NULL;
135
2.49M
    result->greedy = true;
136
2.49M
    result->forward_code_ref = YR_ARENA_NULL_REF;
137
2.49M
    result->backward_code_ref = YR_ARENA_NULL_REF;
138
2.49M
  }
139
140
2.49M
  return result;
141
2.49M
}
142
143
void yr_re_node_destroy(RE_NODE* node)
144
2.49M
{
145
2.49M
  RE_NODE* child = node->children_head;
146
2.49M
  RE_NODE* next_child;
147
148
4.94M
  while (child != NULL)
149
2.44M
  {
150
2.44M
    next_child = child->next_sibling;
151
2.44M
    yr_re_node_destroy(child);
152
2.44M
    child = next_child;
153
2.44M
  }
154
155
2.49M
  if (node->type == RE_NODE_CLASS)
156
2.25k
    yr_free(node->re_class);
157
158
2.49M
  yr_free(node);
159
2.49M
}
160
161
////////////////////////////////////////////////////////////////////////////////
162
// Appends a node to the end of the children list.
163
//
164
void yr_re_node_append_child(RE_NODE* node, RE_NODE* child)
165
2.45M
{
166
2.45M
  if (node->children_head == NULL)
167
106k
    node->children_head = child;
168
169
2.45M
  if (node->children_tail != NULL)
170
2.34M
    node->children_tail->next_sibling = child;
171
172
2.45M
  child->prev_sibling = node->children_tail;
173
2.45M
  node->children_tail = child;
174
2.45M
}
175
176
////////////////////////////////////////////////////////////////////////////////
177
// Appends a node to the beginning of the children list.
178
//
179
void yr_re_node_prepend_child(RE_NODE* node, RE_NODE* child)
180
1.28k
{
181
1.28k
  child->next_sibling = node->children_head;
182
183
1.28k
  if (node->children_head != NULL)
184
1.28k
    node->children_head->prev_sibling = child;
185
186
1.28k
  node->children_head = child;
187
188
1.28k
  if (node->children_tail == NULL)
189
0
    node->children_tail = child;
190
1.28k
}
191
192
int yr_re_ast_create(RE_AST** re_ast)
193
34.2k
{
194
34.2k
  *re_ast = (RE_AST*) yr_malloc(sizeof(RE_AST));
195
196
34.2k
  if (*re_ast == NULL)
197
0
    return ERROR_INSUFFICIENT_MEMORY;
198
199
34.2k
  (*re_ast)->flags = 0;
200
34.2k
  (*re_ast)->root_node = NULL;
201
202
34.2k
  return ERROR_SUCCESS;
203
34.2k
}
204
205
void yr_re_ast_destroy(RE_AST* re_ast)
206
34.2k
{
207
34.2k
  if (re_ast->root_node != NULL)
208
33.4k
    yr_re_node_destroy(re_ast->root_node);
209
210
34.2k
  yr_free(re_ast);
211
34.2k
}
212
213
////////////////////////////////////////////////////////////////////////////////
214
// Parses a regexp but don't emit its code. A further call to
215
// yr_re_ast_emit_code is required to get the code.
216
//
217
int yr_re_parse(
218
    const char* re_string,
219
    RE_AST** re_ast,
220
    RE_ERROR* error,
221
    int flags)
222
27.6k
{
223
27.6k
  return yr_parse_re_string(re_string, re_ast, error, flags);
224
27.6k
}
225
226
////////////////////////////////////////////////////////////////////////////////
227
// Parses a hex string but don't emit its code. A further call to
228
// yr_re_ast_emit_code is required to get the code.
229
//
230
int yr_re_parse_hex(const char* hex_string, RE_AST** re_ast, RE_ERROR* error)
231
1.13k
{
232
1.13k
  return yr_parse_hex_string(hex_string, re_ast, error);
233
1.13k
}
234
235
////////////////////////////////////////////////////////////////////////////////
236
// Parses the regexp and emit its code to the provided to the
237
// YR_RE_CODE_SECTION in the specified arena.
238
//
239
int yr_re_compile(
240
    const char* re_string,
241
    int flags,
242
    int parser_flags,
243
    YR_ARENA* arena,
244
    YR_ARENA_REF* ref,
245
    RE_ERROR* error)
246
1.39k
{
247
1.39k
  RE_AST* re_ast;
248
1.39k
  RE _re;
249
1.39k
  int result;
250
251
1.39k
  result = yr_re_parse(re_string, &re_ast, error, parser_flags);
252
1.39k
  if (result != ERROR_UNKNOWN_ESCAPE_SEQUENCE)
253
1.39k
    FAIL_ON_ERROR(result);
254
255
1.18k
  _re.flags = flags;
256
257
1.18k
  FAIL_ON_ERROR_WITH_CLEANUP(
258
1.18k
      yr_arena_write_data(arena, YR_RE_CODE_SECTION, &_re, sizeof(_re), ref),
259
1.18k
      yr_re_ast_destroy(re_ast));
260
261
1.18k
  FAIL_ON_ERROR_WITH_CLEANUP(
262
1.18k
      yr_re_ast_emit_code(re_ast, arena, false), yr_re_ast_destroy(re_ast));
263
264
1.14k
  yr_re_ast_destroy(re_ast);
265
266
1.14k
  return result;
267
1.18k
}
268
269
////////////////////////////////////////////////////////////////////////////////
270
// Verifies if the target string matches the pattern
271
//
272
// Args:
273
//    context: Scan context
274
//    re: A pointer to a compiled regexp
275
//    target: Target string
276
//
277
// Returns:
278
//    See return codes for yr_re_exec
279
//
280
int yr_re_match(YR_SCAN_CONTEXT* context, RE* re, const char* target)
281
0
{
282
0
  int result;
283
284
0
  yr_re_exec(
285
0
      context,
286
0
      re->code,
287
0
      (uint8_t*) target,
288
0
      strlen(target),
289
0
      0,
290
0
      re->flags | RE_FLAGS_SCAN,
291
0
      NULL,
292
0
      NULL,
293
0
      &result);
294
295
0
  return result;
296
0
}
297
298
////////////////////////////////////////////////////////////////////////////////
299
// Verifies if the provided regular expression is just a literal string
300
// like "abc", "12345", without any wildcard, operator, etc. In that case
301
// returns the string as a SIZED_STRING, or returns NULL if otherwise.
302
//
303
// The caller is responsible for deallocating the returned SIZED_STRING by
304
// calling yr_free.
305
//
306
SIZED_STRING* yr_re_ast_extract_literal(RE_AST* re_ast)
307
31.9k
{
308
31.9k
  SIZED_STRING* string;
309
31.9k
  RE_NODE* child;
310
311
31.9k
  int length = 0;
312
313
31.9k
  if (re_ast->root_node->type == RE_NODE_LITERAL)
314
196
  {
315
196
    length = 1;
316
196
  }
317
31.7k
  else if (re_ast->root_node->type == RE_NODE_CONCAT)
318
30.9k
  {
319
30.9k
    child = re_ast->root_node->children_tail;
320
321
160k
    while (child != NULL && child->type == RE_NODE_LITERAL)
322
130k
    {
323
130k
      length++;
324
130k
      child = child->prev_sibling;
325
130k
    }
326
327
30.9k
    if (child != NULL)
328
6.59k
      return NULL;
329
30.9k
  }
330
792
  else
331
792
  {
332
792
    return NULL;
333
792
  }
334
335
24.5k
  string = (SIZED_STRING*) yr_malloc(sizeof(SIZED_STRING) + length);
336
337
24.5k
  if (string == NULL)
338
0
    return NULL;
339
340
24.5k
  string->length = length;
341
24.5k
  string->flags = 0;
342
343
24.5k
  if (re_ast->root_node->type == RE_NODE_LITERAL)
344
196
  {
345
196
    string->c_string[0] = re_ast->root_node->value;
346
196
  }
347
24.3k
  else
348
24.3k
  {
349
24.3k
    child = re_ast->root_node->children_tail;
350
351
144k
    while (child != NULL)
352
120k
    {
353
120k
      string->c_string[--length] = child->value;
354
120k
      child = child->prev_sibling;
355
120k
    }
356
24.3k
  }
357
358
24.5k
  string->c_string[string->length] = '\0';
359
360
24.5k
  return string;
361
24.5k
}
362
363
int _yr_re_node_has_unbounded_quantifier_for_dot(RE_NODE* re_node)
364
188k
{
365
188k
  RE_NODE* child;
366
367
188k
  if ((re_node->type == RE_NODE_STAR || re_node->type == RE_NODE_PLUS) &&
368
945
      re_node->children_head->type == RE_NODE_ANY)
369
359
    return true;
370
371
188k
  if (re_node->type == RE_NODE_RANGE_ANY && re_node->end == RE_MAX_RANGE)
372
959
    return true;
373
374
187k
  if (re_node->type == RE_NODE_CONCAT)
375
29.8k
  {
376
29.8k
    child = re_node->children_tail;
377
378
188k
    while (child != NULL)
379
162k
    {
380
162k
      if (_yr_re_node_has_unbounded_quantifier_for_dot(child))
381
3.78k
        return true;
382
383
158k
      child = child->prev_sibling;
384
158k
    }
385
29.8k
  }
386
387
183k
  return false;
388
187k
}
389
390
////////////////////////////////////////////////////////////////////////////////
391
// Detects the use of .*, .+ or .{x,} in a regexp. The use of wildcards with
392
// quantifiers that don't have a reasonably small upper bound causes a
393
// performance penalty. This function detects such cases in order to warn the
394
// user about this.
395
//
396
int yr_re_ast_has_unbounded_quantifier_for_dot(RE_AST* re_ast)
397
26.5k
{
398
26.5k
  return _yr_re_node_has_unbounded_quantifier_for_dot(re_ast->root_node);
399
26.5k
}
400
401
////////////////////////////////////////////////////////////////////////////////
402
// In some cases splitting a regular expression (or hex string) in two parts is
403
// convenient for increasing performance. This happens when the pattern contains
404
// a large gap (a.k.a jump), for example: { 01 02 03 [0-999] 04 05 06 }
405
// In this case the string is splitted in { 01 02 03 } and { 04 05 06 } where
406
// the latter is chained to the former. This means that { 01 02 03 } and
407
// { 04 05 06 } are handled as individual strings, and when both of them are
408
// found, YARA verifies if the distance between the matches complies with the
409
// [0-999] restriction.
410
//
411
// This function traverses a regexp's AST looking for nodes where it should be
412
// splitted. It must be noticed that this only applies to two-level ASTs (i.e.
413
// an AST consisting in a RE_NODE_CONCAT at the root where all the children are
414
// leaves).
415
//
416
// For example, { 01 02 03 [0-1000] 04 05 06 [500-2000] 07 08 09 } has the
417
// following AST:
418
//
419
// RE_NODE_CONCAT
420
// |
421
// |- RE_NODE_LITERAL (01)
422
// |- RE_NODE_LITERAL (02)
423
// |- RE_NODE_LITERAL (03)
424
// |- RE_NODE_RANGE_ANY (start=0, end=1000)
425
// |- RE_NODE_LITERAL (04)
426
// |- RE_NODE_LITERAL (05)
427
// |- RE_NODE_LITERAL (06)
428
// |- RE_NODE_RANGE_ANY (start=500, end=2000)
429
// |- RE_NODE_LITERAL (07)
430
// |- RE_NODE_LITERAL (08)
431
// |- RE_NODE_LITERAL (09)
432
//
433
// If the AST above is passed in the re_ast argument, it will be trimmed to:
434
//
435
// RE_NODE_CONCAT
436
// |
437
// |- RE_NODE_LITERAL (01)
438
// |- RE_NODE_LITERAL (02)
439
// |- RE_NODE_LITERAL (03)
440
//
441
// While remainder_re_ast will be:
442
//
443
// RE_NODE_CONCAT
444
// |
445
// |- RE_NODE_LITERAL (04)
446
// |- RE_NODE_LITERAL (05)
447
// |- RE_NODE_LITERAL (06)
448
// |- RE_NODE_RANGE_ANY (start=500, end=2000)
449
// |- RE_NODE_LITERAL (07)
450
// |- RE_NODE_LITERAL (08)
451
// |- RE_NODE_LITERAL (09)
452
//
453
// The caller is responsible for freeing the new AST in remainder_re_ast by
454
// calling yr_re_ast_destroy.
455
//
456
// The integers pointed to by min_gap and max_gap will be filled with the
457
// minimum and maximum gap size between the sub-strings represented by the
458
// two ASTs.
459
//
460
int yr_re_ast_split_at_chaining_point(
461
    RE_AST* re_ast,
462
    RE_AST** remainder_re_ast,
463
    int32_t* min_gap,
464
    int32_t* max_gap)
465
31.9k
{
466
31.9k
  RE_NODE* child;
467
31.9k
  RE_NODE* concat;
468
469
31.9k
  int result;
470
471
31.9k
  *remainder_re_ast = NULL;
472
31.9k
  *min_gap = 0;
473
31.9k
  *max_gap = 0;
474
475
31.9k
  if (re_ast->root_node->type != RE_NODE_CONCAT)
476
988
    return ERROR_SUCCESS;
477
478
30.9k
  child = re_ast->root_node->children_head;
479
480
195k
  while (child != NULL)
481
169k
  {
482
169k
    if (!child->greedy && child->type == RE_NODE_RANGE_ANY &&
483
8.25k
        child->prev_sibling != NULL && child->next_sibling != NULL &&
484
6.18k
        (child->start > YR_STRING_CHAINING_THRESHOLD ||
485
4.51k
         child->end > YR_STRING_CHAINING_THRESHOLD))
486
5.42k
    {
487
5.42k
      result = yr_re_ast_create(remainder_re_ast);
488
489
5.42k
      if (result != ERROR_SUCCESS)
490
0
        return result;
491
492
5.42k
      concat = yr_re_node_create(RE_NODE_CONCAT);
493
494
5.42k
      if (concat == NULL)
495
0
        return ERROR_INSUFFICIENT_MEMORY;
496
497
5.42k
      concat->children_head = child->next_sibling;
498
5.42k
      concat->children_tail = re_ast->root_node->children_tail;
499
500
5.42k
      re_ast->root_node->children_tail = child->prev_sibling;
501
502
5.42k
      child->prev_sibling->next_sibling = NULL;
503
5.42k
      child->next_sibling->prev_sibling = NULL;
504
505
5.42k
      *min_gap = child->start;
506
5.42k
      *max_gap = child->end;
507
508
5.42k
      (*remainder_re_ast)->root_node = concat;
509
5.42k
      (*remainder_re_ast)->flags = re_ast->flags;
510
511
5.42k
      yr_re_node_destroy(child);
512
513
5.42k
      return ERROR_SUCCESS;
514
5.42k
    }
515
516
164k
    child = child->next_sibling;
517
164k
  }
518
519
25.5k
  return ERROR_SUCCESS;
520
30.9k
}
521
522
int _yr_emit_inst(
523
    RE_EMIT_CONTEXT* emit_context,
524
    uint8_t opcode,
525
    YR_ARENA_REF* instruction_ref)
526
67.3M
{
527
67.3M
  FAIL_ON_ERROR(yr_arena_write_data(
528
67.3M
      emit_context->arena,
529
67.3M
      YR_RE_CODE_SECTION,
530
67.3M
      &opcode,
531
67.3M
      sizeof(uint8_t),
532
67.3M
      instruction_ref));
533
534
67.3M
  return ERROR_SUCCESS;
535
67.3M
}
536
537
int _yr_emit_inst_arg_uint8(
538
    RE_EMIT_CONTEXT* emit_context,
539
    uint8_t opcode,
540
    uint8_t argument,
541
    YR_ARENA_REF* instruction_ref,
542
    YR_ARENA_REF* argument_ref)
543
122M
{
544
122M
  FAIL_ON_ERROR(yr_arena_write_data(
545
122M
      emit_context->arena,
546
122M
      YR_RE_CODE_SECTION,
547
122M
      &opcode,
548
122M
      sizeof(uint8_t),
549
122M
      instruction_ref));
550
551
122M
  FAIL_ON_ERROR(yr_arena_write_data(
552
122M
      emit_context->arena,
553
122M
      YR_RE_CODE_SECTION,
554
122M
      &argument,
555
122M
      sizeof(uint8_t),
556
122M
      argument_ref));
557
558
122M
  return ERROR_SUCCESS;
559
122M
}
560
561
int _yr_emit_inst_arg_uint16(
562
    RE_EMIT_CONTEXT* emit_context,
563
    uint8_t opcode,
564
    uint16_t argument,
565
    YR_ARENA_REF* instruction_ref,
566
    YR_ARENA_REF* argument_ref)
567
5.70k
{
568
5.70k
  FAIL_ON_ERROR(yr_arena_write_data(
569
5.70k
      emit_context->arena,
570
5.70k
      YR_RE_CODE_SECTION,
571
5.70k
      &opcode,
572
5.70k
      sizeof(uint8_t),
573
5.70k
      instruction_ref));
574
575
5.70k
  FAIL_ON_ERROR(yr_arena_write_data(
576
5.70k
      emit_context->arena,
577
5.70k
      YR_RE_CODE_SECTION,
578
5.70k
      &argument,
579
5.70k
      sizeof(uint16_t),
580
5.70k
      argument_ref));
581
582
5.70k
  return ERROR_SUCCESS;
583
5.70k
}
584
585
int _yr_emit_inst_arg_uint32(
586
    RE_EMIT_CONTEXT* emit_context,
587
    uint8_t opcode,
588
    uint32_t argument,
589
    YR_ARENA_REF* instruction_ref,
590
    YR_ARENA_REF* argument_ref)
591
0
{
592
0
  FAIL_ON_ERROR(yr_arena_write_data(
593
0
      emit_context->arena,
594
0
      YR_RE_CODE_SECTION,
595
0
      &opcode,
596
0
      sizeof(uint8_t),
597
0
      instruction_ref));
598
599
0
  FAIL_ON_ERROR(yr_arena_write_data(
600
0
      emit_context->arena,
601
0
      YR_RE_CODE_SECTION,
602
0
      &argument,
603
0
      sizeof(uint32_t),
604
0
      argument_ref));
605
606
0
  return ERROR_SUCCESS;
607
0
}
608
609
int _yr_emit_inst_arg_int16(
610
    RE_EMIT_CONTEXT* emit_context,
611
    uint8_t opcode,
612
    int16_t argument,
613
    YR_ARENA_REF* instruction_ref,
614
    YR_ARENA_REF* argument_ref)
615
24.5k
{
616
24.5k
  FAIL_ON_ERROR(yr_arena_write_data(
617
24.5k
      emit_context->arena,
618
24.5k
      YR_RE_CODE_SECTION,
619
24.5k
      &opcode,
620
24.5k
      sizeof(uint8_t),
621
24.5k
      instruction_ref));
622
623
24.5k
  FAIL_ON_ERROR(yr_arena_write_data(
624
24.5k
      emit_context->arena,
625
24.5k
      YR_RE_CODE_SECTION,
626
24.5k
      &argument,
627
24.5k
      sizeof(int16_t),
628
24.5k
      argument_ref));
629
630
24.5k
  return ERROR_SUCCESS;
631
24.5k
}
632
633
int _yr_emit_inst_arg_struct(
634
    RE_EMIT_CONTEXT* emit_context,
635
    uint8_t opcode,
636
    void* structure,
637
    size_t structure_size,
638
    YR_ARENA_REF* instruction_ref,
639
    YR_ARENA_REF* argument_ref)
640
49.3M
{
641
49.3M
  FAIL_ON_ERROR(yr_arena_write_data(
642
49.3M
      emit_context->arena,
643
49.3M
      YR_RE_CODE_SECTION,
644
49.3M
      &opcode,
645
49.3M
      sizeof(uint8_t),
646
49.3M
      instruction_ref));
647
648
49.3M
  FAIL_ON_ERROR(yr_arena_write_data(
649
49.3M
      emit_context->arena,
650
49.3M
      YR_RE_CODE_SECTION,
651
49.3M
      structure,
652
49.3M
      structure_size,
653
49.3M
      argument_ref));
654
655
49.3M
  return ERROR_SUCCESS;
656
49.3M
}
657
658
int _yr_emit_split(
659
    RE_EMIT_CONTEXT* emit_context,
660
    uint8_t opcode,
661
    int16_t argument,
662
    YR_ARENA_REF* instruction_ref,
663
    YR_ARENA_REF* argument_ref)
664
30.0k
{
665
30.0k
  assert(opcode == RE_OPCODE_SPLIT_A || opcode == RE_OPCODE_SPLIT_B);
666
667
30.0k
  if (emit_context->next_split_id == RE_MAX_SPLIT_ID)
668
74
    return ERROR_REGULAR_EXPRESSION_TOO_COMPLEX;
669
670
29.9k
  FAIL_ON_ERROR(yr_arena_write_data(
671
29.9k
      emit_context->arena,
672
29.9k
      YR_RE_CODE_SECTION,
673
29.9k
      &opcode,
674
29.9k
      sizeof(uint8_t),
675
29.9k
      instruction_ref));
676
677
29.9k
  FAIL_ON_ERROR(yr_arena_write_data(
678
29.9k
      emit_context->arena,
679
29.9k
      YR_RE_CODE_SECTION,
680
29.9k
      &emit_context->next_split_id,
681
29.9k
      sizeof(RE_SPLIT_ID_TYPE),
682
29.9k
      NULL));
683
684
29.9k
  emit_context->next_split_id++;
685
686
29.9k
  FAIL_ON_ERROR(yr_arena_write_data(
687
29.9k
      emit_context->arena,
688
29.9k
      YR_RE_CODE_SECTION,
689
29.9k
      &argument,
690
29.9k
      sizeof(int16_t),
691
29.9k
      argument_ref));
692
693
29.9k
  return ERROR_SUCCESS;
694
29.9k
}
695
696
static int _yr_re_emit(
697
    RE_EMIT_CONTEXT* emit_context,
698
    RE_NODE* re_node,
699
    int flags,
700
    YR_ARENA_REF* code_ref)
701
241M
{
702
241M
  int16_t jmp_offset;
703
704
241M
  yr_arena_off_t bookmark_1 = 0;
705
241M
  yr_arena_off_t bookmark_2 = 0;
706
241M
  yr_arena_off_t bookmark_3 = 0;
707
241M
  yr_arena_off_t bookmark_4 = 0;
708
709
241M
  bool emit_split;
710
241M
  bool emit_repeat;
711
241M
  bool emit_prolog;
712
241M
  bool emit_epilog;
713
714
241M
  RE_REPEAT_ARGS repeat_args;
715
241M
  RE_REPEAT_ARGS* repeat_start_args_addr;
716
241M
  RE_REPEAT_ANY_ARGS repeat_any_args;
717
718
241M
  RE_NODE* child;
719
720
241M
  int16_t* split_offset_addr = NULL;
721
241M
  int16_t* jmp_offset_addr = NULL;
722
723
241M
  YR_ARENA_REF instruction_ref = YR_ARENA_NULL_REF;
724
241M
  YR_ARENA_REF split_offset_ref;
725
241M
  YR_ARENA_REF jmp_instruction_ref;
726
241M
  YR_ARENA_REF jmp_offset_ref;
727
241M
  YR_ARENA_REF repeat_start_args_ref;
728
729
241M
  switch (re_node->type)
730
241M
  {
731
122M
  case RE_NODE_LITERAL:
732
122M
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint8(
733
122M
        emit_context,
734
122M
        RE_OPCODE_LITERAL,
735
122M
        re_node->value,
736
122M
        &instruction_ref,
737
122M
        NULL));
738
122M
    break;
739
740
515
  case RE_NODE_NOT_LITERAL:
741
515
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint8(
742
515
        emit_context,
743
515
        RE_OPCODE_NOT_LITERAL,
744
515
        re_node->value,
745
515
        &instruction_ref,
746
515
        NULL));
747
515
    break;
748
749
4.52k
  case RE_NODE_MASKED_LITERAL:
750
4.52k
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
751
4.52k
        emit_context,
752
4.52k
        RE_OPCODE_MASKED_LITERAL,
753
4.52k
        re_node->mask << 8 | re_node->value,
754
4.52k
        &instruction_ref,
755
4.52k
        NULL));
756
4.52k
    break;
757
758
1.18k
  case RE_NODE_MASKED_NOT_LITERAL:
759
1.18k
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
760
1.18k
        emit_context,
761
1.18k
        RE_OPCODE_MASKED_NOT_LITERAL,
762
1.18k
        re_node->mask << 8 | re_node->value,
763
1.18k
        &instruction_ref,
764
1.18k
        NULL));
765
1.18k
    break;
766
767
417
  case RE_NODE_WORD_CHAR:
768
417
    FAIL_ON_ERROR(
769
417
        _yr_emit_inst(emit_context, RE_OPCODE_WORD_CHAR, &instruction_ref));
770
417
    break;
771
772
211
  case RE_NODE_NON_WORD_CHAR:
773
211
    FAIL_ON_ERROR(
774
211
        _yr_emit_inst(emit_context, RE_OPCODE_NON_WORD_CHAR, &instruction_ref));
775
211
    break;
776
777
812
  case RE_NODE_WORD_BOUNDARY:
778
812
    FAIL_ON_ERROR(
779
812
        _yr_emit_inst(emit_context, RE_OPCODE_WORD_BOUNDARY, &instruction_ref));
780
812
    break;
781
782
258
  case RE_NODE_NON_WORD_BOUNDARY:
783
258
    FAIL_ON_ERROR(_yr_emit_inst(
784
258
        emit_context, RE_OPCODE_NON_WORD_BOUNDARY, &instruction_ref));
785
258
    break;
786
787
1.50k
  case RE_NODE_SPACE:
788
1.50k
    FAIL_ON_ERROR(
789
1.50k
        _yr_emit_inst(emit_context, RE_OPCODE_SPACE, &instruction_ref));
790
1.50k
    break;
791
792
276
  case RE_NODE_NON_SPACE:
793
276
    FAIL_ON_ERROR(
794
276
        _yr_emit_inst(emit_context, RE_OPCODE_NON_SPACE, &instruction_ref));
795
276
    break;
796
797
396
  case RE_NODE_DIGIT:
798
396
    FAIL_ON_ERROR(
799
396
        _yr_emit_inst(emit_context, RE_OPCODE_DIGIT, &instruction_ref));
800
396
    break;
801
802
161
  case RE_NODE_NON_DIGIT:
803
161
    FAIL_ON_ERROR(
804
161
        _yr_emit_inst(emit_context, RE_OPCODE_NON_DIGIT, &instruction_ref));
805
161
    break;
806
807
1.78M
  case RE_NODE_ANY:
808
1.78M
    FAIL_ON_ERROR(_yr_emit_inst(emit_context, RE_OPCODE_ANY, &instruction_ref));
809
1.78M
    break;
810
811
65.1M
  case RE_NODE_CLASS:
812
65.1M
    FAIL_ON_ERROR(
813
65.1M
        _yr_emit_inst(emit_context, RE_OPCODE_CLASS, &instruction_ref));
814
815
65.1M
    FAIL_ON_ERROR(yr_arena_write_data(
816
65.1M
        emit_context->arena,
817
65.1M
        YR_RE_CODE_SECTION,
818
65.1M
        re_node->re_class,
819
65.1M
        sizeof(*re_node->re_class),
820
65.1M
        NULL));
821
65.1M
    break;
822
823
308k
  case RE_NODE_ANCHOR_START:
824
308k
    FAIL_ON_ERROR(_yr_emit_inst(
825
308k
        emit_context, RE_OPCODE_MATCH_AT_START, &instruction_ref));
826
308k
    break;
827
828
138k
  case RE_NODE_ANCHOR_END:
829
138k
    FAIL_ON_ERROR(
830
138k
        _yr_emit_inst(emit_context, RE_OPCODE_MATCH_AT_END, &instruction_ref));
831
138k
    break;
832
833
26.5M
  case RE_NODE_CONCAT:
834
26.5M
    FAIL_ON_ERROR(_yr_re_emit(
835
26.5M
        emit_context,
836
26.5M
        (flags & EMIT_BACKWARDS) ? re_node->children_tail
837
26.5M
                                 : re_node->children_head,
838
26.5M
        flags,
839
26.5M
        &instruction_ref));
840
841
26.5M
    if (flags & EMIT_BACKWARDS)
842
244k
      child = re_node->children_tail->prev_sibling;
843
26.2M
    else
844
26.2M
      child = re_node->children_head->next_sibling;
845
846
167M
    while (child != NULL)
847
140M
    {
848
140M
      FAIL_ON_ERROR(_yr_re_emit(emit_context, child, flags, NULL));
849
850
140M
      child = (flags & EMIT_BACKWARDS) ? child->prev_sibling
851
140M
                                       : child->next_sibling;
852
140M
    }
853
26.5M
    break;
854
855
26.5M
  case RE_NODE_PLUS:
856
    // Code for e+ looks like:
857
    //
858
    //          L1: code for e
859
    //              split L1, L2
860
    //          L2:
861
    //
862
2.96k
    FAIL_ON_ERROR(_yr_re_emit(
863
2.96k
        emit_context, re_node->children_head, flags, &instruction_ref));
864
865
2.85k
    bookmark_1 = yr_arena_get_current_offset(
866
2.85k
        emit_context->arena, YR_RE_CODE_SECTION);
867
868
2.85k
    if (instruction_ref.offset - bookmark_1 < INT16_MIN)
869
10
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
870
871
2.84k
    jmp_offset = (int16_t) (instruction_ref.offset - bookmark_1);
872
873
2.84k
    FAIL_ON_ERROR(_yr_emit_split(
874
2.84k
        emit_context,
875
2.84k
        re_node->greedy ? RE_OPCODE_SPLIT_B : RE_OPCODE_SPLIT_A,
876
2.84k
        jmp_offset,
877
2.84k
        NULL,
878
2.84k
        NULL));
879
880
2.82k
    break;
881
882
2.38k
  case RE_NODE_STAR:
883
    // Code for e* looks like:
884
    //
885
    //          L1: split L1, L2
886
    //              code for e
887
    //              jmp L1
888
    //          L2:
889
2.38k
    FAIL_ON_ERROR(_yr_emit_split(
890
2.38k
        emit_context,
891
2.38k
        re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
892
2.38k
        0,
893
2.38k
        &instruction_ref,
894
2.38k
        &split_offset_ref));
895
896
2.37k
    FAIL_ON_ERROR(
897
2.37k
        _yr_re_emit(emit_context, re_node->children_head, flags, NULL));
898
899
2.21k
    bookmark_1 = yr_arena_get_current_offset(
900
2.21k
        emit_context->arena, YR_RE_CODE_SECTION);
901
902
2.21k
    if (instruction_ref.offset - bookmark_1 < INT16_MIN)
903
16
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
904
905
2.19k
    jmp_offset = (int16_t) (instruction_ref.offset - bookmark_1);
906
907
    // Emit jump with offset set to 0.
908
909
2.19k
    FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
910
2.19k
        emit_context, RE_OPCODE_JUMP, jmp_offset, NULL, NULL));
911
912
2.19k
    bookmark_1 = yr_arena_get_current_offset(
913
2.19k
        emit_context->arena, YR_RE_CODE_SECTION);
914
915
2.19k
    if (bookmark_1 - instruction_ref.offset > INT16_MAX)
916
0
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
917
918
2.19k
    jmp_offset = (int16_t) (bookmark_1 - instruction_ref.offset);
919
920
    // Update split offset.
921
2.19k
    split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
922
2.19k
        emit_context->arena, &split_offset_ref);
923
924
2.19k
    memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset));
925
2.19k
    break;
926
927
23.7k
  case RE_NODE_ALT:
928
    // Code for e1|e2 looks like:
929
    //
930
    //              split L1, L2
931
    //          L1: code for e1
932
    //              jmp L3
933
    //          L2: code for e2
934
    //          L3:
935
936
    // Emit a split instruction with offset set to 0 temporarily. Offset
937
    // will be updated after we know the size of the code generated for
938
    // the left node (e1).
939
940
23.7k
    FAIL_ON_ERROR(_yr_emit_split(
941
23.7k
        emit_context,
942
23.7k
        RE_OPCODE_SPLIT_A,
943
23.7k
        0,
944
23.7k
        &instruction_ref,
945
23.7k
        &split_offset_ref));
946
947
23.7k
    FAIL_ON_ERROR(
948
23.7k
        _yr_re_emit(emit_context, re_node->children_head, flags, NULL));
949
950
    // Emit jump with offset set to 0.
951
952
22.3k
    FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
953
22.3k
        emit_context,
954
22.3k
        RE_OPCODE_JUMP,
955
22.3k
        0,
956
22.3k
        &jmp_instruction_ref,
957
22.3k
        &jmp_offset_ref));
958
959
22.3k
    bookmark_1 = yr_arena_get_current_offset(
960
22.3k
        emit_context->arena, YR_RE_CODE_SECTION);
961
962
22.3k
    if (bookmark_1 - instruction_ref.offset > INT16_MAX)
963
30
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
964
965
22.3k
    jmp_offset = (int16_t) (bookmark_1 - instruction_ref.offset);
966
967
    // Update split offset.
968
22.3k
    split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
969
22.3k
        emit_context->arena, &split_offset_ref);
970
971
22.3k
    memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset));
972
973
22.3k
    FAIL_ON_ERROR(
974
22.3k
        _yr_re_emit(emit_context, re_node->children_tail, flags, NULL));
975
976
21.9k
    bookmark_1 = yr_arena_get_current_offset(
977
21.9k
        emit_context->arena, YR_RE_CODE_SECTION);
978
979
21.9k
    if (bookmark_1 - jmp_instruction_ref.offset > INT16_MAX)
980
22
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
981
982
21.9k
    jmp_offset = (int16_t) (bookmark_1 - jmp_instruction_ref.offset);
983
984
    // Update offset for jmp instruction.
985
21.9k
    jmp_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
986
21.9k
        emit_context->arena, &jmp_offset_ref);
987
988
21.9k
    memcpy(jmp_offset_addr, &jmp_offset, sizeof(jmp_offset));
989
21.9k
    break;
990
991
6.34k
  case RE_NODE_RANGE_ANY:
992
6.34k
    repeat_any_args.min = re_node->start;
993
6.34k
    repeat_any_args.max = re_node->end;
994
995
6.34k
    FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
996
6.34k
        emit_context,
997
6.34k
        re_node->greedy ? RE_OPCODE_REPEAT_ANY_GREEDY
998
6.34k
                        : RE_OPCODE_REPEAT_ANY_UNGREEDY,
999
6.34k
        &repeat_any_args,
1000
6.34k
        sizeof(repeat_any_args),
1001
6.34k
        &instruction_ref,
1002
6.34k
        NULL));
1003
1004
6.34k
    break;
1005
1006
24.7M
  case RE_NODE_RANGE:
1007
    // Code for e{n,m} looks like:
1008
    //
1009
    //            code for e              ---   prolog
1010
    //            repeat_start n, m, L1   --+
1011
    //        L0: code for e                |   repeat
1012
    //            repeat_end n, m, L0     --+
1013
    //        L1: split L2, L3            ---   split
1014
    //        L2: code for e              ---   epilog
1015
    //        L3:
1016
    //
1017
    // Not all sections (prolog, repeat, split and epilog) are generated in all
1018
    // cases, it depends on the values of n and m. The following table shows
1019
    // which sections are generated for the first few values of n and m.
1020
    //
1021
    //        n,m   prolog  repeat      split  epilog
1022
    //                      (min,max)
1023
    //        ---------------------------------------
1024
    //        0,0     -       -           -      -
1025
    //        0,1     -       -           X      X
1026
    //        0,2     -       0,1         X      X
1027
    //        0,3     -       0,2         X      X
1028
    //        0,M     -       0,M-1       X      X
1029
    //
1030
    //        1,1     X       -           -      -
1031
    //        1,2     X       -           X      X
1032
    //        1,3     X       0,1         X      X
1033
    //        1,4     X       1,2         X      X
1034
    //        1,M     X       1,M-2       X      X
1035
    //
1036
    //        2,2     X       -           -      X
1037
    //        2,3     X       1,1         X      X
1038
    //        2,4     X       1,2         X      X
1039
    //        2,M     X       1,M-2       X      X
1040
    //
1041
    //        3,3     X       1,1         -      X
1042
    //        3,4     X       2,2         X      X
1043
    //        3,M     X       2,M-2       X      X
1044
    //
1045
    //        4,4     X       2,2         -      X
1046
    //        4,5     X       3,3         X      X
1047
    //        4,M     X       3,M-2       X      X
1048
    //
1049
    // The code can't consists simply in the repeat section, the prolog and
1050
    // epilog are required because we can't have atoms pointing to code inside
1051
    // the repeat loop. Atoms' forwards_code will point to code in the prolog
1052
    // and backwards_code will point to code in the epilog (or in prolog if
1053
    // epilog wasn't generated, like in n=1,m=1)
1054
1055
24.7M
    emit_prolog = re_node->start > 0;
1056
24.7M
    emit_repeat = re_node->end > re_node->start + 1 || re_node->end > 2;
1057
24.7M
    emit_split = re_node->end > re_node->start;
1058
24.7M
    emit_epilog = re_node->end > re_node->start || re_node->end > 1;
1059
1060
24.7M
    if (emit_prolog)
1061
24.7M
    {
1062
24.7M
      FAIL_ON_ERROR(_yr_re_emit(
1063
24.7M
          emit_context, re_node->children_head, flags, &instruction_ref));
1064
24.7M
    }
1065
1066
24.7M
    if (emit_repeat)
1067
24.6M
    {
1068
24.6M
      repeat_args.min = re_node->start;
1069
24.6M
      repeat_args.max = re_node->end;
1070
1071
24.6M
      if (emit_prolog)
1072
24.6M
      {
1073
24.6M
        repeat_args.max--;
1074
24.6M
        repeat_args.min--;
1075
24.6M
      }
1076
1077
24.6M
      if (emit_split)
1078
402
      {
1079
402
        repeat_args.max--;
1080
402
      }
1081
24.6M
      else
1082
24.6M
      {
1083
24.6M
        repeat_args.min--;
1084
24.6M
        repeat_args.max--;
1085
24.6M
      }
1086
1087
24.6M
      repeat_args.offset = 0;
1088
1089
24.6M
      bookmark_1 = yr_arena_get_current_offset(
1090
24.6M
          emit_context->arena, YR_RE_CODE_SECTION);
1091
1092
24.6M
      FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
1093
24.6M
          emit_context,
1094
24.6M
          re_node->greedy ? RE_OPCODE_REPEAT_START_GREEDY
1095
24.6M
                          : RE_OPCODE_REPEAT_START_UNGREEDY,
1096
24.6M
          &repeat_args,
1097
24.6M
          sizeof(repeat_args),
1098
24.6M
          emit_prolog ? NULL : &instruction_ref,
1099
24.6M
          &repeat_start_args_ref));
1100
1101
24.6M
      bookmark_2 = yr_arena_get_current_offset(
1102
24.6M
          emit_context->arena, YR_RE_CODE_SECTION);
1103
1104
24.6M
      FAIL_ON_ERROR(_yr_re_emit(
1105
24.6M
          emit_context,
1106
24.6M
          re_node->children_head,
1107
24.6M
          flags | EMIT_DONT_SET_FORWARDS_CODE | EMIT_DONT_SET_BACKWARDS_CODE,
1108
24.6M
          NULL));
1109
1110
24.6M
      bookmark_3 = yr_arena_get_current_offset(
1111
24.6M
          emit_context->arena, YR_RE_CODE_SECTION);
1112
1113
24.6M
      if (bookmark_2 - bookmark_3 < INT32_MIN)
1114
8
        return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1115
1116
24.6M
      repeat_args.offset = (int32_t) (bookmark_2 - bookmark_3);
1117
1118
24.6M
      FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
1119
24.6M
          emit_context,
1120
24.6M
          re_node->greedy ? RE_OPCODE_REPEAT_END_GREEDY
1121
24.6M
                          : RE_OPCODE_REPEAT_END_UNGREEDY,
1122
24.6M
          &repeat_args,
1123
24.6M
          sizeof(repeat_args),
1124
24.6M
          NULL,
1125
24.6M
          NULL));
1126
1127
24.6M
      bookmark_4 = yr_arena_get_current_offset(
1128
24.6M
          emit_context->arena, YR_RE_CODE_SECTION);
1129
1130
24.6M
      repeat_start_args_addr = (RE_REPEAT_ARGS*) yr_arena_ref_to_ptr(
1131
24.6M
          emit_context->arena, &repeat_start_args_ref);
1132
1133
24.6M
      if (bookmark_4 - bookmark_1 > INT32_MAX)
1134
0
        return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1135
1136
24.6M
      repeat_start_args_addr->offset = (int32_t) (bookmark_4 - bookmark_1);
1137
24.6M
    }
1138
1139
24.7M
    if (emit_split)
1140
1.03k
    {
1141
1.03k
      bookmark_1 = yr_arena_get_current_offset(
1142
1.03k
          emit_context->arena, YR_RE_CODE_SECTION);
1143
1144
1.03k
      FAIL_ON_ERROR(_yr_emit_split(
1145
1.03k
          emit_context,
1146
1.03k
          re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
1147
1.03k
          0,
1148
1.03k
          NULL,
1149
1.03k
          &split_offset_ref));
1150
1.02k
    }
1151
1152
24.7M
    if (emit_epilog)
1153
24.7M
    {
1154
24.7M
      FAIL_ON_ERROR(_yr_re_emit(
1155
24.7M
          emit_context,
1156
24.7M
          re_node->children_head,
1157
24.7M
          emit_prolog ? flags | EMIT_DONT_SET_FORWARDS_CODE : flags,
1158
24.7M
          emit_prolog || emit_repeat ? NULL : &instruction_ref));
1159
24.7M
    }
1160
1161
24.7M
    if (emit_split)
1162
978
    {
1163
978
      bookmark_2 = yr_arena_get_current_offset(
1164
978
          emit_context->arena, YR_RE_CODE_SECTION);
1165
1166
978
      if (bookmark_2 - bookmark_1 > INT16_MAX)
1167
2
        return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1168
1169
976
      split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
1170
976
          emit_context->arena, &split_offset_ref);
1171
1172
976
      jmp_offset = (int16_t) (bookmark_2 - bookmark_1);
1173
1174
976
      memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset));
1175
976
    }
1176
1177
24.7M
    break;
1178
241M
  }
1179
1180
241M
  if (flags & EMIT_BACKWARDS)
1181
5.51M
  {
1182
5.51M
    if (!(flags & EMIT_DONT_SET_BACKWARDS_CODE))
1183
1.53M
    {
1184
1.53M
      re_node->backward_code_ref.buffer_id = YR_RE_CODE_SECTION;
1185
1.53M
      re_node->backward_code_ref.offset = yr_arena_get_current_offset(
1186
1.53M
          emit_context->arena, YR_RE_CODE_SECTION);
1187
1.53M
    }
1188
5.51M
  }
1189
235M
  else
1190
235M
  {
1191
235M
    if (!(flags & EMIT_DONT_SET_FORWARDS_CODE))
1192
1.80M
    {
1193
1.80M
      re_node->forward_code_ref = instruction_ref;
1194
1.80M
    }
1195
235M
  }
1196
1197
241M
  if (code_ref != NULL)
1198
51.2M
    *code_ref = instruction_ref;
1199
1200
241M
  return ERROR_SUCCESS;
1201
241M
}
1202
1203
int yr_re_ast_emit_code(RE_AST* re_ast, YR_ARENA* arena, int backwards_code)
1204
15.8k
{
1205
15.8k
  RE_EMIT_CONTEXT emit_context;
1206
1207
  // Emit code for matching the regular expressions forwards.
1208
15.8k
  emit_context.arena = arena;
1209
15.8k
  emit_context.next_split_id = 0;
1210
1211
15.8k
  FAIL_ON_ERROR(_yr_re_emit(
1212
15.8k
      &emit_context,
1213
15.8k
      re_ast->root_node,
1214
15.8k
      backwards_code ? EMIT_BACKWARDS : 0,
1215
15.8k
      NULL));
1216
1217
15.6k
  FAIL_ON_ERROR(_yr_emit_inst(&emit_context, RE_OPCODE_MATCH, NULL));
1218
1219
15.6k
  return ERROR_SUCCESS;
1220
15.6k
}
1221
1222
static int _yr_re_fiber_create(RE_FIBER_POOL* fiber_pool, RE_FIBER** new_fiber)
1223
0
{
1224
0
  RE_FIBER* fiber;
1225
1226
0
  if (fiber_pool->fibers.head != NULL)
1227
0
  {
1228
0
    fiber = fiber_pool->fibers.head;
1229
0
    fiber_pool->fibers.head = fiber->next;
1230
1231
0
    if (fiber_pool->fibers.tail == fiber)
1232
0
      fiber_pool->fibers.tail = NULL;
1233
0
  }
1234
0
  else
1235
0
  {
1236
0
    if (fiber_pool->fiber_count == RE_MAX_FIBERS)
1237
0
      return ERROR_TOO_MANY_RE_FIBERS;
1238
1239
0
    fiber = (RE_FIBER*) yr_malloc(sizeof(RE_FIBER));
1240
1241
0
    if (fiber == NULL)
1242
0
      return ERROR_INSUFFICIENT_MEMORY;
1243
1244
0
    fiber_pool->fiber_count++;
1245
0
  }
1246
1247
0
  fiber->ip = NULL;
1248
0
  fiber->sp = -1;
1249
0
  fiber->rc = -1;
1250
0
  fiber->next = NULL;
1251
0
  fiber->prev = NULL;
1252
1253
0
  *new_fiber = fiber;
1254
1255
0
  return ERROR_SUCCESS;
1256
0
}
1257
1258
////////////////////////////////////////////////////////////////////////////////
1259
// Appends 'fiber' to 'fiber_list'
1260
//
1261
static void _yr_re_fiber_append(RE_FIBER_LIST* fiber_list, RE_FIBER* fiber)
1262
0
{
1263
0
  assert(fiber->prev == NULL);
1264
0
  assert(fiber->next == NULL);
1265
1266
0
  fiber->prev = fiber_list->tail;
1267
1268
0
  if (fiber_list->tail != NULL)
1269
0
    fiber_list->tail->next = fiber;
1270
1271
0
  fiber_list->tail = fiber;
1272
1273
0
  if (fiber_list->head == NULL)
1274
0
    fiber_list->head = fiber;
1275
1276
0
  assert(fiber_list->tail->next == NULL);
1277
0
  assert(fiber_list->head->prev == NULL);
1278
0
}
1279
1280
////////////////////////////////////////////////////////////////////////////////
1281
// Verifies if a fiber with the same properties (ip, rc, sp, and stack values)
1282
// than 'target_fiber' exists in 'fiber_list'. The list is iterated from
1283
// the start until 'last_fiber' (inclusive). Fibers past 'last_fiber' are not
1284
// taken into account.
1285
//
1286
static int _yr_re_fiber_exists(
1287
    RE_FIBER_LIST* fiber_list,
1288
    RE_FIBER* target_fiber,
1289
    RE_FIBER* last_fiber)
1290
0
{
1291
0
  RE_FIBER* fiber = fiber_list->head;
1292
1293
0
  int equal_stacks;
1294
0
  int i;
1295
1296
0
  if (last_fiber == NULL)
1297
0
    return false;
1298
1299
0
  while (fiber != last_fiber->next)
1300
0
  {
1301
0
    if (fiber->ip == target_fiber->ip && fiber->sp == target_fiber->sp &&
1302
0
        fiber->rc == target_fiber->rc)
1303
0
    {
1304
0
      equal_stacks = true;
1305
1306
0
      for (i = 0; i <= fiber->sp; i++)
1307
0
      {
1308
0
        if (fiber->stack[i] != target_fiber->stack[i])
1309
0
        {
1310
0
          equal_stacks = false;
1311
0
          break;
1312
0
        }
1313
0
      }
1314
1315
0
      if (equal_stacks)
1316
0
        return true;
1317
0
    }
1318
1319
0
    fiber = fiber->next;
1320
0
  }
1321
1322
0
  return false;
1323
0
}
1324
1325
////////////////////////////////////////////////////////////////////////////////
1326
// Clones a fiber in fiber_list and inserts the cloned fiber just after.
1327
// the original one. If fiber_list is:
1328
//
1329
//   f1 -> f2 -> f3 -> f4
1330
//
1331
// Splitting f2 will result in:
1332
//
1333
//   f1 -> f2 -> cloned f2 -> f3 -> f4
1334
//
1335
static int _yr_re_fiber_split(
1336
    RE_FIBER_LIST* fiber_list,
1337
    RE_FIBER_POOL* fiber_pool,
1338
    RE_FIBER* fiber,
1339
    RE_FIBER** new_fiber)
1340
0
{
1341
0
  int32_t i;
1342
1343
0
  FAIL_ON_ERROR(_yr_re_fiber_create(fiber_pool, new_fiber));
1344
1345
0
  (*new_fiber)->sp = fiber->sp;
1346
0
  (*new_fiber)->ip = fiber->ip;
1347
0
  (*new_fiber)->rc = fiber->rc;
1348
1349
0
  for (i = 0; i <= fiber->sp; i++) (*new_fiber)->stack[i] = fiber->stack[i];
1350
1351
0
  (*new_fiber)->next = fiber->next;
1352
0
  (*new_fiber)->prev = fiber;
1353
1354
0
  if (fiber->next != NULL)
1355
0
    fiber->next->prev = *new_fiber;
1356
1357
0
  fiber->next = *new_fiber;
1358
1359
0
  if (fiber_list->tail == fiber)
1360
0
    fiber_list->tail = *new_fiber;
1361
1362
0
  assert(fiber_list->tail->next == NULL);
1363
0
  assert(fiber_list->head->prev == NULL);
1364
1365
0
  return ERROR_SUCCESS;
1366
0
}
1367
1368
////////////////////////////////////////////////////////////////////////////////
1369
// Kills a given fiber by removing it from the fiber list and putting it in the
1370
// fiber pool.
1371
//
1372
static RE_FIBER* _yr_re_fiber_kill(
1373
    RE_FIBER_LIST* fiber_list,
1374
    RE_FIBER_POOL* fiber_pool,
1375
    RE_FIBER* fiber)
1376
0
{
1377
0
  RE_FIBER* next_fiber = fiber->next;
1378
1379
0
  if (fiber->prev != NULL)
1380
0
    fiber->prev->next = next_fiber;
1381
1382
0
  if (next_fiber != NULL)
1383
0
    next_fiber->prev = fiber->prev;
1384
1385
0
  if (fiber_pool->fibers.tail != NULL)
1386
0
    fiber_pool->fibers.tail->next = fiber;
1387
1388
0
  if (fiber_list->tail == fiber)
1389
0
    fiber_list->tail = fiber->prev;
1390
1391
0
  if (fiber_list->head == fiber)
1392
0
    fiber_list->head = next_fiber;
1393
1394
0
  fiber->next = NULL;
1395
0
  fiber->prev = fiber_pool->fibers.tail;
1396
0
  fiber_pool->fibers.tail = fiber;
1397
1398
0
  if (fiber_pool->fibers.head == NULL)
1399
0
    fiber_pool->fibers.head = fiber;
1400
1401
0
  return next_fiber;
1402
0
}
1403
1404
////////////////////////////////////////////////////////////////////////////////
1405
// Kills all fibers from the given one up to the end of the fiber list.
1406
//
1407
static void _yr_re_fiber_kill_tail(
1408
    RE_FIBER_LIST* fiber_list,
1409
    RE_FIBER_POOL* fiber_pool,
1410
    RE_FIBER* fiber)
1411
0
{
1412
0
  RE_FIBER* prev_fiber = fiber->prev;
1413
1414
0
  if (prev_fiber != NULL)
1415
0
    prev_fiber->next = NULL;
1416
1417
0
  fiber->prev = fiber_pool->fibers.tail;
1418
1419
0
  if (fiber_pool->fibers.tail != NULL)
1420
0
    fiber_pool->fibers.tail->next = fiber;
1421
1422
0
  fiber_pool->fibers.tail = fiber_list->tail;
1423
0
  fiber_list->tail = prev_fiber;
1424
1425
0
  if (fiber_list->head == fiber)
1426
0
    fiber_list->head = NULL;
1427
1428
0
  if (fiber_pool->fibers.head == NULL)
1429
0
    fiber_pool->fibers.head = fiber;
1430
0
}
1431
1432
////////////////////////////////////////////////////////////////////////////////
1433
// Kills all fibers in the fiber list.
1434
//
1435
static void _yr_re_fiber_kill_all(
1436
    RE_FIBER_LIST* fiber_list,
1437
    RE_FIBER_POOL* fiber_pool)
1438
0
{
1439
0
  if (fiber_list->head != NULL)
1440
0
    _yr_re_fiber_kill_tail(fiber_list, fiber_pool, fiber_list->head);
1441
0
}
1442
1443
////////////////////////////////////////////////////////////////////////////////
1444
// Executes a fiber until reaching an "matching" instruction. A "matching"
1445
// instruction is one that actually reads a byte from the input and performs
1446
// some matching. If the fiber reaches a split instruction, the new fiber is
1447
// also synced.
1448
//
1449
static int _yr_re_fiber_sync(
1450
    RE_FIBER_LIST* fiber_list,
1451
    RE_FIBER_POOL* fiber_pool,
1452
    RE_FIBER* fiber_to_sync)
1453
0
{
1454
  // A array for keeping track of which split instructions has been already
1455
  // executed. Each split instruction within a regexp has an associated ID
1456
  // between 0 and RE_MAX_SPLIT_ID-1. Keeping track of executed splits is
1457
  // required to avoid infinite loops in regexps like (a*)* or (a|)*
1458
1459
0
  RE_SPLIT_ID_TYPE splits_executed[RE_MAX_SPLIT_ID];
1460
0
  RE_SPLIT_ID_TYPE splits_executed_count = 0;
1461
0
  RE_SPLIT_ID_TYPE split_id, splits_executed_idx;
1462
1463
0
  int split_already_executed;
1464
1465
0
  RE_REPEAT_ARGS* repeat_args;
1466
0
  RE_REPEAT_ANY_ARGS* repeat_any_args;
1467
1468
0
  RE_FIBER* fiber;
1469
0
  RE_FIBER* last;
1470
0
  RE_FIBER* next;
1471
0
  RE_FIBER* branch_a;
1472
0
  RE_FIBER* branch_b;
1473
1474
0
  fiber = fiber_to_sync;
1475
0
  last = fiber_to_sync->next;
1476
1477
0
  while (fiber != last)
1478
0
  {
1479
0
    int16_t jmp_len;
1480
0
    uint8_t opcode = *fiber->ip;
1481
1482
0
    switch (opcode)
1483
0
    {
1484
0
    case RE_OPCODE_SPLIT_A:
1485
0
    case RE_OPCODE_SPLIT_B:
1486
1487
0
      split_id = *(RE_SPLIT_ID_TYPE*) (fiber->ip + 1);
1488
0
      split_already_executed = false;
1489
1490
0
      for (splits_executed_idx = 0; splits_executed_idx < splits_executed_count;
1491
0
           splits_executed_idx++)
1492
0
      {
1493
0
        if (split_id == splits_executed[splits_executed_idx])
1494
0
        {
1495
0
          split_already_executed = true;
1496
0
          break;
1497
0
        }
1498
0
      }
1499
1500
0
      if (split_already_executed)
1501
0
      {
1502
0
        fiber = _yr_re_fiber_kill(fiber_list, fiber_pool, fiber);
1503
0
      }
1504
0
      else
1505
0
      {
1506
0
        branch_a = fiber;
1507
1508
0
        FAIL_ON_ERROR(
1509
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1510
1511
        // With RE_OPCODE_SPLIT_A the current fiber continues at the next
1512
        // instruction in the stream (branch A), while the newly created
1513
        // fiber starts at the address indicated by the instruction (branch B)
1514
        // RE_OPCODE_SPLIT_B has the opposite behavior.
1515
1516
0
        if (opcode == RE_OPCODE_SPLIT_B)
1517
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1518
1519
        // Branch A continues at the next instruction
1520
0
        branch_a->ip += (sizeof(RE_SPLIT_ID_TYPE) + 3);
1521
1522
        // Branch B adds the offset indicated by the split instruction.
1523
0
        jmp_len = yr_unaligned_i16(branch_b->ip + 1 + sizeof(RE_SPLIT_ID_TYPE));
1524
1525
0
        branch_b->ip += jmp_len;
1526
1527
#ifdef YR_PARANOID_MODE
1528
        // In normal conditions this should never happen. But with compiled
1529
        // rules that has been hand-crafted by a malicious actor this could
1530
        // happen.
1531
        if (splits_executed_count >= RE_MAX_SPLIT_ID)
1532
          return ERROR_INTERNAL_FATAL_ERROR;
1533
#endif
1534
1535
0
        splits_executed[splits_executed_count] = split_id;
1536
0
        splits_executed_count++;
1537
0
      }
1538
1539
0
      break;
1540
1541
0
    case RE_OPCODE_REPEAT_START_GREEDY:
1542
0
    case RE_OPCODE_REPEAT_START_UNGREEDY:
1543
1544
0
      repeat_args = (RE_REPEAT_ARGS*) (fiber->ip + 1);
1545
0
      assert(repeat_args->max > 0);
1546
0
      branch_a = fiber;
1547
1548
0
      if (repeat_args->min == 0)
1549
0
      {
1550
0
        FAIL_ON_ERROR(
1551
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1552
1553
0
        if (opcode == RE_OPCODE_REPEAT_START_UNGREEDY)
1554
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1555
1556
0
        branch_b->ip += repeat_args->offset;
1557
0
      }
1558
1559
0
      branch_a->stack[++branch_a->sp] = 0;
1560
0
      branch_a->ip += (1 + sizeof(RE_REPEAT_ARGS));
1561
0
      break;
1562
1563
0
    case RE_OPCODE_REPEAT_END_GREEDY:
1564
0
    case RE_OPCODE_REPEAT_END_UNGREEDY:
1565
1566
0
      repeat_args = (RE_REPEAT_ARGS*) (fiber->ip + 1);
1567
0
      fiber->stack[fiber->sp]++;
1568
1569
0
      if (fiber->stack[fiber->sp] < repeat_args->min)
1570
0
      {
1571
0
        fiber->ip += repeat_args->offset;
1572
0
        break;
1573
0
      }
1574
1575
0
      branch_a = fiber;
1576
1577
0
      if (fiber->stack[fiber->sp] < repeat_args->max)
1578
0
      {
1579
0
        FAIL_ON_ERROR(
1580
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1581
1582
0
        if (opcode == RE_OPCODE_REPEAT_END_GREEDY)
1583
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1584
1585
0
        branch_b->ip += repeat_args->offset;
1586
0
      }
1587
1588
0
      branch_a->sp--;
1589
0
      branch_a->ip += (1 + sizeof(RE_REPEAT_ARGS));
1590
0
      break;
1591
1592
0
    case RE_OPCODE_REPEAT_ANY_GREEDY:
1593
0
    case RE_OPCODE_REPEAT_ANY_UNGREEDY:
1594
1595
0
      repeat_any_args = (RE_REPEAT_ANY_ARGS*) (fiber->ip + 1);
1596
1597
      // If repetition counter (rc) is -1 it means that we are reaching this
1598
      // instruction from the previous one in the instructions stream. In
1599
      // this case let's initialize the counter to 0 and start looping.
1600
1601
0
      if (fiber->rc == -1)
1602
0
        fiber->rc = 0;
1603
1604
0
      if (fiber->rc < repeat_any_args->min)
1605
0
      {
1606
        // Increase repetition counter and continue with next fiber. The
1607
        // instruction pointer for this fiber is not incremented yet, this
1608
        // fiber spins in this same instruction until reaching the minimum
1609
        // number of repetitions.
1610
1611
0
        fiber->rc++;
1612
0
        fiber = fiber->next;
1613
0
      }
1614
0
      else if (fiber->rc < repeat_any_args->max)
1615
0
      {
1616
        // Once the minimum number of repetitions are matched one fiber
1617
        // remains spinning in this instruction until reaching the maximum
1618
        // number of repetitions while new fibers are created. New fibers
1619
        // start executing at the next instruction.
1620
1621
0
        next = fiber->next;
1622
0
        branch_a = fiber;
1623
1624
0
        FAIL_ON_ERROR(
1625
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1626
1627
0
        if (opcode == RE_OPCODE_REPEAT_ANY_UNGREEDY)
1628
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1629
1630
0
        branch_a->rc++;
1631
0
        branch_b->ip += (1 + sizeof(RE_REPEAT_ANY_ARGS));
1632
0
        branch_b->rc = -1;
1633
1634
0
        FAIL_ON_ERROR(_yr_re_fiber_sync(fiber_list, fiber_pool, branch_b));
1635
1636
0
        fiber = next;
1637
0
      }
1638
0
      else
1639
0
      {
1640
        // When the maximum number of repetitions is reached the fiber keeps
1641
        // executing at the next instruction. The repetition counter is set
1642
        // to -1 indicating that we are not spinning in a repeat instruction
1643
        // anymore.
1644
1645
0
        fiber->ip += (1 + sizeof(RE_REPEAT_ANY_ARGS));
1646
0
        fiber->rc = -1;
1647
0
      }
1648
1649
0
      break;
1650
1651
0
    case RE_OPCODE_JUMP:
1652
0
      jmp_len = yr_unaligned_i16(fiber->ip + 1);
1653
0
      fiber->ip += jmp_len;
1654
0
      break;
1655
1656
0
    default:
1657
0
      fiber = fiber->next;
1658
0
    }
1659
0
  }
1660
1661
0
  return ERROR_SUCCESS;
1662
0
}
1663
1664
////////////////////////////////////////////////////////////////////////////////
1665
// Executes a regular expression. The specified regular expression will try to
1666
// match the data starting at the address specified by "input". The "input"
1667
// pointer can point to any address inside a memory buffer. Arguments
1668
// "input_forwards_size" and "input_backwards_size" indicate how many bytes
1669
// can be accessible starting at "input" and going forwards and backwards
1670
// respectively.
1671
//
1672
//   <--- input_backwards_size -->|<----------- input_forwards_size -------->
1673
//  |--------  memory buffer  -----------------------------------------------|
1674
//                                ^
1675
//                              input
1676
//
1677
// Args:
1678
//   YR_SCAN_CONTEXT *context         - Scan context.
1679
//   const uint8_t* code              - Regexp code be executed
1680
//   const uint8_t* input             - Pointer to input data
1681
//   size_t input_forwards_size       - Number of accessible bytes starting at
1682
//                                      "input" and going forwards.
1683
//   size_t input_backwards_size      - Number of accessible bytes starting at
1684
//                                      "input" and going backwards
1685
//   int flags                        - Flags:
1686
//      RE_FLAGS_SCAN
1687
//      RE_FLAGS_BACKWARDS
1688
//      RE_FLAGS_EXHAUSTIVE
1689
//      RE_FLAGS_WIDE
1690
//      RE_FLAGS_NO_CASE
1691
//      RE_FLAGS_DOT_ALL
1692
//   RE_MATCH_CALLBACK_FUNC callback  - Callback function
1693
//   void* callback_args              - Callback argument
1694
//   int*  matches                    - Pointer to an integer receiving the
1695
//                                      number of matching bytes. Notice that
1696
//                                      0 means a zero-length match, while no
1697
//                                      matches is -1.
1698
// Returns:
1699
//    ERROR_SUCCESS or any other error code.
1700
1701
int yr_re_exec(
1702
    YR_SCAN_CONTEXT* context,
1703
    const uint8_t* code,
1704
    const uint8_t* input_data,
1705
    size_t input_forwards_size,
1706
    size_t input_backwards_size,
1707
    int flags,
1708
    RE_MATCH_CALLBACK_FUNC callback,
1709
    void* callback_args,
1710
    int* matches)
1711
0
{
1712
0
  const uint8_t* input;
1713
0
  const uint8_t* ip;
1714
1715
0
  uint16_t opcode_args;
1716
0
  uint8_t mask;
1717
0
  uint8_t value;
1718
0
  uint8_t character_size;
1719
1720
0
  RE_FIBER_LIST fibers;
1721
0
  RE_FIBER* fiber;
1722
0
  RE_FIBER* next_fiber;
1723
1724
0
  int bytes_matched;
1725
0
  int max_bytes_matched;
1726
1727
0
  int match;
1728
0
  int input_incr;
1729
0
  int kill;
1730
0
  int action;
1731
1732
0
  bool prev_is_word_char = false;
1733
0
  bool input_is_word_char = false;
1734
1735
0
#define ACTION_NONE      0
1736
0
#define ACTION_CONTINUE  1
1737
0
#define ACTION_KILL      2
1738
0
#define ACTION_KILL_TAIL 3
1739
1740
0
#define prolog                                      \
1741
0
  {                                                 \
1742
0
    if ((bytes_matched >= max_bytes_matched) ||     \
1743
0
        (character_size == 2 && *(input + 1) != 0)) \
1744
0
    {                                               \
1745
0
      action = ACTION_KILL;                         \
1746
0
      break;                                        \
1747
0
    }                                               \
1748
0
  }
1749
1750
0
  if (matches != NULL)
1751
0
    *matches = -1;
1752
1753
0
  if (flags & RE_FLAGS_WIDE)
1754
0
    character_size = 2;
1755
0
  else
1756
0
    character_size = 1;
1757
1758
0
  input = input_data;
1759
0
  input_incr = character_size;
1760
1761
0
  if (flags & RE_FLAGS_BACKWARDS)
1762
0
  {
1763
    // Signedness conversion is sound as long as YR_RE_SCAN_LIMIT <= INT_MAX
1764
0
    max_bytes_matched = (int) yr_min(input_backwards_size, YR_RE_SCAN_LIMIT);
1765
0
    input -= character_size;
1766
0
    input_incr = -input_incr;
1767
0
  }
1768
0
  else
1769
0
  {
1770
    // Signedness conversion is sound as long as YR_RE_SCAN_LIMIT <= INT_MAX
1771
0
    max_bytes_matched = (int) yr_min(input_forwards_size, YR_RE_SCAN_LIMIT);
1772
0
  }
1773
1774
  // Round down max_bytes_matched to a multiple of character_size, this way if
1775
  // character_size is 2 and max_bytes_matched is odd we are ignoring the
1776
  // extra byte which can't match anyways.
1777
1778
0
  max_bytes_matched = max_bytes_matched - max_bytes_matched % character_size;
1779
0
  bytes_matched = 0;
1780
1781
0
  FAIL_ON_ERROR(_yr_re_fiber_create(&context->re_fiber_pool, &fiber));
1782
1783
0
  fiber->ip = code;
1784
0
  fibers.head = fiber;
1785
0
  fibers.tail = fiber;
1786
1787
0
  FAIL_ON_ERROR_WITH_CLEANUP(
1788
0
      _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
1789
0
      _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1790
1791
0
  while (fibers.head != NULL)
1792
0
  {
1793
0
    fiber = fibers.head;
1794
1795
0
    while (fiber != NULL)
1796
0
    {
1797
0
      next_fiber = fiber->next;
1798
1799
0
      if (_yr_re_fiber_exists(&fibers, fiber, fiber->prev))
1800
0
        _yr_re_fiber_kill(&fibers, &context->re_fiber_pool, fiber);
1801
1802
0
      fiber = next_fiber;
1803
0
    }
1804
1805
0
    fiber = fibers.head;
1806
1807
0
    while (fiber != NULL)
1808
0
    {
1809
0
      ip = fiber->ip;
1810
0
      action = ACTION_NONE;
1811
1812
0
      switch (*ip)
1813
0
      {
1814
0
      case RE_OPCODE_ANY:
1815
0
        prolog;
1816
0
        match = (flags & RE_FLAGS_DOT_ALL) || (*input != 0x0A);
1817
0
        action = match ? ACTION_NONE : ACTION_KILL;
1818
0
        fiber->ip += 1;
1819
0
        break;
1820
1821
0
      case RE_OPCODE_REPEAT_ANY_GREEDY:
1822
0
      case RE_OPCODE_REPEAT_ANY_UNGREEDY:
1823
0
        prolog;
1824
0
        match = (flags & RE_FLAGS_DOT_ALL) || (*input != 0x0A);
1825
0
        action = match ? ACTION_NONE : ACTION_KILL;
1826
1827
        // The instruction pointer is not incremented here. The current fiber
1828
        // spins in this instruction until reaching the required number of
1829
        // repetitions. The code controlling the number of repetitions is in
1830
        // _yr_re_fiber_sync.
1831
1832
0
        break;
1833
1834
0
      case RE_OPCODE_LITERAL:
1835
0
        prolog;
1836
0
        if (flags & RE_FLAGS_NO_CASE)
1837
0
          match = yr_lowercase[*input] == yr_lowercase[*(ip + 1)];
1838
0
        else
1839
0
          match = (*input == *(ip + 1));
1840
0
        action = match ? ACTION_NONE : ACTION_KILL;
1841
0
        fiber->ip += 2;
1842
0
        break;
1843
1844
0
      case RE_OPCODE_NOT_LITERAL:
1845
0
        prolog;
1846
1847
        // We don't need to take into account the case-insensitive
1848
        // case because this opcode is only used with hex strings,
1849
        // which can't be case-insensitive.
1850
1851
0
        match = (*input != *(ip + 1));
1852
0
        action = match ? ACTION_NONE : ACTION_KILL;
1853
0
        fiber->ip += 2;
1854
0
        break;
1855
1856
0
      case RE_OPCODE_MASKED_LITERAL:
1857
0
        prolog;
1858
0
        opcode_args = yr_unaligned_u16(ip + 1);
1859
0
        mask = opcode_args >> 8;
1860
0
        value = opcode_args & 0xFF;
1861
1862
        // We don't need to take into account the case-insensitive
1863
        // case because this opcode is only used with hex strings,
1864
        // which can't be case-insensitive.
1865
1866
0
        match = ((*input & mask) == value);
1867
0
        action = match ? ACTION_NONE : ACTION_KILL;
1868
0
        fiber->ip += 3;
1869
0
        break;
1870
1871
0
      case RE_OPCODE_MASKED_NOT_LITERAL:
1872
0
        prolog;
1873
0
        opcode_args = yr_unaligned_u16(ip + 1);
1874
0
        mask = opcode_args >> 8;
1875
0
        value = opcode_args & 0xFF;
1876
1877
        // We don't need to take into account the case-insensitive
1878
        // case because this opcode is only used with hex strings,
1879
        // which can't be case-insensitive.
1880
1881
0
        match = ((*input & mask) != value);
1882
0
        action = match ? ACTION_NONE : ACTION_KILL;
1883
0
        fiber->ip += 3;
1884
0
        break;
1885
1886
0
      case RE_OPCODE_CLASS:
1887
0
        prolog;
1888
0
        match = _yr_re_is_char_in_class(
1889
0
            (RE_CLASS*) (ip + 1), *input, flags & RE_FLAGS_NO_CASE);
1890
0
        action = match ? ACTION_NONE : ACTION_KILL;
1891
0
        fiber->ip += (sizeof(RE_CLASS) + 1);
1892
0
        break;
1893
1894
0
      case RE_OPCODE_WORD_CHAR:
1895
0
        prolog;
1896
0
        match = _yr_re_is_word_char(input, character_size);
1897
0
        action = match ? ACTION_NONE : ACTION_KILL;
1898
0
        fiber->ip += 1;
1899
0
        break;
1900
1901
0
      case RE_OPCODE_NON_WORD_CHAR:
1902
0
        prolog;
1903
0
        match = !_yr_re_is_word_char(input, character_size);
1904
0
        action = match ? ACTION_NONE : ACTION_KILL;
1905
0
        fiber->ip += 1;
1906
0
        break;
1907
1908
0
      case RE_OPCODE_SPACE:
1909
0
      case RE_OPCODE_NON_SPACE:
1910
1911
0
        prolog;
1912
1913
0
        switch (*input)
1914
0
        {
1915
0
        case ' ':
1916
0
        case '\t':
1917
0
        case '\r':
1918
0
        case '\n':
1919
0
        case '\v':
1920
0
        case '\f':
1921
0
          match = true;
1922
0
          break;
1923
0
        default:
1924
0
          match = false;
1925
0
        }
1926
1927
0
        if (*ip == RE_OPCODE_NON_SPACE)
1928
0
          match = !match;
1929
1930
0
        action = match ? ACTION_NONE : ACTION_KILL;
1931
0
        fiber->ip += 1;
1932
0
        break;
1933
1934
0
      case RE_OPCODE_DIGIT:
1935
0
        prolog;
1936
0
        match = isdigit(*input);
1937
0
        action = match ? ACTION_NONE : ACTION_KILL;
1938
0
        fiber->ip += 1;
1939
0
        break;
1940
1941
0
      case RE_OPCODE_NON_DIGIT:
1942
0
        prolog;
1943
0
        match = !isdigit(*input);
1944
0
        action = match ? ACTION_NONE : ACTION_KILL;
1945
0
        fiber->ip += 1;
1946
0
        break;
1947
1948
0
      case RE_OPCODE_WORD_BOUNDARY:
1949
0
      case RE_OPCODE_NON_WORD_BOUNDARY:
1950
0
        if (input - input_incr + character_size <=
1951
0
                input_data + input_forwards_size &&
1952
0
            input - input_incr >= input_data - input_backwards_size)
1953
0
        {
1954
0
          prev_is_word_char = _yr_re_is_word_char(
1955
0
              input - input_incr, character_size);
1956
0
        }
1957
0
        else
1958
0
        {
1959
0
          prev_is_word_char = false;
1960
0
        }
1961
1962
0
        if (input + character_size <= input_data + input_forwards_size &&
1963
0
            input >= input_data - input_backwards_size)
1964
0
        {
1965
0
          input_is_word_char = _yr_re_is_word_char(input, character_size);
1966
0
        }
1967
0
        else
1968
0
        {
1969
0
          input_is_word_char = false;
1970
0
        }
1971
1972
0
        match = (prev_is_word_char && !input_is_word_char) ||
1973
0
                (!prev_is_word_char && input_is_word_char);
1974
1975
0
        if (*ip == RE_OPCODE_NON_WORD_BOUNDARY)
1976
0
          match = !match;
1977
1978
0
        action = match ? ACTION_CONTINUE : ACTION_KILL;
1979
0
        fiber->ip += 1;
1980
0
        break;
1981
1982
0
      case RE_OPCODE_MATCH_AT_START:
1983
0
        if (flags & RE_FLAGS_BACKWARDS)
1984
0
          kill = input_backwards_size > (size_t) bytes_matched;
1985
0
        else
1986
0
          kill = input_backwards_size > 0 || (bytes_matched != 0);
1987
0
        action = kill ? ACTION_KILL : ACTION_CONTINUE;
1988
0
        fiber->ip += 1;
1989
0
        break;
1990
1991
0
      case RE_OPCODE_MATCH_AT_END:
1992
0
        kill = flags & RE_FLAGS_BACKWARDS ||
1993
0
               input_forwards_size > (size_t) bytes_matched;
1994
0
        action = kill ? ACTION_KILL : ACTION_CONTINUE;
1995
0
        fiber->ip += 1;
1996
0
        break;
1997
1998
0
      case RE_OPCODE_MATCH:
1999
2000
0
        if (matches != NULL)
2001
0
          *matches = bytes_matched;
2002
2003
0
        if (flags & RE_FLAGS_EXHAUSTIVE)
2004
0
        {
2005
0
          if (callback != NULL)
2006
0
          {
2007
0
            if (flags & RE_FLAGS_BACKWARDS)
2008
0
            {
2009
0
              FAIL_ON_ERROR_WITH_CLEANUP(
2010
0
                  callback(
2011
0
                      input + character_size,
2012
0
                      bytes_matched,
2013
0
                      flags,
2014
0
                      callback_args),
2015
0
                  _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2016
0
            }
2017
0
            else
2018
0
            {
2019
0
              FAIL_ON_ERROR_WITH_CLEANUP(
2020
0
                  callback(input_data, bytes_matched, flags, callback_args),
2021
0
                  _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2022
0
            }
2023
0
          }
2024
2025
0
          action = ACTION_KILL;
2026
0
        }
2027
0
        else
2028
0
        {
2029
0
          action = ACTION_KILL_TAIL;
2030
0
        }
2031
2032
0
        break;
2033
2034
0
      default:
2035
0
        assert(false);
2036
0
      }
2037
2038
0
      switch (action)
2039
0
      {
2040
0
      case ACTION_KILL:
2041
0
        fiber = _yr_re_fiber_kill(&fibers, &context->re_fiber_pool, fiber);
2042
0
        break;
2043
2044
0
      case ACTION_KILL_TAIL:
2045
0
        _yr_re_fiber_kill_tail(&fibers, &context->re_fiber_pool, fiber);
2046
0
        fiber = NULL;
2047
0
        break;
2048
2049
0
      case ACTION_CONTINUE:
2050
0
        FAIL_ON_ERROR_WITH_CLEANUP(
2051
0
            _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
2052
0
            _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2053
0
        break;
2054
2055
0
      default:
2056
0
        next_fiber = fiber->next;
2057
0
        FAIL_ON_ERROR_WITH_CLEANUP(
2058
0
            _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
2059
0
            _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2060
0
        fiber = next_fiber;
2061
0
      }
2062
0
    }
2063
2064
0
    input += input_incr;
2065
0
    bytes_matched += character_size;
2066
2067
0
    if (flags & RE_FLAGS_SCAN && bytes_matched < max_bytes_matched)
2068
0
    {
2069
0
      FAIL_ON_ERROR_WITH_CLEANUP(
2070
0
          _yr_re_fiber_create(&context->re_fiber_pool, &fiber),
2071
0
          _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2072
2073
0
      fiber->ip = code;
2074
0
      _yr_re_fiber_append(&fibers, fiber);
2075
2076
0
      FAIL_ON_ERROR_WITH_CLEANUP(
2077
0
          _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
2078
0
          _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2079
0
    }
2080
0
  }
2081
2082
0
  return ERROR_SUCCESS;
2083
0
}
2084
2085
////////////////////////////////////////////////////////////////////////////////
2086
// Helper function that creates a RE_FAST_EXEC_POSITION by either allocating it
2087
// or reusing a previously allocated one from a pool.
2088
//
2089
static int _yr_re_fast_exec_position_create(
2090
    RE_FAST_EXEC_POSITION_POOL* pool,
2091
    RE_FAST_EXEC_POSITION** new_position)
2092
0
{
2093
0
  RE_FAST_EXEC_POSITION* position;
2094
2095
0
  if (pool->head != NULL)
2096
0
  {
2097
0
    position = pool->head;
2098
0
    pool->head = position->next;
2099
0
  }
2100
0
  else
2101
0
  {
2102
0
    position = (RE_FAST_EXEC_POSITION*) yr_malloc(
2103
0
        sizeof(RE_FAST_EXEC_POSITION));
2104
2105
0
    if (position == NULL)
2106
0
      return ERROR_INSUFFICIENT_MEMORY;
2107
0
  }
2108
2109
0
  position->input = NULL;
2110
0
  position->round = 0;
2111
0
  position->next = NULL;
2112
0
  position->prev = NULL;
2113
2114
0
  *new_position = position;
2115
2116
0
  return ERROR_SUCCESS;
2117
0
}
2118
2119
////////////////////////////////////////////////////////////////////////////////
2120
// Helper function that moves a list of RE_FAST_EXEC_POSITION structures to a
2121
// pool represented by a RE_FAST_EXEC_POSITION_POOL. Receives pointers to the
2122
// pool, the first item, and last item in the list.
2123
//
2124
static void _yr_re_fast_exec_destroy_position_list(
2125
    RE_FAST_EXEC_POSITION_POOL* pool,
2126
    RE_FAST_EXEC_POSITION* first,
2127
    RE_FAST_EXEC_POSITION* last)
2128
0
{
2129
0
  last->next = pool->head;
2130
2131
0
  if (pool->head != NULL)
2132
0
    pool->head->prev = last;
2133
2134
0
  pool->head = first;
2135
0
}
2136
2137
////////////////////////////////////////////////////////////////////////////////
2138
// This function replaces yr_re_exec for regular expressions marked with flag
2139
// RE_FLAGS_FAST_REGEXP. These regular expressions are derived from hex strings
2140
// that don't contain alternatives, like for example:
2141
//
2142
//   { 01 02 03 04 [0-2] 04 05 06 07 }
2143
//
2144
// The regexp's code produced by such strings can be matched by a faster, less
2145
// general algorithm, and it contains only the following opcodes:
2146
//
2147
//   * RE_OPCODE_LITERAL
2148
//   * RE_OPCODE_MASKED_LITERAL,
2149
//   * RE_OPCODE_NOT_LITERAL
2150
//   * RE_OPCODE_MASKED_NOT_LITERAL
2151
//   * RE_OPCODE_ANY
2152
//   * RE_OPCODE_REPEAT_ANY_UNGREEDY
2153
//   * RE_OPCODE_MATCH.
2154
//
2155
// The regexp's code is executed one instruction at time, and the function
2156
// maintains a list of positions within the input for tracking different
2157
// matching alternatives, these positions are described by an instance of
2158
// RE_FAST_EXEC_POSITION. With each instruction the list of positions is
2159
// updated, removing those where the input data doesn't match the current
2160
// instruction, or adding new positions if the instruction is
2161
// RE_OPCODE_REPEAT_ANY_UNGREEDY. The list of positions is maintained sorted
2162
// by the value of the pointer they hold to the input, and it doesn't contain
2163
// duplicated pointer values.
2164
//
2165
int yr_re_fast_exec(
2166
    YR_SCAN_CONTEXT* context,
2167
    const uint8_t* code,
2168
    const uint8_t* input_data,
2169
    size_t input_forwards_size,
2170
    size_t input_backwards_size,
2171
    int flags,
2172
    RE_MATCH_CALLBACK_FUNC callback,
2173
    void* callback_args,
2174
    int* matches)
2175
0
{
2176
0
  RE_REPEAT_ANY_ARGS* repeat_any_args;
2177
2178
  // Pointers to the first and last position in the list.
2179
0
  RE_FAST_EXEC_POSITION* first;
2180
0
  RE_FAST_EXEC_POSITION* last;
2181
2182
0
  int input_incr = flags & RE_FLAGS_BACKWARDS ? -1 : 1;
2183
0
  int bytes_matched;
2184
0
  int max_bytes_matched;
2185
2186
0
  if (flags & RE_FLAGS_BACKWARDS)
2187
0
    max_bytes_matched = (int) yr_min(input_backwards_size, YR_RE_SCAN_LIMIT);
2188
0
  else
2189
0
    max_bytes_matched = (int) yr_min(input_forwards_size, YR_RE_SCAN_LIMIT);
2190
2191
0
  const uint8_t* ip = code;
2192
2193
  // Create the first position in the list, which points to the start of the
2194
  // input data. Intially this is the only position, more positions will be
2195
  // created every time RE_OPCODE_REPEAT_ANY_UNGREEDY is executed.
2196
0
  FAIL_ON_ERROR(_yr_re_fast_exec_position_create(
2197
0
      &context->re_fast_exec_position_pool, &first));
2198
2199
0
  first->round = 0;
2200
0
  first->input = input_data;
2201
0
  first->prev = NULL;
2202
0
  first->next = NULL;
2203
2204
0
  if (flags & RE_FLAGS_BACKWARDS)
2205
0
    first->input--;
2206
2207
  // As we are starting with a single position, the last one and the first one
2208
  // are the same.
2209
0
  last = first;
2210
2211
  // Round is incremented with every regxp instruction.
2212
0
  int round = 0;
2213
2214
  // Keep in the loop until the list of positions gets empty. Positions are
2215
  // removed from the list when they don't match the current instruction in
2216
  // the code.
2217
0
  while (first != NULL)
2218
0
  {
2219
0
    RE_FAST_EXEC_POSITION* current = first;
2220
2221
    // Iterate over all the positions, executing the current instruction at
2222
    // all of them.
2223
0
    while (current != NULL)
2224
0
    {
2225
0
      RE_FAST_EXEC_POSITION* next = current->next;
2226
2227
      // Ignore any position in the list whose round number is different from
2228
      // the current round. This prevents new positions added to the list in
2229
      // this round from being taken into account in this same round.
2230
0
      if (current->round != round)
2231
0
      {
2232
0
        current = next;
2233
0
        continue;
2234
0
      }
2235
2236
0
      bytes_matched = flags & RE_FLAGS_BACKWARDS
2237
0
                          ? (int) (input_data - current->input - 1)
2238
0
                          : (int) (current->input - input_data);
2239
2240
0
      uint16_t opcode_args;
2241
0
      uint8_t mask;
2242
0
      uint8_t value;
2243
2244
0
      bool match = false;
2245
2246
0
      switch (*ip)
2247
0
      {
2248
0
      case RE_OPCODE_ANY:
2249
0
        if (bytes_matched >= max_bytes_matched)
2250
0
          break;
2251
2252
0
        match = true;
2253
0
        current->input += input_incr;
2254
0
        break;
2255
2256
0
      case RE_OPCODE_LITERAL:
2257
0
        if (bytes_matched >= max_bytes_matched)
2258
0
          break;
2259
2260
0
        if (*current->input == *(ip + 1))
2261
0
        {
2262
0
          match = true;
2263
0
          current->input += input_incr;
2264
0
        }
2265
0
        break;
2266
2267
0
      case RE_OPCODE_NOT_LITERAL:
2268
0
        if (bytes_matched >= max_bytes_matched)
2269
0
          break;
2270
2271
0
        if (*current->input != *(ip + 1))
2272
0
        {
2273
0
          match = true;
2274
0
          current->input += input_incr;
2275
0
        }
2276
0
        break;
2277
2278
0
      case RE_OPCODE_MASKED_LITERAL:
2279
0
        if (bytes_matched >= max_bytes_matched)
2280
0
          break;
2281
2282
0
        opcode_args = yr_unaligned_u16(ip + 1);
2283
0
        mask = opcode_args >> 8;
2284
0
        value = opcode_args & 0xFF;
2285
2286
0
        if ((*current->input & mask) == value)
2287
0
        {
2288
0
          match = true;
2289
0
          current->input += input_incr;
2290
0
        }
2291
0
        break;
2292
2293
0
      case RE_OPCODE_MASKED_NOT_LITERAL:
2294
0
        if (bytes_matched >= max_bytes_matched)
2295
0
          break;
2296
2297
0
        opcode_args = yr_unaligned_u16(ip + 1);
2298
0
        mask = opcode_args >> 8;
2299
0
        value = opcode_args & 0xFF;
2300
2301
0
        if ((*current->input & mask) != value)
2302
0
        {
2303
0
          match = true;
2304
0
          current->input += input_incr;
2305
0
        }
2306
0
        break;
2307
2308
0
      case RE_OPCODE_REPEAT_ANY_UNGREEDY:
2309
0
        repeat_any_args = (RE_REPEAT_ANY_ARGS*) (ip + 1);
2310
2311
0
        if (bytes_matched + repeat_any_args->min >= max_bytes_matched)
2312
0
          break;
2313
2314
0
        match = true;
2315
2316
0
        const uint8_t* next_opcode = ip + 1 + sizeof(RE_REPEAT_ANY_ARGS);
2317
2318
        // insertion_point indicates the item in the list of inputs after which
2319
        // the newly created inputs will be inserted.
2320
0
        RE_FAST_EXEC_POSITION* insertion_point = current;
2321
2322
0
        for (int j = repeat_any_args->min + 1; j <= repeat_any_args->max; j++)
2323
0
        {
2324
          // bytes_matched is the number of bytes matched so far, and j is the
2325
          // minimum number of bytes that are still pending to match. If these
2326
          // two numbers sum more than the maximum number of bytes that can be
2327
          // matched the loop can be aborted. The position that we were about
2328
          // to create can't lead to a match without exceeding
2329
          // max_bytes_matched.
2330
0
          if (bytes_matched + j >= max_bytes_matched)
2331
0
            break;
2332
2333
0
          const uint8_t* next_input = current->input + j * input_incr;
2334
2335
          // Find the point where next_input should be inserted. The list is
2336
          // kept sorted by pointer in increasing order, the insertion point is
2337
          // an item that has a pointer lower or equal than next_input, but
2338
          // whose next item have a pointer that is larger.
2339
0
          while (insertion_point->next != NULL &&
2340
0
                 insertion_point->next->input <= next_input)
2341
0
          {
2342
0
            insertion_point = insertion_point->next;
2343
0
          }
2344
2345
          // If the input already exists for the next round, we don't need to
2346
          // insert it.
2347
0
          if (insertion_point->round == round + 1 &&
2348
0
              insertion_point->input == next_input)
2349
0
            continue;
2350
2351
          // The next opcode is RE_OPCODE_LITERAL, but the literal doesn't
2352
          // match with the byte at next_input, we don't need to add this
2353
          // input to the list as we already know that it won't match in the
2354
          // next round and will be deleted from the list anyways.
2355
0
          if (*(next_opcode) == RE_OPCODE_LITERAL &&
2356
0
              *(next_opcode + 1) != *next_input)
2357
0
            continue;
2358
2359
0
          RE_FAST_EXEC_POSITION* new_input;
2360
2361
0
          FAIL_ON_ERROR_WITH_CLEANUP(
2362
0
              _yr_re_fast_exec_position_create(
2363
0
                  &context->re_fast_exec_position_pool, &new_input),
2364
              // Cleanup
2365
0
              _yr_re_fast_exec_destroy_position_list(
2366
0
                  &context->re_fast_exec_position_pool, first, last));
2367
2368
0
          new_input->input = next_input;
2369
0
          new_input->round = round + 1;
2370
0
          new_input->prev = insertion_point;
2371
0
          new_input->next = insertion_point->next;
2372
0
          insertion_point->next = new_input;
2373
2374
0
          if (new_input->next != NULL)
2375
0
            new_input->next->prev = new_input;
2376
2377
0
          if (insertion_point == last)
2378
0
            last = new_input;
2379
0
        }
2380
2381
0
        current->input += input_incr * repeat_any_args->min;
2382
0
        break;
2383
2384
0
      case RE_OPCODE_MATCH:
2385
2386
0
        if (flags & RE_FLAGS_EXHAUSTIVE)
2387
0
        {
2388
0
          FAIL_ON_ERROR_WITH_CLEANUP(
2389
0
              callback(
2390
                  // When matching forwards the matching data always starts at
2391
                  // input_data, when matching backwards it starts at input + 1
2392
                  // or input_data - input_backwards_size if input + 1 is
2393
                  // outside the buffer.
2394
0
                  flags & RE_FLAGS_BACKWARDS
2395
0
                      ? yr_max(
2396
0
                            current->input + 1,
2397
0
                            input_data - input_backwards_size)
2398
0
                      : input_data,
2399
                  // The number of matched bytes should not be larger than
2400
                  // max_bytes_matched.
2401
0
                  yr_min(bytes_matched, max_bytes_matched),
2402
0
                  flags,
2403
0
                  callback_args),
2404
              // Cleanup
2405
0
              _yr_re_fast_exec_destroy_position_list(
2406
0
                  &context->re_fast_exec_position_pool, first, last));
2407
0
        }
2408
0
        else
2409
0
        {
2410
0
          if (matches != NULL)
2411
0
            *matches = bytes_matched;
2412
2413
0
          _yr_re_fast_exec_destroy_position_list(
2414
0
              &context->re_fast_exec_position_pool, first, last);
2415
2416
0
          return ERROR_SUCCESS;
2417
0
        }
2418
0
        break;
2419
2420
0
      default:
2421
0
        printf("non-supported opcode: %d\n", *ip);
2422
0
        assert(false);
2423
0
      }
2424
2425
0
      if (match)
2426
0
      {
2427
0
        current->round = round + 1;
2428
0
      }
2429
0
      else
2430
0
      {
2431
        // Remove current from the list. If the item being removed is the first
2432
        // one or the last one, the first and last pointers should be updated.
2433
        // The removed item is put into the pool for later reuse.
2434
2435
0
        if (current == first)
2436
0
          first = current->next;
2437
2438
0
        if (current == last)
2439
0
          last = current->prev;
2440
2441
0
        if (current->prev != NULL)
2442
0
          current->prev->next = current->next;
2443
2444
0
        if (current->next != NULL)
2445
0
          current->next->prev = current->prev;
2446
2447
0
        current->prev = NULL;
2448
0
        current->next = context->re_fast_exec_position_pool.head;
2449
2450
0
        if (current->next != NULL)
2451
0
          current->next->prev = current;
2452
2453
0
        context->re_fast_exec_position_pool.head = current;
2454
0
      }
2455
2456
0
      current = next;
2457
2458
0
    }  // while (current != NULL)
2459
2460
    // Move instruction pointer (ip) to next instruction. Each opcode has
2461
    // a different size.
2462
0
    switch (*ip)
2463
0
    {
2464
0
    case RE_OPCODE_ANY:
2465
0
      ip += 1;
2466
0
      break;
2467
0
    case RE_OPCODE_LITERAL:
2468
0
    case RE_OPCODE_NOT_LITERAL:
2469
0
      ip += 2;
2470
0
      break;
2471
0
    case RE_OPCODE_MASKED_LITERAL:
2472
0
    case RE_OPCODE_MASKED_NOT_LITERAL:
2473
0
      ip += 3;
2474
0
      break;
2475
0
    case RE_OPCODE_REPEAT_ANY_UNGREEDY:
2476
0
      ip += 1 + sizeof(RE_REPEAT_ANY_ARGS);
2477
0
      break;
2478
0
    case RE_OPCODE_MATCH:
2479
0
      break;
2480
0
    default:
2481
0
      assert(false);
2482
0
    }
2483
2484
0
    round++;
2485
0
  }
2486
2487
  // If this point is reached no matches were found.
2488
2489
0
  if (matches != NULL)
2490
0
    *matches = -1;
2491
2492
0
  return ERROR_SUCCESS;
2493
0
}
2494
2495
static void _yr_re_print_node(RE_NODE* re_node, uint32_t indent)
2496
0
{
2497
0
  RE_NODE* child;
2498
0
  int i;
2499
2500
0
  if (re_node == NULL)
2501
0
    return;
2502
2503
0
  if (indent > 0)
2504
0
    printf("\n%*s", indent, " ");
2505
0
  switch (re_node->type)
2506
0
  {
2507
0
  case RE_NODE_ALT:
2508
0
    printf("Alt(");
2509
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2510
0
    printf(",");
2511
0
    _yr_re_print_node(re_node->children_tail, indent + 4);
2512
0
    printf("\n%*s%s", indent, " ", ")");
2513
0
    break;
2514
2515
0
  case RE_NODE_CONCAT:
2516
0
    printf("Cat(");
2517
0
    child = re_node->children_head;
2518
0
    while (child != NULL)
2519
0
    {
2520
0
      _yr_re_print_node(child, indent + 4);
2521
0
      printf(",");
2522
0
      child = child->next_sibling;
2523
0
    }
2524
0
    printf("\n%*s%s", indent, " ", ")");
2525
0
    break;
2526
2527
0
  case RE_NODE_STAR:
2528
0
    printf("Star(");
2529
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2530
0
    printf(")");
2531
0
    break;
2532
2533
0
  case RE_NODE_PLUS:
2534
0
    printf("Plus(");
2535
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2536
0
    printf(")");
2537
0
    break;
2538
2539
0
  case RE_NODE_LITERAL:
2540
0
    printf("Lit(%c)", re_node->value);
2541
0
    break;
2542
2543
0
  case RE_NODE_NOT_LITERAL:
2544
0
    printf("NotLit(%c)", re_node->value);
2545
0
    break;
2546
2547
0
  case RE_NODE_MASKED_LITERAL:
2548
0
    printf("MaskedLit(%02X,%02X)", re_node->value, re_node->mask);
2549
0
    break;
2550
2551
0
  case RE_NODE_WORD_CHAR:
2552
0
    printf("WordChar");
2553
0
    break;
2554
2555
0
  case RE_NODE_NON_WORD_CHAR:
2556
0
    printf("NonWordChar");
2557
0
    break;
2558
2559
0
  case RE_NODE_SPACE:
2560
0
    printf("Space");
2561
0
    break;
2562
2563
0
  case RE_NODE_NON_SPACE:
2564
0
    printf("NonSpace");
2565
0
    break;
2566
2567
0
  case RE_NODE_DIGIT:
2568
0
    printf("Digit");
2569
0
    break;
2570
2571
0
  case RE_NODE_NON_DIGIT:
2572
0
    printf("NonDigit");
2573
0
    break;
2574
2575
0
  case RE_NODE_ANY:
2576
0
    printf("Any");
2577
0
    break;
2578
2579
0
  case RE_NODE_RANGE:
2580
0
    printf("Range(%d-%d, ", re_node->start, re_node->end);
2581
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2582
0
    printf("\n%*s%s", indent, " ", ")");
2583
0
    break;
2584
2585
0
  case RE_NODE_CLASS:
2586
0
    printf("Class(");
2587
0
    for (i = 0; i < 256; i++)
2588
0
      if (_yr_re_is_char_in_class(re_node->re_class, i, false))
2589
0
        printf("%02X,", i);
2590
0
    printf(")");
2591
0
    break;
2592
2593
0
  case RE_NODE_EMPTY:
2594
0
    printf("Empty");
2595
0
    break;
2596
2597
0
  case RE_NODE_ANCHOR_START:
2598
0
    printf("AnchorStart");
2599
0
    break;
2600
2601
0
  case RE_NODE_ANCHOR_END:
2602
0
    printf("AnchorEnd");
2603
0
    break;
2604
2605
0
  case RE_NODE_WORD_BOUNDARY:
2606
0
    printf("WordBoundary");
2607
0
    break;
2608
2609
0
  case RE_NODE_NON_WORD_BOUNDARY:
2610
0
    printf("NonWordBoundary");
2611
0
    break;
2612
2613
0
  case RE_NODE_RANGE_ANY:
2614
0
    printf("RangeAny");
2615
0
    break;
2616
2617
0
  default:
2618
0
    printf("???");
2619
0
    break;
2620
0
  }
2621
0
}
2622
2623
void yr_re_print(RE_AST* re_ast)
2624
0
{
2625
0
  _yr_re_print_node(re_ast->root_node, 0);
2626
0
}