Coverage Report

Created: 2025-07-18 06:53

/src/yara/libyara/re.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2013. The YARA Authors. All Rights Reserved.
3
4
Redistribution and use in source and binary forms, with or without modification,
5
are permitted provided that the following conditions are met:
6
7
1. Redistributions of source code must retain the above copyright notice, this
8
list of conditions and the following disclaimer.
9
10
2. Redistributions in binary form must reproduce the above copyright notice,
11
this list of conditions and the following disclaimer in the documentation and/or
12
other materials provided with the distribution.
13
14
3. Neither the name of the copyright holder nor the names of its contributors
15
may be used to endorse or promote products derived from this software without
16
specific prior written permission.
17
18
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
*/
29
30
/*
31
32
This module implements a regular expressions engine based on Thompson's
33
algorithm as described by Russ Cox in http://swtch.com/~rsc/regexp/regexp2.html.
34
35
What the article names a "thread" has been named a "fiber" in this code, in
36
order to avoid confusion with operating system threads.
37
38
*/
39
40
#include <assert.h>
41
#include <string.h>
42
#include <yara/compiler.h>
43
#include <yara/error.h>
44
#include <yara/globals.h>
45
#include <yara/hex_lexer.h>
46
#include <yara/limits.h>
47
#include <yara/mem.h>
48
#include <yara/re.h>
49
#include <yara/re_lexer.h>
50
#include <yara/strutils.h>
51
#include <yara/threading.h>
52
#include <yara/unaligned.h>
53
#include <yara/utils.h>
54
55
358M
#define EMIT_BACKWARDS               0x01
56
199M
#define EMIT_DONT_SET_FORWARDS_CODE  0x02
57
4.96M
#define EMIT_DONT_SET_BACKWARDS_CODE 0x04
58
59
#ifndef INT16_MAX
60
#define INT16_MAX (32767)
61
#endif
62
63
typedef uint8_t RE_SPLIT_ID_TYPE;
64
65
// RE_REPEAT_ARGS and RE_REPEAT_ANY_ARGS are structures that are embedded in
66
// the regexp's instruction stream. As such, they are not always aligned to
67
// 8-byte boundaries, so they need to be "packed" structures in order to prevent
68
// issues due to unaligned memory accesses.
69
#pragma pack(push)
70
#pragma pack(1)
71
72
typedef struct _RE_REPEAT_ARGS
73
{
74
  uint16_t min;
75
  uint16_t max;
76
  int32_t offset;
77
78
} RE_REPEAT_ARGS;
79
80
typedef struct _RE_REPEAT_ANY_ARGS
81
{
82
  uint16_t min;
83
  uint16_t max;
84
85
} RE_REPEAT_ANY_ARGS;
86
87
#pragma pack(pop)
88
89
typedef struct _RE_EMIT_CONTEXT
90
{
91
  YR_ARENA* arena;
92
  RE_SPLIT_ID_TYPE next_split_id;
93
94
} RE_EMIT_CONTEXT;
95
96
0
#define CHAR_IN_CLASS(cls, chr) ((cls)[(chr) / 8] & 1 << ((chr) % 8))
97
98
static bool _yr_re_is_char_in_class(
99
    RE_CLASS* re_class,
100
    uint8_t chr,
101
    int case_insensitive)
102
0
{
103
0
  int result = CHAR_IN_CLASS(re_class->bitmap, chr);
104
105
0
  if (case_insensitive)
106
0
    result |= CHAR_IN_CLASS(re_class->bitmap, yr_altercase[chr]);
107
108
0
  if (re_class->negated)
109
0
    result = !result;
110
111
0
  return result;
112
0
}
113
114
static bool _yr_re_is_word_char(const uint8_t* input, uint8_t character_size)
115
0
{
116
0
  int result = ((yr_isalnum(input) || (*input) == '_'));
117
118
0
  if (character_size == 2)
119
0
    result = result && (*(input + 1) == 0);
120
121
0
  return result;
122
0
}
123
124
RE_NODE* yr_re_node_create(int type)
125
2.71M
{
126
2.71M
  RE_NODE* result = (RE_NODE*) yr_malloc(sizeof(RE_NODE));
127
128
2.71M
  if (result != NULL)
129
2.71M
  {
130
2.71M
    result->type = type;
131
2.71M
    result->children_head = NULL;
132
2.71M
    result->children_tail = NULL;
133
2.71M
    result->prev_sibling = NULL;
134
2.71M
    result->next_sibling = NULL;
135
2.71M
    result->greedy = true;
136
2.71M
    result->forward_code_ref = YR_ARENA_NULL_REF;
137
2.71M
    result->backward_code_ref = YR_ARENA_NULL_REF;
138
2.71M
  }
139
140
2.71M
  return result;
141
2.71M
}
142
143
void yr_re_node_destroy(RE_NODE* node)
144
2.71M
{
145
2.71M
  RE_NODE* child = node->children_head;
146
2.71M
  RE_NODE* next_child;
147
148
5.38M
  while (child != NULL)
149
2.66M
  {
150
2.66M
    next_child = child->next_sibling;
151
2.66M
    yr_re_node_destroy(child);
152
2.66M
    child = next_child;
153
2.66M
  }
154
155
2.71M
  if (node->type == RE_NODE_CLASS)
156
1.25k
    yr_free(node->re_class);
157
158
2.71M
  yr_free(node);
159
2.71M
}
160
161
////////////////////////////////////////////////////////////////////////////////
162
// Appends a node to the end of the children list.
163
//
164
void yr_re_node_append_child(RE_NODE* node, RE_NODE* child)
165
2.67M
{
166
2.67M
  if (node->children_head == NULL)
167
128k
    node->children_head = child;
168
169
2.67M
  if (node->children_tail != NULL)
170
2.54M
    node->children_tail->next_sibling = child;
171
172
2.67M
  child->prev_sibling = node->children_tail;
173
2.67M
  node->children_tail = child;
174
2.67M
}
175
176
////////////////////////////////////////////////////////////////////////////////
177
// Appends a node to the beginning of the children list.
178
//
179
void yr_re_node_prepend_child(RE_NODE* node, RE_NODE* child)
180
1.86k
{
181
1.86k
  child->next_sibling = node->children_head;
182
183
1.86k
  if (node->children_head != NULL)
184
1.86k
    node->children_head->prev_sibling = child;
185
186
1.86k
  node->children_head = child;
187
188
1.86k
  if (node->children_tail == NULL)
189
0
    node->children_tail = child;
190
1.86k
}
191
192
int yr_re_ast_create(RE_AST** re_ast)
193
34.0k
{
194
34.0k
  *re_ast = (RE_AST*) yr_malloc(sizeof(RE_AST));
195
196
34.0k
  if (*re_ast == NULL)
197
0
    return ERROR_INSUFFICIENT_MEMORY;
198
199
34.0k
  (*re_ast)->flags = 0;
200
34.0k
  (*re_ast)->root_node = NULL;
201
202
34.0k
  return ERROR_SUCCESS;
203
34.0k
}
204
205
void yr_re_ast_destroy(RE_AST* re_ast)
206
34.0k
{
207
34.0k
  if (re_ast->root_node != NULL)
208
33.3k
    yr_re_node_destroy(re_ast->root_node);
209
210
34.0k
  yr_free(re_ast);
211
34.0k
}
212
213
////////////////////////////////////////////////////////////////////////////////
214
// Parses a regexp but don't emit its code. A further call to
215
// yr_re_ast_emit_code is required to get the code.
216
//
217
int yr_re_parse(
218
    const char* re_string,
219
    RE_AST** re_ast,
220
    RE_ERROR* error,
221
    int flags)
222
27.4k
{
223
27.4k
  return yr_parse_re_string(re_string, re_ast, error, flags);
224
27.4k
}
225
226
////////////////////////////////////////////////////////////////////////////////
227
// Parses a hex string but don't emit its code. A further call to
228
// yr_re_ast_emit_code is required to get the code.
229
//
230
int yr_re_parse_hex(const char* hex_string, RE_AST** re_ast, RE_ERROR* error)
231
1.27k
{
232
1.27k
  return yr_parse_hex_string(hex_string, re_ast, error);
233
1.27k
}
234
235
////////////////////////////////////////////////////////////////////////////////
236
// Parses the regexp and emit its code to the provided to the
237
// YR_RE_CODE_SECTION in the specified arena.
238
//
239
int yr_re_compile(
240
    const char* re_string,
241
    int flags,
242
    int parser_flags,
243
    YR_ARENA* arena,
244
    YR_ARENA_REF* ref,
245
    RE_ERROR* error)
246
1.50k
{
247
1.50k
  RE_AST* re_ast;
248
1.50k
  RE _re;
249
1.50k
  int result;
250
251
1.50k
  result = yr_re_parse(re_string, &re_ast, error, parser_flags);
252
1.50k
  if (result != ERROR_UNKNOWN_ESCAPE_SEQUENCE)
253
1.50k
    FAIL_ON_ERROR(result);
254
255
1.32k
  _re.flags = flags;
256
257
1.32k
  FAIL_ON_ERROR_WITH_CLEANUP(
258
1.32k
      yr_arena_write_data(arena, YR_RE_CODE_SECTION, &_re, sizeof(_re), ref),
259
1.32k
      yr_re_ast_destroy(re_ast));
260
261
1.32k
  FAIL_ON_ERROR_WITH_CLEANUP(
262
1.32k
      yr_re_ast_emit_code(re_ast, arena, false), yr_re_ast_destroy(re_ast));
263
264
1.27k
  yr_re_ast_destroy(re_ast);
265
266
1.27k
  return result;
267
1.32k
}
268
269
////////////////////////////////////////////////////////////////////////////////
270
// Verifies if the target string matches the pattern
271
//
272
// Args:
273
//    context: Scan context
274
//    re: A pointer to a compiled regexp
275
//    target: Target string
276
//
277
// Returns:
278
//    See return codes for yr_re_exec
279
//
280
int yr_re_match(YR_SCAN_CONTEXT* context, RE* re, const char* target)
281
0
{
282
0
  int result;
283
284
0
  yr_re_exec(
285
0
      context,
286
0
      re->code,
287
0
      (uint8_t*) target,
288
0
      strlen(target),
289
0
      0,
290
0
      re->flags | RE_FLAGS_SCAN,
291
0
      NULL,
292
0
      NULL,
293
0
      &result);
294
295
0
  return result;
296
0
}
297
298
////////////////////////////////////////////////////////////////////////////////
299
// Verifies if the provided regular expression is just a literal string
300
// like "abc", "12345", without any wildcard, operator, etc. In that case
301
// returns the string as a SIZED_STRING, or returns NULL if otherwise.
302
//
303
// The caller is responsible for deallocating the returned SIZED_STRING by
304
// calling yr_free.
305
//
306
SIZED_STRING* yr_re_ast_extract_literal(RE_AST* re_ast)
307
31.7k
{
308
31.7k
  SIZED_STRING* string;
309
31.7k
  RE_NODE* child;
310
311
31.7k
  int length = 0;
312
313
31.7k
  if (re_ast->root_node->type == RE_NODE_LITERAL)
314
234
  {
315
234
    length = 1;
316
234
  }
317
31.4k
  else if (re_ast->root_node->type == RE_NODE_CONCAT)
318
30.6k
  {
319
30.6k
    child = re_ast->root_node->children_tail;
320
321
158k
    while (child != NULL && child->type == RE_NODE_LITERAL)
322
127k
    {
323
127k
      length++;
324
127k
      child = child->prev_sibling;
325
127k
    }
326
327
30.6k
    if (child != NULL)
328
6.72k
      return NULL;
329
30.6k
  }
330
880
  else
331
880
  {
332
880
    return NULL;
333
880
  }
334
335
24.1k
  string = (SIZED_STRING*) yr_malloc(sizeof(SIZED_STRING) + length);
336
337
24.1k
  if (string == NULL)
338
0
    return NULL;
339
340
24.1k
  string->length = length;
341
24.1k
  string->flags = 0;
342
343
24.1k
  if (re_ast->root_node->type == RE_NODE_LITERAL)
344
234
  {
345
234
    string->c_string[0] = re_ast->root_node->value;
346
234
  }
347
23.8k
  else
348
23.8k
  {
349
23.8k
    child = re_ast->root_node->children_tail;
350
351
146k
    while (child != NULL)
352
122k
    {
353
122k
      string->c_string[--length] = child->value;
354
122k
      child = child->prev_sibling;
355
122k
    }
356
23.8k
  }
357
358
24.1k
  string->c_string[string->length] = '\0';
359
360
24.1k
  return string;
361
24.1k
}
362
363
int _yr_re_node_has_unbounded_quantifier_for_dot(RE_NODE* re_node)
364
176k
{
365
176k
  RE_NODE* child;
366
367
176k
  if ((re_node->type == RE_NODE_STAR || re_node->type == RE_NODE_PLUS) &&
368
176k
      re_node->children_head->type == RE_NODE_ANY)
369
377
    return true;
370
371
176k
  if (re_node->type == RE_NODE_RANGE_ANY && re_node->end == RE_MAX_RANGE)
372
1.23k
    return true;
373
374
175k
  if (re_node->type == RE_NODE_CONCAT)
375
29.9k
  {
376
29.9k
    child = re_node->children_tail;
377
378
175k
    while (child != NULL)
379
150k
    {
380
150k
      if (_yr_re_node_has_unbounded_quantifier_for_dot(child))
381
5.08k
        return true;
382
383
145k
      child = child->prev_sibling;
384
145k
    }
385
29.9k
  }
386
387
170k
  return false;
388
175k
}
389
390
////////////////////////////////////////////////////////////////////////////////
391
// Detects the use of .*, .+ or .{x,} in a regexp. The use of wildcards with
392
// quantifiers that don't have a reasonably small upper bound causes a
393
// performance penalty. This function detects such cases in order to warn the
394
// user about this.
395
//
396
int yr_re_ast_has_unbounded_quantifier_for_dot(RE_AST* re_ast)
397
26.3k
{
398
26.3k
  return _yr_re_node_has_unbounded_quantifier_for_dot(re_ast->root_node);
399
26.3k
}
400
401
////////////////////////////////////////////////////////////////////////////////
402
// In some cases splitting a regular expression (or hex string) in two parts is
403
// convenient for increasing performance. This happens when the pattern contains
404
// a large gap (a.k.a jump), for example: { 01 02 03 [0-999] 04 05 06 }
405
// In this case the string is splitted in { 01 02 03 } and { 04 05 06 } where
406
// the latter is chained to the former. This means that { 01 02 03 } and
407
// { 04 05 06 } are handled as individual strings, and when both of them are
408
// found, YARA verifies if the distance between the matches complies with the
409
// [0-999] restriction.
410
//
411
// This function traverses a regexp's AST looking for nodes where it should be
412
// splitted. It must be noticed that this only applies to two-level ASTs (i.e.
413
// an AST consisting in a RE_NODE_CONCAT at the root where all the children are
414
// leaves).
415
//
416
// For example, { 01 02 03 [0-1000] 04 05 06 [500-2000] 07 08 09 } has the
417
// following AST:
418
//
419
// RE_NODE_CONCAT
420
// |
421
// |- RE_NODE_LITERAL (01)
422
// |- RE_NODE_LITERAL (02)
423
// |- RE_NODE_LITERAL (03)
424
// |- RE_NODE_RANGE_ANY (start=0, end=1000)
425
// |- RE_NODE_LITERAL (04)
426
// |- RE_NODE_LITERAL (05)
427
// |- RE_NODE_LITERAL (06)
428
// |- RE_NODE_RANGE_ANY (start=500, end=2000)
429
// |- RE_NODE_LITERAL (07)
430
// |- RE_NODE_LITERAL (08)
431
// |- RE_NODE_LITERAL (09)
432
//
433
// If the AST above is passed in the re_ast argument, it will be trimmed to:
434
//
435
// RE_NODE_CONCAT
436
// |
437
// |- RE_NODE_LITERAL (01)
438
// |- RE_NODE_LITERAL (02)
439
// |- RE_NODE_LITERAL (03)
440
//
441
// While remainder_re_ast will be:
442
//
443
// RE_NODE_CONCAT
444
// |
445
// |- RE_NODE_LITERAL (04)
446
// |- RE_NODE_LITERAL (05)
447
// |- RE_NODE_LITERAL (06)
448
// |- RE_NODE_RANGE_ANY (start=500, end=2000)
449
// |- RE_NODE_LITERAL (07)
450
// |- RE_NODE_LITERAL (08)
451
// |- RE_NODE_LITERAL (09)
452
//
453
// The caller is responsible for freeing the new AST in remainder_re_ast by
454
// calling yr_re_ast_destroy.
455
//
456
// The integers pointed to by min_gap and max_gap will be filled with the
457
// minimum and maximum gap size between the sub-strings represented by the
458
// two ASTs.
459
//
460
int yr_re_ast_split_at_chaining_point(
461
    RE_AST* re_ast,
462
    RE_AST** remainder_re_ast,
463
    int32_t* min_gap,
464
    int32_t* max_gap)
465
31.7k
{
466
31.7k
  RE_NODE* child;
467
31.7k
  RE_NODE* concat;
468
469
31.7k
  int result;
470
471
31.7k
  *remainder_re_ast = NULL;
472
31.7k
  *min_gap = 0;
473
31.7k
  *max_gap = 0;
474
475
31.7k
  if (re_ast->root_node->type != RE_NODE_CONCAT)
476
1.11k
    return ERROR_SUCCESS;
477
478
30.6k
  child = re_ast->root_node->children_head;
479
480
182k
  while (child != NULL)
481
157k
  {
482
157k
    if (!child->greedy && child->type == RE_NODE_RANGE_ANY &&
483
157k
        child->prev_sibling != NULL && child->next_sibling != NULL &&
484
157k
        (child->start > YR_STRING_CHAINING_THRESHOLD ||
485
5.85k
         child->end > YR_STRING_CHAINING_THRESHOLD))
486
5.33k
    {
487
5.33k
      result = yr_re_ast_create(remainder_re_ast);
488
489
5.33k
      if (result != ERROR_SUCCESS)
490
0
        return result;
491
492
5.33k
      concat = yr_re_node_create(RE_NODE_CONCAT);
493
494
5.33k
      if (concat == NULL)
495
0
        return ERROR_INSUFFICIENT_MEMORY;
496
497
5.33k
      concat->children_head = child->next_sibling;
498
5.33k
      concat->children_tail = re_ast->root_node->children_tail;
499
500
5.33k
      re_ast->root_node->children_tail = child->prev_sibling;
501
502
5.33k
      child->prev_sibling->next_sibling = NULL;
503
5.33k
      child->next_sibling->prev_sibling = NULL;
504
505
5.33k
      *min_gap = child->start;
506
5.33k
      *max_gap = child->end;
507
508
5.33k
      (*remainder_re_ast)->root_node = concat;
509
5.33k
      (*remainder_re_ast)->flags = re_ast->flags;
510
511
5.33k
      yr_re_node_destroy(child);
512
513
5.33k
      return ERROR_SUCCESS;
514
5.33k
    }
515
516
152k
    child = child->next_sibling;
517
152k
  }
518
519
25.2k
  return ERROR_SUCCESS;
520
30.6k
}
521
522
int _yr_emit_inst(
523
    RE_EMIT_CONTEXT* emit_context,
524
    uint8_t opcode,
525
    YR_ARENA_REF* instruction_ref)
526
74.5M
{
527
74.5M
  FAIL_ON_ERROR(yr_arena_write_data(
528
74.5M
      emit_context->arena,
529
74.5M
      YR_RE_CODE_SECTION,
530
74.5M
      &opcode,
531
74.5M
      sizeof(uint8_t),
532
74.5M
      instruction_ref));
533
534
74.5M
  return ERROR_SUCCESS;
535
74.5M
}
536
537
int _yr_emit_inst_arg_uint8(
538
    RE_EMIT_CONTEXT* emit_context,
539
    uint8_t opcode,
540
    uint8_t argument,
541
    YR_ARENA_REF* instruction_ref,
542
    YR_ARENA_REF* argument_ref)
543
95.1M
{
544
95.1M
  FAIL_ON_ERROR(yr_arena_write_data(
545
95.1M
      emit_context->arena,
546
95.1M
      YR_RE_CODE_SECTION,
547
95.1M
      &opcode,
548
95.1M
      sizeof(uint8_t),
549
95.1M
      instruction_ref));
550
551
95.1M
  FAIL_ON_ERROR(yr_arena_write_data(
552
95.1M
      emit_context->arena,
553
95.1M
      YR_RE_CODE_SECTION,
554
95.1M
      &argument,
555
95.1M
      sizeof(uint8_t),
556
95.1M
      argument_ref));
557
558
95.1M
  return ERROR_SUCCESS;
559
95.1M
}
560
561
int _yr_emit_inst_arg_uint16(
562
    RE_EMIT_CONTEXT* emit_context,
563
    uint8_t opcode,
564
    uint16_t argument,
565
    YR_ARENA_REF* instruction_ref,
566
    YR_ARENA_REF* argument_ref)
567
7.63k
{
568
7.63k
  FAIL_ON_ERROR(yr_arena_write_data(
569
7.63k
      emit_context->arena,
570
7.63k
      YR_RE_CODE_SECTION,
571
7.63k
      &opcode,
572
7.63k
      sizeof(uint8_t),
573
7.63k
      instruction_ref));
574
575
7.63k
  FAIL_ON_ERROR(yr_arena_write_data(
576
7.63k
      emit_context->arena,
577
7.63k
      YR_RE_CODE_SECTION,
578
7.63k
      &argument,
579
7.63k
      sizeof(uint16_t),
580
7.63k
      argument_ref));
581
582
7.63k
  return ERROR_SUCCESS;
583
7.63k
}
584
585
int _yr_emit_inst_arg_uint32(
586
    RE_EMIT_CONTEXT* emit_context,
587
    uint8_t opcode,
588
    uint32_t argument,
589
    YR_ARENA_REF* instruction_ref,
590
    YR_ARENA_REF* argument_ref)
591
0
{
592
0
  FAIL_ON_ERROR(yr_arena_write_data(
593
0
      emit_context->arena,
594
0
      YR_RE_CODE_SECTION,
595
0
      &opcode,
596
0
      sizeof(uint8_t),
597
0
      instruction_ref));
598
599
0
  FAIL_ON_ERROR(yr_arena_write_data(
600
0
      emit_context->arena,
601
0
      YR_RE_CODE_SECTION,
602
0
      &argument,
603
0
      sizeof(uint32_t),
604
0
      argument_ref));
605
606
0
  return ERROR_SUCCESS;
607
0
}
608
609
int _yr_emit_inst_arg_int16(
610
    RE_EMIT_CONTEXT* emit_context,
611
    uint8_t opcode,
612
    int16_t argument,
613
    YR_ARENA_REF* instruction_ref,
614
    YR_ARENA_REF* argument_ref)
615
27.0k
{
616
27.0k
  FAIL_ON_ERROR(yr_arena_write_data(
617
27.0k
      emit_context->arena,
618
27.0k
      YR_RE_CODE_SECTION,
619
27.0k
      &opcode,
620
27.0k
      sizeof(uint8_t),
621
27.0k
      instruction_ref));
622
623
27.0k
  FAIL_ON_ERROR(yr_arena_write_data(
624
27.0k
      emit_context->arena,
625
27.0k
      YR_RE_CODE_SECTION,
626
27.0k
      &argument,
627
27.0k
      sizeof(int16_t),
628
27.0k
      argument_ref));
629
630
27.0k
  return ERROR_SUCCESS;
631
27.0k
}
632
633
int _yr_emit_inst_arg_struct(
634
    RE_EMIT_CONTEXT* emit_context,
635
    uint8_t opcode,
636
    void* structure,
637
    size_t structure_size,
638
    YR_ARENA_REF* instruction_ref,
639
    YR_ARENA_REF* argument_ref)
640
33.7M
{
641
33.7M
  FAIL_ON_ERROR(yr_arena_write_data(
642
33.7M
      emit_context->arena,
643
33.7M
      YR_RE_CODE_SECTION,
644
33.7M
      &opcode,
645
33.7M
      sizeof(uint8_t),
646
33.7M
      instruction_ref));
647
648
33.7M
  FAIL_ON_ERROR(yr_arena_write_data(
649
33.7M
      emit_context->arena,
650
33.7M
      YR_RE_CODE_SECTION,
651
33.7M
      structure,
652
33.7M
      structure_size,
653
33.7M
      argument_ref));
654
655
33.7M
  return ERROR_SUCCESS;
656
33.7M
}
657
658
int _yr_emit_split(
659
    RE_EMIT_CONTEXT* emit_context,
660
    uint8_t opcode,
661
    int16_t argument,
662
    YR_ARENA_REF* instruction_ref,
663
    YR_ARENA_REF* argument_ref)
664
35.3k
{
665
35.3k
  assert(opcode == RE_OPCODE_SPLIT_A || opcode == RE_OPCODE_SPLIT_B);
666
667
35.3k
  if (emit_context->next_split_id == RE_MAX_SPLIT_ID)
668
100
    return ERROR_REGULAR_EXPRESSION_TOO_COMPLEX;
669
670
35.2k
  FAIL_ON_ERROR(yr_arena_write_data(
671
35.2k
      emit_context->arena,
672
35.2k
      YR_RE_CODE_SECTION,
673
35.2k
      &opcode,
674
35.2k
      sizeof(uint8_t),
675
35.2k
      instruction_ref));
676
677
35.2k
  FAIL_ON_ERROR(yr_arena_write_data(
678
35.2k
      emit_context->arena,
679
35.2k
      YR_RE_CODE_SECTION,
680
35.2k
      &emit_context->next_split_id,
681
35.2k
      sizeof(RE_SPLIT_ID_TYPE),
682
35.2k
      NULL));
683
684
35.2k
  emit_context->next_split_id++;
685
686
35.2k
  FAIL_ON_ERROR(yr_arena_write_data(
687
35.2k
      emit_context->arena,
688
35.2k
      YR_RE_CODE_SECTION,
689
35.2k
      &argument,
690
35.2k
      sizeof(int16_t),
691
35.2k
      argument_ref));
692
693
35.2k
  return ERROR_SUCCESS;
694
35.2k
}
695
696
static int _yr_re_emit(
697
    RE_EMIT_CONTEXT* emit_context,
698
    RE_NODE* re_node,
699
    int flags,
700
    YR_ARENA_REF* code_ref)
701
204M
{
702
204M
  int16_t jmp_offset;
703
704
204M
  yr_arena_off_t bookmark_1 = 0;
705
204M
  yr_arena_off_t bookmark_2 = 0;
706
204M
  yr_arena_off_t bookmark_3 = 0;
707
204M
  yr_arena_off_t bookmark_4 = 0;
708
709
204M
  bool emit_split;
710
204M
  bool emit_repeat;
711
204M
  bool emit_prolog;
712
204M
  bool emit_epilog;
713
714
204M
  RE_REPEAT_ARGS repeat_args;
715
204M
  RE_REPEAT_ARGS* repeat_start_args_addr;
716
204M
  RE_REPEAT_ANY_ARGS repeat_any_args;
717
718
204M
  RE_NODE* child;
719
720
204M
  int16_t* split_offset_addr = NULL;
721
204M
  int16_t* jmp_offset_addr = NULL;
722
723
204M
  YR_ARENA_REF instruction_ref = YR_ARENA_NULL_REF;
724
204M
  YR_ARENA_REF split_offset_ref;
725
204M
  YR_ARENA_REF jmp_instruction_ref;
726
204M
  YR_ARENA_REF jmp_offset_ref;
727
204M
  YR_ARENA_REF repeat_start_args_ref;
728
729
204M
  switch (re_node->type)
730
204M
  {
731
95.1M
  case RE_NODE_LITERAL:
732
95.1M
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint8(
733
95.1M
        emit_context,
734
95.1M
        RE_OPCODE_LITERAL,
735
95.1M
        re_node->value,
736
95.1M
        &instruction_ref,
737
95.1M
        NULL));
738
95.1M
    break;
739
740
599
  case RE_NODE_NOT_LITERAL:
741
599
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint8(
742
599
        emit_context,
743
599
        RE_OPCODE_NOT_LITERAL,
744
599
        re_node->value,
745
599
        &instruction_ref,
746
599
        NULL));
747
599
    break;
748
749
5.87k
  case RE_NODE_MASKED_LITERAL:
750
5.87k
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
751
5.87k
        emit_context,
752
5.87k
        RE_OPCODE_MASKED_LITERAL,
753
5.87k
        re_node->mask << 8 | re_node->value,
754
5.87k
        &instruction_ref,
755
5.87k
        NULL));
756
5.87k
    break;
757
758
1.75k
  case RE_NODE_MASKED_NOT_LITERAL:
759
1.75k
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
760
1.75k
        emit_context,
761
1.75k
        RE_OPCODE_MASKED_NOT_LITERAL,
762
1.75k
        re_node->mask << 8 | re_node->value,
763
1.75k
        &instruction_ref,
764
1.75k
        NULL));
765
1.75k
    break;
766
767
308
  case RE_NODE_WORD_CHAR:
768
308
    FAIL_ON_ERROR(
769
308
        _yr_emit_inst(emit_context, RE_OPCODE_WORD_CHAR, &instruction_ref));
770
308
    break;
771
772
306
  case RE_NODE_NON_WORD_CHAR:
773
306
    FAIL_ON_ERROR(
774
306
        _yr_emit_inst(emit_context, RE_OPCODE_NON_WORD_CHAR, &instruction_ref));
775
306
    break;
776
777
261
  case RE_NODE_WORD_BOUNDARY:
778
261
    FAIL_ON_ERROR(
779
261
        _yr_emit_inst(emit_context, RE_OPCODE_WORD_BOUNDARY, &instruction_ref));
780
261
    break;
781
782
256
  case RE_NODE_NON_WORD_BOUNDARY:
783
256
    FAIL_ON_ERROR(_yr_emit_inst(
784
256
        emit_context, RE_OPCODE_NON_WORD_BOUNDARY, &instruction_ref));
785
256
    break;
786
787
6.26k
  case RE_NODE_SPACE:
788
6.26k
    FAIL_ON_ERROR(
789
6.26k
        _yr_emit_inst(emit_context, RE_OPCODE_SPACE, &instruction_ref));
790
6.26k
    break;
791
792
1.65k
  case RE_NODE_NON_SPACE:
793
1.65k
    FAIL_ON_ERROR(
794
1.65k
        _yr_emit_inst(emit_context, RE_OPCODE_NON_SPACE, &instruction_ref));
795
1.65k
    break;
796
797
498
  case RE_NODE_DIGIT:
798
498
    FAIL_ON_ERROR(
799
498
        _yr_emit_inst(emit_context, RE_OPCODE_DIGIT, &instruction_ref));
800
498
    break;
801
802
158
  case RE_NODE_NON_DIGIT:
803
158
    FAIL_ON_ERROR(
804
158
        _yr_emit_inst(emit_context, RE_OPCODE_NON_DIGIT, &instruction_ref));
805
158
    break;
806
807
902k
  case RE_NODE_ANY:
808
902k
    FAIL_ON_ERROR(_yr_emit_inst(emit_context, RE_OPCODE_ANY, &instruction_ref));
809
902k
    break;
810
811
73.2M
  case RE_NODE_CLASS:
812
73.2M
    FAIL_ON_ERROR(
813
73.2M
        _yr_emit_inst(emit_context, RE_OPCODE_CLASS, &instruction_ref));
814
815
73.2M
    FAIL_ON_ERROR(yr_arena_write_data(
816
73.2M
        emit_context->arena,
817
73.2M
        YR_RE_CODE_SECTION,
818
73.2M
        re_node->re_class,
819
73.2M
        sizeof(*re_node->re_class),
820
73.2M
        NULL));
821
73.2M
    break;
822
823
189k
  case RE_NODE_ANCHOR_START:
824
189k
    FAIL_ON_ERROR(_yr_emit_inst(
825
189k
        emit_context, RE_OPCODE_MATCH_AT_START, &instruction_ref));
826
189k
    break;
827
828
180k
  case RE_NODE_ANCHOR_END:
829
180k
    FAIL_ON_ERROR(
830
180k
        _yr_emit_inst(emit_context, RE_OPCODE_MATCH_AT_END, &instruction_ref));
831
180k
    break;
832
833
18.1M
  case RE_NODE_CONCAT:
834
18.1M
    FAIL_ON_ERROR(_yr_re_emit(
835
18.1M
        emit_context,
836
18.1M
        (flags & EMIT_BACKWARDS) ? re_node->children_tail
837
18.1M
                                 : re_node->children_head,
838
18.1M
        flags,
839
18.1M
        &instruction_ref));
840
841
18.1M
    if (flags & EMIT_BACKWARDS)
842
143k
      child = re_node->children_tail->prev_sibling;
843
17.9M
    else
844
17.9M
      child = re_node->children_head->next_sibling;
845
846
153M
    while (child != NULL)
847
135M
    {
848
135M
      FAIL_ON_ERROR(_yr_re_emit(emit_context, child, flags, NULL));
849
850
135M
      child = (flags & EMIT_BACKWARDS) ? child->prev_sibling
851
135M
                                       : child->next_sibling;
852
135M
    }
853
18.1M
    break;
854
855
18.1M
  case RE_NODE_PLUS:
856
    // Code for e+ looks like:
857
    //
858
    //          L1: code for e
859
    //              split L1, L2
860
    //          L2:
861
    //
862
3.81k
    FAIL_ON_ERROR(_yr_re_emit(
863
3.81k
        emit_context, re_node->children_head, flags, &instruction_ref));
864
865
3.66k
    bookmark_1 = yr_arena_get_current_offset(
866
3.66k
        emit_context->arena, YR_RE_CODE_SECTION);
867
868
3.66k
    if (instruction_ref.offset - bookmark_1 < INT16_MIN)
869
8
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
870
871
3.65k
    jmp_offset = (int16_t) (instruction_ref.offset - bookmark_1);
872
873
3.65k
    FAIL_ON_ERROR(_yr_emit_split(
874
3.65k
        emit_context,
875
3.65k
        re_node->greedy ? RE_OPCODE_SPLIT_B : RE_OPCODE_SPLIT_A,
876
3.65k
        jmp_offset,
877
3.65k
        NULL,
878
3.65k
        NULL));
879
880
3.61k
    break;
881
882
3.01k
  case RE_NODE_STAR:
883
    // Code for e* looks like:
884
    //
885
    //          L1: split L1, L2
886
    //              code for e
887
    //              jmp L1
888
    //          L2:
889
3.01k
    FAIL_ON_ERROR(_yr_emit_split(
890
3.01k
        emit_context,
891
3.01k
        re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
892
3.01k
        0,
893
3.01k
        &instruction_ref,
894
3.01k
        &split_offset_ref));
895
896
2.99k
    FAIL_ON_ERROR(
897
2.99k
        _yr_re_emit(emit_context, re_node->children_head, flags, NULL));
898
899
2.71k
    bookmark_1 = yr_arena_get_current_offset(
900
2.71k
        emit_context->arena, YR_RE_CODE_SECTION);
901
902
2.71k
    if (instruction_ref.offset - bookmark_1 < INT16_MIN)
903
15
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
904
905
2.69k
    jmp_offset = (int16_t) (instruction_ref.offset - bookmark_1);
906
907
    // Emit jump with offset set to 0.
908
909
2.69k
    FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
910
2.69k
        emit_context, RE_OPCODE_JUMP, jmp_offset, NULL, NULL));
911
912
2.69k
    bookmark_1 = yr_arena_get_current_offset(
913
2.69k
        emit_context->arena, YR_RE_CODE_SECTION);
914
915
2.69k
    if (bookmark_1 - instruction_ref.offset > INT16_MAX)
916
0
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
917
918
2.69k
    jmp_offset = (int16_t) (bookmark_1 - instruction_ref.offset);
919
920
    // Update split offset.
921
2.69k
    split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
922
2.69k
        emit_context->arena, &split_offset_ref);
923
924
2.69k
    memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset));
925
2.69k
    break;
926
927
26.4k
  case RE_NODE_ALT:
928
    // Code for e1|e2 looks like:
929
    //
930
    //              split L1, L2
931
    //          L1: code for e1
932
    //              jmp L3
933
    //          L2: code for e2
934
    //          L3:
935
936
    // Emit a split instruction with offset set to 0 temporarily. Offset
937
    // will be updated after we know the size of the code generated for
938
    // the left node (e1).
939
940
26.4k
    FAIL_ON_ERROR(_yr_emit_split(
941
26.4k
        emit_context,
942
26.4k
        RE_OPCODE_SPLIT_A,
943
26.4k
        0,
944
26.4k
        &instruction_ref,
945
26.4k
        &split_offset_ref));
946
947
26.4k
    FAIL_ON_ERROR(
948
26.4k
        _yr_re_emit(emit_context, re_node->children_head, flags, NULL));
949
950
    // Emit jump with offset set to 0.
951
952
24.4k
    FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
953
24.4k
        emit_context,
954
24.4k
        RE_OPCODE_JUMP,
955
24.4k
        0,
956
24.4k
        &jmp_instruction_ref,
957
24.4k
        &jmp_offset_ref));
958
959
24.4k
    bookmark_1 = yr_arena_get_current_offset(
960
24.4k
        emit_context->arena, YR_RE_CODE_SECTION);
961
962
24.4k
    if (bookmark_1 - instruction_ref.offset > INT16_MAX)
963
20
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
964
965
24.3k
    jmp_offset = (int16_t) (bookmark_1 - instruction_ref.offset);
966
967
    // Update split offset.
968
24.3k
    split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
969
24.3k
        emit_context->arena, &split_offset_ref);
970
971
24.3k
    memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset));
972
973
24.3k
    FAIL_ON_ERROR(
974
24.3k
        _yr_re_emit(emit_context, re_node->children_tail, flags, NULL));
975
976
24.0k
    bookmark_1 = yr_arena_get_current_offset(
977
24.0k
        emit_context->arena, YR_RE_CODE_SECTION);
978
979
24.0k
    if (bookmark_1 - jmp_instruction_ref.offset > INT16_MAX)
980
19
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
981
982
23.9k
    jmp_offset = (int16_t) (bookmark_1 - jmp_instruction_ref.offset);
983
984
    // Update offset for jmp instruction.
985
23.9k
    jmp_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
986
23.9k
        emit_context->arena, &jmp_offset_ref);
987
988
23.9k
    memcpy(jmp_offset_addr, &jmp_offset, sizeof(jmp_offset));
989
23.9k
    break;
990
991
4.84k
  case RE_NODE_RANGE_ANY:
992
4.84k
    repeat_any_args.min = re_node->start;
993
4.84k
    repeat_any_args.max = re_node->end;
994
995
4.84k
    FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
996
4.84k
        emit_context,
997
4.84k
        re_node->greedy ? RE_OPCODE_REPEAT_ANY_GREEDY
998
4.84k
                        : RE_OPCODE_REPEAT_ANY_UNGREEDY,
999
4.84k
        &repeat_any_args,
1000
4.84k
        sizeof(repeat_any_args),
1001
4.84k
        &instruction_ref,
1002
4.84k
        NULL));
1003
1004
4.84k
    break;
1005
1006
17.1M
  case RE_NODE_RANGE:
1007
    // Code for e{n,m} looks like:
1008
    //
1009
    //            code for e              ---   prolog
1010
    //            repeat_start n, m, L1   --+
1011
    //        L0: code for e                |   repeat
1012
    //            repeat_end n, m, L0     --+
1013
    //        L1: split L2, L3            ---   split
1014
    //        L2: code for e              ---   epilog
1015
    //        L3:
1016
    //
1017
    // Not all sections (prolog, repeat, split and epilog) are generated in all
1018
    // cases, it depends on the values of n and m. The following table shows
1019
    // which sections are generated for the first few values of n and m.
1020
    //
1021
    //        n,m   prolog  repeat      split  epilog
1022
    //                      (min,max)
1023
    //        ---------------------------------------
1024
    //        0,0     -       -           -      -
1025
    //        0,1     -       -           X      X
1026
    //        0,2     -       0,1         X      X
1027
    //        0,3     -       0,2         X      X
1028
    //        0,M     -       0,M-1       X      X
1029
    //
1030
    //        1,1     X       -           -      -
1031
    //        1,2     X       -           X      X
1032
    //        1,3     X       0,1         X      X
1033
    //        1,4     X       1,2         X      X
1034
    //        1,M     X       1,M-2       X      X
1035
    //
1036
    //        2,2     X       -           -      X
1037
    //        2,3     X       1,1         X      X
1038
    //        2,4     X       1,2         X      X
1039
    //        2,M     X       1,M-2       X      X
1040
    //
1041
    //        3,3     X       1,1         -      X
1042
    //        3,4     X       2,2         X      X
1043
    //        3,M     X       2,M-2       X      X
1044
    //
1045
    //        4,4     X       2,2         -      X
1046
    //        4,5     X       3,3         X      X
1047
    //        4,M     X       3,M-2       X      X
1048
    //
1049
    // The code can't consists simply in the repeat section, the prolog and
1050
    // epilog are required because we can't have atoms pointing to code inside
1051
    // the repeat loop. Atoms' forwards_code will point to code in the prolog
1052
    // and backwards_code will point to code in the epilog (or in prolog if
1053
    // epilog wasn't generated, like in n=1,m=1)
1054
1055
17.1M
    emit_prolog = re_node->start > 0;
1056
17.1M
    emit_repeat = re_node->end > re_node->start + 1 || re_node->end > 2;
1057
17.1M
    emit_split = re_node->end > re_node->start;
1058
17.1M
    emit_epilog = re_node->end > re_node->start || re_node->end > 1;
1059
1060
17.1M
    if (emit_prolog)
1061
17.1M
    {
1062
17.1M
      FAIL_ON_ERROR(_yr_re_emit(
1063
17.1M
          emit_context, re_node->children_head, flags, &instruction_ref));
1064
17.1M
    }
1065
1066
17.1M
    if (emit_repeat)
1067
16.8M
    {
1068
16.8M
      repeat_args.min = re_node->start;
1069
16.8M
      repeat_args.max = re_node->end;
1070
1071
16.8M
      if (emit_prolog)
1072
16.8M
      {
1073
16.8M
        repeat_args.max--;
1074
16.8M
        repeat_args.min--;
1075
16.8M
      }
1076
1077
16.8M
      if (emit_split)
1078
732
      {
1079
732
        repeat_args.max--;
1080
732
      }
1081
16.8M
      else
1082
16.8M
      {
1083
16.8M
        repeat_args.min--;
1084
16.8M
        repeat_args.max--;
1085
16.8M
      }
1086
1087
16.8M
      repeat_args.offset = 0;
1088
1089
16.8M
      bookmark_1 = yr_arena_get_current_offset(
1090
16.8M
          emit_context->arena, YR_RE_CODE_SECTION);
1091
1092
16.8M
      FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
1093
16.8M
          emit_context,
1094
16.8M
          re_node->greedy ? RE_OPCODE_REPEAT_START_GREEDY
1095
16.8M
                          : RE_OPCODE_REPEAT_START_UNGREEDY,
1096
16.8M
          &repeat_args,
1097
16.8M
          sizeof(repeat_args),
1098
16.8M
          emit_prolog ? NULL : &instruction_ref,
1099
16.8M
          &repeat_start_args_ref));
1100
1101
16.8M
      bookmark_2 = yr_arena_get_current_offset(
1102
16.8M
          emit_context->arena, YR_RE_CODE_SECTION);
1103
1104
16.8M
      FAIL_ON_ERROR(_yr_re_emit(
1105
16.8M
          emit_context,
1106
16.8M
          re_node->children_head,
1107
16.8M
          flags | EMIT_DONT_SET_FORWARDS_CODE | EMIT_DONT_SET_BACKWARDS_CODE,
1108
16.8M
          NULL));
1109
1110
16.8M
      bookmark_3 = yr_arena_get_current_offset(
1111
16.8M
          emit_context->arena, YR_RE_CODE_SECTION);
1112
1113
16.8M
      if (bookmark_2 - bookmark_3 < INT32_MIN)
1114
9
        return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1115
1116
16.8M
      repeat_args.offset = (int32_t) (bookmark_2 - bookmark_3);
1117
1118
16.8M
      FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
1119
16.8M
          emit_context,
1120
16.8M
          re_node->greedy ? RE_OPCODE_REPEAT_END_GREEDY
1121
16.8M
                          : RE_OPCODE_REPEAT_END_UNGREEDY,
1122
16.8M
          &repeat_args,
1123
16.8M
          sizeof(repeat_args),
1124
16.8M
          NULL,
1125
16.8M
          NULL));
1126
1127
16.8M
      bookmark_4 = yr_arena_get_current_offset(
1128
16.8M
          emit_context->arena, YR_RE_CODE_SECTION);
1129
1130
16.8M
      repeat_start_args_addr = (RE_REPEAT_ARGS*) yr_arena_ref_to_ptr(
1131
16.8M
          emit_context->arena, &repeat_start_args_ref);
1132
1133
16.8M
      if (bookmark_4 - bookmark_1 > INT32_MAX)
1134
0
        return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1135
1136
16.8M
      repeat_start_args_addr->offset = (int32_t) (bookmark_4 - bookmark_1);
1137
16.8M
    }
1138
1139
17.1M
    if (emit_split)
1140
2.19k
    {
1141
2.19k
      bookmark_1 = yr_arena_get_current_offset(
1142
2.19k
          emit_context->arena, YR_RE_CODE_SECTION);
1143
1144
2.19k
      FAIL_ON_ERROR(_yr_emit_split(
1145
2.19k
          emit_context,
1146
2.19k
          re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
1147
2.19k
          0,
1148
2.19k
          NULL,
1149
2.19k
          &split_offset_ref));
1150
2.18k
    }
1151
1152
17.1M
    if (emit_epilog)
1153
17.1M
    {
1154
17.1M
      FAIL_ON_ERROR(_yr_re_emit(
1155
17.1M
          emit_context,
1156
17.1M
          re_node->children_head,
1157
17.1M
          emit_prolog ? flags | EMIT_DONT_SET_FORWARDS_CODE : flags,
1158
17.1M
          emit_prolog || emit_repeat ? NULL : &instruction_ref));
1159
17.1M
    }
1160
1161
17.1M
    if (emit_split)
1162
2.06k
    {
1163
2.06k
      bookmark_2 = yr_arena_get_current_offset(
1164
2.06k
          emit_context->arena, YR_RE_CODE_SECTION);
1165
1166
2.06k
      if (bookmark_2 - bookmark_1 > INT16_MAX)
1167
9
        return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1168
1169
2.05k
      split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
1170
2.05k
          emit_context->arena, &split_offset_ref);
1171
1172
2.05k
      jmp_offset = (int16_t) (bookmark_2 - bookmark_1);
1173
1174
2.05k
      memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset));
1175
2.05k
    }
1176
1177
17.1M
    break;
1178
204M
  }
1179
1180
204M
  if (flags & EMIT_BACKWARDS)
1181
4.96M
  {
1182
4.96M
    if (!(flags & EMIT_DONT_SET_BACKWARDS_CODE))
1183
2.07M
    {
1184
2.07M
      re_node->backward_code_ref.buffer_id = YR_RE_CODE_SECTION;
1185
2.07M
      re_node->backward_code_ref.offset = yr_arena_get_current_offset(
1186
2.07M
          emit_context->arena, YR_RE_CODE_SECTION);
1187
2.07M
    }
1188
4.96M
  }
1189
199M
  else
1190
199M
  {
1191
199M
    if (!(flags & EMIT_DONT_SET_FORWARDS_CODE))
1192
2.16M
    {
1193
2.16M
      re_node->forward_code_ref = instruction_ref;
1194
2.16M
    }
1195
199M
  }
1196
1197
204M
  if (code_ref != NULL)
1198
35.2M
    *code_ref = instruction_ref;
1199
1200
204M
  return ERROR_SUCCESS;
1201
204M
}
1202
1203
int yr_re_ast_emit_code(RE_AST* re_ast, YR_ARENA* arena, int backwards_code)
1204
16.4k
{
1205
16.4k
  RE_EMIT_CONTEXT emit_context;
1206
1207
  // Emit code for matching the regular expressions forwards.
1208
16.4k
  emit_context.arena = arena;
1209
16.4k
  emit_context.next_split_id = 0;
1210
1211
16.4k
  FAIL_ON_ERROR(_yr_re_emit(
1212
16.4k
      &emit_context,
1213
16.4k
      re_ast->root_node,
1214
16.4k
      backwards_code ? EMIT_BACKWARDS : 0,
1215
16.4k
      NULL));
1216
1217
16.2k
  FAIL_ON_ERROR(_yr_emit_inst(&emit_context, RE_OPCODE_MATCH, NULL));
1218
1219
16.2k
  return ERROR_SUCCESS;
1220
16.2k
}
1221
1222
static int _yr_re_fiber_create(RE_FIBER_POOL* fiber_pool, RE_FIBER** new_fiber)
1223
0
{
1224
0
  RE_FIBER* fiber;
1225
1226
0
  if (fiber_pool->fibers.head != NULL)
1227
0
  {
1228
0
    fiber = fiber_pool->fibers.head;
1229
0
    fiber_pool->fibers.head = fiber->next;
1230
1231
0
    if (fiber_pool->fibers.tail == fiber)
1232
0
      fiber_pool->fibers.tail = NULL;
1233
0
  }
1234
0
  else
1235
0
  {
1236
0
    if (fiber_pool->fiber_count == RE_MAX_FIBERS)
1237
0
      return ERROR_TOO_MANY_RE_FIBERS;
1238
1239
0
    fiber = (RE_FIBER*) yr_malloc(sizeof(RE_FIBER));
1240
1241
0
    if (fiber == NULL)
1242
0
      return ERROR_INSUFFICIENT_MEMORY;
1243
1244
0
    fiber_pool->fiber_count++;
1245
0
  }
1246
1247
0
  fiber->ip = NULL;
1248
0
  fiber->sp = -1;
1249
0
  fiber->rc = -1;
1250
0
  fiber->next = NULL;
1251
0
  fiber->prev = NULL;
1252
1253
0
  *new_fiber = fiber;
1254
1255
0
  return ERROR_SUCCESS;
1256
0
}
1257
1258
////////////////////////////////////////////////////////////////////////////////
1259
// Appends 'fiber' to 'fiber_list'
1260
//
1261
static void _yr_re_fiber_append(RE_FIBER_LIST* fiber_list, RE_FIBER* fiber)
1262
0
{
1263
0
  assert(fiber->prev == NULL);
1264
0
  assert(fiber->next == NULL);
1265
1266
0
  fiber->prev = fiber_list->tail;
1267
1268
0
  if (fiber_list->tail != NULL)
1269
0
    fiber_list->tail->next = fiber;
1270
1271
0
  fiber_list->tail = fiber;
1272
1273
0
  if (fiber_list->head == NULL)
1274
0
    fiber_list->head = fiber;
1275
1276
0
  assert(fiber_list->tail->next == NULL);
1277
0
  assert(fiber_list->head->prev == NULL);
1278
0
}
1279
1280
////////////////////////////////////////////////////////////////////////////////
1281
// Verifies if a fiber with the same properties (ip, rc, sp, and stack values)
1282
// than 'target_fiber' exists in 'fiber_list'. The list is iterated from
1283
// the start until 'last_fiber' (inclusive). Fibers past 'last_fiber' are not
1284
// taken into account.
1285
//
1286
static int _yr_re_fiber_exists(
1287
    RE_FIBER_LIST* fiber_list,
1288
    RE_FIBER* target_fiber,
1289
    RE_FIBER* last_fiber)
1290
0
{
1291
0
  RE_FIBER* fiber = fiber_list->head;
1292
1293
0
  int equal_stacks;
1294
0
  int i;
1295
1296
0
  if (last_fiber == NULL)
1297
0
    return false;
1298
1299
0
  while (fiber != last_fiber->next)
1300
0
  {
1301
0
    if (fiber->ip == target_fiber->ip && fiber->sp == target_fiber->sp &&
1302
0
        fiber->rc == target_fiber->rc)
1303
0
    {
1304
0
      equal_stacks = true;
1305
1306
0
      for (i = 0; i <= fiber->sp; i++)
1307
0
      {
1308
0
        if (fiber->stack[i] != target_fiber->stack[i])
1309
0
        {
1310
0
          equal_stacks = false;
1311
0
          break;
1312
0
        }
1313
0
      }
1314
1315
0
      if (equal_stacks)
1316
0
        return true;
1317
0
    }
1318
1319
0
    fiber = fiber->next;
1320
0
  }
1321
1322
0
  return false;
1323
0
}
1324
1325
////////////////////////////////////////////////////////////////////////////////
1326
// Clones a fiber in fiber_list and inserts the cloned fiber just after.
1327
// the original one. If fiber_list is:
1328
//
1329
//   f1 -> f2 -> f3 -> f4
1330
//
1331
// Splitting f2 will result in:
1332
//
1333
//   f1 -> f2 -> cloned f2 -> f3 -> f4
1334
//
1335
static int _yr_re_fiber_split(
1336
    RE_FIBER_LIST* fiber_list,
1337
    RE_FIBER_POOL* fiber_pool,
1338
    RE_FIBER* fiber,
1339
    RE_FIBER** new_fiber)
1340
0
{
1341
0
  int32_t i;
1342
1343
0
  FAIL_ON_ERROR(_yr_re_fiber_create(fiber_pool, new_fiber));
1344
1345
0
  (*new_fiber)->sp = fiber->sp;
1346
0
  (*new_fiber)->ip = fiber->ip;
1347
0
  (*new_fiber)->rc = fiber->rc;
1348
1349
0
  for (i = 0; i <= fiber->sp; i++) (*new_fiber)->stack[i] = fiber->stack[i];
1350
1351
0
  (*new_fiber)->next = fiber->next;
1352
0
  (*new_fiber)->prev = fiber;
1353
1354
0
  if (fiber->next != NULL)
1355
0
    fiber->next->prev = *new_fiber;
1356
1357
0
  fiber->next = *new_fiber;
1358
1359
0
  if (fiber_list->tail == fiber)
1360
0
    fiber_list->tail = *new_fiber;
1361
1362
0
  assert(fiber_list->tail->next == NULL);
1363
0
  assert(fiber_list->head->prev == NULL);
1364
1365
0
  return ERROR_SUCCESS;
1366
0
}
1367
1368
////////////////////////////////////////////////////////////////////////////////
1369
// Kills a given fiber by removing it from the fiber list and putting it in the
1370
// fiber pool.
1371
//
1372
static RE_FIBER* _yr_re_fiber_kill(
1373
    RE_FIBER_LIST* fiber_list,
1374
    RE_FIBER_POOL* fiber_pool,
1375
    RE_FIBER* fiber)
1376
0
{
1377
0
  RE_FIBER* next_fiber = fiber->next;
1378
1379
0
  if (fiber->prev != NULL)
1380
0
    fiber->prev->next = next_fiber;
1381
1382
0
  if (next_fiber != NULL)
1383
0
    next_fiber->prev = fiber->prev;
1384
1385
0
  if (fiber_pool->fibers.tail != NULL)
1386
0
    fiber_pool->fibers.tail->next = fiber;
1387
1388
0
  if (fiber_list->tail == fiber)
1389
0
    fiber_list->tail = fiber->prev;
1390
1391
0
  if (fiber_list->head == fiber)
1392
0
    fiber_list->head = next_fiber;
1393
1394
0
  fiber->next = NULL;
1395
0
  fiber->prev = fiber_pool->fibers.tail;
1396
0
  fiber_pool->fibers.tail = fiber;
1397
1398
0
  if (fiber_pool->fibers.head == NULL)
1399
0
    fiber_pool->fibers.head = fiber;
1400
1401
0
  return next_fiber;
1402
0
}
1403
1404
////////////////////////////////////////////////////////////////////////////////
1405
// Kills all fibers from the given one up to the end of the fiber list.
1406
//
1407
static void _yr_re_fiber_kill_tail(
1408
    RE_FIBER_LIST* fiber_list,
1409
    RE_FIBER_POOL* fiber_pool,
1410
    RE_FIBER* fiber)
1411
0
{
1412
0
  RE_FIBER* prev_fiber = fiber->prev;
1413
1414
0
  if (prev_fiber != NULL)
1415
0
    prev_fiber->next = NULL;
1416
1417
0
  fiber->prev = fiber_pool->fibers.tail;
1418
1419
0
  if (fiber_pool->fibers.tail != NULL)
1420
0
    fiber_pool->fibers.tail->next = fiber;
1421
1422
0
  fiber_pool->fibers.tail = fiber_list->tail;
1423
0
  fiber_list->tail = prev_fiber;
1424
1425
0
  if (fiber_list->head == fiber)
1426
0
    fiber_list->head = NULL;
1427
1428
0
  if (fiber_pool->fibers.head == NULL)
1429
0
    fiber_pool->fibers.head = fiber;
1430
0
}
1431
1432
////////////////////////////////////////////////////////////////////////////////
1433
// Kills all fibers in the fiber list.
1434
//
1435
static void _yr_re_fiber_kill_all(
1436
    RE_FIBER_LIST* fiber_list,
1437
    RE_FIBER_POOL* fiber_pool)
1438
0
{
1439
0
  if (fiber_list->head != NULL)
1440
0
    _yr_re_fiber_kill_tail(fiber_list, fiber_pool, fiber_list->head);
1441
0
}
1442
1443
////////////////////////////////////////////////////////////////////////////////
1444
// Executes a fiber until reaching an "matching" instruction. A "matching"
1445
// instruction is one that actually reads a byte from the input and performs
1446
// some matching. If the fiber reaches a split instruction, the new fiber is
1447
// also synced.
1448
//
1449
static int _yr_re_fiber_sync(
1450
    RE_FIBER_LIST* fiber_list,
1451
    RE_FIBER_POOL* fiber_pool,
1452
    RE_FIBER* fiber_to_sync)
1453
0
{
1454
  // A array for keeping track of which split instructions has been already
1455
  // executed. Each split instruction within a regexp has an associated ID
1456
  // between 0 and RE_MAX_SPLIT_ID-1. Keeping track of executed splits is
1457
  // required to avoid infinite loops in regexps like (a*)* or (a|)*
1458
1459
0
  RE_SPLIT_ID_TYPE splits_executed[RE_MAX_SPLIT_ID];
1460
0
  RE_SPLIT_ID_TYPE splits_executed_count = 0;
1461
0
  RE_SPLIT_ID_TYPE split_id, splits_executed_idx;
1462
1463
0
  int split_already_executed;
1464
1465
0
  RE_REPEAT_ARGS* repeat_args;
1466
0
  RE_REPEAT_ANY_ARGS* repeat_any_args;
1467
1468
0
  RE_FIBER* fiber;
1469
0
  RE_FIBER* last;
1470
0
  RE_FIBER* next;
1471
0
  RE_FIBER* branch_a;
1472
0
  RE_FIBER* branch_b;
1473
1474
0
  fiber = fiber_to_sync;
1475
0
  last = fiber_to_sync->next;
1476
1477
0
  while (fiber != last)
1478
0
  {
1479
0
    int16_t jmp_len;
1480
0
    uint8_t opcode = *fiber->ip;
1481
1482
0
    switch (opcode)
1483
0
    {
1484
0
    case RE_OPCODE_SPLIT_A:
1485
0
    case RE_OPCODE_SPLIT_B:
1486
1487
0
      split_id = *(RE_SPLIT_ID_TYPE*) (fiber->ip + 1);
1488
0
      split_already_executed = false;
1489
1490
0
      for (splits_executed_idx = 0; splits_executed_idx < splits_executed_count;
1491
0
           splits_executed_idx++)
1492
0
      {
1493
0
        if (split_id == splits_executed[splits_executed_idx])
1494
0
        {
1495
0
          split_already_executed = true;
1496
0
          break;
1497
0
        }
1498
0
      }
1499
1500
0
      if (split_already_executed)
1501
0
      {
1502
0
        fiber = _yr_re_fiber_kill(fiber_list, fiber_pool, fiber);
1503
0
      }
1504
0
      else
1505
0
      {
1506
0
        branch_a = fiber;
1507
1508
0
        FAIL_ON_ERROR(
1509
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1510
1511
        // With RE_OPCODE_SPLIT_A the current fiber continues at the next
1512
        // instruction in the stream (branch A), while the newly created
1513
        // fiber starts at the address indicated by the instruction (branch B)
1514
        // RE_OPCODE_SPLIT_B has the opposite behavior.
1515
1516
0
        if (opcode == RE_OPCODE_SPLIT_B)
1517
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1518
1519
        // Branch A continues at the next instruction
1520
0
        branch_a->ip += (sizeof(RE_SPLIT_ID_TYPE) + 3);
1521
1522
        // Branch B adds the offset indicated by the split instruction.
1523
0
        jmp_len = yr_unaligned_i16(branch_b->ip + 1 + sizeof(RE_SPLIT_ID_TYPE));
1524
1525
0
        branch_b->ip += jmp_len;
1526
1527
#ifdef YR_PARANOID_MODE
1528
        // In normal conditions this should never happen. But with compiled
1529
        // rules that has been hand-crafted by a malicious actor this could
1530
        // happen.
1531
        if (splits_executed_count >= RE_MAX_SPLIT_ID)
1532
          return ERROR_INTERNAL_FATAL_ERROR;
1533
#endif
1534
1535
0
        splits_executed[splits_executed_count] = split_id;
1536
0
        splits_executed_count++;
1537
0
      }
1538
1539
0
      break;
1540
1541
0
    case RE_OPCODE_REPEAT_START_GREEDY:
1542
0
    case RE_OPCODE_REPEAT_START_UNGREEDY:
1543
1544
0
      repeat_args = (RE_REPEAT_ARGS*) (fiber->ip + 1);
1545
0
      assert(repeat_args->max > 0);
1546
0
      branch_a = fiber;
1547
1548
0
      if (repeat_args->min == 0)
1549
0
      {
1550
0
        FAIL_ON_ERROR(
1551
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1552
1553
0
        if (opcode == RE_OPCODE_REPEAT_START_UNGREEDY)
1554
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1555
1556
0
        branch_b->ip += repeat_args->offset;
1557
0
      }
1558
1559
0
      branch_a->stack[++branch_a->sp] = 0;
1560
0
      branch_a->ip += (1 + sizeof(RE_REPEAT_ARGS));
1561
0
      break;
1562
1563
0
    case RE_OPCODE_REPEAT_END_GREEDY:
1564
0
    case RE_OPCODE_REPEAT_END_UNGREEDY:
1565
1566
0
      repeat_args = (RE_REPEAT_ARGS*) (fiber->ip + 1);
1567
0
      fiber->stack[fiber->sp]++;
1568
1569
0
      if (fiber->stack[fiber->sp] < repeat_args->min)
1570
0
      {
1571
0
        fiber->ip += repeat_args->offset;
1572
0
        break;
1573
0
      }
1574
1575
0
      branch_a = fiber;
1576
1577
0
      if (fiber->stack[fiber->sp] < repeat_args->max)
1578
0
      {
1579
0
        FAIL_ON_ERROR(
1580
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1581
1582
0
        if (opcode == RE_OPCODE_REPEAT_END_GREEDY)
1583
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1584
1585
0
        branch_b->ip += repeat_args->offset;
1586
0
      }
1587
1588
0
      branch_a->sp--;
1589
0
      branch_a->ip += (1 + sizeof(RE_REPEAT_ARGS));
1590
0
      break;
1591
1592
0
    case RE_OPCODE_REPEAT_ANY_GREEDY:
1593
0
    case RE_OPCODE_REPEAT_ANY_UNGREEDY:
1594
1595
0
      repeat_any_args = (RE_REPEAT_ANY_ARGS*) (fiber->ip + 1);
1596
1597
      // If repetition counter (rc) is -1 it means that we are reaching this
1598
      // instruction from the previous one in the instructions stream. In
1599
      // this case let's initialize the counter to 0 and start looping.
1600
1601
0
      if (fiber->rc == -1)
1602
0
        fiber->rc = 0;
1603
1604
0
      if (fiber->rc < repeat_any_args->min)
1605
0
      {
1606
        // Increase repetition counter and continue with next fiber. The
1607
        // instruction pointer for this fiber is not incremented yet, this
1608
        // fiber spins in this same instruction until reaching the minimum
1609
        // number of repetitions.
1610
1611
0
        fiber->rc++;
1612
0
        fiber = fiber->next;
1613
0
      }
1614
0
      else if (fiber->rc < repeat_any_args->max)
1615
0
      {
1616
        // Once the minimum number of repetitions are matched one fiber
1617
        // remains spinning in this instruction until reaching the maximum
1618
        // number of repetitions while new fibers are created. New fibers
1619
        // start executing at the next instruction.
1620
1621
0
        next = fiber->next;
1622
0
        branch_a = fiber;
1623
1624
0
        FAIL_ON_ERROR(
1625
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1626
1627
0
        if (opcode == RE_OPCODE_REPEAT_ANY_UNGREEDY)
1628
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1629
1630
0
        branch_a->rc++;
1631
0
        branch_b->ip += (1 + sizeof(RE_REPEAT_ANY_ARGS));
1632
0
        branch_b->rc = -1;
1633
1634
0
        FAIL_ON_ERROR(_yr_re_fiber_sync(fiber_list, fiber_pool, branch_b));
1635
1636
0
        fiber = next;
1637
0
      }
1638
0
      else
1639
0
      {
1640
        // When the maximum number of repetitions is reached the fiber keeps
1641
        // executing at the next instruction. The repetition counter is set
1642
        // to -1 indicating that we are not spinning in a repeat instruction
1643
        // anymore.
1644
1645
0
        fiber->ip += (1 + sizeof(RE_REPEAT_ANY_ARGS));
1646
0
        fiber->rc = -1;
1647
0
      }
1648
1649
0
      break;
1650
1651
0
    case RE_OPCODE_JUMP:
1652
0
      jmp_len = yr_unaligned_i16(fiber->ip + 1);
1653
0
      fiber->ip += jmp_len;
1654
0
      break;
1655
1656
0
    default:
1657
0
      fiber = fiber->next;
1658
0
    }
1659
0
  }
1660
1661
0
  return ERROR_SUCCESS;
1662
0
}
1663
1664
////////////////////////////////////////////////////////////////////////////////
1665
// Executes a regular expression. The specified regular expression will try to
1666
// match the data starting at the address specified by "input". The "input"
1667
// pointer can point to any address inside a memory buffer. Arguments
1668
// "input_forwards_size" and "input_backwards_size" indicate how many bytes
1669
// can be accessible starting at "input" and going forwards and backwards
1670
// respectively.
1671
//
1672
//   <--- input_backwards_size -->|<----------- input_forwards_size -------->
1673
//  |--------  memory buffer  -----------------------------------------------|
1674
//                                ^
1675
//                              input
1676
//
1677
// Args:
1678
//   YR_SCAN_CONTEXT *context         - Scan context.
1679
//   const uint8_t* code              - Regexp code be executed
1680
//   const uint8_t* input             - Pointer to input data
1681
//   size_t input_forwards_size       - Number of accessible bytes starting at
1682
//                                      "input" and going forwards.
1683
//   size_t input_backwards_size      - Number of accessible bytes starting at
1684
//                                      "input" and going backwards
1685
//   int flags                        - Flags:
1686
//      RE_FLAGS_SCAN
1687
//      RE_FLAGS_BACKWARDS
1688
//      RE_FLAGS_EXHAUSTIVE
1689
//      RE_FLAGS_WIDE
1690
//      RE_FLAGS_NO_CASE
1691
//      RE_FLAGS_DOT_ALL
1692
//   RE_MATCH_CALLBACK_FUNC callback  - Callback function
1693
//   void* callback_args              - Callback argument
1694
//   int*  matches                    - Pointer to an integer receiving the
1695
//                                      number of matching bytes. Notice that
1696
//                                      0 means a zero-length match, while no
1697
//                                      matches is -1.
1698
// Returns:
1699
//    ERROR_SUCCESS or any other error code.
1700
1701
int yr_re_exec(
1702
    YR_SCAN_CONTEXT* context,
1703
    const uint8_t* code,
1704
    const uint8_t* input_data,
1705
    size_t input_forwards_size,
1706
    size_t input_backwards_size,
1707
    int flags,
1708
    RE_MATCH_CALLBACK_FUNC callback,
1709
    void* callback_args,
1710
    int* matches)
1711
0
{
1712
0
  const uint8_t* input;
1713
0
  const uint8_t* ip;
1714
1715
0
  uint16_t opcode_args;
1716
0
  uint8_t mask;
1717
0
  uint8_t value;
1718
0
  uint8_t character_size;
1719
1720
0
  RE_FIBER_LIST fibers;
1721
0
  RE_FIBER* fiber;
1722
0
  RE_FIBER* next_fiber;
1723
1724
0
  int bytes_matched;
1725
0
  int max_bytes_matched;
1726
1727
0
  int match;
1728
0
  int input_incr;
1729
0
  int kill;
1730
0
  int action;
1731
1732
0
  bool prev_is_word_char = false;
1733
0
  bool input_is_word_char = false;
1734
1735
0
#define ACTION_NONE      0
1736
0
#define ACTION_CONTINUE  1
1737
0
#define ACTION_KILL      2
1738
0
#define ACTION_KILL_TAIL 3
1739
1740
0
#define prolog                                      \
1741
0
  {                                                 \
1742
0
    if ((bytes_matched >= max_bytes_matched) ||     \
1743
0
        (character_size == 2 && *(input + 1) != 0)) \
1744
0
    {                                               \
1745
0
      action = ACTION_KILL;                         \
1746
0
      break;                                        \
1747
0
    }                                               \
1748
0
  }
1749
1750
0
  if (matches != NULL)
1751
0
    *matches = -1;
1752
1753
0
  if (flags & RE_FLAGS_WIDE)
1754
0
    character_size = 2;
1755
0
  else
1756
0
    character_size = 1;
1757
1758
0
  input = input_data;
1759
0
  input_incr = character_size;
1760
1761
0
  if (flags & RE_FLAGS_BACKWARDS)
1762
0
  {
1763
    // Signedness conversion is sound as long as YR_RE_SCAN_LIMIT <= INT_MAX
1764
0
    max_bytes_matched = (int) yr_min(input_backwards_size, YR_RE_SCAN_LIMIT);
1765
0
    input -= character_size;
1766
0
    input_incr = -input_incr;
1767
0
  }
1768
0
  else
1769
0
  {
1770
    // Signedness conversion is sound as long as YR_RE_SCAN_LIMIT <= INT_MAX
1771
0
    max_bytes_matched = (int) yr_min(input_forwards_size, YR_RE_SCAN_LIMIT);
1772
0
  }
1773
1774
  // Round down max_bytes_matched to a multiple of character_size, this way if
1775
  // character_size is 2 and max_bytes_matched is odd we are ignoring the
1776
  // extra byte which can't match anyways.
1777
1778
0
  max_bytes_matched = max_bytes_matched - max_bytes_matched % character_size;
1779
0
  bytes_matched = 0;
1780
1781
0
  FAIL_ON_ERROR(_yr_re_fiber_create(&context->re_fiber_pool, &fiber));
1782
1783
0
  fiber->ip = code;
1784
0
  fibers.head = fiber;
1785
0
  fibers.tail = fiber;
1786
1787
0
  FAIL_ON_ERROR_WITH_CLEANUP(
1788
0
      _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
1789
0
      _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1790
1791
0
  while (fibers.head != NULL)
1792
0
  {
1793
0
    fiber = fibers.head;
1794
1795
0
    while (fiber != NULL)
1796
0
    {
1797
0
      next_fiber = fiber->next;
1798
1799
0
      if (_yr_re_fiber_exists(&fibers, fiber, fiber->prev))
1800
0
        _yr_re_fiber_kill(&fibers, &context->re_fiber_pool, fiber);
1801
1802
0
      fiber = next_fiber;
1803
0
    }
1804
1805
0
    fiber = fibers.head;
1806
1807
0
    while (fiber != NULL)
1808
0
    {
1809
0
      ip = fiber->ip;
1810
0
      action = ACTION_NONE;
1811
1812
0
      switch (*ip)
1813
0
      {
1814
0
      case RE_OPCODE_ANY:
1815
0
        prolog;
1816
0
        match = (flags & RE_FLAGS_DOT_ALL) || (*input != 0x0A);
1817
0
        action = match ? ACTION_NONE : ACTION_KILL;
1818
0
        fiber->ip += 1;
1819
0
        break;
1820
1821
0
      case RE_OPCODE_REPEAT_ANY_GREEDY:
1822
0
      case RE_OPCODE_REPEAT_ANY_UNGREEDY:
1823
0
        prolog;
1824
0
        match = (flags & RE_FLAGS_DOT_ALL) || (*input != 0x0A);
1825
0
        action = match ? ACTION_NONE : ACTION_KILL;
1826
1827
        // The instruction pointer is not incremented here. The current fiber
1828
        // spins in this instruction until reaching the required number of
1829
        // repetitions. The code controlling the number of repetitions is in
1830
        // _yr_re_fiber_sync.
1831
1832
0
        break;
1833
1834
0
      case RE_OPCODE_LITERAL:
1835
0
        prolog;
1836
0
        if (flags & RE_FLAGS_NO_CASE)
1837
0
          match = yr_lowercase[*input] == yr_lowercase[*(ip + 1)];
1838
0
        else
1839
0
          match = (*input == *(ip + 1));
1840
0
        action = match ? ACTION_NONE : ACTION_KILL;
1841
0
        fiber->ip += 2;
1842
0
        break;
1843
1844
0
      case RE_OPCODE_NOT_LITERAL:
1845
0
        prolog;
1846
1847
        // We don't need to take into account the case-insensitive
1848
        // case because this opcode is only used with hex strings,
1849
        // which can't be case-insensitive.
1850
1851
0
        match = (*input != *(ip + 1));
1852
0
        action = match ? ACTION_NONE : ACTION_KILL;
1853
0
        fiber->ip += 2;
1854
0
        break;
1855
1856
0
      case RE_OPCODE_MASKED_LITERAL:
1857
0
        prolog;
1858
0
        opcode_args = yr_unaligned_u16(ip + 1);
1859
0
        mask = opcode_args >> 8;
1860
0
        value = opcode_args & 0xFF;
1861
1862
        // We don't need to take into account the case-insensitive
1863
        // case because this opcode is only used with hex strings,
1864
        // which can't be case-insensitive.
1865
1866
0
        match = ((*input & mask) == value);
1867
0
        action = match ? ACTION_NONE : ACTION_KILL;
1868
0
        fiber->ip += 3;
1869
0
        break;
1870
1871
0
      case RE_OPCODE_MASKED_NOT_LITERAL:
1872
0
        prolog;
1873
0
        opcode_args = yr_unaligned_u16(ip + 1);
1874
0
        mask = opcode_args >> 8;
1875
0
        value = opcode_args & 0xFF;
1876
1877
        // We don't need to take into account the case-insensitive
1878
        // case because this opcode is only used with hex strings,
1879
        // which can't be case-insensitive.
1880
1881
0
        match = ((*input & mask) != value);
1882
0
        action = match ? ACTION_NONE : ACTION_KILL;
1883
0
        fiber->ip += 3;
1884
0
        break;
1885
1886
0
      case RE_OPCODE_CLASS:
1887
0
        prolog;
1888
0
        match = _yr_re_is_char_in_class(
1889
0
            (RE_CLASS*) (ip + 1), *input, flags & RE_FLAGS_NO_CASE);
1890
0
        action = match ? ACTION_NONE : ACTION_KILL;
1891
0
        fiber->ip += (sizeof(RE_CLASS) + 1);
1892
0
        break;
1893
1894
0
      case RE_OPCODE_WORD_CHAR:
1895
0
        prolog;
1896
0
        match = _yr_re_is_word_char(input, character_size);
1897
0
        action = match ? ACTION_NONE : ACTION_KILL;
1898
0
        fiber->ip += 1;
1899
0
        break;
1900
1901
0
      case RE_OPCODE_NON_WORD_CHAR:
1902
0
        prolog;
1903
0
        match = !_yr_re_is_word_char(input, character_size);
1904
0
        action = match ? ACTION_NONE : ACTION_KILL;
1905
0
        fiber->ip += 1;
1906
0
        break;
1907
1908
0
      case RE_OPCODE_SPACE:
1909
0
      case RE_OPCODE_NON_SPACE:
1910
1911
0
        prolog;
1912
1913
0
        switch (*input)
1914
0
        {
1915
0
        case ' ':
1916
0
        case '\t':
1917
0
        case '\r':
1918
0
        case '\n':
1919
0
        case '\v':
1920
0
        case '\f':
1921
0
          match = true;
1922
0
          break;
1923
0
        default:
1924
0
          match = false;
1925
0
        }
1926
1927
0
        if (*ip == RE_OPCODE_NON_SPACE)
1928
0
          match = !match;
1929
1930
0
        action = match ? ACTION_NONE : ACTION_KILL;
1931
0
        fiber->ip += 1;
1932
0
        break;
1933
1934
0
      case RE_OPCODE_DIGIT:
1935
0
        prolog;
1936
0
        match = isdigit(*input);
1937
0
        action = match ? ACTION_NONE : ACTION_KILL;
1938
0
        fiber->ip += 1;
1939
0
        break;
1940
1941
0
      case RE_OPCODE_NON_DIGIT:
1942
0
        prolog;
1943
0
        match = !isdigit(*input);
1944
0
        action = match ? ACTION_NONE : ACTION_KILL;
1945
0
        fiber->ip += 1;
1946
0
        break;
1947
1948
0
      case RE_OPCODE_WORD_BOUNDARY:
1949
0
      case RE_OPCODE_NON_WORD_BOUNDARY:
1950
0
        if (input - input_incr + character_size <=
1951
0
                input_data + input_forwards_size &&
1952
0
            input - input_incr >= input_data - input_backwards_size)
1953
0
        {
1954
0
          prev_is_word_char = _yr_re_is_word_char(
1955
0
              input - input_incr, character_size);
1956
0
        }
1957
0
        else
1958
0
        {
1959
0
          prev_is_word_char = false;
1960
0
        }
1961
1962
0
        if (input + character_size <= input_data + input_forwards_size &&
1963
0
            input >= input_data - input_backwards_size)
1964
0
        {
1965
0
          input_is_word_char = _yr_re_is_word_char(input, character_size);
1966
0
        }
1967
0
        else
1968
0
        {
1969
0
          input_is_word_char = false;
1970
0
        }
1971
1972
0
        match = (prev_is_word_char && !input_is_word_char) ||
1973
0
                (!prev_is_word_char && input_is_word_char);
1974
1975
0
        if (*ip == RE_OPCODE_NON_WORD_BOUNDARY)
1976
0
          match = !match;
1977
1978
0
        action = match ? ACTION_CONTINUE : ACTION_KILL;
1979
0
        fiber->ip += 1;
1980
0
        break;
1981
1982
0
      case RE_OPCODE_MATCH_AT_START:
1983
0
        if (flags & RE_FLAGS_BACKWARDS)
1984
0
          kill = input_backwards_size > (size_t) bytes_matched;
1985
0
        else
1986
0
          kill = input_backwards_size > 0 || (bytes_matched != 0);
1987
0
        action = kill ? ACTION_KILL : ACTION_CONTINUE;
1988
0
        fiber->ip += 1;
1989
0
        break;
1990
1991
0
      case RE_OPCODE_MATCH_AT_END:
1992
0
        kill = flags & RE_FLAGS_BACKWARDS ||
1993
0
               input_forwards_size > (size_t) bytes_matched;
1994
0
        action = kill ? ACTION_KILL : ACTION_CONTINUE;
1995
0
        fiber->ip += 1;
1996
0
        break;
1997
1998
0
      case RE_OPCODE_MATCH:
1999
2000
0
        if (matches != NULL)
2001
0
          *matches = bytes_matched;
2002
2003
0
        if (flags & RE_FLAGS_EXHAUSTIVE)
2004
0
        {
2005
0
          if (callback != NULL)
2006
0
          {
2007
0
            if (flags & RE_FLAGS_BACKWARDS)
2008
0
            {
2009
0
              FAIL_ON_ERROR_WITH_CLEANUP(
2010
0
                  callback(
2011
0
                      input + character_size,
2012
0
                      bytes_matched,
2013
0
                      flags,
2014
0
                      callback_args),
2015
0
                  _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2016
0
            }
2017
0
            else
2018
0
            {
2019
0
              FAIL_ON_ERROR_WITH_CLEANUP(
2020
0
                  callback(input_data, bytes_matched, flags, callback_args),
2021
0
                  _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2022
0
            }
2023
0
          }
2024
2025
0
          action = ACTION_KILL;
2026
0
        }
2027
0
        else
2028
0
        {
2029
0
          action = ACTION_KILL_TAIL;
2030
0
        }
2031
2032
0
        break;
2033
2034
0
      default:
2035
0
        assert(false);
2036
0
      }
2037
2038
0
      switch (action)
2039
0
      {
2040
0
      case ACTION_KILL:
2041
0
        fiber = _yr_re_fiber_kill(&fibers, &context->re_fiber_pool, fiber);
2042
0
        break;
2043
2044
0
      case ACTION_KILL_TAIL:
2045
0
        _yr_re_fiber_kill_tail(&fibers, &context->re_fiber_pool, fiber);
2046
0
        fiber = NULL;
2047
0
        break;
2048
2049
0
      case ACTION_CONTINUE:
2050
0
        FAIL_ON_ERROR_WITH_CLEANUP(
2051
0
            _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
2052
0
            _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2053
0
        break;
2054
2055
0
      default:
2056
0
        next_fiber = fiber->next;
2057
0
        FAIL_ON_ERROR_WITH_CLEANUP(
2058
0
            _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
2059
0
            _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2060
0
        fiber = next_fiber;
2061
0
      }
2062
0
    }
2063
2064
0
    input += input_incr;
2065
0
    bytes_matched += character_size;
2066
2067
0
    if (flags & RE_FLAGS_SCAN && bytes_matched < max_bytes_matched)
2068
0
    {
2069
0
      FAIL_ON_ERROR_WITH_CLEANUP(
2070
0
          _yr_re_fiber_create(&context->re_fiber_pool, &fiber),
2071
0
          _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2072
2073
0
      fiber->ip = code;
2074
0
      _yr_re_fiber_append(&fibers, fiber);
2075
2076
0
      FAIL_ON_ERROR_WITH_CLEANUP(
2077
0
          _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
2078
0
          _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2079
0
    }
2080
0
  }
2081
2082
0
  return ERROR_SUCCESS;
2083
0
}
2084
2085
////////////////////////////////////////////////////////////////////////////////
2086
// Helper function that creates a RE_FAST_EXEC_POSITION by either allocating it
2087
// or reusing a previously allocated one from a pool.
2088
//
2089
static int _yr_re_fast_exec_position_create(
2090
    RE_FAST_EXEC_POSITION_POOL* pool,
2091
    RE_FAST_EXEC_POSITION** new_position)
2092
0
{
2093
0
  RE_FAST_EXEC_POSITION* position;
2094
2095
0
  if (pool->head != NULL)
2096
0
  {
2097
0
    position = pool->head;
2098
0
    pool->head = position->next;
2099
0
  }
2100
0
  else
2101
0
  {
2102
0
    position = (RE_FAST_EXEC_POSITION*) yr_malloc(
2103
0
        sizeof(RE_FAST_EXEC_POSITION));
2104
2105
0
    if (position == NULL)
2106
0
      return ERROR_INSUFFICIENT_MEMORY;
2107
0
  }
2108
2109
0
  position->input = NULL;
2110
0
  position->round = 0;
2111
0
  position->next = NULL;
2112
0
  position->prev = NULL;
2113
2114
0
  *new_position = position;
2115
2116
0
  return ERROR_SUCCESS;
2117
0
}
2118
2119
////////////////////////////////////////////////////////////////////////////////
2120
// Helper function that moves a list of RE_FAST_EXEC_POSITION structures to a
2121
// pool represented by a RE_FAST_EXEC_POSITION_POOL. Receives pointers to the
2122
// pool, the first item, and last item in the list.
2123
//
2124
static void _yr_re_fast_exec_destroy_position_list(
2125
    RE_FAST_EXEC_POSITION_POOL* pool,
2126
    RE_FAST_EXEC_POSITION* first,
2127
    RE_FAST_EXEC_POSITION* last)
2128
0
{
2129
0
  last->next = pool->head;
2130
2131
0
  if (pool->head != NULL)
2132
0
    pool->head->prev = last;
2133
2134
0
  pool->head = first;
2135
0
}
2136
2137
////////////////////////////////////////////////////////////////////////////////
2138
// This function replaces yr_re_exec for regular expressions marked with flag
2139
// RE_FLAGS_FAST_REGEXP. These regular expressions are derived from hex strings
2140
// that don't contain alternatives, like for example:
2141
//
2142
//   { 01 02 03 04 [0-2] 04 05 06 07 }
2143
//
2144
// The regexp's code produced by such strings can be matched by a faster, less
2145
// general algorithm, and it contains only the following opcodes:
2146
//
2147
//   * RE_OPCODE_LITERAL
2148
//   * RE_OPCODE_MASKED_LITERAL,
2149
//   * RE_OPCODE_NOT_LITERAL
2150
//   * RE_OPCODE_MASKED_NOT_LITERAL
2151
//   * RE_OPCODE_ANY
2152
//   * RE_OPCODE_REPEAT_ANY_UNGREEDY
2153
//   * RE_OPCODE_MATCH.
2154
//
2155
// The regexp's code is executed one instruction at time, and the function
2156
// maintains a list of positions within the input for tracking different
2157
// matching alternatives, these positions are described by an instance of
2158
// RE_FAST_EXEC_POSITION. With each instruction the list of positions is
2159
// updated, removing those where the input data doesn't match the current
2160
// instruction, or adding new positions if the instruction is
2161
// RE_OPCODE_REPEAT_ANY_UNGREEDY. The list of positions is maintained sorted
2162
// by the value of the pointer they hold to the input, and it doesn't contain
2163
// duplicated pointer values.
2164
//
2165
int yr_re_fast_exec(
2166
    YR_SCAN_CONTEXT* context,
2167
    const uint8_t* code,
2168
    const uint8_t* input_data,
2169
    size_t input_forwards_size,
2170
    size_t input_backwards_size,
2171
    int flags,
2172
    RE_MATCH_CALLBACK_FUNC callback,
2173
    void* callback_args,
2174
    int* matches)
2175
0
{
2176
0
  RE_REPEAT_ANY_ARGS* repeat_any_args;
2177
2178
  // Pointers to the first and last position in the list.
2179
0
  RE_FAST_EXEC_POSITION* first;
2180
0
  RE_FAST_EXEC_POSITION* last;
2181
2182
0
  int input_incr = flags & RE_FLAGS_BACKWARDS ? -1 : 1;
2183
0
  int bytes_matched;
2184
0
  int max_bytes_matched;
2185
2186
0
  if (flags & RE_FLAGS_BACKWARDS)
2187
0
    max_bytes_matched = (int) yr_min(input_backwards_size, YR_RE_SCAN_LIMIT);
2188
0
  else
2189
0
    max_bytes_matched = (int) yr_min(input_forwards_size, YR_RE_SCAN_LIMIT);
2190
2191
0
  const uint8_t* ip = code;
2192
2193
  // Create the first position in the list, which points to the start of the
2194
  // input data. Intially this is the only position, more positions will be
2195
  // created every time RE_OPCODE_REPEAT_ANY_UNGREEDY is executed.
2196
0
  FAIL_ON_ERROR(_yr_re_fast_exec_position_create(
2197
0
      &context->re_fast_exec_position_pool, &first));
2198
2199
0
  first->round = 0;
2200
0
  first->input = input_data;
2201
0
  first->prev = NULL;
2202
0
  first->next = NULL;
2203
2204
0
  if (flags & RE_FLAGS_BACKWARDS)
2205
0
    first->input--;
2206
2207
  // As we are starting with a single position, the last one and the first one
2208
  // are the same.
2209
0
  last = first;
2210
2211
  // Round is incremented with every regxp instruction.
2212
0
  int round = 0;
2213
2214
  // Keep in the loop until the list of positions gets empty. Positions are
2215
  // removed from the list when they don't match the current instruction in
2216
  // the code.
2217
0
  while (first != NULL)
2218
0
  {
2219
0
    RE_FAST_EXEC_POSITION* current = first;
2220
2221
    // Iterate over all the positions, executing the current instruction at
2222
    // all of them.
2223
0
    while (current != NULL)
2224
0
    {
2225
0
      RE_FAST_EXEC_POSITION* next = current->next;
2226
2227
      // Ignore any position in the list whose round number is different from
2228
      // the current round. This prevents new positions added to the list in
2229
      // this round from being taken into account in this same round.
2230
0
      if (current->round != round)
2231
0
      {
2232
0
        current = next;
2233
0
        continue;
2234
0
      }
2235
2236
0
      bytes_matched = flags & RE_FLAGS_BACKWARDS
2237
0
                          ? (int) (input_data - current->input - 1)
2238
0
                          : (int) (current->input - input_data);
2239
2240
0
      uint16_t opcode_args;
2241
0
      uint8_t mask;
2242
0
      uint8_t value;
2243
2244
0
      bool match = false;
2245
2246
0
      switch (*ip)
2247
0
      {
2248
0
      case RE_OPCODE_ANY:
2249
0
        if (bytes_matched >= max_bytes_matched)
2250
0
          break;
2251
2252
0
        match = true;
2253
0
        current->input += input_incr;
2254
0
        break;
2255
2256
0
      case RE_OPCODE_LITERAL:
2257
0
        if (bytes_matched >= max_bytes_matched)
2258
0
          break;
2259
2260
0
        if (*current->input == *(ip + 1))
2261
0
        {
2262
0
          match = true;
2263
0
          current->input += input_incr;
2264
0
        }
2265
0
        break;
2266
2267
0
      case RE_OPCODE_NOT_LITERAL:
2268
0
        if (bytes_matched >= max_bytes_matched)
2269
0
          break;
2270
2271
0
        if (*current->input != *(ip + 1))
2272
0
        {
2273
0
          match = true;
2274
0
          current->input += input_incr;
2275
0
        }
2276
0
        break;
2277
2278
0
      case RE_OPCODE_MASKED_LITERAL:
2279
0
        if (bytes_matched >= max_bytes_matched)
2280
0
          break;
2281
2282
0
        opcode_args = yr_unaligned_u16(ip + 1);
2283
0
        mask = opcode_args >> 8;
2284
0
        value = opcode_args & 0xFF;
2285
2286
0
        if ((*current->input & mask) == value)
2287
0
        {
2288
0
          match = true;
2289
0
          current->input += input_incr;
2290
0
        }
2291
0
        break;
2292
2293
0
      case RE_OPCODE_MASKED_NOT_LITERAL:
2294
0
        if (bytes_matched >= max_bytes_matched)
2295
0
          break;
2296
2297
0
        opcode_args = yr_unaligned_u16(ip + 1);
2298
0
        mask = opcode_args >> 8;
2299
0
        value = opcode_args & 0xFF;
2300
2301
0
        if ((*current->input & mask) != value)
2302
0
        {
2303
0
          match = true;
2304
0
          current->input += input_incr;
2305
0
        }
2306
0
        break;
2307
2308
0
      case RE_OPCODE_REPEAT_ANY_UNGREEDY:
2309
0
        repeat_any_args = (RE_REPEAT_ANY_ARGS*) (ip + 1);
2310
2311
0
        if (bytes_matched + repeat_any_args->min >= max_bytes_matched)
2312
0
          break;
2313
2314
0
        match = true;
2315
2316
0
        const uint8_t* next_opcode = ip + 1 + sizeof(RE_REPEAT_ANY_ARGS);
2317
2318
        // insertion_point indicates the item in the list of inputs after which
2319
        // the newly created inputs will be inserted.
2320
0
        RE_FAST_EXEC_POSITION* insertion_point = current;
2321
2322
0
        for (int j = repeat_any_args->min + 1; j <= repeat_any_args->max; j++)
2323
0
        {
2324
          // bytes_matched is the number of bytes matched so far, and j is the
2325
          // minimum number of bytes that are still pending to match. If these
2326
          // two numbers sum more than the maximum number of bytes that can be
2327
          // matched the loop can be aborted. The position that we were about
2328
          // to create can't lead to a match without exceeding
2329
          // max_bytes_matched.
2330
0
          if (bytes_matched + j >= max_bytes_matched)
2331
0
            break;
2332
2333
0
          const uint8_t* next_input = current->input + j * input_incr;
2334
2335
          // Find the point where next_input should be inserted. The list is
2336
          // kept sorted by pointer in increasing order, the insertion point is
2337
          // an item that has a pointer lower or equal than next_input, but
2338
          // whose next item have a pointer that is larger.
2339
0
          while (insertion_point->next != NULL &&
2340
0
                 insertion_point->next->input <= next_input)
2341
0
          {
2342
0
            insertion_point = insertion_point->next;
2343
0
          }
2344
2345
          // If the input already exists for the next round, we don't need to
2346
          // insert it.
2347
0
          if (insertion_point->round == round + 1 &&
2348
0
              insertion_point->input == next_input)
2349
0
            continue;
2350
2351
          // The next opcode is RE_OPCODE_LITERAL, but the literal doesn't
2352
          // match with the byte at next_input, we don't need to add this
2353
          // input to the list as we already know that it won't match in the
2354
          // next round and will be deleted from the list anyways.
2355
0
          if (*(next_opcode) == RE_OPCODE_LITERAL &&
2356
0
              *(next_opcode + 1) != *next_input)
2357
0
            continue;
2358
2359
0
          RE_FAST_EXEC_POSITION* new_input;
2360
2361
0
          FAIL_ON_ERROR_WITH_CLEANUP(
2362
0
              _yr_re_fast_exec_position_create(
2363
0
                  &context->re_fast_exec_position_pool, &new_input),
2364
              // Cleanup
2365
0
              _yr_re_fast_exec_destroy_position_list(
2366
0
                  &context->re_fast_exec_position_pool, first, last));
2367
2368
0
          new_input->input = next_input;
2369
0
          new_input->round = round + 1;
2370
0
          new_input->prev = insertion_point;
2371
0
          new_input->next = insertion_point->next;
2372
0
          insertion_point->next = new_input;
2373
2374
0
          if (new_input->next != NULL)
2375
0
            new_input->next->prev = new_input;
2376
2377
0
          if (insertion_point == last)
2378
0
            last = new_input;
2379
0
        }
2380
2381
0
        current->input += input_incr * repeat_any_args->min;
2382
0
        break;
2383
2384
0
      case RE_OPCODE_MATCH:
2385
2386
0
        if (flags & RE_FLAGS_EXHAUSTIVE)
2387
0
        {
2388
0
          FAIL_ON_ERROR_WITH_CLEANUP(
2389
0
              callback(
2390
                  // When matching forwards the matching data always starts at
2391
                  // input_data, when matching backwards it starts at input + 1
2392
                  // or input_data - input_backwards_size if input + 1 is
2393
                  // outside the buffer.
2394
0
                  flags & RE_FLAGS_BACKWARDS
2395
0
                      ? yr_max(
2396
0
                            current->input + 1,
2397
0
                            input_data - input_backwards_size)
2398
0
                      : input_data,
2399
                  // The number of matched bytes should not be larger than
2400
                  // max_bytes_matched.
2401
0
                  yr_min(bytes_matched, max_bytes_matched),
2402
0
                  flags,
2403
0
                  callback_args),
2404
              // Cleanup
2405
0
              _yr_re_fast_exec_destroy_position_list(
2406
0
                  &context->re_fast_exec_position_pool, first, last));
2407
0
        }
2408
0
        else
2409
0
        {
2410
0
          if (matches != NULL)
2411
0
            *matches = bytes_matched;
2412
2413
0
          _yr_re_fast_exec_destroy_position_list(
2414
0
              &context->re_fast_exec_position_pool, first, last);
2415
2416
0
          return ERROR_SUCCESS;
2417
0
        }
2418
0
        break;
2419
2420
0
      default:
2421
0
        printf("non-supported opcode: %d\n", *ip);
2422
0
        assert(false);
2423
0
      }
2424
2425
0
      if (match)
2426
0
      {
2427
0
        current->round = round + 1;
2428
0
      }
2429
0
      else
2430
0
      {
2431
        // Remove current from the list. If the item being removed is the first
2432
        // one or the last one, the first and last pointers should be updated.
2433
        // The removed item is put into the pool for later reuse.
2434
2435
0
        if (current == first)
2436
0
          first = current->next;
2437
2438
0
        if (current == last)
2439
0
          last = current->prev;
2440
2441
0
        if (current->prev != NULL)
2442
0
          current->prev->next = current->next;
2443
2444
0
        if (current->next != NULL)
2445
0
          current->next->prev = current->prev;
2446
2447
0
        current->prev = NULL;
2448
0
        current->next = context->re_fast_exec_position_pool.head;
2449
2450
0
        if (current->next != NULL)
2451
0
          current->next->prev = current;
2452
2453
0
        context->re_fast_exec_position_pool.head = current;
2454
0
      }
2455
2456
0
      current = next;
2457
2458
0
    }  // while (current != NULL)
2459
2460
    // Move instruction pointer (ip) to next instruction. Each opcode has
2461
    // a different size.
2462
0
    switch (*ip)
2463
0
    {
2464
0
    case RE_OPCODE_ANY:
2465
0
      ip += 1;
2466
0
      break;
2467
0
    case RE_OPCODE_LITERAL:
2468
0
    case RE_OPCODE_NOT_LITERAL:
2469
0
      ip += 2;
2470
0
      break;
2471
0
    case RE_OPCODE_MASKED_LITERAL:
2472
0
    case RE_OPCODE_MASKED_NOT_LITERAL:
2473
0
      ip += 3;
2474
0
      break;
2475
0
    case RE_OPCODE_REPEAT_ANY_UNGREEDY:
2476
0
      ip += 1 + sizeof(RE_REPEAT_ANY_ARGS);
2477
0
      break;
2478
0
    case RE_OPCODE_MATCH:
2479
0
      break;
2480
0
    default:
2481
0
      assert(false);
2482
0
    }
2483
2484
0
    round++;
2485
0
  }
2486
2487
  // If this point is reached no matches were found.
2488
2489
0
  if (matches != NULL)
2490
0
    *matches = -1;
2491
2492
0
  return ERROR_SUCCESS;
2493
0
}
2494
2495
static void _yr_re_print_node(RE_NODE* re_node, uint32_t indent)
2496
0
{
2497
0
  RE_NODE* child;
2498
0
  int i;
2499
2500
0
  if (re_node == NULL)
2501
0
    return;
2502
2503
0
  if (indent > 0)
2504
0
    printf("\n%*s", indent, " ");
2505
0
  switch (re_node->type)
2506
0
  {
2507
0
  case RE_NODE_ALT:
2508
0
    printf("Alt(");
2509
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2510
0
    printf(",");
2511
0
    _yr_re_print_node(re_node->children_tail, indent + 4);
2512
0
    printf("\n%*s%s", indent, " ", ")");
2513
0
    break;
2514
2515
0
  case RE_NODE_CONCAT:
2516
0
    printf("Cat(");
2517
0
    child = re_node->children_head;
2518
0
    while (child != NULL)
2519
0
    {
2520
0
      _yr_re_print_node(child, indent + 4);
2521
0
      printf(",");
2522
0
      child = child->next_sibling;
2523
0
    }
2524
0
    printf("\n%*s%s", indent, " ", ")");
2525
0
    break;
2526
2527
0
  case RE_NODE_STAR:
2528
0
    printf("Star(");
2529
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2530
0
    printf(")");
2531
0
    break;
2532
2533
0
  case RE_NODE_PLUS:
2534
0
    printf("Plus(");
2535
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2536
0
    printf(")");
2537
0
    break;
2538
2539
0
  case RE_NODE_LITERAL:
2540
0
    printf("Lit(%c)", re_node->value);
2541
0
    break;
2542
2543
0
  case RE_NODE_NOT_LITERAL:
2544
0
    printf("NotLit(%c)", re_node->value);
2545
0
    break;
2546
2547
0
  case RE_NODE_MASKED_LITERAL:
2548
0
    printf("MaskedLit(%02X,%02X)", re_node->value, re_node->mask);
2549
0
    break;
2550
2551
0
  case RE_NODE_WORD_CHAR:
2552
0
    printf("WordChar");
2553
0
    break;
2554
2555
0
  case RE_NODE_NON_WORD_CHAR:
2556
0
    printf("NonWordChar");
2557
0
    break;
2558
2559
0
  case RE_NODE_SPACE:
2560
0
    printf("Space");
2561
0
    break;
2562
2563
0
  case RE_NODE_NON_SPACE:
2564
0
    printf("NonSpace");
2565
0
    break;
2566
2567
0
  case RE_NODE_DIGIT:
2568
0
    printf("Digit");
2569
0
    break;
2570
2571
0
  case RE_NODE_NON_DIGIT:
2572
0
    printf("NonDigit");
2573
0
    break;
2574
2575
0
  case RE_NODE_ANY:
2576
0
    printf("Any");
2577
0
    break;
2578
2579
0
  case RE_NODE_RANGE:
2580
0
    printf("Range(%d-%d, ", re_node->start, re_node->end);
2581
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2582
0
    printf("\n%*s%s", indent, " ", ")");
2583
0
    break;
2584
2585
0
  case RE_NODE_CLASS:
2586
0
    printf("Class(");
2587
0
    for (i = 0; i < 256; i++)
2588
0
      if (_yr_re_is_char_in_class(re_node->re_class, i, false))
2589
0
        printf("%02X,", i);
2590
0
    printf(")");
2591
0
    break;
2592
2593
0
  case RE_NODE_EMPTY:
2594
0
    printf("Empty");
2595
0
    break;
2596
2597
0
  case RE_NODE_ANCHOR_START:
2598
0
    printf("AnchorStart");
2599
0
    break;
2600
2601
0
  case RE_NODE_ANCHOR_END:
2602
0
    printf("AnchorEnd");
2603
0
    break;
2604
2605
0
  case RE_NODE_WORD_BOUNDARY:
2606
0
    printf("WordBoundary");
2607
0
    break;
2608
2609
0
  case RE_NODE_NON_WORD_BOUNDARY:
2610
0
    printf("NonWordBoundary");
2611
0
    break;
2612
2613
0
  case RE_NODE_RANGE_ANY:
2614
0
    printf("RangeAny");
2615
0
    break;
2616
2617
0
  default:
2618
0
    printf("???");
2619
0
    break;
2620
0
  }
2621
0
}
2622
2623
void yr_re_print(RE_AST* re_ast)
2624
0
{
2625
0
  _yr_re_print_node(re_ast->root_node, 0);
2626
0
}