Coverage Report

Created: 2026-01-17 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/yara/libyara/re.c
Line
Count
Source
1
/*
2
Copyright (c) 2013. The YARA Authors. All Rights Reserved.
3
4
Redistribution and use in source and binary forms, with or without modification,
5
are permitted provided that the following conditions are met:
6
7
1. Redistributions of source code must retain the above copyright notice, this
8
list of conditions and the following disclaimer.
9
10
2. Redistributions in binary form must reproduce the above copyright notice,
11
this list of conditions and the following disclaimer in the documentation and/or
12
other materials provided with the distribution.
13
14
3. Neither the name of the copyright holder nor the names of its contributors
15
may be used to endorse or promote products derived from this software without
16
specific prior written permission.
17
18
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
*/
29
30
/*
31
32
This module implements a regular expressions engine based on Thompson's
33
algorithm as described by Russ Cox in http://swtch.com/~rsc/regexp/regexp2.html.
34
35
What the article names a "thread" has been named a "fiber" in this code, in
36
order to avoid confusion with operating system threads.
37
38
*/
39
40
#include <assert.h>
41
#include <string.h>
42
#include <yara/compiler.h>
43
#include <yara/error.h>
44
#include <yara/globals.h>
45
#include <yara/hex_lexer.h>
46
#include <yara/limits.h>
47
#include <yara/mem.h>
48
#include <yara/re.h>
49
#include <yara/re_lexer.h>
50
#include <yara/strutils.h>
51
#include <yara/threading.h>
52
#include <yara/unaligned.h>
53
#include <yara/utils.h>
54
55
540M
#define EMIT_BACKWARDS               0x01
56
313M
#define EMIT_DONT_SET_FORWARDS_CODE  0x02
57
6.50M
#define EMIT_DONT_SET_BACKWARDS_CODE 0x04
58
59
#ifndef INT16_MAX
60
#define INT16_MAX (32767)
61
#endif
62
63
typedef uint8_t RE_SPLIT_ID_TYPE;
64
65
// RE_REPEAT_ARGS and RE_REPEAT_ANY_ARGS are structures that are embedded in
66
// the regexp's instruction stream. As such, they are not always aligned to
67
// 8-byte boundaries, so they need to be "packed" structures in order to prevent
68
// issues due to unaligned memory accesses.
69
#pragma pack(push)
70
#pragma pack(1)
71
72
typedef struct _RE_REPEAT_ARGS
73
{
74
  uint16_t min;
75
  uint16_t max;
76
  int32_t offset;
77
78
} RE_REPEAT_ARGS;
79
80
typedef struct _RE_REPEAT_ANY_ARGS
81
{
82
  uint16_t min;
83
  uint16_t max;
84
85
} RE_REPEAT_ANY_ARGS;
86
87
#pragma pack(pop)
88
89
typedef struct _RE_EMIT_CONTEXT
90
{
91
  YR_ARENA* arena;
92
  RE_SPLIT_ID_TYPE next_split_id;
93
94
} RE_EMIT_CONTEXT;
95
96
0
#define CHAR_IN_CLASS(cls, chr) ((cls)[(chr) / 8] & 1 << ((chr) % 8))
97
98
static bool _yr_re_is_char_in_class(
99
    RE_CLASS* re_class,
100
    uint8_t chr,
101
    int case_insensitive)
102
0
{
103
0
  int result = CHAR_IN_CLASS(re_class->bitmap, chr);
104
105
0
  if (case_insensitive)
106
0
    result |= CHAR_IN_CLASS(re_class->bitmap, yr_altercase[chr]);
107
108
0
  if (re_class->negated)
109
0
    result = !result;
110
111
0
  return result;
112
0
}
113
114
static bool _yr_re_is_word_char(const uint8_t* input, uint8_t character_size)
115
0
{
116
0
  int result = ((yr_isalnum(input) || (*input) == '_'));
117
118
0
  if (character_size == 2)
119
0
    result = result && (*(input + 1) == 0);
120
121
0
  return result;
122
0
}
123
124
RE_NODE* yr_re_node_create(int type)
125
1.86M
{
126
1.86M
  RE_NODE* result = (RE_NODE*) yr_malloc(sizeof(RE_NODE));
127
128
1.86M
  if (result != NULL)
129
1.86M
  {
130
1.86M
    result->type = type;
131
1.86M
    result->children_head = NULL;
132
1.86M
    result->children_tail = NULL;
133
1.86M
    result->prev_sibling = NULL;
134
1.86M
    result->next_sibling = NULL;
135
1.86M
    result->greedy = true;
136
1.86M
    result->forward_code_ref = YR_ARENA_NULL_REF;
137
1.86M
    result->backward_code_ref = YR_ARENA_NULL_REF;
138
1.86M
  }
139
140
1.86M
  return result;
141
1.86M
}
142
143
void yr_re_node_destroy(RE_NODE* node)
144
1.86M
{
145
1.86M
  RE_NODE* child = node->children_head;
146
1.86M
  RE_NODE* next_child;
147
148
3.67M
  while (child != NULL)
149
1.81M
  {
150
1.81M
    next_child = child->next_sibling;
151
1.81M
    yr_re_node_destroy(child);
152
1.81M
    child = next_child;
153
1.81M
  }
154
155
1.86M
  if (node->type == RE_NODE_CLASS)
156
966
    yr_free(node->re_class);
157
158
1.86M
  yr_free(node);
159
1.86M
}
160
161
////////////////////////////////////////////////////////////////////////////////
162
// Appends a node to the end of the children list.
163
//
164
void yr_re_node_append_child(RE_NODE* node, RE_NODE* child)
165
1.82M
{
166
1.82M
  if (node->children_head == NULL)
167
122k
    node->children_head = child;
168
169
1.82M
  if (node->children_tail != NULL)
170
1.69M
    node->children_tail->next_sibling = child;
171
172
1.82M
  child->prev_sibling = node->children_tail;
173
1.82M
  node->children_tail = child;
174
1.82M
}
175
176
////////////////////////////////////////////////////////////////////////////////
177
// Appends a node to the beginning of the children list.
178
//
179
void yr_re_node_prepend_child(RE_NODE* node, RE_NODE* child)
180
1.85k
{
181
1.85k
  child->next_sibling = node->children_head;
182
183
1.85k
  if (node->children_head != NULL)
184
1.85k
    node->children_head->prev_sibling = child;
185
186
1.85k
  node->children_head = child;
187
188
1.85k
  if (node->children_tail == NULL)
189
0
    node->children_tail = child;
190
1.85k
}
191
192
int yr_re_ast_create(RE_AST** re_ast)
193
28.2k
{
194
28.2k
  *re_ast = (RE_AST*) yr_malloc(sizeof(RE_AST));
195
196
28.2k
  if (*re_ast == NULL)
197
0
    return ERROR_INSUFFICIENT_MEMORY;
198
199
28.2k
  (*re_ast)->flags = 0;
200
28.2k
  (*re_ast)->root_node = NULL;
201
202
28.2k
  return ERROR_SUCCESS;
203
28.2k
}
204
205
void yr_re_ast_destroy(RE_AST* re_ast)
206
28.2k
{
207
28.2k
  if (re_ast->root_node != NULL)
208
27.3k
    yr_re_node_destroy(re_ast->root_node);
209
210
28.2k
  yr_free(re_ast);
211
28.2k
}
212
213
////////////////////////////////////////////////////////////////////////////////
214
// Parses a regexp but don't emit its code. A further call to
215
// yr_re_ast_emit_code is required to get the code.
216
//
217
int yr_re_parse(
218
    const char* re_string,
219
    RE_AST** re_ast,
220
    RE_ERROR* error,
221
    int flags)
222
22.2k
{
223
22.2k
  return yr_parse_re_string(re_string, re_ast, error, flags);
224
22.2k
}
225
226
////////////////////////////////////////////////////////////////////////////////
227
// Parses a hex string but don't emit its code. A further call to
228
// yr_re_ast_emit_code is required to get the code.
229
//
230
int yr_re_parse_hex(const char* hex_string, RE_AST** re_ast, RE_ERROR* error)
231
1.15k
{
232
1.15k
  return yr_parse_hex_string(hex_string, re_ast, error);
233
1.15k
}
234
235
////////////////////////////////////////////////////////////////////////////////
236
// Parses the regexp and emit its code to the provided to the
237
// YR_RE_CODE_SECTION in the specified arena.
238
//
239
int yr_re_compile(
240
    const char* re_string,
241
    int flags,
242
    int parser_flags,
243
    YR_ARENA* arena,
244
    YR_ARENA_REF* ref,
245
    RE_ERROR* error)
246
1.53k
{
247
1.53k
  RE_AST* re_ast;
248
1.53k
  RE _re;
249
1.53k
  int result;
250
251
1.53k
  result = yr_re_parse(re_string, &re_ast, error, parser_flags);
252
1.53k
  if (result != ERROR_UNKNOWN_ESCAPE_SEQUENCE)
253
1.53k
    FAIL_ON_ERROR(result);
254
255
1.31k
  _re.flags = flags;
256
257
1.31k
  FAIL_ON_ERROR_WITH_CLEANUP(
258
1.31k
      yr_arena_write_data(arena, YR_RE_CODE_SECTION, &_re, sizeof(_re), ref),
259
1.31k
      yr_re_ast_destroy(re_ast));
260
261
1.31k
  FAIL_ON_ERROR_WITH_CLEANUP(
262
1.31k
      yr_re_ast_emit_code(re_ast, arena, false), yr_re_ast_destroy(re_ast));
263
264
1.28k
  yr_re_ast_destroy(re_ast);
265
266
1.28k
  return result;
267
1.31k
}
268
269
////////////////////////////////////////////////////////////////////////////////
270
// Verifies if the target string matches the pattern
271
//
272
// Args:
273
//    context: Scan context
274
//    re: A pointer to a compiled regexp
275
//    target: Target string
276
//
277
// Returns:
278
//    See return codes for yr_re_exec
279
//
280
int yr_re_match(YR_SCAN_CONTEXT* context, RE* re, const char* target)
281
0
{
282
0
  int result;
283
284
0
  yr_re_exec(
285
0
      context,
286
0
      re->code,
287
0
      (uint8_t*) target,
288
0
      strlen(target),
289
0
      0,
290
0
      re->flags | RE_FLAGS_SCAN,
291
0
      NULL,
292
0
      NULL,
293
0
      &result);
294
295
0
  return result;
296
0
}
297
298
////////////////////////////////////////////////////////////////////////////////
299
// Verifies if the provided regular expression is just a literal string
300
// like "abc", "12345", without any wildcard, operator, etc. In that case
301
// returns the string as a SIZED_STRING, or returns NULL if otherwise.
302
//
303
// The caller is responsible for deallocating the returned SIZED_STRING by
304
// calling yr_free.
305
//
306
SIZED_STRING* yr_re_ast_extract_literal(RE_AST* re_ast)
307
25.7k
{
308
25.7k
  SIZED_STRING* string;
309
25.7k
  RE_NODE* child;
310
311
25.7k
  int length = 0;
312
313
25.7k
  if (re_ast->root_node->type == RE_NODE_LITERAL)
314
205
  {
315
205
    length = 1;
316
205
  }
317
25.5k
  else if (re_ast->root_node->type == RE_NODE_CONCAT)
318
24.6k
  {
319
24.6k
    child = re_ast->root_node->children_tail;
320
321
116k
    while (child != NULL && child->type == RE_NODE_LITERAL)
322
91.9k
    {
323
91.9k
      length++;
324
91.9k
      child = child->prev_sibling;
325
91.9k
    }
326
327
24.6k
    if (child != NULL)
328
6.42k
      return NULL;
329
24.6k
  }
330
848
  else
331
848
  {
332
848
    return NULL;
333
848
  }
334
335
18.4k
  string = (SIZED_STRING*) yr_malloc(sizeof(SIZED_STRING) + length);
336
337
18.4k
  if (string == NULL)
338
0
    return NULL;
339
340
18.4k
  string->length = length;
341
18.4k
  string->flags = 0;
342
343
18.4k
  if (re_ast->root_node->type == RE_NODE_LITERAL)
344
205
  {
345
205
    string->c_string[0] = re_ast->root_node->value;
346
205
  }
347
18.2k
  else
348
18.2k
  {
349
18.2k
    child = re_ast->root_node->children_tail;
350
351
102k
    while (child != NULL)
352
84.4k
    {
353
84.4k
      string->c_string[--length] = child->value;
354
84.4k
      child = child->prev_sibling;
355
84.4k
    }
356
18.2k
  }
357
358
18.4k
  string->c_string[string->length] = '\0';
359
360
18.4k
  return string;
361
18.4k
}
362
363
int _yr_re_node_has_unbounded_quantifier_for_dot(RE_NODE* re_node)
364
148k
{
365
148k
  RE_NODE* child;
366
367
148k
  if ((re_node->type == RE_NODE_STAR || re_node->type == RE_NODE_PLUS) &&
368
866
      re_node->children_head->type == RE_NODE_ANY)
369
385
    return true;
370
371
148k
  if (re_node->type == RE_NODE_RANGE_ANY && re_node->end == RE_MAX_RANGE)
372
1.35k
    return true;
373
374
147k
  if (re_node->type == RE_NODE_CONCAT)
375
29.8k
  {
376
29.8k
    child = re_node->children_tail;
377
378
152k
    while (child != NULL)
379
127k
    {
380
127k
      if (_yr_re_node_has_unbounded_quantifier_for_dot(child))
381
5.02k
        return true;
382
383
122k
      child = child->prev_sibling;
384
122k
    }
385
29.8k
  }
386
387
142k
  return false;
388
147k
}
389
390
////////////////////////////////////////////////////////////////////////////////
391
// Detects the use of .*, .+ or .{x,} in a regexp. The use of wildcards with
392
// quantifiers that don't have a reasonably small upper bound causes a
393
// performance penalty. This function detects such cases in order to warn the
394
// user about this.
395
//
396
int yr_re_ast_has_unbounded_quantifier_for_dot(RE_AST* re_ast)
397
20.8k
{
398
20.8k
  return _yr_re_node_has_unbounded_quantifier_for_dot(re_ast->root_node);
399
20.8k
}
400
401
////////////////////////////////////////////////////////////////////////////////
402
// In some cases splitting a regular expression (or hex string) in two parts is
403
// convenient for increasing performance. This happens when the pattern contains
404
// a large gap (a.k.a jump), for example: { 01 02 03 [0-999] 04 05 06 }
405
// In this case the string is splitted in { 01 02 03 } and { 04 05 06 } where
406
// the latter is chained to the former. This means that { 01 02 03 } and
407
// { 04 05 06 } are handled as individual strings, and when both of them are
408
// found, YARA verifies if the distance between the matches complies with the
409
// [0-999] restriction.
410
//
411
// This function traverses a regexp's AST looking for nodes where it should be
412
// splitted. It must be noticed that this only applies to two-level ASTs (i.e.
413
// an AST consisting in a RE_NODE_CONCAT at the root where all the children are
414
// leaves).
415
//
416
// For example, { 01 02 03 [0-1000] 04 05 06 [500-2000] 07 08 09 } has the
417
// following AST:
418
//
419
// RE_NODE_CONCAT
420
// |
421
// |- RE_NODE_LITERAL (01)
422
// |- RE_NODE_LITERAL (02)
423
// |- RE_NODE_LITERAL (03)
424
// |- RE_NODE_RANGE_ANY (start=0, end=1000)
425
// |- RE_NODE_LITERAL (04)
426
// |- RE_NODE_LITERAL (05)
427
// |- RE_NODE_LITERAL (06)
428
// |- RE_NODE_RANGE_ANY (start=500, end=2000)
429
// |- RE_NODE_LITERAL (07)
430
// |- RE_NODE_LITERAL (08)
431
// |- RE_NODE_LITERAL (09)
432
//
433
// If the AST above is passed in the re_ast argument, it will be trimmed to:
434
//
435
// RE_NODE_CONCAT
436
// |
437
// |- RE_NODE_LITERAL (01)
438
// |- RE_NODE_LITERAL (02)
439
// |- RE_NODE_LITERAL (03)
440
//
441
// While remainder_re_ast will be:
442
//
443
// RE_NODE_CONCAT
444
// |
445
// |- RE_NODE_LITERAL (04)
446
// |- RE_NODE_LITERAL (05)
447
// |- RE_NODE_LITERAL (06)
448
// |- RE_NODE_RANGE_ANY (start=500, end=2000)
449
// |- RE_NODE_LITERAL (07)
450
// |- RE_NODE_LITERAL (08)
451
// |- RE_NODE_LITERAL (09)
452
//
453
// The caller is responsible for freeing the new AST in remainder_re_ast by
454
// calling yr_re_ast_destroy.
455
//
456
// The integers pointed to by min_gap and max_gap will be filled with the
457
// minimum and maximum gap size between the sub-strings represented by the
458
// two ASTs.
459
//
460
int yr_re_ast_split_at_chaining_point(
461
    RE_AST* re_ast,
462
    RE_AST** remainder_re_ast,
463
    int32_t* min_gap,
464
    int32_t* max_gap)
465
25.7k
{
466
25.7k
  RE_NODE* child;
467
25.7k
  RE_NODE* concat;
468
469
25.7k
  int result;
470
471
25.7k
  *remainder_re_ast = NULL;
472
25.7k
  *min_gap = 0;
473
25.7k
  *max_gap = 0;
474
475
25.7k
  if (re_ast->root_node->type != RE_NODE_CONCAT)
476
1.05k
    return ERROR_SUCCESS;
477
478
24.6k
  child = re_ast->root_node->children_head;
479
480
150k
  while (child != NULL)
481
131k
  {
482
131k
    if (!child->greedy && child->type == RE_NODE_RANGE_ANY &&
483
6.32k
        child->prev_sibling != NULL && child->next_sibling != NULL &&
484
5.28k
        (child->start > YR_STRING_CHAINING_THRESHOLD ||
485
4.69k
         child->end > YR_STRING_CHAINING_THRESHOLD))
486
4.82k
    {
487
4.82k
      result = yr_re_ast_create(remainder_re_ast);
488
489
4.82k
      if (result != ERROR_SUCCESS)
490
0
        return result;
491
492
4.82k
      concat = yr_re_node_create(RE_NODE_CONCAT);
493
494
4.82k
      if (concat == NULL)
495
0
        return ERROR_INSUFFICIENT_MEMORY;
496
497
4.82k
      concat->children_head = child->next_sibling;
498
4.82k
      concat->children_tail = re_ast->root_node->children_tail;
499
500
4.82k
      re_ast->root_node->children_tail = child->prev_sibling;
501
502
4.82k
      child->prev_sibling->next_sibling = NULL;
503
4.82k
      child->next_sibling->prev_sibling = NULL;
504
505
4.82k
      *min_gap = child->start;
506
4.82k
      *max_gap = child->end;
507
508
4.82k
      (*remainder_re_ast)->root_node = concat;
509
4.82k
      (*remainder_re_ast)->flags = re_ast->flags;
510
511
4.82k
      yr_re_node_destroy(child);
512
513
4.82k
      return ERROR_SUCCESS;
514
4.82k
    }
515
516
126k
    child = child->next_sibling;
517
126k
  }
518
519
19.8k
  return ERROR_SUCCESS;
520
24.6k
}
521
522
int _yr_emit_inst(
523
    RE_EMIT_CONTEXT* emit_context,
524
    uint8_t opcode,
525
    YR_ARENA_REF* instruction_ref)
526
87.5M
{
527
87.5M
  FAIL_ON_ERROR(yr_arena_write_data(
528
87.5M
      emit_context->arena,
529
87.5M
      YR_RE_CODE_SECTION,
530
87.5M
      &opcode,
531
87.5M
      sizeof(uint8_t),
532
87.5M
      instruction_ref));
533
534
87.5M
  return ERROR_SUCCESS;
535
87.5M
}
536
537
int _yr_emit_inst_arg_uint8(
538
    RE_EMIT_CONTEXT* emit_context,
539
    uint8_t opcode,
540
    uint8_t argument,
541
    YR_ARENA_REF* instruction_ref,
542
    YR_ARENA_REF* argument_ref)
543
162M
{
544
162M
  FAIL_ON_ERROR(yr_arena_write_data(
545
162M
      emit_context->arena,
546
162M
      YR_RE_CODE_SECTION,
547
162M
      &opcode,
548
162M
      sizeof(uint8_t),
549
162M
      instruction_ref));
550
551
162M
  FAIL_ON_ERROR(yr_arena_write_data(
552
162M
      emit_context->arena,
553
162M
      YR_RE_CODE_SECTION,
554
162M
      &argument,
555
162M
      sizeof(uint8_t),
556
162M
      argument_ref));
557
558
162M
  return ERROR_SUCCESS;
559
162M
}
560
561
int _yr_emit_inst_arg_uint16(
562
    RE_EMIT_CONTEXT* emit_context,
563
    uint8_t opcode,
564
    uint16_t argument,
565
    YR_ARENA_REF* instruction_ref,
566
    YR_ARENA_REF* argument_ref)
567
5.55k
{
568
5.55k
  FAIL_ON_ERROR(yr_arena_write_data(
569
5.55k
      emit_context->arena,
570
5.55k
      YR_RE_CODE_SECTION,
571
5.55k
      &opcode,
572
5.55k
      sizeof(uint8_t),
573
5.55k
      instruction_ref));
574
575
5.55k
  FAIL_ON_ERROR(yr_arena_write_data(
576
5.55k
      emit_context->arena,
577
5.55k
      YR_RE_CODE_SECTION,
578
5.55k
      &argument,
579
5.55k
      sizeof(uint16_t),
580
5.55k
      argument_ref));
581
582
5.55k
  return ERROR_SUCCESS;
583
5.55k
}
584
585
int _yr_emit_inst_arg_uint32(
586
    RE_EMIT_CONTEXT* emit_context,
587
    uint8_t opcode,
588
    uint32_t argument,
589
    YR_ARENA_REF* instruction_ref,
590
    YR_ARENA_REF* argument_ref)
591
0
{
592
0
  FAIL_ON_ERROR(yr_arena_write_data(
593
0
      emit_context->arena,
594
0
      YR_RE_CODE_SECTION,
595
0
      &opcode,
596
0
      sizeof(uint8_t),
597
0
      instruction_ref));
598
599
0
  FAIL_ON_ERROR(yr_arena_write_data(
600
0
      emit_context->arena,
601
0
      YR_RE_CODE_SECTION,
602
0
      &argument,
603
0
      sizeof(uint32_t),
604
0
      argument_ref));
605
606
0
  return ERROR_SUCCESS;
607
0
}
608
609
int _yr_emit_inst_arg_int16(
610
    RE_EMIT_CONTEXT* emit_context,
611
    uint8_t opcode,
612
    int16_t argument,
613
    YR_ARENA_REF* instruction_ref,
614
    YR_ARENA_REF* argument_ref)
615
28.3k
{
616
28.3k
  FAIL_ON_ERROR(yr_arena_write_data(
617
28.3k
      emit_context->arena,
618
28.3k
      YR_RE_CODE_SECTION,
619
28.3k
      &opcode,
620
28.3k
      sizeof(uint8_t),
621
28.3k
      instruction_ref));
622
623
28.3k
  FAIL_ON_ERROR(yr_arena_write_data(
624
28.3k
      emit_context->arena,
625
28.3k
      YR_RE_CODE_SECTION,
626
28.3k
      &argument,
627
28.3k
      sizeof(int16_t),
628
28.3k
      argument_ref));
629
630
28.3k
  return ERROR_SUCCESS;
631
28.3k
}
632
633
int _yr_emit_inst_arg_struct(
634
    RE_EMIT_CONTEXT* emit_context,
635
    uint8_t opcode,
636
    void* structure,
637
    size_t structure_size,
638
    YR_ARENA_REF* instruction_ref,
639
    YR_ARENA_REF* argument_ref)
640
65.5M
{
641
65.5M
  FAIL_ON_ERROR(yr_arena_write_data(
642
65.5M
      emit_context->arena,
643
65.5M
      YR_RE_CODE_SECTION,
644
65.5M
      &opcode,
645
65.5M
      sizeof(uint8_t),
646
65.5M
      instruction_ref));
647
648
65.5M
  FAIL_ON_ERROR(yr_arena_write_data(
649
65.5M
      emit_context->arena,
650
65.5M
      YR_RE_CODE_SECTION,
651
65.5M
      structure,
652
65.5M
      structure_size,
653
65.5M
      argument_ref));
654
655
65.5M
  return ERROR_SUCCESS;
656
65.5M
}
657
658
int _yr_emit_split(
659
    RE_EMIT_CONTEXT* emit_context,
660
    uint8_t opcode,
661
    int16_t argument,
662
    YR_ARENA_REF* instruction_ref,
663
    YR_ARENA_REF* argument_ref)
664
34.7k
{
665
34.7k
  assert(opcode == RE_OPCODE_SPLIT_A || opcode == RE_OPCODE_SPLIT_B);
666
667
34.7k
  if (emit_context->next_split_id == RE_MAX_SPLIT_ID)
668
88
    return ERROR_REGULAR_EXPRESSION_TOO_COMPLEX;
669
670
34.6k
  FAIL_ON_ERROR(yr_arena_write_data(
671
34.6k
      emit_context->arena,
672
34.6k
      YR_RE_CODE_SECTION,
673
34.6k
      &opcode,
674
34.6k
      sizeof(uint8_t),
675
34.6k
      instruction_ref));
676
677
34.6k
  FAIL_ON_ERROR(yr_arena_write_data(
678
34.6k
      emit_context->arena,
679
34.6k
      YR_RE_CODE_SECTION,
680
34.6k
      &emit_context->next_split_id,
681
34.6k
      sizeof(RE_SPLIT_ID_TYPE),
682
34.6k
      NULL));
683
684
34.6k
  emit_context->next_split_id++;
685
686
34.6k
  FAIL_ON_ERROR(yr_arena_write_data(
687
34.6k
      emit_context->arena,
688
34.6k
      YR_RE_CODE_SECTION,
689
34.6k
      &argument,
690
34.6k
      sizeof(int16_t),
691
34.6k
      argument_ref));
692
693
34.6k
  return ERROR_SUCCESS;
694
34.6k
}
695
696
static int _yr_re_emit(
697
    RE_EMIT_CONTEXT* emit_context,
698
    RE_NODE* re_node,
699
    int flags,
700
    YR_ARENA_REF* code_ref)
701
319M
{
702
319M
  int16_t jmp_offset;
703
704
319M
  yr_arena_off_t bookmark_1 = 0;
705
319M
  yr_arena_off_t bookmark_2 = 0;
706
319M
  yr_arena_off_t bookmark_3 = 0;
707
319M
  yr_arena_off_t bookmark_4 = 0;
708
709
319M
  bool emit_split;
710
319M
  bool emit_repeat;
711
319M
  bool emit_prolog;
712
319M
  bool emit_epilog;
713
714
319M
  RE_REPEAT_ARGS repeat_args;
715
319M
  RE_REPEAT_ARGS* repeat_start_args_addr;
716
319M
  RE_REPEAT_ANY_ARGS repeat_any_args;
717
718
319M
  RE_NODE* child;
719
720
319M
  int16_t* split_offset_addr = NULL;
721
319M
  int16_t* jmp_offset_addr = NULL;
722
723
319M
  YR_ARENA_REF instruction_ref = YR_ARENA_NULL_REF;
724
319M
  YR_ARENA_REF split_offset_ref;
725
319M
  YR_ARENA_REF jmp_instruction_ref;
726
319M
  YR_ARENA_REF jmp_offset_ref;
727
319M
  YR_ARENA_REF repeat_start_args_ref;
728
729
319M
  switch (re_node->type)
730
319M
  {
731
162M
  case RE_NODE_LITERAL:
732
162M
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint8(
733
162M
        emit_context,
734
162M
        RE_OPCODE_LITERAL,
735
162M
        re_node->value,
736
162M
        &instruction_ref,
737
162M
        NULL));
738
162M
    break;
739
740
492
  case RE_NODE_NOT_LITERAL:
741
492
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint8(
742
492
        emit_context,
743
492
        RE_OPCODE_NOT_LITERAL,
744
492
        re_node->value,
745
492
        &instruction_ref,
746
492
        NULL));
747
492
    break;
748
749
4.37k
  case RE_NODE_MASKED_LITERAL:
750
4.37k
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
751
4.37k
        emit_context,
752
4.37k
        RE_OPCODE_MASKED_LITERAL,
753
4.37k
        re_node->mask << 8 | re_node->value,
754
4.37k
        &instruction_ref,
755
4.37k
        NULL));
756
4.37k
    break;
757
758
1.17k
  case RE_NODE_MASKED_NOT_LITERAL:
759
1.17k
    FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
760
1.17k
        emit_context,
761
1.17k
        RE_OPCODE_MASKED_NOT_LITERAL,
762
1.17k
        re_node->mask << 8 | re_node->value,
763
1.17k
        &instruction_ref,
764
1.17k
        NULL));
765
1.17k
    break;
766
767
391
  case RE_NODE_WORD_CHAR:
768
391
    FAIL_ON_ERROR(
769
391
        _yr_emit_inst(emit_context, RE_OPCODE_WORD_CHAR, &instruction_ref));
770
391
    break;
771
772
130
  case RE_NODE_NON_WORD_CHAR:
773
130
    FAIL_ON_ERROR(
774
130
        _yr_emit_inst(emit_context, RE_OPCODE_NON_WORD_CHAR, &instruction_ref));
775
130
    break;
776
777
758
  case RE_NODE_WORD_BOUNDARY:
778
758
    FAIL_ON_ERROR(
779
758
        _yr_emit_inst(emit_context, RE_OPCODE_WORD_BOUNDARY, &instruction_ref));
780
758
    break;
781
782
256
  case RE_NODE_NON_WORD_BOUNDARY:
783
256
    FAIL_ON_ERROR(_yr_emit_inst(
784
256
        emit_context, RE_OPCODE_NON_WORD_BOUNDARY, &instruction_ref));
785
256
    break;
786
787
2.32k
  case RE_NODE_SPACE:
788
2.32k
    FAIL_ON_ERROR(
789
2.32k
        _yr_emit_inst(emit_context, RE_OPCODE_SPACE, &instruction_ref));
790
2.32k
    break;
791
792
270
  case RE_NODE_NON_SPACE:
793
270
    FAIL_ON_ERROR(
794
270
        _yr_emit_inst(emit_context, RE_OPCODE_NON_SPACE, &instruction_ref));
795
270
    break;
796
797
421
  case RE_NODE_DIGIT:
798
421
    FAIL_ON_ERROR(
799
421
        _yr_emit_inst(emit_context, RE_OPCODE_DIGIT, &instruction_ref));
800
421
    break;
801
802
716
  case RE_NODE_NON_DIGIT:
803
716
    FAIL_ON_ERROR(
804
716
        _yr_emit_inst(emit_context, RE_OPCODE_NON_DIGIT, &instruction_ref));
805
716
    break;
806
807
3.07M
  case RE_NODE_ANY:
808
3.07M
    FAIL_ON_ERROR(_yr_emit_inst(emit_context, RE_OPCODE_ANY, &instruction_ref));
809
3.07M
    break;
810
811
83.8M
  case RE_NODE_CLASS:
812
83.8M
    FAIL_ON_ERROR(
813
83.8M
        _yr_emit_inst(emit_context, RE_OPCODE_CLASS, &instruction_ref));
814
815
83.8M
    FAIL_ON_ERROR(yr_arena_write_data(
816
83.8M
        emit_context->arena,
817
83.8M
        YR_RE_CODE_SECTION,
818
83.8M
        re_node->re_class,
819
83.8M
        sizeof(*re_node->re_class),
820
83.8M
        NULL));
821
83.8M
    break;
822
823
327k
  case RE_NODE_ANCHOR_START:
824
327k
    FAIL_ON_ERROR(_yr_emit_inst(
825
327k
        emit_context, RE_OPCODE_MATCH_AT_START, &instruction_ref));
826
327k
    break;
827
828
255k
  case RE_NODE_ANCHOR_END:
829
255k
    FAIL_ON_ERROR(
830
255k
        _yr_emit_inst(emit_context, RE_OPCODE_MATCH_AT_END, &instruction_ref));
831
255k
    break;
832
833
36.9M
  case RE_NODE_CONCAT:
834
36.9M
    FAIL_ON_ERROR(_yr_re_emit(
835
36.9M
        emit_context,
836
36.9M
        (flags & EMIT_BACKWARDS) ? re_node->children_tail
837
36.9M
                                 : re_node->children_head,
838
36.9M
        flags,
839
36.9M
        &instruction_ref));
840
841
36.9M
    if (flags & EMIT_BACKWARDS)
842
279k
      child = re_node->children_tail->prev_sibling;
843
36.6M
    else
844
36.6M
      child = re_node->children_head->next_sibling;
845
846
220M
    while (child != NULL)
847
183M
    {
848
183M
      FAIL_ON_ERROR(_yr_re_emit(emit_context, child, flags, NULL));
849
850
183M
      child = (flags & EMIT_BACKWARDS) ? child->prev_sibling
851
183M
                                       : child->next_sibling;
852
183M
    }
853
36.9M
    break;
854
855
36.9M
  case RE_NODE_PLUS:
856
    // Code for e+ looks like:
857
    //
858
    //          L1: code for e
859
    //              split L1, L2
860
    //          L2:
861
    //
862
2.78k
    FAIL_ON_ERROR(_yr_re_emit(
863
2.78k
        emit_context, re_node->children_head, flags, &instruction_ref));
864
865
2.68k
    bookmark_1 = yr_arena_get_current_offset(
866
2.68k
        emit_context->arena, YR_RE_CODE_SECTION);
867
868
2.68k
    if (instruction_ref.offset - bookmark_1 < INT16_MIN)
869
14
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
870
871
2.67k
    jmp_offset = (int16_t) (instruction_ref.offset - bookmark_1);
872
873
2.67k
    FAIL_ON_ERROR(_yr_emit_split(
874
2.67k
        emit_context,
875
2.67k
        re_node->greedy ? RE_OPCODE_SPLIT_B : RE_OPCODE_SPLIT_A,
876
2.67k
        jmp_offset,
877
2.67k
        NULL,
878
2.67k
        NULL));
879
880
2.64k
    break;
881
882
3.12k
  case RE_NODE_STAR:
883
    // Code for e* looks like:
884
    //
885
    //          L1: split L1, L2
886
    //              code for e
887
    //              jmp L1
888
    //          L2:
889
3.12k
    FAIL_ON_ERROR(_yr_emit_split(
890
3.12k
        emit_context,
891
3.12k
        re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
892
3.12k
        0,
893
3.12k
        &instruction_ref,
894
3.12k
        &split_offset_ref));
895
896
3.11k
    FAIL_ON_ERROR(
897
3.11k
        _yr_re_emit(emit_context, re_node->children_head, flags, NULL));
898
899
2.94k
    bookmark_1 = yr_arena_get_current_offset(
900
2.94k
        emit_context->arena, YR_RE_CODE_SECTION);
901
902
2.94k
    if (instruction_ref.offset - bookmark_1 < INT16_MIN)
903
12
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
904
905
2.93k
    jmp_offset = (int16_t) (instruction_ref.offset - bookmark_1);
906
907
    // Emit jump with offset set to 0.
908
909
2.93k
    FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
910
2.93k
        emit_context, RE_OPCODE_JUMP, jmp_offset, NULL, NULL));
911
912
2.93k
    bookmark_1 = yr_arena_get_current_offset(
913
2.93k
        emit_context->arena, YR_RE_CODE_SECTION);
914
915
2.93k
    if (bookmark_1 - instruction_ref.offset > INT16_MAX)
916
0
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
917
918
2.93k
    jmp_offset = (int16_t) (bookmark_1 - instruction_ref.offset);
919
920
    // Update split offset.
921
2.93k
    split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
922
2.93k
        emit_context->arena, &split_offset_ref);
923
924
2.93k
    memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset));
925
2.93k
    break;
926
927
26.4k
  case RE_NODE_ALT:
928
    // Code for e1|e2 looks like:
929
    //
930
    //              split L1, L2
931
    //          L1: code for e1
932
    //              jmp L3
933
    //          L2: code for e2
934
    //          L3:
935
936
    // Emit a split instruction with offset set to 0 temporarily. Offset
937
    // will be updated after we know the size of the code generated for
938
    // the left node (e1).
939
940
26.4k
    FAIL_ON_ERROR(_yr_emit_split(
941
26.4k
        emit_context,
942
26.4k
        RE_OPCODE_SPLIT_A,
943
26.4k
        0,
944
26.4k
        &instruction_ref,
945
26.4k
        &split_offset_ref));
946
947
26.4k
    FAIL_ON_ERROR(
948
26.4k
        _yr_re_emit(emit_context, re_node->children_head, flags, NULL));
949
950
    // Emit jump with offset set to 0.
951
952
25.3k
    FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
953
25.3k
        emit_context,
954
25.3k
        RE_OPCODE_JUMP,
955
25.3k
        0,
956
25.3k
        &jmp_instruction_ref,
957
25.3k
        &jmp_offset_ref));
958
959
25.3k
    bookmark_1 = yr_arena_get_current_offset(
960
25.3k
        emit_context->arena, YR_RE_CODE_SECTION);
961
962
25.3k
    if (bookmark_1 - instruction_ref.offset > INT16_MAX)
963
22
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
964
965
25.3k
    jmp_offset = (int16_t) (bookmark_1 - instruction_ref.offset);
966
967
    // Update split offset.
968
25.3k
    split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
969
25.3k
        emit_context->arena, &split_offset_ref);
970
971
25.3k
    memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset));
972
973
25.3k
    FAIL_ON_ERROR(
974
25.3k
        _yr_re_emit(emit_context, re_node->children_tail, flags, NULL));
975
976
24.9k
    bookmark_1 = yr_arena_get_current_offset(
977
24.9k
        emit_context->arena, YR_RE_CODE_SECTION);
978
979
24.9k
    if (bookmark_1 - jmp_instruction_ref.offset > INT16_MAX)
980
19
      return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
981
982
24.9k
    jmp_offset = (int16_t) (bookmark_1 - jmp_instruction_ref.offset);
983
984
    // Update offset for jmp instruction.
985
24.9k
    jmp_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
986
24.9k
        emit_context->arena, &jmp_offset_ref);
987
988
24.9k
    memcpy(jmp_offset_addr, &jmp_offset, sizeof(jmp_offset));
989
24.9k
    break;
990
991
4.11k
  case RE_NODE_RANGE_ANY:
992
4.11k
    repeat_any_args.min = re_node->start;
993
4.11k
    repeat_any_args.max = re_node->end;
994
995
4.11k
    FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
996
4.11k
        emit_context,
997
4.11k
        re_node->greedy ? RE_OPCODE_REPEAT_ANY_GREEDY
998
4.11k
                        : RE_OPCODE_REPEAT_ANY_UNGREEDY,
999
4.11k
        &repeat_any_args,
1000
4.11k
        sizeof(repeat_any_args),
1001
4.11k
        &instruction_ref,
1002
4.11k
        NULL));
1003
1004
4.11k
    break;
1005
1006
32.9M
  case RE_NODE_RANGE:
1007
    // Code for e{n,m} looks like:
1008
    //
1009
    //            code for e              ---   prolog
1010
    //            repeat_start n, m, L1   --+
1011
    //        L0: code for e                |   repeat
1012
    //            repeat_end n, m, L0     --+
1013
    //        L1: split L2, L3            ---   split
1014
    //        L2: code for e              ---   epilog
1015
    //        L3:
1016
    //
1017
    // Not all sections (prolog, repeat, split and epilog) are generated in all
1018
    // cases, it depends on the values of n and m. The following table shows
1019
    // which sections are generated for the first few values of n and m.
1020
    //
1021
    //        n,m   prolog  repeat      split  epilog
1022
    //                      (min,max)
1023
    //        ---------------------------------------
1024
    //        0,0     -       -           -      -
1025
    //        0,1     -       -           X      X
1026
    //        0,2     -       0,1         X      X
1027
    //        0,3     -       0,2         X      X
1028
    //        0,M     -       0,M-1       X      X
1029
    //
1030
    //        1,1     X       -           -      -
1031
    //        1,2     X       -           X      X
1032
    //        1,3     X       0,1         X      X
1033
    //        1,4     X       1,2         X      X
1034
    //        1,M     X       1,M-2       X      X
1035
    //
1036
    //        2,2     X       -           -      X
1037
    //        2,3     X       1,1         X      X
1038
    //        2,4     X       1,2         X      X
1039
    //        2,M     X       1,M-2       X      X
1040
    //
1041
    //        3,3     X       1,1         -      X
1042
    //        3,4     X       2,2         X      X
1043
    //        3,M     X       2,M-2       X      X
1044
    //
1045
    //        4,4     X       2,2         -      X
1046
    //        4,5     X       3,3         X      X
1047
    //        4,M     X       3,M-2       X      X
1048
    //
1049
    // The code can't consists simply in the repeat section, the prolog and
1050
    // epilog are required because we can't have atoms pointing to code inside
1051
    // the repeat loop. Atoms' forwards_code will point to code in the prolog
1052
    // and backwards_code will point to code in the epilog (or in prolog if
1053
    // epilog wasn't generated, like in n=1,m=1)
1054
1055
32.9M
    emit_prolog = re_node->start > 0;
1056
32.9M
    emit_repeat = re_node->end > re_node->start + 1 || re_node->end > 2;
1057
32.9M
    emit_split = re_node->end > re_node->start;
1058
32.9M
    emit_epilog = re_node->end > re_node->start || re_node->end > 1;
1059
1060
32.9M
    if (emit_prolog)
1061
32.9M
    {
1062
32.9M
      FAIL_ON_ERROR(_yr_re_emit(
1063
32.9M
          emit_context, re_node->children_head, flags, &instruction_ref));
1064
32.9M
    }
1065
1066
32.9M
    if (emit_repeat)
1067
32.7M
    {
1068
32.7M
      repeat_args.min = re_node->start;
1069
32.7M
      repeat_args.max = re_node->end;
1070
1071
32.7M
      if (emit_prolog)
1072
32.7M
      {
1073
32.7M
        repeat_args.max--;
1074
32.7M
        repeat_args.min--;
1075
32.7M
      }
1076
1077
32.7M
      if (emit_split)
1078
610
      {
1079
610
        repeat_args.max--;
1080
610
      }
1081
32.7M
      else
1082
32.7M
      {
1083
32.7M
        repeat_args.min--;
1084
32.7M
        repeat_args.max--;
1085
32.7M
      }
1086
1087
32.7M
      repeat_args.offset = 0;
1088
1089
32.7M
      bookmark_1 = yr_arena_get_current_offset(
1090
32.7M
          emit_context->arena, YR_RE_CODE_SECTION);
1091
1092
32.7M
      FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
1093
32.7M
          emit_context,
1094
32.7M
          re_node->greedy ? RE_OPCODE_REPEAT_START_GREEDY
1095
32.7M
                          : RE_OPCODE_REPEAT_START_UNGREEDY,
1096
32.7M
          &repeat_args,
1097
32.7M
          sizeof(repeat_args),
1098
32.7M
          emit_prolog ? NULL : &instruction_ref,
1099
32.7M
          &repeat_start_args_ref));
1100
1101
32.7M
      bookmark_2 = yr_arena_get_current_offset(
1102
32.7M
          emit_context->arena, YR_RE_CODE_SECTION);
1103
1104
32.7M
      FAIL_ON_ERROR(_yr_re_emit(
1105
32.7M
          emit_context,
1106
32.7M
          re_node->children_head,
1107
32.7M
          flags | EMIT_DONT_SET_FORWARDS_CODE | EMIT_DONT_SET_BACKWARDS_CODE,
1108
32.7M
          NULL));
1109
1110
32.7M
      bookmark_3 = yr_arena_get_current_offset(
1111
32.7M
          emit_context->arena, YR_RE_CODE_SECTION);
1112
1113
32.7M
      if (bookmark_2 - bookmark_3 < INT32_MIN)
1114
8
        return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1115
1116
32.7M
      repeat_args.offset = (int32_t) (bookmark_2 - bookmark_3);
1117
1118
32.7M
      FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
1119
32.7M
          emit_context,
1120
32.7M
          re_node->greedy ? RE_OPCODE_REPEAT_END_GREEDY
1121
32.7M
                          : RE_OPCODE_REPEAT_END_UNGREEDY,
1122
32.7M
          &repeat_args,
1123
32.7M
          sizeof(repeat_args),
1124
32.7M
          NULL,
1125
32.7M
          NULL));
1126
1127
32.7M
      bookmark_4 = yr_arena_get_current_offset(
1128
32.7M
          emit_context->arena, YR_RE_CODE_SECTION);
1129
1130
32.7M
      repeat_start_args_addr = (RE_REPEAT_ARGS*) yr_arena_ref_to_ptr(
1131
32.7M
          emit_context->arena, &repeat_start_args_ref);
1132
1133
32.7M
      if (bookmark_4 - bookmark_1 > INT32_MAX)
1134
0
        return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1135
1136
32.7M
      repeat_start_args_addr->offset = (int32_t) (bookmark_4 - bookmark_1);
1137
32.7M
    }
1138
1139
32.9M
    if (emit_split)
1140
2.42k
    {
1141
2.42k
      bookmark_1 = yr_arena_get_current_offset(
1142
2.42k
          emit_context->arena, YR_RE_CODE_SECTION);
1143
1144
2.42k
      FAIL_ON_ERROR(_yr_emit_split(
1145
2.42k
          emit_context,
1146
2.42k
          re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
1147
2.42k
          0,
1148
2.42k
          NULL,
1149
2.42k
          &split_offset_ref));
1150
2.41k
    }
1151
1152
32.9M
    if (emit_epilog)
1153
32.9M
    {
1154
32.9M
      FAIL_ON_ERROR(_yr_re_emit(
1155
32.9M
          emit_context,
1156
32.9M
          re_node->children_head,
1157
32.9M
          emit_prolog ? flags | EMIT_DONT_SET_FORWARDS_CODE : flags,
1158
32.9M
          emit_prolog || emit_repeat ? NULL : &instruction_ref));
1159
32.9M
    }
1160
1161
32.9M
    if (emit_split)
1162
2.37k
    {
1163
2.37k
      bookmark_2 = yr_arena_get_current_offset(
1164
2.37k
          emit_context->arena, YR_RE_CODE_SECTION);
1165
1166
2.37k
      if (bookmark_2 - bookmark_1 > INT16_MAX)
1167
8
        return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1168
1169
2.36k
      split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
1170
2.36k
          emit_context->arena, &split_offset_ref);
1171
1172
2.36k
      jmp_offset = (int16_t) (bookmark_2 - bookmark_1);
1173
1174
2.36k
      memcpy(split_offset_addr, &jmp_offset, sizeof(jmp_offset));
1175
2.36k
    }
1176
1177
32.9M
    break;
1178
319M
  }
1179
1180
319M
  if (flags & EMIT_BACKWARDS)
1181
6.50M
  {
1182
6.50M
    if (!(flags & EMIT_DONT_SET_BACKWARDS_CODE))
1183
1.29M
    {
1184
1.29M
      re_node->backward_code_ref.buffer_id = YR_RE_CODE_SECTION;
1185
1.29M
      re_node->backward_code_ref.offset = yr_arena_get_current_offset(
1186
1.29M
          emit_context->arena, YR_RE_CODE_SECTION);
1187
1.29M
    }
1188
6.50M
  }
1189
313M
  else
1190
313M
  {
1191
313M
    if (!(flags & EMIT_DONT_SET_FORWARDS_CODE))
1192
1.35M
    {
1193
1.35M
      re_node->forward_code_ref = instruction_ref;
1194
1.35M
    }
1195
313M
  }
1196
1197
319M
  if (code_ref != NULL)
1198
69.8M
    *code_ref = instruction_ref;
1199
1200
319M
  return ERROR_SUCCESS;
1201
319M
}
1202
1203
int yr_re_ast_emit_code(RE_AST* re_ast, YR_ARENA* arena, int backwards_code)
1204
15.7k
{
1205
15.7k
  RE_EMIT_CONTEXT emit_context;
1206
1207
  // Emit code for matching the regular expressions forwards.
1208
15.7k
  emit_context.arena = arena;
1209
15.7k
  emit_context.next_split_id = 0;
1210
1211
15.7k
  FAIL_ON_ERROR(_yr_re_emit(
1212
15.7k
      &emit_context,
1213
15.7k
      re_ast->root_node,
1214
15.7k
      backwards_code ? EMIT_BACKWARDS : 0,
1215
15.7k
      NULL));
1216
1217
15.5k
  FAIL_ON_ERROR(_yr_emit_inst(&emit_context, RE_OPCODE_MATCH, NULL));
1218
1219
15.5k
  return ERROR_SUCCESS;
1220
15.5k
}
1221
1222
static int _yr_re_fiber_create(RE_FIBER_POOL* fiber_pool, RE_FIBER** new_fiber)
1223
0
{
1224
0
  RE_FIBER* fiber;
1225
1226
0
  if (fiber_pool->fibers.head != NULL)
1227
0
  {
1228
0
    fiber = fiber_pool->fibers.head;
1229
0
    fiber_pool->fibers.head = fiber->next;
1230
1231
0
    if (fiber_pool->fibers.tail == fiber)
1232
0
      fiber_pool->fibers.tail = NULL;
1233
0
  }
1234
0
  else
1235
0
  {
1236
0
    if (fiber_pool->fiber_count == RE_MAX_FIBERS)
1237
0
      return ERROR_TOO_MANY_RE_FIBERS;
1238
1239
0
    fiber = (RE_FIBER*) yr_malloc(sizeof(RE_FIBER));
1240
1241
0
    if (fiber == NULL)
1242
0
      return ERROR_INSUFFICIENT_MEMORY;
1243
1244
0
    fiber_pool->fiber_count++;
1245
0
  }
1246
1247
0
  fiber->ip = NULL;
1248
0
  fiber->sp = -1;
1249
0
  fiber->rc = -1;
1250
0
  fiber->next = NULL;
1251
0
  fiber->prev = NULL;
1252
1253
0
  *new_fiber = fiber;
1254
1255
0
  return ERROR_SUCCESS;
1256
0
}
1257
1258
////////////////////////////////////////////////////////////////////////////////
1259
// Appends 'fiber' to 'fiber_list'
1260
//
1261
static void _yr_re_fiber_append(RE_FIBER_LIST* fiber_list, RE_FIBER* fiber)
1262
0
{
1263
0
  assert(fiber->prev == NULL);
1264
0
  assert(fiber->next == NULL);
1265
1266
0
  fiber->prev = fiber_list->tail;
1267
1268
0
  if (fiber_list->tail != NULL)
1269
0
    fiber_list->tail->next = fiber;
1270
1271
0
  fiber_list->tail = fiber;
1272
1273
0
  if (fiber_list->head == NULL)
1274
0
    fiber_list->head = fiber;
1275
1276
0
  assert(fiber_list->tail->next == NULL);
1277
0
  assert(fiber_list->head->prev == NULL);
1278
0
}
1279
1280
////////////////////////////////////////////////////////////////////////////////
1281
// Verifies if a fiber with the same properties (ip, rc, sp, and stack values)
1282
// than 'target_fiber' exists in 'fiber_list'. The list is iterated from
1283
// the start until 'last_fiber' (inclusive). Fibers past 'last_fiber' are not
1284
// taken into account.
1285
//
1286
static int _yr_re_fiber_exists(
1287
    RE_FIBER_LIST* fiber_list,
1288
    RE_FIBER* target_fiber,
1289
    RE_FIBER* last_fiber)
1290
0
{
1291
0
  RE_FIBER* fiber = fiber_list->head;
1292
1293
0
  int equal_stacks;
1294
0
  int i;
1295
1296
0
  if (last_fiber == NULL)
1297
0
    return false;
1298
1299
0
  while (fiber != last_fiber->next)
1300
0
  {
1301
0
    if (fiber->ip == target_fiber->ip && fiber->sp == target_fiber->sp &&
1302
0
        fiber->rc == target_fiber->rc)
1303
0
    {
1304
0
      equal_stacks = true;
1305
1306
0
      for (i = 0; i <= fiber->sp; i++)
1307
0
      {
1308
0
        if (fiber->stack[i] != target_fiber->stack[i])
1309
0
        {
1310
0
          equal_stacks = false;
1311
0
          break;
1312
0
        }
1313
0
      }
1314
1315
0
      if (equal_stacks)
1316
0
        return true;
1317
0
    }
1318
1319
0
    fiber = fiber->next;
1320
0
  }
1321
1322
0
  return false;
1323
0
}
1324
1325
////////////////////////////////////////////////////////////////////////////////
1326
// Clones a fiber in fiber_list and inserts the cloned fiber just after.
1327
// the original one. If fiber_list is:
1328
//
1329
//   f1 -> f2 -> f3 -> f4
1330
//
1331
// Splitting f2 will result in:
1332
//
1333
//   f1 -> f2 -> cloned f2 -> f3 -> f4
1334
//
1335
static int _yr_re_fiber_split(
1336
    RE_FIBER_LIST* fiber_list,
1337
    RE_FIBER_POOL* fiber_pool,
1338
    RE_FIBER* fiber,
1339
    RE_FIBER** new_fiber)
1340
0
{
1341
0
  int32_t i;
1342
1343
0
  FAIL_ON_ERROR(_yr_re_fiber_create(fiber_pool, new_fiber));
1344
1345
0
  (*new_fiber)->sp = fiber->sp;
1346
0
  (*new_fiber)->ip = fiber->ip;
1347
0
  (*new_fiber)->rc = fiber->rc;
1348
1349
0
  for (i = 0; i <= fiber->sp; i++) (*new_fiber)->stack[i] = fiber->stack[i];
1350
1351
0
  (*new_fiber)->next = fiber->next;
1352
0
  (*new_fiber)->prev = fiber;
1353
1354
0
  if (fiber->next != NULL)
1355
0
    fiber->next->prev = *new_fiber;
1356
1357
0
  fiber->next = *new_fiber;
1358
1359
0
  if (fiber_list->tail == fiber)
1360
0
    fiber_list->tail = *new_fiber;
1361
1362
0
  assert(fiber_list->tail->next == NULL);
1363
0
  assert(fiber_list->head->prev == NULL);
1364
1365
0
  return ERROR_SUCCESS;
1366
0
}
1367
1368
////////////////////////////////////////////////////////////////////////////////
1369
// Kills a given fiber by removing it from the fiber list and putting it in the
1370
// fiber pool.
1371
//
1372
static RE_FIBER* _yr_re_fiber_kill(
1373
    RE_FIBER_LIST* fiber_list,
1374
    RE_FIBER_POOL* fiber_pool,
1375
    RE_FIBER* fiber)
1376
0
{
1377
0
  RE_FIBER* next_fiber = fiber->next;
1378
1379
0
  if (fiber->prev != NULL)
1380
0
    fiber->prev->next = next_fiber;
1381
1382
0
  if (next_fiber != NULL)
1383
0
    next_fiber->prev = fiber->prev;
1384
1385
0
  if (fiber_pool->fibers.tail != NULL)
1386
0
    fiber_pool->fibers.tail->next = fiber;
1387
1388
0
  if (fiber_list->tail == fiber)
1389
0
    fiber_list->tail = fiber->prev;
1390
1391
0
  if (fiber_list->head == fiber)
1392
0
    fiber_list->head = next_fiber;
1393
1394
0
  fiber->next = NULL;
1395
0
  fiber->prev = fiber_pool->fibers.tail;
1396
0
  fiber_pool->fibers.tail = fiber;
1397
1398
0
  if (fiber_pool->fibers.head == NULL)
1399
0
    fiber_pool->fibers.head = fiber;
1400
1401
0
  return next_fiber;
1402
0
}
1403
1404
////////////////////////////////////////////////////////////////////////////////
1405
// Kills all fibers from the given one up to the end of the fiber list.
1406
//
1407
static void _yr_re_fiber_kill_tail(
1408
    RE_FIBER_LIST* fiber_list,
1409
    RE_FIBER_POOL* fiber_pool,
1410
    RE_FIBER* fiber)
1411
0
{
1412
0
  RE_FIBER* prev_fiber = fiber->prev;
1413
1414
0
  if (prev_fiber != NULL)
1415
0
    prev_fiber->next = NULL;
1416
1417
0
  fiber->prev = fiber_pool->fibers.tail;
1418
1419
0
  if (fiber_pool->fibers.tail != NULL)
1420
0
    fiber_pool->fibers.tail->next = fiber;
1421
1422
0
  fiber_pool->fibers.tail = fiber_list->tail;
1423
0
  fiber_list->tail = prev_fiber;
1424
1425
0
  if (fiber_list->head == fiber)
1426
0
    fiber_list->head = NULL;
1427
1428
0
  if (fiber_pool->fibers.head == NULL)
1429
0
    fiber_pool->fibers.head = fiber;
1430
0
}
1431
1432
////////////////////////////////////////////////////////////////////////////////
1433
// Kills all fibers in the fiber list.
1434
//
1435
static void _yr_re_fiber_kill_all(
1436
    RE_FIBER_LIST* fiber_list,
1437
    RE_FIBER_POOL* fiber_pool)
1438
0
{
1439
0
  if (fiber_list->head != NULL)
1440
0
    _yr_re_fiber_kill_tail(fiber_list, fiber_pool, fiber_list->head);
1441
0
}
1442
1443
////////////////////////////////////////////////////////////////////////////////
1444
// Executes a fiber until reaching an "matching" instruction. A "matching"
1445
// instruction is one that actually reads a byte from the input and performs
1446
// some matching. If the fiber reaches a split instruction, the new fiber is
1447
// also synced.
1448
//
1449
static int _yr_re_fiber_sync(
1450
    RE_FIBER_LIST* fiber_list,
1451
    RE_FIBER_POOL* fiber_pool,
1452
    RE_FIBER* fiber_to_sync)
1453
0
{
1454
  // A array for keeping track of which split instructions has been already
1455
  // executed. Each split instruction within a regexp has an associated ID
1456
  // between 0 and RE_MAX_SPLIT_ID-1. Keeping track of executed splits is
1457
  // required to avoid infinite loops in regexps like (a*)* or (a|)*
1458
1459
0
  RE_SPLIT_ID_TYPE splits_executed[RE_MAX_SPLIT_ID];
1460
0
  RE_SPLIT_ID_TYPE splits_executed_count = 0;
1461
0
  RE_SPLIT_ID_TYPE split_id, splits_executed_idx;
1462
1463
0
  int split_already_executed;
1464
1465
0
  RE_REPEAT_ARGS* repeat_args;
1466
0
  RE_REPEAT_ANY_ARGS* repeat_any_args;
1467
1468
0
  RE_FIBER* fiber;
1469
0
  RE_FIBER* last;
1470
0
  RE_FIBER* next;
1471
0
  RE_FIBER* branch_a;
1472
0
  RE_FIBER* branch_b;
1473
1474
0
  fiber = fiber_to_sync;
1475
0
  last = fiber_to_sync->next;
1476
1477
0
  while (fiber != last)
1478
0
  {
1479
0
    int16_t jmp_len;
1480
0
    uint8_t opcode = *fiber->ip;
1481
1482
0
    switch (opcode)
1483
0
    {
1484
0
    case RE_OPCODE_SPLIT_A:
1485
0
    case RE_OPCODE_SPLIT_B:
1486
1487
0
      split_id = *(RE_SPLIT_ID_TYPE*) (fiber->ip + 1);
1488
0
      split_already_executed = false;
1489
1490
0
      for (splits_executed_idx = 0; splits_executed_idx < splits_executed_count;
1491
0
           splits_executed_idx++)
1492
0
      {
1493
0
        if (split_id == splits_executed[splits_executed_idx])
1494
0
        {
1495
0
          split_already_executed = true;
1496
0
          break;
1497
0
        }
1498
0
      }
1499
1500
0
      if (split_already_executed)
1501
0
      {
1502
0
        fiber = _yr_re_fiber_kill(fiber_list, fiber_pool, fiber);
1503
0
      }
1504
0
      else
1505
0
      {
1506
0
        branch_a = fiber;
1507
1508
0
        FAIL_ON_ERROR(
1509
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1510
1511
        // With RE_OPCODE_SPLIT_A the current fiber continues at the next
1512
        // instruction in the stream (branch A), while the newly created
1513
        // fiber starts at the address indicated by the instruction (branch B)
1514
        // RE_OPCODE_SPLIT_B has the opposite behavior.
1515
1516
0
        if (opcode == RE_OPCODE_SPLIT_B)
1517
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1518
1519
        // Branch A continues at the next instruction
1520
0
        branch_a->ip += (sizeof(RE_SPLIT_ID_TYPE) + 3);
1521
1522
        // Branch B adds the offset indicated by the split instruction.
1523
0
        jmp_len = yr_unaligned_i16(branch_b->ip + 1 + sizeof(RE_SPLIT_ID_TYPE));
1524
1525
0
        branch_b->ip += jmp_len;
1526
1527
#ifdef YR_PARANOID_MODE
1528
        // In normal conditions this should never happen. But with compiled
1529
        // rules that has been hand-crafted by a malicious actor this could
1530
        // happen.
1531
        if (splits_executed_count >= RE_MAX_SPLIT_ID)
1532
          return ERROR_INTERNAL_FATAL_ERROR;
1533
#endif
1534
1535
0
        splits_executed[splits_executed_count] = split_id;
1536
0
        splits_executed_count++;
1537
0
      }
1538
1539
0
      break;
1540
1541
0
    case RE_OPCODE_REPEAT_START_GREEDY:
1542
0
    case RE_OPCODE_REPEAT_START_UNGREEDY:
1543
1544
0
      repeat_args = (RE_REPEAT_ARGS*) (fiber->ip + 1);
1545
0
      assert(repeat_args->max > 0);
1546
0
      branch_a = fiber;
1547
1548
0
      if (repeat_args->min == 0)
1549
0
      {
1550
0
        FAIL_ON_ERROR(
1551
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1552
1553
0
        if (opcode == RE_OPCODE_REPEAT_START_UNGREEDY)
1554
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1555
1556
0
        branch_b->ip += repeat_args->offset;
1557
0
      }
1558
1559
0
      branch_a->stack[++branch_a->sp] = 0;
1560
0
      branch_a->ip += (1 + sizeof(RE_REPEAT_ARGS));
1561
0
      break;
1562
1563
0
    case RE_OPCODE_REPEAT_END_GREEDY:
1564
0
    case RE_OPCODE_REPEAT_END_UNGREEDY:
1565
1566
0
      repeat_args = (RE_REPEAT_ARGS*) (fiber->ip + 1);
1567
0
      fiber->stack[fiber->sp]++;
1568
1569
0
      if (fiber->stack[fiber->sp] < repeat_args->min)
1570
0
      {
1571
0
        fiber->ip += repeat_args->offset;
1572
0
        break;
1573
0
      }
1574
1575
0
      branch_a = fiber;
1576
1577
0
      if (fiber->stack[fiber->sp] < repeat_args->max)
1578
0
      {
1579
0
        FAIL_ON_ERROR(
1580
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1581
1582
0
        if (opcode == RE_OPCODE_REPEAT_END_GREEDY)
1583
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1584
1585
0
        branch_b->ip += repeat_args->offset;
1586
0
      }
1587
1588
0
      branch_a->sp--;
1589
0
      branch_a->ip += (1 + sizeof(RE_REPEAT_ARGS));
1590
0
      break;
1591
1592
0
    case RE_OPCODE_REPEAT_ANY_GREEDY:
1593
0
    case RE_OPCODE_REPEAT_ANY_UNGREEDY:
1594
1595
0
      repeat_any_args = (RE_REPEAT_ANY_ARGS*) (fiber->ip + 1);
1596
1597
      // If repetition counter (rc) is -1 it means that we are reaching this
1598
      // instruction from the previous one in the instructions stream. In
1599
      // this case let's initialize the counter to 0 and start looping.
1600
1601
0
      if (fiber->rc == -1)
1602
0
        fiber->rc = 0;
1603
1604
0
      if (fiber->rc < repeat_any_args->min)
1605
0
      {
1606
        // Increase repetition counter and continue with next fiber. The
1607
        // instruction pointer for this fiber is not incremented yet, this
1608
        // fiber spins in this same instruction until reaching the minimum
1609
        // number of repetitions.
1610
1611
0
        fiber->rc++;
1612
0
        fiber = fiber->next;
1613
0
      }
1614
0
      else if (fiber->rc < repeat_any_args->max)
1615
0
      {
1616
        // Once the minimum number of repetitions are matched one fiber
1617
        // remains spinning in this instruction until reaching the maximum
1618
        // number of repetitions while new fibers are created. New fibers
1619
        // start executing at the next instruction.
1620
1621
0
        next = fiber->next;
1622
0
        branch_a = fiber;
1623
1624
0
        FAIL_ON_ERROR(
1625
0
            _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1626
1627
0
        if (opcode == RE_OPCODE_REPEAT_ANY_UNGREEDY)
1628
0
          yr_swap(branch_a, branch_b, RE_FIBER*);
1629
1630
0
        branch_a->rc++;
1631
0
        branch_b->ip += (1 + sizeof(RE_REPEAT_ANY_ARGS));
1632
0
        branch_b->rc = -1;
1633
1634
0
        FAIL_ON_ERROR(_yr_re_fiber_sync(fiber_list, fiber_pool, branch_b));
1635
1636
0
        fiber = next;
1637
0
      }
1638
0
      else
1639
0
      {
1640
        // When the maximum number of repetitions is reached the fiber keeps
1641
        // executing at the next instruction. The repetition counter is set
1642
        // to -1 indicating that we are not spinning in a repeat instruction
1643
        // anymore.
1644
1645
0
        fiber->ip += (1 + sizeof(RE_REPEAT_ANY_ARGS));
1646
0
        fiber->rc = -1;
1647
0
      }
1648
1649
0
      break;
1650
1651
0
    case RE_OPCODE_JUMP:
1652
0
      jmp_len = yr_unaligned_i16(fiber->ip + 1);
1653
0
      fiber->ip += jmp_len;
1654
0
      break;
1655
1656
0
    default:
1657
0
      fiber = fiber->next;
1658
0
    }
1659
0
  }
1660
1661
0
  return ERROR_SUCCESS;
1662
0
}
1663
1664
////////////////////////////////////////////////////////////////////////////////
1665
// Executes a regular expression. The specified regular expression will try to
1666
// match the data starting at the address specified by "input". The "input"
1667
// pointer can point to any address inside a memory buffer. Arguments
1668
// "input_forwards_size" and "input_backwards_size" indicate how many bytes
1669
// can be accessible starting at "input" and going forwards and backwards
1670
// respectively.
1671
//
1672
//   <--- input_backwards_size -->|<----------- input_forwards_size -------->
1673
//  |--------  memory buffer  -----------------------------------------------|
1674
//                                ^
1675
//                              input
1676
//
1677
// Args:
1678
//   YR_SCAN_CONTEXT *context         - Scan context.
1679
//   const uint8_t* code              - Regexp code be executed
1680
//   const uint8_t* input             - Pointer to input data
1681
//   size_t input_forwards_size       - Number of accessible bytes starting at
1682
//                                      "input" and going forwards.
1683
//   size_t input_backwards_size      - Number of accessible bytes starting at
1684
//                                      "input" and going backwards
1685
//   int flags                        - Flags:
1686
//      RE_FLAGS_SCAN
1687
//      RE_FLAGS_BACKWARDS
1688
//      RE_FLAGS_EXHAUSTIVE
1689
//      RE_FLAGS_WIDE
1690
//      RE_FLAGS_NO_CASE
1691
//      RE_FLAGS_DOT_ALL
1692
//   RE_MATCH_CALLBACK_FUNC callback  - Callback function
1693
//   void* callback_args              - Callback argument
1694
//   int*  matches                    - Pointer to an integer receiving the
1695
//                                      number of matching bytes. Notice that
1696
//                                      0 means a zero-length match, while no
1697
//                                      matches is -1.
1698
// Returns:
1699
//    ERROR_SUCCESS or any other error code.
1700
1701
int yr_re_exec(
1702
    YR_SCAN_CONTEXT* context,
1703
    const uint8_t* code,
1704
    const uint8_t* input_data,
1705
    size_t input_forwards_size,
1706
    size_t input_backwards_size,
1707
    int flags,
1708
    RE_MATCH_CALLBACK_FUNC callback,
1709
    void* callback_args,
1710
    int* matches)
1711
0
{
1712
0
  const uint8_t* input;
1713
0
  const uint8_t* ip;
1714
1715
0
  uint16_t opcode_args;
1716
0
  uint8_t mask;
1717
0
  uint8_t value;
1718
0
  uint8_t character_size;
1719
1720
0
  RE_FIBER_LIST fibers;
1721
0
  RE_FIBER* fiber;
1722
0
  RE_FIBER* next_fiber;
1723
1724
0
  int bytes_matched;
1725
0
  int max_bytes_matched;
1726
1727
0
  int match;
1728
0
  int input_incr;
1729
0
  int kill;
1730
0
  int action;
1731
1732
0
  bool prev_is_word_char = false;
1733
0
  bool input_is_word_char = false;
1734
1735
0
#define ACTION_NONE      0
1736
0
#define ACTION_CONTINUE  1
1737
0
#define ACTION_KILL      2
1738
0
#define ACTION_KILL_TAIL 3
1739
1740
0
#define prolog                                      \
1741
0
  {                                                 \
1742
0
    if ((bytes_matched >= max_bytes_matched) ||     \
1743
0
        (character_size == 2 && *(input + 1) != 0)) \
1744
0
    {                                               \
1745
0
      action = ACTION_KILL;                         \
1746
0
      break;                                        \
1747
0
    }                                               \
1748
0
  }
1749
1750
0
  if (matches != NULL)
1751
0
    *matches = -1;
1752
1753
0
  if (flags & RE_FLAGS_WIDE)
1754
0
    character_size = 2;
1755
0
  else
1756
0
    character_size = 1;
1757
1758
0
  input = input_data;
1759
0
  input_incr = character_size;
1760
1761
0
  if (flags & RE_FLAGS_BACKWARDS)
1762
0
  {
1763
    // Signedness conversion is sound as long as YR_RE_SCAN_LIMIT <= INT_MAX
1764
0
    max_bytes_matched = (int) yr_min(input_backwards_size, YR_RE_SCAN_LIMIT);
1765
0
    input -= character_size;
1766
0
    input_incr = -input_incr;
1767
0
  }
1768
0
  else
1769
0
  {
1770
    // Signedness conversion is sound as long as YR_RE_SCAN_LIMIT <= INT_MAX
1771
0
    max_bytes_matched = (int) yr_min(input_forwards_size, YR_RE_SCAN_LIMIT);
1772
0
  }
1773
1774
  // Round down max_bytes_matched to a multiple of character_size, this way if
1775
  // character_size is 2 and max_bytes_matched is odd we are ignoring the
1776
  // extra byte which can't match anyways.
1777
1778
0
  max_bytes_matched = max_bytes_matched - max_bytes_matched % character_size;
1779
0
  bytes_matched = 0;
1780
1781
0
  FAIL_ON_ERROR(_yr_re_fiber_create(&context->re_fiber_pool, &fiber));
1782
1783
0
  fiber->ip = code;
1784
0
  fibers.head = fiber;
1785
0
  fibers.tail = fiber;
1786
1787
0
  FAIL_ON_ERROR_WITH_CLEANUP(
1788
0
      _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
1789
0
      _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1790
1791
0
  while (fibers.head != NULL)
1792
0
  {
1793
0
    fiber = fibers.head;
1794
1795
0
    while (fiber != NULL)
1796
0
    {
1797
0
      next_fiber = fiber->next;
1798
1799
0
      if (_yr_re_fiber_exists(&fibers, fiber, fiber->prev))
1800
0
        _yr_re_fiber_kill(&fibers, &context->re_fiber_pool, fiber);
1801
1802
0
      fiber = next_fiber;
1803
0
    }
1804
1805
0
    fiber = fibers.head;
1806
1807
0
    while (fiber != NULL)
1808
0
    {
1809
0
      ip = fiber->ip;
1810
0
      action = ACTION_NONE;
1811
1812
0
      switch (*ip)
1813
0
      {
1814
0
      case RE_OPCODE_ANY:
1815
0
        prolog;
1816
0
        match = (flags & RE_FLAGS_DOT_ALL) || (*input != 0x0A);
1817
0
        action = match ? ACTION_NONE : ACTION_KILL;
1818
0
        fiber->ip += 1;
1819
0
        break;
1820
1821
0
      case RE_OPCODE_REPEAT_ANY_GREEDY:
1822
0
      case RE_OPCODE_REPEAT_ANY_UNGREEDY:
1823
0
        prolog;
1824
0
        match = (flags & RE_FLAGS_DOT_ALL) || (*input != 0x0A);
1825
0
        action = match ? ACTION_NONE : ACTION_KILL;
1826
1827
        // The instruction pointer is not incremented here. The current fiber
1828
        // spins in this instruction until reaching the required number of
1829
        // repetitions. The code controlling the number of repetitions is in
1830
        // _yr_re_fiber_sync.
1831
1832
0
        break;
1833
1834
0
      case RE_OPCODE_LITERAL:
1835
0
        prolog;
1836
0
        if (flags & RE_FLAGS_NO_CASE)
1837
0
          match = yr_lowercase[*input] == yr_lowercase[*(ip + 1)];
1838
0
        else
1839
0
          match = (*input == *(ip + 1));
1840
0
        action = match ? ACTION_NONE : ACTION_KILL;
1841
0
        fiber->ip += 2;
1842
0
        break;
1843
1844
0
      case RE_OPCODE_NOT_LITERAL:
1845
0
        prolog;
1846
1847
        // We don't need to take into account the case-insensitive
1848
        // case because this opcode is only used with hex strings,
1849
        // which can't be case-insensitive.
1850
1851
0
        match = (*input != *(ip + 1));
1852
0
        action = match ? ACTION_NONE : ACTION_KILL;
1853
0
        fiber->ip += 2;
1854
0
        break;
1855
1856
0
      case RE_OPCODE_MASKED_LITERAL:
1857
0
        prolog;
1858
0
        opcode_args = yr_unaligned_u16(ip + 1);
1859
0
        mask = opcode_args >> 8;
1860
0
        value = opcode_args & 0xFF;
1861
1862
        // We don't need to take into account the case-insensitive
1863
        // case because this opcode is only used with hex strings,
1864
        // which can't be case-insensitive.
1865
1866
0
        match = ((*input & mask) == value);
1867
0
        action = match ? ACTION_NONE : ACTION_KILL;
1868
0
        fiber->ip += 3;
1869
0
        break;
1870
1871
0
      case RE_OPCODE_MASKED_NOT_LITERAL:
1872
0
        prolog;
1873
0
        opcode_args = yr_unaligned_u16(ip + 1);
1874
0
        mask = opcode_args >> 8;
1875
0
        value = opcode_args & 0xFF;
1876
1877
        // We don't need to take into account the case-insensitive
1878
        // case because this opcode is only used with hex strings,
1879
        // which can't be case-insensitive.
1880
1881
0
        match = ((*input & mask) != value);
1882
0
        action = match ? ACTION_NONE : ACTION_KILL;
1883
0
        fiber->ip += 3;
1884
0
        break;
1885
1886
0
      case RE_OPCODE_CLASS:
1887
0
        prolog;
1888
0
        match = _yr_re_is_char_in_class(
1889
0
            (RE_CLASS*) (ip + 1), *input, flags & RE_FLAGS_NO_CASE);
1890
0
        action = match ? ACTION_NONE : ACTION_KILL;
1891
0
        fiber->ip += (sizeof(RE_CLASS) + 1);
1892
0
        break;
1893
1894
0
      case RE_OPCODE_WORD_CHAR:
1895
0
        prolog;
1896
0
        match = _yr_re_is_word_char(input, character_size);
1897
0
        action = match ? ACTION_NONE : ACTION_KILL;
1898
0
        fiber->ip += 1;
1899
0
        break;
1900
1901
0
      case RE_OPCODE_NON_WORD_CHAR:
1902
0
        prolog;
1903
0
        match = !_yr_re_is_word_char(input, character_size);
1904
0
        action = match ? ACTION_NONE : ACTION_KILL;
1905
0
        fiber->ip += 1;
1906
0
        break;
1907
1908
0
      case RE_OPCODE_SPACE:
1909
0
      case RE_OPCODE_NON_SPACE:
1910
1911
0
        prolog;
1912
1913
0
        switch (*input)
1914
0
        {
1915
0
        case ' ':
1916
0
        case '\t':
1917
0
        case '\r':
1918
0
        case '\n':
1919
0
        case '\v':
1920
0
        case '\f':
1921
0
          match = true;
1922
0
          break;
1923
0
        default:
1924
0
          match = false;
1925
0
        }
1926
1927
0
        if (*ip == RE_OPCODE_NON_SPACE)
1928
0
          match = !match;
1929
1930
0
        action = match ? ACTION_NONE : ACTION_KILL;
1931
0
        fiber->ip += 1;
1932
0
        break;
1933
1934
0
      case RE_OPCODE_DIGIT:
1935
0
        prolog;
1936
0
        match = isdigit(*input);
1937
0
        action = match ? ACTION_NONE : ACTION_KILL;
1938
0
        fiber->ip += 1;
1939
0
        break;
1940
1941
0
      case RE_OPCODE_NON_DIGIT:
1942
0
        prolog;
1943
0
        match = !isdigit(*input);
1944
0
        action = match ? ACTION_NONE : ACTION_KILL;
1945
0
        fiber->ip += 1;
1946
0
        break;
1947
1948
0
      case RE_OPCODE_WORD_BOUNDARY:
1949
0
      case RE_OPCODE_NON_WORD_BOUNDARY:
1950
0
        if (input - input_incr + character_size <=
1951
0
                input_data + input_forwards_size &&
1952
0
            input - input_incr >= input_data - input_backwards_size)
1953
0
        {
1954
0
          prev_is_word_char = _yr_re_is_word_char(
1955
0
              input - input_incr, character_size);
1956
0
        }
1957
0
        else
1958
0
        {
1959
0
          prev_is_word_char = false;
1960
0
        }
1961
1962
0
        if (input + character_size <= input_data + input_forwards_size &&
1963
0
            input >= input_data - input_backwards_size)
1964
0
        {
1965
0
          input_is_word_char = _yr_re_is_word_char(input, character_size);
1966
0
        }
1967
0
        else
1968
0
        {
1969
0
          input_is_word_char = false;
1970
0
        }
1971
1972
0
        match = (prev_is_word_char && !input_is_word_char) ||
1973
0
                (!prev_is_word_char && input_is_word_char);
1974
1975
0
        if (*ip == RE_OPCODE_NON_WORD_BOUNDARY)
1976
0
          match = !match;
1977
1978
0
        action = match ? ACTION_CONTINUE : ACTION_KILL;
1979
0
        fiber->ip += 1;
1980
0
        break;
1981
1982
0
      case RE_OPCODE_MATCH_AT_START:
1983
0
        if (flags & RE_FLAGS_BACKWARDS)
1984
0
          kill = input_backwards_size > (size_t) bytes_matched;
1985
0
        else
1986
0
          kill = input_backwards_size > 0 || (bytes_matched != 0);
1987
0
        action = kill ? ACTION_KILL : ACTION_CONTINUE;
1988
0
        fiber->ip += 1;
1989
0
        break;
1990
1991
0
      case RE_OPCODE_MATCH_AT_END:
1992
0
        kill = flags & RE_FLAGS_BACKWARDS ||
1993
0
               input_forwards_size > (size_t) bytes_matched;
1994
0
        action = kill ? ACTION_KILL : ACTION_CONTINUE;
1995
0
        fiber->ip += 1;
1996
0
        break;
1997
1998
0
      case RE_OPCODE_MATCH:
1999
2000
0
        if (matches != NULL)
2001
0
          *matches = bytes_matched;
2002
2003
0
        if (flags & RE_FLAGS_EXHAUSTIVE)
2004
0
        {
2005
0
          if (callback != NULL)
2006
0
          {
2007
0
            if (flags & RE_FLAGS_BACKWARDS)
2008
0
            {
2009
0
              FAIL_ON_ERROR_WITH_CLEANUP(
2010
0
                  callback(
2011
0
                      input + character_size,
2012
0
                      bytes_matched,
2013
0
                      flags,
2014
0
                      callback_args),
2015
0
                  _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2016
0
            }
2017
0
            else
2018
0
            {
2019
0
              FAIL_ON_ERROR_WITH_CLEANUP(
2020
0
                  callback(input_data, bytes_matched, flags, callback_args),
2021
0
                  _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2022
0
            }
2023
0
          }
2024
2025
0
          action = ACTION_KILL;
2026
0
        }
2027
0
        else
2028
0
        {
2029
0
          action = ACTION_KILL_TAIL;
2030
0
        }
2031
2032
0
        break;
2033
2034
0
      default:
2035
0
        assert(false);
2036
0
      }
2037
2038
0
      switch (action)
2039
0
      {
2040
0
      case ACTION_KILL:
2041
0
        fiber = _yr_re_fiber_kill(&fibers, &context->re_fiber_pool, fiber);
2042
0
        break;
2043
2044
0
      case ACTION_KILL_TAIL:
2045
0
        _yr_re_fiber_kill_tail(&fibers, &context->re_fiber_pool, fiber);
2046
0
        fiber = NULL;
2047
0
        break;
2048
2049
0
      case ACTION_CONTINUE:
2050
0
        FAIL_ON_ERROR_WITH_CLEANUP(
2051
0
            _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
2052
0
            _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2053
0
        break;
2054
2055
0
      default:
2056
0
        next_fiber = fiber->next;
2057
0
        FAIL_ON_ERROR_WITH_CLEANUP(
2058
0
            _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
2059
0
            _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2060
0
        fiber = next_fiber;
2061
0
      }
2062
0
    }
2063
2064
0
    input += input_incr;
2065
0
    bytes_matched += character_size;
2066
2067
0
    if (flags & RE_FLAGS_SCAN && bytes_matched < max_bytes_matched)
2068
0
    {
2069
0
      FAIL_ON_ERROR_WITH_CLEANUP(
2070
0
          _yr_re_fiber_create(&context->re_fiber_pool, &fiber),
2071
0
          _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2072
2073
0
      fiber->ip = code;
2074
0
      _yr_re_fiber_append(&fibers, fiber);
2075
2076
0
      FAIL_ON_ERROR_WITH_CLEANUP(
2077
0
          _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
2078
0
          _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
2079
0
    }
2080
0
  }
2081
2082
0
  return ERROR_SUCCESS;
2083
0
}
2084
2085
////////////////////////////////////////////////////////////////////////////////
2086
// Helper function that creates a RE_FAST_EXEC_POSITION by either allocating it
2087
// or reusing a previously allocated one from a pool.
2088
//
2089
static int _yr_re_fast_exec_position_create(
2090
    RE_FAST_EXEC_POSITION_POOL* pool,
2091
    RE_FAST_EXEC_POSITION** new_position)
2092
0
{
2093
0
  RE_FAST_EXEC_POSITION* position;
2094
2095
0
  if (pool->head != NULL)
2096
0
  {
2097
0
    position = pool->head;
2098
0
    pool->head = position->next;
2099
0
  }
2100
0
  else
2101
0
  {
2102
0
    position = (RE_FAST_EXEC_POSITION*) yr_malloc(
2103
0
        sizeof(RE_FAST_EXEC_POSITION));
2104
2105
0
    if (position == NULL)
2106
0
      return ERROR_INSUFFICIENT_MEMORY;
2107
0
  }
2108
2109
0
  position->input = NULL;
2110
0
  position->round = 0;
2111
0
  position->next = NULL;
2112
0
  position->prev = NULL;
2113
2114
0
  *new_position = position;
2115
2116
0
  return ERROR_SUCCESS;
2117
0
}
2118
2119
////////////////////////////////////////////////////////////////////////////////
2120
// Helper function that moves a list of RE_FAST_EXEC_POSITION structures to a
2121
// pool represented by a RE_FAST_EXEC_POSITION_POOL. Receives pointers to the
2122
// pool, the first item, and last item in the list.
2123
//
2124
static void _yr_re_fast_exec_destroy_position_list(
2125
    RE_FAST_EXEC_POSITION_POOL* pool,
2126
    RE_FAST_EXEC_POSITION* first,
2127
    RE_FAST_EXEC_POSITION* last)
2128
0
{
2129
0
  last->next = pool->head;
2130
2131
0
  if (pool->head != NULL)
2132
0
    pool->head->prev = last;
2133
2134
0
  pool->head = first;
2135
0
}
2136
2137
////////////////////////////////////////////////////////////////////////////////
2138
// This function replaces yr_re_exec for regular expressions marked with flag
2139
// RE_FLAGS_FAST_REGEXP. These regular expressions are derived from hex strings
2140
// that don't contain alternatives, like for example:
2141
//
2142
//   { 01 02 03 04 [0-2] 04 05 06 07 }
2143
//
2144
// The regexp's code produced by such strings can be matched by a faster, less
2145
// general algorithm, and it contains only the following opcodes:
2146
//
2147
//   * RE_OPCODE_LITERAL
2148
//   * RE_OPCODE_MASKED_LITERAL,
2149
//   * RE_OPCODE_NOT_LITERAL
2150
//   * RE_OPCODE_MASKED_NOT_LITERAL
2151
//   * RE_OPCODE_ANY
2152
//   * RE_OPCODE_REPEAT_ANY_UNGREEDY
2153
//   * RE_OPCODE_MATCH.
2154
//
2155
// The regexp's code is executed one instruction at time, and the function
2156
// maintains a list of positions within the input for tracking different
2157
// matching alternatives, these positions are described by an instance of
2158
// RE_FAST_EXEC_POSITION. With each instruction the list of positions is
2159
// updated, removing those where the input data doesn't match the current
2160
// instruction, or adding new positions if the instruction is
2161
// RE_OPCODE_REPEAT_ANY_UNGREEDY. The list of positions is maintained sorted
2162
// by the value of the pointer they hold to the input, and it doesn't contain
2163
// duplicated pointer values.
2164
//
2165
int yr_re_fast_exec(
2166
    YR_SCAN_CONTEXT* context,
2167
    const uint8_t* code,
2168
    const uint8_t* input_data,
2169
    size_t input_forwards_size,
2170
    size_t input_backwards_size,
2171
    int flags,
2172
    RE_MATCH_CALLBACK_FUNC callback,
2173
    void* callback_args,
2174
    int* matches)
2175
0
{
2176
0
  RE_REPEAT_ANY_ARGS* repeat_any_args;
2177
2178
  // Pointers to the first and last position in the list.
2179
0
  RE_FAST_EXEC_POSITION* first;
2180
0
  RE_FAST_EXEC_POSITION* last;
2181
2182
0
  int input_incr = flags & RE_FLAGS_BACKWARDS ? -1 : 1;
2183
0
  int bytes_matched;
2184
0
  int max_bytes_matched;
2185
2186
0
  if (flags & RE_FLAGS_BACKWARDS)
2187
0
    max_bytes_matched = (int) yr_min(input_backwards_size, YR_RE_SCAN_LIMIT);
2188
0
  else
2189
0
    max_bytes_matched = (int) yr_min(input_forwards_size, YR_RE_SCAN_LIMIT);
2190
2191
0
  const uint8_t* ip = code;
2192
2193
  // Create the first position in the list, which points to the start of the
2194
  // input data. Intially this is the only position, more positions will be
2195
  // created every time RE_OPCODE_REPEAT_ANY_UNGREEDY is executed.
2196
0
  FAIL_ON_ERROR(_yr_re_fast_exec_position_create(
2197
0
      &context->re_fast_exec_position_pool, &first));
2198
2199
0
  first->round = 0;
2200
0
  first->input = input_data;
2201
0
  first->prev = NULL;
2202
0
  first->next = NULL;
2203
2204
0
  if (flags & RE_FLAGS_BACKWARDS)
2205
0
    first->input--;
2206
2207
  // As we are starting with a single position, the last one and the first one
2208
  // are the same.
2209
0
  last = first;
2210
2211
  // Round is incremented with every regxp instruction.
2212
0
  int round = 0;
2213
2214
  // Keep in the loop until the list of positions gets empty. Positions are
2215
  // removed from the list when they don't match the current instruction in
2216
  // the code.
2217
0
  while (first != NULL)
2218
0
  {
2219
0
    RE_FAST_EXEC_POSITION* current = first;
2220
2221
    // Iterate over all the positions, executing the current instruction at
2222
    // all of them.
2223
0
    while (current != NULL)
2224
0
    {
2225
0
      RE_FAST_EXEC_POSITION* next = current->next;
2226
2227
      // Ignore any position in the list whose round number is different from
2228
      // the current round. This prevents new positions added to the list in
2229
      // this round from being taken into account in this same round.
2230
0
      if (current->round != round)
2231
0
      {
2232
0
        current = next;
2233
0
        continue;
2234
0
      }
2235
2236
0
      bytes_matched = flags & RE_FLAGS_BACKWARDS
2237
0
                          ? (int) (input_data - current->input - 1)
2238
0
                          : (int) (current->input - input_data);
2239
2240
0
      uint16_t opcode_args;
2241
0
      uint8_t mask;
2242
0
      uint8_t value;
2243
2244
0
      bool match = false;
2245
2246
0
      switch (*ip)
2247
0
      {
2248
0
      case RE_OPCODE_ANY:
2249
0
        if (bytes_matched >= max_bytes_matched)
2250
0
          break;
2251
2252
0
        match = true;
2253
0
        current->input += input_incr;
2254
0
        break;
2255
2256
0
      case RE_OPCODE_LITERAL:
2257
0
        if (bytes_matched >= max_bytes_matched)
2258
0
          break;
2259
2260
0
        if (*current->input == *(ip + 1))
2261
0
        {
2262
0
          match = true;
2263
0
          current->input += input_incr;
2264
0
        }
2265
0
        break;
2266
2267
0
      case RE_OPCODE_NOT_LITERAL:
2268
0
        if (bytes_matched >= max_bytes_matched)
2269
0
          break;
2270
2271
0
        if (*current->input != *(ip + 1))
2272
0
        {
2273
0
          match = true;
2274
0
          current->input += input_incr;
2275
0
        }
2276
0
        break;
2277
2278
0
      case RE_OPCODE_MASKED_LITERAL:
2279
0
        if (bytes_matched >= max_bytes_matched)
2280
0
          break;
2281
2282
0
        opcode_args = yr_unaligned_u16(ip + 1);
2283
0
        mask = opcode_args >> 8;
2284
0
        value = opcode_args & 0xFF;
2285
2286
0
        if ((*current->input & mask) == value)
2287
0
        {
2288
0
          match = true;
2289
0
          current->input += input_incr;
2290
0
        }
2291
0
        break;
2292
2293
0
      case RE_OPCODE_MASKED_NOT_LITERAL:
2294
0
        if (bytes_matched >= max_bytes_matched)
2295
0
          break;
2296
2297
0
        opcode_args = yr_unaligned_u16(ip + 1);
2298
0
        mask = opcode_args >> 8;
2299
0
        value = opcode_args & 0xFF;
2300
2301
0
        if ((*current->input & mask) != value)
2302
0
        {
2303
0
          match = true;
2304
0
          current->input += input_incr;
2305
0
        }
2306
0
        break;
2307
2308
0
      case RE_OPCODE_REPEAT_ANY_UNGREEDY:
2309
0
        repeat_any_args = (RE_REPEAT_ANY_ARGS*) (ip + 1);
2310
2311
0
        if (bytes_matched + repeat_any_args->min >= max_bytes_matched)
2312
0
          break;
2313
2314
0
        match = true;
2315
2316
0
        const uint8_t* next_opcode = ip + 1 + sizeof(RE_REPEAT_ANY_ARGS);
2317
2318
        // insertion_point indicates the item in the list of inputs after which
2319
        // the newly created inputs will be inserted.
2320
0
        RE_FAST_EXEC_POSITION* insertion_point = current;
2321
2322
0
        for (int j = repeat_any_args->min + 1; j <= repeat_any_args->max; j++)
2323
0
        {
2324
          // bytes_matched is the number of bytes matched so far, and j is the
2325
          // minimum number of bytes that are still pending to match. If these
2326
          // two numbers sum more than the maximum number of bytes that can be
2327
          // matched the loop can be aborted. The position that we were about
2328
          // to create can't lead to a match without exceeding
2329
          // max_bytes_matched.
2330
0
          if (bytes_matched + j >= max_bytes_matched)
2331
0
            break;
2332
2333
0
          const uint8_t* next_input = current->input + j * input_incr;
2334
2335
          // Find the point where next_input should be inserted. The list is
2336
          // kept sorted by pointer in increasing order, the insertion point is
2337
          // an item that has a pointer lower or equal than next_input, but
2338
          // whose next item have a pointer that is larger.
2339
0
          while (insertion_point->next != NULL &&
2340
0
                 insertion_point->next->input <= next_input)
2341
0
          {
2342
0
            insertion_point = insertion_point->next;
2343
0
          }
2344
2345
          // If the input already exists for the next round, we don't need to
2346
          // insert it.
2347
0
          if (insertion_point->round == round + 1 &&
2348
0
              insertion_point->input == next_input)
2349
0
            continue;
2350
2351
          // The next opcode is RE_OPCODE_LITERAL, but the literal doesn't
2352
          // match with the byte at next_input, we don't need to add this
2353
          // input to the list as we already know that it won't match in the
2354
          // next round and will be deleted from the list anyways.
2355
0
          if (*(next_opcode) == RE_OPCODE_LITERAL &&
2356
0
              *(next_opcode + 1) != *next_input)
2357
0
            continue;
2358
2359
0
          RE_FAST_EXEC_POSITION* new_input;
2360
2361
0
          FAIL_ON_ERROR_WITH_CLEANUP(
2362
0
              _yr_re_fast_exec_position_create(
2363
0
                  &context->re_fast_exec_position_pool, &new_input),
2364
              // Cleanup
2365
0
              _yr_re_fast_exec_destroy_position_list(
2366
0
                  &context->re_fast_exec_position_pool, first, last));
2367
2368
0
          new_input->input = next_input;
2369
0
          new_input->round = round + 1;
2370
0
          new_input->prev = insertion_point;
2371
0
          new_input->next = insertion_point->next;
2372
0
          insertion_point->next = new_input;
2373
2374
0
          if (new_input->next != NULL)
2375
0
            new_input->next->prev = new_input;
2376
2377
0
          if (insertion_point == last)
2378
0
            last = new_input;
2379
0
        }
2380
2381
0
        current->input += input_incr * repeat_any_args->min;
2382
0
        break;
2383
2384
0
      case RE_OPCODE_MATCH:
2385
2386
0
        if (flags & RE_FLAGS_EXHAUSTIVE)
2387
0
        {
2388
0
          FAIL_ON_ERROR_WITH_CLEANUP(
2389
0
              callback(
2390
                  // When matching forwards the matching data always starts at
2391
                  // input_data, when matching backwards it starts at input + 1
2392
                  // or input_data - input_backwards_size if input + 1 is
2393
                  // outside the buffer.
2394
0
                  flags & RE_FLAGS_BACKWARDS
2395
0
                      ? yr_max(
2396
0
                            current->input + 1,
2397
0
                            input_data - input_backwards_size)
2398
0
                      : input_data,
2399
                  // The number of matched bytes should not be larger than
2400
                  // max_bytes_matched.
2401
0
                  yr_min(bytes_matched, max_bytes_matched),
2402
0
                  flags,
2403
0
                  callback_args),
2404
              // Cleanup
2405
0
              _yr_re_fast_exec_destroy_position_list(
2406
0
                  &context->re_fast_exec_position_pool, first, last));
2407
0
        }
2408
0
        else
2409
0
        {
2410
0
          if (matches != NULL)
2411
0
            *matches = bytes_matched;
2412
2413
0
          _yr_re_fast_exec_destroy_position_list(
2414
0
              &context->re_fast_exec_position_pool, first, last);
2415
2416
0
          return ERROR_SUCCESS;
2417
0
        }
2418
0
        break;
2419
2420
0
      default:
2421
0
        printf("non-supported opcode: %d\n", *ip);
2422
0
        assert(false);
2423
0
      }
2424
2425
0
      if (match)
2426
0
      {
2427
0
        current->round = round + 1;
2428
0
      }
2429
0
      else
2430
0
      {
2431
        // Remove current from the list. If the item being removed is the first
2432
        // one or the last one, the first and last pointers should be updated.
2433
        // The removed item is put into the pool for later reuse.
2434
2435
0
        if (current == first)
2436
0
          first = current->next;
2437
2438
0
        if (current == last)
2439
0
          last = current->prev;
2440
2441
0
        if (current->prev != NULL)
2442
0
          current->prev->next = current->next;
2443
2444
0
        if (current->next != NULL)
2445
0
          current->next->prev = current->prev;
2446
2447
0
        current->prev = NULL;
2448
0
        current->next = context->re_fast_exec_position_pool.head;
2449
2450
0
        if (current->next != NULL)
2451
0
          current->next->prev = current;
2452
2453
0
        context->re_fast_exec_position_pool.head = current;
2454
0
      }
2455
2456
0
      current = next;
2457
2458
0
    }  // while (current != NULL)
2459
2460
    // Move instruction pointer (ip) to next instruction. Each opcode has
2461
    // a different size.
2462
0
    switch (*ip)
2463
0
    {
2464
0
    case RE_OPCODE_ANY:
2465
0
      ip += 1;
2466
0
      break;
2467
0
    case RE_OPCODE_LITERAL:
2468
0
    case RE_OPCODE_NOT_LITERAL:
2469
0
      ip += 2;
2470
0
      break;
2471
0
    case RE_OPCODE_MASKED_LITERAL:
2472
0
    case RE_OPCODE_MASKED_NOT_LITERAL:
2473
0
      ip += 3;
2474
0
      break;
2475
0
    case RE_OPCODE_REPEAT_ANY_UNGREEDY:
2476
0
      ip += 1 + sizeof(RE_REPEAT_ANY_ARGS);
2477
0
      break;
2478
0
    case RE_OPCODE_MATCH:
2479
0
      break;
2480
0
    default:
2481
0
      assert(false);
2482
0
    }
2483
2484
0
    round++;
2485
0
  }
2486
2487
  // If this point is reached no matches were found.
2488
2489
0
  if (matches != NULL)
2490
0
    *matches = -1;
2491
2492
0
  return ERROR_SUCCESS;
2493
0
}
2494
2495
static void _yr_re_print_node(RE_NODE* re_node, uint32_t indent)
2496
0
{
2497
0
  RE_NODE* child;
2498
0
  int i;
2499
2500
0
  if (re_node == NULL)
2501
0
    return;
2502
2503
0
  if (indent > 0)
2504
0
    printf("\n%*s", indent, " ");
2505
0
  switch (re_node->type)
2506
0
  {
2507
0
  case RE_NODE_ALT:
2508
0
    printf("Alt(");
2509
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2510
0
    printf(",");
2511
0
    _yr_re_print_node(re_node->children_tail, indent + 4);
2512
0
    printf("\n%*s%s", indent, " ", ")");
2513
0
    break;
2514
2515
0
  case RE_NODE_CONCAT:
2516
0
    printf("Cat(");
2517
0
    child = re_node->children_head;
2518
0
    while (child != NULL)
2519
0
    {
2520
0
      _yr_re_print_node(child, indent + 4);
2521
0
      printf(",");
2522
0
      child = child->next_sibling;
2523
0
    }
2524
0
    printf("\n%*s%s", indent, " ", ")");
2525
0
    break;
2526
2527
0
  case RE_NODE_STAR:
2528
0
    printf("Star(");
2529
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2530
0
    printf(")");
2531
0
    break;
2532
2533
0
  case RE_NODE_PLUS:
2534
0
    printf("Plus(");
2535
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2536
0
    printf(")");
2537
0
    break;
2538
2539
0
  case RE_NODE_LITERAL:
2540
0
    printf("Lit(%c)", re_node->value);
2541
0
    break;
2542
2543
0
  case RE_NODE_NOT_LITERAL:
2544
0
    printf("NotLit(%c)", re_node->value);
2545
0
    break;
2546
2547
0
  case RE_NODE_MASKED_LITERAL:
2548
0
    printf("MaskedLit(%02X,%02X)", re_node->value, re_node->mask);
2549
0
    break;
2550
2551
0
  case RE_NODE_WORD_CHAR:
2552
0
    printf("WordChar");
2553
0
    break;
2554
2555
0
  case RE_NODE_NON_WORD_CHAR:
2556
0
    printf("NonWordChar");
2557
0
    break;
2558
2559
0
  case RE_NODE_SPACE:
2560
0
    printf("Space");
2561
0
    break;
2562
2563
0
  case RE_NODE_NON_SPACE:
2564
0
    printf("NonSpace");
2565
0
    break;
2566
2567
0
  case RE_NODE_DIGIT:
2568
0
    printf("Digit");
2569
0
    break;
2570
2571
0
  case RE_NODE_NON_DIGIT:
2572
0
    printf("NonDigit");
2573
0
    break;
2574
2575
0
  case RE_NODE_ANY:
2576
0
    printf("Any");
2577
0
    break;
2578
2579
0
  case RE_NODE_RANGE:
2580
0
    printf("Range(%d-%d, ", re_node->start, re_node->end);
2581
0
    _yr_re_print_node(re_node->children_head, indent + 4);
2582
0
    printf("\n%*s%s", indent, " ", ")");
2583
0
    break;
2584
2585
0
  case RE_NODE_CLASS:
2586
0
    printf("Class(");
2587
0
    for (i = 0; i < 256; i++)
2588
0
      if (_yr_re_is_char_in_class(re_node->re_class, i, false))
2589
0
        printf("%02X,", i);
2590
0
    printf(")");
2591
0
    break;
2592
2593
0
  case RE_NODE_EMPTY:
2594
0
    printf("Empty");
2595
0
    break;
2596
2597
0
  case RE_NODE_ANCHOR_START:
2598
0
    printf("AnchorStart");
2599
0
    break;
2600
2601
0
  case RE_NODE_ANCHOR_END:
2602
0
    printf("AnchorEnd");
2603
0
    break;
2604
2605
0
  case RE_NODE_WORD_BOUNDARY:
2606
0
    printf("WordBoundary");
2607
0
    break;
2608
2609
0
  case RE_NODE_NON_WORD_BOUNDARY:
2610
0
    printf("NonWordBoundary");
2611
0
    break;
2612
2613
0
  case RE_NODE_RANGE_ANY:
2614
0
    printf("RangeAny");
2615
0
    break;
2616
2617
0
  default:
2618
0
    printf("???");
2619
0
    break;
2620
0
  }
2621
0
}
2622
2623
void yr_re_print(RE_AST* re_ast)
2624
0
{
2625
0
  _yr_re_print_node(re_ast->root_node, 0);
2626
0
}