Coverage Report

Created: 2025-09-05 06:55

/src/yara/libyara/ahocorasick.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2013. The YARA Authors. All Rights Reserved.
3
4
Redistribution and use in source and binary forms, with or without modification,
5
are permitted provided that the following conditions are met:
6
7
1. Redistributions of source code must retain the above copyright notice, this
8
list of conditions and the following disclaimer.
9
10
2. Redistributions in binary form must reproduce the above copyright notice,
11
this list of conditions and the following disclaimer in the documentation and/or
12
other materials provided with the distribution.
13
14
3. Neither the name of the copyright holder nor the names of its contributors
15
may be used to endorse or promote products derived from this software without
16
specific prior written permission.
17
18
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
*/
29
30
#include <assert.h>
31
#include <stddef.h>
32
#include <string.h>
33
#include <yara/ahocorasick.h>
34
#include <yara/arena.h>
35
#include <yara/compiler.h>
36
#include <yara/error.h>
37
#include <yara/mem.h>
38
#include <yara/utils.h>
39
40
typedef struct _QUEUE_NODE
41
{
42
  YR_AC_STATE* value;
43
44
  struct _QUEUE_NODE* previous;
45
  struct _QUEUE_NODE* next;
46
47
} QUEUE_NODE;
48
49
typedef struct _QUEUE
50
{
51
  QUEUE_NODE* head;
52
  QUEUE_NODE* tail;
53
54
} QUEUE;
55
56
////////////////////////////////////////////////////////////////////////////////
57
// Pushes an automaton state into the tail of a queue.
58
//
59
// Args:
60
//   queue: Pointer to the queue.
61
//   state: Pointer to the state being pushed into the queue.
62
//
63
// Returns:
64
//   ERROR_SUCCESS
65
//   ERROR_INSUFFICIENT_MEMORY
66
//
67
static int _yr_ac_queue_push(QUEUE* queue, YR_AC_STATE* state)
68
624k
{
69
624k
  QUEUE_NODE* pushed_node;
70
71
624k
  pushed_node = (QUEUE_NODE*) yr_malloc(sizeof(QUEUE_NODE));
72
73
624k
  if (pushed_node == NULL)
74
0
    return ERROR_INSUFFICIENT_MEMORY;
75
76
624k
  pushed_node->previous = queue->tail;
77
624k
  pushed_node->next = NULL;
78
624k
  pushed_node->value = state;
79
80
624k
  if (queue->tail != NULL)
81
624k
    queue->tail->next = pushed_node;
82
120
  else  // queue is empty
83
120
    queue->head = pushed_node;
84
85
624k
  queue->tail = pushed_node;
86
87
624k
  return ERROR_SUCCESS;
88
624k
}
89
90
////////////////////////////////////////////////////////////////////////////////
91
// Pops an automaton state from the head of a queue.
92
//
93
// Args:
94
//   queue: Pointer to the queue.
95
//
96
// Returns:
97
//   Pointer to the poped state.
98
//
99
static YR_AC_STATE* _yr_ac_queue_pop(QUEUE* queue)
100
624k
{
101
624k
  YR_AC_STATE* result;
102
624k
  QUEUE_NODE* popped_node;
103
104
624k
  if (queue->head == NULL)
105
0
    return NULL;
106
107
624k
  popped_node = queue->head;
108
624k
  queue->head = popped_node->next;
109
110
624k
  if (queue->head)
111
624k
    queue->head->previous = NULL;
112
120
  else  // queue is empty
113
120
    queue->tail = NULL;
114
115
624k
  result = popped_node->value;
116
117
624k
  yr_free(popped_node);
118
624k
  return result;
119
624k
}
120
121
////////////////////////////////////////////////////////////////////////////////
122
// Checks if a queue is empty.
123
//
124
// Args:
125
//   queue: Pointer to the queue.
126
//
127
// Returns:
128
//   true if queue is empty, false otherwise.
129
//
130
static int _yr_ac_queue_is_empty(QUEUE* queue)
131
624k
{
132
624k
  return queue->head == NULL;
133
624k
}
134
135
////////////////////////////////////////////////////////////////////////////////
136
// Given an automaton state and an input symbol, returns the new state
137
// after reading the input symbol.
138
//
139
// Args:
140
//    state: Pointer to automaton state.
141
//    input: Input symbol.
142
//
143
// Returns:
144
//   Pointer to the next automaton state.
145
//
146
static YR_AC_STATE* _yr_ac_next_state(YR_AC_STATE* state, uint8_t input)
147
7.52M
{
148
7.52M
  YR_AC_STATE* next_state = state->first_child;
149
150
290M
  while (next_state != NULL)
151
290M
  {
152
290M
    if (next_state->input == input)
153
7.06M
      return next_state;
154
155
283M
    next_state = next_state->siblings;
156
283M
  }
157
158
462k
  return NULL;
159
7.52M
}
160
161
////////////////////////////////////////////////////////////////////////////////
162
// Creates a new automaton state, the automaton will transition from
163
// the given state to the new state after reading the input symbol.
164
//
165
// Args:
166
//   state: Pointer to the origin state.
167
//   input: Input symbol.
168
//
169
// Returns:
170
//   YR_AC_STATE* pointer to the newly allocated state or NULL in case
171
//   of error.
172
//
173
static YR_AC_STATE* _yr_ac_state_create(YR_AC_STATE* state, uint8_t input)
174
370k
{
175
370k
  YR_AC_STATE* new_state = (YR_AC_STATE*) yr_malloc(sizeof(YR_AC_STATE));
176
177
370k
  if (new_state == NULL)
178
0
    return NULL;
179
180
370k
  new_state->input = input;
181
370k
  new_state->depth = state->depth + 1;
182
370k
  new_state->matches_ref = YR_ARENA_NULL_REF;
183
370k
  new_state->failure = NULL;
184
370k
  new_state->t_table_slot = 0;
185
370k
  new_state->first_child = NULL;
186
370k
  new_state->siblings = state->first_child;
187
370k
  state->first_child = new_state;
188
189
370k
  return new_state;
190
370k
}
191
192
////////////////////////////////////////////////////////////////////////////////
193
// Destroys an automaton state.
194
//
195
static int _yr_ac_state_destroy(YR_AC_STATE* state)
196
371k
{
197
371k
  YR_AC_STATE* child_state = state->first_child;
198
199
742k
  while (child_state != NULL)
200
370k
  {
201
370k
    YR_AC_STATE* next_child_state = child_state->siblings;
202
370k
    _yr_ac_state_destroy(child_state);
203
370k
    child_state = next_child_state;
204
370k
  }
205
206
371k
  yr_free(state);
207
208
371k
  return ERROR_SUCCESS;
209
371k
}
210
211
////////////////////////////////////////////////////////////////////////////////
212
// Create failure links for each automaton state.
213
//
214
// This function must be called after all the strings have been added to the
215
// automaton with yr_ac_add_string.
216
//
217
static int _yr_ac_create_failure_links(YR_AC_AUTOMATON* automaton)
218
113
{
219
113
  YR_AC_STATE* current_state;
220
113
  YR_AC_STATE* failure_state;
221
113
  YR_AC_STATE* temp_state;
222
113
  YR_AC_STATE* state;
223
113
  YR_AC_STATE* transition_state;
224
113
  YR_AC_STATE* root_state;
225
113
  YR_AC_MATCH* match;
226
227
113
  QUEUE queue;
228
229
113
  queue.head = NULL;
230
113
  queue.tail = NULL;
231
232
113
  root_state = automaton->root;
233
234
  // Set the failure link of root state to itself.
235
113
  root_state->failure = root_state;
236
237
  // Push root's children and set their failure link to root.
238
113
  state = root_state->first_child;
239
240
941
  while (state != NULL)
241
828
  {
242
828
    FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
243
828
    state->failure = root_state;
244
828
    state = state->siblings;
245
828
  }
246
247
  // Traverse the trie in BFS order calculating the failure link
248
  // for each state.
249
208k
  while (!_yr_ac_queue_is_empty(&queue))
250
208k
  {
251
208k
    current_state = _yr_ac_queue_pop(&queue);
252
208k
    match = yr_arena_ref_to_ptr(automaton->arena, &current_state->matches_ref);
253
254
208k
    if (match != NULL)
255
140k
    {
256
      // Find the last match in the list of matches.
257
2.16M
      while (match->next != NULL) match = match->next;
258
259
140k
      if (match->backtrack > 0)
260
129k
        match->next = yr_arena_ref_to_ptr(
261
129k
            automaton->arena, &root_state->matches_ref);
262
140k
    }
263
68.0k
    else
264
68.0k
    {
265
      // This state doesn't have any matches, its matches will be those
266
      // in the root state, if any.
267
68.0k
      current_state->matches_ref = root_state->matches_ref;
268
68.0k
    }
269
270
    // Iterate over all the states that the current state can transition to.
271
208k
    transition_state = current_state->first_child;
272
273
415k
    while (transition_state != NULL)
274
207k
    {
275
207k
      FAIL_ON_ERROR(_yr_ac_queue_push(&queue, transition_state));
276
207k
      failure_state = current_state->failure;
277
278
253k
      while (1)
279
253k
      {
280
253k
        temp_state = _yr_ac_next_state(failure_state, transition_state->input);
281
282
253k
        if (temp_state != NULL)
283
161k
        {
284
161k
          transition_state->failure = temp_state;
285
286
161k
          if (YR_ARENA_IS_NULL_REF(transition_state->matches_ref))
287
42.1k
          {
288
42.1k
            transition_state->matches_ref = temp_state->matches_ref;
289
42.1k
          }
290
119k
          else
291
119k
          {
292
119k
            match = yr_arena_ref_to_ptr(
293
119k
                automaton->arena, &transition_state->matches_ref);
294
295
119k
            assert(match != NULL);
296
297
            // Find the last match in the list of matches.
298
1.43M
            while (match->next != NULL) match = match->next;
299
300
119k
            match->next = yr_arena_ref_to_ptr(
301
119k
                automaton->arena, &temp_state->matches_ref);
302
119k
          }
303
304
161k
          break;
305
161k
        }
306
92.6k
        else
307
92.6k
        {
308
92.6k
          if (failure_state == root_state)
309
45.9k
          {
310
45.9k
            transition_state->failure = root_state;
311
45.9k
            break;
312
45.9k
          }
313
46.6k
          else
314
46.6k
          {
315
46.6k
            failure_state = failure_state->failure;
316
46.6k
          }
317
92.6k
        }
318
253k
      }  // while(1)
319
320
207k
      transition_state = transition_state->siblings;
321
207k
    }
322
323
208k
  }  // while(!__yr_ac_queue_is_empty(&queue))
324
325
113
  return ERROR_SUCCESS;
326
113
}
327
328
////////////////////////////////////////////////////////////////////////////////
329
// Returns true if the transitions for state s2 are a subset of the transitions
330
// for state s1. In other words, if at state s2 input X is accepted, it must be
331
// accepted in s1 too.
332
//
333
static bool _yr_ac_transitions_subset(YR_AC_STATE* s1, YR_AC_STATE* s2)
334
161k
{
335
161k
  uint8_t set[32];
336
337
161k
  YR_AC_STATE* state = s1->first_child;
338
339
161k
  memset(set, 0, 32);
340
341
274k
  while (state != NULL)
342
112k
  {
343
112k
    set[state->input / 8] |= 1 << state->input % 8;
344
112k
    state = state->siblings;
345
112k
  }
346
347
161k
  state = s2->first_child;
348
349
207k
  while (state != NULL)
350
174k
  {
351
174k
    if (!(set[state->input / 8] & 1 << state->input % 8))
352
128k
      return false;
353
354
46.4k
    state = state->siblings;
355
46.4k
  }
356
357
32.9k
  return true;
358
161k
}
359
360
////////////////////////////////////////////////////////////////////////////////
361
// Removes unnecessary failure links.
362
//
363
static int _yr_ac_optimize_failure_links(YR_AC_AUTOMATON* automaton)
364
113
{
365
113
  QUEUE queue = {NULL, NULL};
366
367
  // Push root's children.
368
113
  YR_AC_STATE* root_state = automaton->root;
369
113
  YR_AC_STATE* state = root_state->first_child;
370
371
941
  while (state != NULL)
372
828
  {
373
828
    FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
374
828
    state = state->siblings;
375
828
  }
376
377
208k
  while (!_yr_ac_queue_is_empty(&queue))
378
208k
  {
379
208k
    YR_AC_STATE* current_state = _yr_ac_queue_pop(&queue);
380
381
208k
    if (current_state->failure != root_state)
382
161k
    {
383
161k
      if (_yr_ac_transitions_subset(current_state, current_state->failure))
384
32.9k
        current_state->failure = current_state->failure->failure;
385
161k
    }
386
387
    // Push children of current_state
388
208k
    state = current_state->first_child;
389
390
415k
    while (state != NULL)
391
207k
    {
392
207k
      FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
393
207k
      state = state->siblings;
394
207k
    }
395
208k
  }
396
397
113
  return ERROR_SUCCESS;
398
113
}
399
400
////////////////////////////////////////////////////////////////////////////////
401
// Find a place within the automaton's transition table where the transitions
402
// for the given state can be put. The function first create a bitmask for the
403
// state's transition table, then searches for an offset within the automaton's
404
// bitmask where the state's bitmask can be put without bit collisions.
405
//
406
static int _yr_ac_find_suitable_transition_table_slot(
407
    YR_AC_AUTOMATON* automaton,
408
    YR_ARENA* arena,
409
    YR_AC_STATE* state,
410
    uint32_t* slot)
411
208k
{
412
  // The state's transition table has 257 entries, 1 for the failure link and
413
  // 256 for each possible input byte, so the state's bitmask has 257 bits.
414
208k
  YR_BITMASK state_bitmask[YR_BITMASK_SIZE(257)];
415
416
208k
  YR_AC_STATE* child_state = state->first_child;
417
418
  // Start with all bits set to zero.
419
208k
  yr_bitmask_clear_all(state_bitmask);
420
421
  // The first slot in the transition table is for the state's failure link,
422
  // so the first bit in the bitmask must be set to one.
423
208k
  yr_bitmask_set(state_bitmask, 0);
424
425
415k
  while (child_state != NULL)
426
207k
  {
427
207k
    yr_bitmask_set(state_bitmask, child_state->input + 1);
428
207k
    child_state = child_state->siblings;
429
207k
  }
430
431
208k
  *slot = yr_bitmask_find_non_colliding_offset(
432
208k
      automaton->bitmask,
433
208k
      state_bitmask,
434
208k
      automaton->tables_size,
435
208k
      257,
436
208k
      &automaton->t_table_unused_candidate);
437
438
  // Make sure that we are not going beyond the maximum size of the transition
439
  // table, starting at the slot found there must be at least 257 other slots
440
  // for accommodating the state's transition table.
441
208k
  assert(*slot + 257 < YR_AC_MAX_TRANSITION_TABLE_SIZE);
442
443
208k
  if (*slot > automaton->tables_size - 257)
444
1.60k
  {
445
1.60k
    FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
446
1.60k
        arena, YR_AC_TRANSITION_TABLE, 257 * sizeof(YR_AC_TRANSITION), NULL));
447
448
1.60k
    FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
449
1.60k
        arena, YR_AC_STATE_MATCHES_TABLE, 257 * sizeof(uint8_t*), NULL));
450
451
1.60k
    size_t bm_len = YR_BITMASK_SIZE(automaton->tables_size) *
452
1.60k
                    sizeof(YR_BITMASK);
453
454
1.60k
    size_t bm_len_incr = YR_BITMASK_SIZE(257) * sizeof(YR_BITMASK);
455
456
1.60k
    automaton->bitmask = yr_realloc(automaton->bitmask, bm_len + bm_len_incr);
457
458
1.60k
    if (automaton->bitmask == NULL)
459
0
      return ERROR_INSUFFICIENT_MEMORY;
460
461
1.60k
    memset((uint8_t*) automaton->bitmask + bm_len, 0, bm_len_incr);
462
463
1.60k
    automaton->tables_size += 257;
464
1.60k
  }
465
466
208k
  return ERROR_SUCCESS;
467
208k
}
468
469
////////////////////////////////////////////////////////////////////////////////
470
// Builds the transition table for the automaton. The transition table (T) is a
471
// large array of 32-bits integers. Each state in the automaton is represented
472
// by an index S within the array. The integer stored in T[S] is the failure
473
// link for state S, it contains the index of the next state when no valid
474
// transition exists for the next input byte.
475
//
476
// At position T[S+1+B] (where B is a byte) we can find the transition (if any)
477
// that must be followed from state S if the next input is B. The value in
478
// T[S+1+B] contains the index for next state or zero. A zero value means that
479
// no valid transition exists from state S when next input is B, and the failure
480
// link must be used instead.
481
//
482
// The transition table for state S starts at T[S] and spans the next 257
483
// slots in the array (1 for the failure link and 256 for all the possible
484
// transitions). But many of those slots are for invalid transitions, so
485
// the transitions for multiple states can be interleaved as long as they don't
486
// collide. For example, instead of having this transition table with state S1
487
// and S2 separated by a large number of slots:
488
//
489
// S1                                             S2
490
// +------+------+------+------+--   ~   --+------+------+------+--   ~   --+
491
// | FLS1 |   X  |   -  |   -  |     -     |  Y   | FLS2 |   Z  |     -     |
492
// +------+------+------+------+--   ~   --+------+------+------+--   ~   --+
493
//
494
// We can interleave the transitions for states S1 and S2 and get this other
495
// transition table, which is more compact:
496
//
497
// S1            S2
498
// +------+------+------+------+--   ~   --+------+
499
// | FLS1 |  X   | FLS2 |   Z  |     -     |  Y   |
500
// +------+------+------+------+--   ~   --+------+
501
//
502
// And how do we know that transition Z belongs to state S2 and not S1? Or that
503
// transition Y belongs to S1 and not S2? Because each slot of the array not
504
// only contains the index for the state where the transition points to, it
505
// also contains the offset of the transition relative to its owner state. So,
506
// the value for the owner offset would be 1 for transitions X, because X
507
// belongs to state S1 and it's located 1 position away from S1. The same occurs
508
// for Z, it belongs to S2 and it's located one position away from S2 so its
509
// owner offset is 1. If we are in S1 and next byte is 2, we are going to read
510
// the transition at T[S1+1+2] which is Z. But we know that transition Z is not
511
// a valid transition for state S1 because the owner offset for Z is 1 not 3.
512
//
513
// Each 32-bit slot in the transition table has 23 bits for storing the index
514
// of the target state and 9 bits for storing the offset of the slot relative
515
// to its own state. The offset can be any value from 0 to 256, both inclusive,
516
// hence 9 bits are required for it. The layout for the slot goes like:
517
//
518
// 32                      23        0
519
// +-----------------------+---------+
520
// | Target state's index  |  Offset |
521
// +-----------------------+---------+
522
//
523
// A more detailed description can be found in: http://goo.gl/lE6zG
524
//
525
static int _yr_ac_build_transition_table(YR_AC_AUTOMATON* automaton)
526
113
{
527
113
  YR_AC_TRANSITION* t_table;
528
113
  uint32_t* m_table;
529
113
  YR_AC_STATE* state;
530
113
  YR_AC_STATE* child_state;
531
113
  YR_AC_STATE* root_state = automaton->root;
532
533
113
  uint32_t slot;
534
535
113
  QUEUE queue = {NULL, NULL};
536
537
  // Both t_table and m_table have 512 slots initially, which is enough for the
538
  // root node's transition table.
539
113
  automaton->tables_size = 512;
540
541
113
  automaton->bitmask = yr_calloc(
542
113
      YR_BITMASK_SIZE(automaton->tables_size), sizeof(YR_BITMASK));
543
544
113
  if (automaton->bitmask == NULL)
545
0
    return ERROR_INSUFFICIENT_MEMORY;
546
547
113
  FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
548
113
      automaton->arena,
549
113
      YR_AC_TRANSITION_TABLE,
550
113
      automaton->tables_size * sizeof(YR_AC_TRANSITION),
551
113
      NULL));
552
553
113
  FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
554
113
      automaton->arena,
555
113
      YR_AC_STATE_MATCHES_TABLE,
556
113
      automaton->tables_size * sizeof(uint32_t),
557
113
      NULL));
558
559
113
  t_table = yr_arena_get_ptr(automaton->arena, YR_AC_TRANSITION_TABLE, 0);
560
113
  m_table = yr_arena_get_ptr(automaton->arena, YR_AC_STATE_MATCHES_TABLE, 0);
561
562
  // The failure link for the root node points to itself.
563
113
  t_table[0] = YR_AC_MAKE_TRANSITION(0, 0);
564
565
  // Initialize the entry corresponding to the root node in the match table.
566
  // Entries in this table are the index within YR_AC_MATCH_POOL where resides
567
  // the YR_AC_MATCH structure that corresponds to the head of the matches list
568
  // for the node. The indexes start counting at 1, the zero is used for
569
  // indicating that the node has no associated matches.
570
113
  if (!YR_ARENA_IS_NULL_REF(root_state->matches_ref))
571
9
    m_table[0] = root_state->matches_ref.offset / sizeof(YR_AC_MATCH) + 1;
572
573
  // Mark the first slot in the transition table as used.
574
113
  yr_bitmask_set(automaton->bitmask, 0);
575
576
  // Index 0 is for root node. Unused indexes start at 1.
577
113
  automaton->t_table_unused_candidate = 1;
578
579
113
  child_state = root_state->first_child;
580
581
941
  while (child_state != NULL)
582
828
  {
583
    // Each state stores its slot number.
584
828
    child_state->t_table_slot = child_state->input + 1;
585
586
828
    t_table[child_state->input + 1] = YR_AC_MAKE_TRANSITION(
587
828
        0, child_state->input + 1);
588
589
828
    yr_bitmask_set(automaton->bitmask, child_state->input + 1);
590
591
828
    FAIL_ON_ERROR(_yr_ac_queue_push(&queue, child_state));
592
828
    child_state = child_state->siblings;
593
828
  }
594
595
208k
  while (!_yr_ac_queue_is_empty(&queue))
596
208k
  {
597
208k
    state = _yr_ac_queue_pop(&queue);
598
599
208k
    FAIL_ON_ERROR(_yr_ac_find_suitable_transition_table_slot(
600
208k
        automaton, automaton->arena, state, &slot));
601
602
    // _yr_ac_find_suitable_transition_table_slot can allocate more space in
603
    // both tables and cause the tables to be moved to a different memory
604
    // location, we must get their up-to-date addresses.
605
208k
    t_table = yr_arena_get_ptr(automaton->arena, YR_AC_TRANSITION_TABLE, 0);
606
208k
    m_table = yr_arena_get_ptr(automaton->arena, YR_AC_STATE_MATCHES_TABLE, 0);
607
608
208k
    t_table[state->t_table_slot] |= (slot << YR_AC_SLOT_OFFSET_BITS);
609
208k
    t_table[slot] = YR_AC_MAKE_TRANSITION(state->failure->t_table_slot, 0);
610
611
    // The match table is an array of indexes within YR_AC_MATCHES_POOL. The
612
    // N-th item in the array is the index for the YR_AC_MATCH structure that
613
    // represents the head of the matches list for state N. The indexes start
614
    // at 1, the 0 indicates that there are no matches for the state.
615
208k
    if (YR_ARENA_IS_NULL_REF(state->matches_ref))
616
59.7k
      m_table[slot] = 0;
617
148k
    else
618
148k
      m_table[slot] = state->matches_ref.offset / sizeof(YR_AC_MATCH) + 1;
619
620
208k
    state->t_table_slot = slot;
621
622
208k
    yr_bitmask_set(automaton->bitmask, slot);
623
624
    // Push children of current_state
625
208k
    child_state = state->first_child;
626
627
415k
    while (child_state != NULL)
628
207k
    {
629
207k
      child_state->t_table_slot = slot + child_state->input + 1;
630
631
207k
      t_table[child_state->t_table_slot] = YR_AC_MAKE_TRANSITION(
632
207k
          0, child_state->input + 1);
633
634
207k
      yr_bitmask_set(automaton->bitmask, child_state->t_table_slot);
635
636
207k
      FAIL_ON_ERROR(_yr_ac_queue_push(&queue, child_state));
637
638
207k
      child_state = child_state->siblings;
639
207k
    }
640
208k
  }
641
642
113
  return ERROR_SUCCESS;
643
113
}
644
645
////////////////////////////////////////////////////////////////////////////////
646
// Prints automaton state for debug purposes. This function is invoked by
647
// yr_ac_print_automaton, is not intended to be used stand-alone.
648
//
649
static void _yr_ac_print_automaton_state(
650
    YR_AC_AUTOMATON* automaton,
651
    YR_AC_STATE* state)
652
0
{
653
0
  int child_count;
654
655
0
  YR_AC_MATCH* match;
656
0
  YR_AC_STATE* child_state;
657
658
0
  for (int i = 0; i < state->depth; i++) printf(" ");
659
660
0
  child_state = state->first_child;
661
0
  child_count = 0;
662
663
0
  while (child_state != NULL)
664
0
  {
665
0
    child_count++;
666
0
    child_state = child_state->siblings;
667
0
  }
668
669
0
  printf(
670
0
      "%p childs:%d depth:%d failure:%p",
671
0
      state,
672
0
      child_count,
673
0
      state->depth,
674
0
      state->failure);
675
676
0
  match = yr_arena_ref_to_ptr(automaton->arena, &state->matches_ref);
677
678
0
  while (match != NULL)
679
0
  {
680
0
    printf("\n");
681
682
0
    for (int i = 0; i < state->depth + 1; i++) printf(" ");
683
684
0
    printf("%s = ", match->string->identifier);
685
686
0
    if (STRING_IS_HEX(match->string))
687
0
    {
688
0
      printf("{ ");
689
690
0
      for (int i = 0; i < yr_min(match->string->length, 10); i++)
691
0
        printf("%02x ", match->string->string[i]);
692
693
0
      printf("}");
694
0
    }
695
0
    else if (STRING_IS_REGEXP(match->string))
696
0
    {
697
0
      printf("/");
698
699
0
      for (int i = 0; i < yr_min(match->string->length, 10); i++)
700
0
        printf("%c", match->string->string[i]);
701
702
0
      printf("/");
703
0
    }
704
0
    else
705
0
    {
706
0
      printf("\"");
707
708
0
      for (int i = 0; i < yr_min(match->string->length, 10); i++)
709
0
        printf("%c", match->string->string[i]);
710
711
0
      printf("\"");
712
0
    }
713
714
0
    match = match->next;
715
0
  }
716
717
0
  printf("\n");
718
719
0
  child_state = state->first_child;
720
721
0
  while (child_state != NULL)
722
0
  {
723
0
    _yr_ac_print_automaton_state(automaton, child_state);
724
0
    child_state = child_state->siblings;
725
0
  }
726
0
}
727
728
////////////////////////////////////////////////////////////////////////////////
729
// Creates a new automaton
730
//
731
int yr_ac_automaton_create(YR_ARENA* arena, YR_AC_AUTOMATON** automaton)
732
1.75k
{
733
1.75k
  YR_AC_AUTOMATON* new_automaton;
734
1.75k
  YR_AC_STATE* root_state;
735
736
1.75k
  new_automaton = (YR_AC_AUTOMATON*) yr_malloc(sizeof(YR_AC_AUTOMATON));
737
1.75k
  root_state = (YR_AC_STATE*) yr_malloc(sizeof(YR_AC_STATE));
738
739
1.75k
  if (new_automaton == NULL || root_state == NULL)
740
0
  {
741
0
    yr_free(new_automaton);
742
0
    yr_free(root_state);
743
744
0
    return ERROR_INSUFFICIENT_MEMORY;
745
0
  }
746
747
1.75k
  root_state->depth = 0;
748
1.75k
  root_state->matches_ref = YR_ARENA_NULL_REF;
749
1.75k
  root_state->failure = NULL;
750
1.75k
  root_state->first_child = NULL;
751
1.75k
  root_state->siblings = NULL;
752
1.75k
  root_state->t_table_slot = 0;
753
754
1.75k
  new_automaton->arena = arena;
755
1.75k
  new_automaton->root = root_state;
756
1.75k
  new_automaton->bitmask = NULL;
757
1.75k
  new_automaton->tables_size = 0;
758
759
1.75k
  *automaton = new_automaton;
760
761
1.75k
  return ERROR_SUCCESS;
762
1.75k
}
763
764
////////////////////////////////////////////////////////////////////////////////
765
// Destroys automaton
766
//
767
int yr_ac_automaton_destroy(YR_AC_AUTOMATON* automaton)
768
1.75k
{
769
1.75k
  _yr_ac_state_destroy(automaton->root);
770
771
1.75k
  yr_free(automaton->bitmask);
772
1.75k
  yr_free(automaton);
773
774
1.75k
  return ERROR_SUCCESS;
775
1.75k
}
776
777
////////////////////////////////////////////////////////////////////////////////
778
// Adds a string to the automaton. This function is invoked once for each
779
// string defined in the rules.
780
//
781
int yr_ac_add_string(
782
    YR_AC_AUTOMATON* automaton,
783
    YR_STRING* string,
784
    uint32_t string_idx,
785
    YR_ATOM_LIST_ITEM* atom,
786
    YR_ARENA* arena)
787
39.6k
{
788
1.92M
  while (atom != NULL)
789
1.88M
  {
790
1.88M
    YR_AC_STATE* state = automaton->root;
791
792
9.15M
    for (int i = 0; i < atom->atom.length; i++)
793
7.26M
    {
794
7.26M
      YR_AC_STATE* next_state = _yr_ac_next_state(state, atom->atom.bytes[i]);
795
796
7.26M
      if (next_state == NULL)
797
370k
      {
798
370k
        next_state = _yr_ac_state_create(state, atom->atom.bytes[i]);
799
800
370k
        if (next_state == NULL)
801
0
          return ERROR_INSUFFICIENT_MEMORY;
802
370k
      }
803
804
7.26M
      state = next_state;
805
7.26M
    }
806
807
1.88M
    YR_ARENA_REF new_match_ref;
808
809
1.88M
    FAIL_ON_ERROR(yr_arena_allocate_struct(
810
1.88M
        arena,
811
1.88M
        YR_AC_STATE_MATCHES_POOL,
812
1.88M
        sizeof(YR_AC_MATCH),
813
1.88M
        &new_match_ref,
814
1.88M
        offsetof(YR_AC_MATCH, string),
815
1.88M
        offsetof(YR_AC_MATCH, forward_code),
816
1.88M
        offsetof(YR_AC_MATCH, backward_code),
817
1.88M
        offsetof(YR_AC_MATCH, next),
818
1.88M
        EOL));
819
820
1.88M
    YR_AC_MATCH* new_match = yr_arena_ref_to_ptr(arena, &new_match_ref);
821
822
1.88M
    new_match->backtrack = state->depth + atom->backtrack;
823
1.88M
    new_match->string = yr_arena_get_ptr(
824
1.88M
        arena, YR_STRINGS_TABLE, string_idx * sizeof(struct YR_STRING));
825
826
1.88M
    new_match->forward_code = yr_arena_ref_to_ptr(
827
1.88M
        arena, &atom->forward_code_ref);
828
829
1.88M
    new_match->backward_code = yr_arena_ref_to_ptr(
830
1.88M
        arena, &atom->backward_code_ref);
831
832
    // Add newly created match to the list of matches for the state.
833
1.88M
    new_match->next = yr_arena_ref_to_ptr(arena, &state->matches_ref);
834
1.88M
    state->matches_ref = new_match_ref;
835
836
1.88M
    atom = atom->next;
837
1.88M
  }
838
839
39.6k
  return ERROR_SUCCESS;
840
39.6k
}
841
842
////////////////////////////////////////////////////////////////////////////////
843
// Compiles the Aho-Corasick automaton, the resulting data structures are
844
// are written in the provided arena.
845
//
846
int yr_ac_compile(YR_AC_AUTOMATON* automaton, YR_ARENA* arena)
847
113
{
848
113
  FAIL_ON_ERROR(_yr_ac_create_failure_links(automaton));
849
113
  FAIL_ON_ERROR(_yr_ac_optimize_failure_links(automaton));
850
113
  FAIL_ON_ERROR(_yr_ac_build_transition_table(automaton));
851
852
113
  return ERROR_SUCCESS;
853
113
}
854
855
////////////////////////////////////////////////////////////////////////////////
856
// Prints automaton for debug purposes.
857
//
858
void yr_ac_print_automaton(YR_AC_AUTOMATON* automaton)
859
0
{
860
0
  printf("-------------------------------------------------------\n");
861
0
  _yr_ac_print_automaton_state(automaton, automaton->root);
862
0
  printf("-------------------------------------------------------\n");
863
0
}