Coverage Report

Created: 2025-07-12 06:22

/src/yara/libyara/ahocorasick.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2013. The YARA Authors. All Rights Reserved.
3
4
Redistribution and use in source and binary forms, with or without modification,
5
are permitted provided that the following conditions are met:
6
7
1. Redistributions of source code must retain the above copyright notice, this
8
list of conditions and the following disclaimer.
9
10
2. Redistributions in binary form must reproduce the above copyright notice,
11
this list of conditions and the following disclaimer in the documentation and/or
12
other materials provided with the distribution.
13
14
3. Neither the name of the copyright holder nor the names of its contributors
15
may be used to endorse or promote products derived from this software without
16
specific prior written permission.
17
18
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
*/
29
30
#include <assert.h>
31
#include <stddef.h>
32
#include <string.h>
33
#include <yara/ahocorasick.h>
34
#include <yara/arena.h>
35
#include <yara/compiler.h>
36
#include <yara/error.h>
37
#include <yara/mem.h>
38
#include <yara/utils.h>
39
40
typedef struct _QUEUE_NODE
41
{
42
  YR_AC_STATE* value;
43
44
  struct _QUEUE_NODE* previous;
45
  struct _QUEUE_NODE* next;
46
47
} QUEUE_NODE;
48
49
typedef struct _QUEUE
50
{
51
  QUEUE_NODE* head;
52
  QUEUE_NODE* tail;
53
54
} QUEUE;
55
56
////////////////////////////////////////////////////////////////////////////////
57
// Pushes an automaton state into the tail of a queue.
58
//
59
// Args:
60
//   queue: Pointer to the queue.
61
//   state: Pointer to the state being pushed into the queue.
62
//
63
// Returns:
64
//   ERROR_SUCCESS
65
//   ERROR_INSUFFICIENT_MEMORY
66
//
67
static int _yr_ac_queue_push(QUEUE* queue, YR_AC_STATE* state)
68
1.06M
{
69
1.06M
  QUEUE_NODE* pushed_node;
70
71
1.06M
  pushed_node = (QUEUE_NODE*) yr_malloc(sizeof(QUEUE_NODE));
72
73
1.06M
  if (pushed_node == NULL)
74
0
    return ERROR_INSUFFICIENT_MEMORY;
75
76
1.06M
  pushed_node->previous = queue->tail;
77
1.06M
  pushed_node->next = NULL;
78
1.06M
  pushed_node->value = state;
79
80
1.06M
  if (queue->tail != NULL)
81
1.06M
    queue->tail->next = pushed_node;
82
171
  else  // queue is empty
83
171
    queue->head = pushed_node;
84
85
1.06M
  queue->tail = pushed_node;
86
87
1.06M
  return ERROR_SUCCESS;
88
1.06M
}
89
90
////////////////////////////////////////////////////////////////////////////////
91
// Pops an automaton state from the head of a queue.
92
//
93
// Args:
94
//   queue: Pointer to the queue.
95
//
96
// Returns:
97
//   Pointer to the poped state.
98
//
99
static YR_AC_STATE* _yr_ac_queue_pop(QUEUE* queue)
100
1.06M
{
101
1.06M
  YR_AC_STATE* result;
102
1.06M
  QUEUE_NODE* popped_node;
103
104
1.06M
  if (queue->head == NULL)
105
0
    return NULL;
106
107
1.06M
  popped_node = queue->head;
108
1.06M
  queue->head = popped_node->next;
109
110
1.06M
  if (queue->head)
111
1.06M
    queue->head->previous = NULL;
112
171
  else  // queue is empty
113
171
    queue->tail = NULL;
114
115
1.06M
  result = popped_node->value;
116
117
1.06M
  yr_free(popped_node);
118
1.06M
  return result;
119
1.06M
}
120
121
////////////////////////////////////////////////////////////////////////////////
122
// Checks if a queue is empty.
123
//
124
// Args:
125
//   queue: Pointer to the queue.
126
//
127
// Returns:
128
//   true if queue is empty, false otherwise.
129
//
130
static int _yr_ac_queue_is_empty(QUEUE* queue)
131
1.06M
{
132
1.06M
  return queue->head == NULL;
133
1.06M
}
134
135
////////////////////////////////////////////////////////////////////////////////
136
// Given an automaton state and an input symbol, returns the new state
137
// after reading the input symbol.
138
//
139
// Args:
140
//    state: Pointer to automaton state.
141
//    input: Input symbol.
142
//
143
// Returns:
144
//   Pointer to the next automaton state.
145
//
146
static YR_AC_STATE* _yr_ac_next_state(YR_AC_STATE* state, uint8_t input)
147
14.4M
{
148
14.4M
  YR_AC_STATE* next_state = state->first_child;
149
150
551M
  while (next_state != NULL)
151
550M
  {
152
550M
    if (next_state->input == input)
153
13.6M
      return next_state;
154
155
537M
    next_state = next_state->siblings;
156
537M
  }
157
158
783k
  return NULL;
159
14.4M
}
160
161
////////////////////////////////////////////////////////////////////////////////
162
// Creates a new automaton state, the automaton will transition from
163
// the given state to the new state after reading the input symbol.
164
//
165
// Args:
166
//   state: Pointer to the origin state.
167
//   input: Input symbol.
168
//
169
// Returns:
170
//   YR_AC_STATE* pointer to the newly allocated state or NULL in case
171
//   of error.
172
//
173
static YR_AC_STATE* _yr_ac_state_create(YR_AC_STATE* state, uint8_t input)
174
627k
{
175
627k
  YR_AC_STATE* new_state = (YR_AC_STATE*) yr_malloc(sizeof(YR_AC_STATE));
176
177
627k
  if (new_state == NULL)
178
0
    return NULL;
179
180
627k
  new_state->input = input;
181
627k
  new_state->depth = state->depth + 1;
182
627k
  new_state->matches_ref = YR_ARENA_NULL_REF;
183
627k
  new_state->failure = NULL;
184
627k
  new_state->t_table_slot = 0;
185
627k
  new_state->first_child = NULL;
186
627k
  new_state->siblings = state->first_child;
187
627k
  state->first_child = new_state;
188
189
627k
  return new_state;
190
627k
}
191
192
////////////////////////////////////////////////////////////////////////////////
193
// Destroys an automaton state.
194
//
195
static int _yr_ac_state_destroy(YR_AC_STATE* state)
196
629k
{
197
629k
  YR_AC_STATE* child_state = state->first_child;
198
199
1.25M
  while (child_state != NULL)
200
627k
  {
201
627k
    YR_AC_STATE* next_child_state = child_state->siblings;
202
627k
    _yr_ac_state_destroy(child_state);
203
627k
    child_state = next_child_state;
204
627k
  }
205
206
629k
  yr_free(state);
207
208
629k
  return ERROR_SUCCESS;
209
629k
}
210
211
////////////////////////////////////////////////////////////////////////////////
212
// Create failure links for each automaton state.
213
//
214
// This function must be called after all the strings have been added to the
215
// automaton with yr_ac_add_string.
216
//
217
static int _yr_ac_create_failure_links(YR_AC_AUTOMATON* automaton)
218
125
{
219
125
  YR_AC_STATE* current_state;
220
125
  YR_AC_STATE* failure_state;
221
125
  YR_AC_STATE* temp_state;
222
125
  YR_AC_STATE* state;
223
125
  YR_AC_STATE* transition_state;
224
125
  YR_AC_STATE* root_state;
225
125
  YR_AC_MATCH* match;
226
227
125
  QUEUE queue;
228
229
125
  queue.head = NULL;
230
125
  queue.tail = NULL;
231
232
125
  root_state = automaton->root;
233
234
  // Set the failure link of root state to itself.
235
125
  root_state->failure = root_state;
236
237
  // Push root's children and set their failure link to root.
238
125
  state = root_state->first_child;
239
240
1.96k
  while (state != NULL)
241
1.83k
  {
242
1.83k
    FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
243
1.83k
    state->failure = root_state;
244
1.83k
    state = state->siblings;
245
1.83k
  }
246
247
  // Traverse the trie in BFS order calculating the failure link
248
  // for each state.
249
353k
  while (!_yr_ac_queue_is_empty(&queue))
250
353k
  {
251
353k
    current_state = _yr_ac_queue_pop(&queue);
252
353k
    match = yr_arena_ref_to_ptr(automaton->arena, &current_state->matches_ref);
253
254
353k
    if (match != NULL)
255
243k
    {
256
      // Find the last match in the list of matches.
257
3.85M
      while (match->next != NULL) match = match->next;
258
259
243k
      if (match->backtrack > 0)
260
221k
        match->next = yr_arena_ref_to_ptr(
261
221k
            automaton->arena, &root_state->matches_ref);
262
243k
    }
263
110k
    else
264
110k
    {
265
      // This state doesn't have any matches, its matches will be those
266
      // in the root state, if any.
267
110k
      current_state->matches_ref = root_state->matches_ref;
268
110k
    }
269
270
    // Iterate over all the states that the current state can transition to.
271
353k
    transition_state = current_state->first_child;
272
273
705k
    while (transition_state != NULL)
274
351k
    {
275
351k
      FAIL_ON_ERROR(_yr_ac_queue_push(&queue, transition_state));
276
351k
      failure_state = current_state->failure;
277
278
427k
      while (1)
279
427k
      {
280
427k
        temp_state = _yr_ac_next_state(failure_state, transition_state->input);
281
282
427k
        if (temp_state != NULL)
283
270k
        {
284
270k
          transition_state->failure = temp_state;
285
286
270k
          if (YR_ARENA_IS_NULL_REF(transition_state->matches_ref))
287
62.6k
          {
288
62.6k
            transition_state->matches_ref = temp_state->matches_ref;
289
62.6k
          }
290
208k
          else
291
208k
          {
292
208k
            match = yr_arena_ref_to_ptr(
293
208k
                automaton->arena, &transition_state->matches_ref);
294
295
208k
            assert(match != NULL);
296
297
            // Find the last match in the list of matches.
298
2.72M
            while (match->next != NULL) match = match->next;
299
300
208k
            match->next = yr_arena_ref_to_ptr(
301
208k
                automaton->arena, &temp_state->matches_ref);
302
208k
          }
303
304
270k
          break;
305
270k
        }
306
156k
        else
307
156k
        {
308
156k
          if (failure_state == root_state)
309
80.9k
          {
310
80.9k
            transition_state->failure = root_state;
311
80.9k
            break;
312
80.9k
          }
313
75.7k
          else
314
75.7k
          {
315
75.7k
            failure_state = failure_state->failure;
316
75.7k
          }
317
156k
        }
318
427k
      }  // while(1)
319
320
351k
      transition_state = transition_state->siblings;
321
351k
    }
322
323
353k
  }  // while(!__yr_ac_queue_is_empty(&queue))
324
325
125
  return ERROR_SUCCESS;
326
125
}
327
328
////////////////////////////////////////////////////////////////////////////////
329
// Returns true if the transitions for state s2 are a subset of the transitions
330
// for state s1. In other words, if at state s2 input X is accepted, it must be
331
// accepted in s1 too.
332
//
333
static bool _yr_ac_transitions_subset(YR_AC_STATE* s1, YR_AC_STATE* s2)
334
270k
{
335
270k
  uint8_t set[32];
336
337
270k
  YR_AC_STATE* state = s1->first_child;
338
339
270k
  memset(set, 0, 32);
340
341
457k
  while (state != NULL)
342
186k
  {
343
186k
    set[state->input / 8] |= 1 << state->input % 8;
344
186k
    state = state->siblings;
345
186k
  }
346
347
270k
  state = s2->first_child;
348
349
347k
  while (state != NULL)
350
298k
  {
351
298k
    if (!(set[state->input / 8] & 1 << state->input % 8))
352
221k
      return false;
353
354
76.6k
    state = state->siblings;
355
76.6k
  }
356
357
49.0k
  return true;
358
270k
}
359
360
////////////////////////////////////////////////////////////////////////////////
361
// Removes unnecessary failure links.
362
//
363
static int _yr_ac_optimize_failure_links(YR_AC_AUTOMATON* automaton)
364
125
{
365
125
  QUEUE queue = {NULL, NULL};
366
367
  // Push root's children.
368
125
  YR_AC_STATE* root_state = automaton->root;
369
125
  YR_AC_STATE* state = root_state->first_child;
370
371
1.96k
  while (state != NULL)
372
1.83k
  {
373
1.83k
    FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
374
1.83k
    state = state->siblings;
375
1.83k
  }
376
377
353k
  while (!_yr_ac_queue_is_empty(&queue))
378
353k
  {
379
353k
    YR_AC_STATE* current_state = _yr_ac_queue_pop(&queue);
380
381
353k
    if (current_state->failure != root_state)
382
270k
    {
383
270k
      if (_yr_ac_transitions_subset(current_state, current_state->failure))
384
49.0k
        current_state->failure = current_state->failure->failure;
385
270k
    }
386
387
    // Push children of current_state
388
353k
    state = current_state->first_child;
389
390
705k
    while (state != NULL)
391
351k
    {
392
351k
      FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
393
351k
      state = state->siblings;
394
351k
    }
395
353k
  }
396
397
125
  return ERROR_SUCCESS;
398
125
}
399
400
////////////////////////////////////////////////////////////////////////////////
401
// Find a place within the automaton's transition table where the transitions
402
// for the given state can be put. The function first create a bitmask for the
403
// state's transition table, then searches for an offset within the automaton's
404
// bitmask where the state's bitmask can be put without bit collisions.
405
//
406
static int _yr_ac_find_suitable_transition_table_slot(
407
    YR_AC_AUTOMATON* automaton,
408
    YR_ARENA* arena,
409
    YR_AC_STATE* state,
410
    uint32_t* slot)
411
353k
{
412
  // The state's transition table has 257 entries, 1 for the failure link and
413
  // 256 for each possible input byte, so the state's bitmask has 257 bits.
414
353k
  YR_BITMASK state_bitmask[YR_BITMASK_SIZE(257)];
415
416
353k
  YR_AC_STATE* child_state = state->first_child;
417
418
  // Start with all bits set to zero.
419
353k
  yr_bitmask_clear_all(state_bitmask);
420
421
  // The first slot in the transition table is for the state's failure link,
422
  // so the first bit in the bitmask must be set to one.
423
353k
  yr_bitmask_set(state_bitmask, 0);
424
425
705k
  while (child_state != NULL)
426
351k
  {
427
351k
    yr_bitmask_set(state_bitmask, child_state->input + 1);
428
351k
    child_state = child_state->siblings;
429
351k
  }
430
431
353k
  *slot = yr_bitmask_find_non_colliding_offset(
432
353k
      automaton->bitmask,
433
353k
      state_bitmask,
434
353k
      automaton->tables_size,
435
353k
      257,
436
353k
      &automaton->t_table_unused_candidate);
437
438
  // Make sure that we are not going beyond the maximum size of the transition
439
  // table, starting at the slot found there must be at least 257 other slots
440
  // for accommodating the state's transition table.
441
353k
  assert(*slot + 257 < YR_AC_MAX_TRANSITION_TABLE_SIZE);
442
443
353k
  if (*slot > automaton->tables_size - 257)
444
2.73k
  {
445
2.73k
    FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
446
2.73k
        arena, YR_AC_TRANSITION_TABLE, 257 * sizeof(YR_AC_TRANSITION), NULL));
447
448
2.73k
    FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
449
2.73k
        arena, YR_AC_STATE_MATCHES_TABLE, 257 * sizeof(uint8_t*), NULL));
450
451
2.73k
    size_t bm_len = YR_BITMASK_SIZE(automaton->tables_size) *
452
2.73k
                    sizeof(YR_BITMASK);
453
454
2.73k
    size_t bm_len_incr = YR_BITMASK_SIZE(257) * sizeof(YR_BITMASK);
455
456
2.73k
    automaton->bitmask = yr_realloc(automaton->bitmask, bm_len + bm_len_incr);
457
458
2.73k
    if (automaton->bitmask == NULL)
459
0
      return ERROR_INSUFFICIENT_MEMORY;
460
461
2.73k
    memset((uint8_t*) automaton->bitmask + bm_len, 0, bm_len_incr);
462
463
2.73k
    automaton->tables_size += 257;
464
2.73k
  }
465
466
353k
  return ERROR_SUCCESS;
467
353k
}
468
469
////////////////////////////////////////////////////////////////////////////////
470
// Builds the transition table for the automaton. The transition table (T) is a
471
// large array of 32-bits integers. Each state in the automaton is represented
472
// by an index S within the array. The integer stored in T[S] is the failure
473
// link for state S, it contains the index of the next state when no valid
474
// transition exists for the next input byte.
475
//
476
// At position T[S+1+B] (where B is a byte) we can find the transition (if any)
477
// that must be followed from state S if the next input is B. The value in
478
// T[S+1+B] contains the index for next state or zero. A zero value means that
479
// no valid transition exists from state S when next input is B, and the failure
480
// link must be used instead.
481
//
482
// The transition table for state S starts at T[S] and spans the next 257
483
// slots in the array (1 for the failure link and 256 for all the possible
484
// transitions). But many of those slots are for invalid transitions, so
485
// the transitions for multiple states can be interleaved as long as they don't
486
// collide. For example, instead of having this transition table with state S1
487
// and S2 separated by a large number of slots:
488
//
489
// S1                                             S2
490
// +------+------+------+------+--   ~   --+------+------+------+--   ~   --+
491
// | FLS1 |   X  |   -  |   -  |     -     |  Y   | FLS2 |   Z  |     -     |
492
// +------+------+------+------+--   ~   --+------+------+------+--   ~   --+
493
//
494
// We can interleave the transitions for states S1 and S2 and get this other
495
// transition table, which is more compact:
496
//
497
// S1            S2
498
// +------+------+------+------+--   ~   --+------+
499
// | FLS1 |  X   | FLS2 |   Z  |     -     |  Y   |
500
// +------+------+------+------+--   ~   --+------+
501
//
502
// And how do we know that transition Z belongs to state S2 and not S1? Or that
503
// transition Y belongs to S1 and not S2? Because each slot of the array not
504
// only contains the index for the state where the transition points to, it
505
// also contains the offset of the transition relative to its owner state. So,
506
// the value for the owner offset would be 1 for transitions X, because X
507
// belongs to state S1 and it's located 1 position away from S1. The same occurs
508
// for Z, it belongs to S2 and it's located one position away from S2 so its
509
// owner offset is 1. If we are in S1 and next byte is 2, we are going to read
510
// the transition at T[S1+1+2] which is Z. But we know that transition Z is not
511
// a valid transition for state S1 because the owner offset for Z is 1 not 3.
512
//
513
// Each 32-bit slot in the transition table has 23 bits for storing the index
514
// of the target state and 9 bits for storing the offset of the slot relative
515
// to its own state. The offset can be any value from 0 to 256, both inclusive,
516
// hence 9 bits are required for it. The layout for the slot goes like:
517
//
518
// 32                      23        0
519
// +-----------------------+---------+
520
// | Target state's index  |  Offset |
521
// +-----------------------+---------+
522
//
523
// A more detailed description can be found in: http://goo.gl/lE6zG
524
//
525
static int _yr_ac_build_transition_table(YR_AC_AUTOMATON* automaton)
526
125
{
527
125
  YR_AC_TRANSITION* t_table;
528
125
  uint32_t* m_table;
529
125
  YR_AC_STATE* state;
530
125
  YR_AC_STATE* child_state;
531
125
  YR_AC_STATE* root_state = automaton->root;
532
533
125
  uint32_t slot;
534
535
125
  QUEUE queue = {NULL, NULL};
536
537
  // Both t_table and m_table have 512 slots initially, which is enough for the
538
  // root node's transition table.
539
125
  automaton->tables_size = 512;
540
541
125
  automaton->bitmask = yr_calloc(
542
125
      YR_BITMASK_SIZE(automaton->tables_size), sizeof(YR_BITMASK));
543
544
125
  if (automaton->bitmask == NULL)
545
0
    return ERROR_INSUFFICIENT_MEMORY;
546
547
125
  FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
548
125
      automaton->arena,
549
125
      YR_AC_TRANSITION_TABLE,
550
125
      automaton->tables_size * sizeof(YR_AC_TRANSITION),
551
125
      NULL));
552
553
125
  FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
554
125
      automaton->arena,
555
125
      YR_AC_STATE_MATCHES_TABLE,
556
125
      automaton->tables_size * sizeof(uint32_t),
557
125
      NULL));
558
559
125
  t_table = yr_arena_get_ptr(automaton->arena, YR_AC_TRANSITION_TABLE, 0);
560
125
  m_table = yr_arena_get_ptr(automaton->arena, YR_AC_STATE_MATCHES_TABLE, 0);
561
562
  // The failure link for the root node points to itself.
563
125
  t_table[0] = YR_AC_MAKE_TRANSITION(0, 0);
564
565
  // Initialize the entry corresponding to the root node in the match table.
566
  // Entries in this table are the index within YR_AC_MATCH_POOL where resides
567
  // the YR_AC_MATCH structure that corresponds to the head of the matches list
568
  // for the node. The indexes start counting at 1, the zero is used for
569
  // indicating that the node has no associated matches.
570
125
  if (!YR_ARENA_IS_NULL_REF(root_state->matches_ref))
571
12
    m_table[0] = root_state->matches_ref.offset / sizeof(YR_AC_MATCH) + 1;
572
573
  // Mark the first slot in the transition table as used.
574
125
  yr_bitmask_set(automaton->bitmask, 0);
575
576
  // Index 0 is for root node. Unused indexes start at 1.
577
125
  automaton->t_table_unused_candidate = 1;
578
579
125
  child_state = root_state->first_child;
580
581
1.96k
  while (child_state != NULL)
582
1.83k
  {
583
    // Each state stores its slot number.
584
1.83k
    child_state->t_table_slot = child_state->input + 1;
585
586
1.83k
    t_table[child_state->input + 1] = YR_AC_MAKE_TRANSITION(
587
1.83k
        0, child_state->input + 1);
588
589
1.83k
    yr_bitmask_set(automaton->bitmask, child_state->input + 1);
590
591
1.83k
    FAIL_ON_ERROR(_yr_ac_queue_push(&queue, child_state));
592
1.83k
    child_state = child_state->siblings;
593
1.83k
  }
594
595
353k
  while (!_yr_ac_queue_is_empty(&queue))
596
353k
  {
597
353k
    state = _yr_ac_queue_pop(&queue);
598
599
353k
    FAIL_ON_ERROR(_yr_ac_find_suitable_transition_table_slot(
600
353k
        automaton, automaton->arena, state, &slot));
601
602
    // _yr_ac_find_suitable_transition_table_slot can allocate more space in
603
    // both tables and cause the tables to be moved to a different memory
604
    // location, we must get their up-to-date addresses.
605
353k
    t_table = yr_arena_get_ptr(automaton->arena, YR_AC_TRANSITION_TABLE, 0);
606
353k
    m_table = yr_arena_get_ptr(automaton->arena, YR_AC_STATE_MATCHES_TABLE, 0);
607
608
353k
    t_table[state->t_table_slot] |= (slot << YR_AC_SLOT_OFFSET_BITS);
609
353k
    t_table[slot] = YR_AC_MAKE_TRANSITION(state->failure->t_table_slot, 0);
610
611
    // The match table is an array of indexes within YR_AC_MATCHES_POOL. The
612
    // N-th item in the array is the index for the YR_AC_MATCH structure that
613
    // represents the head of the matches list for state N. The indexes start
614
    // at 1, the 0 indicates that there are no matches for the state.
615
353k
    if (YR_ARENA_IS_NULL_REF(state->matches_ref))
616
100k
      m_table[slot] = 0;
617
253k
    else
618
253k
      m_table[slot] = state->matches_ref.offset / sizeof(YR_AC_MATCH) + 1;
619
620
353k
    state->t_table_slot = slot;
621
622
353k
    yr_bitmask_set(automaton->bitmask, slot);
623
624
    // Push children of current_state
625
353k
    child_state = state->first_child;
626
627
705k
    while (child_state != NULL)
628
351k
    {
629
351k
      child_state->t_table_slot = slot + child_state->input + 1;
630
631
351k
      t_table[child_state->t_table_slot] = YR_AC_MAKE_TRANSITION(
632
351k
          0, child_state->input + 1);
633
634
351k
      yr_bitmask_set(automaton->bitmask, child_state->t_table_slot);
635
636
351k
      FAIL_ON_ERROR(_yr_ac_queue_push(&queue, child_state));
637
638
351k
      child_state = child_state->siblings;
639
351k
    }
640
353k
  }
641
642
125
  return ERROR_SUCCESS;
643
125
}
644
645
////////////////////////////////////////////////////////////////////////////////
646
// Prints automaton state for debug purposes. This function is invoked by
647
// yr_ac_print_automaton, is not intended to be used stand-alone.
648
//
649
static void _yr_ac_print_automaton_state(
650
    YR_AC_AUTOMATON* automaton,
651
    YR_AC_STATE* state)
652
0
{
653
0
  int child_count;
654
655
0
  YR_AC_MATCH* match;
656
0
  YR_AC_STATE* child_state;
657
658
0
  for (int i = 0; i < state->depth; i++) printf(" ");
659
660
0
  child_state = state->first_child;
661
0
  child_count = 0;
662
663
0
  while (child_state != NULL)
664
0
  {
665
0
    child_count++;
666
0
    child_state = child_state->siblings;
667
0
  }
668
669
0
  printf(
670
0
      "%p childs:%d depth:%d failure:%p",
671
0
      state,
672
0
      child_count,
673
0
      state->depth,
674
0
      state->failure);
675
676
0
  match = yr_arena_ref_to_ptr(automaton->arena, &state->matches_ref);
677
678
0
  while (match != NULL)
679
0
  {
680
0
    printf("\n");
681
682
0
    for (int i = 0; i < state->depth + 1; i++) printf(" ");
683
684
0
    printf("%s = ", match->string->identifier);
685
686
0
    if (STRING_IS_HEX(match->string))
687
0
    {
688
0
      printf("{ ");
689
690
0
      for (int i = 0; i < yr_min(match->string->length, 10); i++)
691
0
        printf("%02x ", match->string->string[i]);
692
693
0
      printf("}");
694
0
    }
695
0
    else if (STRING_IS_REGEXP(match->string))
696
0
    {
697
0
      printf("/");
698
699
0
      for (int i = 0; i < yr_min(match->string->length, 10); i++)
700
0
        printf("%c", match->string->string[i]);
701
702
0
      printf("/");
703
0
    }
704
0
    else
705
0
    {
706
0
      printf("\"");
707
708
0
      for (int i = 0; i < yr_min(match->string->length, 10); i++)
709
0
        printf("%c", match->string->string[i]);
710
711
0
      printf("\"");
712
0
    }
713
714
0
    match = match->next;
715
0
  }
716
717
0
  printf("\n");
718
719
0
  child_state = state->first_child;
720
721
0
  while (child_state != NULL)
722
0
  {
723
0
    _yr_ac_print_automaton_state(automaton, child_state);
724
0
    child_state = child_state->siblings;
725
0
  }
726
0
}
727
728
////////////////////////////////////////////////////////////////////////////////
729
// Creates a new automaton
730
//
731
int yr_ac_automaton_create(YR_ARENA* arena, YR_AC_AUTOMATON** automaton)
732
1.88k
{
733
1.88k
  YR_AC_AUTOMATON* new_automaton;
734
1.88k
  YR_AC_STATE* root_state;
735
736
1.88k
  new_automaton = (YR_AC_AUTOMATON*) yr_malloc(sizeof(YR_AC_AUTOMATON));
737
1.88k
  root_state = (YR_AC_STATE*) yr_malloc(sizeof(YR_AC_STATE));
738
739
1.88k
  if (new_automaton == NULL || root_state == NULL)
740
0
  {
741
0
    yr_free(new_automaton);
742
0
    yr_free(root_state);
743
744
0
    return ERROR_INSUFFICIENT_MEMORY;
745
0
  }
746
747
1.88k
  root_state->depth = 0;
748
1.88k
  root_state->matches_ref = YR_ARENA_NULL_REF;
749
1.88k
  root_state->failure = NULL;
750
1.88k
  root_state->first_child = NULL;
751
1.88k
  root_state->siblings = NULL;
752
1.88k
  root_state->t_table_slot = 0;
753
754
1.88k
  new_automaton->arena = arena;
755
1.88k
  new_automaton->root = root_state;
756
1.88k
  new_automaton->bitmask = NULL;
757
1.88k
  new_automaton->tables_size = 0;
758
759
1.88k
  *automaton = new_automaton;
760
761
1.88k
  return ERROR_SUCCESS;
762
1.88k
}
763
764
////////////////////////////////////////////////////////////////////////////////
765
// Destroys automaton
766
//
767
int yr_ac_automaton_destroy(YR_AC_AUTOMATON* automaton)
768
1.88k
{
769
1.88k
  _yr_ac_state_destroy(automaton->root);
770
771
1.88k
  yr_free(automaton->bitmask);
772
1.88k
  yr_free(automaton);
773
774
1.88k
  return ERROR_SUCCESS;
775
1.88k
}
776
777
////////////////////////////////////////////////////////////////////////////////
778
// Adds a string to the automaton. This function is invoked once for each
779
// string defined in the rules.
780
//
781
int yr_ac_add_string(
782
    YR_AC_AUTOMATON* automaton,
783
    YR_STRING* string,
784
    uint32_t string_idx,
785
    YR_ATOM_LIST_ITEM* atom,
786
    YR_ARENA* arena)
787
33.6k
{
788
3.61M
  while (atom != NULL)
789
3.58M
  {
790
3.58M
    YR_AC_STATE* state = automaton->root;
791
792
17.5M
    for (int i = 0; i < atom->atom.length; i++)
793
13.9M
    {
794
13.9M
      YR_AC_STATE* next_state = _yr_ac_next_state(state, atom->atom.bytes[i]);
795
796
13.9M
      if (next_state == NULL)
797
627k
      {
798
627k
        next_state = _yr_ac_state_create(state, atom->atom.bytes[i]);
799
800
627k
        if (next_state == NULL)
801
0
          return ERROR_INSUFFICIENT_MEMORY;
802
627k
      }
803
804
13.9M
      state = next_state;
805
13.9M
    }
806
807
3.58M
    YR_ARENA_REF new_match_ref;
808
809
3.58M
    FAIL_ON_ERROR(yr_arena_allocate_struct(
810
3.58M
        arena,
811
3.58M
        YR_AC_STATE_MATCHES_POOL,
812
3.58M
        sizeof(YR_AC_MATCH),
813
3.58M
        &new_match_ref,
814
3.58M
        offsetof(YR_AC_MATCH, string),
815
3.58M
        offsetof(YR_AC_MATCH, forward_code),
816
3.58M
        offsetof(YR_AC_MATCH, backward_code),
817
3.58M
        offsetof(YR_AC_MATCH, next),
818
3.58M
        EOL));
819
820
3.58M
    YR_AC_MATCH* new_match = yr_arena_ref_to_ptr(arena, &new_match_ref);
821
822
3.58M
    new_match->backtrack = state->depth + atom->backtrack;
823
3.58M
    new_match->string = yr_arena_get_ptr(
824
3.58M
        arena, YR_STRINGS_TABLE, string_idx * sizeof(struct YR_STRING));
825
826
3.58M
    new_match->forward_code = yr_arena_ref_to_ptr(
827
3.58M
        arena, &atom->forward_code_ref);
828
829
3.58M
    new_match->backward_code = yr_arena_ref_to_ptr(
830
3.58M
        arena, &atom->backward_code_ref);
831
832
    // Add newly created match to the list of matches for the state.
833
3.58M
    new_match->next = yr_arena_ref_to_ptr(arena, &state->matches_ref);
834
3.58M
    state->matches_ref = new_match_ref;
835
836
3.58M
    atom = atom->next;
837
3.58M
  }
838
839
33.6k
  return ERROR_SUCCESS;
840
33.6k
}
841
842
////////////////////////////////////////////////////////////////////////////////
843
// Compiles the Aho-Corasick automaton, the resulting data structures are
844
// are written in the provided arena.
845
//
846
int yr_ac_compile(YR_AC_AUTOMATON* automaton, YR_ARENA* arena)
847
125
{
848
125
  FAIL_ON_ERROR(_yr_ac_create_failure_links(automaton));
849
125
  FAIL_ON_ERROR(_yr_ac_optimize_failure_links(automaton));
850
125
  FAIL_ON_ERROR(_yr_ac_build_transition_table(automaton));
851
852
125
  return ERROR_SUCCESS;
853
125
}
854
855
////////////////////////////////////////////////////////////////////////////////
856
// Prints automaton for debug purposes.
857
//
858
void yr_ac_print_automaton(YR_AC_AUTOMATON* automaton)
859
0
{
860
0
  printf("-------------------------------------------------------\n");
861
0
  _yr_ac_print_automaton_state(automaton, automaton->root);
862
0
  printf("-------------------------------------------------------\n");
863
0
}