Coverage Report

Created: 2025-08-26 06:33

/src/yara/libyara/ahocorasick.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2013. The YARA Authors. All Rights Reserved.
3
4
Redistribution and use in source and binary forms, with or without modification,
5
are permitted provided that the following conditions are met:
6
7
1. Redistributions of source code must retain the above copyright notice, this
8
list of conditions and the following disclaimer.
9
10
2. Redistributions in binary form must reproduce the above copyright notice,
11
this list of conditions and the following disclaimer in the documentation and/or
12
other materials provided with the distribution.
13
14
3. Neither the name of the copyright holder nor the names of its contributors
15
may be used to endorse or promote products derived from this software without
16
specific prior written permission.
17
18
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
*/
29
30
#include <assert.h>
31
#include <stddef.h>
32
#include <string.h>
33
#include <yara/ahocorasick.h>
34
#include <yara/arena.h>
35
#include <yara/compiler.h>
36
#include <yara/error.h>
37
#include <yara/mem.h>
38
#include <yara/utils.h>
39
40
typedef struct _QUEUE_NODE
41
{
42
  YR_AC_STATE* value;
43
44
  struct _QUEUE_NODE* previous;
45
  struct _QUEUE_NODE* next;
46
47
} QUEUE_NODE;
48
49
typedef struct _QUEUE
50
{
51
  QUEUE_NODE* head;
52
  QUEUE_NODE* tail;
53
54
} QUEUE;
55
56
////////////////////////////////////////////////////////////////////////////////
57
// Pushes an automaton state into the tail of a queue.
58
//
59
// Args:
60
//   queue: Pointer to the queue.
61
//   state: Pointer to the state being pushed into the queue.
62
//
63
// Returns:
64
//   ERROR_SUCCESS
65
//   ERROR_INSUFFICIENT_MEMORY
66
//
67
static int _yr_ac_queue_push(QUEUE* queue, YR_AC_STATE* state)
68
664k
{
69
664k
  QUEUE_NODE* pushed_node;
70
71
664k
  pushed_node = (QUEUE_NODE*) yr_malloc(sizeof(QUEUE_NODE));
72
73
664k
  if (pushed_node == NULL)
74
0
    return ERROR_INSUFFICIENT_MEMORY;
75
76
664k
  pushed_node->previous = queue->tail;
77
664k
  pushed_node->next = NULL;
78
664k
  pushed_node->value = state;
79
80
664k
  if (queue->tail != NULL)
81
664k
    queue->tail->next = pushed_node;
82
111
  else  // queue is empty
83
111
    queue->head = pushed_node;
84
85
664k
  queue->tail = pushed_node;
86
87
664k
  return ERROR_SUCCESS;
88
664k
}
89
90
////////////////////////////////////////////////////////////////////////////////
91
// Pops an automaton state from the head of a queue.
92
//
93
// Args:
94
//   queue: Pointer to the queue.
95
//
96
// Returns:
97
//   Pointer to the poped state.
98
//
99
static YR_AC_STATE* _yr_ac_queue_pop(QUEUE* queue)
100
664k
{
101
664k
  YR_AC_STATE* result;
102
664k
  QUEUE_NODE* popped_node;
103
104
664k
  if (queue->head == NULL)
105
0
    return NULL;
106
107
664k
  popped_node = queue->head;
108
664k
  queue->head = popped_node->next;
109
110
664k
  if (queue->head)
111
664k
    queue->head->previous = NULL;
112
111
  else  // queue is empty
113
111
    queue->tail = NULL;
114
115
664k
  result = popped_node->value;
116
117
664k
  yr_free(popped_node);
118
664k
  return result;
119
664k
}
120
121
////////////////////////////////////////////////////////////////////////////////
122
// Checks if a queue is empty.
123
//
124
// Args:
125
//   queue: Pointer to the queue.
126
//
127
// Returns:
128
//   true if queue is empty, false otherwise.
129
//
130
static int _yr_ac_queue_is_empty(QUEUE* queue)
131
664k
{
132
664k
  return queue->head == NULL;
133
664k
}
134
135
////////////////////////////////////////////////////////////////////////////////
136
// Given an automaton state and an input symbol, returns the new state
137
// after reading the input symbol.
138
//
139
// Args:
140
//    state: Pointer to automaton state.
141
//    input: Input symbol.
142
//
143
// Returns:
144
//   Pointer to the next automaton state.
145
//
146
static YR_AC_STATE* _yr_ac_next_state(YR_AC_STATE* state, uint8_t input)
147
5.68M
{
148
5.68M
  YR_AC_STATE* next_state = state->first_child;
149
150
212M
  while (next_state != NULL)
151
212M
  {
152
212M
    if (next_state->input == input)
153
5.13M
      return next_state;
154
155
207M
    next_state = next_state->siblings;
156
207M
  }
157
158
546k
  return NULL;
159
5.68M
}
160
161
////////////////////////////////////////////////////////////////////////////////
162
// Creates a new automaton state, the automaton will transition from
163
// the given state to the new state after reading the input symbol.
164
//
165
// Args:
166
//   state: Pointer to the origin state.
167
//   input: Input symbol.
168
//
169
// Returns:
170
//   YR_AC_STATE* pointer to the newly allocated state or NULL in case
171
//   of error.
172
//
173
static YR_AC_STATE* _yr_ac_state_create(YR_AC_STATE* state, uint8_t input)
174
441k
{
175
441k
  YR_AC_STATE* new_state = (YR_AC_STATE*) yr_malloc(sizeof(YR_AC_STATE));
176
177
441k
  if (new_state == NULL)
178
0
    return NULL;
179
180
441k
  new_state->input = input;
181
441k
  new_state->depth = state->depth + 1;
182
441k
  new_state->matches_ref = YR_ARENA_NULL_REF;
183
441k
  new_state->failure = NULL;
184
441k
  new_state->t_table_slot = 0;
185
441k
  new_state->first_child = NULL;
186
441k
  new_state->siblings = state->first_child;
187
441k
  state->first_child = new_state;
188
189
441k
  return new_state;
190
441k
}
191
192
////////////////////////////////////////////////////////////////////////////////
193
// Destroys an automaton state.
194
//
195
static int _yr_ac_state_destroy(YR_AC_STATE* state)
196
442k
{
197
442k
  YR_AC_STATE* child_state = state->first_child;
198
199
884k
  while (child_state != NULL)
200
441k
  {
201
441k
    YR_AC_STATE* next_child_state = child_state->siblings;
202
441k
    _yr_ac_state_destroy(child_state);
203
441k
    child_state = next_child_state;
204
441k
  }
205
206
442k
  yr_free(state);
207
208
442k
  return ERROR_SUCCESS;
209
442k
}
210
211
////////////////////////////////////////////////////////////////////////////////
212
// Create failure links for each automaton state.
213
//
214
// This function must be called after all the strings have been added to the
215
// automaton with yr_ac_add_string.
216
//
217
static int _yr_ac_create_failure_links(YR_AC_AUTOMATON* automaton)
218
112
{
219
112
  YR_AC_STATE* current_state;
220
112
  YR_AC_STATE* failure_state;
221
112
  YR_AC_STATE* temp_state;
222
112
  YR_AC_STATE* state;
223
112
  YR_AC_STATE* transition_state;
224
112
  YR_AC_STATE* root_state;
225
112
  YR_AC_MATCH* match;
226
227
112
  QUEUE queue;
228
229
112
  queue.head = NULL;
230
112
  queue.tail = NULL;
231
232
112
  root_state = automaton->root;
233
234
  // Set the failure link of root state to itself.
235
112
  root_state->failure = root_state;
236
237
  // Push root's children and set their failure link to root.
238
112
  state = root_state->first_child;
239
240
1.19k
  while (state != NULL)
241
1.08k
  {
242
1.08k
    FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
243
1.08k
    state->failure = root_state;
244
1.08k
    state = state->siblings;
245
1.08k
  }
246
247
  // Traverse the trie in BFS order calculating the failure link
248
  // for each state.
249
221k
  while (!_yr_ac_queue_is_empty(&queue))
250
221k
  {
251
221k
    current_state = _yr_ac_queue_pop(&queue);
252
221k
    match = yr_arena_ref_to_ptr(automaton->arena, &current_state->matches_ref);
253
254
221k
    if (match != NULL)
255
147k
    {
256
      // Find the last match in the list of matches.
257
1.51M
      while (match->next != NULL) match = match->next;
258
259
147k
      if (match->backtrack > 0)
260
132k
        match->next = yr_arena_ref_to_ptr(
261
132k
            automaton->arena, &root_state->matches_ref);
262
147k
    }
263
73.4k
    else
264
73.4k
    {
265
      // This state doesn't have any matches, its matches will be those
266
      // in the root state, if any.
267
73.4k
      current_state->matches_ref = root_state->matches_ref;
268
73.4k
    }
269
270
    // Iterate over all the states that the current state can transition to.
271
221k
    transition_state = current_state->first_child;
272
273
441k
    while (transition_state != NULL)
274
220k
    {
275
220k
      FAIL_ON_ERROR(_yr_ac_queue_push(&queue, transition_state));
276
220k
      failure_state = current_state->failure;
277
278
276k
      while (1)
279
276k
      {
280
276k
        temp_state = _yr_ac_next_state(failure_state, transition_state->input);
281
282
276k
        if (temp_state != NULL)
283
171k
        {
284
171k
          transition_state->failure = temp_state;
285
286
171k
          if (YR_ARENA_IS_NULL_REF(transition_state->matches_ref))
287
49.2k
          {
288
49.2k
            transition_state->matches_ref = temp_state->matches_ref;
289
49.2k
          }
290
122k
          else
291
122k
          {
292
122k
            match = yr_arena_ref_to_ptr(
293
122k
                automaton->arena, &transition_state->matches_ref);
294
295
122k
            assert(match != NULL);
296
297
            // Find the last match in the list of matches.
298
865k
            while (match->next != NULL) match = match->next;
299
300
122k
            match->next = yr_arena_ref_to_ptr(
301
122k
                automaton->arena, &temp_state->matches_ref);
302
122k
          }
303
304
171k
          break;
305
171k
        }
306
105k
        else
307
105k
        {
308
105k
          if (failure_state == root_state)
309
48.8k
          {
310
48.8k
            transition_state->failure = root_state;
311
48.8k
            break;
312
48.8k
          }
313
56.5k
          else
314
56.5k
          {
315
56.5k
            failure_state = failure_state->failure;
316
56.5k
          }
317
105k
        }
318
276k
      }  // while(1)
319
320
220k
      transition_state = transition_state->siblings;
321
220k
    }
322
323
221k
  }  // while(!__yr_ac_queue_is_empty(&queue))
324
325
112
  return ERROR_SUCCESS;
326
112
}
327
328
////////////////////////////////////////////////////////////////////////////////
329
// Returns true if the transitions for state s2 are a subset of the transitions
330
// for state s1. In other words, if at state s2 input X is accepted, it must be
331
// accepted in s1 too.
332
//
333
static bool _yr_ac_transitions_subset(YR_AC_STATE* s1, YR_AC_STATE* s2)
334
171k
{
335
171k
  uint8_t set[32];
336
337
171k
  YR_AC_STATE* state = s1->first_child;
338
339
171k
  memset(set, 0, 32);
340
341
298k
  while (state != NULL)
342
127k
  {
343
127k
    set[state->input / 8] |= 1 << state->input % 8;
344
127k
    state = state->siblings;
345
127k
  }
346
347
171k
  state = s2->first_child;
348
349
227k
  while (state != NULL)
350
191k
  {
351
191k
    if (!(set[state->input / 8] & 1 << state->input % 8))
352
136k
      return false;
353
354
55.7k
    state = state->siblings;
355
55.7k
  }
356
357
35.4k
  return true;
358
171k
}
359
360
////////////////////////////////////////////////////////////////////////////////
361
// Removes unnecessary failure links.
362
//
363
static int _yr_ac_optimize_failure_links(YR_AC_AUTOMATON* automaton)
364
112
{
365
112
  QUEUE queue = {NULL, NULL};
366
367
  // Push root's children.
368
112
  YR_AC_STATE* root_state = automaton->root;
369
112
  YR_AC_STATE* state = root_state->first_child;
370
371
1.19k
  while (state != NULL)
372
1.08k
  {
373
1.08k
    FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
374
1.08k
    state = state->siblings;
375
1.08k
  }
376
377
221k
  while (!_yr_ac_queue_is_empty(&queue))
378
221k
  {
379
221k
    YR_AC_STATE* current_state = _yr_ac_queue_pop(&queue);
380
381
221k
    if (current_state->failure != root_state)
382
171k
    {
383
171k
      if (_yr_ac_transitions_subset(current_state, current_state->failure))
384
35.4k
        current_state->failure = current_state->failure->failure;
385
171k
    }
386
387
    // Push children of current_state
388
221k
    state = current_state->first_child;
389
390
441k
    while (state != NULL)
391
220k
    {
392
220k
      FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
393
220k
      state = state->siblings;
394
220k
    }
395
221k
  }
396
397
112
  return ERROR_SUCCESS;
398
112
}
399
400
////////////////////////////////////////////////////////////////////////////////
401
// Find a place within the automaton's transition table where the transitions
402
// for the given state can be put. The function first create a bitmask for the
403
// state's transition table, then searches for an offset within the automaton's
404
// bitmask where the state's bitmask can be put without bit collisions.
405
//
406
static int _yr_ac_find_suitable_transition_table_slot(
407
    YR_AC_AUTOMATON* automaton,
408
    YR_ARENA* arena,
409
    YR_AC_STATE* state,
410
    uint32_t* slot)
411
221k
{
412
  // The state's transition table has 257 entries, 1 for the failure link and
413
  // 256 for each possible input byte, so the state's bitmask has 257 bits.
414
221k
  YR_BITMASK state_bitmask[YR_BITMASK_SIZE(257)];
415
416
221k
  YR_AC_STATE* child_state = state->first_child;
417
418
  // Start with all bits set to zero.
419
221k
  yr_bitmask_clear_all(state_bitmask);
420
421
  // The first slot in the transition table is for the state's failure link,
422
  // so the first bit in the bitmask must be set to one.
423
221k
  yr_bitmask_set(state_bitmask, 0);
424
425
441k
  while (child_state != NULL)
426
220k
  {
427
220k
    yr_bitmask_set(state_bitmask, child_state->input + 1);
428
220k
    child_state = child_state->siblings;
429
220k
  }
430
431
221k
  *slot = yr_bitmask_find_non_colliding_offset(
432
221k
      automaton->bitmask,
433
221k
      state_bitmask,
434
221k
      automaton->tables_size,
435
221k
      257,
436
221k
      &automaton->t_table_unused_candidate);
437
438
  // Make sure that we are not going beyond the maximum size of the transition
439
  // table, starting at the slot found there must be at least 257 other slots
440
  // for accommodating the state's transition table.
441
221k
  assert(*slot + 257 < YR_AC_MAX_TRANSITION_TABLE_SIZE);
442
443
221k
  if (*slot > automaton->tables_size - 257)
444
1.70k
  {
445
1.70k
    FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
446
1.70k
        arena, YR_AC_TRANSITION_TABLE, 257 * sizeof(YR_AC_TRANSITION), NULL));
447
448
1.70k
    FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
449
1.70k
        arena, YR_AC_STATE_MATCHES_TABLE, 257 * sizeof(uint8_t*), NULL));
450
451
1.70k
    size_t bm_len = YR_BITMASK_SIZE(automaton->tables_size) *
452
1.70k
                    sizeof(YR_BITMASK);
453
454
1.70k
    size_t bm_len_incr = YR_BITMASK_SIZE(257) * sizeof(YR_BITMASK);
455
456
1.70k
    automaton->bitmask = yr_realloc(automaton->bitmask, bm_len + bm_len_incr);
457
458
1.70k
    if (automaton->bitmask == NULL)
459
0
      return ERROR_INSUFFICIENT_MEMORY;
460
461
1.70k
    memset((uint8_t*) automaton->bitmask + bm_len, 0, bm_len_incr);
462
463
1.70k
    automaton->tables_size += 257;
464
1.70k
  }
465
466
221k
  return ERROR_SUCCESS;
467
221k
}
468
469
////////////////////////////////////////////////////////////////////////////////
470
// Builds the transition table for the automaton. The transition table (T) is a
471
// large array of 32-bits integers. Each state in the automaton is represented
472
// by an index S within the array. The integer stored in T[S] is the failure
473
// link for state S, it contains the index of the next state when no valid
474
// transition exists for the next input byte.
475
//
476
// At position T[S+1+B] (where B is a byte) we can find the transition (if any)
477
// that must be followed from state S if the next input is B. The value in
478
// T[S+1+B] contains the index for next state or zero. A zero value means that
479
// no valid transition exists from state S when next input is B, and the failure
480
// link must be used instead.
481
//
482
// The transition table for state S starts at T[S] and spans the next 257
483
// slots in the array (1 for the failure link and 256 for all the possible
484
// transitions). But many of those slots are for invalid transitions, so
485
// the transitions for multiple states can be interleaved as long as they don't
486
// collide. For example, instead of having this transition table with state S1
487
// and S2 separated by a large number of slots:
488
//
489
// S1                                             S2
490
// +------+------+------+------+--   ~   --+------+------+------+--   ~   --+
491
// | FLS1 |   X  |   -  |   -  |     -     |  Y   | FLS2 |   Z  |     -     |
492
// +------+------+------+------+--   ~   --+------+------+------+--   ~   --+
493
//
494
// We can interleave the transitions for states S1 and S2 and get this other
495
// transition table, which is more compact:
496
//
497
// S1            S2
498
// +------+------+------+------+--   ~   --+------+
499
// | FLS1 |  X   | FLS2 |   Z  |     -     |  Y   |
500
// +------+------+------+------+--   ~   --+------+
501
//
502
// And how do we know that transition Z belongs to state S2 and not S1? Or that
503
// transition Y belongs to S1 and not S2? Because each slot of the array not
504
// only contains the index for the state where the transition points to, it
505
// also contains the offset of the transition relative to its owner state. So,
506
// the value for the owner offset would be 1 for transitions X, because X
507
// belongs to state S1 and it's located 1 position away from S1. The same occurs
508
// for Z, it belongs to S2 and it's located one position away from S2 so its
509
// owner offset is 1. If we are in S1 and next byte is 2, we are going to read
510
// the transition at T[S1+1+2] which is Z. But we know that transition Z is not
511
// a valid transition for state S1 because the owner offset for Z is 1 not 3.
512
//
513
// Each 32-bit slot in the transition table has 23 bits for storing the index
514
// of the target state and 9 bits for storing the offset of the slot relative
515
// to its own state. The offset can be any value from 0 to 256, both inclusive,
516
// hence 9 bits are required for it. The layout for the slot goes like:
517
//
518
// 32                      23        0
519
// +-----------------------+---------+
520
// | Target state's index  |  Offset |
521
// +-----------------------+---------+
522
//
523
// A more detailed description can be found in: http://goo.gl/lE6zG
524
//
525
static int _yr_ac_build_transition_table(YR_AC_AUTOMATON* automaton)
526
112
{
527
112
  YR_AC_TRANSITION* t_table;
528
112
  uint32_t* m_table;
529
112
  YR_AC_STATE* state;
530
112
  YR_AC_STATE* child_state;
531
112
  YR_AC_STATE* root_state = automaton->root;
532
533
112
  uint32_t slot;
534
535
112
  QUEUE queue = {NULL, NULL};
536
537
  // Both t_table and m_table have 512 slots initially, which is enough for the
538
  // root node's transition table.
539
112
  automaton->tables_size = 512;
540
541
112
  automaton->bitmask = yr_calloc(
542
112
      YR_BITMASK_SIZE(automaton->tables_size), sizeof(YR_BITMASK));
543
544
112
  if (automaton->bitmask == NULL)
545
0
    return ERROR_INSUFFICIENT_MEMORY;
546
547
112
  FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
548
112
      automaton->arena,
549
112
      YR_AC_TRANSITION_TABLE,
550
112
      automaton->tables_size * sizeof(YR_AC_TRANSITION),
551
112
      NULL));
552
553
112
  FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
554
112
      automaton->arena,
555
112
      YR_AC_STATE_MATCHES_TABLE,
556
112
      automaton->tables_size * sizeof(uint32_t),
557
112
      NULL));
558
559
112
  t_table = yr_arena_get_ptr(automaton->arena, YR_AC_TRANSITION_TABLE, 0);
560
112
  m_table = yr_arena_get_ptr(automaton->arena, YR_AC_STATE_MATCHES_TABLE, 0);
561
562
  // The failure link for the root node points to itself.
563
112
  t_table[0] = YR_AC_MAKE_TRANSITION(0, 0);
564
565
  // Initialize the entry corresponding to the root node in the match table.
566
  // Entries in this table are the index within YR_AC_MATCH_POOL where resides
567
  // the YR_AC_MATCH structure that corresponds to the head of the matches list
568
  // for the node. The indexes start counting at 1, the zero is used for
569
  // indicating that the node has no associated matches.
570
112
  if (!YR_ARENA_IS_NULL_REF(root_state->matches_ref))
571
9
    m_table[0] = root_state->matches_ref.offset / sizeof(YR_AC_MATCH) + 1;
572
573
  // Mark the first slot in the transition table as used.
574
112
  yr_bitmask_set(automaton->bitmask, 0);
575
576
  // Index 0 is for root node. Unused indexes start at 1.
577
112
  automaton->t_table_unused_candidate = 1;
578
579
112
  child_state = root_state->first_child;
580
581
1.19k
  while (child_state != NULL)
582
1.08k
  {
583
    // Each state stores its slot number.
584
1.08k
    child_state->t_table_slot = child_state->input + 1;
585
586
1.08k
    t_table[child_state->input + 1] = YR_AC_MAKE_TRANSITION(
587
1.08k
        0, child_state->input + 1);
588
589
1.08k
    yr_bitmask_set(automaton->bitmask, child_state->input + 1);
590
591
1.08k
    FAIL_ON_ERROR(_yr_ac_queue_push(&queue, child_state));
592
1.08k
    child_state = child_state->siblings;
593
1.08k
  }
594
595
221k
  while (!_yr_ac_queue_is_empty(&queue))
596
221k
  {
597
221k
    state = _yr_ac_queue_pop(&queue);
598
599
221k
    FAIL_ON_ERROR(_yr_ac_find_suitable_transition_table_slot(
600
221k
        automaton, automaton->arena, state, &slot));
601
602
    // _yr_ac_find_suitable_transition_table_slot can allocate more space in
603
    // both tables and cause the tables to be moved to a different memory
604
    // location, we must get their up-to-date addresses.
605
221k
    t_table = yr_arena_get_ptr(automaton->arena, YR_AC_TRANSITION_TABLE, 0);
606
221k
    m_table = yr_arena_get_ptr(automaton->arena, YR_AC_STATE_MATCHES_TABLE, 0);
607
608
221k
    t_table[state->t_table_slot] |= (slot << YR_AC_SLOT_OFFSET_BITS);
609
221k
    t_table[slot] = YR_AC_MAKE_TRANSITION(state->failure->t_table_slot, 0);
610
611
    // The match table is an array of indexes within YR_AC_MATCHES_POOL. The
612
    // N-th item in the array is the index for the YR_AC_MATCH structure that
613
    // represents the head of the matches list for state N. The indexes start
614
    // at 1, the 0 indicates that there are no matches for the state.
615
221k
    if (YR_ARENA_IS_NULL_REF(state->matches_ref))
616
65.0k
      m_table[slot] = 0;
617
156k
    else
618
156k
      m_table[slot] = state->matches_ref.offset / sizeof(YR_AC_MATCH) + 1;
619
620
221k
    state->t_table_slot = slot;
621
622
221k
    yr_bitmask_set(automaton->bitmask, slot);
623
624
    // Push children of current_state
625
221k
    child_state = state->first_child;
626
627
441k
    while (child_state != NULL)
628
220k
    {
629
220k
      child_state->t_table_slot = slot + child_state->input + 1;
630
631
220k
      t_table[child_state->t_table_slot] = YR_AC_MAKE_TRANSITION(
632
220k
          0, child_state->input + 1);
633
634
220k
      yr_bitmask_set(automaton->bitmask, child_state->t_table_slot);
635
636
220k
      FAIL_ON_ERROR(_yr_ac_queue_push(&queue, child_state));
637
638
220k
      child_state = child_state->siblings;
639
220k
    }
640
221k
  }
641
642
112
  return ERROR_SUCCESS;
643
112
}
644
645
////////////////////////////////////////////////////////////////////////////////
646
// Prints automaton state for debug purposes. This function is invoked by
647
// yr_ac_print_automaton, is not intended to be used stand-alone.
648
//
649
static void _yr_ac_print_automaton_state(
650
    YR_AC_AUTOMATON* automaton,
651
    YR_AC_STATE* state)
652
0
{
653
0
  int child_count;
654
655
0
  YR_AC_MATCH* match;
656
0
  YR_AC_STATE* child_state;
657
658
0
  for (int i = 0; i < state->depth; i++) printf(" ");
659
660
0
  child_state = state->first_child;
661
0
  child_count = 0;
662
663
0
  while (child_state != NULL)
664
0
  {
665
0
    child_count++;
666
0
    child_state = child_state->siblings;
667
0
  }
668
669
0
  printf(
670
0
      "%p childs:%d depth:%d failure:%p",
671
0
      state,
672
0
      child_count,
673
0
      state->depth,
674
0
      state->failure);
675
676
0
  match = yr_arena_ref_to_ptr(automaton->arena, &state->matches_ref);
677
678
0
  while (match != NULL)
679
0
  {
680
0
    printf("\n");
681
682
0
    for (int i = 0; i < state->depth + 1; i++) printf(" ");
683
684
0
    printf("%s = ", match->string->identifier);
685
686
0
    if (STRING_IS_HEX(match->string))
687
0
    {
688
0
      printf("{ ");
689
690
0
      for (int i = 0; i < yr_min(match->string->length, 10); i++)
691
0
        printf("%02x ", match->string->string[i]);
692
693
0
      printf("}");
694
0
    }
695
0
    else if (STRING_IS_REGEXP(match->string))
696
0
    {
697
0
      printf("/");
698
699
0
      for (int i = 0; i < yr_min(match->string->length, 10); i++)
700
0
        printf("%c", match->string->string[i]);
701
702
0
      printf("/");
703
0
    }
704
0
    else
705
0
    {
706
0
      printf("\"");
707
708
0
      for (int i = 0; i < yr_min(match->string->length, 10); i++)
709
0
        printf("%c", match->string->string[i]);
710
711
0
      printf("\"");
712
0
    }
713
714
0
    match = match->next;
715
0
  }
716
717
0
  printf("\n");
718
719
0
  child_state = state->first_child;
720
721
0
  while (child_state != NULL)
722
0
  {
723
0
    _yr_ac_print_automaton_state(automaton, child_state);
724
0
    child_state = child_state->siblings;
725
0
  }
726
0
}
727
728
////////////////////////////////////////////////////////////////////////////////
729
// Creates a new automaton
730
//
731
int yr_ac_automaton_create(YR_ARENA* arena, YR_AC_AUTOMATON** automaton)
732
1.76k
{
733
1.76k
  YR_AC_AUTOMATON* new_automaton;
734
1.76k
  YR_AC_STATE* root_state;
735
736
1.76k
  new_automaton = (YR_AC_AUTOMATON*) yr_malloc(sizeof(YR_AC_AUTOMATON));
737
1.76k
  root_state = (YR_AC_STATE*) yr_malloc(sizeof(YR_AC_STATE));
738
739
1.76k
  if (new_automaton == NULL || root_state == NULL)
740
0
  {
741
0
    yr_free(new_automaton);
742
0
    yr_free(root_state);
743
744
0
    return ERROR_INSUFFICIENT_MEMORY;
745
0
  }
746
747
1.76k
  root_state->depth = 0;
748
1.76k
  root_state->matches_ref = YR_ARENA_NULL_REF;
749
1.76k
  root_state->failure = NULL;
750
1.76k
  root_state->first_child = NULL;
751
1.76k
  root_state->siblings = NULL;
752
1.76k
  root_state->t_table_slot = 0;
753
754
1.76k
  new_automaton->arena = arena;
755
1.76k
  new_automaton->root = root_state;
756
1.76k
  new_automaton->bitmask = NULL;
757
1.76k
  new_automaton->tables_size = 0;
758
759
1.76k
  *automaton = new_automaton;
760
761
1.76k
  return ERROR_SUCCESS;
762
1.76k
}
763
764
////////////////////////////////////////////////////////////////////////////////
765
// Destroys automaton
766
//
767
int yr_ac_automaton_destroy(YR_AC_AUTOMATON* automaton)
768
1.76k
{
769
1.76k
  _yr_ac_state_destroy(automaton->root);
770
771
1.76k
  yr_free(automaton->bitmask);
772
1.76k
  yr_free(automaton);
773
774
1.76k
  return ERROR_SUCCESS;
775
1.76k
}
776
777
////////////////////////////////////////////////////////////////////////////////
778
// Adds a string to the automaton. This function is invoked once for each
779
// string defined in the rules.
780
//
781
int yr_ac_add_string(
782
    YR_AC_AUTOMATON* automaton,
783
    YR_STRING* string,
784
    uint32_t string_idx,
785
    YR_ATOM_LIST_ITEM* atom,
786
    YR_ARENA* arena)
787
35.5k
{
788
1.46M
  while (atom != NULL)
789
1.42M
  {
790
1.42M
    YR_AC_STATE* state = automaton->root;
791
792
6.83M
    for (int i = 0; i < atom->atom.length; i++)
793
5.40M
    {
794
5.40M
      YR_AC_STATE* next_state = _yr_ac_next_state(state, atom->atom.bytes[i]);
795
796
5.40M
      if (next_state == NULL)
797
441k
      {
798
441k
        next_state = _yr_ac_state_create(state, atom->atom.bytes[i]);
799
800
441k
        if (next_state == NULL)
801
0
          return ERROR_INSUFFICIENT_MEMORY;
802
441k
      }
803
804
5.40M
      state = next_state;
805
5.40M
    }
806
807
1.42M
    YR_ARENA_REF new_match_ref;
808
809
1.42M
    FAIL_ON_ERROR(yr_arena_allocate_struct(
810
1.42M
        arena,
811
1.42M
        YR_AC_STATE_MATCHES_POOL,
812
1.42M
        sizeof(YR_AC_MATCH),
813
1.42M
        &new_match_ref,
814
1.42M
        offsetof(YR_AC_MATCH, string),
815
1.42M
        offsetof(YR_AC_MATCH, forward_code),
816
1.42M
        offsetof(YR_AC_MATCH, backward_code),
817
1.42M
        offsetof(YR_AC_MATCH, next),
818
1.42M
        EOL));
819
820
1.42M
    YR_AC_MATCH* new_match = yr_arena_ref_to_ptr(arena, &new_match_ref);
821
822
1.42M
    new_match->backtrack = state->depth + atom->backtrack;
823
1.42M
    new_match->string = yr_arena_get_ptr(
824
1.42M
        arena, YR_STRINGS_TABLE, string_idx * sizeof(struct YR_STRING));
825
826
1.42M
    new_match->forward_code = yr_arena_ref_to_ptr(
827
1.42M
        arena, &atom->forward_code_ref);
828
829
1.42M
    new_match->backward_code = yr_arena_ref_to_ptr(
830
1.42M
        arena, &atom->backward_code_ref);
831
832
    // Add newly created match to the list of matches for the state.
833
1.42M
    new_match->next = yr_arena_ref_to_ptr(arena, &state->matches_ref);
834
1.42M
    state->matches_ref = new_match_ref;
835
836
1.42M
    atom = atom->next;
837
1.42M
  }
838
839
35.5k
  return ERROR_SUCCESS;
840
35.5k
}
841
842
////////////////////////////////////////////////////////////////////////////////
843
// Compiles the Aho-Corasick automaton, the resulting data structures are
844
// are written in the provided arena.
845
//
846
int yr_ac_compile(YR_AC_AUTOMATON* automaton, YR_ARENA* arena)
847
112
{
848
112
  FAIL_ON_ERROR(_yr_ac_create_failure_links(automaton));
849
112
  FAIL_ON_ERROR(_yr_ac_optimize_failure_links(automaton));
850
112
  FAIL_ON_ERROR(_yr_ac_build_transition_table(automaton));
851
852
112
  return ERROR_SUCCESS;
853
112
}
854
855
////////////////////////////////////////////////////////////////////////////////
856
// Prints automaton for debug purposes.
857
//
858
void yr_ac_print_automaton(YR_AC_AUTOMATON* automaton)
859
0
{
860
0
  printf("-------------------------------------------------------\n");
861
0
  _yr_ac_print_automaton_state(automaton, automaton->root);
862
0
  printf("-------------------------------------------------------\n");
863
0
}