Coverage Report

Created: 2025-11-01 06:40

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/yara/libyara/ahocorasick.c
Line
Count
Source
1
/*
2
Copyright (c) 2013. The YARA Authors. All Rights Reserved.
3
4
Redistribution and use in source and binary forms, with or without modification,
5
are permitted provided that the following conditions are met:
6
7
1. Redistributions of source code must retain the above copyright notice, this
8
list of conditions and the following disclaimer.
9
10
2. Redistributions in binary form must reproduce the above copyright notice,
11
this list of conditions and the following disclaimer in the documentation and/or
12
other materials provided with the distribution.
13
14
3. Neither the name of the copyright holder nor the names of its contributors
15
may be used to endorse or promote products derived from this software without
16
specific prior written permission.
17
18
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
*/
29
30
#include <assert.h>
31
#include <stddef.h>
32
#include <string.h>
33
#include <yara/ahocorasick.h>
34
#include <yara/arena.h>
35
#include <yara/compiler.h>
36
#include <yara/error.h>
37
#include <yara/mem.h>
38
#include <yara/utils.h>
39
40
typedef struct _QUEUE_NODE
41
{
42
  YR_AC_STATE* value;
43
44
  struct _QUEUE_NODE* previous;
45
  struct _QUEUE_NODE* next;
46
47
} QUEUE_NODE;
48
49
typedef struct _QUEUE
50
{
51
  QUEUE_NODE* head;
52
  QUEUE_NODE* tail;
53
54
} QUEUE;
55
56
////////////////////////////////////////////////////////////////////////////////
57
// Pushes an automaton state into the tail of a queue.
58
//
59
// Args:
60
//   queue: Pointer to the queue.
61
//   state: Pointer to the state being pushed into the queue.
62
//
63
// Returns:
64
//   ERROR_SUCCESS
65
//   ERROR_INSUFFICIENT_MEMORY
66
//
67
static int _yr_ac_queue_push(QUEUE* queue, YR_AC_STATE* state)
68
409k
{
69
409k
  QUEUE_NODE* pushed_node;
70
71
409k
  pushed_node = (QUEUE_NODE*) yr_malloc(sizeof(QUEUE_NODE));
72
73
409k
  if (pushed_node == NULL)
74
0
    return ERROR_INSUFFICIENT_MEMORY;
75
76
409k
  pushed_node->previous = queue->tail;
77
409k
  pushed_node->next = NULL;
78
409k
  pushed_node->value = state;
79
80
409k
  if (queue->tail != NULL)
81
409k
    queue->tail->next = pushed_node;
82
90
  else  // queue is empty
83
90
    queue->head = pushed_node;
84
85
409k
  queue->tail = pushed_node;
86
87
409k
  return ERROR_SUCCESS;
88
409k
}
89
90
////////////////////////////////////////////////////////////////////////////////
91
// Pops an automaton state from the head of a queue.
92
//
93
// Args:
94
//   queue: Pointer to the queue.
95
//
96
// Returns:
97
//   Pointer to the poped state.
98
//
99
static YR_AC_STATE* _yr_ac_queue_pop(QUEUE* queue)
100
409k
{
101
409k
  YR_AC_STATE* result;
102
409k
  QUEUE_NODE* popped_node;
103
104
409k
  if (queue->head == NULL)
105
0
    return NULL;
106
107
409k
  popped_node = queue->head;
108
409k
  queue->head = popped_node->next;
109
110
409k
  if (queue->head)
111
409k
    queue->head->previous = NULL;
112
90
  else  // queue is empty
113
90
    queue->tail = NULL;
114
115
409k
  result = popped_node->value;
116
117
409k
  yr_free(popped_node);
118
409k
  return result;
119
409k
}
120
121
////////////////////////////////////////////////////////////////////////////////
122
// Checks if a queue is empty.
123
//
124
// Args:
125
//   queue: Pointer to the queue.
126
//
127
// Returns:
128
//   true if queue is empty, false otherwise.
129
//
130
static int _yr_ac_queue_is_empty(QUEUE* queue)
131
409k
{
132
409k
  return queue->head == NULL;
133
409k
}
134
135
////////////////////////////////////////////////////////////////////////////////
136
// Given an automaton state and an input symbol, returns the new state
137
// after reading the input symbol.
138
//
139
// Args:
140
//    state: Pointer to automaton state.
141
//    input: Input symbol.
142
//
143
// Returns:
144
//   Pointer to the next automaton state.
145
//
146
static YR_AC_STATE* _yr_ac_next_state(YR_AC_STATE* state, uint8_t input)
147
6.24M
{
148
6.24M
  YR_AC_STATE* next_state = state->first_child;
149
150
263M
  while (next_state != NULL)
151
262M
  {
152
262M
    if (next_state->input == input)
153
5.82M
      return next_state;
154
155
257M
    next_state = next_state->siblings;
156
257M
  }
157
158
414k
  return NULL;
159
6.24M
}
160
161
////////////////////////////////////////////////////////////////////////////////
162
// Creates a new automaton state, the automaton will transition from
163
// the given state to the new state after reading the input symbol.
164
//
165
// Args:
166
//   state: Pointer to the origin state.
167
//   input: Input symbol.
168
//
169
// Returns:
170
//   YR_AC_STATE* pointer to the newly allocated state or NULL in case
171
//   of error.
172
//
173
static YR_AC_STATE* _yr_ac_state_create(YR_AC_STATE* state, uint8_t input)
174
356k
{
175
356k
  YR_AC_STATE* new_state = (YR_AC_STATE*) yr_malloc(sizeof(YR_AC_STATE));
176
177
356k
  if (new_state == NULL)
178
0
    return NULL;
179
180
356k
  new_state->input = input;
181
356k
  new_state->depth = state->depth + 1;
182
356k
  new_state->matches_ref = YR_ARENA_NULL_REF;
183
356k
  new_state->failure = NULL;
184
356k
  new_state->t_table_slot = 0;
185
356k
  new_state->first_child = NULL;
186
356k
  new_state->siblings = state->first_child;
187
356k
  state->first_child = new_state;
188
189
356k
  return new_state;
190
356k
}
191
192
////////////////////////////////////////////////////////////////////////////////
193
// Destroys an automaton state.
194
//
195
static int _yr_ac_state_destroy(YR_AC_STATE* state)
196
358k
{
197
358k
  YR_AC_STATE* child_state = state->first_child;
198
199
715k
  while (child_state != NULL)
200
356k
  {
201
356k
    YR_AC_STATE* next_child_state = child_state->siblings;
202
356k
    _yr_ac_state_destroy(child_state);
203
356k
    child_state = next_child_state;
204
356k
  }
205
206
358k
  yr_free(state);
207
208
358k
  return ERROR_SUCCESS;
209
358k
}
210
211
////////////////////////////////////////////////////////////////////////////////
212
// Create failure links for each automaton state.
213
//
214
// This function must be called after all the strings have been added to the
215
// automaton with yr_ac_add_string.
216
//
217
static int _yr_ac_create_failure_links(YR_AC_AUTOMATON* automaton)
218
90
{
219
90
  YR_AC_STATE* current_state;
220
90
  YR_AC_STATE* failure_state;
221
90
  YR_AC_STATE* temp_state;
222
90
  YR_AC_STATE* state;
223
90
  YR_AC_STATE* transition_state;
224
90
  YR_AC_STATE* root_state;
225
90
  YR_AC_MATCH* match;
226
227
90
  QUEUE queue;
228
229
90
  queue.head = NULL;
230
90
  queue.tail = NULL;
231
232
90
  root_state = automaton->root;
233
234
  // Set the failure link of root state to itself.
235
90
  root_state->failure = root_state;
236
237
  // Push root's children and set their failure link to root.
238
90
  state = root_state->first_child;
239
240
689
  while (state != NULL)
241
599
  {
242
599
    FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
243
599
    state->failure = root_state;
244
599
    state = state->siblings;
245
599
  }
246
247
  // Traverse the trie in BFS order calculating the failure link
248
  // for each state.
249
136k
  while (!_yr_ac_queue_is_empty(&queue))
250
136k
  {
251
136k
    current_state = _yr_ac_queue_pop(&queue);
252
136k
    match = yr_arena_ref_to_ptr(automaton->arena, &current_state->matches_ref);
253
254
136k
    if (match != NULL)
255
96.3k
    {
256
      // Find the last match in the list of matches.
257
1.38M
      while (match->next != NULL) match = match->next;
258
259
96.3k
      if (match->backtrack > 0)
260
88.6k
        match->next = yr_arena_ref_to_ptr(
261
88.6k
            automaton->arena, &root_state->matches_ref);
262
96.3k
    }
263
40.2k
    else
264
40.2k
    {
265
      // This state doesn't have any matches, its matches will be those
266
      // in the root state, if any.
267
40.2k
      current_state->matches_ref = root_state->matches_ref;
268
40.2k
    }
269
270
    // Iterate over all the states that the current state can transition to.
271
136k
    transition_state = current_state->first_child;
272
273
272k
    while (transition_state != NULL)
274
135k
    {
275
135k
      FAIL_ON_ERROR(_yr_ac_queue_push(&queue, transition_state));
276
135k
      failure_state = current_state->failure;
277
278
162k
      while (1)
279
162k
      {
280
162k
        temp_state = _yr_ac_next_state(failure_state, transition_state->input);
281
282
162k
        if (temp_state != NULL)
283
105k
        {
284
105k
          transition_state->failure = temp_state;
285
286
105k
          if (YR_ARENA_IS_NULL_REF(transition_state->matches_ref))
287
21.8k
          {
288
21.8k
            transition_state->matches_ref = temp_state->matches_ref;
289
21.8k
          }
290
83.2k
          else
291
83.2k
          {
292
83.2k
            match = yr_arena_ref_to_ptr(
293
83.2k
                automaton->arena, &transition_state->matches_ref);
294
295
83.2k
            assert(match != NULL);
296
297
            // Find the last match in the list of matches.
298
1.01M
            while (match->next != NULL) match = match->next;
299
300
83.2k
            match->next = yr_arena_ref_to_ptr(
301
83.2k
                automaton->arena, &temp_state->matches_ref);
302
83.2k
          }
303
304
105k
          break;
305
105k
        }
306
57.5k
        else
307
57.5k
        {
308
57.5k
          if (failure_state == root_state)
309
30.8k
          {
310
30.8k
            transition_state->failure = root_state;
311
30.8k
            break;
312
30.8k
          }
313
26.6k
          else
314
26.6k
          {
315
26.6k
            failure_state = failure_state->failure;
316
26.6k
          }
317
57.5k
        }
318
162k
      }  // while(1)
319
320
135k
      transition_state = transition_state->siblings;
321
135k
    }
322
323
136k
  }  // while(!__yr_ac_queue_is_empty(&queue))
324
325
90
  return ERROR_SUCCESS;
326
90
}
327
328
////////////////////////////////////////////////////////////////////////////////
329
// Returns true if the transitions for state s2 are a subset of the transitions
330
// for state s1. In other words, if at state s2 input X is accepted, it must be
331
// accepted in s1 too.
332
//
333
static bool _yr_ac_transitions_subset(YR_AC_STATE* s1, YR_AC_STATE* s2)
334
105k
{
335
105k
  uint8_t set[32];
336
337
105k
  YR_AC_STATE* state = s1->first_child;
338
339
105k
  memset(set, 0, 32);
340
341
175k
  while (state != NULL)
342
70.5k
  {
343
70.5k
    set[state->input / 8] |= 1 << state->input % 8;
344
70.5k
    state = state->siblings;
345
70.5k
  }
346
347
105k
  state = s2->first_child;
348
349
129k
  while (state != NULL)
350
115k
  {
351
115k
    if (!(set[state->input / 8] & 1 << state->input % 8))
352
91.4k
      return false;
353
354
24.5k
    state = state->siblings;
355
24.5k
  }
356
357
13.6k
  return true;
358
105k
}
359
360
////////////////////////////////////////////////////////////////////////////////
361
// Removes unnecessary failure links.
362
//
363
static int _yr_ac_optimize_failure_links(YR_AC_AUTOMATON* automaton)
364
90
{
365
90
  QUEUE queue = {NULL, NULL};
366
367
  // Push root's children.
368
90
  YR_AC_STATE* root_state = automaton->root;
369
90
  YR_AC_STATE* state = root_state->first_child;
370
371
689
  while (state != NULL)
372
599
  {
373
599
    FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
374
599
    state = state->siblings;
375
599
  }
376
377
136k
  while (!_yr_ac_queue_is_empty(&queue))
378
136k
  {
379
136k
    YR_AC_STATE* current_state = _yr_ac_queue_pop(&queue);
380
381
136k
    if (current_state->failure != root_state)
382
105k
    {
383
105k
      if (_yr_ac_transitions_subset(current_state, current_state->failure))
384
13.6k
        current_state->failure = current_state->failure->failure;
385
105k
    }
386
387
    // Push children of current_state
388
136k
    state = current_state->first_child;
389
390
272k
    while (state != NULL)
391
135k
    {
392
135k
      FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
393
135k
      state = state->siblings;
394
135k
    }
395
136k
  }
396
397
90
  return ERROR_SUCCESS;
398
90
}
399
400
////////////////////////////////////////////////////////////////////////////////
401
// Find a place within the automaton's transition table where the transitions
402
// for the given state can be put. The function first create a bitmask for the
403
// state's transition table, then searches for an offset within the automaton's
404
// bitmask where the state's bitmask can be put without bit collisions.
405
//
406
static int _yr_ac_find_suitable_transition_table_slot(
407
    YR_AC_AUTOMATON* automaton,
408
    YR_ARENA* arena,
409
    YR_AC_STATE* state,
410
    uint32_t* slot)
411
136k
{
412
  // The state's transition table has 257 entries, 1 for the failure link and
413
  // 256 for each possible input byte, so the state's bitmask has 257 bits.
414
136k
  YR_BITMASK state_bitmask[YR_BITMASK_SIZE(257)];
415
416
136k
  YR_AC_STATE* child_state = state->first_child;
417
418
  // Start with all bits set to zero.
419
136k
  yr_bitmask_clear_all(state_bitmask);
420
421
  // The first slot in the transition table is for the state's failure link,
422
  // so the first bit in the bitmask must be set to one.
423
136k
  yr_bitmask_set(state_bitmask, 0);
424
425
272k
  while (child_state != NULL)
426
135k
  {
427
135k
    yr_bitmask_set(state_bitmask, child_state->input + 1);
428
135k
    child_state = child_state->siblings;
429
135k
  }
430
431
136k
  *slot = yr_bitmask_find_non_colliding_offset(
432
136k
      automaton->bitmask,
433
136k
      state_bitmask,
434
136k
      automaton->tables_size,
435
136k
      257,
436
136k
      &automaton->t_table_unused_candidate);
437
438
  // Make sure that we are not going beyond the maximum size of the transition
439
  // table, starting at the slot found there must be at least 257 other slots
440
  // for accommodating the state's transition table.
441
136k
  assert(*slot + 257 < YR_AC_MAX_TRANSITION_TABLE_SIZE);
442
443
136k
  if (*slot > automaton->tables_size - 257)
444
1.05k
  {
445
1.05k
    FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
446
1.05k
        arena, YR_AC_TRANSITION_TABLE, 257 * sizeof(YR_AC_TRANSITION), NULL));
447
448
1.05k
    FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
449
1.05k
        arena, YR_AC_STATE_MATCHES_TABLE, 257 * sizeof(uint8_t*), NULL));
450
451
1.05k
    size_t bm_len = YR_BITMASK_SIZE(automaton->tables_size) *
452
1.05k
                    sizeof(YR_BITMASK);
453
454
1.05k
    size_t bm_len_incr = YR_BITMASK_SIZE(257) * sizeof(YR_BITMASK);
455
456
1.05k
    automaton->bitmask = yr_realloc(automaton->bitmask, bm_len + bm_len_incr);
457
458
1.05k
    if (automaton->bitmask == NULL)
459
0
      return ERROR_INSUFFICIENT_MEMORY;
460
461
1.05k
    memset((uint8_t*) automaton->bitmask + bm_len, 0, bm_len_incr);
462
463
1.05k
    automaton->tables_size += 257;
464
1.05k
  }
465
466
136k
  return ERROR_SUCCESS;
467
136k
}
468
469
////////////////////////////////////////////////////////////////////////////////
470
// Builds the transition table for the automaton. The transition table (T) is a
471
// large array of 32-bits integers. Each state in the automaton is represented
472
// by an index S within the array. The integer stored in T[S] is the failure
473
// link for state S, it contains the index of the next state when no valid
474
// transition exists for the next input byte.
475
//
476
// At position T[S+1+B] (where B is a byte) we can find the transition (if any)
477
// that must be followed from state S if the next input is B. The value in
478
// T[S+1+B] contains the index for next state or zero. A zero value means that
479
// no valid transition exists from state S when next input is B, and the failure
480
// link must be used instead.
481
//
482
// The transition table for state S starts at T[S] and spans the next 257
483
// slots in the array (1 for the failure link and 256 for all the possible
484
// transitions). But many of those slots are for invalid transitions, so
485
// the transitions for multiple states can be interleaved as long as they don't
486
// collide. For example, instead of having this transition table with state S1
487
// and S2 separated by a large number of slots:
488
//
489
// S1                                             S2
490
// +------+------+------+------+--   ~   --+------+------+------+--   ~   --+
491
// | FLS1 |   X  |   -  |   -  |     -     |  Y   | FLS2 |   Z  |     -     |
492
// +------+------+------+------+--   ~   --+------+------+------+--   ~   --+
493
//
494
// We can interleave the transitions for states S1 and S2 and get this other
495
// transition table, which is more compact:
496
//
497
// S1            S2
498
// +------+------+------+------+--   ~   --+------+
499
// | FLS1 |  X   | FLS2 |   Z  |     -     |  Y   |
500
// +------+------+------+------+--   ~   --+------+
501
//
502
// And how do we know that transition Z belongs to state S2 and not S1? Or that
503
// transition Y belongs to S1 and not S2? Because each slot of the array not
504
// only contains the index for the state where the transition points to, it
505
// also contains the offset of the transition relative to its owner state. So,
506
// the value for the owner offset would be 1 for transitions X, because X
507
// belongs to state S1 and it's located 1 position away from S1. The same occurs
508
// for Z, it belongs to S2 and it's located one position away from S2 so its
509
// owner offset is 1. If we are in S1 and next byte is 2, we are going to read
510
// the transition at T[S1+1+2] which is Z. But we know that transition Z is not
511
// a valid transition for state S1 because the owner offset for Z is 1 not 3.
512
//
513
// Each 32-bit slot in the transition table has 23 bits for storing the index
514
// of the target state and 9 bits for storing the offset of the slot relative
515
// to its own state. The offset can be any value from 0 to 256, both inclusive,
516
// hence 9 bits are required for it. The layout for the slot goes like:
517
//
518
// 32                      23        0
519
// +-----------------------+---------+
520
// | Target state's index  |  Offset |
521
// +-----------------------+---------+
522
//
523
// A more detailed description can be found in: http://goo.gl/lE6zG
524
//
525
static int _yr_ac_build_transition_table(YR_AC_AUTOMATON* automaton)
526
90
{
527
90
  YR_AC_TRANSITION* t_table;
528
90
  uint32_t* m_table;
529
90
  YR_AC_STATE* state;
530
90
  YR_AC_STATE* child_state;
531
90
  YR_AC_STATE* root_state = automaton->root;
532
533
90
  uint32_t slot;
534
535
90
  QUEUE queue = {NULL, NULL};
536
537
  // Both t_table and m_table have 512 slots initially, which is enough for the
538
  // root node's transition table.
539
90
  automaton->tables_size = 512;
540
541
90
  automaton->bitmask = yr_calloc(
542
90
      YR_BITMASK_SIZE(automaton->tables_size), sizeof(YR_BITMASK));
543
544
90
  if (automaton->bitmask == NULL)
545
0
    return ERROR_INSUFFICIENT_MEMORY;
546
547
90
  FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
548
90
      automaton->arena,
549
90
      YR_AC_TRANSITION_TABLE,
550
90
      automaton->tables_size * sizeof(YR_AC_TRANSITION),
551
90
      NULL));
552
553
90
  FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
554
90
      automaton->arena,
555
90
      YR_AC_STATE_MATCHES_TABLE,
556
90
      automaton->tables_size * sizeof(uint32_t),
557
90
      NULL));
558
559
90
  t_table = yr_arena_get_ptr(automaton->arena, YR_AC_TRANSITION_TABLE, 0);
560
90
  m_table = yr_arena_get_ptr(automaton->arena, YR_AC_STATE_MATCHES_TABLE, 0);
561
562
  // The failure link for the root node points to itself.
563
90
  t_table[0] = YR_AC_MAKE_TRANSITION(0, 0);
564
565
  // Initialize the entry corresponding to the root node in the match table.
566
  // Entries in this table are the index within YR_AC_MATCH_POOL where resides
567
  // the YR_AC_MATCH structure that corresponds to the head of the matches list
568
  // for the node. The indexes start counting at 1, the zero is used for
569
  // indicating that the node has no associated matches.
570
90
  if (!YR_ARENA_IS_NULL_REF(root_state->matches_ref))
571
4
    m_table[0] = root_state->matches_ref.offset / sizeof(YR_AC_MATCH) + 1;
572
573
  // Mark the first slot in the transition table as used.
574
90
  yr_bitmask_set(automaton->bitmask, 0);
575
576
  // Index 0 is for root node. Unused indexes start at 1.
577
90
  automaton->t_table_unused_candidate = 1;
578
579
90
  child_state = root_state->first_child;
580
581
689
  while (child_state != NULL)
582
599
  {
583
    // Each state stores its slot number.
584
599
    child_state->t_table_slot = child_state->input + 1;
585
586
599
    t_table[child_state->input + 1] = YR_AC_MAKE_TRANSITION(
587
599
        0, child_state->input + 1);
588
589
599
    yr_bitmask_set(automaton->bitmask, child_state->input + 1);
590
591
599
    FAIL_ON_ERROR(_yr_ac_queue_push(&queue, child_state));
592
599
    child_state = child_state->siblings;
593
599
  }
594
595
136k
  while (!_yr_ac_queue_is_empty(&queue))
596
136k
  {
597
136k
    state = _yr_ac_queue_pop(&queue);
598
599
136k
    FAIL_ON_ERROR(_yr_ac_find_suitable_transition_table_slot(
600
136k
        automaton, automaton->arena, state, &slot));
601
602
    // _yr_ac_find_suitable_transition_table_slot can allocate more space in
603
    // both tables and cause the tables to be moved to a different memory
604
    // location, we must get their up-to-date addresses.
605
136k
    t_table = yr_arena_get_ptr(automaton->arena, YR_AC_TRANSITION_TABLE, 0);
606
136k
    m_table = yr_arena_get_ptr(automaton->arena, YR_AC_STATE_MATCHES_TABLE, 0);
607
608
136k
    t_table[state->t_table_slot] |= (slot << YR_AC_SLOT_OFFSET_BITS);
609
136k
    t_table[slot] = YR_AC_MAKE_TRANSITION(state->failure->t_table_slot, 0);
610
611
    // The match table is an array of indexes within YR_AC_MATCHES_POOL. The
612
    // N-th item in the array is the index for the YR_AC_MATCH structure that
613
    // represents the head of the matches list for state N. The indexes start
614
    // at 1, the 0 indicates that there are no matches for the state.
615
136k
    if (YR_ARENA_IS_NULL_REF(state->matches_ref))
616
36.9k
      m_table[slot] = 0;
617
99.6k
    else
618
99.6k
      m_table[slot] = state->matches_ref.offset / sizeof(YR_AC_MATCH) + 1;
619
620
136k
    state->t_table_slot = slot;
621
622
136k
    yr_bitmask_set(automaton->bitmask, slot);
623
624
    // Push children of current_state
625
136k
    child_state = state->first_child;
626
627
272k
    while (child_state != NULL)
628
135k
    {
629
135k
      child_state->t_table_slot = slot + child_state->input + 1;
630
631
135k
      t_table[child_state->t_table_slot] = YR_AC_MAKE_TRANSITION(
632
135k
          0, child_state->input + 1);
633
634
135k
      yr_bitmask_set(automaton->bitmask, child_state->t_table_slot);
635
636
135k
      FAIL_ON_ERROR(_yr_ac_queue_push(&queue, child_state));
637
638
135k
      child_state = child_state->siblings;
639
135k
    }
640
136k
  }
641
642
90
  return ERROR_SUCCESS;
643
90
}
644
645
////////////////////////////////////////////////////////////////////////////////
646
// Prints automaton state for debug purposes. This function is invoked by
647
// yr_ac_print_automaton, is not intended to be used stand-alone.
648
//
649
static void _yr_ac_print_automaton_state(
650
    YR_AC_AUTOMATON* automaton,
651
    YR_AC_STATE* state)
652
0
{
653
0
  int child_count;
654
655
0
  YR_AC_MATCH* match;
656
0
  YR_AC_STATE* child_state;
657
658
0
  for (int i = 0; i < state->depth; i++) printf(" ");
659
660
0
  child_state = state->first_child;
661
0
  child_count = 0;
662
663
0
  while (child_state != NULL)
664
0
  {
665
0
    child_count++;
666
0
    child_state = child_state->siblings;
667
0
  }
668
669
0
  printf(
670
0
      "%p childs:%d depth:%d failure:%p",
671
0
      state,
672
0
      child_count,
673
0
      state->depth,
674
0
      state->failure);
675
676
0
  match = yr_arena_ref_to_ptr(automaton->arena, &state->matches_ref);
677
678
0
  while (match != NULL)
679
0
  {
680
0
    printf("\n");
681
682
0
    for (int i = 0; i < state->depth + 1; i++) printf(" ");
683
684
0
    printf("%s = ", match->string->identifier);
685
686
0
    if (STRING_IS_HEX(match->string))
687
0
    {
688
0
      printf("{ ");
689
690
0
      for (int i = 0; i < yr_min(match->string->length, 10); i++)
691
0
        printf("%02x ", match->string->string[i]);
692
693
0
      printf("}");
694
0
    }
695
0
    else if (STRING_IS_REGEXP(match->string))
696
0
    {
697
0
      printf("/");
698
699
0
      for (int i = 0; i < yr_min(match->string->length, 10); i++)
700
0
        printf("%c", match->string->string[i]);
701
702
0
      printf("/");
703
0
    }
704
0
    else
705
0
    {
706
0
      printf("\"");
707
708
0
      for (int i = 0; i < yr_min(match->string->length, 10); i++)
709
0
        printf("%c", match->string->string[i]);
710
711
0
      printf("\"");
712
0
    }
713
714
0
    match = match->next;
715
0
  }
716
717
0
  printf("\n");
718
719
0
  child_state = state->first_child;
720
721
0
  while (child_state != NULL)
722
0
  {
723
0
    _yr_ac_print_automaton_state(automaton, child_state);
724
0
    child_state = child_state->siblings;
725
0
  }
726
0
}
727
728
////////////////////////////////////////////////////////////////////////////////
729
// Creates a new automaton
730
//
731
int yr_ac_automaton_create(YR_ARENA* arena, YR_AC_AUTOMATON** automaton)
732
1.70k
{
733
1.70k
  YR_AC_AUTOMATON* new_automaton;
734
1.70k
  YR_AC_STATE* root_state;
735
736
1.70k
  new_automaton = (YR_AC_AUTOMATON*) yr_malloc(sizeof(YR_AC_AUTOMATON));
737
1.70k
  root_state = (YR_AC_STATE*) yr_malloc(sizeof(YR_AC_STATE));
738
739
1.70k
  if (new_automaton == NULL || root_state == NULL)
740
0
  {
741
0
    yr_free(new_automaton);
742
0
    yr_free(root_state);
743
744
0
    return ERROR_INSUFFICIENT_MEMORY;
745
0
  }
746
747
1.70k
  root_state->depth = 0;
748
1.70k
  root_state->matches_ref = YR_ARENA_NULL_REF;
749
1.70k
  root_state->failure = NULL;
750
1.70k
  root_state->first_child = NULL;
751
1.70k
  root_state->siblings = NULL;
752
1.70k
  root_state->t_table_slot = 0;
753
754
1.70k
  new_automaton->arena = arena;
755
1.70k
  new_automaton->root = root_state;
756
1.70k
  new_automaton->bitmask = NULL;
757
1.70k
  new_automaton->tables_size = 0;
758
759
1.70k
  *automaton = new_automaton;
760
761
1.70k
  return ERROR_SUCCESS;
762
1.70k
}
763
764
////////////////////////////////////////////////////////////////////////////////
765
// Destroys automaton
766
//
767
int yr_ac_automaton_destroy(YR_AC_AUTOMATON* automaton)
768
1.70k
{
769
1.70k
  _yr_ac_state_destroy(automaton->root);
770
771
1.70k
  yr_free(automaton->bitmask);
772
1.70k
  yr_free(automaton);
773
774
1.70k
  return ERROR_SUCCESS;
775
1.70k
}
776
777
////////////////////////////////////////////////////////////////////////////////
778
// Adds a string to the automaton. This function is invoked once for each
779
// string defined in the rules.
780
//
781
int yr_ac_add_string(
782
    YR_AC_AUTOMATON* automaton,
783
    YR_STRING* string,
784
    uint32_t string_idx,
785
    YR_ATOM_LIST_ITEM* atom,
786
    YR_ARENA* arena)
787
27.9k
{
788
1.61M
  while (atom != NULL)
789
1.58M
  {
790
1.58M
    YR_AC_STATE* state = automaton->root;
791
792
7.66M
    for (int i = 0; i < atom->atom.length; i++)
793
6.07M
    {
794
6.07M
      YR_AC_STATE* next_state = _yr_ac_next_state(state, atom->atom.bytes[i]);
795
796
6.07M
      if (next_state == NULL)
797
356k
      {
798
356k
        next_state = _yr_ac_state_create(state, atom->atom.bytes[i]);
799
800
356k
        if (next_state == NULL)
801
0
          return ERROR_INSUFFICIENT_MEMORY;
802
356k
      }
803
804
6.07M
      state = next_state;
805
6.07M
    }
806
807
1.58M
    YR_ARENA_REF new_match_ref;
808
809
1.58M
    FAIL_ON_ERROR(yr_arena_allocate_struct(
810
1.58M
        arena,
811
1.58M
        YR_AC_STATE_MATCHES_POOL,
812
1.58M
        sizeof(YR_AC_MATCH),
813
1.58M
        &new_match_ref,
814
1.58M
        offsetof(YR_AC_MATCH, string),
815
1.58M
        offsetof(YR_AC_MATCH, forward_code),
816
1.58M
        offsetof(YR_AC_MATCH, backward_code),
817
1.58M
        offsetof(YR_AC_MATCH, next),
818
1.58M
        EOL));
819
820
1.58M
    YR_AC_MATCH* new_match = yr_arena_ref_to_ptr(arena, &new_match_ref);
821
822
1.58M
    new_match->backtrack = state->depth + atom->backtrack;
823
1.58M
    new_match->string = yr_arena_get_ptr(
824
1.58M
        arena, YR_STRINGS_TABLE, string_idx * sizeof(struct YR_STRING));
825
826
1.58M
    new_match->forward_code = yr_arena_ref_to_ptr(
827
1.58M
        arena, &atom->forward_code_ref);
828
829
1.58M
    new_match->backward_code = yr_arena_ref_to_ptr(
830
1.58M
        arena, &atom->backward_code_ref);
831
832
    // Add newly created match to the list of matches for the state.
833
1.58M
    new_match->next = yr_arena_ref_to_ptr(arena, &state->matches_ref);
834
1.58M
    state->matches_ref = new_match_ref;
835
836
1.58M
    atom = atom->next;
837
1.58M
  }
838
839
27.9k
  return ERROR_SUCCESS;
840
27.9k
}
841
842
////////////////////////////////////////////////////////////////////////////////
843
// Compiles the Aho-Corasick automaton, the resulting data structures are
844
// are written in the provided arena.
845
//
846
int yr_ac_compile(YR_AC_AUTOMATON* automaton, YR_ARENA* arena)
847
90
{
848
90
  FAIL_ON_ERROR(_yr_ac_create_failure_links(automaton));
849
90
  FAIL_ON_ERROR(_yr_ac_optimize_failure_links(automaton));
850
90
  FAIL_ON_ERROR(_yr_ac_build_transition_table(automaton));
851
852
90
  return ERROR_SUCCESS;
853
90
}
854
855
////////////////////////////////////////////////////////////////////////////////
856
// Prints automaton for debug purposes.
857
//
858
void yr_ac_print_automaton(YR_AC_AUTOMATON* automaton)
859
0
{
860
0
  printf("-------------------------------------------------------\n");
861
0
  _yr_ac_print_automaton_state(automaton, automaton->root);
862
0
  printf("-------------------------------------------------------\n");
863
0
}