Coverage Report

Created: 2026-01-17 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/yara/libyara/ahocorasick.c
Line
Count
Source
1
/*
2
Copyright (c) 2013. The YARA Authors. All Rights Reserved.
3
4
Redistribution and use in source and binary forms, with or without modification,
5
are permitted provided that the following conditions are met:
6
7
1. Redistributions of source code must retain the above copyright notice, this
8
list of conditions and the following disclaimer.
9
10
2. Redistributions in binary form must reproduce the above copyright notice,
11
this list of conditions and the following disclaimer in the documentation and/or
12
other materials provided with the distribution.
13
14
3. Neither the name of the copyright holder nor the names of its contributors
15
may be used to endorse or promote products derived from this software without
16
specific prior written permission.
17
18
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
*/
29
30
#include <assert.h>
31
#include <stddef.h>
32
#include <string.h>
33
#include <yara/ahocorasick.h>
34
#include <yara/arena.h>
35
#include <yara/compiler.h>
36
#include <yara/error.h>
37
#include <yara/mem.h>
38
#include <yara/utils.h>
39
40
typedef struct _QUEUE_NODE
41
{
42
  YR_AC_STATE* value;
43
44
  struct _QUEUE_NODE* previous;
45
  struct _QUEUE_NODE* next;
46
47
} QUEUE_NODE;
48
49
typedef struct _QUEUE
50
{
51
  QUEUE_NODE* head;
52
  QUEUE_NODE* tail;
53
54
} QUEUE;
55
56
////////////////////////////////////////////////////////////////////////////////
57
// Pushes an automaton state into the tail of a queue.
58
//
59
// Args:
60
//   queue: Pointer to the queue.
61
//   state: Pointer to the state being pushed into the queue.
62
//
63
// Returns:
64
//   ERROR_SUCCESS
65
//   ERROR_INSUFFICIENT_MEMORY
66
//
67
static int _yr_ac_queue_push(QUEUE* queue, YR_AC_STATE* state)
68
502k
{
69
502k
  QUEUE_NODE* pushed_node;
70
71
502k
  pushed_node = (QUEUE_NODE*) yr_malloc(sizeof(QUEUE_NODE));
72
73
502k
  if (pushed_node == NULL)
74
0
    return ERROR_INSUFFICIENT_MEMORY;
75
76
502k
  pushed_node->previous = queue->tail;
77
502k
  pushed_node->next = NULL;
78
502k
  pushed_node->value = state;
79
80
502k
  if (queue->tail != NULL)
81
502k
    queue->tail->next = pushed_node;
82
111
  else  // queue is empty
83
111
    queue->head = pushed_node;
84
85
502k
  queue->tail = pushed_node;
86
87
502k
  return ERROR_SUCCESS;
88
502k
}
89
90
////////////////////////////////////////////////////////////////////////////////
91
// Pops an automaton state from the head of a queue.
92
//
93
// Args:
94
//   queue: Pointer to the queue.
95
//
96
// Returns:
97
//   Pointer to the poped state.
98
//
99
static YR_AC_STATE* _yr_ac_queue_pop(QUEUE* queue)
100
502k
{
101
502k
  YR_AC_STATE* result;
102
502k
  QUEUE_NODE* popped_node;
103
104
502k
  if (queue->head == NULL)
105
0
    return NULL;
106
107
502k
  popped_node = queue->head;
108
502k
  queue->head = popped_node->next;
109
110
502k
  if (queue->head)
111
502k
    queue->head->previous = NULL;
112
111
  else  // queue is empty
113
111
    queue->tail = NULL;
114
115
502k
  result = popped_node->value;
116
117
502k
  yr_free(popped_node);
118
502k
  return result;
119
502k
}
120
121
////////////////////////////////////////////////////////////////////////////////
122
// Checks if a queue is empty.
123
//
124
// Args:
125
//   queue: Pointer to the queue.
126
//
127
// Returns:
128
//   true if queue is empty, false otherwise.
129
//
130
static int _yr_ac_queue_is_empty(QUEUE* queue)
131
502k
{
132
502k
  return queue->head == NULL;
133
502k
}
134
135
////////////////////////////////////////////////////////////////////////////////
136
// Given an automaton state and an input symbol, returns the new state
137
// after reading the input symbol.
138
//
139
// Args:
140
//    state: Pointer to automaton state.
141
//    input: Input symbol.
142
//
143
// Returns:
144
//   Pointer to the next automaton state.
145
//
146
static YR_AC_STATE* _yr_ac_next_state(YR_AC_STATE* state, uint8_t input)
147
6.81M
{
148
6.81M
  YR_AC_STATE* next_state = state->first_child;
149
150
313M
  while (next_state != NULL)
151
312M
  {
152
312M
    if (next_state->input == input)
153
6.32M
      return next_state;
154
155
306M
    next_state = next_state->siblings;
156
306M
  }
157
158
487k
  return NULL;
159
6.81M
}
160
161
////////////////////////////////////////////////////////////////////////////////
162
// Creates a new automaton state, the automaton will transition from
163
// the given state to the new state after reading the input symbol.
164
//
165
// Args:
166
//   state: Pointer to the origin state.
167
//   input: Input symbol.
168
//
169
// Returns:
170
//   YR_AC_STATE* pointer to the newly allocated state or NULL in case
171
//   of error.
172
//
173
static YR_AC_STATE* _yr_ac_state_create(YR_AC_STATE* state, uint8_t input)
174
392k
{
175
392k
  YR_AC_STATE* new_state = (YR_AC_STATE*) yr_malloc(sizeof(YR_AC_STATE));
176
177
392k
  if (new_state == NULL)
178
0
    return NULL;
179
180
392k
  new_state->input = input;
181
392k
  new_state->depth = state->depth + 1;
182
392k
  new_state->matches_ref = YR_ARENA_NULL_REF;
183
392k
  new_state->failure = NULL;
184
392k
  new_state->t_table_slot = 0;
185
392k
  new_state->first_child = NULL;
186
392k
  new_state->siblings = state->first_child;
187
392k
  state->first_child = new_state;
188
189
392k
  return new_state;
190
392k
}
191
192
////////////////////////////////////////////////////////////////////////////////
193
// Destroys an automaton state.
194
//
195
static int _yr_ac_state_destroy(YR_AC_STATE* state)
196
394k
{
197
394k
  YR_AC_STATE* child_state = state->first_child;
198
199
786k
  while (child_state != NULL)
200
392k
  {
201
392k
    YR_AC_STATE* next_child_state = child_state->siblings;
202
392k
    _yr_ac_state_destroy(child_state);
203
392k
    child_state = next_child_state;
204
392k
  }
205
206
394k
  yr_free(state);
207
208
394k
  return ERROR_SUCCESS;
209
394k
}
210
211
////////////////////////////////////////////////////////////////////////////////
212
// Create failure links for each automaton state.
213
//
214
// This function must be called after all the strings have been added to the
215
// automaton with yr_ac_add_string.
216
//
217
static int _yr_ac_create_failure_links(YR_AC_AUTOMATON* automaton)
218
116
{
219
116
  YR_AC_STATE* current_state;
220
116
  YR_AC_STATE* failure_state;
221
116
  YR_AC_STATE* temp_state;
222
116
  YR_AC_STATE* state;
223
116
  YR_AC_STATE* transition_state;
224
116
  YR_AC_STATE* root_state;
225
116
  YR_AC_MATCH* match;
226
227
116
  QUEUE queue;
228
229
116
  queue.head = NULL;
230
116
  queue.tail = NULL;
231
232
116
  root_state = automaton->root;
233
234
  // Set the failure link of root state to itself.
235
116
  root_state->failure = root_state;
236
237
  // Push root's children and set their failure link to root.
238
116
  state = root_state->first_child;
239
240
662
  while (state != NULL)
241
546
  {
242
546
    FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
243
546
    state->failure = root_state;
244
546
    state = state->siblings;
245
546
  }
246
247
  // Traverse the trie in BFS order calculating the failure link
248
  // for each state.
249
167k
  while (!_yr_ac_queue_is_empty(&queue))
250
167k
  {
251
167k
    current_state = _yr_ac_queue_pop(&queue);
252
167k
    match = yr_arena_ref_to_ptr(automaton->arena, &current_state->matches_ref);
253
254
167k
    if (match != NULL)
255
114k
    {
256
      // Find the last match in the list of matches.
257
1.44M
      while (match->next != NULL) match = match->next;
258
259
114k
      if (match->backtrack > 0)
260
108k
        match->next = yr_arena_ref_to_ptr(
261
108k
            automaton->arena, &root_state->matches_ref);
262
114k
    }
263
52.7k
    else
264
52.7k
    {
265
      // This state doesn't have any matches, its matches will be those
266
      // in the root state, if any.
267
52.7k
      current_state->matches_ref = root_state->matches_ref;
268
52.7k
    }
269
270
    // Iterate over all the states that the current state can transition to.
271
167k
    transition_state = current_state->first_child;
272
273
334k
    while (transition_state != NULL)
274
166k
    {
275
166k
      FAIL_ON_ERROR(_yr_ac_queue_push(&queue, transition_state));
276
166k
      failure_state = current_state->failure;
277
278
218k
      while (1)
279
218k
      {
280
218k
        temp_state = _yr_ac_next_state(failure_state, transition_state->input);
281
282
218k
        if (temp_state != NULL)
283
123k
        {
284
123k
          transition_state->failure = temp_state;
285
286
123k
          if (YR_ARENA_IS_NULL_REF(transition_state->matches_ref))
287
31.5k
          {
288
31.5k
            transition_state->matches_ref = temp_state->matches_ref;
289
31.5k
          }
290
91.8k
          else
291
91.8k
          {
292
91.8k
            match = yr_arena_ref_to_ptr(
293
91.8k
                automaton->arena, &transition_state->matches_ref);
294
295
91.8k
            assert(match != NULL);
296
297
            // Find the last match in the list of matches.
298
983k
            while (match->next != NULL) match = match->next;
299
300
91.8k
            match->next = yr_arena_ref_to_ptr(
301
91.8k
                automaton->arena, &temp_state->matches_ref);
302
91.8k
          }
303
304
123k
          break;
305
123k
        }
306
95.0k
        else
307
95.0k
        {
308
95.0k
          if (failure_state == root_state)
309
43.4k
          {
310
43.4k
            transition_state->failure = root_state;
311
43.4k
            break;
312
43.4k
          }
313
51.5k
          else
314
51.5k
          {
315
51.5k
            failure_state = failure_state->failure;
316
51.5k
          }
317
95.0k
        }
318
218k
      }  // while(1)
319
320
166k
      transition_state = transition_state->siblings;
321
166k
    }
322
323
167k
  }  // while(!__yr_ac_queue_is_empty(&queue))
324
325
116
  return ERROR_SUCCESS;
326
116
}
327
328
////////////////////////////////////////////////////////////////////////////////
329
// Returns true if the transitions for state s2 are a subset of the transitions
330
// for state s1. In other words, if at state s2 input X is accepted, it must be
331
// accepted in s1 too.
332
//
333
static bool _yr_ac_transitions_subset(YR_AC_STATE* s1, YR_AC_STATE* s2)
334
123k
{
335
123k
  uint8_t set[32];
336
337
123k
  YR_AC_STATE* state = s1->first_child;
338
339
123k
  memset(set, 0, 32);
340
341
229k
  while (state != NULL)
342
106k
  {
343
106k
    set[state->input / 8] |= 1 << state->input % 8;
344
106k
    state = state->siblings;
345
106k
  }
346
347
123k
  state = s2->first_child;
348
349
159k
  while (state != NULL)
350
137k
  {
351
137k
    if (!(set[state->input / 8] & 1 << state->input % 8))
352
101k
      return false;
353
354
36.1k
    state = state->siblings;
355
36.1k
  }
356
357
22.3k
  return true;
358
123k
}
359
360
////////////////////////////////////////////////////////////////////////////////
361
// Removes unnecessary failure links.
362
//
363
static int _yr_ac_optimize_failure_links(YR_AC_AUTOMATON* automaton)
364
116
{
365
116
  QUEUE queue = {NULL, NULL};
366
367
  // Push root's children.
368
116
  YR_AC_STATE* root_state = automaton->root;
369
116
  YR_AC_STATE* state = root_state->first_child;
370
371
662
  while (state != NULL)
372
546
  {
373
546
    FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
374
546
    state = state->siblings;
375
546
  }
376
377
167k
  while (!_yr_ac_queue_is_empty(&queue))
378
167k
  {
379
167k
    YR_AC_STATE* current_state = _yr_ac_queue_pop(&queue);
380
381
167k
    if (current_state->failure != root_state)
382
123k
    {
383
123k
      if (_yr_ac_transitions_subset(current_state, current_state->failure))
384
22.3k
        current_state->failure = current_state->failure->failure;
385
123k
    }
386
387
    // Push children of current_state
388
167k
    state = current_state->first_child;
389
390
334k
    while (state != NULL)
391
166k
    {
392
166k
      FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state));
393
166k
      state = state->siblings;
394
166k
    }
395
167k
  }
396
397
116
  return ERROR_SUCCESS;
398
116
}
399
400
////////////////////////////////////////////////////////////////////////////////
401
// Find a place within the automaton's transition table where the transitions
402
// for the given state can be put. The function first create a bitmask for the
403
// state's transition table, then searches for an offset within the automaton's
404
// bitmask where the state's bitmask can be put without bit collisions.
405
//
406
static int _yr_ac_find_suitable_transition_table_slot(
407
    YR_AC_AUTOMATON* automaton,
408
    YR_ARENA* arena,
409
    YR_AC_STATE* state,
410
    uint32_t* slot)
411
167k
{
412
  // The state's transition table has 257 entries, 1 for the failure link and
413
  // 256 for each possible input byte, so the state's bitmask has 257 bits.
414
167k
  YR_BITMASK state_bitmask[YR_BITMASK_SIZE(257)];
415
416
167k
  YR_AC_STATE* child_state = state->first_child;
417
418
  // Start with all bits set to zero.
419
167k
  yr_bitmask_clear_all(state_bitmask);
420
421
  // The first slot in the transition table is for the state's failure link,
422
  // so the first bit in the bitmask must be set to one.
423
167k
  yr_bitmask_set(state_bitmask, 0);
424
425
334k
  while (child_state != NULL)
426
166k
  {
427
166k
    yr_bitmask_set(state_bitmask, child_state->input + 1);
428
166k
    child_state = child_state->siblings;
429
166k
  }
430
431
167k
  *slot = yr_bitmask_find_non_colliding_offset(
432
167k
      automaton->bitmask,
433
167k
      state_bitmask,
434
167k
      automaton->tables_size,
435
167k
      257,
436
167k
      &automaton->t_table_unused_candidate);
437
438
  // Make sure that we are not going beyond the maximum size of the transition
439
  // table, starting at the slot found there must be at least 257 other slots
440
  // for accommodating the state's transition table.
441
167k
  assert(*slot + 257 < YR_AC_MAX_TRANSITION_TABLE_SIZE);
442
443
167k
  if (*slot > automaton->tables_size - 257)
444
1.28k
  {
445
1.28k
    FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
446
1.28k
        arena, YR_AC_TRANSITION_TABLE, 257 * sizeof(YR_AC_TRANSITION), NULL));
447
448
1.28k
    FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
449
1.28k
        arena, YR_AC_STATE_MATCHES_TABLE, 257 * sizeof(uint8_t*), NULL));
450
451
1.28k
    size_t bm_len = YR_BITMASK_SIZE(automaton->tables_size) *
452
1.28k
                    sizeof(YR_BITMASK);
453
454
1.28k
    size_t bm_len_incr = YR_BITMASK_SIZE(257) * sizeof(YR_BITMASK);
455
456
1.28k
    automaton->bitmask = yr_realloc(automaton->bitmask, bm_len + bm_len_incr);
457
458
1.28k
    if (automaton->bitmask == NULL)
459
0
      return ERROR_INSUFFICIENT_MEMORY;
460
461
1.28k
    memset((uint8_t*) automaton->bitmask + bm_len, 0, bm_len_incr);
462
463
1.28k
    automaton->tables_size += 257;
464
1.28k
  }
465
466
167k
  return ERROR_SUCCESS;
467
167k
}
468
469
////////////////////////////////////////////////////////////////////////////////
470
// Builds the transition table for the automaton. The transition table (T) is a
471
// large array of 32-bits integers. Each state in the automaton is represented
472
// by an index S within the array. The integer stored in T[S] is the failure
473
// link for state S, it contains the index of the next state when no valid
474
// transition exists for the next input byte.
475
//
476
// At position T[S+1+B] (where B is a byte) we can find the transition (if any)
477
// that must be followed from state S if the next input is B. The value in
478
// T[S+1+B] contains the index for next state or zero. A zero value means that
479
// no valid transition exists from state S when next input is B, and the failure
480
// link must be used instead.
481
//
482
// The transition table for state S starts at T[S] and spans the next 257
483
// slots in the array (1 for the failure link and 256 for all the possible
484
// transitions). But many of those slots are for invalid transitions, so
485
// the transitions for multiple states can be interleaved as long as they don't
486
// collide. For example, instead of having this transition table with state S1
487
// and S2 separated by a large number of slots:
488
//
489
// S1                                             S2
490
// +------+------+------+------+--   ~   --+------+------+------+--   ~   --+
491
// | FLS1 |   X  |   -  |   -  |     -     |  Y   | FLS2 |   Z  |     -     |
492
// +------+------+------+------+--   ~   --+------+------+------+--   ~   --+
493
//
494
// We can interleave the transitions for states S1 and S2 and get this other
495
// transition table, which is more compact:
496
//
497
// S1            S2
498
// +------+------+------+------+--   ~   --+------+
499
// | FLS1 |  X   | FLS2 |   Z  |     -     |  Y   |
500
// +------+------+------+------+--   ~   --+------+
501
//
502
// And how do we know that transition Z belongs to state S2 and not S1? Or that
503
// transition Y belongs to S1 and not S2? Because each slot of the array not
504
// only contains the index for the state where the transition points to, it
505
// also contains the offset of the transition relative to its owner state. So,
506
// the value for the owner offset would be 1 for transitions X, because X
507
// belongs to state S1 and it's located 1 position away from S1. The same occurs
508
// for Z, it belongs to S2 and it's located one position away from S2 so its
509
// owner offset is 1. If we are in S1 and next byte is 2, we are going to read
510
// the transition at T[S1+1+2] which is Z. But we know that transition Z is not
511
// a valid transition for state S1 because the owner offset for Z is 1 not 3.
512
//
513
// Each 32-bit slot in the transition table has 23 bits for storing the index
514
// of the target state and 9 bits for storing the offset of the slot relative
515
// to its own state. The offset can be any value from 0 to 256, both inclusive,
516
// hence 9 bits are required for it. The layout for the slot goes like:
517
//
518
// 32                      23        0
519
// +-----------------------+---------+
520
// | Target state's index  |  Offset |
521
// +-----------------------+---------+
522
//
523
// A more detailed description can be found in: http://goo.gl/lE6zG
524
//
525
static int _yr_ac_build_transition_table(YR_AC_AUTOMATON* automaton)
526
116
{
527
116
  YR_AC_TRANSITION* t_table;
528
116
  uint32_t* m_table;
529
116
  YR_AC_STATE* state;
530
116
  YR_AC_STATE* child_state;
531
116
  YR_AC_STATE* root_state = automaton->root;
532
533
116
  uint32_t slot;
534
535
116
  QUEUE queue = {NULL, NULL};
536
537
  // Both t_table and m_table have 512 slots initially, which is enough for the
538
  // root node's transition table.
539
116
  automaton->tables_size = 512;
540
541
116
  automaton->bitmask = yr_calloc(
542
116
      YR_BITMASK_SIZE(automaton->tables_size), sizeof(YR_BITMASK));
543
544
116
  if (automaton->bitmask == NULL)
545
0
    return ERROR_INSUFFICIENT_MEMORY;
546
547
116
  FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
548
116
      automaton->arena,
549
116
      YR_AC_TRANSITION_TABLE,
550
116
      automaton->tables_size * sizeof(YR_AC_TRANSITION),
551
116
      NULL));
552
553
116
  FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory(
554
116
      automaton->arena,
555
116
      YR_AC_STATE_MATCHES_TABLE,
556
116
      automaton->tables_size * sizeof(uint32_t),
557
116
      NULL));
558
559
116
  t_table = yr_arena_get_ptr(automaton->arena, YR_AC_TRANSITION_TABLE, 0);
560
116
  m_table = yr_arena_get_ptr(automaton->arena, YR_AC_STATE_MATCHES_TABLE, 0);
561
562
  // The failure link for the root node points to itself.
563
116
  t_table[0] = YR_AC_MAKE_TRANSITION(0, 0);
564
565
  // Initialize the entry corresponding to the root node in the match table.
566
  // Entries in this table are the index within YR_AC_MATCH_POOL where resides
567
  // the YR_AC_MATCH structure that corresponds to the head of the matches list
568
  // for the node. The indexes start counting at 1, the zero is used for
569
  // indicating that the node has no associated matches.
570
116
  if (!YR_ARENA_IS_NULL_REF(root_state->matches_ref))
571
7
    m_table[0] = root_state->matches_ref.offset / sizeof(YR_AC_MATCH) + 1;
572
573
  // Mark the first slot in the transition table as used.
574
116
  yr_bitmask_set(automaton->bitmask, 0);
575
576
  // Index 0 is for root node. Unused indexes start at 1.
577
116
  automaton->t_table_unused_candidate = 1;
578
579
116
  child_state = root_state->first_child;
580
581
662
  while (child_state != NULL)
582
546
  {
583
    // Each state stores its slot number.
584
546
    child_state->t_table_slot = child_state->input + 1;
585
586
546
    t_table[child_state->input + 1] = YR_AC_MAKE_TRANSITION(
587
546
        0, child_state->input + 1);
588
589
546
    yr_bitmask_set(automaton->bitmask, child_state->input + 1);
590
591
546
    FAIL_ON_ERROR(_yr_ac_queue_push(&queue, child_state));
592
546
    child_state = child_state->siblings;
593
546
  }
594
595
167k
  while (!_yr_ac_queue_is_empty(&queue))
596
167k
  {
597
167k
    state = _yr_ac_queue_pop(&queue);
598
599
167k
    FAIL_ON_ERROR(_yr_ac_find_suitable_transition_table_slot(
600
167k
        automaton, automaton->arena, state, &slot));
601
602
    // _yr_ac_find_suitable_transition_table_slot can allocate more space in
603
    // both tables and cause the tables to be moved to a different memory
604
    // location, we must get their up-to-date addresses.
605
167k
    t_table = yr_arena_get_ptr(automaton->arena, YR_AC_TRANSITION_TABLE, 0);
606
167k
    m_table = yr_arena_get_ptr(automaton->arena, YR_AC_STATE_MATCHES_TABLE, 0);
607
608
167k
    t_table[state->t_table_slot] |= (slot << YR_AC_SLOT_OFFSET_BITS);
609
167k
    t_table[slot] = YR_AC_MAKE_TRANSITION(state->failure->t_table_slot, 0);
610
611
    // The match table is an array of indexes within YR_AC_MATCHES_POOL. The
612
    // N-th item in the array is the index for the YR_AC_MATCH structure that
613
    // represents the head of the matches list for state N. The indexes start
614
    // at 1, the 0 indicates that there are no matches for the state.
615
167k
    if (YR_ARENA_IS_NULL_REF(state->matches_ref))
616
49.3k
      m_table[slot] = 0;
617
118k
    else
618
118k
      m_table[slot] = state->matches_ref.offset / sizeof(YR_AC_MATCH) + 1;
619
620
167k
    state->t_table_slot = slot;
621
622
167k
    yr_bitmask_set(automaton->bitmask, slot);
623
624
    // Push children of current_state
625
167k
    child_state = state->first_child;
626
627
334k
    while (child_state != NULL)
628
166k
    {
629
166k
      child_state->t_table_slot = slot + child_state->input + 1;
630
631
166k
      t_table[child_state->t_table_slot] = YR_AC_MAKE_TRANSITION(
632
166k
          0, child_state->input + 1);
633
634
166k
      yr_bitmask_set(automaton->bitmask, child_state->t_table_slot);
635
636
166k
      FAIL_ON_ERROR(_yr_ac_queue_push(&queue, child_state));
637
638
166k
      child_state = child_state->siblings;
639
166k
    }
640
167k
  }
641
642
116
  return ERROR_SUCCESS;
643
116
}
644
645
////////////////////////////////////////////////////////////////////////////////
646
// Prints automaton state for debug purposes. This function is invoked by
647
// yr_ac_print_automaton, is not intended to be used stand-alone.
648
//
649
static void _yr_ac_print_automaton_state(
650
    YR_AC_AUTOMATON* automaton,
651
    YR_AC_STATE* state)
652
0
{
653
0
  int child_count;
654
655
0
  YR_AC_MATCH* match;
656
0
  YR_AC_STATE* child_state;
657
658
0
  for (int i = 0; i < state->depth; i++) printf(" ");
659
660
0
  child_state = state->first_child;
661
0
  child_count = 0;
662
663
0
  while (child_state != NULL)
664
0
  {
665
0
    child_count++;
666
0
    child_state = child_state->siblings;
667
0
  }
668
669
0
  printf(
670
0
      "%p childs:%d depth:%d failure:%p",
671
0
      state,
672
0
      child_count,
673
0
      state->depth,
674
0
      state->failure);
675
676
0
  match = yr_arena_ref_to_ptr(automaton->arena, &state->matches_ref);
677
678
0
  while (match != NULL)
679
0
  {
680
0
    printf("\n");
681
682
0
    for (int i = 0; i < state->depth + 1; i++) printf(" ");
683
684
0
    printf("%s = ", match->string->identifier);
685
686
0
    if (STRING_IS_HEX(match->string))
687
0
    {
688
0
      printf("{ ");
689
690
0
      for (int i = 0; i < yr_min(match->string->length, 10); i++)
691
0
        printf("%02x ", match->string->string[i]);
692
693
0
      printf("}");
694
0
    }
695
0
    else if (STRING_IS_REGEXP(match->string))
696
0
    {
697
0
      printf("/");
698
699
0
      for (int i = 0; i < yr_min(match->string->length, 10); i++)
700
0
        printf("%c", match->string->string[i]);
701
702
0
      printf("/");
703
0
    }
704
0
    else
705
0
    {
706
0
      printf("\"");
707
708
0
      for (int i = 0; i < yr_min(match->string->length, 10); i++)
709
0
        printf("%c", match->string->string[i]);
710
711
0
      printf("\"");
712
0
    }
713
714
0
    match = match->next;
715
0
  }
716
717
0
  printf("\n");
718
719
0
  child_state = state->first_child;
720
721
0
  while (child_state != NULL)
722
0
  {
723
0
    _yr_ac_print_automaton_state(automaton, child_state);
724
0
    child_state = child_state->siblings;
725
0
  }
726
0
}
727
728
////////////////////////////////////////////////////////////////////////////////
729
// Creates a new automaton
730
//
731
int yr_ac_automaton_create(YR_ARENA* arena, YR_AC_AUTOMATON** automaton)
732
1.74k
{
733
1.74k
  YR_AC_AUTOMATON* new_automaton;
734
1.74k
  YR_AC_STATE* root_state;
735
736
1.74k
  new_automaton = (YR_AC_AUTOMATON*) yr_malloc(sizeof(YR_AC_AUTOMATON));
737
1.74k
  root_state = (YR_AC_STATE*) yr_malloc(sizeof(YR_AC_STATE));
738
739
1.74k
  if (new_automaton == NULL || root_state == NULL)
740
0
  {
741
0
    yr_free(new_automaton);
742
0
    yr_free(root_state);
743
744
0
    return ERROR_INSUFFICIENT_MEMORY;
745
0
  }
746
747
1.74k
  root_state->depth = 0;
748
1.74k
  root_state->matches_ref = YR_ARENA_NULL_REF;
749
1.74k
  root_state->failure = NULL;
750
1.74k
  root_state->first_child = NULL;
751
1.74k
  root_state->siblings = NULL;
752
1.74k
  root_state->t_table_slot = 0;
753
754
1.74k
  new_automaton->arena = arena;
755
1.74k
  new_automaton->root = root_state;
756
1.74k
  new_automaton->bitmask = NULL;
757
1.74k
  new_automaton->tables_size = 0;
758
759
1.74k
  *automaton = new_automaton;
760
761
1.74k
  return ERROR_SUCCESS;
762
1.74k
}
763
764
////////////////////////////////////////////////////////////////////////////////
765
// Destroys automaton
766
//
767
int yr_ac_automaton_destroy(YR_AC_AUTOMATON* automaton)
768
1.74k
{
769
1.74k
  _yr_ac_state_destroy(automaton->root);
770
771
1.74k
  yr_free(automaton->bitmask);
772
1.74k
  yr_free(automaton);
773
774
1.74k
  return ERROR_SUCCESS;
775
1.74k
}
776
777
////////////////////////////////////////////////////////////////////////////////
778
// Adds a string to the automaton. This function is invoked once for each
779
// string defined in the rules.
780
//
781
int yr_ac_add_string(
782
    YR_AC_AUTOMATON* automaton,
783
    YR_STRING* string,
784
    uint32_t string_idx,
785
    YR_ATOM_LIST_ITEM* atom,
786
    YR_ARENA* arena)
787
27.5k
{
788
1.77M
  while (atom != NULL)
789
1.74M
  {
790
1.74M
    YR_AC_STATE* state = automaton->root;
791
792
8.33M
    for (int i = 0; i < atom->atom.length; i++)
793
6.59M
    {
794
6.59M
      YR_AC_STATE* next_state = _yr_ac_next_state(state, atom->atom.bytes[i]);
795
796
6.59M
      if (next_state == NULL)
797
392k
      {
798
392k
        next_state = _yr_ac_state_create(state, atom->atom.bytes[i]);
799
800
392k
        if (next_state == NULL)
801
0
          return ERROR_INSUFFICIENT_MEMORY;
802
392k
      }
803
804
6.59M
      state = next_state;
805
6.59M
    }
806
807
1.74M
    YR_ARENA_REF new_match_ref;
808
809
1.74M
    FAIL_ON_ERROR(yr_arena_allocate_struct(
810
1.74M
        arena,
811
1.74M
        YR_AC_STATE_MATCHES_POOL,
812
1.74M
        sizeof(YR_AC_MATCH),
813
1.74M
        &new_match_ref,
814
1.74M
        offsetof(YR_AC_MATCH, string),
815
1.74M
        offsetof(YR_AC_MATCH, forward_code),
816
1.74M
        offsetof(YR_AC_MATCH, backward_code),
817
1.74M
        offsetof(YR_AC_MATCH, next),
818
1.74M
        EOL));
819
820
1.74M
    YR_AC_MATCH* new_match = yr_arena_ref_to_ptr(arena, &new_match_ref);
821
822
1.74M
    new_match->backtrack = state->depth + atom->backtrack;
823
1.74M
    new_match->string = yr_arena_get_ptr(
824
1.74M
        arena, YR_STRINGS_TABLE, string_idx * sizeof(struct YR_STRING));
825
826
1.74M
    new_match->forward_code = yr_arena_ref_to_ptr(
827
1.74M
        arena, &atom->forward_code_ref);
828
829
1.74M
    new_match->backward_code = yr_arena_ref_to_ptr(
830
1.74M
        arena, &atom->backward_code_ref);
831
832
    // Add newly created match to the list of matches for the state.
833
1.74M
    new_match->next = yr_arena_ref_to_ptr(arena, &state->matches_ref);
834
1.74M
    state->matches_ref = new_match_ref;
835
836
1.74M
    atom = atom->next;
837
1.74M
  }
838
839
27.5k
  return ERROR_SUCCESS;
840
27.5k
}
841
842
////////////////////////////////////////////////////////////////////////////////
843
// Compiles the Aho-Corasick automaton, the resulting data structures are
844
// are written in the provided arena.
845
//
846
int yr_ac_compile(YR_AC_AUTOMATON* automaton, YR_ARENA* arena)
847
116
{
848
116
  FAIL_ON_ERROR(_yr_ac_create_failure_links(automaton));
849
116
  FAIL_ON_ERROR(_yr_ac_optimize_failure_links(automaton));
850
116
  FAIL_ON_ERROR(_yr_ac_build_transition_table(automaton));
851
852
116
  return ERROR_SUCCESS;
853
116
}
854
855
////////////////////////////////////////////////////////////////////////////////
856
// Prints automaton for debug purposes.
857
//
858
void yr_ac_print_automaton(YR_AC_AUTOMATON* automaton)
859
0
{
860
0
  printf("-------------------------------------------------------\n");
861
0
  _yr_ac_print_automaton_state(automaton, automaton->root);
862
0
  printf("-------------------------------------------------------\n");
863
0
}