/src/yara/libyara/ahocorasick.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | Copyright (c) 2013. The YARA Authors. All Rights Reserved. |
3 | | |
4 | | Redistribution and use in source and binary forms, with or without modification, |
5 | | are permitted provided that the following conditions are met: |
6 | | |
7 | | 1. Redistributions of source code must retain the above copyright notice, this |
8 | | list of conditions and the following disclaimer. |
9 | | |
10 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
11 | | this list of conditions and the following disclaimer in the documentation and/or |
12 | | other materials provided with the distribution. |
13 | | |
14 | | 3. Neither the name of the copyright holder nor the names of its contributors |
15 | | may be used to endorse or promote products derived from this software without |
16 | | specific prior written permission. |
17 | | |
18 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
19 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
20 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
21 | | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR |
22 | | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
23 | | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
24 | | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON |
25 | | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
26 | | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
27 | | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
28 | | */ |
29 | | |
30 | | #include <assert.h> |
31 | | #include <stddef.h> |
32 | | #include <string.h> |
33 | | #include <yara/ahocorasick.h> |
34 | | #include <yara/arena.h> |
35 | | #include <yara/compiler.h> |
36 | | #include <yara/error.h> |
37 | | #include <yara/mem.h> |
38 | | #include <yara/utils.h> |
39 | | |
40 | | typedef struct _QUEUE_NODE |
41 | | { |
42 | | YR_AC_STATE* value; |
43 | | |
44 | | struct _QUEUE_NODE* previous; |
45 | | struct _QUEUE_NODE* next; |
46 | | |
47 | | } QUEUE_NODE; |
48 | | |
49 | | typedef struct _QUEUE |
50 | | { |
51 | | QUEUE_NODE* head; |
52 | | QUEUE_NODE* tail; |
53 | | |
54 | | } QUEUE; |
55 | | |
56 | | //////////////////////////////////////////////////////////////////////////////// |
57 | | // Pushes an automaton state into the tail of a queue. |
58 | | // |
59 | | // Args: |
60 | | // queue: Pointer to the queue. |
61 | | // state: Pointer to the state being pushed into the queue. |
62 | | // |
63 | | // Returns: |
64 | | // ERROR_SUCCESS |
65 | | // ERROR_INSUFFICIENT_MEMORY |
66 | | // |
67 | | static int _yr_ac_queue_push(QUEUE* queue, YR_AC_STATE* state) |
68 | 1.06M | { |
69 | 1.06M | QUEUE_NODE* pushed_node; |
70 | | |
71 | 1.06M | pushed_node = (QUEUE_NODE*) yr_malloc(sizeof(QUEUE_NODE)); |
72 | | |
73 | 1.06M | if (pushed_node == NULL) |
74 | 0 | return ERROR_INSUFFICIENT_MEMORY; |
75 | | |
76 | 1.06M | pushed_node->previous = queue->tail; |
77 | 1.06M | pushed_node->next = NULL; |
78 | 1.06M | pushed_node->value = state; |
79 | | |
80 | 1.06M | if (queue->tail != NULL) |
81 | 1.06M | queue->tail->next = pushed_node; |
82 | 171 | else // queue is empty |
83 | 171 | queue->head = pushed_node; |
84 | | |
85 | 1.06M | queue->tail = pushed_node; |
86 | | |
87 | 1.06M | return ERROR_SUCCESS; |
88 | 1.06M | } |
89 | | |
90 | | //////////////////////////////////////////////////////////////////////////////// |
91 | | // Pops an automaton state from the head of a queue. |
92 | | // |
93 | | // Args: |
94 | | // queue: Pointer to the queue. |
95 | | // |
96 | | // Returns: |
97 | | // Pointer to the poped state. |
98 | | // |
99 | | static YR_AC_STATE* _yr_ac_queue_pop(QUEUE* queue) |
100 | 1.06M | { |
101 | 1.06M | YR_AC_STATE* result; |
102 | 1.06M | QUEUE_NODE* popped_node; |
103 | | |
104 | 1.06M | if (queue->head == NULL) |
105 | 0 | return NULL; |
106 | | |
107 | 1.06M | popped_node = queue->head; |
108 | 1.06M | queue->head = popped_node->next; |
109 | | |
110 | 1.06M | if (queue->head) |
111 | 1.06M | queue->head->previous = NULL; |
112 | 171 | else // queue is empty |
113 | 171 | queue->tail = NULL; |
114 | | |
115 | 1.06M | result = popped_node->value; |
116 | | |
117 | 1.06M | yr_free(popped_node); |
118 | 1.06M | return result; |
119 | 1.06M | } |
120 | | |
121 | | //////////////////////////////////////////////////////////////////////////////// |
122 | | // Checks if a queue is empty. |
123 | | // |
124 | | // Args: |
125 | | // queue: Pointer to the queue. |
126 | | // |
127 | | // Returns: |
128 | | // true if queue is empty, false otherwise. |
129 | | // |
130 | | static int _yr_ac_queue_is_empty(QUEUE* queue) |
131 | 1.06M | { |
132 | 1.06M | return queue->head == NULL; |
133 | 1.06M | } |
134 | | |
135 | | //////////////////////////////////////////////////////////////////////////////// |
136 | | // Given an automaton state and an input symbol, returns the new state |
137 | | // after reading the input symbol. |
138 | | // |
139 | | // Args: |
140 | | // state: Pointer to automaton state. |
141 | | // input: Input symbol. |
142 | | // |
143 | | // Returns: |
144 | | // Pointer to the next automaton state. |
145 | | // |
146 | | static YR_AC_STATE* _yr_ac_next_state(YR_AC_STATE* state, uint8_t input) |
147 | 14.4M | { |
148 | 14.4M | YR_AC_STATE* next_state = state->first_child; |
149 | | |
150 | 551M | while (next_state != NULL) |
151 | 550M | { |
152 | 550M | if (next_state->input == input) |
153 | 13.6M | return next_state; |
154 | | |
155 | 537M | next_state = next_state->siblings; |
156 | 537M | } |
157 | | |
158 | 783k | return NULL; |
159 | 14.4M | } |
160 | | |
161 | | //////////////////////////////////////////////////////////////////////////////// |
162 | | // Creates a new automaton state, the automaton will transition from |
163 | | // the given state to the new state after reading the input symbol. |
164 | | // |
165 | | // Args: |
166 | | // state: Pointer to the origin state. |
167 | | // input: Input symbol. |
168 | | // |
169 | | // Returns: |
170 | | // YR_AC_STATE* pointer to the newly allocated state or NULL in case |
171 | | // of error. |
172 | | // |
173 | | static YR_AC_STATE* _yr_ac_state_create(YR_AC_STATE* state, uint8_t input) |
174 | 627k | { |
175 | 627k | YR_AC_STATE* new_state = (YR_AC_STATE*) yr_malloc(sizeof(YR_AC_STATE)); |
176 | | |
177 | 627k | if (new_state == NULL) |
178 | 0 | return NULL; |
179 | | |
180 | 627k | new_state->input = input; |
181 | 627k | new_state->depth = state->depth + 1; |
182 | 627k | new_state->matches_ref = YR_ARENA_NULL_REF; |
183 | 627k | new_state->failure = NULL; |
184 | 627k | new_state->t_table_slot = 0; |
185 | 627k | new_state->first_child = NULL; |
186 | 627k | new_state->siblings = state->first_child; |
187 | 627k | state->first_child = new_state; |
188 | | |
189 | 627k | return new_state; |
190 | 627k | } |
191 | | |
192 | | //////////////////////////////////////////////////////////////////////////////// |
193 | | // Destroys an automaton state. |
194 | | // |
195 | | static int _yr_ac_state_destroy(YR_AC_STATE* state) |
196 | 629k | { |
197 | 629k | YR_AC_STATE* child_state = state->first_child; |
198 | | |
199 | 1.25M | while (child_state != NULL) |
200 | 627k | { |
201 | 627k | YR_AC_STATE* next_child_state = child_state->siblings; |
202 | 627k | _yr_ac_state_destroy(child_state); |
203 | 627k | child_state = next_child_state; |
204 | 627k | } |
205 | | |
206 | 629k | yr_free(state); |
207 | | |
208 | 629k | return ERROR_SUCCESS; |
209 | 629k | } |
210 | | |
211 | | //////////////////////////////////////////////////////////////////////////////// |
212 | | // Create failure links for each automaton state. |
213 | | // |
214 | | // This function must be called after all the strings have been added to the |
215 | | // automaton with yr_ac_add_string. |
216 | | // |
217 | | static int _yr_ac_create_failure_links(YR_AC_AUTOMATON* automaton) |
218 | 125 | { |
219 | 125 | YR_AC_STATE* current_state; |
220 | 125 | YR_AC_STATE* failure_state; |
221 | 125 | YR_AC_STATE* temp_state; |
222 | 125 | YR_AC_STATE* state; |
223 | 125 | YR_AC_STATE* transition_state; |
224 | 125 | YR_AC_STATE* root_state; |
225 | 125 | YR_AC_MATCH* match; |
226 | | |
227 | 125 | QUEUE queue; |
228 | | |
229 | 125 | queue.head = NULL; |
230 | 125 | queue.tail = NULL; |
231 | | |
232 | 125 | root_state = automaton->root; |
233 | | |
234 | | // Set the failure link of root state to itself. |
235 | 125 | root_state->failure = root_state; |
236 | | |
237 | | // Push root's children and set their failure link to root. |
238 | 125 | state = root_state->first_child; |
239 | | |
240 | 1.96k | while (state != NULL) |
241 | 1.83k | { |
242 | 1.83k | FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state)); |
243 | 1.83k | state->failure = root_state; |
244 | 1.83k | state = state->siblings; |
245 | 1.83k | } |
246 | | |
247 | | // Traverse the trie in BFS order calculating the failure link |
248 | | // for each state. |
249 | 353k | while (!_yr_ac_queue_is_empty(&queue)) |
250 | 353k | { |
251 | 353k | current_state = _yr_ac_queue_pop(&queue); |
252 | 353k | match = yr_arena_ref_to_ptr(automaton->arena, ¤t_state->matches_ref); |
253 | | |
254 | 353k | if (match != NULL) |
255 | 243k | { |
256 | | // Find the last match in the list of matches. |
257 | 3.85M | while (match->next != NULL) match = match->next; |
258 | | |
259 | 243k | if (match->backtrack > 0) |
260 | 221k | match->next = yr_arena_ref_to_ptr( |
261 | 221k | automaton->arena, &root_state->matches_ref); |
262 | 243k | } |
263 | 110k | else |
264 | 110k | { |
265 | | // This state doesn't have any matches, its matches will be those |
266 | | // in the root state, if any. |
267 | 110k | current_state->matches_ref = root_state->matches_ref; |
268 | 110k | } |
269 | | |
270 | | // Iterate over all the states that the current state can transition to. |
271 | 353k | transition_state = current_state->first_child; |
272 | | |
273 | 705k | while (transition_state != NULL) |
274 | 351k | { |
275 | 351k | FAIL_ON_ERROR(_yr_ac_queue_push(&queue, transition_state)); |
276 | 351k | failure_state = current_state->failure; |
277 | | |
278 | 427k | while (1) |
279 | 427k | { |
280 | 427k | temp_state = _yr_ac_next_state(failure_state, transition_state->input); |
281 | | |
282 | 427k | if (temp_state != NULL) |
283 | 270k | { |
284 | 270k | transition_state->failure = temp_state; |
285 | | |
286 | 270k | if (YR_ARENA_IS_NULL_REF(transition_state->matches_ref)) |
287 | 62.6k | { |
288 | 62.6k | transition_state->matches_ref = temp_state->matches_ref; |
289 | 62.6k | } |
290 | 208k | else |
291 | 208k | { |
292 | 208k | match = yr_arena_ref_to_ptr( |
293 | 208k | automaton->arena, &transition_state->matches_ref); |
294 | | |
295 | 208k | assert(match != NULL); |
296 | | |
297 | | // Find the last match in the list of matches. |
298 | 2.72M | while (match->next != NULL) match = match->next; |
299 | | |
300 | 208k | match->next = yr_arena_ref_to_ptr( |
301 | 208k | automaton->arena, &temp_state->matches_ref); |
302 | 208k | } |
303 | | |
304 | 270k | break; |
305 | 270k | } |
306 | 156k | else |
307 | 156k | { |
308 | 156k | if (failure_state == root_state) |
309 | 80.9k | { |
310 | 80.9k | transition_state->failure = root_state; |
311 | 80.9k | break; |
312 | 80.9k | } |
313 | 75.7k | else |
314 | 75.7k | { |
315 | 75.7k | failure_state = failure_state->failure; |
316 | 75.7k | } |
317 | 156k | } |
318 | 427k | } // while(1) |
319 | | |
320 | 351k | transition_state = transition_state->siblings; |
321 | 351k | } |
322 | | |
323 | 353k | } // while(!__yr_ac_queue_is_empty(&queue)) |
324 | | |
325 | 125 | return ERROR_SUCCESS; |
326 | 125 | } |
327 | | |
328 | | //////////////////////////////////////////////////////////////////////////////// |
329 | | // Returns true if the transitions for state s2 are a subset of the transitions |
330 | | // for state s1. In other words, if at state s2 input X is accepted, it must be |
331 | | // accepted in s1 too. |
332 | | // |
333 | | static bool _yr_ac_transitions_subset(YR_AC_STATE* s1, YR_AC_STATE* s2) |
334 | 270k | { |
335 | 270k | uint8_t set[32]; |
336 | | |
337 | 270k | YR_AC_STATE* state = s1->first_child; |
338 | | |
339 | 270k | memset(set, 0, 32); |
340 | | |
341 | 457k | while (state != NULL) |
342 | 186k | { |
343 | 186k | set[state->input / 8] |= 1 << state->input % 8; |
344 | 186k | state = state->siblings; |
345 | 186k | } |
346 | | |
347 | 270k | state = s2->first_child; |
348 | | |
349 | 347k | while (state != NULL) |
350 | 298k | { |
351 | 298k | if (!(set[state->input / 8] & 1 << state->input % 8)) |
352 | 221k | return false; |
353 | | |
354 | 76.6k | state = state->siblings; |
355 | 76.6k | } |
356 | | |
357 | 49.0k | return true; |
358 | 270k | } |
359 | | |
360 | | //////////////////////////////////////////////////////////////////////////////// |
361 | | // Removes unnecessary failure links. |
362 | | // |
363 | | static int _yr_ac_optimize_failure_links(YR_AC_AUTOMATON* automaton) |
364 | 125 | { |
365 | 125 | QUEUE queue = {NULL, NULL}; |
366 | | |
367 | | // Push root's children. |
368 | 125 | YR_AC_STATE* root_state = automaton->root; |
369 | 125 | YR_AC_STATE* state = root_state->first_child; |
370 | | |
371 | 1.96k | while (state != NULL) |
372 | 1.83k | { |
373 | 1.83k | FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state)); |
374 | 1.83k | state = state->siblings; |
375 | 1.83k | } |
376 | | |
377 | 353k | while (!_yr_ac_queue_is_empty(&queue)) |
378 | 353k | { |
379 | 353k | YR_AC_STATE* current_state = _yr_ac_queue_pop(&queue); |
380 | | |
381 | 353k | if (current_state->failure != root_state) |
382 | 270k | { |
383 | 270k | if (_yr_ac_transitions_subset(current_state, current_state->failure)) |
384 | 49.0k | current_state->failure = current_state->failure->failure; |
385 | 270k | } |
386 | | |
387 | | // Push children of current_state |
388 | 353k | state = current_state->first_child; |
389 | | |
390 | 705k | while (state != NULL) |
391 | 351k | { |
392 | 351k | FAIL_ON_ERROR(_yr_ac_queue_push(&queue, state)); |
393 | 351k | state = state->siblings; |
394 | 351k | } |
395 | 353k | } |
396 | | |
397 | 125 | return ERROR_SUCCESS; |
398 | 125 | } |
399 | | |
400 | | //////////////////////////////////////////////////////////////////////////////// |
401 | | // Find a place within the automaton's transition table where the transitions |
402 | | // for the given state can be put. The function first create a bitmask for the |
403 | | // state's transition table, then searches for an offset within the automaton's |
404 | | // bitmask where the state's bitmask can be put without bit collisions. |
405 | | // |
406 | | static int _yr_ac_find_suitable_transition_table_slot( |
407 | | YR_AC_AUTOMATON* automaton, |
408 | | YR_ARENA* arena, |
409 | | YR_AC_STATE* state, |
410 | | uint32_t* slot) |
411 | 353k | { |
412 | | // The state's transition table has 257 entries, 1 for the failure link and |
413 | | // 256 for each possible input byte, so the state's bitmask has 257 bits. |
414 | 353k | YR_BITMASK state_bitmask[YR_BITMASK_SIZE(257)]; |
415 | | |
416 | 353k | YR_AC_STATE* child_state = state->first_child; |
417 | | |
418 | | // Start with all bits set to zero. |
419 | 353k | yr_bitmask_clear_all(state_bitmask); |
420 | | |
421 | | // The first slot in the transition table is for the state's failure link, |
422 | | // so the first bit in the bitmask must be set to one. |
423 | 353k | yr_bitmask_set(state_bitmask, 0); |
424 | | |
425 | 705k | while (child_state != NULL) |
426 | 351k | { |
427 | 351k | yr_bitmask_set(state_bitmask, child_state->input + 1); |
428 | 351k | child_state = child_state->siblings; |
429 | 351k | } |
430 | | |
431 | 353k | *slot = yr_bitmask_find_non_colliding_offset( |
432 | 353k | automaton->bitmask, |
433 | 353k | state_bitmask, |
434 | 353k | automaton->tables_size, |
435 | 353k | 257, |
436 | 353k | &automaton->t_table_unused_candidate); |
437 | | |
438 | | // Make sure that we are not going beyond the maximum size of the transition |
439 | | // table, starting at the slot found there must be at least 257 other slots |
440 | | // for accommodating the state's transition table. |
441 | 353k | assert(*slot + 257 < YR_AC_MAX_TRANSITION_TABLE_SIZE); |
442 | | |
443 | 353k | if (*slot > automaton->tables_size - 257) |
444 | 2.73k | { |
445 | 2.73k | FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory( |
446 | 2.73k | arena, YR_AC_TRANSITION_TABLE, 257 * sizeof(YR_AC_TRANSITION), NULL)); |
447 | | |
448 | 2.73k | FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory( |
449 | 2.73k | arena, YR_AC_STATE_MATCHES_TABLE, 257 * sizeof(uint8_t*), NULL)); |
450 | | |
451 | 2.73k | size_t bm_len = YR_BITMASK_SIZE(automaton->tables_size) * |
452 | 2.73k | sizeof(YR_BITMASK); |
453 | | |
454 | 2.73k | size_t bm_len_incr = YR_BITMASK_SIZE(257) * sizeof(YR_BITMASK); |
455 | | |
456 | 2.73k | automaton->bitmask = yr_realloc(automaton->bitmask, bm_len + bm_len_incr); |
457 | | |
458 | 2.73k | if (automaton->bitmask == NULL) |
459 | 0 | return ERROR_INSUFFICIENT_MEMORY; |
460 | | |
461 | 2.73k | memset((uint8_t*) automaton->bitmask + bm_len, 0, bm_len_incr); |
462 | | |
463 | 2.73k | automaton->tables_size += 257; |
464 | 2.73k | } |
465 | | |
466 | 353k | return ERROR_SUCCESS; |
467 | 353k | } |
468 | | |
469 | | //////////////////////////////////////////////////////////////////////////////// |
470 | | // Builds the transition table for the automaton. The transition table (T) is a |
471 | | // large array of 32-bits integers. Each state in the automaton is represented |
472 | | // by an index S within the array. The integer stored in T[S] is the failure |
473 | | // link for state S, it contains the index of the next state when no valid |
474 | | // transition exists for the next input byte. |
475 | | // |
476 | | // At position T[S+1+B] (where B is a byte) we can find the transition (if any) |
477 | | // that must be followed from state S if the next input is B. The value in |
478 | | // T[S+1+B] contains the index for next state or zero. A zero value means that |
479 | | // no valid transition exists from state S when next input is B, and the failure |
480 | | // link must be used instead. |
481 | | // |
482 | | // The transition table for state S starts at T[S] and spans the next 257 |
483 | | // slots in the array (1 for the failure link and 256 for all the possible |
484 | | // transitions). But many of those slots are for invalid transitions, so |
485 | | // the transitions for multiple states can be interleaved as long as they don't |
486 | | // collide. For example, instead of having this transition table with state S1 |
487 | | // and S2 separated by a large number of slots: |
488 | | // |
489 | | // S1 S2 |
490 | | // +------+------+------+------+-- ~ --+------+------+------+-- ~ --+ |
491 | | // | FLS1 | X | - | - | - | Y | FLS2 | Z | - | |
492 | | // +------+------+------+------+-- ~ --+------+------+------+-- ~ --+ |
493 | | // |
494 | | // We can interleave the transitions for states S1 and S2 and get this other |
495 | | // transition table, which is more compact: |
496 | | // |
497 | | // S1 S2 |
498 | | // +------+------+------+------+-- ~ --+------+ |
499 | | // | FLS1 | X | FLS2 | Z | - | Y | |
500 | | // +------+------+------+------+-- ~ --+------+ |
501 | | // |
502 | | // And how do we know that transition Z belongs to state S2 and not S1? Or that |
503 | | // transition Y belongs to S1 and not S2? Because each slot of the array not |
504 | | // only contains the index for the state where the transition points to, it |
505 | | // also contains the offset of the transition relative to its owner state. So, |
506 | | // the value for the owner offset would be 1 for transitions X, because X |
507 | | // belongs to state S1 and it's located 1 position away from S1. The same occurs |
508 | | // for Z, it belongs to S2 and it's located one position away from S2 so its |
509 | | // owner offset is 1. If we are in S1 and next byte is 2, we are going to read |
510 | | // the transition at T[S1+1+2] which is Z. But we know that transition Z is not |
511 | | // a valid transition for state S1 because the owner offset for Z is 1 not 3. |
512 | | // |
513 | | // Each 32-bit slot in the transition table has 23 bits for storing the index |
514 | | // of the target state and 9 bits for storing the offset of the slot relative |
515 | | // to its own state. The offset can be any value from 0 to 256, both inclusive, |
516 | | // hence 9 bits are required for it. The layout for the slot goes like: |
517 | | // |
518 | | // 32 23 0 |
519 | | // +-----------------------+---------+ |
520 | | // | Target state's index | Offset | |
521 | | // +-----------------------+---------+ |
522 | | // |
523 | | // A more detailed description can be found in: http://goo.gl/lE6zG |
524 | | // |
525 | | static int _yr_ac_build_transition_table(YR_AC_AUTOMATON* automaton) |
526 | 125 | { |
527 | 125 | YR_AC_TRANSITION* t_table; |
528 | 125 | uint32_t* m_table; |
529 | 125 | YR_AC_STATE* state; |
530 | 125 | YR_AC_STATE* child_state; |
531 | 125 | YR_AC_STATE* root_state = automaton->root; |
532 | | |
533 | 125 | uint32_t slot; |
534 | | |
535 | 125 | QUEUE queue = {NULL, NULL}; |
536 | | |
537 | | // Both t_table and m_table have 512 slots initially, which is enough for the |
538 | | // root node's transition table. |
539 | 125 | automaton->tables_size = 512; |
540 | | |
541 | 125 | automaton->bitmask = yr_calloc( |
542 | 125 | YR_BITMASK_SIZE(automaton->tables_size), sizeof(YR_BITMASK)); |
543 | | |
544 | 125 | if (automaton->bitmask == NULL) |
545 | 0 | return ERROR_INSUFFICIENT_MEMORY; |
546 | | |
547 | 125 | FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory( |
548 | 125 | automaton->arena, |
549 | 125 | YR_AC_TRANSITION_TABLE, |
550 | 125 | automaton->tables_size * sizeof(YR_AC_TRANSITION), |
551 | 125 | NULL)); |
552 | | |
553 | 125 | FAIL_ON_ERROR(yr_arena_allocate_zeroed_memory( |
554 | 125 | automaton->arena, |
555 | 125 | YR_AC_STATE_MATCHES_TABLE, |
556 | 125 | automaton->tables_size * sizeof(uint32_t), |
557 | 125 | NULL)); |
558 | | |
559 | 125 | t_table = yr_arena_get_ptr(automaton->arena, YR_AC_TRANSITION_TABLE, 0); |
560 | 125 | m_table = yr_arena_get_ptr(automaton->arena, YR_AC_STATE_MATCHES_TABLE, 0); |
561 | | |
562 | | // The failure link for the root node points to itself. |
563 | 125 | t_table[0] = YR_AC_MAKE_TRANSITION(0, 0); |
564 | | |
565 | | // Initialize the entry corresponding to the root node in the match table. |
566 | | // Entries in this table are the index within YR_AC_MATCH_POOL where resides |
567 | | // the YR_AC_MATCH structure that corresponds to the head of the matches list |
568 | | // for the node. The indexes start counting at 1, the zero is used for |
569 | | // indicating that the node has no associated matches. |
570 | 125 | if (!YR_ARENA_IS_NULL_REF(root_state->matches_ref)) |
571 | 12 | m_table[0] = root_state->matches_ref.offset / sizeof(YR_AC_MATCH) + 1; |
572 | | |
573 | | // Mark the first slot in the transition table as used. |
574 | 125 | yr_bitmask_set(automaton->bitmask, 0); |
575 | | |
576 | | // Index 0 is for root node. Unused indexes start at 1. |
577 | 125 | automaton->t_table_unused_candidate = 1; |
578 | | |
579 | 125 | child_state = root_state->first_child; |
580 | | |
581 | 1.96k | while (child_state != NULL) |
582 | 1.83k | { |
583 | | // Each state stores its slot number. |
584 | 1.83k | child_state->t_table_slot = child_state->input + 1; |
585 | | |
586 | 1.83k | t_table[child_state->input + 1] = YR_AC_MAKE_TRANSITION( |
587 | 1.83k | 0, child_state->input + 1); |
588 | | |
589 | 1.83k | yr_bitmask_set(automaton->bitmask, child_state->input + 1); |
590 | | |
591 | 1.83k | FAIL_ON_ERROR(_yr_ac_queue_push(&queue, child_state)); |
592 | 1.83k | child_state = child_state->siblings; |
593 | 1.83k | } |
594 | | |
595 | 353k | while (!_yr_ac_queue_is_empty(&queue)) |
596 | 353k | { |
597 | 353k | state = _yr_ac_queue_pop(&queue); |
598 | | |
599 | 353k | FAIL_ON_ERROR(_yr_ac_find_suitable_transition_table_slot( |
600 | 353k | automaton, automaton->arena, state, &slot)); |
601 | | |
602 | | // _yr_ac_find_suitable_transition_table_slot can allocate more space in |
603 | | // both tables and cause the tables to be moved to a different memory |
604 | | // location, we must get their up-to-date addresses. |
605 | 353k | t_table = yr_arena_get_ptr(automaton->arena, YR_AC_TRANSITION_TABLE, 0); |
606 | 353k | m_table = yr_arena_get_ptr(automaton->arena, YR_AC_STATE_MATCHES_TABLE, 0); |
607 | | |
608 | 353k | t_table[state->t_table_slot] |= (slot << YR_AC_SLOT_OFFSET_BITS); |
609 | 353k | t_table[slot] = YR_AC_MAKE_TRANSITION(state->failure->t_table_slot, 0); |
610 | | |
611 | | // The match table is an array of indexes within YR_AC_MATCHES_POOL. The |
612 | | // N-th item in the array is the index for the YR_AC_MATCH structure that |
613 | | // represents the head of the matches list for state N. The indexes start |
614 | | // at 1, the 0 indicates that there are no matches for the state. |
615 | 353k | if (YR_ARENA_IS_NULL_REF(state->matches_ref)) |
616 | 100k | m_table[slot] = 0; |
617 | 253k | else |
618 | 253k | m_table[slot] = state->matches_ref.offset / sizeof(YR_AC_MATCH) + 1; |
619 | | |
620 | 353k | state->t_table_slot = slot; |
621 | | |
622 | 353k | yr_bitmask_set(automaton->bitmask, slot); |
623 | | |
624 | | // Push children of current_state |
625 | 353k | child_state = state->first_child; |
626 | | |
627 | 705k | while (child_state != NULL) |
628 | 351k | { |
629 | 351k | child_state->t_table_slot = slot + child_state->input + 1; |
630 | | |
631 | 351k | t_table[child_state->t_table_slot] = YR_AC_MAKE_TRANSITION( |
632 | 351k | 0, child_state->input + 1); |
633 | | |
634 | 351k | yr_bitmask_set(automaton->bitmask, child_state->t_table_slot); |
635 | | |
636 | 351k | FAIL_ON_ERROR(_yr_ac_queue_push(&queue, child_state)); |
637 | | |
638 | 351k | child_state = child_state->siblings; |
639 | 351k | } |
640 | 353k | } |
641 | | |
642 | 125 | return ERROR_SUCCESS; |
643 | 125 | } |
644 | | |
645 | | //////////////////////////////////////////////////////////////////////////////// |
646 | | // Prints automaton state for debug purposes. This function is invoked by |
647 | | // yr_ac_print_automaton, is not intended to be used stand-alone. |
648 | | // |
649 | | static void _yr_ac_print_automaton_state( |
650 | | YR_AC_AUTOMATON* automaton, |
651 | | YR_AC_STATE* state) |
652 | 0 | { |
653 | 0 | int child_count; |
654 | |
|
655 | 0 | YR_AC_MATCH* match; |
656 | 0 | YR_AC_STATE* child_state; |
657 | |
|
658 | 0 | for (int i = 0; i < state->depth; i++) printf(" "); |
659 | |
|
660 | 0 | child_state = state->first_child; |
661 | 0 | child_count = 0; |
662 | |
|
663 | 0 | while (child_state != NULL) |
664 | 0 | { |
665 | 0 | child_count++; |
666 | 0 | child_state = child_state->siblings; |
667 | 0 | } |
668 | |
|
669 | 0 | printf( |
670 | 0 | "%p childs:%d depth:%d failure:%p", |
671 | 0 | state, |
672 | 0 | child_count, |
673 | 0 | state->depth, |
674 | 0 | state->failure); |
675 | |
|
676 | 0 | match = yr_arena_ref_to_ptr(automaton->arena, &state->matches_ref); |
677 | |
|
678 | 0 | while (match != NULL) |
679 | 0 | { |
680 | 0 | printf("\n"); |
681 | |
|
682 | 0 | for (int i = 0; i < state->depth + 1; i++) printf(" "); |
683 | |
|
684 | 0 | printf("%s = ", match->string->identifier); |
685 | |
|
686 | 0 | if (STRING_IS_HEX(match->string)) |
687 | 0 | { |
688 | 0 | printf("{ "); |
689 | |
|
690 | 0 | for (int i = 0; i < yr_min(match->string->length, 10); i++) |
691 | 0 | printf("%02x ", match->string->string[i]); |
692 | |
|
693 | 0 | printf("}"); |
694 | 0 | } |
695 | 0 | else if (STRING_IS_REGEXP(match->string)) |
696 | 0 | { |
697 | 0 | printf("/"); |
698 | |
|
699 | 0 | for (int i = 0; i < yr_min(match->string->length, 10); i++) |
700 | 0 | printf("%c", match->string->string[i]); |
701 | |
|
702 | 0 | printf("/"); |
703 | 0 | } |
704 | 0 | else |
705 | 0 | { |
706 | 0 | printf("\""); |
707 | |
|
708 | 0 | for (int i = 0; i < yr_min(match->string->length, 10); i++) |
709 | 0 | printf("%c", match->string->string[i]); |
710 | |
|
711 | 0 | printf("\""); |
712 | 0 | } |
713 | |
|
714 | 0 | match = match->next; |
715 | 0 | } |
716 | |
|
717 | 0 | printf("\n"); |
718 | |
|
719 | 0 | child_state = state->first_child; |
720 | |
|
721 | 0 | while (child_state != NULL) |
722 | 0 | { |
723 | 0 | _yr_ac_print_automaton_state(automaton, child_state); |
724 | 0 | child_state = child_state->siblings; |
725 | 0 | } |
726 | 0 | } |
727 | | |
728 | | //////////////////////////////////////////////////////////////////////////////// |
729 | | // Creates a new automaton |
730 | | // |
731 | | int yr_ac_automaton_create(YR_ARENA* arena, YR_AC_AUTOMATON** automaton) |
732 | 1.88k | { |
733 | 1.88k | YR_AC_AUTOMATON* new_automaton; |
734 | 1.88k | YR_AC_STATE* root_state; |
735 | | |
736 | 1.88k | new_automaton = (YR_AC_AUTOMATON*) yr_malloc(sizeof(YR_AC_AUTOMATON)); |
737 | 1.88k | root_state = (YR_AC_STATE*) yr_malloc(sizeof(YR_AC_STATE)); |
738 | | |
739 | 1.88k | if (new_automaton == NULL || root_state == NULL) |
740 | 0 | { |
741 | 0 | yr_free(new_automaton); |
742 | 0 | yr_free(root_state); |
743 | |
|
744 | 0 | return ERROR_INSUFFICIENT_MEMORY; |
745 | 0 | } |
746 | | |
747 | 1.88k | root_state->depth = 0; |
748 | 1.88k | root_state->matches_ref = YR_ARENA_NULL_REF; |
749 | 1.88k | root_state->failure = NULL; |
750 | 1.88k | root_state->first_child = NULL; |
751 | 1.88k | root_state->siblings = NULL; |
752 | 1.88k | root_state->t_table_slot = 0; |
753 | | |
754 | 1.88k | new_automaton->arena = arena; |
755 | 1.88k | new_automaton->root = root_state; |
756 | 1.88k | new_automaton->bitmask = NULL; |
757 | 1.88k | new_automaton->tables_size = 0; |
758 | | |
759 | 1.88k | *automaton = new_automaton; |
760 | | |
761 | 1.88k | return ERROR_SUCCESS; |
762 | 1.88k | } |
763 | | |
764 | | //////////////////////////////////////////////////////////////////////////////// |
765 | | // Destroys automaton |
766 | | // |
767 | | int yr_ac_automaton_destroy(YR_AC_AUTOMATON* automaton) |
768 | 1.88k | { |
769 | 1.88k | _yr_ac_state_destroy(automaton->root); |
770 | | |
771 | 1.88k | yr_free(automaton->bitmask); |
772 | 1.88k | yr_free(automaton); |
773 | | |
774 | 1.88k | return ERROR_SUCCESS; |
775 | 1.88k | } |
776 | | |
777 | | //////////////////////////////////////////////////////////////////////////////// |
778 | | // Adds a string to the automaton. This function is invoked once for each |
779 | | // string defined in the rules. |
780 | | // |
781 | | int yr_ac_add_string( |
782 | | YR_AC_AUTOMATON* automaton, |
783 | | YR_STRING* string, |
784 | | uint32_t string_idx, |
785 | | YR_ATOM_LIST_ITEM* atom, |
786 | | YR_ARENA* arena) |
787 | 33.6k | { |
788 | 3.61M | while (atom != NULL) |
789 | 3.58M | { |
790 | 3.58M | YR_AC_STATE* state = automaton->root; |
791 | | |
792 | 17.5M | for (int i = 0; i < atom->atom.length; i++) |
793 | 13.9M | { |
794 | 13.9M | YR_AC_STATE* next_state = _yr_ac_next_state(state, atom->atom.bytes[i]); |
795 | | |
796 | 13.9M | if (next_state == NULL) |
797 | 627k | { |
798 | 627k | next_state = _yr_ac_state_create(state, atom->atom.bytes[i]); |
799 | | |
800 | 627k | if (next_state == NULL) |
801 | 0 | return ERROR_INSUFFICIENT_MEMORY; |
802 | 627k | } |
803 | | |
804 | 13.9M | state = next_state; |
805 | 13.9M | } |
806 | | |
807 | 3.58M | YR_ARENA_REF new_match_ref; |
808 | | |
809 | 3.58M | FAIL_ON_ERROR(yr_arena_allocate_struct( |
810 | 3.58M | arena, |
811 | 3.58M | YR_AC_STATE_MATCHES_POOL, |
812 | 3.58M | sizeof(YR_AC_MATCH), |
813 | 3.58M | &new_match_ref, |
814 | 3.58M | offsetof(YR_AC_MATCH, string), |
815 | 3.58M | offsetof(YR_AC_MATCH, forward_code), |
816 | 3.58M | offsetof(YR_AC_MATCH, backward_code), |
817 | 3.58M | offsetof(YR_AC_MATCH, next), |
818 | 3.58M | EOL)); |
819 | | |
820 | 3.58M | YR_AC_MATCH* new_match = yr_arena_ref_to_ptr(arena, &new_match_ref); |
821 | | |
822 | 3.58M | new_match->backtrack = state->depth + atom->backtrack; |
823 | 3.58M | new_match->string = yr_arena_get_ptr( |
824 | 3.58M | arena, YR_STRINGS_TABLE, string_idx * sizeof(struct YR_STRING)); |
825 | | |
826 | 3.58M | new_match->forward_code = yr_arena_ref_to_ptr( |
827 | 3.58M | arena, &atom->forward_code_ref); |
828 | | |
829 | 3.58M | new_match->backward_code = yr_arena_ref_to_ptr( |
830 | 3.58M | arena, &atom->backward_code_ref); |
831 | | |
832 | | // Add newly created match to the list of matches for the state. |
833 | 3.58M | new_match->next = yr_arena_ref_to_ptr(arena, &state->matches_ref); |
834 | 3.58M | state->matches_ref = new_match_ref; |
835 | | |
836 | 3.58M | atom = atom->next; |
837 | 3.58M | } |
838 | | |
839 | 33.6k | return ERROR_SUCCESS; |
840 | 33.6k | } |
841 | | |
842 | | //////////////////////////////////////////////////////////////////////////////// |
843 | | // Compiles the Aho-Corasick automaton, the resulting data structures are |
844 | | // are written in the provided arena. |
845 | | // |
846 | | int yr_ac_compile(YR_AC_AUTOMATON* automaton, YR_ARENA* arena) |
847 | 125 | { |
848 | 125 | FAIL_ON_ERROR(_yr_ac_create_failure_links(automaton)); |
849 | 125 | FAIL_ON_ERROR(_yr_ac_optimize_failure_links(automaton)); |
850 | 125 | FAIL_ON_ERROR(_yr_ac_build_transition_table(automaton)); |
851 | | |
852 | 125 | return ERROR_SUCCESS; |
853 | 125 | } |
854 | | |
855 | | //////////////////////////////////////////////////////////////////////////////// |
856 | | // Prints automaton for debug purposes. |
857 | | // |
858 | | void yr_ac_print_automaton(YR_AC_AUTOMATON* automaton) |
859 | 0 | { |
860 | 0 | printf("-------------------------------------------------------\n"); |
861 | 0 | _yr_ac_print_automaton_state(automaton, automaton->root); |
862 | 0 | printf("-------------------------------------------------------\n"); |
863 | 0 | } |