Coverage Report

Created: 2026-02-14 06:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/yara/libyara/atoms.c
Line
Count
Source
1
/*
2
Copyright (c) 2013-2018. The YARA Authors. All Rights Reserved.
3
4
Redistribution and use in source and binary forms, with or without modification,
5
are permitted provided that the following conditions are met:
6
7
1. Redistributions of source code must retain the above copyright notice, this
8
list of conditions and the following disclaimer.
9
10
2. Redistributions in binary form must reproduce the above copyright notice,
11
this list of conditions and the following disclaimer in the documentation and/or
12
other materials provided with the distribution.
13
14
3. Neither the name of the copyright holder nor the names of its contributors
15
may be used to endorse or promote products derived from this software without
16
specific prior written permission.
17
18
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
*/
29
30
/*
31
32
This module handles atom extraction from regexps and hex strings. Atoms are
33
undivided substrings found in a regexps and hex strings. Let's consider this
34
hex string:
35
36
{ 01 02 03 04 05 ?? 06 07 08 [1-2] 09 0A }
37
38
In the above string, byte sequences 0102030405, 060708 and 090A are atoms.
39
Similarly, in this regexp:
40
41
/abc.*ed[0-9]+fgh/
42
43
The strings "abc", "ed" and "fgh" are atoms.
44
45
When searching for regexps/hex strings matching a file, YARA uses these
46
atoms to find locations inside the file where the regexp/hex string could
47
match. If the atom "abc" is found somewhere inside the file, there is a chance
48
for /abc.*ed[0-9]+fgh/ to match the file, if "abc" doesn't appear in the file
49
there's no chance for the regexp to match. When the atom is found in the file
50
YARA proceeds to fully evaluate the regexp/hex string to determine if it's
51
actually a match.
52
53
For each regexp/hex string YARA extracts one or more atoms. Sometimes a
54
single atom is enough (like in the previous example "abc" is enough for finding
55
/abc.*ed[0-9]+fgh/), but sometimes a single atom isn't enough like in the
56
regexp /(abc|efg)/. In this case YARA must search for both "abc" AND "efg" and
57
fully evaluate the regexp whenever one of these atoms is found.
58
59
In the regexp /Look(at|into)this/ YARA can search for "Look", or search for
60
"this", or search for both "at" and "into". This is what we call an atoms tree,
61
because it can be represented by the following tree structure:
62
63
-OR
64
  |- "Look"
65
  |
66
  |- AND
67
  |   |
68
  |   |- "at"
69
  |    - "into"
70
  |
71
   - "this"
72
73
From an atom tree YARA chooses the best combination, trying to minimize the
74
number of required atoms, but also using high quality atoms (long atoms with
75
not too many zeroes and a bit of byte diversity). In the previous example YARA
76
will end up using the "Look" atom alone, but in /a(bcd|efg)h/ atoms "bcd" and
77
"efg" will be used because "a" and "h" are too short.
78
*/
79
80
#include <assert.h>
81
#include <string.h>
82
#include <yara/atoms.h>
83
#include <yara/error.h>
84
#include <yara/globals.h>
85
#include <yara/limits.h>
86
#include <yara/mem.h>
87
#include <yara/stack.h>
88
#include <yara/types.h>
89
#include <yara/utils.h>
90
91
////////////////////////////////////////////////////////////////////////////////
92
// Returns a numeric value indicating the quality of an atom. The quality
93
// depends on some characteristics of the atom, including its length, number
94
// of very common bytes like 00 and FF and number of unique distinct bytes.
95
// Atom 00 00 has a very low quality, because it's only two bytes long and
96
// both bytes are zeroes. Atom 01 01 01 01 is better but still not optimal,
97
// because the same byte is repeated. Atom 01 02 03 04 is an optimal one.
98
//
99
// Args:
100
//    config: Pointer to YR_ATOMS_CONFIG struct.
101
//    atom: Pointer to YR_ATOM struct.
102
//
103
// Returns:
104
//    An integer indicating the atom's quality
105
//
106
int yr_atoms_heuristic_quality(YR_ATOMS_CONFIG* config, YR_ATOM* atom)
107
4.24M
{
108
4.24M
  YR_BITMASK seen_bytes[YR_BITMASK_SIZE(256)];
109
110
4.24M
  int quality = 0;
111
4.24M
  int unique_bytes = 0;
112
113
4.24M
  assert(atom->length <= YR_MAX_ATOM_LENGTH);
114
115
4.24M
  yr_bitmask_clear_all(seen_bytes);
116
117
  // Each byte in the atom contributes a certain amount of points to the
118
  // quality. Bytes [a-zA-Z] contribute 18 points each, common bytes like
119
  // 0x00, 0x20 and 0xFF contribute only 12 points, and the rest of the
120
  // bytes contribute 20 points. The ?? mask substracts 10 points, and masks
121
  // X? and ?X contribute 4 points. An additional boost consisting in 2x the
122
  // number of unique bytes in the atom is added to the quality. This are
123
  // some examples of the quality of atoms:
124
  //
125
  //   01 02 03 04   quality = 20 + 20 + 20 + 20 + 8 = 88
126
  //   01 ?? 03 04   quality = 20 - 10 + 20 + 20 + 6 = 56
127
  //   01 0? 03      quality = 20 +  4 + 20      + 4 = 48
128
  //   01 02         quality = 20 + 20           + 4 = 44
129
  //   01 ?? ?3 04   quality = 20 - 10 +  2 + 20 + 4 = 36
130
  //   61 62         quality = 18 + 18           + 4 = 40
131
  //   61 61         quality = 18 + 18           + 2 = 38  <- warning threshold
132
  //   00 01         quality = 12 + 20           + 4 = 36
133
  //   01 ?? 02      quality = 20 - 10 + 20      + 4 = 34
134
  //   01            quality = 20                + 1 = 21
135
136
20.6M
  for (int i = 0; i < atom->length; i++)
137
16.4M
  {
138
16.4M
    switch (atom->mask[i])
139
16.4M
    {
140
5.93k
    case 0x00:
141
5.93k
      quality -= 10;
142
5.93k
      break;
143
553
    case 0x0F:
144
553
      quality += 4;
145
553
      break;
146
1.34k
    case 0xF0:
147
1.34k
      quality += 4;
148
1.34k
      break;
149
16.4M
    case 0xFF:
150
16.4M
      switch (atom->bytes[i])
151
16.4M
      {
152
1.63M
      case 0x00:
153
1.89M
      case 0x20:
154
1.89M
      case 0xCC:
155
4.44M
      case 0xFF:
156
        // Common bytes contribute less to the quality than the rest.
157
4.44M
        quality += 12;
158
4.44M
        break;
159
11.9M
      default:
160
        // Bytes in the a-z and A-Z ranges have a slightly lower quality
161
        // than the rest. We want to favor atoms that contain bytes outside
162
        // those ranges because they generate less additional atoms during
163
        // calls to _yr_atoms_case_combinations.
164
11.9M
        if (yr_lowercase[atom->bytes[i]] >= 'a' &&
165
9.89M
            yr_lowercase[atom->bytes[i]] <= 'z')
166
6.51M
          quality += 18;
167
5.44M
        else
168
5.44M
          quality += 20;
169
16.4M
      }
170
171
16.4M
      if (!yr_bitmask_is_set(seen_bytes, atom->bytes[i]))
172
11.6M
      {
173
11.6M
        yr_bitmask_set(seen_bytes, atom->bytes[i]);
174
11.6M
        unique_bytes++;
175
11.6M
      }
176
16.4M
    }
177
16.4M
  }
178
179
  // If all the bytes in the atom are equal and very common, let's penalize
180
  // it heavily.
181
4.24M
  if (unique_bytes == 1 && (yr_bitmask_is_set(seen_bytes, 0x00) ||
182
1.01M
                            yr_bitmask_is_set(seen_bytes, 0x20) ||
183
1.01M
                            yr_bitmask_is_set(seen_bytes, 0x90) ||
184
1.01M
                            yr_bitmask_is_set(seen_bytes, 0xCC) ||
185
1.01M
                            yr_bitmask_is_set(seen_bytes, 0xFF)))
186
487k
  {
187
487k
    quality -= 10 * atom->length;
188
487k
  }
189
  // In general atoms with more unique bytes have a better quality, so let's
190
  // boost the quality in the amount of unique bytes.
191
3.75M
  else
192
3.75M
  {
193
3.75M
    quality += 2 * unique_bytes;
194
3.75M
  }
195
196
  // The final quality is not zero-based, we start at YR_MAX_ATOM_QUALITY
197
  // for the best possible atom and substract from there. The best possible
198
  // quality is 21 * YR_MAX_ATOM_LENGTH (20 points per byte + 2 additional point
199
  // per unique byte, with a maximum of 2*YR_MAX_ATOM_LENGTH unique bytes).
200
201
4.24M
  return YR_MAX_ATOM_QUALITY - 22 * YR_MAX_ATOM_LENGTH + quality;
202
4.24M
}
203
204
////////////////////////////////////////////////////////////////////////////////
205
// Compares the byte sequence in a1 with the YR_ATOM in a2, taking atom's mask
206
// into account.
207
//
208
// Returns:
209
//   < 0 if the first byte that does not match has a lower value in a1 than
210
//       in a2.
211
//   > 0 if the first byte that does not match has a greater value in a1 than
212
//       in a2.
213
//   = 0 if a1 is equal or matches a2.
214
//
215
static int _yr_atoms_cmp(const uint8_t* a1, YR_ATOM* a2)
216
0
{
217
0
  int result = 0;
218
0
  int i = 0;
219
220
0
  while (result == 0 && i < a2->length)
221
0
  {
222
0
    switch (a2->mask[i])
223
0
    {
224
0
    case 0xFF:
225
0
    case 0x0F:
226
0
    case 0xF0:
227
0
    case 0x00:
228
0
      result = (a1[i] & a2->mask[i]) - a2->bytes[i];
229
0
      break;
230
0
    default:
231
0
      assert(false);
232
0
    }
233
234
0
    i++;
235
0
  }
236
237
0
  return result;
238
0
}
239
240
////////////////////////////////////////////////////////////////////////////////
241
// Returns a numeric value indicating the quality of an atom. The quality is
242
// based in the atom quality table passed in "config". Very common atoms
243
// (i.e: those with greater quality) have lower quality than those that are
244
// uncommon. See the comment for yr_compiler_set_atom_quality_table for
245
// details about the quality table's format.
246
//
247
// Args:
248
//    YR_ATOMS_CONFIG* config   - Pointer to YR_ATOMS_CONFIG struct.
249
//    YR_ATOM* atom             - Pointer to YR_ATOM struct.
250
//
251
// Returns:
252
//    An integer indicating the atom's quality
253
//
254
int yr_atoms_table_quality(YR_ATOMS_CONFIG* config, YR_ATOM* atom)
255
0
{
256
0
  YR_ATOM_QUALITY_TABLE_ENTRY* table = config->quality_table;
257
258
0
  int begin = 0;
259
0
  int end = config->quality_table_entries;
260
261
0
  assert(atom->length <= YR_MAX_ATOM_LENGTH);
262
263
0
  while (end > begin)
264
0
  {
265
0
    int middle = begin + (end - begin) / 2;
266
0
    int c = _yr_atoms_cmp(table[middle].atom, atom);
267
268
0
    if (c < 0)
269
0
    {
270
0
      begin = middle + 1;
271
0
    }
272
0
    else if (c > 0)
273
0
    {
274
0
      end = middle;
275
0
    }
276
0
    else
277
0
    {
278
0
      int i = middle + 1;
279
0
      int quality = table[middle].quality;
280
0
      int min_quality = quality;
281
282
0
      while (i < end && _yr_atoms_cmp(table[i].atom, atom) == 0)
283
0
      {
284
0
        if (min_quality > table[i].quality)
285
0
          min_quality = table[i].quality;
286
287
0
        i++;
288
0
      }
289
290
0
      i = middle - 1;
291
292
0
      while (i >= begin && _yr_atoms_cmp(table[i].atom, atom) == 0)
293
0
      {
294
0
        if (min_quality > table[i].quality)
295
0
          min_quality = table[i].quality;
296
297
0
        i--;
298
0
      }
299
300
0
      return min_quality >> (YR_MAX_ATOM_LENGTH - atom->length);
301
0
    }
302
0
  }
303
304
0
  return YR_MAX_ATOM_QUALITY;
305
0
}
306
307
////////////////////////////////////////////////////////////////////////////////
308
// Returns the quality for the worst quality atom in a list.
309
//
310
int yr_atoms_min_quality(YR_ATOMS_CONFIG* config, YR_ATOM_LIST_ITEM* atom_list)
311
0
{
312
0
  YR_ATOM_LIST_ITEM* atom;
313
314
0
  int quality;
315
0
  int min_quality = YR_MAX_ATOM_QUALITY;
316
317
0
  if (atom_list == NULL)
318
0
    return YR_MIN_ATOM_QUALITY;
319
320
0
  atom = atom_list;
321
322
0
  while (atom != NULL)
323
0
  {
324
0
    quality = config->get_atom_quality(config, &atom->atom);
325
326
0
    if (quality < min_quality)
327
0
      min_quality = quality;
328
329
0
    atom = atom->next;
330
0
  }
331
332
0
  return min_quality;
333
0
}
334
335
////////////////////////////////////////////////////////////////////////////////
336
// Creates a new node for an atoms tree.
337
//
338
static YR_ATOM_TREE_NODE* _yr_atoms_tree_node_create(uint8_t type)
339
348k
{
340
348k
  YR_ATOM_TREE_NODE* new_node = (YR_ATOM_TREE_NODE*) yr_malloc(
341
348k
      sizeof(YR_ATOM_TREE_NODE));
342
343
348k
  if (new_node != NULL)
344
348k
  {
345
348k
    new_node->type = type;
346
348k
    new_node->atom.length = 0;
347
348k
    new_node->next_sibling = NULL;
348
348k
    new_node->children_head = NULL;
349
348k
    new_node->children_tail = NULL;
350
348k
  }
351
352
348k
  return new_node;
353
348k
}
354
355
////////////////////////////////////////////////////////////////////////////////
356
// Destroys a node from an atoms tree.
357
//
358
static void _yr_atoms_tree_node_destroy(YR_ATOM_TREE_NODE* node)
359
348k
{
360
348k
  YR_ATOM_TREE_NODE* child;
361
348k
  YR_ATOM_TREE_NODE* next_child;
362
363
348k
  if (node == NULL)
364
0
    return;
365
366
348k
  if (node->type == ATOM_TREE_OR || node->type == ATOM_TREE_AND)
367
35.9k
  {
368
35.9k
    child = node->children_head;
369
370
377k
    while (child != NULL)
371
341k
    {
372
341k
      next_child = child->next_sibling;
373
341k
      _yr_atoms_tree_node_destroy(child);
374
341k
      child = next_child;
375
341k
    }
376
35.9k
  }
377
378
348k
  yr_free(node);
379
348k
}
380
381
////////////////////////////////////////////////////////////////////////////////
382
// Appends a new child node to another atoms tree node.
383
//
384
static void _yr_atoms_tree_node_append(
385
    YR_ATOM_TREE_NODE* dest,
386
    YR_ATOM_TREE_NODE* node)
387
322k
{
388
322k
  if (dest->children_head == NULL)
389
24.5k
    dest->children_head = node;
390
391
322k
  if (dest->children_tail != NULL)
392
297k
    dest->children_tail->next_sibling = node;
393
394
322k
  dest->children_tail = node;
395
322k
}
396
397
////////////////////////////////////////////////////////////////////////////////
398
// Destroys an atoms tree.
399
//
400
static void _yr_atoms_tree_destroy(YR_ATOM_TREE* atom_tree)
401
6.62k
{
402
6.62k
  _yr_atoms_tree_node_destroy(atom_tree->root_node);
403
6.62k
  yr_free(atom_tree);
404
6.62k
}
405
406
////////////////////////////////////////////////////////////////////////////////
407
// Destroys an atoms list.
408
//
409
void yr_atoms_list_destroy(YR_ATOM_LIST_ITEM* list_head)
410
58.3k
{
411
58.3k
  YR_ATOM_LIST_ITEM* item = list_head;
412
58.3k
  YR_ATOM_LIST_ITEM* next;
413
414
3.20M
  while (item != NULL)
415
3.14M
  {
416
3.14M
    next = item->next;
417
3.14M
    yr_free(item);
418
3.14M
    item = next;
419
3.14M
  }
420
58.3k
}
421
422
////////////////////////////////////////////////////////////////////////////////
423
// Concats two atoms lists.
424
//
425
static YR_ATOM_LIST_ITEM* _yr_atoms_list_concat(
426
    YR_ATOM_LIST_ITEM* list1,
427
    YR_ATOM_LIST_ITEM* list2)
428
9.00k
{
429
9.00k
  YR_ATOM_LIST_ITEM* item;
430
431
9.00k
  if (list1 == NULL)
432
825
    return list2;
433
434
8.18k
  item = list1;
435
436
386k
  while (item->next != NULL)
437
378k
  {
438
378k
    item = item->next;
439
378k
  }
440
441
8.18k
  item->next = list2;
442
8.18k
  return list1;
443
9.00k
}
444
445
////////////////////////////////////////////////////////////////////////////////
446
// If the atom starts or ends with an unknown byte (mask == 0x00), trim
447
// those bytes out of the atom. We don't want to expand an atom like
448
// { ?? 01 02 } into { 00 01 02 }, { 01 01 02}, { 02 01 02} .. { FF 01 02}
449
// in those cases it's better to simply have a shorter atom { 01 02 }.
450
//
451
// Args:
452
//   atom: Pointer to the YR_ATOM to be trimmed.
453
//
454
// Returns:
455
//   The number of bytes that were trimmed from the beginning of the atom.
456
//
457
int _yr_atoms_trim(YR_ATOM* atom)
458
3.96M
{
459
3.96M
  int mask_00 = 0;
460
3.96M
  int mask_ff = 0;
461
462
3.96M
  int trim_left = 0;
463
464
4.00M
  while (trim_left < atom->length && atom->mask[trim_left] == 0) trim_left++;
465
466
3.96M
  while (atom->length > trim_left && atom->mask[atom->length - 1] == 0)
467
1.67k
    atom->length--;
468
469
3.96M
  atom->length -= trim_left;
470
471
3.96M
  if (atom->length == 0)
472
41.8k
    return 0;
473
474
  // The trimmed atom goes from trim_left to trim_left + atom->length and the
475
  // first and last byte in the atom are known (mask == 0xFF). Now count the
476
  // number of known and unknown bytes in the atom (mask == 0xFF and
477
  // mask == 0x00 respectively).
478
479
19.4M
  for (int i = 0; i < atom->length; i++)
480
15.5M
  {
481
15.5M
    if (atom->mask[trim_left + i] == 0xFF)
482
15.5M
      mask_ff++;
483
9.57k
    else if (atom->mask[trim_left + i] == 0x00)
484
5.95k
      mask_00++;
485
15.5M
  }
486
487
  // If the number of unknown bytes is >= than the number of known bytes
488
  // it doesn't make sense the to use this atom, so we use a single byte atom
489
  // containing the first known byte. If YR_MAX_ATOM_LENGTH == 4 this happens
490
  // only when the atom is like { XX ?? ?? YY }, so using the first known
491
  // byte is good enough. For larger values of YR_MAX_ATOM_LENGTH this is not
492
  // the most efficient solution, as better atoms could be choosen. For
493
  // example, in { XX ?? ?? ?? YY ZZ } the best atom is { YY ZZ } not { XX }.
494
  // But let's keep it like this for simplicity.
495
496
3.92M
  if (mask_00 >= mask_ff)
497
1.34k
    atom->length = 1;
498
499
3.92M
  if (trim_left == 0)
500
3.92M
    return 0;
501
502
  // Shift bytes and mask trim_left positions to the left.
503
504
7.46k
  for (int i = 0; i < YR_MAX_ATOM_LENGTH - trim_left; i++)
505
5.58k
  {
506
5.58k
    atom->bytes[i] = atom->bytes[trim_left + i];
507
5.58k
    atom->mask[i] = atom->mask[trim_left + i];
508
5.58k
  }
509
510
1.87k
  return trim_left;
511
3.92M
}
512
513
////////////////////////////////////////////////////////////////////////////////
514
// This function receives an atom tree and returns a list of atoms to be added
515
// to the Aho-Corasick automaton.
516
//
517
static int _yr_atoms_choose(
518
    YR_ATOMS_CONFIG* config,
519
    YR_ATOM_TREE_NODE* node,
520
    YR_ATOM_LIST_ITEM** chosen_atoms,
521
    int* atoms_quality)
522
52.5k
{
523
52.5k
  YR_ATOM_TREE_NODE* child;
524
52.5k
  YR_ATOM_LIST_ITEM* item;
525
52.5k
  YR_ATOM_LIST_ITEM* tail;
526
527
52.5k
  int shift, quality;
528
529
52.5k
  int max_quality = YR_MIN_ATOM_QUALITY;
530
52.5k
  int min_quality = YR_MAX_ATOM_QUALITY;
531
532
52.5k
  *chosen_atoms = NULL;
533
52.5k
  *atoms_quality = YR_MIN_ATOM_QUALITY;
534
535
52.5k
  switch (node->type)
536
52.5k
  {
537
16.5k
  case ATOM_TREE_LEAF:
538
539
16.5k
    item = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
540
541
16.5k
    if (item == NULL)
542
0
      return ERROR_INSUFFICIENT_MEMORY;
543
544
16.5k
    memcpy(&item->atom, &node->atom, sizeof(YR_ATOM));
545
546
16.5k
    shift = _yr_atoms_trim(&item->atom);
547
548
16.5k
    if (item->atom.length > 0)
549
15.7k
    {
550
15.7k
      item->forward_code_ref = node->re_nodes[shift]->forward_code_ref;
551
15.7k
      item->backward_code_ref = node->re_nodes[shift]->backward_code_ref;
552
15.7k
      item->backtrack = 0;
553
15.7k
      item->next = NULL;
554
555
15.7k
      *chosen_atoms = item;
556
15.7k
      *atoms_quality = config->get_atom_quality(config, &item->atom);
557
15.7k
    }
558
883
    else
559
883
    {
560
883
      yr_free(item);
561
883
    }
562
563
16.5k
    break;
564
565
26.2k
  case ATOM_TREE_OR:
566
567
    // The choosen nodes are those coming from the highest quality child.
568
569
26.2k
    child = node->children_head;
570
571
52.0k
    while (child != NULL)
572
26.3k
    {
573
26.3k
      FAIL_ON_ERROR(_yr_atoms_choose(config, child, &item, &quality));
574
575
26.3k
      if (quality > max_quality)
576
23.1k
      {
577
23.1k
        max_quality = quality;
578
23.1k
        yr_atoms_list_destroy(*chosen_atoms);
579
23.1k
        *chosen_atoms = item;
580
23.1k
      }
581
3.27k
      else
582
3.27k
      {
583
3.27k
        yr_atoms_list_destroy(item);
584
3.27k
      }
585
586
26.3k
      if (max_quality == YR_MAX_ATOM_QUALITY)
587
565
        break;
588
589
25.8k
      child = child->next_sibling;
590
25.8k
    }
591
592
26.2k
    *atoms_quality = max_quality;
593
26.2k
    break;
594
595
9.78k
  case ATOM_TREE_AND:
596
597
    // The choosen nodes are the concatenation of the the nodes choosen from
598
    // all the children.
599
600
9.78k
    child = node->children_head;
601
602
29.3k
    while (child != NULL)
603
19.5k
    {
604
19.5k
      FAIL_ON_ERROR(_yr_atoms_choose(config, child, &item, &quality));
605
606
19.5k
      if (quality < min_quality)
607
10.7k
        min_quality = quality;
608
609
19.5k
      if (item != NULL)
610
17.6k
      {
611
17.6k
        tail = item;
612
97.3k
        while (tail->next != NULL) tail = tail->next;
613
614
17.6k
        tail->next = *chosen_atoms;
615
17.6k
        *chosen_atoms = item;
616
17.6k
      }
617
618
19.5k
      child = child->next_sibling;
619
19.5k
    }
620
621
9.78k
    *atoms_quality = min_quality;
622
9.78k
    break;
623
52.5k
  }
624
625
52.5k
  return ERROR_SUCCESS;
626
52.5k
}
627
628
////////////////////////////////////////////////////////////////////////////////
629
// Returns all combinations of lower and upper cases for a given atom. For
630
// atom "abc" the output would be "abc" "abC" "aBC" and so on. Resulting
631
// atoms are written into the output buffer in this format:
632
//
633
//  [size of atom 1] [atom 1]  ... [size of atom N] [atom N] [0]
634
//
635
// Notice the zero at the end to indicate where the output ends.
636
//
637
// The caller is responsible of providing a buffer large enough to hold the
638
// returned atoms.
639
//
640
static uint8_t* _yr_atoms_case_combinations(
641
    uint8_t* atom,
642
    int atom_length,
643
    int atom_offset,
644
    uint8_t* output_buffer)
645
3.31M
{
646
3.31M
  uint8_t c;
647
3.31M
  uint8_t* new_atom;
648
649
3.31M
  if (atom_offset + 1 < atom_length)
650
1.83M
    output_buffer = _yr_atoms_case_combinations(
651
1.83M
        atom, atom_length, atom_offset + 1, output_buffer);
652
653
3.31M
  c = atom[atom_offset];
654
655
3.31M
  if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
656
2.45M
  {
657
    // Write atom length.
658
2.45M
    *output_buffer = atom_length;
659
2.45M
    output_buffer++;
660
661
2.45M
    memcpy(output_buffer, atom, atom_length);
662
663
2.45M
    new_atom = output_buffer;
664
2.45M
    output_buffer += atom_length;
665
666
    // Swap character case.
667
2.45M
    if (c >= 'a' && c <= 'z')
668
2.22M
      new_atom[atom_offset] -= 32;
669
231k
    else
670
231k
      new_atom[atom_offset] += 32;
671
672
2.45M
    if (atom_offset + 1 < atom_length)
673
1.12M
      output_buffer = _yr_atoms_case_combinations(
674
1.12M
          new_atom, atom_length, atom_offset + 1, output_buffer);
675
2.45M
  }
676
677
3.31M
  if (atom_offset == 0)
678
349k
    *output_buffer = 0;
679
680
3.31M
  return output_buffer;
681
3.31M
}
682
683
// Size of buffer used in _yr_atoms_case_insensitive for storing the all
684
// the possible combinations for an atom. Each atom has up to YR_MAX_ATOM_LENGTH
685
// characters and each character has two possible values (upper and lower case).
686
// That means 2 ^ YR_MAX_ATOM_LENGTH combinations for an atom, where each atom
687
// occupies YR_MAX_ATOM_LENGTH + 1 bytes (the atom itself +1 byte for its
688
// length). One extra bytes is allocated for the zero value indicating the end.
689
690
#define CASE_COMBINATIONS_BUFFER_SIZE \
691
  (1 << YR_MAX_ATOM_LENGTH) * (YR_MAX_ATOM_LENGTH + 1) + 1
692
693
////////////////////////////////////////////////////////////////////////////////
694
// For a given list of atoms returns another list of atoms
695
// with every case combination.
696
//
697
static int _yr_atoms_case_insensitive(
698
    YR_ATOM_LIST_ITEM* atoms,
699
    YR_ATOM_LIST_ITEM** case_insensitive_atoms)
700
7.38k
{
701
7.38k
  YR_ATOM_LIST_ITEM* atom;
702
7.38k
  YR_ATOM_LIST_ITEM* new_atom;
703
704
7.38k
  uint8_t buffer[CASE_COMBINATIONS_BUFFER_SIZE];
705
7.38k
  uint8_t atom_length;
706
7.38k
  uint8_t* atoms_cursor;
707
708
7.38k
  int i;
709
710
7.38k
  *case_insensitive_atoms = NULL;
711
7.38k
  atom = atoms;
712
713
357k
  while (atom != NULL)
714
349k
  {
715
349k
    _yr_atoms_case_combinations(atom->atom.bytes, atom->atom.length, 0, buffer);
716
717
349k
    atoms_cursor = buffer;
718
349k
    atom_length = *atoms_cursor;
719
349k
    atoms_cursor++;
720
721
2.80M
    while (atom_length != 0)
722
2.45M
    {
723
2.45M
      new_atom = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
724
725
2.45M
      if (new_atom == NULL)
726
0
        return ERROR_INSUFFICIENT_MEMORY;
727
728
12.2M
      for (i = 0; i < atom_length; i++)
729
9.74M
      {
730
9.74M
        new_atom->atom.bytes[i] = atoms_cursor[i];
731
9.74M
        new_atom->atom.mask[i] = 0xFF;
732
9.74M
      }
733
734
2.45M
      new_atom->atom.length = atom_length;
735
2.45M
      new_atom->forward_code_ref = atom->forward_code_ref;
736
2.45M
      new_atom->backward_code_ref = atom->backward_code_ref;
737
2.45M
      new_atom->backtrack = atom->backtrack;
738
2.45M
      new_atom->next = *case_insensitive_atoms;
739
740
2.45M
      *case_insensitive_atoms = new_atom;
741
742
2.45M
      atoms_cursor += atom_length;
743
2.45M
      atom_length = *atoms_cursor;
744
2.45M
      atoms_cursor++;
745
2.45M
    }
746
747
349k
    atom = atom->next;
748
349k
  }
749
750
7.38k
  return ERROR_SUCCESS;
751
7.38k
}
752
753
////////////////////////////////////////////////////////////////////////////////
754
// For a given list of atoms returns another list after a single byte xor
755
// has been applied to it.
756
//
757
static int _yr_atoms_xor(
758
    YR_ATOM_LIST_ITEM* atoms,
759
    uint8_t min,
760
    uint8_t max,
761
    YR_ATOM_LIST_ITEM** xor_atoms)
762
899
{
763
899
  YR_ATOM_LIST_ITEM* atom;
764
899
  YR_ATOM_LIST_ITEM* new_atom;
765
766
899
  int i, j;
767
899
  *xor_atoms = NULL;
768
899
  atom = atoms;
769
770
2.09k
  while (atom != NULL)
771
1.19k
  {
772
184k
    for (j = min; j <= max; j++)
773
183k
    {
774
183k
      new_atom = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
775
776
183k
      if (new_atom == NULL)
777
0
        return ERROR_INSUFFICIENT_MEMORY;
778
779
711k
      for (i = 0; i < atom->atom.length; i++)
780
528k
      {
781
528k
        new_atom->atom.bytes[i] = atom->atom.bytes[i] ^ j;
782
528k
        new_atom->atom.mask[i] = 0xFF;
783
528k
      }
784
785
183k
      new_atom->atom.length = yr_min(atom->atom.length, YR_MAX_ATOM_LENGTH);
786
183k
      new_atom->forward_code_ref = atom->forward_code_ref;
787
183k
      new_atom->backward_code_ref = atom->backward_code_ref;
788
183k
      new_atom->backtrack = atom->backtrack;
789
183k
      new_atom->next = *xor_atoms;
790
791
183k
      *xor_atoms = new_atom;
792
183k
    }
793
794
1.19k
    atom = atom->next;
795
1.19k
  }
796
899
  return ERROR_SUCCESS;
797
899
}
798
799
////////////////////////////////////////////////////////////////////////////////
800
// For a given list of atoms returns another list with the corresponding
801
// wide atoms. Wide atoms are just the original atoms with interleaved zeroes,
802
// for example: 01 02 -> 01 00 02 00
803
//
804
static int _yr_atoms_wide(
805
    YR_ATOM_LIST_ITEM* atoms,
806
    YR_ATOM_LIST_ITEM** wide_atoms)
807
3.36k
{
808
3.36k
  YR_ATOM_LIST_ITEM* atom;
809
3.36k
  YR_ATOM_LIST_ITEM* new_atom;
810
811
3.36k
  int i;
812
813
3.36k
  *wide_atoms = NULL;
814
3.36k
  atom = atoms;
815
816
46.9k
  while (atom != NULL)
817
43.5k
  {
818
43.5k
    new_atom = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
819
820
43.5k
    if (new_atom == NULL)
821
0
      return ERROR_INSUFFICIENT_MEMORY;
822
823
217k
    for (i = 0; i < YR_MAX_ATOM_LENGTH; i++)
824
174k
    {
825
174k
      new_atom->atom.bytes[i] = 0;
826
174k
      new_atom->atom.mask[i] = 0xFF;
827
174k
    }
828
829
128k
    for (i = 0; i < atom->atom.length; i++)
830
126k
    {
831
126k
      if (i * 2 < YR_MAX_ATOM_LENGTH)
832
85.1k
        new_atom->atom.bytes[i * 2] = atom->atom.bytes[i];
833
41.0k
      else
834
41.0k
        break;
835
126k
    }
836
837
43.5k
    new_atom->atom.length = yr_min(atom->atom.length * 2, YR_MAX_ATOM_LENGTH);
838
43.5k
    new_atom->forward_code_ref = atom->forward_code_ref;
839
43.5k
    new_atom->backward_code_ref = atom->backward_code_ref;
840
43.5k
    new_atom->backtrack = atom->backtrack * 2;
841
43.5k
    new_atom->next = *wide_atoms;
842
843
43.5k
    *wide_atoms = new_atom;
844
845
43.5k
    atom = atom->next;
846
43.5k
  }
847
848
3.36k
  return ERROR_SUCCESS;
849
3.36k
}
850
851
struct STACK_ITEM
852
{
853
  RE_NODE* re_node;
854
  YR_ATOM_TREE_NODE* new_appending_node;
855
};
856
857
#define make_atom_from_re_nodes(atom, nodes_length, nodes)   \
858
3.94M
  {                                                          \
859
3.94M
    atom.length = nodes_length;                              \
860
19.5M
    for (i = 0; i < atom.length; i++)                        \
861
15.5M
    {                                                        \
862
15.5M
      atom.bytes[i] = (uint8_t) (recent_re_nodes)[i]->value; \
863
15.5M
      atom.mask[i] = (uint8_t) (recent_re_nodes)[i]->mask;   \
864
15.5M
    }                                                        \
865
3.94M
  }
866
867
////////////////////////////////////////////////////////////////////////////////
868
// Extract atoms from a regular expression. This is a helper function used by
869
// yr_atoms_extract_from_re that receives the abstract syntax tree for a regexp
870
// (or hex pattern) and builds an atom tree. The appending_node argument is a
871
// pointer to the ATOM_TREE_OR node at the root of the atom tree. This function
872
// creates the tree by appending new nodes to it.
873
//
874
static int _yr_atoms_extract_from_re(
875
    YR_ATOMS_CONFIG* config,
876
    RE_AST* re_ast,
877
    YR_ATOM_TREE_NODE* appending_node)
878
6.62k
{
879
6.62k
  YR_STACK* stack;
880
6.62k
  RE_NODE* re_node;
881
882
6.62k
  YR_ATOM atom = {0};
883
6.62k
  YR_ATOM best_atom = {0};
884
885
6.62k
  struct STACK_ITEM si;
886
887
6.62k
  int i, shift;
888
6.62k
  int quality;
889
6.62k
  int best_quality = -1;
890
6.62k
  int n = 0;
891
892
6.62k
  YR_ATOM_TREE_NODE* and_node;
893
6.62k
  YR_ATOM_TREE_NODE* left_node;
894
6.62k
  YR_ATOM_TREE_NODE* right_node;
895
896
  // The RE_NODEs most recently visited that can conform an atom (ie:
897
  // RE_NODE_LITERAL, RE_NODE_MASKED_LITERAL and RE_NODE_ANY). The number of
898
  // items in this array is n.
899
6.62k
  RE_NODE* recent_re_nodes[YR_MAX_ATOM_LENGTH];
900
901
  // The RE_NODEs corresponding to the best atom found so far for the current
902
  // appending node.
903
6.62k
  RE_NODE* best_atom_re_nodes[YR_MAX_ATOM_LENGTH];
904
905
  // This holds the ATOM_TREE_OR node where leaves (ATOM_TREE_LEAF) are
906
  // currently being appended.
907
6.62k
  YR_ATOM_TREE_NODE* current_appending_node = NULL;
908
909
  // This holds the ATOM_TREE_LEAF node whose atom is currently being updated.
910
6.62k
  YR_ATOM_TREE_NODE* leaf = NULL;
911
912
6.62k
  FAIL_ON_ERROR(yr_stack_create(1024, sizeof(si), &stack));
913
914
  // This first item pushed in the stack is the last one to be poped out, the
915
  // sole purpose of this item is forcing that any pending leaf is appended to
916
  // appending_node during the last iteration of the loop.
917
6.62k
  si.re_node = NULL;
918
6.62k
  si.new_appending_node = appending_node;
919
920
6.62k
  FAIL_ON_ERROR_WITH_CLEANUP(
921
6.62k
      yr_stack_push(stack, (void*) &si), yr_stack_destroy(stack));
922
923
  // Start processing the root node.
924
6.62k
  si.re_node = re_ast->root_node;
925
926
  // Leaf nodes are initially appended to the node passed in the appending_node,
927
  // argument which is the root ATOM_TREE_OR node that is empty at this point.
928
6.62k
  si.new_appending_node = appending_node;
929
930
6.62k
  FAIL_ON_ERROR_WITH_CLEANUP(
931
6.62k
      yr_stack_push(stack, (void*) &si), yr_stack_destroy(stack));
932
933
13.3M
  while (yr_stack_pop(stack, (void*) &si))
934
13.3M
  {
935
    // Change the appending node if the item poped from the stack says so.
936
13.3M
    if (si.new_appending_node != NULL)
937
509k
    {
938
      // Before changing the appending node let's append any pending leaf to
939
      // the current appending node.
940
509k
      if (n > 0)
941
312k
      {
942
312k
        make_atom_from_re_nodes(atom, n, recent_re_nodes);
943
312k
        shift = _yr_atoms_trim(&atom);
944
312k
        quality = config->get_atom_quality(config, &atom);
945
946
312k
        FAIL_ON_NULL_WITH_CLEANUP(
947
312k
            leaf = _yr_atoms_tree_node_create(ATOM_TREE_LEAF),
948
312k
            yr_stack_destroy(stack));
949
950
312k
        if (quality > best_quality)
951
93.3k
        {
952
93.3k
          memcpy(&leaf->atom, &atom, sizeof(atom));
953
93.3k
          memcpy(
954
93.3k
              &leaf->re_nodes,
955
93.3k
              &recent_re_nodes[shift],
956
93.3k
              sizeof(recent_re_nodes) - shift * sizeof(recent_re_nodes[0]));
957
93.3k
        }
958
218k
        else
959
218k
        {
960
218k
          memcpy(&leaf->atom, &best_atom, sizeof(best_atom));
961
218k
          memcpy(
962
218k
              &leaf->re_nodes, &best_atom_re_nodes, sizeof(best_atom_re_nodes));
963
218k
        }
964
965
312k
        _yr_atoms_tree_node_append(current_appending_node, leaf);
966
312k
        n = 0;
967
312k
      }
968
969
509k
      current_appending_node = si.new_appending_node;
970
509k
      best_quality = -1;
971
509k
    }
972
973
13.3M
    if (si.re_node != NULL)
974
12.8M
    {
975
12.8M
      switch (si.re_node->type)
976
12.8M
      {
977
11.9M
      case RE_NODE_LITERAL:
978
11.9M
      case RE_NODE_MASKED_LITERAL:
979
12.0M
      case RE_NODE_ANY:
980
981
12.0M
        if (n < YR_MAX_ATOM_LENGTH)
982
1.02M
        {
983
1.02M
          recent_re_nodes[n] = si.re_node;
984
1.02M
          best_atom_re_nodes[n] = si.re_node;
985
1.02M
          best_atom.bytes[n] = (uint8_t) si.re_node->value;
986
1.02M
          best_atom.mask[n] = (uint8_t) si.re_node->mask;
987
1.02M
          best_atom.length = ++n;
988
1.02M
        }
989
11.0M
        else if (best_quality < YR_MAX_ATOM_QUALITY)
990
3.63M
        {
991
3.63M
          make_atom_from_re_nodes(atom, n, recent_re_nodes);
992
3.63M
          shift = _yr_atoms_trim(&atom);
993
3.63M
          quality = config->get_atom_quality(config, &atom);
994
995
3.63M
          if (quality > best_quality)
996
697k
          {
997
3.48M
            for (i = 0; i < atom.length; i++)
998
2.78M
            {
999
2.78M
              best_atom.bytes[i] = atom.bytes[i];
1000
2.78M
              best_atom.mask[i] = atom.mask[i];
1001
2.78M
              best_atom_re_nodes[i] = recent_re_nodes[i + shift];
1002
2.78M
            }
1003
1004
697k
            best_atom.length = atom.length;
1005
697k
            best_quality = quality;
1006
697k
          }
1007
1008
14.5M
          for (i = 1; i < YR_MAX_ATOM_LENGTH; i++)
1009
10.9M
            recent_re_nodes[i - 1] = recent_re_nodes[i];
1010
1011
3.63M
          recent_re_nodes[YR_MAX_ATOM_LENGTH - 1] = si.re_node;
1012
3.63M
        }
1013
1014
12.0M
        break;
1015
1016
315k
      case RE_NODE_CONCAT:
1017
1018
315k
        re_node = si.re_node->children_tail;
1019
1020
        // Push children right to left, they are poped left to right.
1021
12.9M
        while (re_node != NULL)
1022
12.6M
        {
1023
12.6M
          si.new_appending_node = NULL;
1024
12.6M
          si.re_node = re_node;
1025
1026
12.6M
          FAIL_ON_ERROR_WITH_CLEANUP(
1027
12.6M
              yr_stack_push(stack, &si), yr_stack_destroy(stack));
1028
1029
12.6M
          re_node = re_node->prev_sibling;
1030
12.6M
        }
1031
1032
315k
        break;
1033
1034
315k
      case RE_NODE_ALT:
1035
1036
        // Create ATOM_TREE_AND node with two ATOM_TREE_OR children nodes.
1037
9.78k
        and_node = _yr_atoms_tree_node_create(ATOM_TREE_AND);
1038
9.78k
        left_node = _yr_atoms_tree_node_create(ATOM_TREE_OR);
1039
9.78k
        right_node = _yr_atoms_tree_node_create(ATOM_TREE_OR);
1040
1041
9.78k
        if (and_node == NULL || left_node == NULL || right_node == NULL)
1042
0
        {
1043
0
          _yr_atoms_tree_node_destroy(and_node);
1044
0
          _yr_atoms_tree_node_destroy(left_node);
1045
0
          _yr_atoms_tree_node_destroy(right_node);
1046
1047
0
          yr_stack_destroy(stack);
1048
1049
0
          return ERROR_INSUFFICIENT_MEMORY;
1050
0
        }
1051
1052
9.78k
        and_node->children_head = left_node;
1053
9.78k
        and_node->children_tail = right_node;
1054
9.78k
        left_node->next_sibling = right_node;
1055
1056
        // Add the ATOM_TREE_AND as children of the current node.
1057
9.78k
        _yr_atoms_tree_node_append(current_appending_node, and_node);
1058
1059
9.78k
        re_node = si.re_node;
1060
1061
9.78k
        si.new_appending_node = current_appending_node;
1062
9.78k
        si.re_node = NULL;
1063
1064
9.78k
        FAIL_ON_ERROR_WITH_CLEANUP(
1065
9.78k
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1066
1067
        // RE_NODE_ALT nodes has only two children, so children_head is the
1068
        // left one, and children_tail is right one.
1069
9.78k
        si.new_appending_node = right_node;
1070
9.78k
        si.re_node = re_node->children_tail;
1071
1072
9.78k
        FAIL_ON_ERROR_WITH_CLEANUP(
1073
9.78k
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1074
1075
9.78k
        si.new_appending_node = left_node;
1076
9.78k
        si.re_node = re_node->children_head;
1077
1078
9.78k
        FAIL_ON_ERROR_WITH_CLEANUP(
1079
9.78k
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1080
1081
9.78k
        break;
1082
1083
654
      case RE_NODE_PLUS:
1084
1085
654
        re_node = si.re_node;
1086
1087
654
        si.new_appending_node = current_appending_node;
1088
654
        si.re_node = NULL;
1089
1090
654
        FAIL_ON_ERROR_WITH_CLEANUP(
1091
654
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1092
1093
654
        si.new_appending_node = NULL;
1094
        // RE_NODE_PLUS nodes has a single child, which is children_head.
1095
654
        si.re_node = re_node->children_head;
1096
1097
654
        FAIL_ON_ERROR_WITH_CLEANUP(
1098
654
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1099
1100
654
        break;
1101
1102
56.0k
      case RE_NODE_RANGE:
1103
1104
56.0k
        re_node = si.re_node;
1105
1106
56.0k
        si.new_appending_node = current_appending_node;
1107
56.0k
        si.re_node = NULL;
1108
1109
56.0k
        FAIL_ON_ERROR_WITH_CLEANUP(
1110
56.0k
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1111
1112
56.0k
        si.new_appending_node = NULL;
1113
1114
        // RE_NODE_RANGE nodes has a single child, which is children_head.
1115
56.0k
        si.re_node = re_node->children_head;
1116
1117
        // In a regexp like /a{10,20}/ the optimal atom is 'aaaa' (assuming
1118
        // that YR_MAX_ATOM_LENGTH = 4) because the 'a' character must appear
1119
        // at least 10 times in the matching string. Each call in the loop
1120
        // will append one 'a' to the atom, so YR_MAX_ATOM_LENGTH iterations
1121
        // are enough.
1122
1123
230k
        for (i = 0; i < yr_min(re_node->start, YR_MAX_ATOM_LENGTH); i++)
1124
174k
        {
1125
174k
          FAIL_ON_ERROR_WITH_CLEANUP(
1126
174k
              yr_stack_push(stack, &si), yr_stack_destroy(stack));
1127
174k
        }
1128
1129
56.0k
        break;
1130
1131
56.0k
      case RE_NODE_RANGE_ANY:
1132
2.05k
      case RE_NODE_STAR:
1133
2.48k
      case RE_NODE_CLASS:
1134
2.61k
      case RE_NODE_WORD_CHAR:
1135
2.80k
      case RE_NODE_NON_WORD_CHAR:
1136
2.80k
      case RE_NODE_SPACE:
1137
2.93k
      case RE_NODE_NON_SPACE:
1138
3.11k
      case RE_NODE_DIGIT:
1139
3.11k
      case RE_NODE_NON_DIGIT:
1140
4.05k
      case RE_NODE_EMPTY:
1141
243k
      case RE_NODE_ANCHOR_START:
1142
408k
      case RE_NODE_ANCHOR_END:
1143
408k
      case RE_NODE_WORD_BOUNDARY:
1144
409k
      case RE_NODE_NON_WORD_BOUNDARY:
1145
409k
      case RE_NODE_NOT_LITERAL:
1146
409k
      case RE_NODE_MASKED_NOT_LITERAL:
1147
1148
409k
        si.new_appending_node = current_appending_node;
1149
409k
        si.re_node = NULL;
1150
1151
409k
        FAIL_ON_ERROR_WITH_CLEANUP(
1152
409k
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1153
1154
409k
        break;
1155
1156
0
      default:
1157
0
        assert(false);
1158
12.8M
      }
1159
12.8M
    }
1160
13.3M
  }
1161
1162
6.62k
  yr_stack_destroy(stack);
1163
1164
6.62k
  return ERROR_SUCCESS;
1165
6.62k
}
1166
1167
////////////////////////////////////////////////////////////////////////////////
1168
// Makes an exact copy of an YR_ATOM_LIST_ITEM.
1169
//
1170
static YR_ATOM_LIST_ITEM* _yr_atoms_clone_list_item(YR_ATOM_LIST_ITEM* item)
1171
426k
{
1172
426k
  YR_ATOM_LIST_ITEM* clone = (YR_ATOM_LIST_ITEM*) yr_malloc(
1173
426k
      sizeof(YR_ATOM_LIST_ITEM));
1174
1175
426k
  if (clone == NULL)
1176
0
    return NULL;
1177
1178
426k
  memcpy(clone, item, sizeof(YR_ATOM_LIST_ITEM));
1179
1180
426k
  return clone;
1181
426k
}
1182
1183
////////////////////////////////////////////////////////////////////////////////
1184
// Given list of atoms that may contain wildcards, replace those wildcarded
1185
// atoms with a list of non-wildcarded atoms covering all the combinations
1186
// allowed by the wildcarded atom. For example, the atom {01 ?2 03} will be
1187
// replaced by {01 02 03}, {01 12 03}, {01 22 03} .. {01 F2 03}. The list
1188
// is modified in-place.
1189
//
1190
// Args:
1191
//   atoms: Pointer to first element of the list.
1192
//
1193
// Returns:
1194
//   ERROR_SUCCESS or ERROR_INSUFFICIENT_MEMORY.
1195
//
1196
static int _yr_atoms_expand_wildcards(YR_ATOM_LIST_ITEM* atoms)
1197
6.62k
{
1198
6.62k
  int i;
1199
1200
6.62k
  YR_ATOM_LIST_ITEM* atom = atoms;
1201
6.62k
  YR_ATOM_LIST_ITEM* new_atom;
1202
6.62k
  YR_ATOM_LIST_ITEM* prev_atom;
1203
6.62k
  YR_ATOM_LIST_ITEM* next_atom;
1204
1205
451k
  while (atom != NULL)
1206
444k
  {
1207
444k
    bool expanded = false;
1208
1209
2.15M
    for (i = 0; i < atom->atom.length; i++)
1210
1.70M
    {
1211
1.70M
      uint16_t a, s, e, incr = 1;
1212
1213
1.70M
      switch (atom->atom.mask[i])
1214
1.70M
      {
1215
1.48k
      case 0x00:
1216
1.48k
        expanded = true;
1217
1.48k
        s = 0x00;
1218
1.48k
        e = 0xFF;
1219
1.48k
        break;
1220
1221
468
      case 0x0F:
1222
468
        expanded = true;
1223
468
        s = atom->atom.bytes[i];
1224
468
        e = atom->atom.bytes[i] | 0xF0;
1225
468
        incr = 0x10;
1226
468
        break;
1227
1228
2.71k
      case 0xF0:
1229
2.71k
        expanded = true;
1230
2.71k
        s = atom->atom.bytes[i];
1231
2.71k
        e = atom->atom.bytes[i] | 0x0F;
1232
2.71k
        break;
1233
1234
1.70M
      default:
1235
1.70M
        s = 0;
1236
1.70M
        e = 0;
1237
1.70M
      }
1238
1239
1.70M
      if (s != e)
1240
4.66k
      {
1241
4.66k
        atom->atom.bytes[i] = (uint8_t) s;
1242
4.66k
        atom->atom.mask[i] = 0xFF;
1243
4.66k
      }
1244
1245
1.70M
      prev_atom = atom;
1246
1.70M
      next_atom = atom->next;
1247
1248
2.13M
      for (a = s + incr; a <= e; a += incr)
1249
426k
      {
1250
426k
        new_atom = _yr_atoms_clone_list_item(atom);
1251
1252
426k
        if (new_atom == NULL)
1253
0
          return ERROR_INSUFFICIENT_MEMORY;
1254
1255
426k
        new_atom->atom.bytes[i] = (uint8_t) a;
1256
426k
        new_atom->atom.mask[i] = 0xFF;
1257
426k
        new_atom->next = next_atom;
1258
426k
        prev_atom->next = new_atom;
1259
426k
        prev_atom = new_atom;
1260
426k
      }
1261
1.70M
    }
1262
1263
444k
    if (!expanded)
1264
439k
      atom = atom->next;
1265
444k
  }
1266
1267
6.62k
  return ERROR_SUCCESS;
1268
6.62k
}
1269
1270
////////////////////////////////////////////////////////////////////////////////
1271
// Extract atoms from a regular expression. This function receives the abstract
1272
// syntax tree for a regexp (or hex pattern) and returns a list of atoms that
1273
// should be added to the Aho-Corasick automaton.
1274
//
1275
int yr_atoms_extract_from_re(
1276
    YR_ATOMS_CONFIG* config,
1277
    RE_AST* re_ast,
1278
    YR_MODIFIER modifier,
1279
    YR_ATOM_LIST_ITEM** atoms,
1280
    int* min_atom_quality)
1281
6.62k
{
1282
6.62k
  YR_ATOM_TREE* atom_tree = (YR_ATOM_TREE*) yr_malloc(sizeof(YR_ATOM_TREE));
1283
1284
6.62k
  YR_ATOM_LIST_ITEM* wide_atoms;
1285
6.62k
  YR_ATOM_LIST_ITEM* case_insensitive_atoms;
1286
1287
6.62k
  if (atom_tree == NULL)
1288
0
    return ERROR_INSUFFICIENT_MEMORY;
1289
1290
6.62k
  atom_tree->root_node = _yr_atoms_tree_node_create(ATOM_TREE_OR);
1291
1292
6.62k
  if (atom_tree->root_node == NULL)
1293
0
  {
1294
0
    _yr_atoms_tree_destroy(atom_tree);
1295
0
    return ERROR_INSUFFICIENT_MEMORY;
1296
0
  }
1297
1298
6.62k
  FAIL_ON_ERROR_WITH_CLEANUP(
1299
6.62k
      _yr_atoms_extract_from_re(config, re_ast, atom_tree->root_node),
1300
6.62k
      _yr_atoms_tree_destroy(atom_tree));
1301
1302
  // Initialize atom list
1303
6.62k
  *atoms = NULL;
1304
1305
  // Choose the atoms that will be used.
1306
6.62k
  FAIL_ON_ERROR_WITH_CLEANUP(
1307
6.62k
      _yr_atoms_choose(config, atom_tree->root_node, atoms, min_atom_quality),
1308
6.62k
      _yr_atoms_tree_destroy(atom_tree));
1309
1310
6.62k
  _yr_atoms_tree_destroy(atom_tree);
1311
1312
6.62k
  FAIL_ON_ERROR_WITH_CLEANUP(
1313
6.62k
      _yr_atoms_expand_wildcards(*atoms),
1314
6.62k
      {  // Cleanup
1315
6.62k
        yr_atoms_list_destroy(*atoms);
1316
6.62k
        *atoms = NULL;
1317
6.62k
      });
1318
1319
  // Don't do convert atoms to wide here if either base64 modifier is used.
1320
  // This is to avoid the situation where we have "base64 wide" because
1321
  // the wide has already been applied BEFORE the base64 encoding.
1322
6.62k
  if (modifier.flags & STRING_FLAGS_WIDE &&
1323
1.90k
      !(modifier.flags & STRING_FLAGS_BASE64 ||
1324
1.78k
        modifier.flags & STRING_FLAGS_BASE64_WIDE))
1325
1.42k
  {
1326
1.42k
    FAIL_ON_ERROR_WITH_CLEANUP(
1327
1.42k
        _yr_atoms_wide(*atoms, &wide_atoms),
1328
1.42k
        {  // Cleanup
1329
1.42k
          yr_atoms_list_destroy(*atoms);
1330
1.42k
          yr_atoms_list_destroy(wide_atoms);
1331
1.42k
          *atoms = NULL;
1332
1.42k
        });
1333
1334
1.42k
    if (modifier.flags & STRING_FLAGS_ASCII)
1335
592
    {
1336
592
      *atoms = _yr_atoms_list_concat(*atoms, wide_atoms);
1337
592
    }
1338
832
    else
1339
832
    {
1340
832
      yr_atoms_list_destroy(*atoms);
1341
832
      *atoms = wide_atoms;
1342
832
    }
1343
1.42k
  }
1344
1345
6.62k
  if (modifier.flags & STRING_FLAGS_NO_CASE)
1346
1.72k
  {
1347
1.72k
    FAIL_ON_ERROR_WITH_CLEANUP(
1348
1.72k
        _yr_atoms_case_insensitive(*atoms, &case_insensitive_atoms),
1349
1.72k
        {  // Cleanup
1350
1.72k
          yr_atoms_list_destroy(*atoms);
1351
1.72k
          yr_atoms_list_destroy(case_insensitive_atoms);
1352
1.72k
          *atoms = NULL;
1353
1.72k
        });
1354
1355
1.72k
    *atoms = _yr_atoms_list_concat(*atoms, case_insensitive_atoms);
1356
1.72k
  }
1357
1358
6.62k
  return ERROR_SUCCESS;
1359
6.62k
}
1360
1361
////////////////////////////////////////////////////////////////////////////////
1362
// Extract atoms from a string.
1363
//
1364
int yr_atoms_extract_from_string(
1365
    YR_ATOMS_CONFIG* config,
1366
    uint8_t* string,
1367
    int32_t string_length,
1368
    YR_MODIFIER modifier,
1369
    YR_ATOM_LIST_ITEM** atoms,
1370
    int* min_atom_quality)
1371
22.7k
{
1372
22.7k
  YR_ATOM_LIST_ITEM* item;
1373
22.7k
  YR_ATOM_LIST_ITEM* case_insensitive_atoms;
1374
22.7k
  YR_ATOM_LIST_ITEM* xor_atoms;
1375
22.7k
  YR_ATOM_LIST_ITEM* wide_atoms;
1376
1377
22.7k
  YR_ATOM atom;
1378
1379
22.7k
  int quality, max_quality;
1380
22.7k
  int i;
1381
1382
22.7k
  item = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
1383
1384
22.7k
  if (item == NULL)
1385
0
    return ERROR_INSUFFICIENT_MEMORY;
1386
1387
22.7k
  item->forward_code_ref = YR_ARENA_NULL_REF;
1388
22.7k
  item->backward_code_ref = YR_ARENA_NULL_REF;
1389
22.7k
  item->next = NULL;
1390
22.7k
  item->backtrack = 0;
1391
1392
22.7k
  item->atom.length = yr_min(string_length, YR_MAX_ATOM_LENGTH);
1393
1394
81.3k
  for (i = 0; i < item->atom.length; i++)
1395
58.6k
  {
1396
58.6k
    item->atom.bytes[i] = string[i];
1397
58.6k
    item->atom.mask[i] = 0xFF;
1398
58.6k
  }
1399
1400
22.7k
  max_quality = config->get_atom_quality(config, &item->atom);
1401
1402
22.7k
  atom.length = YR_MAX_ATOM_LENGTH;
1403
22.7k
  memset(atom.mask, 0xFF, atom.length);
1404
1405
22.7k
  for (i = YR_MAX_ATOM_LENGTH;
1406
68.0k
       i < string_length && max_quality < YR_MAX_ATOM_QUALITY;
1407
45.3k
       i++)
1408
45.3k
  {
1409
45.3k
    atom.length = YR_MAX_ATOM_LENGTH;
1410
45.3k
    memcpy(atom.bytes, string + i - YR_MAX_ATOM_LENGTH + 1, atom.length);
1411
1412
45.3k
    quality = config->get_atom_quality(config, &atom);
1413
1414
45.3k
    if (quality > max_quality)
1415
4.87k
    {
1416
4.87k
      memcpy(&item->atom, &atom, sizeof(atom));
1417
4.87k
      item->backtrack = i - YR_MAX_ATOM_LENGTH + 1;
1418
4.87k
      max_quality = quality;
1419
4.87k
    }
1420
45.3k
  }
1421
1422
22.7k
  *atoms = item;
1423
22.7k
  *min_atom_quality = max_quality;
1424
1425
22.7k
  if (modifier.flags & STRING_FLAGS_WIDE)
1426
1.93k
  {
1427
1.93k
    FAIL_ON_ERROR_WITH_CLEANUP(
1428
1.93k
        _yr_atoms_wide(*atoms, &wide_atoms),
1429
1.93k
        {  // Cleanup
1430
1.93k
          yr_atoms_list_destroy(*atoms);
1431
1.93k
          yr_atoms_list_destroy(wide_atoms);
1432
1.93k
          *atoms = NULL;
1433
1.93k
        });
1434
1435
1.93k
    if (modifier.flags & STRING_FLAGS_ASCII)
1436
1.02k
    {
1437
1.02k
      *atoms = _yr_atoms_list_concat(*atoms, wide_atoms);
1438
1.02k
    }
1439
909
    else
1440
909
    {
1441
909
      yr_atoms_list_destroy(*atoms);
1442
909
      *atoms = wide_atoms;
1443
909
    }
1444
1.93k
  }
1445
1446
22.7k
  if (modifier.flags & STRING_FLAGS_NO_CASE)
1447
5.66k
  {
1448
5.66k
    FAIL_ON_ERROR_WITH_CLEANUP(
1449
5.66k
        _yr_atoms_case_insensitive(*atoms, &case_insensitive_atoms),
1450
5.66k
        {  // Cleanup
1451
5.66k
          yr_atoms_list_destroy(*atoms);
1452
5.66k
          yr_atoms_list_destroy(case_insensitive_atoms);
1453
5.66k
          *atoms = NULL;
1454
5.66k
        });
1455
1456
5.66k
    *atoms = _yr_atoms_list_concat(*atoms, case_insensitive_atoms);
1457
5.66k
  }
1458
1459
22.7k
  if (modifier.flags & STRING_FLAGS_XOR)
1460
899
  {
1461
899
    FAIL_ON_ERROR_WITH_CLEANUP(
1462
899
        _yr_atoms_xor(*atoms, modifier.xor_min, modifier.xor_max, &xor_atoms),
1463
899
        {  // Cleanup
1464
899
          yr_atoms_list_destroy(*atoms);
1465
899
          yr_atoms_list_destroy(xor_atoms);
1466
899
          *atoms = NULL;
1467
899
        });
1468
1469
899
    yr_atoms_list_destroy(*atoms);
1470
899
    *atoms = xor_atoms;
1471
899
  }
1472
1473
  // Recheck the atom quality, in case we have just generated some poor atoms.
1474
  // https://github.com/VirusTotal/yara/issues/1172
1475
236k
  for (item = *atoms; item != NULL; item = item->next)
1476
213k
  {
1477
213k
    quality = config->get_atom_quality(config, &item->atom);
1478
213k
    if (quality < *min_atom_quality)
1479
1.53k
      *min_atom_quality = quality;
1480
213k
  }
1481
1482
22.7k
  return ERROR_SUCCESS;
1483
22.7k
}
1484
1485
////////////////////////////////////////////////////////////////////////////////
1486
// Prints an atom tree node. Used only for debugging purposes.
1487
//
1488
void yr_atoms_tree_node_print(YR_ATOM_TREE_NODE* node)
1489
0
{
1490
0
  YR_ATOM_TREE_NODE* child;
1491
1492
0
  if (node == NULL)
1493
0
  {
1494
0
    printf("Empty tree node\n");
1495
0
    return;
1496
0
  }
1497
1498
0
  switch (node->type)
1499
0
  {
1500
0
  case ATOM_TREE_LEAF:
1501
0
    for (int i = 0; i < node->atom.length; i++)
1502
0
      printf("%02X", node->atom.bytes[i]);
1503
0
    break;
1504
1505
0
  case ATOM_TREE_AND:
1506
0
  case ATOM_TREE_OR:
1507
0
    if (node->type == ATOM_TREE_AND)
1508
0
      printf("AND");
1509
0
    else
1510
0
      printf("OR");
1511
0
    printf("(");
1512
0
    child = node->children_head;
1513
0
    while (child != NULL)
1514
0
    {
1515
0
      yr_atoms_tree_node_print(child);
1516
0
      child = child->next_sibling;
1517
0
      if (child != NULL)
1518
0
        printf(",");
1519
0
    }
1520
0
    printf(")");
1521
0
    break;
1522
0
  }
1523
0
}