Coverage Report

Created: 2023-09-25 07:12

/src/yara/libyara/atoms.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2013-2018. The YARA Authors. All Rights Reserved.
3
4
Redistribution and use in source and binary forms, with or without modification,
5
are permitted provided that the following conditions are met:
6
7
1. Redistributions of source code must retain the above copyright notice, this
8
list of conditions and the following disclaimer.
9
10
2. Redistributions in binary form must reproduce the above copyright notice,
11
this list of conditions and the following disclaimer in the documentation and/or
12
other materials provided with the distribution.
13
14
3. Neither the name of the copyright holder nor the names of its contributors
15
may be used to endorse or promote products derived from this software without
16
specific prior written permission.
17
18
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
*/
29
30
/*
31
32
This module handles atom extraction from regexps and hex strings. Atoms are
33
undivided substrings found in a regexps and hex strings. Let's consider this
34
hex string:
35
36
{ 01 02 03 04 05 ?? 06 07 08 [1-2] 09 0A }
37
38
In the above string, byte sequences 0102030405, 060708 and 090A are atoms.
39
Similarly, in this regexp:
40
41
/abc.*ed[0-9]+fgh/
42
43
The strings "abc", "ed" and "fgh" are atoms.
44
45
When searching for regexps/hex strings matching a file, YARA uses these
46
atoms to find locations inside the file where the regexp/hex string could
47
match. If the atom "abc" is found somewhere inside the file, there is a chance
48
for /abc.*ed[0-9]+fgh/ to match the file, if "abc" doesn't appear in the file
49
there's no chance for the regexp to match. When the atom is found in the file
50
YARA proceeds to fully evaluate the regexp/hex string to determine if it's
51
actually a match.
52
53
For each regexp/hex string YARA extracts one or more atoms. Sometimes a
54
single atom is enough (like in the previous example "abc" is enough for finding
55
/abc.*ed[0-9]+fgh/), but sometimes a single atom isn't enough like in the
56
regexp /(abc|efg)/. In this case YARA must search for both "abc" AND "efg" and
57
fully evaluate the regexp whenever one of these atoms is found.
58
59
In the regexp /Look(at|into)this/ YARA can search for "Look", or search for
60
"this", or search for both "at" and "into". This is what we call an atoms tree,
61
because it can be represented by the following tree structure:
62
63
-OR
64
  |- "Look"
65
  |
66
  |- AND
67
  |   |
68
  |   |- "at"
69
  |    - "into"
70
  |
71
   - "this"
72
73
From an atom tree YARA chooses the best combination, trying to minimize the
74
number of required atoms, but also using high quality atoms (long atoms with
75
not too many zeroes and a bit of byte diversity). In the previous example YARA
76
will end up using the "Look" atom alone, but in /a(bcd|efg)h/ atoms "bcd" and
77
"efg" will be used because "a" and "h" are too short.
78
*/
79
80
#include <assert.h>
81
#include <string.h>
82
#include <yara/atoms.h>
83
#include <yara/error.h>
84
#include <yara/globals.h>
85
#include <yara/limits.h>
86
#include <yara/mem.h>
87
#include <yara/stack.h>
88
#include <yara/types.h>
89
#include <yara/utils.h>
90
91
////////////////////////////////////////////////////////////////////////////////
92
// Returns a numeric value indicating the quality of an atom. The quality
93
// depends on some characteristics of the atom, including its length, number
94
// of very common bytes like 00 and FF and number of unique distinct bytes.
95
// Atom 00 00 has a very low quality, because it's only two bytes long and
96
// both bytes are zeroes. Atom 01 01 01 01 is better but still not optimal,
97
// because the same byte is repeated. Atom 01 02 03 04 is an optimal one.
98
//
99
// Args:
100
//    config: Pointer to YR_ATOMS_CONFIG struct.
101
//    atom: Pointer to YR_ATOM struct.
102
//
103
// Returns:
104
//    An integer indicating the atom's quality
105
//
106
int yr_atoms_heuristic_quality(YR_ATOMS_CONFIG* config, YR_ATOM* atom)
107
0
{
108
0
  YR_BITMASK seen_bytes[YR_BITMASK_SIZE(256)];
109
110
0
  int quality = 0;
111
0
  int unique_bytes = 0;
112
113
0
  assert(atom->length <= YR_MAX_ATOM_LENGTH);
114
115
0
  yr_bitmask_clear_all(seen_bytes);
116
117
  // Each byte in the atom contributes a certain amount of points to the
118
  // quality. Bytes [a-zA-Z] contribute 18 points each, common bytes like
119
  // 0x00, 0x20 and 0xFF contribute only 12 points, and the rest of the
120
  // bytes contribute 20 points. The ?? mask substracts 10 points, and masks
121
  // X? and ?X contribute 4 points. An additional boost consisting in 2x the
122
  // number of unique bytes in the atom is added to the quality. This are
123
  // some examples of the quality of atoms:
124
  //
125
  //   01 02 03 04   quality = 20 + 20 + 20 + 20 + 8 = 88
126
  //   01 ?? 03 04   quality = 20 - 10 + 20 + 20 + 6 = 56
127
  //   01 0? 03      quality = 20 +  4 + 20      + 4 = 48
128
  //   01 02         quality = 20 + 20           + 4 = 44
129
  //   01 ?? ?3 04   quality = 20 - 10 +  2 + 20 + 4 = 36
130
  //   61 62         quality = 18 + 18           + 4 = 40
131
  //   61 61         quality = 18 + 18           + 2 = 38  <- warning threshold
132
  //   00 01         quality = 12 + 20           + 4 = 36
133
  //   01 ?? 02      quality = 20 - 10 + 20      + 4 = 34
134
  //   01            quality = 20                + 1 = 21
135
136
0
  for (int i = 0; i < atom->length; i++)
137
0
  {
138
0
    switch (atom->mask[i])
139
0
    {
140
0
    case 0x00:
141
0
      quality -= 10;
142
0
      break;
143
0
    case 0x0F:
144
0
      quality += 4;
145
0
      break;
146
0
    case 0xF0:
147
0
      quality += 4;
148
0
      break;
149
0
    case 0xFF:
150
0
      switch (atom->bytes[i])
151
0
      {
152
0
      case 0x00:
153
0
      case 0x20:
154
0
      case 0xCC:
155
0
      case 0xFF:
156
        // Common bytes contribute less to the quality than the rest.
157
0
        quality += 12;
158
0
        break;
159
0
      default:
160
        // Bytes in the a-z and A-Z ranges have a slightly lower quality
161
        // than the rest. We want to favor atoms that contain bytes outside
162
        // those ranges because they generate less additional atoms during
163
        // calls to _yr_atoms_case_combinations.
164
0
        if (yr_lowercase[atom->bytes[i]] >= 'a' &&
165
0
            yr_lowercase[atom->bytes[i]] <= 'z')
166
0
          quality += 18;
167
0
        else
168
0
          quality += 20;
169
0
      }
170
171
0
      if (!yr_bitmask_is_set(seen_bytes, atom->bytes[i]))
172
0
      {
173
0
        yr_bitmask_set(seen_bytes, atom->bytes[i]);
174
0
        unique_bytes++;
175
0
      }
176
0
    }
177
0
  }
178
179
  // If all the bytes in the atom are equal and very common, let's penalize
180
  // it heavily.
181
0
  if (unique_bytes == 1 && (yr_bitmask_is_set(seen_bytes, 0x00) ||
182
0
                            yr_bitmask_is_set(seen_bytes, 0x20) ||
183
0
                            yr_bitmask_is_set(seen_bytes, 0x90) ||
184
0
                            yr_bitmask_is_set(seen_bytes, 0xCC) ||
185
0
                            yr_bitmask_is_set(seen_bytes, 0xFF)))
186
0
  {
187
0
    quality -= 10 * atom->length;
188
0
  }
189
  // In general atoms with more unique bytes have a better quality, so let's
190
  // boost the quality in the amount of unique bytes.
191
0
  else
192
0
  {
193
0
    quality += 2 * unique_bytes;
194
0
  }
195
196
  // The final quality is not zero-based, we start at YR_MAX_ATOM_QUALITY
197
  // for the best possible atom and substract from there. The best possible
198
  // quality is 21 * YR_MAX_ATOM_LENGTH (20 points per byte + 2 additional point
199
  // per unique byte, with a maximum of 2*YR_MAX_ATOM_LENGTH unique bytes).
200
201
0
  return YR_MAX_ATOM_QUALITY - 22 * YR_MAX_ATOM_LENGTH + quality;
202
0
}
203
204
////////////////////////////////////////////////////////////////////////////////
205
// Compares the byte sequence in a1 with the YR_ATOM in a2, taking atom's mask
206
// into account.
207
//
208
// Returns:
209
//   < 0 if the first byte that does not match has a lower value in a1 than
210
//       in a2.
211
//   > 0 if the first byte that does not match has a greater value in a1 than
212
//       in a2.
213
//   = 0 if a1 is equal or matches a2.
214
//
215
static int _yr_atoms_cmp(const uint8_t* a1, YR_ATOM* a2)
216
0
{
217
0
  int result = 0;
218
0
  int i = 0;
219
220
0
  while (result == 0 && i < a2->length)
221
0
  {
222
0
    switch (a2->mask[i])
223
0
    {
224
0
    case 0xFF:
225
0
    case 0x0F:
226
0
    case 0xF0:
227
0
    case 0x00:
228
0
      result = (a1[i] & a2->mask[i]) - a2->bytes[i];
229
0
      break;
230
0
    default:
231
0
      assert(false);
232
0
    }
233
234
0
    i++;
235
0
  }
236
237
0
  return result;
238
0
}
239
240
////////////////////////////////////////////////////////////////////////////////
241
// Returns a numeric value indicating the quality of an atom. The quality is
242
// based in the atom quality table passed in "config". Very common atoms
243
// (i.e: those with greater quality) have lower quality than those that are
244
// uncommon. See the comment for yr_compiler_set_atom_quality_table for
245
// details about the quality table's format.
246
//
247
// Args:
248
//    YR_ATOMS_CONFIG* config   - Pointer to YR_ATOMS_CONFIG struct.
249
//    YR_ATOM* atom             - Pointer to YR_ATOM struct.
250
//
251
// Returns:
252
//    An integer indicating the atom's quality
253
//
254
int yr_atoms_table_quality(YR_ATOMS_CONFIG* config, YR_ATOM* atom)
255
0
{
256
0
  YR_ATOM_QUALITY_TABLE_ENTRY* table = config->quality_table;
257
258
0
  int begin = 0;
259
0
  int end = config->quality_table_entries;
260
261
0
  assert(atom->length <= YR_MAX_ATOM_LENGTH);
262
263
0
  while (end > begin)
264
0
  {
265
0
    int middle = begin + (end - begin) / 2;
266
0
    int c = _yr_atoms_cmp(table[middle].atom, atom);
267
268
0
    if (c < 0)
269
0
    {
270
0
      begin = middle + 1;
271
0
    }
272
0
    else if (c > 0)
273
0
    {
274
0
      end = middle;
275
0
    }
276
0
    else
277
0
    {
278
0
      int i = middle + 1;
279
0
      int quality = table[middle].quality;
280
0
      int min_quality = quality;
281
282
0
      while (i < end && _yr_atoms_cmp(table[i].atom, atom) == 0)
283
0
      {
284
0
        if (min_quality > table[i].quality)
285
0
          min_quality = table[i].quality;
286
287
0
        i++;
288
0
      }
289
290
0
      i = middle - 1;
291
292
0
      while (i >= begin && _yr_atoms_cmp(table[i].atom, atom) == 0)
293
0
      {
294
0
        if (min_quality > table[i].quality)
295
0
          min_quality = table[i].quality;
296
297
0
        i--;
298
0
      }
299
300
0
      return min_quality >> (YR_MAX_ATOM_LENGTH - atom->length);
301
0
    }
302
0
  }
303
304
0
  return YR_MAX_ATOM_QUALITY;
305
0
}
306
307
////////////////////////////////////////////////////////////////////////////////
308
// Returns the quality for the worst quality atom in a list.
309
//
310
int yr_atoms_min_quality(YR_ATOMS_CONFIG* config, YR_ATOM_LIST_ITEM* atom_list)
311
0
{
312
0
  YR_ATOM_LIST_ITEM* atom;
313
314
0
  int quality;
315
0
  int min_quality = YR_MAX_ATOM_QUALITY;
316
317
0
  if (atom_list == NULL)
318
0
    return YR_MIN_ATOM_QUALITY;
319
320
0
  atom = atom_list;
321
322
0
  while (atom != NULL)
323
0
  {
324
0
    quality = config->get_atom_quality(config, &atom->atom);
325
326
0
    if (quality < min_quality)
327
0
      min_quality = quality;
328
329
0
    atom = atom->next;
330
0
  }
331
332
0
  return min_quality;
333
0
}
334
335
////////////////////////////////////////////////////////////////////////////////
336
// Creates a new node for an atoms tree.
337
//
338
static YR_ATOM_TREE_NODE* _yr_atoms_tree_node_create(uint8_t type)
339
0
{
340
0
  YR_ATOM_TREE_NODE* new_node = (YR_ATOM_TREE_NODE*) yr_malloc(
341
0
      sizeof(YR_ATOM_TREE_NODE));
342
343
0
  if (new_node != NULL)
344
0
  {
345
0
    new_node->type = type;
346
0
    new_node->atom.length = 0;
347
0
    new_node->next_sibling = NULL;
348
0
    new_node->children_head = NULL;
349
0
    new_node->children_tail = NULL;
350
0
  }
351
352
0
  return new_node;
353
0
}
354
355
////////////////////////////////////////////////////////////////////////////////
356
// Destroys a node from an atoms tree.
357
//
358
static void _yr_atoms_tree_node_destroy(YR_ATOM_TREE_NODE* node)
359
0
{
360
0
  YR_ATOM_TREE_NODE* child;
361
0
  YR_ATOM_TREE_NODE* next_child;
362
363
0
  if (node == NULL)
364
0
    return;
365
366
0
  if (node->type == ATOM_TREE_OR || node->type == ATOM_TREE_AND)
367
0
  {
368
0
    child = node->children_head;
369
370
0
    while (child != NULL)
371
0
    {
372
0
      next_child = child->next_sibling;
373
0
      _yr_atoms_tree_node_destroy(child);
374
0
      child = next_child;
375
0
    }
376
0
  }
377
378
0
  yr_free(node);
379
0
}
380
381
////////////////////////////////////////////////////////////////////////////////
382
// Appends a new child node to another atoms tree node.
383
//
384
static void _yr_atoms_tree_node_append(
385
    YR_ATOM_TREE_NODE* dest,
386
    YR_ATOM_TREE_NODE* node)
387
0
{
388
0
  if (dest->children_head == NULL)
389
0
    dest->children_head = node;
390
391
0
  if (dest->children_tail != NULL)
392
0
    dest->children_tail->next_sibling = node;
393
394
0
  dest->children_tail = node;
395
0
}
396
397
////////////////////////////////////////////////////////////////////////////////
398
// Destroys an atoms tree.
399
//
400
static void _yr_atoms_tree_destroy(YR_ATOM_TREE* atom_tree)
401
0
{
402
0
  _yr_atoms_tree_node_destroy(atom_tree->root_node);
403
0
  yr_free(atom_tree);
404
0
}
405
406
////////////////////////////////////////////////////////////////////////////////
407
// Destroys an atoms list.
408
//
409
void yr_atoms_list_destroy(YR_ATOM_LIST_ITEM* list_head)
410
0
{
411
0
  YR_ATOM_LIST_ITEM* item = list_head;
412
0
  YR_ATOM_LIST_ITEM* next;
413
414
0
  while (item != NULL)
415
0
  {
416
0
    next = item->next;
417
0
    yr_free(item);
418
0
    item = next;
419
0
  }
420
0
}
421
422
////////////////////////////////////////////////////////////////////////////////
423
// Concats two atoms lists.
424
//
425
static YR_ATOM_LIST_ITEM* _yr_atoms_list_concat(
426
    YR_ATOM_LIST_ITEM* list1,
427
    YR_ATOM_LIST_ITEM* list2)
428
0
{
429
0
  YR_ATOM_LIST_ITEM* item;
430
431
0
  if (list1 == NULL)
432
0
    return list2;
433
434
0
  item = list1;
435
436
0
  while (item->next != NULL)
437
0
  {
438
0
    item = item->next;
439
0
  }
440
441
0
  item->next = list2;
442
0
  return list1;
443
0
}
444
445
////////////////////////////////////////////////////////////////////////////////
446
// If the atom starts or ends with an unknown byte (mask == 0x00), trim
447
// those bytes out of the atom. We don't want to expand an atom like
448
// { ?? 01 02 } into { 00 01 02 }, { 01 01 02}, { 02 01 02} .. { FF 01 02}
449
// in those cases it's better to simply have a shorter atom { 01 02 }.
450
//
451
// Args:
452
//   atom: Pointer to the YR_ATOM to be trimmed.
453
//
454
// Returns:
455
//   The number of bytes that were trimmed from the beginning of the atom.
456
//
457
int _yr_atoms_trim(YR_ATOM* atom)
458
0
{
459
0
  int mask_00 = 0;
460
0
  int mask_ff = 0;
461
462
0
  int trim_left = 0;
463
464
0
  while (trim_left < atom->length && atom->mask[trim_left] == 0) trim_left++;
465
466
0
  while (atom->length > trim_left && atom->mask[atom->length - 1] == 0)
467
0
    atom->length--;
468
469
0
  atom->length -= trim_left;
470
471
0
  if (atom->length == 0)
472
0
    return 0;
473
474
  // The trimmed atom goes from trim_left to trim_left + atom->length and the
475
  // first and last byte in the atom are known (mask == 0xFF). Now count the
476
  // number of known and unknown bytes in the atom (mask == 0xFF and
477
  // mask == 0x00 respectively).
478
479
0
  for (int i = 0; i < atom->length; i++)
480
0
  {
481
0
    if (atom->mask[trim_left + i] == 0xFF)
482
0
      mask_ff++;
483
0
    else if (atom->mask[trim_left + i] == 0x00)
484
0
      mask_00++;
485
0
  }
486
487
  // If the number of unknown bytes is >= than the number of known bytes
488
  // it doesn't make sense the to use this atom, so we use a single byte atom
489
  // containing the first known byte. If YR_MAX_ATOM_LENGTH == 4 this happens
490
  // only when the atom is like { XX ?? ?? YY }, so using the first known
491
  // byte is good enough. For larger values of YR_MAX_ATOM_LENGTH this is not
492
  // the most efficient solution, as better atoms could be choosen. For
493
  // example, in { XX ?? ?? ?? YY ZZ } the best atom is { YY ZZ } not { XX }.
494
  // But let's keep it like this for simplicity.
495
496
0
  if (mask_00 >= mask_ff)
497
0
    atom->length = 1;
498
499
0
  if (trim_left == 0)
500
0
    return 0;
501
502
  // Shift bytes and mask trim_left positions to the left.
503
504
0
  for (int i = 0; i < YR_MAX_ATOM_LENGTH - trim_left; i++)
505
0
  {
506
0
    atom->bytes[i] = atom->bytes[trim_left + i];
507
0
    atom->mask[i] = atom->mask[trim_left + i];
508
0
  }
509
510
0
  return trim_left;
511
0
}
512
513
////////////////////////////////////////////////////////////////////////////////
514
// This function receives an atom tree and returns a list of atoms to be added
515
// to the Aho-Corasick automaton.
516
//
517
static int _yr_atoms_choose(
518
    YR_ATOMS_CONFIG* config,
519
    YR_ATOM_TREE_NODE* node,
520
    YR_ATOM_LIST_ITEM** chosen_atoms,
521
    int* atoms_quality)
522
0
{
523
0
  YR_ATOM_TREE_NODE* child;
524
0
  YR_ATOM_LIST_ITEM* item;
525
0
  YR_ATOM_LIST_ITEM* tail;
526
527
0
  int shift, quality;
528
529
0
  int max_quality = YR_MIN_ATOM_QUALITY;
530
0
  int min_quality = YR_MAX_ATOM_QUALITY;
531
532
0
  *chosen_atoms = NULL;
533
0
  *atoms_quality = YR_MIN_ATOM_QUALITY;
534
535
0
  switch (node->type)
536
0
  {
537
0
  case ATOM_TREE_LEAF:
538
539
0
    item = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
540
541
0
    if (item == NULL)
542
0
      return ERROR_INSUFFICIENT_MEMORY;
543
544
0
    memcpy(&item->atom, &node->atom, sizeof(YR_ATOM));
545
546
0
    shift = _yr_atoms_trim(&item->atom);
547
548
0
    if (item->atom.length > 0)
549
0
    {
550
0
      item->forward_code_ref = node->re_nodes[shift]->forward_code_ref;
551
0
      item->backward_code_ref = node->re_nodes[shift]->backward_code_ref;
552
0
      item->backtrack = 0;
553
0
      item->next = NULL;
554
555
0
      *chosen_atoms = item;
556
0
      *atoms_quality = config->get_atom_quality(config, &item->atom);
557
0
    }
558
0
    else
559
0
    {
560
0
      yr_free(item);
561
0
    }
562
563
0
    break;
564
565
0
  case ATOM_TREE_OR:
566
567
    // The choosen nodes are those coming from the highest quality child.
568
569
0
    child = node->children_head;
570
571
0
    while (child != NULL)
572
0
    {
573
0
      FAIL_ON_ERROR(_yr_atoms_choose(config, child, &item, &quality));
574
575
0
      if (quality > max_quality)
576
0
      {
577
0
        max_quality = quality;
578
0
        yr_atoms_list_destroy(*chosen_atoms);
579
0
        *chosen_atoms = item;
580
0
      }
581
0
      else
582
0
      {
583
0
        yr_atoms_list_destroy(item);
584
0
      }
585
586
0
      if (max_quality == YR_MAX_ATOM_QUALITY)
587
0
        break;
588
589
0
      child = child->next_sibling;
590
0
    }
591
592
0
    *atoms_quality = max_quality;
593
0
    break;
594
595
0
  case ATOM_TREE_AND:
596
597
    // The choosen nodes are the concatenation of the the nodes choosen from
598
    // all the children.
599
600
0
    child = node->children_head;
601
602
0
    while (child != NULL)
603
0
    {
604
0
      FAIL_ON_ERROR(_yr_atoms_choose(config, child, &item, &quality));
605
606
0
      if (quality < min_quality)
607
0
        min_quality = quality;
608
609
0
      if (item != NULL)
610
0
      {
611
0
        tail = item;
612
0
        while (tail->next != NULL) tail = tail->next;
613
614
0
        tail->next = *chosen_atoms;
615
0
        *chosen_atoms = item;
616
0
      }
617
618
0
      child = child->next_sibling;
619
0
    }
620
621
0
    *atoms_quality = min_quality;
622
0
    break;
623
0
  }
624
625
0
  return ERROR_SUCCESS;
626
0
}
627
628
////////////////////////////////////////////////////////////////////////////////
629
// Returns all combinations of lower and upper cases for a given atom. For
630
// atom "abc" the output would be "abc" "abC" "aBC" and so on. Resulting
631
// atoms are written into the output buffer in this format:
632
//
633
//  [size of atom 1] [atom 1]  ... [size of atom N] [atom N] [0]
634
//
635
// Notice the zero at the end to indicate where the output ends.
636
//
637
// The caller is responsible of providing a buffer large enough to hold the
638
// returned atoms.
639
//
640
static uint8_t* _yr_atoms_case_combinations(
641
    uint8_t* atom,
642
    int atom_length,
643
    int atom_offset,
644
    uint8_t* output_buffer)
645
0
{
646
0
  uint8_t c;
647
0
  uint8_t* new_atom;
648
649
0
  if (atom_offset + 1 < atom_length)
650
0
    output_buffer = _yr_atoms_case_combinations(
651
0
        atom, atom_length, atom_offset + 1, output_buffer);
652
653
0
  c = atom[atom_offset];
654
655
0
  if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
656
0
  {
657
    // Write atom length.
658
0
    *output_buffer = atom_length;
659
0
    output_buffer++;
660
661
0
    memcpy(output_buffer, atom, atom_length);
662
663
0
    new_atom = output_buffer;
664
0
    output_buffer += atom_length;
665
666
    // Swap character case.
667
0
    if (c >= 'a' && c <= 'z')
668
0
      new_atom[atom_offset] -= 32;
669
0
    else
670
0
      new_atom[atom_offset] += 32;
671
672
0
    if (atom_offset + 1 < atom_length)
673
0
      output_buffer = _yr_atoms_case_combinations(
674
0
          new_atom, atom_length, atom_offset + 1, output_buffer);
675
0
  }
676
677
0
  if (atom_offset == 0)
678
0
    *output_buffer = 0;
679
680
0
  return output_buffer;
681
0
}
682
683
// Size of buffer used in _yr_atoms_case_insensitive for storing the all
684
// the possible combinations for an atom. Each atom has up to YR_MAX_ATOM_LENGTH
685
// characters and each character has two possible values (upper and lower case).
686
// That means 2 ^ YR_MAX_ATOM_LENGTH combinations for an atom, where each atom
687
// occupies YR_MAX_ATOM_LENGTH + 1 bytes (the atom itself +1 byte for its
688
// length). One extra bytes is allocated for the zero value indicating the end.
689
690
#define CASE_COMBINATIONS_BUFFER_SIZE \
691
  (1 << YR_MAX_ATOM_LENGTH) * (YR_MAX_ATOM_LENGTH + 1) + 1
692
693
////////////////////////////////////////////////////////////////////////////////
694
// For a given list of atoms returns another list of atoms
695
// with every case combination.
696
//
697
static int _yr_atoms_case_insensitive(
698
    YR_ATOM_LIST_ITEM* atoms,
699
    YR_ATOM_LIST_ITEM** case_insensitive_atoms)
700
0
{
701
0
  YR_ATOM_LIST_ITEM* atom;
702
0
  YR_ATOM_LIST_ITEM* new_atom;
703
704
0
  uint8_t buffer[CASE_COMBINATIONS_BUFFER_SIZE];
705
0
  uint8_t atom_length;
706
0
  uint8_t* atoms_cursor;
707
708
0
  int i;
709
710
0
  *case_insensitive_atoms = NULL;
711
0
  atom = atoms;
712
713
0
  while (atom != NULL)
714
0
  {
715
0
    _yr_atoms_case_combinations(atom->atom.bytes, atom->atom.length, 0, buffer);
716
717
0
    atoms_cursor = buffer;
718
0
    atom_length = *atoms_cursor;
719
0
    atoms_cursor++;
720
721
0
    while (atom_length != 0)
722
0
    {
723
0
      new_atom = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
724
725
0
      if (new_atom == NULL)
726
0
        return ERROR_INSUFFICIENT_MEMORY;
727
728
0
      for (i = 0; i < atom_length; i++)
729
0
      {
730
0
        new_atom->atom.bytes[i] = atoms_cursor[i];
731
0
        new_atom->atom.mask[i] = 0xFF;
732
0
      }
733
734
0
      new_atom->atom.length = atom_length;
735
0
      new_atom->forward_code_ref = atom->forward_code_ref;
736
0
      new_atom->backward_code_ref = atom->backward_code_ref;
737
0
      new_atom->backtrack = atom->backtrack;
738
0
      new_atom->next = *case_insensitive_atoms;
739
740
0
      *case_insensitive_atoms = new_atom;
741
742
0
      atoms_cursor += atom_length;
743
0
      atom_length = *atoms_cursor;
744
0
      atoms_cursor++;
745
0
    }
746
747
0
    atom = atom->next;
748
0
  }
749
750
0
  return ERROR_SUCCESS;
751
0
}
752
753
////////////////////////////////////////////////////////////////////////////////
754
// For a given list of atoms returns another list after a single byte xor
755
// has been applied to it.
756
//
757
static int _yr_atoms_xor(
758
    YR_ATOM_LIST_ITEM* atoms,
759
    uint8_t min,
760
    uint8_t max,
761
    YR_ATOM_LIST_ITEM** xor_atoms)
762
0
{
763
0
  YR_ATOM_LIST_ITEM* atom;
764
0
  YR_ATOM_LIST_ITEM* new_atom;
765
766
0
  int i, j;
767
0
  *xor_atoms = NULL;
768
0
  atom = atoms;
769
770
0
  while (atom != NULL)
771
0
  {
772
0
    for (j = min; j <= max; j++)
773
0
    {
774
0
      new_atom = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
775
776
0
      if (new_atom == NULL)
777
0
        return ERROR_INSUFFICIENT_MEMORY;
778
779
0
      for (i = 0; i < atom->atom.length; i++)
780
0
      {
781
0
        new_atom->atom.bytes[i] = atom->atom.bytes[i] ^ j;
782
0
        new_atom->atom.mask[i] = 0xFF;
783
0
      }
784
785
0
      new_atom->atom.length = yr_min(atom->atom.length, YR_MAX_ATOM_LENGTH);
786
0
      new_atom->forward_code_ref = atom->forward_code_ref;
787
0
      new_atom->backward_code_ref = atom->backward_code_ref;
788
0
      new_atom->backtrack = atom->backtrack;
789
0
      new_atom->next = *xor_atoms;
790
791
0
      *xor_atoms = new_atom;
792
0
    }
793
794
0
    atom = atom->next;
795
0
  }
796
0
  return ERROR_SUCCESS;
797
0
}
798
799
////////////////////////////////////////////////////////////////////////////////
800
// For a given list of atoms returns another list with the corresponding
801
// wide atoms. Wide atoms are just the original atoms with interleaved zeroes,
802
// for example: 01 02 -> 01 00 02 00
803
//
804
static int _yr_atoms_wide(
805
    YR_ATOM_LIST_ITEM* atoms,
806
    YR_ATOM_LIST_ITEM** wide_atoms)
807
0
{
808
0
  YR_ATOM_LIST_ITEM* atom;
809
0
  YR_ATOM_LIST_ITEM* new_atom;
810
811
0
  int i;
812
813
0
  *wide_atoms = NULL;
814
0
  atom = atoms;
815
816
0
  while (atom != NULL)
817
0
  {
818
0
    new_atom = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
819
820
0
    if (new_atom == NULL)
821
0
      return ERROR_INSUFFICIENT_MEMORY;
822
823
0
    for (i = 0; i < YR_MAX_ATOM_LENGTH; i++)
824
0
    {
825
0
      new_atom->atom.bytes[i] = 0;
826
0
      new_atom->atom.mask[i] = 0xFF;
827
0
    }
828
829
0
    for (i = 0; i < atom->atom.length; i++)
830
0
    {
831
0
      if (i * 2 < YR_MAX_ATOM_LENGTH)
832
0
        new_atom->atom.bytes[i * 2] = atom->atom.bytes[i];
833
0
      else
834
0
        break;
835
0
    }
836
837
0
    new_atom->atom.length = yr_min(atom->atom.length * 2, YR_MAX_ATOM_LENGTH);
838
0
    new_atom->forward_code_ref = atom->forward_code_ref;
839
0
    new_atom->backward_code_ref = atom->backward_code_ref;
840
0
    new_atom->backtrack = atom->backtrack * 2;
841
0
    new_atom->next = *wide_atoms;
842
843
0
    *wide_atoms = new_atom;
844
845
0
    atom = atom->next;
846
0
  }
847
848
0
  return ERROR_SUCCESS;
849
0
}
850
851
struct STACK_ITEM
852
{
853
  RE_NODE* re_node;
854
  YR_ATOM_TREE_NODE* new_appending_node;
855
};
856
857
#define make_atom_from_re_nodes(atom, nodes_length, nodes)   \
858
0
  {                                                          \
859
0
    atom.length = nodes_length;                              \
860
0
    for (i = 0; i < atom.length; i++)                        \
861
0
    {                                                        \
862
0
      atom.bytes[i] = (uint8_t) (recent_re_nodes)[i]->value; \
863
0
      atom.mask[i] = (uint8_t) (recent_re_nodes)[i]->mask;   \
864
0
    }                                                        \
865
0
  }
866
867
////////////////////////////////////////////////////////////////////////////////
868
// Extract atoms from a regular expression. This is a helper function used by
869
// yr_atoms_extract_from_re that receives the abstract syntax tree for a regexp
870
// (or hex pattern) and builds an atom tree. The appending_node argument is a
871
// pointer to the ATOM_TREE_OR node at the root of the atom tree. This function
872
// creates the tree by appending new nodes to it.
873
//
874
static int _yr_atoms_extract_from_re(
875
    YR_ATOMS_CONFIG* config,
876
    RE_AST* re_ast,
877
    YR_ATOM_TREE_NODE* appending_node)
878
0
{
879
0
  YR_STACK* stack;
880
0
  RE_NODE* re_node;
881
882
0
  YR_ATOM atom = {0};
883
0
  YR_ATOM best_atom = {0};
884
885
0
  struct STACK_ITEM si;
886
887
0
  int i, shift;
888
0
  int quality;
889
0
  int best_quality = -1;
890
0
  int n = 0;
891
892
0
  YR_ATOM_TREE_NODE* and_node;
893
0
  YR_ATOM_TREE_NODE* left_node;
894
0
  YR_ATOM_TREE_NODE* right_node;
895
896
  // The RE_NODEs most recently visited that can conform an atom (ie:
897
  // RE_NODE_LITERAL, RE_NODE_MASKED_LITERAL and RE_NODE_ANY). The number of
898
  // items in this array is n.
899
0
  RE_NODE* recent_re_nodes[YR_MAX_ATOM_LENGTH];
900
901
  // The RE_NODEs corresponding to the best atom found so far for the current
902
  // appending node.
903
0
  RE_NODE* best_atom_re_nodes[YR_MAX_ATOM_LENGTH];
904
905
  // This holds the ATOM_TREE_OR node where leaves (ATOM_TREE_LEAF) are
906
  // currently being appended.
907
0
  YR_ATOM_TREE_NODE* current_appending_node = NULL;
908
909
  // This holds the ATOM_TREE_LEAF node whose atom is currently being updated.
910
0
  YR_ATOM_TREE_NODE* leaf = NULL;
911
912
0
  FAIL_ON_ERROR(yr_stack_create(1024, sizeof(si), &stack));
913
914
  // This first item pushed in the stack is the last one to be poped out, the
915
  // sole purpose of this item is forcing that any pending leaf is appended to
916
  // appending_node during the last iteration of the loop.
917
0
  si.re_node = NULL;
918
0
  si.new_appending_node = appending_node;
919
920
0
  FAIL_ON_ERROR_WITH_CLEANUP(
921
0
      yr_stack_push(stack, (void*) &si), yr_stack_destroy(stack));
922
923
  // Start processing the root node.
924
0
  si.re_node = re_ast->root_node;
925
926
  // Leaf nodes are initially appended to the node passed in the appending_node,
927
  // argument which is the root ATOM_TREE_OR node that is empty at this point.
928
0
  si.new_appending_node = appending_node;
929
930
0
  FAIL_ON_ERROR_WITH_CLEANUP(
931
0
      yr_stack_push(stack, (void*) &si), yr_stack_destroy(stack));
932
933
0
  while (yr_stack_pop(stack, (void*) &si))
934
0
  {
935
    // Change the appending node if the item poped from the stack says so.
936
0
    if (si.new_appending_node != NULL)
937
0
    {
938
      // Before changing the appending node let's append any pending leaf to
939
      // the current appending node.
940
0
      if (n > 0)
941
0
      {
942
0
        make_atom_from_re_nodes(atom, n, recent_re_nodes);
943
0
        shift = _yr_atoms_trim(&atom);
944
0
        quality = config->get_atom_quality(config, &atom);
945
946
0
        FAIL_ON_NULL_WITH_CLEANUP(
947
0
            leaf = _yr_atoms_tree_node_create(ATOM_TREE_LEAF),
948
0
            yr_stack_destroy(stack));
949
950
0
        if (quality > best_quality)
951
0
        {
952
0
          memcpy(&leaf->atom, &atom, sizeof(atom));
953
0
          memcpy(
954
0
              &leaf->re_nodes,
955
0
              &recent_re_nodes[shift],
956
0
              sizeof(recent_re_nodes) - shift * sizeof(recent_re_nodes[0]));
957
0
        }
958
0
        else
959
0
        {
960
0
          memcpy(&leaf->atom, &best_atom, sizeof(best_atom));
961
0
          memcpy(
962
0
              &leaf->re_nodes, &best_atom_re_nodes, sizeof(best_atom_re_nodes));
963
0
        }
964
965
0
        _yr_atoms_tree_node_append(current_appending_node, leaf);
966
0
        n = 0;
967
0
      }
968
969
0
      current_appending_node = si.new_appending_node;
970
0
      best_quality = -1;
971
0
    }
972
973
0
    if (si.re_node != NULL)
974
0
    {
975
0
      switch (si.re_node->type)
976
0
      {
977
0
      case RE_NODE_LITERAL:
978
0
      case RE_NODE_MASKED_LITERAL:
979
0
      case RE_NODE_ANY:
980
981
0
        if (n < YR_MAX_ATOM_LENGTH)
982
0
        {
983
0
          recent_re_nodes[n] = si.re_node;
984
0
          best_atom_re_nodes[n] = si.re_node;
985
0
          best_atom.bytes[n] = (uint8_t) si.re_node->value;
986
0
          best_atom.mask[n] = (uint8_t) si.re_node->mask;
987
0
          best_atom.length = ++n;
988
0
        }
989
0
        else if (best_quality < YR_MAX_ATOM_QUALITY)
990
0
        {
991
0
          make_atom_from_re_nodes(atom, n, recent_re_nodes);
992
0
          shift = _yr_atoms_trim(&atom);
993
0
          quality = config->get_atom_quality(config, &atom);
994
995
0
          if (quality > best_quality)
996
0
          {
997
0
            for (i = 0; i < atom.length; i++)
998
0
            {
999
0
              best_atom.bytes[i] = atom.bytes[i];
1000
0
              best_atom.mask[i] = atom.mask[i];
1001
0
              best_atom_re_nodes[i] = recent_re_nodes[i + shift];
1002
0
            }
1003
1004
0
            best_atom.length = atom.length;
1005
0
            best_quality = quality;
1006
0
          }
1007
1008
0
          for (i = 1; i < YR_MAX_ATOM_LENGTH; i++)
1009
0
            recent_re_nodes[i - 1] = recent_re_nodes[i];
1010
1011
0
          recent_re_nodes[YR_MAX_ATOM_LENGTH - 1] = si.re_node;
1012
0
        }
1013
1014
0
        break;
1015
1016
0
      case RE_NODE_CONCAT:
1017
1018
0
        re_node = si.re_node->children_tail;
1019
1020
        // Push children right to left, they are poped left to right.
1021
0
        while (re_node != NULL)
1022
0
        {
1023
0
          si.new_appending_node = NULL;
1024
0
          si.re_node = re_node;
1025
1026
0
          FAIL_ON_ERROR_WITH_CLEANUP(
1027
0
              yr_stack_push(stack, &si), yr_stack_destroy(stack));
1028
1029
0
          re_node = re_node->prev_sibling;
1030
0
        }
1031
1032
0
        break;
1033
1034
0
      case RE_NODE_ALT:
1035
1036
        // Create ATOM_TREE_AND node with two ATOM_TREE_OR children nodes.
1037
0
        and_node = _yr_atoms_tree_node_create(ATOM_TREE_AND);
1038
0
        left_node = _yr_atoms_tree_node_create(ATOM_TREE_OR);
1039
0
        right_node = _yr_atoms_tree_node_create(ATOM_TREE_OR);
1040
1041
0
        if (and_node == NULL || left_node == NULL || right_node == NULL)
1042
0
        {
1043
0
          _yr_atoms_tree_node_destroy(and_node);
1044
0
          _yr_atoms_tree_node_destroy(left_node);
1045
0
          _yr_atoms_tree_node_destroy(right_node);
1046
1047
0
          yr_stack_destroy(stack);
1048
1049
0
          return ERROR_INSUFFICIENT_MEMORY;
1050
0
        }
1051
1052
0
        and_node->children_head = left_node;
1053
0
        and_node->children_tail = right_node;
1054
0
        left_node->next_sibling = right_node;
1055
1056
        // Add the ATOM_TREE_AND as children of the current node.
1057
0
        _yr_atoms_tree_node_append(current_appending_node, and_node);
1058
1059
0
        re_node = si.re_node;
1060
1061
0
        si.new_appending_node = current_appending_node;
1062
0
        si.re_node = NULL;
1063
1064
0
        FAIL_ON_ERROR_WITH_CLEANUP(
1065
0
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1066
1067
        // RE_NODE_ALT nodes has only two children, so children_head is the
1068
        // left one, and children_tail is right one.
1069
0
        si.new_appending_node = right_node;
1070
0
        si.re_node = re_node->children_tail;
1071
1072
0
        FAIL_ON_ERROR_WITH_CLEANUP(
1073
0
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1074
1075
0
        si.new_appending_node = left_node;
1076
0
        si.re_node = re_node->children_head;
1077
1078
0
        FAIL_ON_ERROR_WITH_CLEANUP(
1079
0
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1080
1081
0
        break;
1082
1083
0
      case RE_NODE_PLUS:
1084
1085
0
        re_node = si.re_node;
1086
1087
0
        si.new_appending_node = current_appending_node;
1088
0
        si.re_node = NULL;
1089
1090
0
        FAIL_ON_ERROR_WITH_CLEANUP(
1091
0
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1092
1093
0
        si.new_appending_node = NULL;
1094
        // RE_NODE_PLUS nodes has a single child, which is children_head.
1095
0
        si.re_node = re_node->children_head;
1096
1097
0
        FAIL_ON_ERROR_WITH_CLEANUP(
1098
0
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1099
1100
0
        break;
1101
1102
0
      case RE_NODE_RANGE:
1103
1104
0
        re_node = si.re_node;
1105
1106
0
        si.new_appending_node = current_appending_node;
1107
0
        si.re_node = NULL;
1108
1109
0
        FAIL_ON_ERROR_WITH_CLEANUP(
1110
0
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1111
1112
0
        si.new_appending_node = NULL;
1113
1114
        // RE_NODE_RANGE nodes has a single child, which is children_head.
1115
0
        si.re_node = re_node->children_head;
1116
1117
        // In a regexp like /a{10,20}/ the optimal atom is 'aaaa' (assuming
1118
        // that YR_MAX_ATOM_LENGTH = 4) because the 'a' character must appear
1119
        // at least 10 times in the matching string. Each call in the loop
1120
        // will append one 'a' to the atom, so YR_MAX_ATOM_LENGTH iterations
1121
        // are enough.
1122
1123
0
        for (i = 0; i < yr_min(re_node->start, YR_MAX_ATOM_LENGTH); i++)
1124
0
        {
1125
0
          FAIL_ON_ERROR_WITH_CLEANUP(
1126
0
              yr_stack_push(stack, &si), yr_stack_destroy(stack));
1127
0
        }
1128
1129
0
        break;
1130
1131
0
      case RE_NODE_RANGE_ANY:
1132
0
      case RE_NODE_STAR:
1133
0
      case RE_NODE_CLASS:
1134
0
      case RE_NODE_WORD_CHAR:
1135
0
      case RE_NODE_NON_WORD_CHAR:
1136
0
      case RE_NODE_SPACE:
1137
0
      case RE_NODE_NON_SPACE:
1138
0
      case RE_NODE_DIGIT:
1139
0
      case RE_NODE_NON_DIGIT:
1140
0
      case RE_NODE_EMPTY:
1141
0
      case RE_NODE_ANCHOR_START:
1142
0
      case RE_NODE_ANCHOR_END:
1143
0
      case RE_NODE_WORD_BOUNDARY:
1144
0
      case RE_NODE_NON_WORD_BOUNDARY:
1145
0
      case RE_NODE_NOT_LITERAL:
1146
0
      case RE_NODE_MASKED_NOT_LITERAL:
1147
1148
0
        si.new_appending_node = current_appending_node;
1149
0
        si.re_node = NULL;
1150
1151
0
        FAIL_ON_ERROR_WITH_CLEANUP(
1152
0
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1153
1154
0
        break;
1155
1156
0
      default:
1157
0
        assert(false);
1158
0
      }
1159
0
    }
1160
0
  }
1161
1162
0
  yr_stack_destroy(stack);
1163
1164
0
  return ERROR_SUCCESS;
1165
0
}
1166
1167
////////////////////////////////////////////////////////////////////////////////
1168
// Makes an exact copy of an YR_ATOM_LIST_ITEM.
1169
//
1170
static YR_ATOM_LIST_ITEM* _yr_atoms_clone_list_item(YR_ATOM_LIST_ITEM* item)
1171
0
{
1172
0
  YR_ATOM_LIST_ITEM* clone = (YR_ATOM_LIST_ITEM*) yr_malloc(
1173
0
      sizeof(YR_ATOM_LIST_ITEM));
1174
1175
0
  if (clone == NULL)
1176
0
    return NULL;
1177
1178
0
  memcpy(clone, item, sizeof(YR_ATOM_LIST_ITEM));
1179
1180
0
  return clone;
1181
0
}
1182
1183
////////////////////////////////////////////////////////////////////////////////
1184
// Given list of atoms that may contain wildcards, replace those wildcarded
1185
// atoms with a list of non-wildcarded atoms covering all the combinations
1186
// allowed by the wildcarded atom. For example, the atom {01 ?2 03} will be
1187
// replaced by {01 02 03}, {01 12 03}, {01 22 03} .. {01 F2 03}. The list
1188
// is modified in-place.
1189
//
1190
// Args:
1191
//   atoms: Pointer to first element of the list.
1192
//
1193
// Returns:
1194
//   ERROR_SUCCESS or ERROR_INSUFFICIENT_MEMORY.
1195
//
1196
static int _yr_atoms_expand_wildcards(YR_ATOM_LIST_ITEM* atoms)
1197
0
{
1198
0
  int i;
1199
1200
0
  YR_ATOM_LIST_ITEM* atom = atoms;
1201
0
  YR_ATOM_LIST_ITEM* new_atom;
1202
0
  YR_ATOM_LIST_ITEM* prev_atom;
1203
0
  YR_ATOM_LIST_ITEM* next_atom;
1204
1205
0
  while (atom != NULL)
1206
0
  {
1207
0
    bool expanded = false;
1208
1209
0
    for (i = 0; i < atom->atom.length; i++)
1210
0
    {
1211
0
      uint16_t a, s, e, incr = 1;
1212
1213
0
      switch (atom->atom.mask[i])
1214
0
      {
1215
0
      case 0x00:
1216
0
        expanded = true;
1217
0
        s = 0x00;
1218
0
        e = 0xFF;
1219
0
        break;
1220
1221
0
      case 0x0F:
1222
0
        expanded = true;
1223
0
        s = atom->atom.bytes[i];
1224
0
        e = atom->atom.bytes[i] | 0xF0;
1225
0
        incr = 0x10;
1226
0
        break;
1227
1228
0
      case 0xF0:
1229
0
        expanded = true;
1230
0
        s = atom->atom.bytes[i];
1231
0
        e = atom->atom.bytes[i] | 0x0F;
1232
0
        break;
1233
1234
0
      default:
1235
0
        s = 0;
1236
0
        e = 0;
1237
0
      }
1238
1239
0
      if (s != e)
1240
0
      {
1241
0
        atom->atom.bytes[i] = (uint8_t) s;
1242
0
        atom->atom.mask[i] = 0xFF;
1243
0
      }
1244
1245
0
      prev_atom = atom;
1246
0
      next_atom = atom->next;
1247
1248
0
      for (a = s + incr; a <= e; a += incr)
1249
0
      {
1250
0
        new_atom = _yr_atoms_clone_list_item(atom);
1251
1252
0
        if (new_atom == NULL)
1253
0
          return ERROR_INSUFFICIENT_MEMORY;
1254
1255
0
        new_atom->atom.bytes[i] = (uint8_t) a;
1256
0
        new_atom->atom.mask[i] = 0xFF;
1257
0
        new_atom->next = next_atom;
1258
0
        prev_atom->next = new_atom;
1259
0
        prev_atom = new_atom;
1260
0
      }
1261
0
    }
1262
1263
0
    if (!expanded)
1264
0
      atom = atom->next;
1265
0
  }
1266
1267
0
  return ERROR_SUCCESS;
1268
0
}
1269
1270
////////////////////////////////////////////////////////////////////////////////
1271
// Extract atoms from a regular expression. This function receives the abstract
1272
// syntax tree for a regexp (or hex pattern) and returns a list of atoms that
1273
// should be added to the Aho-Corasick automaton.
1274
//
1275
int yr_atoms_extract_from_re(
1276
    YR_ATOMS_CONFIG* config,
1277
    RE_AST* re_ast,
1278
    YR_MODIFIER modifier,
1279
    YR_ATOM_LIST_ITEM** atoms,
1280
    int* min_atom_quality)
1281
0
{
1282
0
  YR_ATOM_TREE* atom_tree = (YR_ATOM_TREE*) yr_malloc(sizeof(YR_ATOM_TREE));
1283
1284
0
  YR_ATOM_LIST_ITEM* wide_atoms;
1285
0
  YR_ATOM_LIST_ITEM* case_insensitive_atoms;
1286
1287
0
  if (atom_tree == NULL)
1288
0
    return ERROR_INSUFFICIENT_MEMORY;
1289
1290
0
  atom_tree->root_node = _yr_atoms_tree_node_create(ATOM_TREE_OR);
1291
1292
0
  if (atom_tree->root_node == NULL)
1293
0
  {
1294
0
    _yr_atoms_tree_destroy(atom_tree);
1295
0
    return ERROR_INSUFFICIENT_MEMORY;
1296
0
  }
1297
1298
0
  FAIL_ON_ERROR_WITH_CLEANUP(
1299
0
      _yr_atoms_extract_from_re(config, re_ast, atom_tree->root_node),
1300
0
      _yr_atoms_tree_destroy(atom_tree));
1301
1302
  // Initialize atom list
1303
0
  *atoms = NULL;
1304
1305
  // Choose the atoms that will be used.
1306
0
  FAIL_ON_ERROR_WITH_CLEANUP(
1307
0
      _yr_atoms_choose(config, atom_tree->root_node, atoms, min_atom_quality),
1308
0
      _yr_atoms_tree_destroy(atom_tree));
1309
1310
0
  _yr_atoms_tree_destroy(atom_tree);
1311
1312
0
  FAIL_ON_ERROR_WITH_CLEANUP(
1313
0
      _yr_atoms_expand_wildcards(*atoms),
1314
0
      {  // Cleanup
1315
0
        yr_atoms_list_destroy(*atoms);
1316
0
        *atoms = NULL;
1317
0
      });
1318
1319
  // Don't do convert atoms to wide here if either base64 modifier is used.
1320
  // This is to avoid the situation where we have "base64 wide" because
1321
  // the wide has already been applied BEFORE the base64 encoding.
1322
0
  if (modifier.flags & STRING_FLAGS_WIDE &&
1323
0
      !(modifier.flags & STRING_FLAGS_BASE64 ||
1324
0
        modifier.flags & STRING_FLAGS_BASE64_WIDE))
1325
0
  {
1326
0
    FAIL_ON_ERROR_WITH_CLEANUP(
1327
0
        _yr_atoms_wide(*atoms, &wide_atoms),
1328
0
        {  // Cleanup
1329
0
          yr_atoms_list_destroy(*atoms);
1330
0
          yr_atoms_list_destroy(wide_atoms);
1331
0
          *atoms = NULL;
1332
0
        });
1333
1334
0
    if (modifier.flags & STRING_FLAGS_ASCII)
1335
0
    {
1336
0
      *atoms = _yr_atoms_list_concat(*atoms, wide_atoms);
1337
0
    }
1338
0
    else
1339
0
    {
1340
0
      yr_atoms_list_destroy(*atoms);
1341
0
      *atoms = wide_atoms;
1342
0
    }
1343
0
  }
1344
1345
0
  if (modifier.flags & STRING_FLAGS_NO_CASE)
1346
0
  {
1347
0
    FAIL_ON_ERROR_WITH_CLEANUP(
1348
0
        _yr_atoms_case_insensitive(*atoms, &case_insensitive_atoms),
1349
0
        {  // Cleanup
1350
0
          yr_atoms_list_destroy(*atoms);
1351
0
          yr_atoms_list_destroy(case_insensitive_atoms);
1352
0
          *atoms = NULL;
1353
0
        });
1354
1355
0
    *atoms = _yr_atoms_list_concat(*atoms, case_insensitive_atoms);
1356
0
  }
1357
1358
  // No atoms has been extracted, let's add a zero-length atom.
1359
1360
0
  if (*atoms == NULL)
1361
0
  {
1362
0
    *atoms = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
1363
1364
0
    if (*atoms == NULL)
1365
0
      return ERROR_INSUFFICIENT_MEMORY;
1366
1367
0
    (*atoms)->atom.length = 0;
1368
0
    (*atoms)->backtrack = 0;
1369
0
    (*atoms)->forward_code_ref = re_ast->root_node->forward_code_ref;
1370
0
    (*atoms)->backward_code_ref = YR_ARENA_NULL_REF;
1371
0
    (*atoms)->next = NULL;
1372
0
  }
1373
1374
0
  return ERROR_SUCCESS;
1375
0
}
1376
1377
////////////////////////////////////////////////////////////////////////////////
1378
// Extract atoms from a string.
1379
//
1380
int yr_atoms_extract_from_string(
1381
    YR_ATOMS_CONFIG* config,
1382
    uint8_t* string,
1383
    int32_t string_length,
1384
    YR_MODIFIER modifier,
1385
    YR_ATOM_LIST_ITEM** atoms,
1386
    int* min_atom_quality)
1387
0
{
1388
0
  YR_ATOM_LIST_ITEM* item;
1389
0
  YR_ATOM_LIST_ITEM* case_insensitive_atoms;
1390
0
  YR_ATOM_LIST_ITEM* xor_atoms;
1391
0
  YR_ATOM_LIST_ITEM* wide_atoms;
1392
1393
0
  YR_ATOM atom;
1394
1395
0
  int quality, max_quality;
1396
0
  int i;
1397
1398
0
  item = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
1399
1400
0
  if (item == NULL)
1401
0
    return ERROR_INSUFFICIENT_MEMORY;
1402
1403
0
  item->forward_code_ref = YR_ARENA_NULL_REF;
1404
0
  item->backward_code_ref = YR_ARENA_NULL_REF;
1405
0
  item->next = NULL;
1406
0
  item->backtrack = 0;
1407
1408
0
  item->atom.length = yr_min(string_length, YR_MAX_ATOM_LENGTH);
1409
1410
0
  for (i = 0; i < item->atom.length; i++)
1411
0
  {
1412
0
    item->atom.bytes[i] = string[i];
1413
0
    item->atom.mask[i] = 0xFF;
1414
0
  }
1415
1416
0
  max_quality = config->get_atom_quality(config, &item->atom);
1417
1418
0
  atom.length = YR_MAX_ATOM_LENGTH;
1419
0
  memset(atom.mask, 0xFF, atom.length);
1420
1421
0
  for (i = YR_MAX_ATOM_LENGTH;
1422
0
       i < string_length && max_quality < YR_MAX_ATOM_QUALITY;
1423
0
       i++)
1424
0
  {
1425
0
    atom.length = YR_MAX_ATOM_LENGTH;
1426
0
    memcpy(atom.bytes, string + i - YR_MAX_ATOM_LENGTH + 1, atom.length);
1427
1428
0
    quality = config->get_atom_quality(config, &atom);
1429
1430
0
    if (quality > max_quality)
1431
0
    {
1432
0
      memcpy(&item->atom, &atom, sizeof(atom));
1433
0
      item->backtrack = i - YR_MAX_ATOM_LENGTH + 1;
1434
0
      max_quality = quality;
1435
0
    }
1436
0
  }
1437
1438
0
  *atoms = item;
1439
0
  *min_atom_quality = max_quality;
1440
1441
0
  if (modifier.flags & STRING_FLAGS_WIDE)
1442
0
  {
1443
0
    FAIL_ON_ERROR_WITH_CLEANUP(
1444
0
        _yr_atoms_wide(*atoms, &wide_atoms),
1445
0
        {  // Cleanup
1446
0
          yr_atoms_list_destroy(*atoms);
1447
0
          yr_atoms_list_destroy(wide_atoms);
1448
0
          *atoms = NULL;
1449
0
        });
1450
1451
0
    if (modifier.flags & STRING_FLAGS_ASCII)
1452
0
    {
1453
0
      *atoms = _yr_atoms_list_concat(*atoms, wide_atoms);
1454
0
    }
1455
0
    else
1456
0
    {
1457
0
      yr_atoms_list_destroy(*atoms);
1458
0
      *atoms = wide_atoms;
1459
0
    }
1460
0
  }
1461
1462
0
  if (modifier.flags & STRING_FLAGS_NO_CASE)
1463
0
  {
1464
0
    FAIL_ON_ERROR_WITH_CLEANUP(
1465
0
        _yr_atoms_case_insensitive(*atoms, &case_insensitive_atoms),
1466
0
        {  // Cleanup
1467
0
          yr_atoms_list_destroy(*atoms);
1468
0
          yr_atoms_list_destroy(case_insensitive_atoms);
1469
0
          *atoms = NULL;
1470
0
        });
1471
1472
0
    *atoms = _yr_atoms_list_concat(*atoms, case_insensitive_atoms);
1473
0
  }
1474
1475
0
  if (modifier.flags & STRING_FLAGS_XOR)
1476
0
  {
1477
0
    FAIL_ON_ERROR_WITH_CLEANUP(
1478
0
        _yr_atoms_xor(*atoms, modifier.xor_min, modifier.xor_max, &xor_atoms),
1479
0
        {  // Cleanup
1480
0
          yr_atoms_list_destroy(*atoms);
1481
0
          yr_atoms_list_destroy(xor_atoms);
1482
0
          *atoms = NULL;
1483
0
        });
1484
1485
0
    yr_atoms_list_destroy(*atoms);
1486
0
    *atoms = xor_atoms;
1487
0
  }
1488
1489
  // Recheck the atom quality, in case we have just generated some poor atoms.
1490
  // https://github.com/VirusTotal/yara/issues/1172
1491
0
  for (item = *atoms; item != NULL; item = item->next)
1492
0
  {
1493
0
    quality = config->get_atom_quality(config, &item->atom);
1494
0
    if (quality < *min_atom_quality)
1495
0
      *min_atom_quality = quality;
1496
0
  }
1497
1498
0
  return ERROR_SUCCESS;
1499
0
}
1500
1501
////////////////////////////////////////////////////////////////////////////////
1502
// Prints an atom tree node. Used only for debugging purposes.
1503
//
1504
void yr_atoms_tree_node_print(YR_ATOM_TREE_NODE* node)
1505
0
{
1506
0
  YR_ATOM_TREE_NODE* child;
1507
1508
0
  if (node == NULL)
1509
0
  {
1510
0
    printf("Empty tree node\n");
1511
0
    return;
1512
0
  }
1513
1514
0
  switch (node->type)
1515
0
  {
1516
0
  case ATOM_TREE_LEAF:
1517
0
    for (int i = 0; i < node->atom.length; i++)
1518
0
      printf("%02X", node->atom.bytes[i]);
1519
0
    break;
1520
1521
0
  case ATOM_TREE_AND:
1522
0
  case ATOM_TREE_OR:
1523
0
    if (node->type == ATOM_TREE_AND)
1524
0
      printf("AND");
1525
0
    else
1526
0
      printf("OR");
1527
0
    printf("(");
1528
0
    child = node->children_head;
1529
0
    while (child != NULL)
1530
0
    {
1531
0
      yr_atoms_tree_node_print(child);
1532
0
      child = child->next_sibling;
1533
0
      if (child != NULL)
1534
0
        printf(",");
1535
0
    }
1536
0
    printf(")");
1537
0
    break;
1538
0
  }
1539
0
}