Coverage Report

Created: 2026-01-09 06:05

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/yara/libyara/atoms.c
Line
Count
Source
1
/*
2
Copyright (c) 2013-2018. The YARA Authors. All Rights Reserved.
3
4
Redistribution and use in source and binary forms, with or without modification,
5
are permitted provided that the following conditions are met:
6
7
1. Redistributions of source code must retain the above copyright notice, this
8
list of conditions and the following disclaimer.
9
10
2. Redistributions in binary form must reproduce the above copyright notice,
11
this list of conditions and the following disclaimer in the documentation and/or
12
other materials provided with the distribution.
13
14
3. Neither the name of the copyright holder nor the names of its contributors
15
may be used to endorse or promote products derived from this software without
16
specific prior written permission.
17
18
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
*/
29
30
/*
31
32
This module handles atom extraction from regexps and hex strings. Atoms are
33
undivided substrings found in a regexps and hex strings. Let's consider this
34
hex string:
35
36
{ 01 02 03 04 05 ?? 06 07 08 [1-2] 09 0A }
37
38
In the above string, byte sequences 0102030405, 060708 and 090A are atoms.
39
Similarly, in this regexp:
40
41
/abc.*ed[0-9]+fgh/
42
43
The strings "abc", "ed" and "fgh" are atoms.
44
45
When searching for regexps/hex strings matching a file, YARA uses these
46
atoms to find locations inside the file where the regexp/hex string could
47
match. If the atom "abc" is found somewhere inside the file, there is a chance
48
for /abc.*ed[0-9]+fgh/ to match the file, if "abc" doesn't appear in the file
49
there's no chance for the regexp to match. When the atom is found in the file
50
YARA proceeds to fully evaluate the regexp/hex string to determine if it's
51
actually a match.
52
53
For each regexp/hex string YARA extracts one or more atoms. Sometimes a
54
single atom is enough (like in the previous example "abc" is enough for finding
55
/abc.*ed[0-9]+fgh/), but sometimes a single atom isn't enough like in the
56
regexp /(abc|efg)/. In this case YARA must search for both "abc" AND "efg" and
57
fully evaluate the regexp whenever one of these atoms is found.
58
59
In the regexp /Look(at|into)this/ YARA can search for "Look", or search for
60
"this", or search for both "at" and "into". This is what we call an atoms tree,
61
because it can be represented by the following tree structure:
62
63
-OR
64
  |- "Look"
65
  |
66
  |- AND
67
  |   |
68
  |   |- "at"
69
  |    - "into"
70
  |
71
   - "this"
72
73
From an atom tree YARA chooses the best combination, trying to minimize the
74
number of required atoms, but also using high quality atoms (long atoms with
75
not too many zeroes and a bit of byte diversity). In the previous example YARA
76
will end up using the "Look" atom alone, but in /a(bcd|efg)h/ atoms "bcd" and
77
"efg" will be used because "a" and "h" are too short.
78
*/
79
80
#include <assert.h>
81
#include <string.h>
82
#include <yara/atoms.h>
83
#include <yara/error.h>
84
#include <yara/globals.h>
85
#include <yara/limits.h>
86
#include <yara/mem.h>
87
#include <yara/stack.h>
88
#include <yara/types.h>
89
#include <yara/utils.h>
90
91
////////////////////////////////////////////////////////////////////////////////
92
// Returns a numeric value indicating the quality of an atom. The quality
93
// depends on some characteristics of the atom, including its length, number
94
// of very common bytes like 00 and FF and number of unique distinct bytes.
95
// Atom 00 00 has a very low quality, because it's only two bytes long and
96
// both bytes are zeroes. Atom 01 01 01 01 is better but still not optimal,
97
// because the same byte is repeated. Atom 01 02 03 04 is an optimal one.
98
//
99
// Args:
100
//    config: Pointer to YR_ATOMS_CONFIG struct.
101
//    atom: Pointer to YR_ATOM struct.
102
//
103
// Returns:
104
//    An integer indicating the atom's quality
105
//
106
int yr_atoms_heuristic_quality(YR_ATOMS_CONFIG* config, YR_ATOM* atom)
107
8.06M
{
108
8.06M
  YR_BITMASK seen_bytes[YR_BITMASK_SIZE(256)];
109
110
8.06M
  int quality = 0;
111
8.06M
  int unique_bytes = 0;
112
113
8.06M
  assert(atom->length <= YR_MAX_ATOM_LENGTH);
114
115
8.06M
  yr_bitmask_clear_all(seen_bytes);
116
117
  // Each byte in the atom contributes a certain amount of points to the
118
  // quality. Bytes [a-zA-Z] contribute 18 points each, common bytes like
119
  // 0x00, 0x20 and 0xFF contribute only 12 points, and the rest of the
120
  // bytes contribute 20 points. The ?? mask substracts 10 points, and masks
121
  // X? and ?X contribute 4 points. An additional boost consisting in 2x the
122
  // number of unique bytes in the atom is added to the quality. This are
123
  // some examples of the quality of atoms:
124
  //
125
  //   01 02 03 04   quality = 20 + 20 + 20 + 20 + 8 = 88
126
  //   01 ?? 03 04   quality = 20 - 10 + 20 + 20 + 6 = 56
127
  //   01 0? 03      quality = 20 +  4 + 20      + 4 = 48
128
  //   01 02         quality = 20 + 20           + 4 = 44
129
  //   01 ?? ?3 04   quality = 20 - 10 +  2 + 20 + 4 = 36
130
  //   61 62         quality = 18 + 18           + 4 = 40
131
  //   61 61         quality = 18 + 18           + 2 = 38  <- warning threshold
132
  //   00 01         quality = 12 + 20           + 4 = 36
133
  //   01 ?? 02      quality = 20 - 10 + 20      + 4 = 34
134
  //   01            quality = 20                + 1 = 21
135
136
39.4M
  for (int i = 0; i < atom->length; i++)
137
31.4M
  {
138
31.4M
    switch (atom->mask[i])
139
31.4M
    {
140
19.5k
    case 0x00:
141
19.5k
      quality -= 10;
142
19.5k
      break;
143
938
    case 0x0F:
144
938
      quality += 4;
145
938
      break;
146
1.66k
    case 0xF0:
147
1.66k
      quality += 4;
148
1.66k
      break;
149
31.3M
    case 0xFF:
150
31.3M
      switch (atom->bytes[i])
151
31.3M
      {
152
2.07M
      case 0x00:
153
2.84M
      case 0x20:
154
2.84M
      case 0xCC:
155
7.94M
      case 0xFF:
156
        // Common bytes contribute less to the quality than the rest.
157
7.94M
        quality += 12;
158
7.94M
        break;
159
23.4M
      default:
160
        // Bytes in the a-z and A-Z ranges have a slightly lower quality
161
        // than the rest. We want to favor atoms that contain bytes outside
162
        // those ranges because they generate less additional atoms during
163
        // calls to _yr_atoms_case_combinations.
164
23.4M
        if (yr_lowercase[atom->bytes[i]] >= 'a' &&
165
17.4M
            yr_lowercase[atom->bytes[i]] <= 'z')
166
16.0M
          quality += 18;
167
7.42M
        else
168
7.42M
          quality += 20;
169
31.3M
      }
170
171
31.3M
      if (!yr_bitmask_is_set(seen_bytes, atom->bytes[i]))
172
24.5M
      {
173
24.5M
        yr_bitmask_set(seen_bytes, atom->bytes[i]);
174
24.5M
        unique_bytes++;
175
24.5M
      }
176
31.4M
    }
177
31.4M
  }
178
179
  // If all the bytes in the atom are equal and very common, let's penalize
180
  // it heavily.
181
8.06M
  if (unique_bytes == 1 && (yr_bitmask_is_set(seen_bytes, 0x00) ||
182
1.39M
                            yr_bitmask_is_set(seen_bytes, 0x20) ||
183
1.38M
                            yr_bitmask_is_set(seen_bytes, 0x90) ||
184
1.38M
                            yr_bitmask_is_set(seen_bytes, 0xCC) ||
185
1.38M
                            yr_bitmask_is_set(seen_bytes, 0xFF)))
186
989k
  {
187
989k
    quality -= 10 * atom->length;
188
989k
  }
189
  // In general atoms with more unique bytes have a better quality, so let's
190
  // boost the quality in the amount of unique bytes.
191
7.07M
  else
192
7.07M
  {
193
7.07M
    quality += 2 * unique_bytes;
194
7.07M
  }
195
196
  // The final quality is not zero-based, we start at YR_MAX_ATOM_QUALITY
197
  // for the best possible atom and substract from there. The best possible
198
  // quality is 21 * YR_MAX_ATOM_LENGTH (20 points per byte + 2 additional point
199
  // per unique byte, with a maximum of 2*YR_MAX_ATOM_LENGTH unique bytes).
200
201
8.06M
  return YR_MAX_ATOM_QUALITY - 22 * YR_MAX_ATOM_LENGTH + quality;
202
8.06M
}
203
204
////////////////////////////////////////////////////////////////////////////////
205
// Compares the byte sequence in a1 with the YR_ATOM in a2, taking atom's mask
206
// into account.
207
//
208
// Returns:
209
//   < 0 if the first byte that does not match has a lower value in a1 than
210
//       in a2.
211
//   > 0 if the first byte that does not match has a greater value in a1 than
212
//       in a2.
213
//   = 0 if a1 is equal or matches a2.
214
//
215
static int _yr_atoms_cmp(const uint8_t* a1, YR_ATOM* a2)
216
0
{
217
0
  int result = 0;
218
0
  int i = 0;
219
220
0
  while (result == 0 && i < a2->length)
221
0
  {
222
0
    switch (a2->mask[i])
223
0
    {
224
0
    case 0xFF:
225
0
    case 0x0F:
226
0
    case 0xF0:
227
0
    case 0x00:
228
0
      result = (a1[i] & a2->mask[i]) - a2->bytes[i];
229
0
      break;
230
0
    default:
231
0
      assert(false);
232
0
    }
233
234
0
    i++;
235
0
  }
236
237
0
  return result;
238
0
}
239
240
////////////////////////////////////////////////////////////////////////////////
241
// Returns a numeric value indicating the quality of an atom. The quality is
242
// based in the atom quality table passed in "config". Very common atoms
243
// (i.e: those with greater quality) have lower quality than those that are
244
// uncommon. See the comment for yr_compiler_set_atom_quality_table for
245
// details about the quality table's format.
246
//
247
// Args:
248
//    YR_ATOMS_CONFIG* config   - Pointer to YR_ATOMS_CONFIG struct.
249
//    YR_ATOM* atom             - Pointer to YR_ATOM struct.
250
//
251
// Returns:
252
//    An integer indicating the atom's quality
253
//
254
int yr_atoms_table_quality(YR_ATOMS_CONFIG* config, YR_ATOM* atom)
255
0
{
256
0
  YR_ATOM_QUALITY_TABLE_ENTRY* table = config->quality_table;
257
258
0
  int begin = 0;
259
0
  int end = config->quality_table_entries;
260
261
0
  assert(atom->length <= YR_MAX_ATOM_LENGTH);
262
263
0
  while (end > begin)
264
0
  {
265
0
    int middle = begin + (end - begin) / 2;
266
0
    int c = _yr_atoms_cmp(table[middle].atom, atom);
267
268
0
    if (c < 0)
269
0
    {
270
0
      begin = middle + 1;
271
0
    }
272
0
    else if (c > 0)
273
0
    {
274
0
      end = middle;
275
0
    }
276
0
    else
277
0
    {
278
0
      int i = middle + 1;
279
0
      int quality = table[middle].quality;
280
0
      int min_quality = quality;
281
282
0
      while (i < end && _yr_atoms_cmp(table[i].atom, atom) == 0)
283
0
      {
284
0
        if (min_quality > table[i].quality)
285
0
          min_quality = table[i].quality;
286
287
0
        i++;
288
0
      }
289
290
0
      i = middle - 1;
291
292
0
      while (i >= begin && _yr_atoms_cmp(table[i].atom, atom) == 0)
293
0
      {
294
0
        if (min_quality > table[i].quality)
295
0
          min_quality = table[i].quality;
296
297
0
        i--;
298
0
      }
299
300
0
      return min_quality >> (YR_MAX_ATOM_LENGTH - atom->length);
301
0
    }
302
0
  }
303
304
0
  return YR_MAX_ATOM_QUALITY;
305
0
}
306
307
////////////////////////////////////////////////////////////////////////////////
308
// Returns the quality for the worst quality atom in a list.
309
//
310
int yr_atoms_min_quality(YR_ATOMS_CONFIG* config, YR_ATOM_LIST_ITEM* atom_list)
311
0
{
312
0
  YR_ATOM_LIST_ITEM* atom;
313
314
0
  int quality;
315
0
  int min_quality = YR_MAX_ATOM_QUALITY;
316
317
0
  if (atom_list == NULL)
318
0
    return YR_MIN_ATOM_QUALITY;
319
320
0
  atom = atom_list;
321
322
0
  while (atom != NULL)
323
0
  {
324
0
    quality = config->get_atom_quality(config, &atom->atom);
325
326
0
    if (quality < min_quality)
327
0
      min_quality = quality;
328
329
0
    atom = atom->next;
330
0
  }
331
332
0
  return min_quality;
333
0
}
334
335
////////////////////////////////////////////////////////////////////////////////
336
// Creates a new node for an atoms tree.
337
//
338
static YR_ATOM_TREE_NODE* _yr_atoms_tree_node_create(uint8_t type)
339
620k
{
340
620k
  YR_ATOM_TREE_NODE* new_node = (YR_ATOM_TREE_NODE*) yr_malloc(
341
620k
      sizeof(YR_ATOM_TREE_NODE));
342
343
620k
  if (new_node != NULL)
344
620k
  {
345
620k
    new_node->type = type;
346
620k
    new_node->atom.length = 0;
347
620k
    new_node->next_sibling = NULL;
348
620k
    new_node->children_head = NULL;
349
620k
    new_node->children_tail = NULL;
350
620k
  }
351
352
620k
  return new_node;
353
620k
}
354
355
////////////////////////////////////////////////////////////////////////////////
356
// Destroys a node from an atoms tree.
357
//
358
static void _yr_atoms_tree_node_destroy(YR_ATOM_TREE_NODE* node)
359
620k
{
360
620k
  YR_ATOM_TREE_NODE* child;
361
620k
  YR_ATOM_TREE_NODE* next_child;
362
363
620k
  if (node == NULL)
364
0
    return;
365
366
620k
  if (node->type == ATOM_TREE_OR || node->type == ATOM_TREE_AND)
367
33.6k
  {
368
33.6k
    child = node->children_head;
369
370
647k
    while (child != NULL)
371
613k
    {
372
613k
      next_child = child->next_sibling;
373
613k
      _yr_atoms_tree_node_destroy(child);
374
613k
      child = next_child;
375
613k
    }
376
33.6k
  }
377
378
620k
  yr_free(node);
379
620k
}
380
381
////////////////////////////////////////////////////////////////////////////////
382
// Appends a new child node to another atoms tree node.
383
//
384
static void _yr_atoms_tree_node_append(
385
    YR_ATOM_TREE_NODE* dest,
386
    YR_ATOM_TREE_NODE* node)
387
595k
{
388
595k
  if (dest->children_head == NULL)
389
23.6k
    dest->children_head = node;
390
391
595k
  if (dest->children_tail != NULL)
392
572k
    dest->children_tail->next_sibling = node;
393
394
595k
  dest->children_tail = node;
395
595k
}
396
397
////////////////////////////////////////////////////////////////////////////////
398
// Destroys an atoms tree.
399
//
400
static void _yr_atoms_tree_destroy(YR_ATOM_TREE* atom_tree)
401
7.00k
{
402
7.00k
  _yr_atoms_tree_node_destroy(atom_tree->root_node);
403
7.00k
  yr_free(atom_tree);
404
7.00k
}
405
406
////////////////////////////////////////////////////////////////////////////////
407
// Destroys an atoms list.
408
//
409
void yr_atoms_list_destroy(YR_ATOM_LIST_ITEM* list_head)
410
55.9k
{
411
55.9k
  YR_ATOM_LIST_ITEM* item = list_head;
412
55.9k
  YR_ATOM_LIST_ITEM* next;
413
414
1.81M
  while (item != NULL)
415
1.75M
  {
416
1.75M
    next = item->next;
417
1.75M
    yr_free(item);
418
1.75M
    item = next;
419
1.75M
  }
420
55.9k
}
421
422
////////////////////////////////////////////////////////////////////////////////
423
// Concats two atoms lists.
424
//
425
static YR_ATOM_LIST_ITEM* _yr_atoms_list_concat(
426
    YR_ATOM_LIST_ITEM* list1,
427
    YR_ATOM_LIST_ITEM* list2)
428
8.64k
{
429
8.64k
  YR_ATOM_LIST_ITEM* item;
430
431
8.64k
  if (list1 == NULL)
432
812
    return list2;
433
434
7.83k
  item = list1;
435
436
209k
  while (item->next != NULL)
437
202k
  {
438
202k
    item = item->next;
439
202k
  }
440
441
7.83k
  item->next = list2;
442
7.83k
  return list1;
443
8.64k
}
444
445
////////////////////////////////////////////////////////////////////////////////
446
// If the atom starts or ends with an unknown byte (mask == 0x00), trim
447
// those bytes out of the atom. We don't want to expand an atom like
448
// { ?? 01 02 } into { 00 01 02 }, { 01 01 02}, { 02 01 02} .. { FF 01 02}
449
// in those cases it's better to simply have a shorter atom { 01 02 }.
450
//
451
// Args:
452
//   atom: Pointer to the YR_ATOM to be trimmed.
453
//
454
// Returns:
455
//   The number of bytes that were trimmed from the beginning of the atom.
456
//
457
int _yr_atoms_trim(YR_ATOM* atom)
458
7.73M
{
459
7.73M
  int mask_00 = 0;
460
7.73M
  int mask_ff = 0;
461
462
7.73M
  int trim_left = 0;
463
464
7.81M
  while (trim_left < atom->length && atom->mask[trim_left] == 0) trim_left++;
465
466
7.74M
  while (atom->length > trim_left && atom->mask[atom->length - 1] == 0)
467
9.28k
    atom->length--;
468
469
7.73M
  atom->length -= trim_left;
470
471
7.73M
  if (atom->length == 0)
472
76.0k
    return 0;
473
474
  // The trimmed atom goes from trim_left to trim_left + atom->length and the
475
  // first and last byte in the atom are known (mask == 0xFF). Now count the
476
  // number of known and unknown bytes in the atom (mask == 0xFF and
477
  // mask == 0x00 respectively).
478
479
38.0M
  for (int i = 0; i < atom->length; i++)
480
30.4M
  {
481
30.4M
    if (atom->mask[trim_left + i] == 0xFF)
482
30.3M
      mask_ff++;
483
23.1k
    else if (atom->mask[trim_left + i] == 0x00)
484
19.5k
      mask_00++;
485
30.4M
  }
486
487
  // If the number of unknown bytes is >= than the number of known bytes
488
  // it doesn't make sense the to use this atom, so we use a single byte atom
489
  // containing the first known byte. If YR_MAX_ATOM_LENGTH == 4 this happens
490
  // only when the atom is like { XX ?? ?? YY }, so using the first known
491
  // byte is good enough. For larger values of YR_MAX_ATOM_LENGTH this is not
492
  // the most efficient solution, as better atoms could be choosen. For
493
  // example, in { XX ?? ?? ?? YY ZZ } the best atom is { YY ZZ } not { XX }.
494
  // But let's keep it like this for simplicity.
495
496
7.65M
  if (mask_00 >= mask_ff)
497
1.72k
    atom->length = 1;
498
499
7.65M
  if (trim_left == 0)
500
7.64M
    return 0;
501
502
  // Shift bytes and mask trim_left positions to the left.
503
504
37.5k
  for (int i = 0; i < YR_MAX_ATOM_LENGTH - trim_left; i++)
505
28.1k
  {
506
28.1k
    atom->bytes[i] = atom->bytes[trim_left + i];
507
28.1k
    atom->mask[i] = atom->mask[trim_left + i];
508
28.1k
  }
509
510
9.40k
  return trim_left;
511
7.65M
}
512
513
////////////////////////////////////////////////////////////////////////////////
514
// This function receives an atom tree and returns a list of atoms to be added
515
// to the Aho-Corasick automaton.
516
//
517
static int _yr_atoms_choose(
518
    YR_ATOMS_CONFIG* config,
519
    YR_ATOM_TREE_NODE* node,
520
    YR_ATOM_LIST_ITEM** chosen_atoms,
521
    int* atoms_quality)
522
50.5k
{
523
50.5k
  YR_ATOM_TREE_NODE* child;
524
50.5k
  YR_ATOM_LIST_ITEM* item;
525
50.5k
  YR_ATOM_LIST_ITEM* tail;
526
527
50.5k
  int shift, quality;
528
529
50.5k
  int max_quality = YR_MIN_ATOM_QUALITY;
530
50.5k
  int min_quality = YR_MAX_ATOM_QUALITY;
531
532
50.5k
  *chosen_atoms = NULL;
533
50.5k
  *atoms_quality = YR_MIN_ATOM_QUALITY;
534
535
50.5k
  switch (node->type)
536
50.5k
  {
537
16.8k
  case ATOM_TREE_LEAF:
538
539
16.8k
    item = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
540
541
16.8k
    if (item == NULL)
542
0
      return ERROR_INSUFFICIENT_MEMORY;
543
544
16.8k
    memcpy(&item->atom, &node->atom, sizeof(YR_ATOM));
545
546
16.8k
    shift = _yr_atoms_trim(&item->atom);
547
548
16.8k
    if (item->atom.length > 0)
549
16.1k
    {
550
16.1k
      item->forward_code_ref = node->re_nodes[shift]->forward_code_ref;
551
16.1k
      item->backward_code_ref = node->re_nodes[shift]->backward_code_ref;
552
16.1k
      item->backtrack = 0;
553
16.1k
      item->next = NULL;
554
555
16.1k
      *chosen_atoms = item;
556
16.1k
      *atoms_quality = config->get_atom_quality(config, &item->atom);
557
16.1k
    }
558
722
    else
559
722
    {
560
722
      yr_free(item);
561
722
    }
562
563
16.8k
    break;
564
565
24.8k
  case ATOM_TREE_OR:
566
567
    // The choosen nodes are those coming from the highest quality child.
568
569
24.8k
    child = node->children_head;
570
571
50.0k
    while (child != NULL)
572
25.7k
    {
573
25.7k
      FAIL_ON_ERROR(_yr_atoms_choose(config, child, &item, &quality));
574
575
25.7k
      if (quality > max_quality)
576
22.8k
      {
577
22.8k
        max_quality = quality;
578
22.8k
        yr_atoms_list_destroy(*chosen_atoms);
579
22.8k
        *chosen_atoms = item;
580
22.8k
      }
581
2.94k
      else
582
2.94k
      {
583
2.94k
        yr_atoms_list_destroy(item);
584
2.94k
      }
585
586
25.7k
      if (max_quality == YR_MAX_ATOM_QUALITY)
587
595
        break;
588
589
25.2k
      child = child->next_sibling;
590
25.2k
    }
591
592
24.8k
    *atoms_quality = max_quality;
593
24.8k
    break;
594
595
8.89k
  case ATOM_TREE_AND:
596
597
    // The choosen nodes are the concatenation of the the nodes choosen from
598
    // all the children.
599
600
8.89k
    child = node->children_head;
601
602
26.6k
    while (child != NULL)
603
17.7k
    {
604
17.7k
      FAIL_ON_ERROR(_yr_atoms_choose(config, child, &item, &quality));
605
606
17.7k
      if (quality < min_quality)
607
9.81k
        min_quality = quality;
608
609
17.7k
      if (item != NULL)
610
16.8k
      {
611
16.8k
        tail = item;
612
85.1k
        while (tail->next != NULL) tail = tail->next;
613
614
16.8k
        tail->next = *chosen_atoms;
615
16.8k
        *chosen_atoms = item;
616
16.8k
      }
617
618
17.7k
      child = child->next_sibling;
619
17.7k
    }
620
621
8.89k
    *atoms_quality = min_quality;
622
8.89k
    break;
623
50.5k
  }
624
625
50.5k
  return ERROR_SUCCESS;
626
50.5k
}
627
628
////////////////////////////////////////////////////////////////////////////////
629
// Returns all combinations of lower and upper cases for a given atom. For
630
// atom "abc" the output would be "abc" "abC" "aBC" and so on. Resulting
631
// atoms are written into the output buffer in this format:
632
//
633
//  [size of atom 1] [atom 1]  ... [size of atom N] [atom N] [0]
634
//
635
// Notice the zero at the end to indicate where the output ends.
636
//
637
// The caller is responsible of providing a buffer large enough to hold the
638
// returned atoms.
639
//
640
static uint8_t* _yr_atoms_case_combinations(
641
    uint8_t* atom,
642
    int atom_length,
643
    int atom_offset,
644
    uint8_t* output_buffer)
645
1.71M
{
646
1.71M
  uint8_t c;
647
1.71M
  uint8_t* new_atom;
648
649
1.71M
  if (atom_offset + 1 < atom_length)
650
968k
    output_buffer = _yr_atoms_case_combinations(
651
968k
        atom, atom_length, atom_offset + 1, output_buffer);
652
653
1.71M
  c = atom[atom_offset];
654
655
1.71M
  if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
656
1.20M
  {
657
    // Write atom length.
658
1.20M
    *output_buffer = atom_length;
659
1.20M
    output_buffer++;
660
661
1.20M
    memcpy(output_buffer, atom, atom_length);
662
663
1.20M
    new_atom = output_buffer;
664
1.20M
    output_buffer += atom_length;
665
666
    // Swap character case.
667
1.20M
    if (c >= 'a' && c <= 'z')
668
1.09M
      new_atom[atom_offset] -= 32;
669
110k
    else
670
110k
      new_atom[atom_offset] += 32;
671
672
1.20M
    if (atom_offset + 1 < atom_length)
673
558k
      output_buffer = _yr_atoms_case_combinations(
674
558k
          new_atom, atom_length, atom_offset + 1, output_buffer);
675
1.20M
  }
676
677
1.71M
  if (atom_offset == 0)
678
188k
    *output_buffer = 0;
679
680
1.71M
  return output_buffer;
681
1.71M
}
682
683
// Size of buffer used in _yr_atoms_case_insensitive for storing the all
684
// the possible combinations for an atom. Each atom has up to YR_MAX_ATOM_LENGTH
685
// characters and each character has two possible values (upper and lower case).
686
// That means 2 ^ YR_MAX_ATOM_LENGTH combinations for an atom, where each atom
687
// occupies YR_MAX_ATOM_LENGTH + 1 bytes (the atom itself +1 byte for its
688
// length). One extra bytes is allocated for the zero value indicating the end.
689
690
#define CASE_COMBINATIONS_BUFFER_SIZE \
691
  (1 << YR_MAX_ATOM_LENGTH) * (YR_MAX_ATOM_LENGTH + 1) + 1
692
693
////////////////////////////////////////////////////////////////////////////////
694
// For a given list of atoms returns another list of atoms
695
// with every case combination.
696
//
697
static int _yr_atoms_case_insensitive(
698
    YR_ATOM_LIST_ITEM* atoms,
699
    YR_ATOM_LIST_ITEM** case_insensitive_atoms)
700
6.88k
{
701
6.88k
  YR_ATOM_LIST_ITEM* atom;
702
6.88k
  YR_ATOM_LIST_ITEM* new_atom;
703
704
6.88k
  uint8_t buffer[CASE_COMBINATIONS_BUFFER_SIZE];
705
6.88k
  uint8_t atom_length;
706
6.88k
  uint8_t* atoms_cursor;
707
708
6.88k
  int i;
709
710
6.88k
  *case_insensitive_atoms = NULL;
711
6.88k
  atom = atoms;
712
713
195k
  while (atom != NULL)
714
188k
  {
715
188k
    _yr_atoms_case_combinations(atom->atom.bytes, atom->atom.length, 0, buffer);
716
717
188k
    atoms_cursor = buffer;
718
188k
    atom_length = *atoms_cursor;
719
188k
    atoms_cursor++;
720
721
1.39M
    while (atom_length != 0)
722
1.20M
    {
723
1.20M
      new_atom = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
724
725
1.20M
      if (new_atom == NULL)
726
0
        return ERROR_INSUFFICIENT_MEMORY;
727
728
6.00M
      for (i = 0; i < atom_length; i++)
729
4.79M
      {
730
4.79M
        new_atom->atom.bytes[i] = atoms_cursor[i];
731
4.79M
        new_atom->atom.mask[i] = 0xFF;
732
4.79M
      }
733
734
1.20M
      new_atom->atom.length = atom_length;
735
1.20M
      new_atom->forward_code_ref = atom->forward_code_ref;
736
1.20M
      new_atom->backward_code_ref = atom->backward_code_ref;
737
1.20M
      new_atom->backtrack = atom->backtrack;
738
1.20M
      new_atom->next = *case_insensitive_atoms;
739
740
1.20M
      *case_insensitive_atoms = new_atom;
741
742
1.20M
      atoms_cursor += atom_length;
743
1.20M
      atom_length = *atoms_cursor;
744
1.20M
      atoms_cursor++;
745
1.20M
    }
746
747
188k
    atom = atom->next;
748
188k
  }
749
750
6.88k
  return ERROR_SUCCESS;
751
6.88k
}
752
753
////////////////////////////////////////////////////////////////////////////////
754
// For a given list of atoms returns another list after a single byte xor
755
// has been applied to it.
756
//
757
static int _yr_atoms_xor(
758
    YR_ATOM_LIST_ITEM* atoms,
759
    uint8_t min,
760
    uint8_t max,
761
    YR_ATOM_LIST_ITEM** xor_atoms)
762
857
{
763
857
  YR_ATOM_LIST_ITEM* atom;
764
857
  YR_ATOM_LIST_ITEM* new_atom;
765
766
857
  int i, j;
767
857
  *xor_atoms = NULL;
768
857
  atom = atoms;
769
770
2.11k
  while (atom != NULL)
771
1.25k
  {
772
230k
    for (j = min; j <= max; j++)
773
229k
    {
774
229k
      new_atom = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
775
776
229k
      if (new_atom == NULL)
777
0
        return ERROR_INSUFFICIENT_MEMORY;
778
779
892k
      for (i = 0; i < atom->atom.length; i++)
780
663k
      {
781
663k
        new_atom->atom.bytes[i] = atom->atom.bytes[i] ^ j;
782
663k
        new_atom->atom.mask[i] = 0xFF;
783
663k
      }
784
785
229k
      new_atom->atom.length = yr_min(atom->atom.length, YR_MAX_ATOM_LENGTH);
786
229k
      new_atom->forward_code_ref = atom->forward_code_ref;
787
229k
      new_atom->backward_code_ref = atom->backward_code_ref;
788
229k
      new_atom->backtrack = atom->backtrack;
789
229k
      new_atom->next = *xor_atoms;
790
791
229k
      *xor_atoms = new_atom;
792
229k
    }
793
794
1.25k
    atom = atom->next;
795
1.25k
  }
796
857
  return ERROR_SUCCESS;
797
857
}
798
799
////////////////////////////////////////////////////////////////////////////////
800
// For a given list of atoms returns another list with the corresponding
801
// wide atoms. Wide atoms are just the original atoms with interleaved zeroes,
802
// for example: 01 02 -> 01 00 02 00
803
//
804
static int _yr_atoms_wide(
805
    YR_ATOM_LIST_ITEM* atoms,
806
    YR_ATOM_LIST_ITEM** wide_atoms)
807
3.29k
{
808
3.29k
  YR_ATOM_LIST_ITEM* atom;
809
3.29k
  YR_ATOM_LIST_ITEM* new_atom;
810
811
3.29k
  int i;
812
813
3.29k
  *wide_atoms = NULL;
814
3.29k
  atom = atoms;
815
816
31.8k
  while (atom != NULL)
817
28.5k
  {
818
28.5k
    new_atom = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
819
820
28.5k
    if (new_atom == NULL)
821
0
      return ERROR_INSUFFICIENT_MEMORY;
822
823
142k
    for (i = 0; i < YR_MAX_ATOM_LENGTH; i++)
824
114k
    {
825
114k
      new_atom->atom.bytes[i] = 0;
826
114k
      new_atom->atom.mask[i] = 0xFF;
827
114k
    }
828
829
83.4k
    for (i = 0; i < atom->atom.length; i++)
830
80.8k
    {
831
80.8k
      if (i * 2 < YR_MAX_ATOM_LENGTH)
832
54.9k
        new_atom->atom.bytes[i * 2] = atom->atom.bytes[i];
833
25.8k
      else
834
25.8k
        break;
835
80.8k
    }
836
837
28.5k
    new_atom->atom.length = yr_min(atom->atom.length * 2, YR_MAX_ATOM_LENGTH);
838
28.5k
    new_atom->forward_code_ref = atom->forward_code_ref;
839
28.5k
    new_atom->backward_code_ref = atom->backward_code_ref;
840
28.5k
    new_atom->backtrack = atom->backtrack * 2;
841
28.5k
    new_atom->next = *wide_atoms;
842
843
28.5k
    *wide_atoms = new_atom;
844
845
28.5k
    atom = atom->next;
846
28.5k
  }
847
848
3.29k
  return ERROR_SUCCESS;
849
3.29k
}
850
851
struct STACK_ITEM
852
{
853
  RE_NODE* re_node;
854
  YR_ATOM_TREE_NODE* new_appending_node;
855
};
856
857
#define make_atom_from_re_nodes(atom, nodes_length, nodes)   \
858
7.71M
  {                                                          \
859
7.71M
    atom.length = nodes_length;                              \
860
38.1M
    for (i = 0; i < atom.length; i++)                        \
861
30.4M
    {                                                        \
862
30.4M
      atom.bytes[i] = (uint8_t) (recent_re_nodes)[i]->value; \
863
30.4M
      atom.mask[i] = (uint8_t) (recent_re_nodes)[i]->mask;   \
864
30.4M
    }                                                        \
865
7.71M
  }
866
867
////////////////////////////////////////////////////////////////////////////////
868
// Extract atoms from a regular expression. This is a helper function used by
869
// yr_atoms_extract_from_re that receives the abstract syntax tree for a regexp
870
// (or hex pattern) and builds an atom tree. The appending_node argument is a
871
// pointer to the ATOM_TREE_OR node at the root of the atom tree. This function
872
// creates the tree by appending new nodes to it.
873
//
874
static int _yr_atoms_extract_from_re(
875
    YR_ATOMS_CONFIG* config,
876
    RE_AST* re_ast,
877
    YR_ATOM_TREE_NODE* appending_node)
878
7.00k
{
879
7.00k
  YR_STACK* stack;
880
7.00k
  RE_NODE* re_node;
881
882
7.00k
  YR_ATOM atom = {0};
883
7.00k
  YR_ATOM best_atom = {0};
884
885
7.00k
  struct STACK_ITEM si;
886
887
7.00k
  int i, shift;
888
7.00k
  int quality;
889
7.00k
  int best_quality = -1;
890
7.00k
  int n = 0;
891
892
7.00k
  YR_ATOM_TREE_NODE* and_node;
893
7.00k
  YR_ATOM_TREE_NODE* left_node;
894
7.00k
  YR_ATOM_TREE_NODE* right_node;
895
896
  // The RE_NODEs most recently visited that can conform an atom (ie:
897
  // RE_NODE_LITERAL, RE_NODE_MASKED_LITERAL and RE_NODE_ANY). The number of
898
  // items in this array is n.
899
7.00k
  RE_NODE* recent_re_nodes[YR_MAX_ATOM_LENGTH];
900
901
  // The RE_NODEs corresponding to the best atom found so far for the current
902
  // appending node.
903
7.00k
  RE_NODE* best_atom_re_nodes[YR_MAX_ATOM_LENGTH];
904
905
  // This holds the ATOM_TREE_OR node where leaves (ATOM_TREE_LEAF) are
906
  // currently being appended.
907
7.00k
  YR_ATOM_TREE_NODE* current_appending_node = NULL;
908
909
  // This holds the ATOM_TREE_LEAF node whose atom is currently being updated.
910
7.00k
  YR_ATOM_TREE_NODE* leaf = NULL;
911
912
7.00k
  FAIL_ON_ERROR(yr_stack_create(1024, sizeof(si), &stack));
913
914
  // This first item pushed in the stack is the last one to be poped out, the
915
  // sole purpose of this item is forcing that any pending leaf is appended to
916
  // appending_node during the last iteration of the loop.
917
7.00k
  si.re_node = NULL;
918
7.00k
  si.new_appending_node = appending_node;
919
920
7.00k
  FAIL_ON_ERROR_WITH_CLEANUP(
921
7.00k
      yr_stack_push(stack, (void*) &si), yr_stack_destroy(stack));
922
923
  // Start processing the root node.
924
7.00k
  si.re_node = re_ast->root_node;
925
926
  // Leaf nodes are initially appended to the node passed in the appending_node,
927
  // argument which is the root ATOM_TREE_OR node that is empty at this point.
928
7.00k
  si.new_appending_node = appending_node;
929
930
7.00k
  FAIL_ON_ERROR_WITH_CLEANUP(
931
7.00k
      yr_stack_push(stack, (void*) &si), yr_stack_destroy(stack));
932
933
17.0M
  while (yr_stack_pop(stack, (void*) &si))
934
17.0M
  {
935
    // Change the appending node if the item poped from the stack says so.
936
17.0M
    if (si.new_appending_node != NULL)
937
924k
    {
938
      // Before changing the appending node let's append any pending leaf to
939
      // the current appending node.
940
924k
      if (n > 0)
941
586k
      {
942
586k
        make_atom_from_re_nodes(atom, n, recent_re_nodes);
943
586k
        shift = _yr_atoms_trim(&atom);
944
586k
        quality = config->get_atom_quality(config, &atom);
945
946
586k
        FAIL_ON_NULL_WITH_CLEANUP(
947
586k
            leaf = _yr_atoms_tree_node_create(ATOM_TREE_LEAF),
948
586k
            yr_stack_destroy(stack));
949
950
586k
        if (quality > best_quality)
951
196k
        {
952
196k
          memcpy(&leaf->atom, &atom, sizeof(atom));
953
196k
          memcpy(
954
196k
              &leaf->re_nodes,
955
196k
              &recent_re_nodes[shift],
956
196k
              sizeof(recent_re_nodes) - shift * sizeof(recent_re_nodes[0]));
957
196k
        }
958
390k
        else
959
390k
        {
960
390k
          memcpy(&leaf->atom, &best_atom, sizeof(best_atom));
961
390k
          memcpy(
962
390k
              &leaf->re_nodes, &best_atom_re_nodes, sizeof(best_atom_re_nodes));
963
390k
        }
964
965
586k
        _yr_atoms_tree_node_append(current_appending_node, leaf);
966
586k
        n = 0;
967
586k
      }
968
969
924k
      current_appending_node = si.new_appending_node;
970
924k
      best_quality = -1;
971
924k
    }
972
973
17.0M
    if (si.re_node != NULL)
974
16.1M
    {
975
16.1M
      switch (si.re_node->type)
976
16.1M
      {
977
14.5M
      case RE_NODE_LITERAL:
978
14.5M
      case RE_NODE_MASKED_LITERAL:
979
14.6M
      case RE_NODE_ANY:
980
981
14.6M
        if (n < YR_MAX_ATOM_LENGTH)
982
1.95M
        {
983
1.95M
          recent_re_nodes[n] = si.re_node;
984
1.95M
          best_atom_re_nodes[n] = si.re_node;
985
1.95M
          best_atom.bytes[n] = (uint8_t) si.re_node->value;
986
1.95M
          best_atom.mask[n] = (uint8_t) si.re_node->mask;
987
1.95M
          best_atom.length = ++n;
988
1.95M
        }
989
12.7M
        else if (best_quality < YR_MAX_ATOM_QUALITY)
990
7.12M
        {
991
7.12M
          make_atom_from_re_nodes(atom, n, recent_re_nodes);
992
7.12M
          shift = _yr_atoms_trim(&atom);
993
7.12M
          quality = config->get_atom_quality(config, &atom);
994
995
7.12M
          if (quality > best_quality)
996
1.09M
          {
997
5.46M
            for (i = 0; i < atom.length; i++)
998
4.36M
            {
999
4.36M
              best_atom.bytes[i] = atom.bytes[i];
1000
4.36M
              best_atom.mask[i] = atom.mask[i];
1001
4.36M
              best_atom_re_nodes[i] = recent_re_nodes[i + shift];
1002
4.36M
            }
1003
1004
1.09M
            best_atom.length = atom.length;
1005
1.09M
            best_quality = quality;
1006
1.09M
          }
1007
1008
28.5M
          for (i = 1; i < YR_MAX_ATOM_LENGTH; i++)
1009
21.3M
            recent_re_nodes[i - 1] = recent_re_nodes[i];
1010
1011
7.12M
          recent_re_nodes[YR_MAX_ATOM_LENGTH - 1] = si.re_node;
1012
7.12M
        }
1013
1014
14.6M
        break;
1015
1016
566k
      case RE_NODE_CONCAT:
1017
1018
566k
        re_node = si.re_node->children_tail;
1019
1020
        // Push children right to left, they are poped left to right.
1021
16.3M
        while (re_node != NULL)
1022
15.7M
        {
1023
15.7M
          si.new_appending_node = NULL;
1024
15.7M
          si.re_node = re_node;
1025
1026
15.7M
          FAIL_ON_ERROR_WITH_CLEANUP(
1027
15.7M
              yr_stack_push(stack, &si), yr_stack_destroy(stack));
1028
1029
15.7M
          re_node = re_node->prev_sibling;
1030
15.7M
        }
1031
1032
566k
        break;
1033
1034
566k
      case RE_NODE_ALT:
1035
1036
        // Create ATOM_TREE_AND node with two ATOM_TREE_OR children nodes.
1037
8.89k
        and_node = _yr_atoms_tree_node_create(ATOM_TREE_AND);
1038
8.89k
        left_node = _yr_atoms_tree_node_create(ATOM_TREE_OR);
1039
8.89k
        right_node = _yr_atoms_tree_node_create(ATOM_TREE_OR);
1040
1041
8.89k
        if (and_node == NULL || left_node == NULL || right_node == NULL)
1042
0
        {
1043
0
          _yr_atoms_tree_node_destroy(and_node);
1044
0
          _yr_atoms_tree_node_destroy(left_node);
1045
0
          _yr_atoms_tree_node_destroy(right_node);
1046
1047
0
          yr_stack_destroy(stack);
1048
1049
0
          return ERROR_INSUFFICIENT_MEMORY;
1050
0
        }
1051
1052
8.89k
        and_node->children_head = left_node;
1053
8.89k
        and_node->children_tail = right_node;
1054
8.89k
        left_node->next_sibling = right_node;
1055
1056
        // Add the ATOM_TREE_AND as children of the current node.
1057
8.89k
        _yr_atoms_tree_node_append(current_appending_node, and_node);
1058
1059
8.89k
        re_node = si.re_node;
1060
1061
8.89k
        si.new_appending_node = current_appending_node;
1062
8.89k
        si.re_node = NULL;
1063
1064
8.89k
        FAIL_ON_ERROR_WITH_CLEANUP(
1065
8.89k
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1066
1067
        // RE_NODE_ALT nodes has only two children, so children_head is the
1068
        // left one, and children_tail is right one.
1069
8.89k
        si.new_appending_node = right_node;
1070
8.89k
        si.re_node = re_node->children_tail;
1071
1072
8.89k
        FAIL_ON_ERROR_WITH_CLEANUP(
1073
8.89k
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1074
1075
8.89k
        si.new_appending_node = left_node;
1076
8.89k
        si.re_node = re_node->children_head;
1077
1078
8.89k
        FAIL_ON_ERROR_WITH_CLEANUP(
1079
8.89k
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1080
1081
8.89k
        break;
1082
1083
532
      case RE_NODE_PLUS:
1084
1085
532
        re_node = si.re_node;
1086
1087
532
        si.new_appending_node = current_appending_node;
1088
532
        si.re_node = NULL;
1089
1090
532
        FAIL_ON_ERROR_WITH_CLEANUP(
1091
532
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1092
1093
532
        si.new_appending_node = NULL;
1094
        // RE_NODE_PLUS nodes has a single child, which is children_head.
1095
532
        si.re_node = re_node->children_head;
1096
1097
532
        FAIL_ON_ERROR_WITH_CLEANUP(
1098
532
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1099
1100
532
        break;
1101
1102
106k
      case RE_NODE_RANGE:
1103
1104
106k
        re_node = si.re_node;
1105
1106
106k
        si.new_appending_node = current_appending_node;
1107
106k
        si.re_node = NULL;
1108
1109
106k
        FAIL_ON_ERROR_WITH_CLEANUP(
1110
106k
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1111
1112
106k
        si.new_appending_node = NULL;
1113
1114
        // RE_NODE_RANGE nodes has a single child, which is children_head.
1115
106k
        si.re_node = re_node->children_head;
1116
1117
        // In a regexp like /a{10,20}/ the optimal atom is 'aaaa' (assuming
1118
        // that YR_MAX_ATOM_LENGTH = 4) because the 'a' character must appear
1119
        // at least 10 times in the matching string. Each call in the loop
1120
        // will append one 'a' to the atom, so YR_MAX_ATOM_LENGTH iterations
1121
        // are enough.
1122
1123
434k
        for (i = 0; i < yr_min(re_node->start, YR_MAX_ATOM_LENGTH); i++)
1124
328k
        {
1125
328k
          FAIL_ON_ERROR_WITH_CLEANUP(
1126
328k
              yr_stack_push(stack, &si), yr_stack_destroy(stack));
1127
328k
        }
1128
1129
106k
        break;
1130
1131
106k
      case RE_NODE_RANGE_ANY:
1132
2.03k
      case RE_NODE_STAR:
1133
2.47k
      case RE_NODE_CLASS:
1134
2.61k
      case RE_NODE_WORD_CHAR:
1135
2.80k
      case RE_NODE_NON_WORD_CHAR:
1136
2.80k
      case RE_NODE_SPACE:
1137
2.95k
      case RE_NODE_NON_SPACE:
1138
3.13k
      case RE_NODE_DIGIT:
1139
3.13k
      case RE_NODE_NON_DIGIT:
1140
3.58k
      case RE_NODE_EMPTY:
1141
450k
      case RE_NODE_ANCHOR_START:
1142
775k
      case RE_NODE_ANCHOR_END:
1143
775k
      case RE_NODE_WORD_BOUNDARY:
1144
775k
      case RE_NODE_NON_WORD_BOUNDARY:
1145
776k
      case RE_NODE_NOT_LITERAL:
1146
776k
      case RE_NODE_MASKED_NOT_LITERAL:
1147
1148
776k
        si.new_appending_node = current_appending_node;
1149
776k
        si.re_node = NULL;
1150
1151
776k
        FAIL_ON_ERROR_WITH_CLEANUP(
1152
776k
            yr_stack_push(stack, &si), yr_stack_destroy(stack));
1153
1154
776k
        break;
1155
1156
0
      default:
1157
0
        assert(false);
1158
16.1M
      }
1159
16.1M
    }
1160
17.0M
  }
1161
1162
7.00k
  yr_stack_destroy(stack);
1163
1164
7.00k
  return ERROR_SUCCESS;
1165
7.00k
}
1166
1167
////////////////////////////////////////////////////////////////////////////////
1168
// Makes an exact copy of an YR_ATOM_LIST_ITEM.
1169
//
1170
static YR_ATOM_LIST_ITEM* _yr_atoms_clone_list_item(YR_ATOM_LIST_ITEM* item)
1171
251k
{
1172
251k
  YR_ATOM_LIST_ITEM* clone = (YR_ATOM_LIST_ITEM*) yr_malloc(
1173
251k
      sizeof(YR_ATOM_LIST_ITEM));
1174
1175
251k
  if (clone == NULL)
1176
0
    return NULL;
1177
1178
251k
  memcpy(clone, item, sizeof(YR_ATOM_LIST_ITEM));
1179
1180
251k
  return clone;
1181
251k
}
1182
1183
////////////////////////////////////////////////////////////////////////////////
1184
// Given list of atoms that may contain wildcards, replace those wildcarded
1185
// atoms with a list of non-wildcarded atoms covering all the combinations
1186
// allowed by the wildcarded atom. For example, the atom {01 ?2 03} will be
1187
// replaced by {01 02 03}, {01 12 03}, {01 22 03} .. {01 F2 03}. The list
1188
// is modified in-place.
1189
//
1190
// Args:
1191
//   atoms: Pointer to first element of the list.
1192
//
1193
// Returns:
1194
//   ERROR_SUCCESS or ERROR_INSUFFICIENT_MEMORY.
1195
//
1196
static int _yr_atoms_expand_wildcards(YR_ATOM_LIST_ITEM* atoms)
1197
7.00k
{
1198
7.00k
  int i;
1199
1200
7.00k
  YR_ATOM_LIST_ITEM* atom = atoms;
1201
7.00k
  YR_ATOM_LIST_ITEM* new_atom;
1202
7.00k
  YR_ATOM_LIST_ITEM* prev_atom;
1203
7.00k
  YR_ATOM_LIST_ITEM* next_atom;
1204
1205
276k
  while (atom != NULL)
1206
269k
  {
1207
269k
    bool expanded = false;
1208
1209
1.27M
    for (i = 0; i < atom->atom.length; i++)
1210
1.00M
    {
1211
1.00M
      uint16_t a, s, e, incr = 1;
1212
1213
1.00M
      switch (atom->atom.mask[i])
1214
1.00M
      {
1215
823
      case 0x00:
1216
823
        expanded = true;
1217
823
        s = 0x00;
1218
823
        e = 0xFF;
1219
823
        break;
1220
1221
926
      case 0x0F:
1222
926
        expanded = true;
1223
926
        s = atom->atom.bytes[i];
1224
926
        e = atom->atom.bytes[i] | 0xF0;
1225
926
        incr = 0x10;
1226
926
        break;
1227
1228
1.87k
      case 0xF0:
1229
1.87k
        expanded = true;
1230
1.87k
        s = atom->atom.bytes[i];
1231
1.87k
        e = atom->atom.bytes[i] | 0x0F;
1232
1.87k
        break;
1233
1234
998k
      default:
1235
998k
        s = 0;
1236
998k
        e = 0;
1237
1.00M
      }
1238
1239
1.00M
      if (s != e)
1240
3.61k
      {
1241
3.61k
        atom->atom.bytes[i] = (uint8_t) s;
1242
3.61k
        atom->atom.mask[i] = 0xFF;
1243
3.61k
      }
1244
1245
1.00M
      prev_atom = atom;
1246
1.00M
      next_atom = atom->next;
1247
1248
1.25M
      for (a = s + incr; a <= e; a += incr)
1249
251k
      {
1250
251k
        new_atom = _yr_atoms_clone_list_item(atom);
1251
1252
251k
        if (new_atom == NULL)
1253
0
          return ERROR_INSUFFICIENT_MEMORY;
1254
1255
251k
        new_atom->atom.bytes[i] = (uint8_t) a;
1256
251k
        new_atom->atom.mask[i] = 0xFF;
1257
251k
        new_atom->next = next_atom;
1258
251k
        prev_atom->next = new_atom;
1259
251k
        prev_atom = new_atom;
1260
251k
      }
1261
1.00M
    }
1262
1263
269k
    if (!expanded)
1264
265k
      atom = atom->next;
1265
269k
  }
1266
1267
7.00k
  return ERROR_SUCCESS;
1268
7.00k
}
1269
1270
////////////////////////////////////////////////////////////////////////////////
1271
// Extract atoms from a regular expression. This function receives the abstract
1272
// syntax tree for a regexp (or hex pattern) and returns a list of atoms that
1273
// should be added to the Aho-Corasick automaton.
1274
//
1275
int yr_atoms_extract_from_re(
1276
    YR_ATOMS_CONFIG* config,
1277
    RE_AST* re_ast,
1278
    YR_MODIFIER modifier,
1279
    YR_ATOM_LIST_ITEM** atoms,
1280
    int* min_atom_quality)
1281
7.00k
{
1282
7.00k
  YR_ATOM_TREE* atom_tree = (YR_ATOM_TREE*) yr_malloc(sizeof(YR_ATOM_TREE));
1283
1284
7.00k
  YR_ATOM_LIST_ITEM* wide_atoms;
1285
7.00k
  YR_ATOM_LIST_ITEM* case_insensitive_atoms;
1286
1287
7.00k
  if (atom_tree == NULL)
1288
0
    return ERROR_INSUFFICIENT_MEMORY;
1289
1290
7.00k
  atom_tree->root_node = _yr_atoms_tree_node_create(ATOM_TREE_OR);
1291
1292
7.00k
  if (atom_tree->root_node == NULL)
1293
0
  {
1294
0
    _yr_atoms_tree_destroy(atom_tree);
1295
0
    return ERROR_INSUFFICIENT_MEMORY;
1296
0
  }
1297
1298
7.00k
  FAIL_ON_ERROR_WITH_CLEANUP(
1299
7.00k
      _yr_atoms_extract_from_re(config, re_ast, atom_tree->root_node),
1300
7.00k
      _yr_atoms_tree_destroy(atom_tree));
1301
1302
  // Initialize atom list
1303
7.00k
  *atoms = NULL;
1304
1305
  // Choose the atoms that will be used.
1306
7.00k
  FAIL_ON_ERROR_WITH_CLEANUP(
1307
7.00k
      _yr_atoms_choose(config, atom_tree->root_node, atoms, min_atom_quality),
1308
7.00k
      _yr_atoms_tree_destroy(atom_tree));
1309
1310
7.00k
  _yr_atoms_tree_destroy(atom_tree);
1311
1312
7.00k
  FAIL_ON_ERROR_WITH_CLEANUP(
1313
7.00k
      _yr_atoms_expand_wildcards(*atoms),
1314
7.00k
      {  // Cleanup
1315
7.00k
        yr_atoms_list_destroy(*atoms);
1316
7.00k
        *atoms = NULL;
1317
7.00k
      });
1318
1319
  // Don't do convert atoms to wide here if either base64 modifier is used.
1320
  // This is to avoid the situation where we have "base64 wide" because
1321
  // the wide has already been applied BEFORE the base64 encoding.
1322
7.00k
  if (modifier.flags & STRING_FLAGS_WIDE &&
1323
1.89k
      !(modifier.flags & STRING_FLAGS_BASE64 ||
1324
1.77k
        modifier.flags & STRING_FLAGS_BASE64_WIDE))
1325
1.40k
  {
1326
1.40k
    FAIL_ON_ERROR_WITH_CLEANUP(
1327
1.40k
        _yr_atoms_wide(*atoms, &wide_atoms),
1328
1.40k
        {  // Cleanup
1329
1.40k
          yr_atoms_list_destroy(*atoms);
1330
1.40k
          yr_atoms_list_destroy(wide_atoms);
1331
1.40k
          *atoms = NULL;
1332
1.40k
        });
1333
1334
1.40k
    if (modifier.flags & STRING_FLAGS_ASCII)
1335
571
    {
1336
571
      *atoms = _yr_atoms_list_concat(*atoms, wide_atoms);
1337
571
    }
1338
832
    else
1339
832
    {
1340
832
      yr_atoms_list_destroy(*atoms);
1341
832
      *atoms = wide_atoms;
1342
832
    }
1343
1.40k
  }
1344
1345
7.00k
  if (modifier.flags & STRING_FLAGS_NO_CASE)
1346
1.77k
  {
1347
1.77k
    FAIL_ON_ERROR_WITH_CLEANUP(
1348
1.77k
        _yr_atoms_case_insensitive(*atoms, &case_insensitive_atoms),
1349
1.77k
        {  // Cleanup
1350
1.77k
          yr_atoms_list_destroy(*atoms);
1351
1.77k
          yr_atoms_list_destroy(case_insensitive_atoms);
1352
1.77k
          *atoms = NULL;
1353
1.77k
        });
1354
1355
1.77k
    *atoms = _yr_atoms_list_concat(*atoms, case_insensitive_atoms);
1356
1.77k
  }
1357
1358
7.00k
  return ERROR_SUCCESS;
1359
7.00k
}
1360
1361
////////////////////////////////////////////////////////////////////////////////
1362
// Extract atoms from a string.
1363
//
1364
int yr_atoms_extract_from_string(
1365
    YR_ATOMS_CONFIG* config,
1366
    uint8_t* string,
1367
    int32_t string_length,
1368
    YR_MODIFIER modifier,
1369
    YR_ATOM_LIST_ITEM** atoms,
1370
    int* min_atom_quality)
1371
20.7k
{
1372
20.7k
  YR_ATOM_LIST_ITEM* item;
1373
20.7k
  YR_ATOM_LIST_ITEM* case_insensitive_atoms;
1374
20.7k
  YR_ATOM_LIST_ITEM* xor_atoms;
1375
20.7k
  YR_ATOM_LIST_ITEM* wide_atoms;
1376
1377
20.7k
  YR_ATOM atom;
1378
1379
20.7k
  int quality, max_quality;
1380
20.7k
  int i;
1381
1382
20.7k
  item = (YR_ATOM_LIST_ITEM*) yr_malloc(sizeof(YR_ATOM_LIST_ITEM));
1383
1384
20.7k
  if (item == NULL)
1385
0
    return ERROR_INSUFFICIENT_MEMORY;
1386
1387
20.7k
  item->forward_code_ref = YR_ARENA_NULL_REF;
1388
20.7k
  item->backward_code_ref = YR_ARENA_NULL_REF;
1389
20.7k
  item->next = NULL;
1390
20.7k
  item->backtrack = 0;
1391
1392
20.7k
  item->atom.length = yr_min(string_length, YR_MAX_ATOM_LENGTH);
1393
1394
72.7k
  for (i = 0; i < item->atom.length; i++)
1395
52.0k
  {
1396
52.0k
    item->atom.bytes[i] = string[i];
1397
52.0k
    item->atom.mask[i] = 0xFF;
1398
52.0k
  }
1399
1400
20.7k
  max_quality = config->get_atom_quality(config, &item->atom);
1401
1402
20.7k
  atom.length = YR_MAX_ATOM_LENGTH;
1403
20.7k
  memset(atom.mask, 0xFF, atom.length);
1404
1405
20.7k
  for (i = YR_MAX_ATOM_LENGTH;
1406
71.5k
       i < string_length && max_quality < YR_MAX_ATOM_QUALITY;
1407
50.8k
       i++)
1408
50.8k
  {
1409
50.8k
    atom.length = YR_MAX_ATOM_LENGTH;
1410
50.8k
    memcpy(atom.bytes, string + i - YR_MAX_ATOM_LENGTH + 1, atom.length);
1411
1412
50.8k
    quality = config->get_atom_quality(config, &atom);
1413
1414
50.8k
    if (quality > max_quality)
1415
4.40k
    {
1416
4.40k
      memcpy(&item->atom, &atom, sizeof(atom));
1417
4.40k
      item->backtrack = i - YR_MAX_ATOM_LENGTH + 1;
1418
4.40k
      max_quality = quality;
1419
4.40k
    }
1420
50.8k
  }
1421
1422
20.7k
  *atoms = item;
1423
20.7k
  *min_atom_quality = max_quality;
1424
1425
20.7k
  if (modifier.flags & STRING_FLAGS_WIDE)
1426
1.89k
  {
1427
1.89k
    FAIL_ON_ERROR_WITH_CLEANUP(
1428
1.89k
        _yr_atoms_wide(*atoms, &wide_atoms),
1429
1.89k
        {  // Cleanup
1430
1.89k
          yr_atoms_list_destroy(*atoms);
1431
1.89k
          yr_atoms_list_destroy(wide_atoms);
1432
1.89k
          *atoms = NULL;
1433
1.89k
        });
1434
1435
1.89k
    if (modifier.flags & STRING_FLAGS_ASCII)
1436
1.18k
    {
1437
1.18k
      *atoms = _yr_atoms_list_concat(*atoms, wide_atoms);
1438
1.18k
    }
1439
712
    else
1440
712
    {
1441
712
      yr_atoms_list_destroy(*atoms);
1442
712
      *atoms = wide_atoms;
1443
712
    }
1444
1.89k
  }
1445
1446
20.7k
  if (modifier.flags & STRING_FLAGS_NO_CASE)
1447
5.11k
  {
1448
5.11k
    FAIL_ON_ERROR_WITH_CLEANUP(
1449
5.11k
        _yr_atoms_case_insensitive(*atoms, &case_insensitive_atoms),
1450
5.11k
        {  // Cleanup
1451
5.11k
          yr_atoms_list_destroy(*atoms);
1452
5.11k
          yr_atoms_list_destroy(case_insensitive_atoms);
1453
5.11k
          *atoms = NULL;
1454
5.11k
        });
1455
1456
5.11k
    *atoms = _yr_atoms_list_concat(*atoms, case_insensitive_atoms);
1457
5.11k
  }
1458
1459
20.7k
  if (modifier.flags & STRING_FLAGS_XOR)
1460
857
  {
1461
857
    FAIL_ON_ERROR_WITH_CLEANUP(
1462
857
        _yr_atoms_xor(*atoms, modifier.xor_min, modifier.xor_max, &xor_atoms),
1463
857
        {  // Cleanup
1464
857
          yr_atoms_list_destroy(*atoms);
1465
857
          yr_atoms_list_destroy(xor_atoms);
1466
857
          *atoms = NULL;
1467
857
        });
1468
1469
857
    yr_atoms_list_destroy(*atoms);
1470
857
    *atoms = xor_atoms;
1471
857
  }
1472
1473
  // Recheck the atom quality, in case we have just generated some poor atoms.
1474
  // https://github.com/VirusTotal/yara/issues/1172
1475
278k
  for (item = *atoms; item != NULL; item = item->next)
1476
257k
  {
1477
257k
    quality = config->get_atom_quality(config, &item->atom);
1478
257k
    if (quality < *min_atom_quality)
1479
1.55k
      *min_atom_quality = quality;
1480
257k
  }
1481
1482
20.7k
  return ERROR_SUCCESS;
1483
20.7k
}
1484
1485
////////////////////////////////////////////////////////////////////////////////
1486
// Prints an atom tree node. Used only for debugging purposes.
1487
//
1488
void yr_atoms_tree_node_print(YR_ATOM_TREE_NODE* node)
1489
0
{
1490
0
  YR_ATOM_TREE_NODE* child;
1491
1492
0
  if (node == NULL)
1493
0
  {
1494
0
    printf("Empty tree node\n");
1495
0
    return;
1496
0
  }
1497
1498
0
  switch (node->type)
1499
0
  {
1500
0
  case ATOM_TREE_LEAF:
1501
0
    for (int i = 0; i < node->atom.length; i++)
1502
0
      printf("%02X", node->atom.bytes[i]);
1503
0
    break;
1504
1505
0
  case ATOM_TREE_AND:
1506
0
  case ATOM_TREE_OR:
1507
0
    if (node->type == ATOM_TREE_AND)
1508
0
      printf("AND");
1509
0
    else
1510
0
      printf("OR");
1511
0
    printf("(");
1512
0
    child = node->children_head;
1513
0
    while (child != NULL)
1514
0
    {
1515
0
      yr_atoms_tree_node_print(child);
1516
0
      child = child->next_sibling;
1517
0
      if (child != NULL)
1518
0
        printf(",");
1519
0
    }
1520
0
    printf(")");
1521
0
    break;
1522
0
  }
1523
0
}