/src/yara/libyara/base64.c

Source (jump to first uncovered line)
/*
Copyright (c) 2020. The YARA Authors. All Rights Reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software without
specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <string.h>
#include <yara/base64.h>
#include <yara/error.h>
#include <yara/mem.h>
#include <yara/re.h>
#include <yara/sizedstr.h>

////////////////////////////////////////////////////////////////////////////////
// Base64 encode the bytes of "in" after prepending "i" filler bytes ('A'),
// using the 64-character table found in "alphabet". The number of '=' padding
// characters emitted at the end of the encoding is stored in "pad"; the caller
// is expected to trim the leading and trailing characters that are affected
// by the filler bytes and by the padding.
//
// This is based upon the ideas at:
// https://www.leeholmes.com/blog/2019/12/10/searching-for-content-in-base-64-strings-2/
//
// Returns NULL if either allocation fails. Only the "length" field and the
// encoded characters of the returned string are written; no terminating null
// byte is stored. The caller is responsible for freeing the returned string.
//
static SIZED_STRING* _yr_modified_base64_encode(
    SIZED_STRING* in,
    SIZED_STRING* alphabet,
    int i,
    int* pad)
{
  const char* table = alphabet->c_string;
  size_t len = in->length;
  size_t remainder = (i + len) % 3;
  size_t total;
  size_t pos = 0;
  size_t written = 0;
  int prefix = 0;
  SIZED_STRING* out;
  uint8_t* buf;
  uint8_t* dst;

  // Number of bytes missing to complete the last 3-byte group.
  *pad = remainder ? 3 - remainder : 0;

  // Add "i" for the number of prepended bytes.
  out = (SIZED_STRING*) yr_malloc(
      sizeof(SIZED_STRING) + i + ((len * 4 + 3) / 3) + *pad);

  if (out == NULL)
    return NULL;

  buf = (uint8_t*) yr_malloc(sizeof(uint8_t) * (len + i));

  if (buf == NULL)
  {
    yr_free(out);
    return NULL;
  }

  // Scratch buffer layout: "i" filler bytes followed by the input bytes.
  while (prefix < i) buf[prefix++] = 'A';

  memcpy(buf + prefix, in->c_string, len);

  total = len + prefix;
  dst = (uint8_t*) out->c_string;

  // Every complete group of 3 input bytes becomes 4 output characters.
  for (; pos + 3 <= total; pos += 3)
  {
    const uint8_t* group = buf + pos;

    dst[written++] = table[group[0] >> 2];
    dst[written++] = table[((group[0] & 0x03) << 4) | (group[1] >> 4)];
    dst[written++] = table[((group[1] & 0x0f) << 2) | (group[2] >> 6)];
    dst[written++] = table[group[2] & 0x3f];
  }

  // A final group of 1 or 2 bytes is completed with '=' up to 4 characters.
  if (pos < total)
  {
    const uint8_t* group = buf + pos;

    dst[written++] = table[group[0] >> 2];

    if (total - pos == 1)
    {
      dst[written++] = table[(group[0] & 0x03) << 4];
      dst[written++] = '=';
    }
    else
    {
      dst[written++] = table[((group[0] & 0x03) << 4) | (group[1] >> 4)];
      dst[written++] = table[(group[1] & 0x0f) << 2];
    }

    dst[written++] = '=';
  }

  yr_free(buf);
  out->length = (uint32_t) written;

  return out;
}

////////////////////////////////////////////////////////////////////////////////
// Given a base64 encoded string, return a new string with leading and trailing
// bytes stripped appropriately. The number of leading bytes to skip is always
// (i + 1) or zero when no leading bytes are added and the number of trailing
// bytes is always (pad + 1) or zero when pad is zero. Also, convert the final
// string to wide if desired.
//
// Returns NULL if the encoded string is shorter than the number of bytes that
// must be stripped, if memory can not be allocated, or if the conversion to
// wide yields NULL. The caller is responsible for freeing the returned string.
//
// Note: This implementation assumes you only prepend 0, 1 or 2 bytes.
//
static SIZED_STRING* _yr_base64_get_base64_substring(
    SIZED_STRING* encoded_str,
    int wide,
    int i,
    int pad)
{
  SIZED_STRING* new_str;
  SIZED_STRING* final_str;
  char* start;
  uint32_t length;
  int trailing;
  int leading;

  trailing = pad ? pad + 1 : 0;
  leading = i ? i + 1 : 0;

  // The subtraction below is unsigned; without this check an encoded string
  // shorter than the stripped portion would wrap around to a huge length and
  // the memcpy would read far past the end of the source buffer.
  if (encoded_str->length < (uint32_t) (leading + trailing))
    return NULL;

  length = encoded_str->length - (leading + trailing);

  new_str = (SIZED_STRING*) yr_malloc(sizeof(SIZED_STRING) + length);

  if (new_str == NULL)
    return NULL;

  start = encoded_str->c_string + leading;

  memcpy(new_str->c_string, start, length);

  new_str->length = length;
  new_str->c_string[length] = '\0';

  if (wide)
  {
    // The narrow copy is only an intermediate; the wide version is returned.
    final_str = ss_convert_to_wide(new_str);
    yr_free(new_str);
  }
  else
  {
    final_str = new_str;
  }

  return final_str;
}

// RE metacharacters which need to be escaped when generating the final RE.
// The argument is parenthesized so that an expression such as `c & 0x7f` is
// compared as a whole. The argument is evaluated several times, so it must not
// have side effects.
#define IS_METACHAR(x)                                                \
  ((x) == '\\' || (x) == '^' || (x) == '$' || (x) == '|' ||           \
   (x) == '(' || (x) == ')' || (x) == '[' || (x) == ']' ||            \
   (x) == '*' || (x) == '?' || (x) == '{' || (x) == ',' ||            \
   (x) == '.' || (x) == '+' || (x) == '}')

////////////////////////////////////////////////////////////////////////////////
// Return the amount of extra room to reserve for escaping when the string is
// written into the regexp source: one for every RE metacharacter and four for
// every null byte.
//
static int _yr_base64_count_escaped(SIZED_STRING* str)
{
  const char* cursor = str->c_string;
  const char* stop = cursor + str->length;
  int extra = 0;

  for (; cursor != stop; cursor++)
  {
    char ch = *cursor;

    // Null bytes must be escaped too, because they break the RE lexer.
    if (IS_METACHAR(ch))
      extra += 1;
    else if (ch == '\x00')
      extra += 4;
  }

  return extra;
}

////////////////////////////////////////////////////////////////////////////////
// Append to the list delimited by "head" and "tail" one node for each of the
// encodings of "str" obtained by prepending 0, 1 and 2 bytes. Each node holds
// the stripped (and optionally wide) encoded string together with the count
// returned by _yr_base64_count_escaped.
//
// Returns ERROR_SUCCESS, or ERROR_INSUFFICIENT_MEMORY if a node can not be
// allocated. Nodes already linked into the list are left for the caller to
// destroy.
//
static int _yr_base64_create_nodes(
    SIZED_STRING* str,
    SIZED_STRING* alphabet,
    int wide,
    BASE64_NODE** head,
    BASE64_NODE** tail)
{
  int pad;

  for (int prefix = 0; prefix <= 2; prefix++)
  {
    SIZED_STRING* encoded;
    SIZED_STRING* stripped;
    BASE64_NODE* entry;

    // A one byte string with one prepended byte is skipped; presumably
    // because nothing would be left after stripping.
    if (prefix == 1 && str->length == 1)
      continue;

    entry = (BASE64_NODE*) yr_malloc(sizeof(BASE64_NODE));

    if (entry == NULL)
      return ERROR_INSUFFICIENT_MEMORY;

    FAIL_ON_NULL_WITH_CLEANUP(
        encoded = _yr_modified_base64_encode(str, alphabet, prefix, &pad),
        yr_free(entry));

    // Strip the characters of the encoding which are affected by the
    // prepended bytes and by the padding.
    FAIL_ON_NULL_WITH_CLEANUP(
        stripped = _yr_base64_get_base64_substring(encoded, wide, prefix, pad),
        {
          yr_free(encoded);
          yr_free(entry);
        });

    yr_free(encoded);

    entry->str = stripped;
    entry->escaped = _yr_base64_count_escaped(entry->str);
    entry->next = NULL;

    // Link the new node at the end of the list.
    if (*head == NULL)
      *head = entry;

    if (*tail != NULL)
      (*tail)->next = entry;

    *tail = entry;
  }

  return ERROR_SUCCESS;
}

////////////////////////////////////////////////////////////////////////////////
// Useful for printing the encoded strings. One line is written to stdout for
// each node: bytes in the range 32..126 are printed as they are and any other
// byte is printed as a two digit \xNN escape.
//
void _yr_base64_print_nodes(BASE64_NODE* head)
{
  BASE64_NODE* p = head;

  while (p != NULL)
  {
    for (size_t i = 0; i < p->str->length; i++)
    {
      // Convert to unsigned char before printing: where plain char is signed,
      // a byte >= 0x80 would be sign-extended when promoted to int and "%02x"
      // would print it as ffffffNN.
      unsigned char ch = (unsigned char) p->str->c_string[i];

      if (ch >= 32 && ch <= 126)
        printf("%c", ch);
      else
        printf("\\x%02x", ch);
    }
    printf("\n");

    p = p->next;
  }
}

////////////////////////////////////////////////////////////////////////////////
// Free every node of the list starting at "head" together with the string
// each node holds. A NULL list is accepted.
//
static void _yr_base64_destroy_nodes(BASE64_NODE* head)
{
  for (BASE64_NODE* current = head; current != NULL;)
  {
    // Remember the successor before the node itself is released.
    BASE64_NODE* following = current->next;

    yr_free(current->str);
    yr_free(current);

    current = following;
  }
}

////////////////////////////////////////////////////////////////////////////////
// Build the regexp source "(s1|s2|...)" from the strings collected in the
// BASE64_NODE list, escaping RE metacharacters with a backslash and writing
// null bytes as \x00, and hand it to yr_re_parse.
//
// Returns ERROR_INSUFFICIENT_MEMORY if the list is empty or the buffer can not
// be allocated, the error produced by yr_re_parse if parsing fails, and
// ERROR_SUCCESS otherwise.
//
int _yr_base64_create_regexp(
    BASE64_NODE* head,
    RE_AST** re_ast,
    RE_ERROR* re_error)
{
  // Room needed for the strings themselves, including their escapes.
  uint32_t payload = 0;

  // The number of nodes in the list, used to know how many '|'.
  uint32_t count = 0;

  char* re_str;
  char* out;

  for (BASE64_NODE* node = head; node != NULL; node = node->next)
  {
    payload += (node->str->length + node->escaped);
    count++;
  }

  if (count == 0)
    return ERROR_INSUFFICIENT_MEMORY;

  // Make sure to include '(' and ')'.
  // The number of '|' is number of nodes - 1.
  re_str = (char*) yr_malloc(payload + 2 + (count - 1) + 1);

  if (re_str == NULL)
    return ERROR_INSUFFICIENT_MEMORY;

  out = re_str;
  *out++ = '(';

  for (BASE64_NODE* node = head; node != NULL; node = node->next)
  {
    const char* bytes = node->str->c_string;

    for (uint32_t k = 0; k < node->str->length; k++)
    {
      char ch = bytes[k];

      if (IS_METACHAR(ch))
        *out++ = '\\';

      if (ch == '\x00')
      {
        // A raw null byte is replaced by its four character escape.
        memcpy(out, "\\x00", 4);
        out += 4;
      }
      else
      {
        *out++ = ch;
      }
    }

    // Separate alternatives; nothing follows the last one.
    if (node->next != NULL)
      *out++ = '|';
  }

  *out++ = ')';
  *out = '\x00';

  // Useful for debugging as long as the string has no NULL bytes in it. ;)
  // printf("%s\n", re_str);

  FAIL_ON_ERROR_WITH_CLEANUP(
      yr_re_parse(re_str, re_ast, re_error, RE_PARSER_FLAG_NONE), yr_free(re_str));

  yr_free(re_str);

  return ERROR_SUCCESS;
}

////////////////////////////////////////////////////////////////////////////////
// Given a string and an alphabet, generate the RE_AST suitable for representing
// the different encodings of the string. This means we generate
// "(ABCD|EFGH|IJKL)" and must be careful to escape any special characters as
// a result of the base64 encoding.
//
// The flags in "modifier" select which variants are generated: for each of
// the wide, ascii and plain (neither wide nor ascii) forms of the input,
// STRING_FLAGS_BASE64 adds the narrow encodings and STRING_FLAGS_BASE64_WIDE
// adds the wide encodings.
//
// This uses ideas from:
// https://www.leeholmes.com/blog/2019/12/10/searching-for-content-in-base-64-strings-2/
//
// This does not emit the code for the RE. A further call to yr_re_ast_emit_code
// is required to get the code.
//
// Returns ERROR_SUCCESS, or the error reported by the failing step. When node
// creation or the wide conversion fails a description is copied into
// error->message.
//
int yr_base64_ast_from_string(
    SIZED_STRING* in_str,
    YR_MODIFIER modifier,
    RE_AST** re_ast,
    RE_ERROR* error)
{
  BASE64_NODE* head = NULL;
  BASE64_NODE* tail = NULL;
  SIZED_STRING* wide_str;

  if (modifier.flags & STRING_FLAGS_WIDE)
  {
    wide_str = ss_convert_to_wide(in_str);

    // A NULL result must not be passed on: _yr_base64_create_nodes
    // dereferences the string it receives. The list is still empty here, so
    // there is nothing to destroy.
    if (wide_str == NULL)
    {
      strcpy(error->message, "Failure converting string to wide");
      return ERROR_INSUFFICIENT_MEMORY;
    }

    if (modifier.flags & STRING_FLAGS_BASE64)
    {
      FAIL_ON_ERROR_WITH_CLEANUP(
          _yr_base64_create_nodes(wide_str, modifier.alphabet, 0, &head, &tail),
          {  // Cleanup
            strcpy(error->message, "Failure encoding base64 wide string");
            yr_free(wide_str);
            _yr_base64_destroy_nodes(head);
          });
    }

    if (modifier.flags & STRING_FLAGS_BASE64_WIDE)
    {
      FAIL_ON_ERROR_WITH_CLEANUP(
          _yr_base64_create_nodes(wide_str, modifier.alphabet, 1, &head, &tail),
          {  // Cleanup
            strcpy(error->message, "Failure encoding base64wide wide string");
            yr_free(wide_str);
            _yr_base64_destroy_nodes(head);
          });
    }

    // The nodes hold their own strings, the wide copy is no longer needed.
    yr_free(wide_str);
  }

  if (modifier.flags & STRING_FLAGS_ASCII)
  {
    if (modifier.flags & STRING_FLAGS_BASE64)
    {
      FAIL_ON_ERROR_WITH_CLEANUP(
          _yr_base64_create_nodes(in_str, modifier.alphabet, 0, &head, &tail),
          {  // Cleanup
            strcpy(error->message, "Failure encoding base64 ascii string");
            _yr_base64_destroy_nodes(head);
          });
    }

    if (modifier.flags & STRING_FLAGS_BASE64_WIDE)
    {
      FAIL_ON_ERROR_WITH_CLEANUP(
          _yr_base64_create_nodes(in_str, modifier.alphabet, 1, &head, &tail),
          {  // Cleanup
            strcpy(error->message, "Failure encoding base64wide ascii string");
            _yr_base64_destroy_nodes(head);
          });
    }
  }

  // Neither wide nor ascii was requested: encode the string as given.
  if (!(modifier.flags & STRING_FLAGS_WIDE) &&
      !(modifier.flags & STRING_FLAGS_ASCII))
  {
    if (modifier.flags & STRING_FLAGS_BASE64)
    {
      FAIL_ON_ERROR_WITH_CLEANUP(
          _yr_base64_create_nodes(in_str, modifier.alphabet, 0, &head, &tail),
          {  // Cleanup
            strcpy(error->message, "Failure encoding base64 string");
            _yr_base64_destroy_nodes(head);
          });
    }

    if (modifier.flags & STRING_FLAGS_BASE64_WIDE)
    {
      FAIL_ON_ERROR_WITH_CLEANUP(
          _yr_base64_create_nodes(in_str, modifier.alphabet, 1, &head, &tail),
          {  // Cleanup
            strcpy(error->message, "Failure encoding base64wide string");
            _yr_base64_destroy_nodes(head);
          });
    }
  }

  // Useful for printing the contents of the nodes, to make sure they were
  // encoded and stripped properly.
  //_yr_base64_print_nodes(head);

  // Create the final regex string to be parsed from all the nodes.
  // Error message is filled in by the caller in case of failure.
  FAIL_ON_ERROR_WITH_CLEANUP(
      _yr_base64_create_regexp(head, re_ast, error),
      _yr_base64_destroy_nodes(head));

  _yr_base64_destroy_nodes(head);

  return ERROR_SUCCESS;
}

Coverage Report

Created: 2023-09-25 07:13

Line	Count	Source (jump to first uncovered line)
1		/*
2		Copyright (c) 2020. The YARA Authors. All Rights Reserved.
3
4		Redistribution and use in source and binary forms, with or without modification,
5		are permitted provided that the following conditions are met:
6
7		1. Redistributions of source code must retain the above copyright notice, this
8		list of conditions and the following disclaimer.
9
10		2. Redistributions in binary form must reproduce the above copyright notice,
11		this list of conditions and the following disclaimer in the documentation and/or
12		other materials provided with the distribution.
13
14		3. Neither the name of the copyright holder nor the names of its contributors
15		may be used to endorse or promote products derived from this software without
16		specific prior written permission.
17
18		THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19		ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20		WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21		DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22		ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23		(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24		LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25		ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26		(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27		SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28		*/
29
30		#include <string.h>
31		#include <yara/base64.h>
32		#include <yara/error.h>
33		#include <yara/mem.h>
34		#include <yara/re.h>
35		#include <yara/sizedstr.h>
36
37		////////////////////////////////////////////////////////////////////////////////
38		// Given a pointer to a SIZED_STRING append 0, 1 or 2 bytes and base64 encode
39		// the string. The number of padding bytes is returned in "pad" and the caller
40		// is expected to trim the appropriate number of leading and trailing bytes.
41		//
42		// This is based upon the ideas at:
43		// https://www.leeholmes.com/blog/2019/12/10/searching-for-content-in-base-64-strings-2/
44		//
45		// The caller is responsible for freeing the returned string.
46		//
47		static SIZED_STRING* _yr_modified_base64_encode(
48		SIZED_STRING* in,
49		SIZED_STRING* alphabet,
50		int i,
51		int* pad)
52	4.86k	{
53	4.86k	uint8_t* src = (uint8_t*) in->c_string;
54	4.86k	size_t len = in->length;
55	4.86k	SIZED_STRING* out;
56	4.86k	uint8_t* p;
57	4.86k	uint8_t* end;
58	4.86k	char* alphabet_str = alphabet->c_string;
59	4.86k	uint8_t* tmp;
60	4.86k	int j;
61
62	4.86k	*pad = ((i + len) % 3) ? 3 - ((i + len) % 3) : 0;
63
64		// Add "i" for the number of prepended bytes.
65	4.86k	out = (SIZED_STRING*) yr_malloc(
66	4.86k	sizeof(SIZED_STRING) + i + ((len * 4 + 3) / 3) + *pad);
67
68	4.86k	if (out == NULL)
69	0	return NULL;
70
71	4.86k	tmp = (uint8_t*) yr_malloc(sizeof(uint8_t) * (len + i));
72	4.86k	if (tmp == NULL)
73	0	{
74	0	yr_free(out);
75	0	return NULL;
76	0	}
77
78		// Prepend appropriate number of bytes and copy remaining input bytes into
79		// temporary buffer.
80	9.72k	for (j = 0; j < i; j++) tmp[j] = 'A';
81
82	4.86k	memcpy(tmp + j, src, len);
83	4.86k	src = tmp;
84
85	4.86k	p = (uint8_t*) out->c_string;
86	4.86k	end = src + len + j;
87
88	315k	while (end - src >= 3)
89	310k	{
90	310k	*p++ = alphabet_str[src[0] >> 2];
91	310k	*p++ = alphabet_str[((src[0] & 0x03) << 4 | src[1] >> 4)];
92	310k	*p++ = alphabet_str[((src[1] & 0x0f) << 2 | (src[2] >> 6))];
93	310k	*p++ = alphabet_str[src[2] & 0x3f];
94	310k	src += 3;
95	310k	}
96
97		// Handle remaining bytes and padding.
98	4.86k	if (end - src)
99	3.14k	{
100	3.14k	*p++ = alphabet_str[src[0] >> 2];
101	3.14k	if (end - src == 1)
102	1.72k	{
103	1.72k	*p++ = alphabet_str[(src[0] & 0x03) << 4];
104	1.72k	*p++ = '=';
105	1.72k	}
106	1.42k	else
107	1.42k	{
108	1.42k	*p++ = alphabet_str[((src[0] & 0x03) << 4 | src[1] >> 4)];
109	1.42k	*p++ = alphabet_str[(src[1] & 0x0f) << 2];
110	1.42k	}
111	3.14k	*p++ = '=';
112	3.14k	}
113
114	4.86k	yr_free(tmp);
115	4.86k	out->length = (uint32_t)(p - (uint8_t*) out->c_string);
116
117	4.86k	return out;
118	4.86k	}
119
120		////////////////////////////////////////////////////////////////////////////////
121		// Given a base64 encoded string, return a new string with leading and trailing
122		// bytes stripped appropriately. The number of leading bytes to skip is always
123		// (i + 1) or zero when no leading bytes are added and the number of trailing
124		// bytes is always (pad + 1) or zero when pad is zero. Also, convert the final
125		// string to wide if desired.
126		//
127		// Note: This implementation assumes you only prepend 0, 1 or 2 bytes.
128		//
129		static SIZED_STRING* _yr_base64_get_base64_substring(
130		SIZED_STRING* encoded_str,
131		int wide,
132		int i,
133		int pad)
134	4.86k	{
135	4.86k	SIZED_STRING* new_str;
136	4.86k	SIZED_STRING* final_str;
137	4.86k	char* start;
138	4.86k	uint32_t length;
139	4.86k	int trailing;
140	4.86k	int leading;
141
142	4.86k	trailing = pad ? pad + 1 : 0;
143	4.86k	leading = i ? i + 1 : 0;
144
145	4.86k	length = encoded_str->length - (leading + trailing);
146
147	4.86k	new_str = (SIZED_STRING*) yr_malloc(sizeof(SIZED_STRING) + length);
148
149	4.86k	if (new_str == NULL)
150	0	return NULL;
151
152	4.86k	start = encoded_str->c_string + leading;
153
154	4.86k	memcpy(new_str->c_string, start, length);
155
156	4.86k	new_str->length = length;
157	4.86k	new_str->c_string[length] = '\0';
158
159	4.86k	if (wide)
160	2.22k	{
161	2.22k	final_str = ss_convert_to_wide(new_str);
162	2.22k	yr_free(new_str);
163	2.22k	}
164	2.64k	else
165	2.64k	{
166	2.64k	final_str = new_str;
167	2.64k	}
168
169	4.86k	return final_str;
170	4.86k	}
171
172		// RE metacharacters which need to be escaped when generating the final RE.
173		#define IS_METACHAR(x) \
174	3.94M	(x == '\\' || x == '^' || x == '$' || x == '|' || x == '(' || x == ')' || \
175	3.94M	x == '[' || x == ']' || x == '*' || x == '?' || x == '{' || x == ',' || \
176	3.94M	x == '.' || x == '+' || x == '}')
177
178		////////////////////////////////////////////////////////////////////////////////
179		// Given a SIZED_STRING return the number of characters which will need to be
180		// escaped when generating the final string to pass to the regexp compiler.
181		//
182		static int _yr_base64_count_escaped(SIZED_STRING* str)
183	4.86k	{
184	4.86k	int c = 0;
185
186	1.97M	for (uint32_t i = 0; i < str->length; i++)
187	1.97M	{
188		// We must be careful to escape null bytes because they break the RE lexer.
189	1.97M	if (IS_METACHAR(str->c_string[i]))
190	34.5k	c++;
191	1.93M	else if (str->c_string[i] == '\x00')
192	734k	c += 4;
193	1.97M	}
194
195	4.86k	return c;
196	4.86k	}
197
198		////////////////////////////////////////////////////////////////////////////////
199		// Create nodes representing the different encodings of a base64 string.
200		//
201		static int _yr_base64_create_nodes(
202		SIZED_STRING* str,
203		SIZED_STRING* alphabet,
204		int wide,
205		BASE64_NODE** head,
206		BASE64_NODE** tail)
207	1.72k	{
208	1.72k	SIZED_STRING* encoded_str;
209	1.72k	SIZED_STRING* final_str;
210	1.72k	BASE64_NODE* node;
211
212	1.72k	int pad;
213
214	6.88k	for (int i = 0; i <= 2; i++)
215	5.16k	{
216	5.16k	if (i == 1 && str->length == 1)
217	297	continue;
218
219	4.86k	node = (BASE64_NODE*) yr_malloc(sizeof(BASE64_NODE));
220	4.86k	if (node == NULL)
221	0	return ERROR_INSUFFICIENT_MEMORY;
222
223	4.86k	FAIL_ON_NULL_WITH_CLEANUP(
224	4.86k	encoded_str = _yr_modified_base64_encode(str, alphabet, i, &pad),
225	4.86k	yr_free(node));
226
227		// Now take the encoded string and strip the bytes which are affected by
228		// the leading and trailing bytes of the plaintext.
229	4.86k	FAIL_ON_NULL_WITH_CLEANUP(
230	4.86k	final_str = _yr_base64_get_base64_substring(encoded_str, wide, i, pad),
231	4.86k	{
232	4.86k	yr_free(encoded_str);
233	4.86k	yr_free(node);
234	4.86k	});
235
236	4.86k	yr_free(encoded_str);
237
238	4.86k	node->str = final_str;
239	4.86k	node->escaped = _yr_base64_count_escaped(node->str);
240	4.86k	node->next = NULL;
241
242	4.86k	if (*head == NULL)
243	1.44k	*head = node;
244
245	4.86k	if (*tail == NULL)
246	1.44k	{
247	1.44k	*tail = node;
248	1.44k	}
249	3.41k	else
250	3.41k	{
251	3.41k	(*tail)->next = node;
252	3.41k	*tail = node;
253	3.41k	}
254	4.86k	}
255
256	1.72k	return ERROR_SUCCESS;
257	1.72k	}
258
259		////////////////////////////////////////////////////////////////////////////////
260		// Useful for printing the encoded strings.
261		//
262		void _yr_base64_print_nodes(BASE64_NODE* head)
263	0	{
264	0	BASE64_NODE* p = head;
265
266	0	while (p != NULL)
267	0	{
268	0	for (size_t i = 0; i < p->str->length; i++)
269	0	{
270	0	if (p->str->c_string[i] >= 32 && p->str->c_string[i] <= 126)
271	0	printf("%c", p->str->c_string[i]);
272	0	else
273	0	printf("\\x%02x", p->str->c_string[i]);
274	0	}
275	0	printf("\n");
276
277	0	p = p->next;
278	0	}
279	0	}
280
281		////////////////////////////////////////////////////////////////////////////////
282		// Destroy a list of base64 nodes.
283		//
284		static void _yr_base64_destroy_nodes(BASE64_NODE* head)
285	1.44k	{
286	1.44k	BASE64_NODE* p = head;
287	1.44k	BASE64_NODE* next;
288
289	6.30k	while (p != NULL)
290	4.86k	{
291	4.86k	yr_free(p->str);
292	4.86k	next = p->next;
293	4.86k	yr_free(p);
294	4.86k	p = next;
295	4.86k	}
296	1.44k	}
297
298		////////////////////////////////////////////////////////////////////////////////
299		// Create the regexp that is the alternatives of each of the strings collected
300		// in the BASE64_NODE list.
301		//
302		int _yr_base64_create_regexp(
303		BASE64_NODE* head,
304		RE_AST** re_ast,
305		RE_ERROR* re_error)
306	1.44k	{
307	1.44k	BASE64_NODE* p = head;
308	1.44k	char* re_str;
309	1.44k	char* s;
310	1.44k	uint32_t length = 0;
311
312		// The number of nodes in the list, used to know how many '|'.
313	1.44k	uint32_t c = 0;
314
315	6.30k	while (p != NULL)
316	4.86k	{
317	4.86k	length += (p->str->length + p->escaped);
318	4.86k	c++;
319	4.86k	p = p->next;
320	4.86k	}
321
322	1.44k	if (c == 0)
323	0	return ERROR_INSUFFICIENT_MEMORY;
324
325		// Make sure to include '(' and ')'.
326		// The number of '|' is number of nodes - 1.
327	1.44k	re_str = (char*) yr_malloc(length + 2 + (c - 1) + 1);
328	1.44k	if (re_str == NULL)
329	0	return ERROR_INSUFFICIENT_MEMORY;
330
331	1.44k	s = re_str;
332	1.44k	p = head;
333	1.44k	*s++ = '(';
334	6.30k	while (p != NULL)
335	4.86k	{
336	1.97M	for (uint32_t i = 0; i < p->str->length; i++)
337	1.97M	{
338	1.97M	if (IS_METACHAR(p->str->c_string[i]))
339	34.5k	*s++ = '\\';
340
341	1.97M	if (p->str->c_string[i] == '\x00')
342	734k	{
343	734k	*s++ = '\\';
344	734k	*s++ = 'x';
345	734k	*s++ = '0';
346	734k	*s++ = '0';
347	734k	}
348	1.23M	else
349	1.23M	*s++ = p->str->c_string[i];
350	1.97M	}
351
352	4.86k	if (p->next != NULL)
353	3.41k	*s++ = '|';
354
355	4.86k	p = p->next;
356	4.86k	}
357	1.44k	*s++ = ')';
358	1.44k	*s = '\x00';
359
360		// Useful for debugging as long as the string has no NULL bytes in it. ;)
361		// printf("%s\n", re_str);
362
363	1.44k	FAIL_ON_ERROR_WITH_CLEANUP(
364	1.44k	yr_re_parse(re_str, re_ast, re_error, RE_PARSER_FLAG_NONE), yr_free(re_str));
365
366	1.44k	yr_free(re_str);
367
368	1.44k	return ERROR_SUCCESS;
369	1.44k	}
370
371		////////////////////////////////////////////////////////////////////////////////
372		// Given a string and an alphabet, generate the RE_AST suitable for representing
373		// the different encodings of the string. This means we generate
374		// "(ABCD|EFGH|IJKL)" and must be careful to escape any special characters as
375		// a result of the base64 encoding.
376		//
377		// This uses ideas from:
378		// https://www.leeholmes.com/blog/2019/12/10/searching-for-content-in-base-64-strings-2/
379		//
380		// This does not emit the code for the RE. A further call to yr_re_ast_emit_code
381		// is required to get the code.
382		//
383		int yr_base64_ast_from_string(
384		SIZED_STRING* in_str,
385		YR_MODIFIER modifier,
386		RE_AST** re_ast,
387		RE_ERROR* error)
388	1.44k	{
389	1.44k	BASE64_NODE* head = NULL;
390	1.44k	BASE64_NODE* tail = NULL;
391	1.44k	SIZED_STRING* wide_str;
392
393	1.44k	if (modifier.flags & STRING_FLAGS_WIDE)
394	488	{
395	488	wide_str = ss_convert_to_wide(in_str);
396
397	488	if (modifier.flags & STRING_FLAGS_BASE64)
398	310	{
399	310	FAIL_ON_ERROR_WITH_CLEANUP(
400	310	_yr_base64_create_nodes(wide_str, modifier.alphabet, 0, &head, &tail),
401	310	{ // Cleanup
402	310	strcpy(error->message, "Failure encoding base64 wide string");
403	310	yr_free(wide_str);
404	310	_yr_base64_destroy_nodes(head);
405	310	});
406	310	}
407
408	488	if (modifier.flags & STRING_FLAGS_BASE64_WIDE)
409	184	{
410	184	FAIL_ON_ERROR_WITH_CLEANUP(
411	184	_yr_base64_create_nodes(wide_str, modifier.alphabet, 1, &head, &tail),
412	184	{ // Cleanup
413	184	strcpy(error->message, "Failure encoding base64wide wide string");
414	184	yr_free(wide_str);
415	184	_yr_base64_destroy_nodes(head);
416	184	});
417	184	}
418
419	488	yr_free(wide_str);
420	488	}
421
422	1.44k	if (modifier.flags & STRING_FLAGS_ASCII)
423	205	{
424	205	if (modifier.flags & STRING_FLAGS_BASE64)
425	179	{
426	179	FAIL_ON_ERROR_WITH_CLEANUP(
427	179	_yr_base64_create_nodes(in_str, modifier.alphabet, 0, &head, &tail),
428	179	{ // Cleanup
429	179	strcpy(error->message, "Failure encoding base64 ascii string");
430	179	_yr_base64_destroy_nodes(head);
431	179	});
432	179	}
433
434	205	if (modifier.flags & STRING_FLAGS_BASE64_WIDE)
435	26	{
436	26	FAIL_ON_ERROR_WITH_CLEANUP(
437	26	_yr_base64_create_nodes(in_str, modifier.alphabet, 1, &head, &tail),
438	26	{ // Cleanup
439	26	strcpy(error->message, "Failure encoding base64wide ascii string");
440	26	_yr_base64_destroy_nodes(head);
441	26	});
442	26	}
443	205	}
444
445	1.44k	if (!(modifier.flags & STRING_FLAGS_WIDE) &&
446	1.44k	!(modifier.flags & STRING_FLAGS_ASCII))
447	753	{
448	753	if (modifier.flags & STRING_FLAGS_BASE64)
449	465	{
450	465	FAIL_ON_ERROR_WITH_CLEANUP(
451	465	_yr_base64_create_nodes(in_str, modifier.alphabet, 0, &head, &tail),
452	465	{ // Cleanup
453	465	strcpy(error->message, "Failure encoding base64 string");
454	465	_yr_base64_destroy_nodes(head);
455	465	});
456	465	}
457
458	753	if (modifier.flags & STRING_FLAGS_BASE64_WIDE)
459	556	{
460	556	FAIL_ON_ERROR_WITH_CLEANUP(
461	556	_yr_base64_create_nodes(in_str, modifier.alphabet, 1, &head, &tail),
462	556	{ // Cleanup
463	556	strcpy(error->message, "Failure encoding base64wide string");
464	556	_yr_base64_destroy_nodes(head);
465	556	});
466	556	}
467	753	}
468
469		// Useful for printing the contents of the nodes, to make sure they were
470		// encoded and stripped properly.
471		//_yr_base64_print_nodes(head);
472
473		// Create the final regex string to be parsed from all the nodes.
474		// Error message is filled in by the caller in case of failure.
475	1.44k	FAIL_ON_ERROR_WITH_CLEANUP(
476	1.44k	_yr_base64_create_regexp(head, re_ast, error),
477	1.44k	_yr_base64_destroy_nodes(head));
478
479	1.44k	_yr_base64_destroy_nodes(head);
480
481	1.44k	return ERROR_SUCCESS;
482	1.44k	}