/src/yara/libyara/base64.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | Copyright (c) 2020. The YARA Authors. All Rights Reserved. |
3 | | |
4 | | Redistribution and use in source and binary forms, with or without modification, |
5 | | are permitted provided that the following conditions are met: |
6 | | |
7 | | 1. Redistributions of source code must retain the above copyright notice, this |
8 | | list of conditions and the following disclaimer. |
9 | | |
10 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
11 | | this list of conditions and the following disclaimer in the documentation and/or |
12 | | other materials provided with the distribution. |
13 | | |
14 | | 3. Neither the name of the copyright holder nor the names of its contributors |
15 | | may be used to endorse or promote products derived from this software without |
16 | | specific prior written permission. |
17 | | |
18 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
19 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
20 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
21 | | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR |
22 | | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
23 | | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
24 | | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON |
25 | | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
26 | | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
27 | | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
28 | | */ |
29 | | |
30 | | #include <string.h> |
31 | | #include <yara/base64.h> |
32 | | #include <yara/error.h> |
33 | | #include <yara/mem.h> |
34 | | #include <yara/re.h> |
35 | | #include <yara/sizedstr.h> |
36 | | |
37 | | //////////////////////////////////////////////////////////////////////////////// |
38 | | // Given a pointer to a SIZED_STRING append 0, 1 or 2 bytes and base64 encode |
39 | | // the string. The number of padding bytes is returned in "pad" and the caller |
40 | | // is expected to trim the appropriate number of leading and trailing bytes. |
41 | | // |
42 | | // This is based upon the ideas at: |
43 | | // https://www.leeholmes.com/blog/2019/12/10/searching-for-content-in-base-64-strings-2/ |
44 | | // |
45 | | // The caller is responsible for freeing the returned string. |
46 | | // |
47 | | static SIZED_STRING* _yr_modified_base64_encode( |
48 | | SIZED_STRING* in, |
49 | | SIZED_STRING* alphabet, |
50 | | int i, |
51 | | int* pad) |
52 | 4.86k | { |
53 | 4.86k | uint8_t* src = (uint8_t*) in->c_string; |
54 | 4.86k | size_t len = in->length; |
55 | 4.86k | SIZED_STRING* out; |
56 | 4.86k | uint8_t* p; |
57 | 4.86k | uint8_t* end; |
58 | 4.86k | char* alphabet_str = alphabet->c_string; |
59 | 4.86k | uint8_t* tmp; |
60 | 4.86k | int j; |
61 | | |
62 | 4.86k | *pad = ((i + len) % 3) ? 3 - ((i + len) % 3) : 0; |
63 | | |
64 | | // Add "i" for the number of prepended bytes. |
65 | 4.86k | out = (SIZED_STRING*) yr_malloc( |
66 | 4.86k | sizeof(SIZED_STRING) + i + ((len * 4 + 3) / 3) + *pad); |
67 | | |
68 | 4.86k | if (out == NULL) |
69 | 0 | return NULL; |
70 | | |
71 | 4.86k | tmp = (uint8_t*) yr_malloc(sizeof(uint8_t) * (len + i)); |
72 | 4.86k | if (tmp == NULL) |
73 | 0 | { |
74 | 0 | yr_free(out); |
75 | 0 | return NULL; |
76 | 0 | } |
77 | | |
78 | | // Prepend appropriate number of bytes and copy remaining input bytes into |
79 | | // temporary buffer. |
80 | 9.72k | for (j = 0; j < i; j++) tmp[j] = 'A'; |
81 | | |
82 | 4.86k | memcpy(tmp + j, src, len); |
83 | 4.86k | src = tmp; |
84 | | |
85 | 4.86k | p = (uint8_t*) out->c_string; |
86 | 4.86k | end = src + len + j; |
87 | | |
88 | 315k | while (end - src >= 3) |
89 | 310k | { |
90 | 310k | *p++ = alphabet_str[src[0] >> 2]; |
91 | 310k | *p++ = alphabet_str[((src[0] & 0x03) << 4 | src[1] >> 4)]; |
92 | 310k | *p++ = alphabet_str[((src[1] & 0x0f) << 2 | (src[2] >> 6))]; |
93 | 310k | *p++ = alphabet_str[src[2] & 0x3f]; |
94 | 310k | src += 3; |
95 | 310k | } |
96 | | |
97 | | // Handle remaining bytes and padding. |
98 | 4.86k | if (end - src) |
99 | 3.14k | { |
100 | 3.14k | *p++ = alphabet_str[src[0] >> 2]; |
101 | 3.14k | if (end - src == 1) |
102 | 1.72k | { |
103 | 1.72k | *p++ = alphabet_str[(src[0] & 0x03) << 4]; |
104 | 1.72k | *p++ = '='; |
105 | 1.72k | } |
106 | 1.42k | else |
107 | 1.42k | { |
108 | 1.42k | *p++ = alphabet_str[((src[0] & 0x03) << 4 | src[1] >> 4)]; |
109 | 1.42k | *p++ = alphabet_str[(src[1] & 0x0f) << 2]; |
110 | 1.42k | } |
111 | 3.14k | *p++ = '='; |
112 | 3.14k | } |
113 | | |
114 | 4.86k | yr_free(tmp); |
115 | 4.86k | out->length = (uint32_t)(p - (uint8_t*) out->c_string); |
116 | | |
117 | 4.86k | return out; |
118 | 4.86k | } |
119 | | |
120 | | //////////////////////////////////////////////////////////////////////////////// |
121 | | // Given a base64 encoded string, return a new string with leading and trailing |
122 | | // bytes stripped appropriately. The number of leading bytes to skip is always |
123 | | // (i + 1) or zero when no leading bytes are added and the number of trailing |
124 | | // bytes is always (pad + 1) or zero when pad is zero. Also, convert the final |
125 | | // string to wide if desired. |
126 | | // |
127 | | // Note: This implementation assumes you only prepend 0, 1 or 2 bytes. |
128 | | // |
129 | | static SIZED_STRING* _yr_base64_get_base64_substring( |
130 | | SIZED_STRING* encoded_str, |
131 | | int wide, |
132 | | int i, |
133 | | int pad) |
134 | 4.86k | { |
135 | 4.86k | SIZED_STRING* new_str; |
136 | 4.86k | SIZED_STRING* final_str; |
137 | 4.86k | char* start; |
138 | 4.86k | uint32_t length; |
139 | 4.86k | int trailing; |
140 | 4.86k | int leading; |
141 | | |
142 | 4.86k | trailing = pad ? pad + 1 : 0; |
143 | 4.86k | leading = i ? i + 1 : 0; |
144 | | |
145 | 4.86k | length = encoded_str->length - (leading + trailing); |
146 | | |
147 | 4.86k | new_str = (SIZED_STRING*) yr_malloc(sizeof(SIZED_STRING) + length); |
148 | | |
149 | 4.86k | if (new_str == NULL) |
150 | 0 | return NULL; |
151 | | |
152 | 4.86k | start = encoded_str->c_string + leading; |
153 | | |
154 | 4.86k | memcpy(new_str->c_string, start, length); |
155 | | |
156 | 4.86k | new_str->length = length; |
157 | 4.86k | new_str->c_string[length] = '\0'; |
158 | | |
159 | 4.86k | if (wide) |
160 | 2.22k | { |
161 | 2.22k | final_str = ss_convert_to_wide(new_str); |
162 | 2.22k | yr_free(new_str); |
163 | 2.22k | } |
164 | 2.64k | else |
165 | 2.64k | { |
166 | 2.64k | final_str = new_str; |
167 | 2.64k | } |
168 | | |
169 | 4.86k | return final_str; |
170 | 4.86k | } |
171 | | |
// RE metacharacters which need to be escaped when generating the final RE.
// The argument is parenthesized at every use so that expression arguments
// (e.g. "c | 0") are compared as a whole. It is evaluated several times, so it
// must be free of side effects.
#define IS_METACHAR(x)                                                    \
  ((x) == '\\' || (x) == '^' || (x) == '$' || (x) == '|' || (x) == '(' || \
   (x) == ')' || (x) == '[' || (x) == ']' || (x) == '*' || (x) == '?' ||  \
   (x) == '{' || (x) == ',' || (x) == '.' || (x) == '+' || (x) == '}')
177 | | |
178 | | //////////////////////////////////////////////////////////////////////////////// |
179 | | // Given a SIZED_STRING return the number of characters which will need to be |
180 | | // escaped when generating the final string to pass to the regexp compiler. |
181 | | // |
182 | | static int _yr_base64_count_escaped(SIZED_STRING* str) |
183 | 4.86k | { |
184 | 4.86k | int c = 0; |
185 | | |
186 | 1.97M | for (uint32_t i = 0; i < str->length; i++) |
187 | 1.97M | { |
188 | | // We must be careful to escape null bytes because they break the RE lexer. |
189 | 1.97M | if (IS_METACHAR(str->c_string[i])) |
190 | 34.5k | c++; |
191 | 1.93M | else if (str->c_string[i] == '\x00') |
192 | 734k | c += 4; |
193 | 1.97M | } |
194 | | |
195 | 4.86k | return c; |
196 | 4.86k | } |
197 | | |
198 | | //////////////////////////////////////////////////////////////////////////////// |
199 | | // Create nodes representing the different encodings of a base64 string. |
200 | | // |
201 | | static int _yr_base64_create_nodes( |
202 | | SIZED_STRING* str, |
203 | | SIZED_STRING* alphabet, |
204 | | int wide, |
205 | | BASE64_NODE** head, |
206 | | BASE64_NODE** tail) |
207 | 1.72k | { |
208 | 1.72k | SIZED_STRING* encoded_str; |
209 | 1.72k | SIZED_STRING* final_str; |
210 | 1.72k | BASE64_NODE* node; |
211 | | |
212 | 1.72k | int pad; |
213 | | |
214 | 6.88k | for (int i = 0; i <= 2; i++) |
215 | 5.16k | { |
216 | 5.16k | if (i == 1 && str->length == 1) |
217 | 297 | continue; |
218 | | |
219 | 4.86k | node = (BASE64_NODE*) yr_malloc(sizeof(BASE64_NODE)); |
220 | 4.86k | if (node == NULL) |
221 | 0 | return ERROR_INSUFFICIENT_MEMORY; |
222 | | |
223 | 4.86k | FAIL_ON_NULL_WITH_CLEANUP( |
224 | 4.86k | encoded_str = _yr_modified_base64_encode(str, alphabet, i, &pad), |
225 | 4.86k | yr_free(node)); |
226 | | |
227 | | // Now take the encoded string and strip the bytes which are affected by |
228 | | // the leading and trailing bytes of the plaintext. |
229 | 4.86k | FAIL_ON_NULL_WITH_CLEANUP( |
230 | 4.86k | final_str = _yr_base64_get_base64_substring(encoded_str, wide, i, pad), |
231 | 4.86k | { |
232 | 4.86k | yr_free(encoded_str); |
233 | 4.86k | yr_free(node); |
234 | 4.86k | }); |
235 | | |
236 | 4.86k | yr_free(encoded_str); |
237 | | |
238 | 4.86k | node->str = final_str; |
239 | 4.86k | node->escaped = _yr_base64_count_escaped(node->str); |
240 | 4.86k | node->next = NULL; |
241 | | |
242 | 4.86k | if (*head == NULL) |
243 | 1.44k | *head = node; |
244 | | |
245 | 4.86k | if (*tail == NULL) |
246 | 1.44k | { |
247 | 1.44k | *tail = node; |
248 | 1.44k | } |
249 | 3.41k | else |
250 | 3.41k | { |
251 | 3.41k | (*tail)->next = node; |
252 | 3.41k | *tail = node; |
253 | 3.41k | } |
254 | 4.86k | } |
255 | | |
256 | 1.72k | return ERROR_SUCCESS; |
257 | 1.72k | } |
258 | | |
259 | | //////////////////////////////////////////////////////////////////////////////// |
260 | | // Useful for printing the encoded strings. |
261 | | // |
262 | | void _yr_base64_print_nodes(BASE64_NODE* head) |
263 | 0 | { |
264 | 0 | BASE64_NODE* p = head; |
265 | |
|
266 | 0 | while (p != NULL) |
267 | 0 | { |
268 | 0 | for (size_t i = 0; i < p->str->length; i++) |
269 | 0 | { |
270 | 0 | if (p->str->c_string[i] >= 32 && p->str->c_string[i] <= 126) |
271 | 0 | printf("%c", p->str->c_string[i]); |
272 | 0 | else |
273 | 0 | printf("\\x%02x", p->str->c_string[i]); |
274 | 0 | } |
275 | 0 | printf("\n"); |
276 | |
|
277 | 0 | p = p->next; |
278 | 0 | } |
279 | 0 | } |
280 | | |
281 | | //////////////////////////////////////////////////////////////////////////////// |
282 | | // Destroy a list of base64 nodes. |
283 | | // |
284 | | static void _yr_base64_destroy_nodes(BASE64_NODE* head) |
285 | 1.44k | { |
286 | 1.44k | BASE64_NODE* p = head; |
287 | 1.44k | BASE64_NODE* next; |
288 | | |
289 | 6.30k | while (p != NULL) |
290 | 4.86k | { |
291 | 4.86k | yr_free(p->str); |
292 | 4.86k | next = p->next; |
293 | 4.86k | yr_free(p); |
294 | 4.86k | p = next; |
295 | 4.86k | } |
296 | 1.44k | } |
297 | | |
298 | | //////////////////////////////////////////////////////////////////////////////// |
299 | | // Create the regexp that is the alternatives of each of the strings collected |
300 | | // in the BASE64_NODE list. |
301 | | // |
302 | | int _yr_base64_create_regexp( |
303 | | BASE64_NODE* head, |
304 | | RE_AST** re_ast, |
305 | | RE_ERROR* re_error) |
306 | 1.44k | { |
307 | 1.44k | BASE64_NODE* p = head; |
308 | 1.44k | char* re_str; |
309 | 1.44k | char* s; |
310 | 1.44k | uint32_t length = 0; |
311 | | |
312 | | // The number of nodes in the list, used to know how many '|'. |
313 | 1.44k | uint32_t c = 0; |
314 | | |
315 | 6.30k | while (p != NULL) |
316 | 4.86k | { |
317 | 4.86k | length += (p->str->length + p->escaped); |
318 | 4.86k | c++; |
319 | 4.86k | p = p->next; |
320 | 4.86k | } |
321 | | |
322 | 1.44k | if (c == 0) |
323 | 0 | return ERROR_INSUFFICIENT_MEMORY; |
324 | | |
325 | | // Make sure to include '(' and ')'. |
326 | | // The number of '|' is number of nodes - 1. |
327 | 1.44k | re_str = (char*) yr_malloc(length + 2 + (c - 1) + 1); |
328 | 1.44k | if (re_str == NULL) |
329 | 0 | return ERROR_INSUFFICIENT_MEMORY; |
330 | | |
331 | 1.44k | s = re_str; |
332 | 1.44k | p = head; |
333 | 1.44k | *s++ = '('; |
334 | 6.30k | while (p != NULL) |
335 | 4.86k | { |
336 | 1.97M | for (uint32_t i = 0; i < p->str->length; i++) |
337 | 1.97M | { |
338 | 1.97M | if (IS_METACHAR(p->str->c_string[i])) |
339 | 34.5k | *s++ = '\\'; |
340 | | |
341 | 1.97M | if (p->str->c_string[i] == '\x00') |
342 | 734k | { |
343 | 734k | *s++ = '\\'; |
344 | 734k | *s++ = 'x'; |
345 | 734k | *s++ = '0'; |
346 | 734k | *s++ = '0'; |
347 | 734k | } |
348 | 1.23M | else |
349 | 1.23M | *s++ = p->str->c_string[i]; |
350 | 1.97M | } |
351 | | |
352 | 4.86k | if (p->next != NULL) |
353 | 3.41k | *s++ = '|'; |
354 | | |
355 | 4.86k | p = p->next; |
356 | 4.86k | } |
357 | 1.44k | *s++ = ')'; |
358 | 1.44k | *s = '\x00'; |
359 | | |
360 | | // Useful for debugging as long as the string has no NULL bytes in it. ;) |
361 | | // printf("%s\n", re_str); |
362 | | |
363 | 1.44k | FAIL_ON_ERROR_WITH_CLEANUP( |
364 | 1.44k | yr_re_parse(re_str, re_ast, re_error, RE_PARSER_FLAG_NONE), yr_free(re_str)); |
365 | | |
366 | 1.44k | yr_free(re_str); |
367 | | |
368 | 1.44k | return ERROR_SUCCESS; |
369 | 1.44k | } |
370 | | |
371 | | //////////////////////////////////////////////////////////////////////////////// |
372 | | // Given a string and an alphabet, generate the RE_AST suitable for representing |
373 | | // the different encodings of the string. This means we generate |
374 | | // "(ABCD|EFGH|IJKL)" and must be careful to escape any special characters as |
375 | | // a result of the base64 encoding. |
376 | | // |
377 | | // This uses ideas from: |
378 | | // https://www.leeholmes.com/blog/2019/12/10/searching-for-content-in-base-64-strings-2/ |
379 | | // |
380 | | // This does not emit the code for the RE. A further call to yr_re_ast_emit_code |
381 | | // is required to get the code. |
382 | | // |
383 | | int yr_base64_ast_from_string( |
384 | | SIZED_STRING* in_str, |
385 | | YR_MODIFIER modifier, |
386 | | RE_AST** re_ast, |
387 | | RE_ERROR* error) |
388 | 1.44k | { |
389 | 1.44k | BASE64_NODE* head = NULL; |
390 | 1.44k | BASE64_NODE* tail = NULL; |
391 | 1.44k | SIZED_STRING* wide_str; |
392 | | |
393 | 1.44k | if (modifier.flags & STRING_FLAGS_WIDE) |
394 | 488 | { |
395 | 488 | wide_str = ss_convert_to_wide(in_str); |
396 | | |
397 | 488 | if (modifier.flags & STRING_FLAGS_BASE64) |
398 | 310 | { |
399 | 310 | FAIL_ON_ERROR_WITH_CLEANUP( |
400 | 310 | _yr_base64_create_nodes(wide_str, modifier.alphabet, 0, &head, &tail), |
401 | 310 | { // Cleanup |
402 | 310 | strcpy(error->message, "Failure encoding base64 wide string"); |
403 | 310 | yr_free(wide_str); |
404 | 310 | _yr_base64_destroy_nodes(head); |
405 | 310 | }); |
406 | 310 | } |
407 | | |
408 | 488 | if (modifier.flags & STRING_FLAGS_BASE64_WIDE) |
409 | 184 | { |
410 | 184 | FAIL_ON_ERROR_WITH_CLEANUP( |
411 | 184 | _yr_base64_create_nodes(wide_str, modifier.alphabet, 1, &head, &tail), |
412 | 184 | { // Cleanup |
413 | 184 | strcpy(error->message, "Failure encoding base64wide wide string"); |
414 | 184 | yr_free(wide_str); |
415 | 184 | _yr_base64_destroy_nodes(head); |
416 | 184 | }); |
417 | 184 | } |
418 | | |
419 | 488 | yr_free(wide_str); |
420 | 488 | } |
421 | | |
422 | 1.44k | if (modifier.flags & STRING_FLAGS_ASCII) |
423 | 205 | { |
424 | 205 | if (modifier.flags & STRING_FLAGS_BASE64) |
425 | 179 | { |
426 | 179 | FAIL_ON_ERROR_WITH_CLEANUP( |
427 | 179 | _yr_base64_create_nodes(in_str, modifier.alphabet, 0, &head, &tail), |
428 | 179 | { // Cleanup |
429 | 179 | strcpy(error->message, "Failure encoding base64 ascii string"); |
430 | 179 | _yr_base64_destroy_nodes(head); |
431 | 179 | }); |
432 | 179 | } |
433 | | |
434 | 205 | if (modifier.flags & STRING_FLAGS_BASE64_WIDE) |
435 | 26 | { |
436 | 26 | FAIL_ON_ERROR_WITH_CLEANUP( |
437 | 26 | _yr_base64_create_nodes(in_str, modifier.alphabet, 1, &head, &tail), |
438 | 26 | { // Cleanup |
439 | 26 | strcpy(error->message, "Failure encoding base64wide ascii string"); |
440 | 26 | _yr_base64_destroy_nodes(head); |
441 | 26 | }); |
442 | 26 | } |
443 | 205 | } |
444 | | |
445 | 1.44k | if (!(modifier.flags & STRING_FLAGS_WIDE) && |
446 | 1.44k | !(modifier.flags & STRING_FLAGS_ASCII)) |
447 | 753 | { |
448 | 753 | if (modifier.flags & STRING_FLAGS_BASE64) |
449 | 465 | { |
450 | 465 | FAIL_ON_ERROR_WITH_CLEANUP( |
451 | 465 | _yr_base64_create_nodes(in_str, modifier.alphabet, 0, &head, &tail), |
452 | 465 | { // Cleanup |
453 | 465 | strcpy(error->message, "Failure encoding base64 string"); |
454 | 465 | _yr_base64_destroy_nodes(head); |
455 | 465 | }); |
456 | 465 | } |
457 | | |
458 | 753 | if (modifier.flags & STRING_FLAGS_BASE64_WIDE) |
459 | 556 | { |
460 | 556 | FAIL_ON_ERROR_WITH_CLEANUP( |
461 | 556 | _yr_base64_create_nodes(in_str, modifier.alphabet, 1, &head, &tail), |
462 | 556 | { // Cleanup |
463 | 556 | strcpy(error->message, "Failure encoding base64wide string"); |
464 | 556 | _yr_base64_destroy_nodes(head); |
465 | 556 | }); |
466 | 556 | } |
467 | 753 | } |
468 | | |
469 | | // Useful for printing the contents of the nodes, to make sure they were |
470 | | // encoded and stripped properly. |
471 | | //_yr_base64_print_nodes(head); |
472 | | |
473 | | // Create the final regex string to be parsed from all the nodes. |
474 | | // Error message is filled in by the caller in case of failure. |
475 | 1.44k | FAIL_ON_ERROR_WITH_CLEANUP( |
476 | 1.44k | _yr_base64_create_regexp(head, re_ast, error), |
477 | 1.44k | _yr_base64_destroy_nodes(head)); |
478 | | |
479 | 1.44k | _yr_base64_destroy_nodes(head); |
480 | | |
481 | 1.44k | return ERROR_SUCCESS; |
482 | 1.44k | } |