/src/fluent-bit/lib/monkey/deps/regex/re.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * |
3 | | * Mini regex-module inspired by Rob Pike's regex code described in: |
4 | | * |
5 | | * http://www.cs.princeton.edu/courses/archive/spr09/cos333/beautiful.html |
6 | | * |
7 | | * |
8 | | * |
9 | | * Supports: |
10 | | * --------- |
11 | | * '.' Dot, matches any character |
12 | | * '^' Start anchor, matches beginning of string |
13 | | * '$' End anchor, matches end of string |
14 | | * '*' Asterisk, match zero or more (greedy) |
15 | | * '+' Plus, match one or more (greedy) |
16 | | * '?' Question, match zero or one (non-greedy) |
17 | | * '[abc]' Character class, match if one of {'a', 'b', 'c'} |
18 | | * '[^abc]' Inverted class, match if NOT one of {'a', 'b', 'c'} -- NOTE: feature is currently broken! |
19 | | * '[a-zA-Z]' Character ranges, the character set of the ranges { a-z | A-Z } |
20 | | * '\s' Whitespace, \t \f \r \n \v and spaces |
21 | | * '\S' Non-whitespace |
22 | | * '\w' Alphanumeric, [a-zA-Z0-9_] |
23 | | * '\W' Non-alphanumeric |
24 | | * '\d' Digits, [0-9] |
25 | | * '\D' Non-digits |
26 | | * |
27 | | * |
28 | | */ |
29 | | |
30 | | |
31 | | |
32 | | #include "re.h" |
33 | | #include <stdio.h> |
34 | | #include <ctype.h> |
35 | | |
36 | | /* Private function declarations: */ |
37 | | static int matchpattern(regex_t* pattern, const char* text, int* matchlength); |
38 | | static int matchcharclass(char c, const char* str); |
39 | | static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchlength); |
40 | | static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchlength); |
41 | | static int matchone(regex_t p, char c); |
42 | | static int matchdigit(char c); |
43 | | static int matchalpha(char c); |
44 | | static int matchwhitespace(char c); |
45 | | static int matchmetachar(char c, const char* str); |
46 | | static int matchrange(char c, const char* str); |
47 | | static int matchdot(char c); |
48 | | static int ismetachar(char c); |
49 | | |
50 | | |
51 | | |
52 | | /* Public functions: */ |
53 | | int re_match(const char* pattern, const char* text, int* matchlength) |
54 | 0 | { |
55 | 0 | return re_matchp(re_compile(pattern), text, matchlength); |
56 | 0 | } |
57 | | |
58 | | int re_matchp(re_t pattern, const char* text, int* matchlength) |
59 | 0 | { |
60 | 0 | int matchlength_; |
61 | |
|
62 | 0 | if(NULL == matchlength) |
63 | 0 | { |
64 | 0 | matchlength = &matchlength_; |
65 | 0 | } |
66 | |
|
67 | 0 | *matchlength = 0; |
68 | 0 | if (pattern != 0) |
69 | 0 | { |
70 | 0 | if (pattern[0].type == BEGIN) |
71 | 0 | { |
72 | 0 | return ((matchpattern(&pattern[1], text, matchlength)) ? 0 : -1); |
73 | 0 | } |
74 | 0 | else |
75 | 0 | { |
76 | 0 | int idx = -1; |
77 | |
|
78 | 0 | do |
79 | 0 | { |
80 | 0 | idx += 1; |
81 | |
|
82 | 0 | if (matchpattern(pattern, text, matchlength)) |
83 | 0 | { |
84 | 0 | if (text[0] == '\0') |
85 | 0 | return -1; |
86 | | |
87 | 0 | return idx; |
88 | 0 | } |
89 | 0 | } |
90 | 0 | while (*text++ != '\0'); |
91 | 0 | } |
92 | 0 | } |
93 | 0 | return -1; |
94 | 0 | } |
95 | | |
96 | | re_t re_compile(const char* pattern) |
97 | 0 | { |
98 | | /* The sizes of the two static arrays below substantiates the static RAM usage of this module. |
99 | | MAX_REGEXP_OBJECTS is the max number of symbols in the expression. |
100 | | MAX_CHAR_CLASS_LEN determines the size of buffer for chars in all char-classes in the expression. */ |
101 | 0 | static regex_t re_compiled[MAX_REGEXP_OBJECTS]; |
102 | 0 | static unsigned char ccl_buf[MAX_CHAR_CLASS_LEN]; |
103 | 0 | int ccl_bufidx = 1; |
104 | |
|
105 | 0 | char c; /* current char in pattern */ |
106 | 0 | int i = 0; /* index into pattern */ |
107 | 0 | int j = 0; /* index into re_compiled */ |
108 | |
|
109 | 0 | while (pattern[i] != '\0' && (j+1 < MAX_REGEXP_OBJECTS)) |
110 | 0 | { |
111 | 0 | c = pattern[i]; |
112 | |
|
113 | 0 | switch (c) |
114 | 0 | { |
115 | | /* Meta-characters: */ |
116 | 0 | case '^': { re_compiled[j].type = BEGIN; } break; |
117 | 0 | case '$': { re_compiled[j].type = END; } break; |
118 | 0 | case '.': { re_compiled[j].type = DOT; } break; |
119 | 0 | case '*': { re_compiled[j].type = STAR; } break; |
120 | 0 | case '+': { re_compiled[j].type = PLUS; } break; |
121 | 0 | case '?': { re_compiled[j].type = QUESTIONMARK; } break; |
122 | | /* case '|': { re_compiled[j].type = BRANCH; } break; <-- not working properly */ |
123 | | |
124 | | /* Escaped character-classes (\s \w ...): */ |
125 | 0 | case '\\': |
126 | 0 | { |
127 | 0 | if (pattern[i+1] != '\0') |
128 | 0 | { |
129 | | /* Skip the escape-char '\\' */ |
130 | 0 | i += 1; |
131 | | /* ... and check the next */ |
132 | 0 | switch (pattern[i]) |
133 | 0 | { |
134 | | /* Meta-character: */ |
135 | 0 | case 'd': { re_compiled[j].type = DIGIT; } break; |
136 | 0 | case 'D': { re_compiled[j].type = NOT_DIGIT; } break; |
137 | 0 | case 'w': { re_compiled[j].type = ALPHA; } break; |
138 | 0 | case 'W': { re_compiled[j].type = NOT_ALPHA; } break; |
139 | 0 | case 's': { re_compiled[j].type = WHITESPACE; } break; |
140 | 0 | case 'S': { re_compiled[j].type = NOT_WHITESPACE; } break; |
141 | | |
142 | | /* Escaped character, e.g. '.' or '$' */ |
143 | 0 | default: |
144 | 0 | { |
145 | 0 | re_compiled[j].type = RE_CHAR; |
146 | 0 | re_compiled[j].u.ch = pattern[i]; |
147 | 0 | } break; |
148 | 0 | } |
149 | 0 | } |
150 | | /* '\\' as last char in pattern -> invalid regular expression. */ |
151 | | /* |
152 | | else |
153 | | { |
154 | | re_compiled[j].type = RE_CHAR; |
155 | | re_compiled[j].ch = pattern[i]; |
156 | | } |
157 | | */ |
158 | 0 | } break; |
159 | | |
160 | | /* Character class: */ |
161 | 0 | case '[': |
162 | 0 | { |
163 | | /* Remember where the char-buffer starts. */ |
164 | 0 | int buf_begin = ccl_bufidx; |
165 | | |
166 | | /* Look-ahead to determine if negated */ |
167 | 0 | if (pattern[i+1] == '^') |
168 | 0 | { |
169 | 0 | re_compiled[j].type = INV_CHAR_CLASS; |
170 | 0 | i += 1; /* Increment i to avoid including '^' in the char-buffer */ |
171 | 0 | if (pattern[i+1] == 0) /* incomplete pattern, missing non-zero char after '^' */ |
172 | 0 | { |
173 | 0 | return 0; |
174 | 0 | } |
175 | 0 | } |
176 | 0 | else |
177 | 0 | { |
178 | 0 | re_compiled[j].type = CHAR_CLASS; |
179 | 0 | } |
180 | | |
181 | | /* Copy characters inside [..] to buffer */ |
182 | 0 | while ( (pattern[++i] != ']') |
183 | 0 | && (pattern[i] != '\0')) /* Missing ] */ |
184 | 0 | { |
185 | 0 | if (pattern[i] == '\\') |
186 | 0 | { |
187 | 0 | if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1) |
188 | 0 | { |
189 | | //fputs("exceeded internal buffer!\n", stderr); |
190 | 0 | return 0; |
191 | 0 | } |
192 | 0 | if (pattern[i+1] == 0) /* incomplete pattern, missing non-zero char after '\\' */ |
193 | 0 | { |
194 | 0 | return 0; |
195 | 0 | } |
196 | 0 | ccl_buf[ccl_bufidx++] = pattern[i++]; |
197 | 0 | } |
198 | 0 | else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) |
199 | 0 | { |
200 | | //fputs("exceeded internal buffer!\n", stderr); |
201 | 0 | return 0; |
202 | 0 | } |
203 | 0 | ccl_buf[ccl_bufidx++] = pattern[i]; |
204 | 0 | } |
205 | 0 | if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) |
206 | 0 | { |
207 | | /* Catches cases such as [00000000000000000000000000000000000000][ */ |
208 | | //fputs("exceeded internal buffer!\n", stderr); |
209 | 0 | return 0; |
210 | 0 | } |
211 | | /* Null-terminate string end */ |
212 | 0 | ccl_buf[ccl_bufidx++] = 0; |
213 | 0 | re_compiled[j].u.ccl = &ccl_buf[buf_begin]; |
214 | 0 | } break; |
215 | | |
216 | | /* Other characters: */ |
217 | 0 | default: |
218 | 0 | { |
219 | 0 | re_compiled[j].type = RE_CHAR; |
220 | 0 | re_compiled[j].u.ch = c; |
221 | 0 | } break; |
222 | 0 | } |
223 | | /* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */ |
224 | 0 | if (pattern[i] == 0) |
225 | 0 | { |
226 | 0 | return 0; |
227 | 0 | } |
228 | | |
229 | 0 | i += 1; |
230 | 0 | j += 1; |
231 | 0 | } |
232 | | /* 'UNUSED' is a sentinel used to indicate end-of-pattern */ |
233 | 0 | re_compiled[j].type = UNUSED; |
234 | |
|
235 | 0 | return (re_t) re_compiled; |
236 | 0 | } |
237 | | |
238 | | void re_print(regex_t* pattern) |
239 | 0 | { |
240 | 0 | const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "RE_CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE", "BRANCH" }; |
241 | |
|
242 | 0 | int i; |
243 | 0 | int j; |
244 | 0 | char c; |
245 | 0 | for (i = 0; i < MAX_REGEXP_OBJECTS; ++i) |
246 | 0 | { |
247 | 0 | if (pattern[i].type == UNUSED) |
248 | 0 | { |
249 | 0 | break; |
250 | 0 | } |
251 | | |
252 | 0 | printf("type: %s", types[pattern[i].type]); |
253 | 0 | if (pattern[i].type == CHAR_CLASS || pattern[i].type == INV_CHAR_CLASS) |
254 | 0 | { |
255 | 0 | printf(" ["); |
256 | 0 | for (j = 0; j < MAX_CHAR_CLASS_LEN; ++j) |
257 | 0 | { |
258 | 0 | c = pattern[i].u.ccl[j]; |
259 | 0 | if ((c == '\0') || (c == ']')) |
260 | 0 | { |
261 | 0 | break; |
262 | 0 | } |
263 | 0 | printf("%c", c); |
264 | 0 | } |
265 | 0 | printf("]"); |
266 | 0 | } |
267 | 0 | else if (pattern[i].type == RE_CHAR) |
268 | 0 | { |
269 | 0 | printf(" '%c'", pattern[i].u.ch); |
270 | 0 | } |
271 | 0 | printf("\n"); |
272 | 0 | } |
273 | 0 | } |
274 | | |
275 | | |
276 | | |
277 | | /* Private functions: */ |
278 | | static int matchdigit(char c) |
279 | 0 | { |
280 | 0 | return isdigit(c); |
281 | 0 | } |
282 | | static int matchalpha(char c) |
283 | 0 | { |
284 | 0 | return isalpha(c); |
285 | 0 | } |
286 | | static int matchwhitespace(char c) |
287 | 0 | { |
288 | 0 | return isspace(c); |
289 | 0 | } |
290 | | static int matchalphanum(char c) |
291 | 0 | { |
292 | 0 | return ((c == '_') || matchalpha(c) || matchdigit(c)); |
293 | 0 | } |
294 | | static int matchrange(char c, const char* str) |
295 | 0 | { |
296 | 0 | return ( (c != '-') |
297 | 0 | && (str[0] != '\0') |
298 | 0 | && (str[0] != '-') |
299 | 0 | && (str[1] == '-') |
300 | 0 | && (str[2] != '\0') |
301 | 0 | && ( (c >= str[0]) |
302 | 0 | && (c <= str[2]))); |
303 | 0 | } |
304 | | static int matchdot(char c) |
305 | 0 | { |
306 | 0 | #if defined(RE_DOT_MATCHES_NEWLINE) && (RE_DOT_MATCHES_NEWLINE == 1) |
307 | 0 | (void)c; |
308 | 0 | return 1; |
309 | | #else |
310 | | return c != '\n' && c != '\r'; |
311 | | #endif |
312 | 0 | } |
313 | | static int ismetachar(char c) |
314 | 0 | { |
315 | 0 | return ((c == 's') || (c == 'S') || (c == 'w') || (c == 'W') || (c == 'd') || (c == 'D')); |
316 | 0 | } |
317 | | |
318 | | static int matchmetachar(char c, const char* str) |
319 | 0 | { |
320 | 0 | switch (str[0]) |
321 | 0 | { |
322 | 0 | case 'd': return matchdigit(c); |
323 | 0 | case 'D': return !matchdigit(c); |
324 | 0 | case 'w': return matchalphanum(c); |
325 | 0 | case 'W': return !matchalphanum(c); |
326 | 0 | case 's': return matchwhitespace(c); |
327 | 0 | case 'S': return !matchwhitespace(c); |
328 | 0 | default: return (c == str[0]); |
329 | 0 | } |
330 | 0 | } |
331 | | |
332 | | static int matchcharclass(char c, const char* str) |
333 | 0 | { |
334 | 0 | do |
335 | 0 | { |
336 | 0 | if (matchrange(c, str)) |
337 | 0 | { |
338 | 0 | return 1; |
339 | 0 | } |
340 | 0 | else if (str[0] == '\\') |
341 | 0 | { |
342 | | /* Escape-char: increment str-ptr and match on next char */ |
343 | 0 | str += 1; |
344 | 0 | if (matchmetachar(c, str)) |
345 | 0 | { |
346 | 0 | return 1; |
347 | 0 | } |
348 | 0 | else if ((c == str[0]) && !ismetachar(c)) |
349 | 0 | { |
350 | 0 | return 1; |
351 | 0 | } |
352 | 0 | } |
353 | 0 | else if (c == str[0]) |
354 | 0 | { |
355 | 0 | if (c == '-') |
356 | 0 | { |
357 | 0 | return ((str[-1] == '\0') || (str[1] == '\0')); |
358 | 0 | } |
359 | 0 | else |
360 | 0 | { |
361 | 0 | return 1; |
362 | 0 | } |
363 | 0 | } |
364 | 0 | } |
365 | 0 | while (*str++ != '\0'); |
366 | | |
367 | 0 | return 0; |
368 | 0 | } |
369 | | |
370 | | static int matchone(regex_t p, char c) |
371 | 0 | { |
372 | 0 | switch (p.type) |
373 | 0 | { |
374 | 0 | case DOT: return matchdot(c); |
375 | 0 | case CHAR_CLASS: return matchcharclass(c, (const char*)p.u.ccl); |
376 | 0 | case INV_CHAR_CLASS: return !matchcharclass(c, (const char*)p.u.ccl); |
377 | 0 | case DIGIT: return matchdigit(c); |
378 | 0 | case NOT_DIGIT: return !matchdigit(c); |
379 | 0 | case ALPHA: return matchalphanum(c); |
380 | 0 | case NOT_ALPHA: return !matchalphanum(c); |
381 | 0 | case WHITESPACE: return matchwhitespace(c); |
382 | 0 | case NOT_WHITESPACE: return !matchwhitespace(c); |
383 | 0 | default: return (p.u.ch == c); |
384 | 0 | } |
385 | 0 | } |
386 | | |
387 | | static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchlength) |
388 | 0 | { |
389 | 0 | int prelen = *matchlength; |
390 | 0 | const char* prepoint = text; |
391 | 0 | while ((text[0] != '\0') && matchone(p, *text)) |
392 | 0 | { |
393 | 0 | text++; |
394 | 0 | (*matchlength)++; |
395 | 0 | } |
396 | 0 | while (text >= prepoint) |
397 | 0 | { |
398 | 0 | if (matchpattern(pattern, text--, matchlength)) |
399 | 0 | return 1; |
400 | 0 | (*matchlength)--; |
401 | 0 | } |
402 | | |
403 | 0 | *matchlength = prelen; |
404 | 0 | return 0; |
405 | 0 | } |
406 | | |
407 | | static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchlength) |
408 | 0 | { |
409 | 0 | const char* prepoint = text; |
410 | 0 | while ((text[0] != '\0') && matchone(p, *text)) |
411 | 0 | { |
412 | 0 | text++; |
413 | 0 | (*matchlength)++; |
414 | 0 | } |
415 | 0 | while (text > prepoint) |
416 | 0 | { |
417 | 0 | if (matchpattern(pattern, text--, matchlength)) |
418 | 0 | return 1; |
419 | 0 | (*matchlength)--; |
420 | 0 | } |
421 | | |
422 | 0 | return 0; |
423 | 0 | } |
424 | | |
425 | | static int matchquestion(regex_t p, regex_t* pattern, const char* text, int* matchlength) |
426 | 0 | { |
427 | 0 | if (p.type == UNUSED) |
428 | 0 | return 1; |
429 | 0 | if (matchpattern(pattern, text, matchlength)) |
430 | 0 | return 1; |
431 | 0 | if (*text && matchone(p, *text++)) |
432 | 0 | { |
433 | 0 | if (matchpattern(pattern, text, matchlength)) |
434 | 0 | { |
435 | 0 | (*matchlength)++; |
436 | 0 | return 1; |
437 | 0 | } |
438 | 0 | } |
439 | 0 | return 0; |
440 | 0 | } |
441 | | |
442 | | |
443 | | #if 0 |
444 | | |
445 | | /* Recursive matching */ |
446 | | static int matchpattern(regex_t* pattern, const char* text, int *matchlength) |
447 | | { |
448 | | int pre = *matchlength; |
449 | | if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK)) |
450 | | { |
451 | | return matchquestion(pattern[1], &pattern[2], text, matchlength); |
452 | | } |
453 | | else if (pattern[1].type == STAR) |
454 | | { |
455 | | return matchstar(pattern[0], &pattern[2], text, matchlength); |
456 | | } |
457 | | else if (pattern[1].type == PLUS) |
458 | | { |
459 | | return matchplus(pattern[0], &pattern[2], text, matchlength); |
460 | | } |
461 | | else if ((pattern[0].type == END) && pattern[1].type == UNUSED) |
462 | | { |
463 | | return text[0] == '\0'; |
464 | | } |
465 | | else if ((text[0] != '\0') && matchone(pattern[0], text[0])) |
466 | | { |
467 | | (*matchlength)++; |
468 | | return matchpattern(&pattern[1], text+1); |
469 | | } |
470 | | else |
471 | | { |
472 | | *matchlength = pre; |
473 | | return 0; |
474 | | } |
475 | | } |
476 | | |
477 | | #else |
478 | | |
479 | | /* Iterative matching */ |
480 | | static int matchpattern(regex_t* pattern, const char* text, int* matchlength) |
481 | 0 | { |
482 | 0 | int pre = *matchlength; |
483 | 0 | do |
484 | 0 | { |
485 | 0 | if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK)) |
486 | 0 | { |
487 | 0 | return matchquestion(pattern[0], &pattern[2], text, matchlength); |
488 | 0 | } |
489 | 0 | else if (pattern[1].type == STAR) |
490 | 0 | { |
491 | 0 | return matchstar(pattern[0], &pattern[2], text, matchlength); |
492 | 0 | } |
493 | 0 | else if (pattern[1].type == PLUS) |
494 | 0 | { |
495 | 0 | return matchplus(pattern[0], &pattern[2], text, matchlength); |
496 | 0 | } |
497 | 0 | else if ((pattern[0].type == END) && pattern[1].type == UNUSED) |
498 | 0 | { |
499 | 0 | return (text[0] == '\0'); |
500 | 0 | } |
501 | | /* Branching is not working properly |
502 | | else if (pattern[1].type == BRANCH) |
503 | | { |
504 | | return (matchpattern(pattern, text) || matchpattern(&pattern[2], text)); |
505 | | } |
506 | | */ |
507 | 0 | (*matchlength)++; |
508 | 0 | } |
509 | 0 | while ((text[0] != '\0') && matchone(*pattern++, *text++)); |
510 | | |
511 | 0 | *matchlength = pre; |
512 | 0 | return 0; |
513 | 0 | } |
514 | | |
515 | | #endif |