Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * fa.c: finite automata |
3 | | * |
4 | | * Copyright (C) 2007-2016 David Lutterkort |
5 | | * |
6 | | * This library is free software; you can redistribute it and/or |
7 | | * modify it under the terms of the GNU Lesser General Public |
8 | | * License as published by the Free Software Foundation; either |
9 | | * version 2.1 of the License, or (at your option) any later version. |
10 | | * |
11 | | * This library is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | | * Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public |
17 | | * License along with this library; if not, write to the Free Software |
18 | | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
19 | | * |
20 | | * Author: David Lutterkort <dlutter@redhat.com> |
21 | | */ |
22 | | |
23 | | /* |
24 | | * This implementation follows closely the Java dk.brics.automaton package |
25 | | * by Anders Moeller. The project's website is |
26 | | * http://www.brics.dk/automaton/. |
27 | | * |
28 | | * It is by no means a complete reimplementation of that package; only a |
29 | | * subset of what Automaton provides is implemented here. |
30 | | */ |
31 | | |
32 | | #include <config.h> |
33 | | #include <limits.h> |
34 | | #include <ctype.h> |
35 | | #include <stdbool.h> |
36 | | |
37 | | #include "internal.h" |
38 | | #include "memory.h" |
39 | | #include "ref.h" |
40 | | #include "hash.h" |
41 | | #include "fa.h" |
42 | | |
43 | 0 | #define UCHAR_NUM (UCHAR_MAX+1) |
44 | 0 | #define UCHAR_MIN 0 |
45 | | typedef unsigned char uchar; |
46 | | |
47 | 0 | #define E(cond) if (cond) goto error |
48 | 0 | #define F(expr) if ((expr) < 0) goto error |
49 | | |
50 | | /* Which algorithm to use in FA_MINIMIZE */ |
51 | | int fa_minimization_algorithm = FA_MIN_HOPCROFT; |
52 | | |
53 | | /* A finite automaton. INITIAL is both the initial state and the head of |
54 | | * the list of all states. Any state that is allocated for this automaton |
55 | | * is put on this list. Dead/unreachable states are cleared from the list |
56 | | * at opportune times (e.g., during minimization) It's poor man's garbage |
57 | | * collection |
58 | | * |
59 | | * Normally, transitions are on a character range [min..max]; in |
60 | | * fa_as_regexp, we store regexps on transitions in the re field of each |
61 | | * transition. TRANS_RE indicates that we do that, and is used by fa_dot to |
62 | | * produce proper graphs of an automaton transitioning on regexps. |
63 | | * |
64 | | * For case-insensitive regexps (nocase == 1), the FA never has transitions |
65 | | * on uppercase letters [A-Z], effectively removing these letters from the |
66 | | * alphabet. |
67 | | */ |
68 | | struct fa { |
69 | | struct state *initial; |
70 | | int deterministic : 1; |
71 | | int minimal : 1; |
72 | | unsigned int nocase : 1; |
73 | | int trans_re : 1; |
74 | | }; |
75 | | |
76 | | /* A state in a finite automaton. Transitions are never shared between |
77 | | states so that we can free the list when we need to free the state */ |
78 | | struct state { |
79 | | struct state *next; |
80 | | hash_val_t hash; |
81 | | unsigned int accept : 1; |
82 | | unsigned int live : 1; |
83 | | unsigned int reachable : 1; |
84 | | unsigned int visited : 1; /* Used in various places to track progress */ |
85 | | /* Array of transitions. The TUSED first entries are used, the array |
86 | | has allocated room for TSIZE */ |
87 | | size_t tused; |
88 | | size_t tsize; |
89 | | struct trans *trans; |
90 | | }; |
91 | | |
92 | | /* A transition. If the input has a character in the inclusive |
93 | | * range [MIN, MAX], move to TO |
94 | | */ |
95 | | struct trans { |
96 | | struct state *to; |
97 | | union { |
98 | | struct { |
99 | | uchar min; |
100 | | uchar max; |
101 | | }; |
102 | | struct re *re; |
103 | | }; |
104 | | }; |
105 | | |
106 | | /* |
107 | | * Bitsets |
108 | | */ |
109 | 0 | #define UINT_BIT (sizeof(unsigned int) * CHAR_BIT) |
110 | | |
111 | | typedef unsigned int bitset; |
112 | | |
113 | 0 | static bitset *bitset_init(size_t nbits) { |
114 | 0 | bitset *bs; |
115 | 0 | if (ALLOC_N(bs, (nbits + UINT_BIT) / UINT_BIT) == -1) |
116 | 0 | return NULL; |
117 | 0 | return bs; |
118 | 0 | } |
119 | | |
120 | 0 | static inline void bitset_clr(bitset *bs, unsigned int bit) { |
121 | 0 | bs[bit/UINT_BIT] &= ~(1 << (bit % UINT_BIT)); |
122 | 0 | } |
123 | | |
124 | 0 | static inline void bitset_set(bitset *bs, unsigned int bit) { |
125 | 0 | bs[bit/UINT_BIT] |= 1 << (bit % UINT_BIT); |
126 | 0 | } |
127 | | |
128 | | ATTRIBUTE_PURE |
129 | 0 | static inline int bitset_get(const bitset * const bs, unsigned int bit) { |
130 | 0 | return (bs[bit/UINT_BIT] >> bit % UINT_BIT) & 1; |
131 | 0 | } |
132 | | |
133 | | ATTRIBUTE_PURE |
134 | | static inline bool bitset_disjoint(const bitset *const bs1, |
135 | | const bitset *const bs2, |
136 | 0 | size_t nbits) { |
137 | 0 | for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++) { |
138 | 0 | if (bs1[i] & bs2[i]) |
139 | 0 | return false; |
140 | 0 | } |
141 | 0 | return true; |
142 | 0 | } |
143 | | |
144 | 0 | static void bitset_free(bitset *bs) { |
145 | 0 | free(bs); |
146 | 0 | } |
147 | | |
148 | 0 | static void bitset_negate(bitset *bs, size_t nbits) { |
149 | 0 | for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++) |
150 | 0 | bs[i] = ~ bs[i]; |
151 | 0 | } |
152 | | |
153 | | /* |
154 | | * Representation of a parsed regular expression. The regular expression is |
155 | | * parsed according to the following grammar by PARSE_REGEXP: |
156 | | * |
157 | | * regexp: concat_exp ('|' regexp)? |
158 | | * concat_exp: repeated_exp concat_exp? |
159 | | * repeated_exp: simple_exp |
160 | | * | simple_exp '*' |
161 | | * | simple_exp '+' |
162 | | * | simple_exp '?' |
163 | | * | simple_exp '{' INT (',' INT)? '}' |
164 | | * simple_exp: char_class |
165 | | * | '.' |
166 | | * | '(' regexp ')' |
167 | | * | CHAR |
168 | | * char_class: '[' char_exp+ ']' |
169 | | * | '[' '^' char_exp+ ']' |
170 | | * char_exp: CHAR '-' CHAR |
171 | | * | CHAR |
172 | | */ |
173 | | |
174 | | enum re_type { |
175 | | UNION, |
176 | | CONCAT, |
177 | | CSET, |
178 | | CHAR, |
179 | | ITER, |
180 | | EPSILON |
181 | | }; |
182 | | |
183 | 0 | #define re_unref(r) unref(r, re) |
184 | | |
185 | | struct re { |
186 | | ref_t ref; |
187 | | enum re_type type; |
188 | | union { |
189 | | struct { /* UNION, CONCAT */ |
190 | | struct re *exp1; |
191 | | struct re *exp2; |
192 | | }; |
193 | | struct { /* CSET */ |
194 | | bool negate; |
195 | | bitset *cset; |
196 | | /* Whether we can use character ranges when converting back |
197 | | * to a string */ |
198 | | unsigned int no_ranges:1; |
199 | | }; |
200 | | struct { /* CHAR */ |
201 | | uchar c; |
202 | | }; |
203 | | struct { /* ITER */ |
204 | | struct re *exp; |
205 | | int min; |
206 | | int max; |
207 | | }; |
208 | | }; |
209 | | }; |
210 | | |
211 | | /* Used to keep state of the regex parse; RX may contain NUL's */ |
212 | | struct re_parse { |
213 | | const char *rx; /* Current position in regex */ |
214 | | const char *rend; /* Last char of rx+ 1 */ |
215 | | int error; /* error code */ |
216 | | /* Whether new CSET's should have the no_ranges flag set */ |
217 | | unsigned int no_ranges:1; |
218 | | }; |
219 | | |
220 | | /* String with explicit length, used when converting re to string */ |
221 | | struct re_str { |
222 | | char *rx; |
223 | | size_t len; |
224 | | }; |
225 | | |
226 | | static struct re *parse_regexp(struct re_parse *parse); |
227 | | |
228 | | /* A map from a set of states to a state. */ |
229 | | typedef hash_t state_set_hash; |
230 | | |
231 | | static hash_val_t ptr_hash(const void *p); |
232 | | |
233 | | static const int array_initial_size = 4; |
234 | | static const int array_max_expansion = 128; |
235 | | |
236 | | enum state_set_init_flags { |
237 | | S_NONE = 0, |
238 | | S_SORTED = (1 << 0), |
239 | | S_DATA = (1 << 1) |
240 | | }; |
241 | | |
242 | | struct state_set { |
243 | | size_t size; |
244 | | size_t used; |
245 | | unsigned int sorted : 1; |
246 | | unsigned int with_data : 1; |
247 | | struct state **states; |
248 | | void **data; |
249 | | }; |
250 | | |
251 | | struct state_set_list { |
252 | | struct state_set_list *next; |
253 | | struct state_set *set; |
254 | | }; |
255 | | |
256 | | /* Clean up FA by removing dead transitions and states and reducing |
257 | | * transitions. Unreachable states are freed. The return value is the same |
258 | | * as FA; returning it is merely a convenience. |
259 | | * |
260 | | * Only automata in this state should be returned to the user |
261 | | */ |
262 | | ATTRIBUTE_RETURN_CHECK |
263 | | static int collect(struct fa *fa); |
264 | | |
265 | | ATTRIBUTE_RETURN_CHECK |
266 | | static int totalize(struct fa *fa); |
267 | | |
268 | | /* Print an FA into a (fairly) fixed file if the environment variable |
269 | | * FA_DOT_DIR is set. This code is only used for debugging |
270 | | */ |
271 | | #define FA_DOT_DIR "FA_DOT_DIR" |
272 | | |
273 | | ATTRIBUTE_UNUSED |
274 | 0 | static void fa_dot_debug(struct fa *fa, const char *tag) { |
275 | 0 | const char *dot_dir; |
276 | 0 | static int count = 0; |
277 | 0 | int r; |
278 | 0 | char *fname; |
279 | 0 | FILE *fp; |
280 | 0 |
|
281 | 0 | if ((dot_dir = getenv(FA_DOT_DIR)) == NULL) |
282 | 0 | return; |
283 | 0 |
|
284 | 0 | r = asprintf(&fname, "%s/fa_%02d_%s.dot", dot_dir, count++, tag); |
285 | 0 | if (r == -1) |
286 | 0 | return; |
287 | 0 |
|
288 | 0 | fp = fopen(fname, "w"); |
289 | 0 | if (fp == NULL) { |
290 | 0 | free(fname); |
291 | 0 | return; |
292 | 0 | } |
293 | 0 |
|
294 | 0 | fa_dot(fp, fa); |
295 | 0 | fclose(fp); |
296 | 0 | free(fname); |
297 | 0 | } |
298 | | |
299 | 0 | static void print_char_set(struct re *set) { |
300 | 0 | int from, to; |
301 | 0 |
|
302 | 0 | if (set->negate) |
303 | 0 | printf("[^"); |
304 | 0 | else |
305 | 0 | printf("["); |
306 | 0 | for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) { |
307 | 0 | while (bitset_get(set->cset, from) == set->negate) |
308 | 0 | from += 1; |
309 | 0 | if (from > UCHAR_MAX) |
310 | 0 | break; |
311 | 0 | for (to = from; |
312 | 0 | to < UCHAR_MAX && (bitset_get(set->cset, to+1) == !set->negate); |
313 | 0 | to++); |
314 | 0 | if (to == from) { |
315 | 0 | printf("%c", from); |
316 | 0 | } else { |
317 | 0 | printf("%c-%c", from, to); |
318 | 0 | } |
319 | 0 | } |
320 | 0 | printf("]"); |
321 | 0 | } |
322 | | |
323 | | ATTRIBUTE_UNUSED |
324 | 0 | static void print_re(struct re *re) { |
325 | 0 | switch(re->type) { |
326 | 0 | case UNION: |
327 | 0 | print_re(re->exp1); |
328 | 0 | printf("|"); |
329 | 0 | print_re(re->exp2); |
330 | 0 | break; |
331 | 0 | case CONCAT: |
332 | 0 | print_re(re->exp1); |
333 | 0 | printf("."); |
334 | 0 | print_re(re->exp2); |
335 | 0 | break; |
336 | 0 | case CSET: |
337 | 0 | print_char_set(re); |
338 | 0 | break; |
339 | 0 | case CHAR: |
340 | 0 | printf("%c", re->c); |
341 | 0 | break; |
342 | 0 | case ITER: |
343 | 0 | printf("("); |
344 | 0 | print_re(re->exp); |
345 | 0 | printf("){%d,%d}", re->min, re->max); |
346 | 0 | break; |
347 | 0 | case EPSILON: |
348 | 0 | printf("<>"); |
349 | 0 | break; |
350 | 0 | default: |
351 | 0 | printf("(**)"); |
352 | 0 | break; |
353 | 0 | } |
354 | 0 | } |
355 | | |
356 | | /* |
357 | | * struct re_str |
358 | | */ |
359 | 0 | static void release_re_str(struct re_str *str) { |
360 | 0 | if (str == NULL) |
361 | 0 | return; |
362 | 0 | FREE(str->rx); |
363 | 0 | str->len = 0; |
364 | 0 | } |
365 | | |
366 | 0 | static void free_re_str(struct re_str *str) { |
367 | 0 | if (str == NULL) |
368 | 0 | return; |
369 | 0 | release_re_str(str); |
370 | 0 | FREE(str); |
371 | 0 | } |
372 | | |
373 | 0 | static struct re_str *make_re_str(const char *s) { |
374 | 0 | struct re_str *str; |
375 | |
|
376 | 0 | if (ALLOC(str) < 0) |
377 | 0 | return NULL; |
378 | 0 | if (s != NULL) { |
379 | 0 | str->rx = strdup(s); |
380 | 0 | str->len = strlen(s); |
381 | 0 | if (str->rx == NULL) { |
382 | 0 | FREE(str); |
383 | 0 | return NULL; |
384 | 0 | } |
385 | 0 | } |
386 | 0 | return str; |
387 | 0 | } |
388 | | |
389 | 0 | static int re_str_alloc(struct re_str *str) { |
390 | 0 | return ALLOC_N(str->rx, str->len + 1); |
391 | 0 | } |
392 | | |
393 | | /* |
394 | | * Memory management |
395 | | */ |
396 | | |
397 | 0 | static void free_trans(struct state *s) { |
398 | 0 | free(s->trans); |
399 | 0 | s->trans = NULL; |
400 | 0 | s->tused = s->tsize = 0; |
401 | 0 | } |
402 | | |
403 | 0 | static void gut(struct fa *fa) { |
404 | 0 | list_for_each(s, fa->initial) { |
405 | 0 | free_trans(s); |
406 | 0 | } |
407 | 0 | list_free(fa->initial); |
408 | 0 | fa->initial = NULL; |
409 | 0 | } |
410 | | |
411 | 0 | void fa_free(struct fa *fa) { |
412 | 0 | if (fa == NULL) |
413 | 0 | return; |
414 | 0 | gut(fa); |
415 | 0 | free(fa); |
416 | 0 | } |
417 | | |
418 | 0 | static struct state *make_state(void) { |
419 | 0 | struct state *s; |
420 | 0 | if (ALLOC(s) == -1) |
421 | 0 | return NULL; |
422 | 0 | s->hash = ptr_hash(s); |
423 | 0 | return s; |
424 | 0 | } |
425 | | |
426 | 0 | static struct state *add_state(struct fa *fa, int accept) { |
427 | 0 | struct state *s = make_state(); |
428 | 0 | if (s) { |
429 | 0 | s->accept = accept; |
430 | 0 | if (fa->initial == NULL) { |
431 | 0 | fa->initial = s; |
432 | 0 | } else { |
433 | 0 | list_cons(fa->initial->next, s); |
434 | 0 | } |
435 | 0 | } |
436 | 0 | return s; |
437 | 0 | } |
438 | | |
439 | 0 | #define last_trans(s) ((s)->trans + (s)->tused - 1) |
440 | | |
441 | | #define for_each_trans(t, s) \ |
442 | 0 | for (struct trans *t = (s)->trans; \ |
443 | 0 | (t - (s)->trans) < (s)->tused; \ |
444 | 0 | t++) |
445 | | |
446 | | ATTRIBUTE_RETURN_CHECK |
447 | | static int add_new_trans(struct state *from, struct state *to, |
448 | 0 | uchar min, uchar max) { |
449 | 0 | assert(to != NULL); |
450 | | |
451 | 0 | if (from->tused == from->tsize) { |
452 | 0 | size_t tsize = from->tsize; |
453 | 0 | if (tsize == 0) |
454 | 0 | tsize = array_initial_size; |
455 | 0 | else if (from->tsize > array_max_expansion) |
456 | 0 | tsize += array_max_expansion; |
457 | 0 | else |
458 | 0 | tsize *= 2; |
459 | 0 | if (REALLOC_N(from->trans, tsize) == -1) |
460 | 0 | return -1; |
461 | 0 | from->tsize = tsize; |
462 | 0 | } |
463 | 0 | from->trans[from->tused].to = to; |
464 | 0 | from->trans[from->tused].min = min; |
465 | 0 | from->trans[from->tused].max = max; |
466 | 0 | from->tused += 1; |
467 | 0 | return 0; |
468 | 0 | } |
469 | | |
470 | | ATTRIBUTE_RETURN_CHECK |
471 | | static int add_epsilon_trans(struct state *from, |
472 | 0 | struct state *to) { |
473 | 0 | int r; |
474 | 0 | from->accept |= to->accept; |
475 | 0 | for_each_trans(t, to) { |
476 | 0 | r = add_new_trans(from, t->to, t->min, t->max); |
477 | 0 | if (r < 0) |
478 | 0 | return -1; |
479 | 0 | } |
480 | 0 | return 0; |
481 | 0 | } |
482 | | |
483 | 0 | static void set_initial(struct fa *fa, struct state *s) { |
484 | 0 | list_remove(s, fa->initial); |
485 | 0 | list_cons(fa->initial, s); |
486 | 0 | } |
487 | | |
488 | | /* Merge automaton FA2 into FA1. This simply adds FA2's states to FA1 |
489 | | and then frees FA2. It has no influence on the language accepted by FA1 |
490 | | */ |
491 | 0 | static void fa_merge(struct fa *fa1, struct fa **fa2) { |
492 | 0 | list_append(fa1->initial, (*fa2)->initial); |
493 | 0 | free(*fa2); |
494 | 0 | *fa2 = NULL; |
495 | 0 | } |
496 | | |
497 | | /* |
498 | | * Operations on STATE_SET |
499 | | */ |
500 | 0 | static void state_set_free(struct state_set *set) { |
501 | 0 | if (set == NULL) |
502 | 0 | return; |
503 | 0 | free(set->states); |
504 | 0 | free(set->data); |
505 | 0 | free(set); |
506 | 0 | } |
507 | | |
508 | 0 | static int state_set_init_data(struct state_set *set) { |
509 | 0 | set->with_data = 1; |
510 | 0 | if (set->data == NULL) |
511 | 0 | return ALLOC_N(set->data, set->size); |
512 | 0 | else |
513 | 0 | return 0; |
514 | 0 | } |
515 | | |
516 | | /* Create a new STATE_SET with an initial size of SIZE. If SIZE is -1, use |
517 | | the default size ARRAY_INITIAL_SIZE. FLAGS is a bitmask indicating |
518 | | some options: |
519 | | - S_SORTED: keep the states in the set sorted by their address, and use |
520 | | binary search for lookups. If it is not set, entries are kept in the |
521 | | order in which they are added and lookups scan linearly through the |
522 | | set of states. |
523 | | - S_DATA: allocate the DATA array in the set, and keep its size in sync |
524 | | with the size of the STATES array. |
525 | | */ |
526 | 0 | static struct state_set *state_set_init(int size, int flags) { |
527 | 0 | struct state_set *set = NULL; |
528 | |
|
529 | 0 | F(ALLOC(set)); |
530 | | |
531 | 0 | set->sorted = (flags & S_SORTED) ? 1 : 0; |
532 | 0 | set->with_data = (flags & S_DATA) ? 1 : 0; |
533 | 0 | if (size > 0) { |
534 | 0 | set->size = size; |
535 | 0 | F(ALLOC_N(set->states, set->size)); |
536 | 0 | if (set->with_data) |
537 | 0 | F(state_set_init_data(set)); |
538 | 0 | } |
539 | 0 | return set; |
540 | 0 | error: |
541 | 0 | state_set_free(set); |
542 | 0 | return NULL; |
543 | 0 | } |
544 | | |
545 | | ATTRIBUTE_RETURN_CHECK |
546 | 0 | static int state_set_expand(struct state_set *set) { |
547 | 0 | if (set->size == 0) |
548 | 0 | set->size = array_initial_size; |
549 | 0 | else if (set->size > array_max_expansion) |
550 | 0 | set->size += array_max_expansion; |
551 | 0 | else |
552 | 0 | set->size *= 2; |
553 | 0 | if (REALLOC_N(set->states, set->size) < 0) |
554 | 0 | goto error; |
555 | 0 | if (set->with_data) |
556 | 0 | if (REALLOC_N(set->data, set->size) < 0) |
557 | 0 | goto error; |
558 | 0 | return 0; |
559 | 0 | error: |
560 | | /* We do this to provoke a SEGV as early as possible */ |
561 | 0 | FREE(set->states); |
562 | 0 | FREE(set->data); |
563 | 0 | return -1; |
564 | 0 | } |
565 | | |
566 | | /* Return the index where S belongs in SET->STATES to keep it sorted. S |
567 | | may not be in SET->STATES. The returned index is in the interval [0 |
568 | | .. SET->USED], with the latter indicating that S is larger than all |
569 | | values in SET->STATES |
570 | | */ |
571 | 0 | static int state_set_pos(const struct state_set *set, const struct state *s) { |
572 | 0 | int l = 0, h = set->used; |
573 | 0 | while (l < h) { |
574 | 0 | int m = (l + h)/2; |
575 | 0 | if (set->states[m] > s) |
576 | 0 | h = m; |
577 | 0 | else if (set->states[m] < s) |
578 | 0 | l = m + 1; |
579 | 0 | else |
580 | 0 | return m; |
581 | 0 | } |
582 | 0 | return l; |
583 | 0 | } |
584 | | |
585 | | ATTRIBUTE_RETURN_CHECK |
586 | 0 | static int state_set_push(struct state_set *set, struct state *s) { |
587 | 0 | if (set->size == set->used) |
588 | 0 | if (state_set_expand(set) < 0) |
589 | 0 | return -1; |
590 | 0 | if (set->sorted) { |
591 | 0 | int p = state_set_pos(set, s); |
592 | 0 | if (set->size == set->used) |
593 | 0 | if (state_set_expand(set) < 0) |
594 | 0 | return -1; |
595 | 0 | while (p < set->used && set->states[p] <= s) |
596 | 0 | p += 1; |
597 | 0 | if (p < set->used) { |
598 | 0 | memmove(set->states + p + 1, set->states + p, |
599 | 0 | sizeof(*set->states) * (set->used - p)); |
600 | 0 | if (set->data != NULL) |
601 | 0 | memmove(set->data + p + 1, set->data + p, |
602 | 0 | sizeof(*set->data) * (set->used - p)); |
603 | 0 | } |
604 | 0 | set->states[p] = s; |
605 | 0 | set->used += 1; |
606 | 0 | return p; |
607 | 0 | } else { |
608 | 0 | set->states[set->used++] = s; |
609 | 0 | return set->used - 1; |
610 | 0 | } |
611 | 0 | } |
612 | | |
613 | | ATTRIBUTE_RETURN_CHECK |
614 | | static int state_set_push_data(struct state_set *set, struct state *s, |
615 | 0 | void *d) { |
616 | 0 | int i = state_set_push(set, s); |
617 | 0 | if (i == -1) |
618 | 0 | return -1; |
619 | 0 | set->data[i] = d; |
620 | 0 | return i; |
621 | 0 | } |
622 | | |
623 | | static int state_set_index(const struct state_set *set, |
624 | 0 | const struct state *s) { |
625 | 0 | if (set->sorted) { |
626 | 0 | int p = state_set_pos(set, s); |
627 | 0 | return (p < set->used && set->states[p] == s) ? p : -1; |
628 | 0 | } else { |
629 | 0 | for (int i=0; i < set->used; i++) { |
630 | 0 | if (set->states[i] == s) |
631 | 0 | return i; |
632 | 0 | } |
633 | 0 | } |
634 | 0 | return -1; |
635 | 0 | } |
636 | | |
637 | | static void state_set_remove(struct state_set *set, |
638 | 0 | const struct state *s) { |
639 | 0 | if (set->sorted) { |
640 | 0 | int p = state_set_index(set, s); |
641 | 0 | if (p == -1) return; |
642 | 0 | memmove(set->states + p, set->states + p + 1, |
643 | 0 | sizeof(*set->states) * (set->used - p - 1)); |
644 | 0 | if (set->data != NULL) |
645 | 0 | memmove(set->data + p, set->data + p + 1, |
646 | 0 | sizeof(*set->data) * (set->used - p - 1)); |
647 | 0 | } else { |
648 | 0 | int p = state_set_index(set, s); |
649 | 0 | if (p >= 0) { |
650 | 0 | set->states[p] = set->states[--set->used]; |
651 | 0 | } |
652 | 0 | } |
653 | 0 | } |
654 | | |
655 | | /* Only add S if it's not in SET yet. Return 1 if S was added, 0 if it was |
656 | | already in the set and -1 on error. */ |
657 | | ATTRIBUTE_RETURN_CHECK |
658 | 0 | static int state_set_add(struct state_set *set, struct state *s) { |
659 | 0 | if (set->sorted) { |
660 | 0 | int p = state_set_pos(set, s); |
661 | 0 | if (p < set->used && set->states[p] == s) |
662 | 0 | return 0; |
663 | 0 | if (set->size == set->used) |
664 | 0 | if (state_set_expand(set) < 0) |
665 | 0 | return -1; |
666 | 0 | while (p < set->used && set->states[p] <= s) |
667 | 0 | p += 1; |
668 | 0 | if (p < set->used) { |
669 | 0 | memmove(set->states + p + 1, set->states + p, |
670 | 0 | sizeof(*set->states) * (set->used - p)); |
671 | 0 | if (set->data != NULL) |
672 | 0 | memmove(set->data + p + 1, set->data + p, |
673 | 0 | sizeof(*set->data) * (set->used - p)); |
674 | 0 | } |
675 | 0 | set->states[p] = s; |
676 | 0 | set->used += 1; |
677 | 0 | } else { |
678 | 0 | if (state_set_index(set, s) >= 0) |
679 | 0 | return 0; |
680 | 0 | if (state_set_push(set, s) < 0) |
681 | 0 | goto error; |
682 | 0 | } |
683 | 0 | return 1; |
684 | 0 | error: |
685 | | /* We do this to provoke a SEGV as early as possible */ |
686 | 0 | FREE(set->states); |
687 | 0 | FREE(set->data); |
688 | 0 | return -1; |
689 | 0 | } |
690 | | |
691 | 0 | static struct state *state_set_pop(struct state_set *set) { |
692 | 0 | struct state *s = NULL; |
693 | 0 | if (set->used > 0) |
694 | 0 | s = set->states[--set->used]; |
695 | 0 | return s; |
696 | 0 | } |
697 | | |
698 | 0 | static struct state *state_set_pop_data(struct state_set *set, void **d) { |
699 | 0 | struct state *s = NULL; |
700 | 0 | s = state_set_pop(set); |
701 | 0 | *d = set->data[set->used]; |
702 | 0 | return s; |
703 | 0 | } |
704 | | |
705 | 0 | static void *state_set_find_data(struct state_set *set, struct state *s) { |
706 | 0 | int i = state_set_index(set, s); |
707 | 0 | if (i >= 0) |
708 | 0 | return set->data[i]; |
709 | 0 | else |
710 | 0 | return NULL; |
711 | 0 | } |
712 | | |
713 | | static int state_set_equal(const struct state_set *set1, |
714 | 0 | const struct state_set *set2) { |
715 | 0 | if (set1->used != set2->used) |
716 | 0 | return 0; |
717 | 0 | if (set1->sorted && set2->sorted) { |
718 | 0 | for (int i = 0; i < set1->used; i++) |
719 | 0 | if (set1->states[i] != set2->states[i]) |
720 | 0 | return 0; |
721 | 0 | return 1; |
722 | 0 | } else { |
723 | 0 | for (int i=0; i < set1->used; i++) |
724 | 0 | if (state_set_index(set2, set1->states[i]) == -1) |
725 | 0 | return 0; |
726 | 0 | return 1; |
727 | 0 | } |
728 | 0 | } |
729 | | |
730 | | #if 0 |
731 | | static void state_set_compact(struct state_set *set) { |
732 | | while (set->used > 0 && set->states[set->used] == NULL) |
733 | | set->used -= 1; |
734 | | for (int i=0; i < set->used; i++) { |
735 | | if (set->states[i] == NULL) { |
736 | | set->used -= 1; |
737 | | set->states[i] = set->states[set->used]; |
738 | | if (set->data) |
739 | | set->data[i] = set->data[set->used]; |
740 | | } |
741 | | while (set->used > 0 && set->states[set->used] == NULL) |
742 | | set->used -= 1; |
743 | | } |
744 | | } |
745 | | #endif |
746 | | |
747 | | /* Add an entry (FST, SND) to SET. FST is stored in SET->STATES, and SND is |
748 | | stored in SET->DATA at the same index. |
749 | | */ |
750 | | ATTRIBUTE_RETURN_CHECK |
751 | | static int state_pair_push(struct state_set **set, |
752 | 0 | struct state *fst, struct state *snd) { |
753 | 0 | if (*set == NULL) |
754 | 0 | *set = state_set_init(-1, S_DATA); |
755 | 0 | if (*set == NULL) |
756 | 0 | return -1; |
757 | 0 | int i = state_set_push(*set, fst); |
758 | 0 | if (i == -1) |
759 | 0 | return -1; |
760 | 0 | (*set)->data[i] = snd; |
761 | |
|
762 | 0 | return 0; |
763 | 0 | } |
764 | | |
765 | | /* Return the index of the pair (FST, SND) in SET, or -1 if SET contains no |
766 | | such pair. |
767 | | */ |
768 | | static int state_pair_find(struct state_set *set, struct state *fst, |
769 | 0 | struct state *snd) { |
770 | 0 | for (int i=0; i < set->used; i++) |
771 | 0 | if (set->states[i] == fst && set->data[i] == snd) |
772 | 0 | return i; |
773 | 0 | return -1; |
774 | 0 | } |
775 | | |
776 | | /* Jenkins' hash for void* */ |
777 | 0 | static hash_val_t ptr_hash(const void *p) { |
778 | 0 | hash_val_t hash = 0; |
779 | 0 | char *c = (char *) &p; |
780 | 0 | for (int i=0; i < sizeof(p); i++) { |
781 | 0 | hash += c[i]; |
782 | 0 | hash += (hash << 10); |
783 | 0 | hash ^= (hash >> 6); |
784 | 0 | } |
785 | 0 | hash += (hash << 3); |
786 | 0 | hash ^= (hash >> 11); |
787 | 0 | hash += (hash << 15); |
788 | 0 | return hash; |
789 | 0 | } |
790 | | |
791 | | typedef hash_t state_triple_hash; |
792 | | |
793 | 0 | static hash_val_t pair_hash(const void *key) { |
794 | 0 | register struct state *const *pair = key; |
795 | 0 | return pair[0]->hash + pair[1]->hash; |
796 | 0 | } |
797 | | |
798 | 0 | static int pair_cmp(const void *key1, const void *key2) { |
799 | 0 | return memcmp(key1, key2, 2*sizeof(struct state *)); |
800 | 0 | } |
801 | | |
802 | 0 | static void state_triple_node_free(hnode_t *node, ATTRIBUTE_UNUSED void *ctx) { |
803 | 0 | free((void *) hnode_getkey(node)); |
804 | 0 | free(node); |
805 | 0 | } |
806 | | |
807 | 0 | static state_triple_hash *state_triple_init(void) { |
808 | 0 | state_triple_hash *hash; |
809 | |
|
810 | 0 | hash = hash_create(HASHCOUNT_T_MAX, pair_cmp, pair_hash); |
811 | 0 | if (hash == NULL) |
812 | 0 | return NULL; |
813 | 0 | hash_set_allocator(hash, NULL, state_triple_node_free, NULL); |
814 | 0 | return hash; |
815 | 0 | } |
816 | | |
817 | | ATTRIBUTE_RETURN_CHECK |
818 | | static int state_triple_push(state_triple_hash *hash, |
819 | | struct state *s1, |
820 | | struct state *s2, |
821 | 0 | struct state *s3) { |
822 | 0 | struct state **pair; |
823 | 0 | if (ALLOC_N(pair, 2) < 0) |
824 | 0 | return -1; |
825 | 0 | pair[0] = s1; |
826 | 0 | pair[1] = s2; |
827 | 0 | return hash_alloc_insert(hash, pair, s3); |
828 | 0 | } |
829 | | |
830 | | static struct state * state_triple_thd(state_triple_hash *hash, |
831 | | struct state *s1, |
832 | 0 | struct state *s2) { |
833 | 0 | struct state *pair[2]; |
834 | 0 | hnode_t *node; |
835 | 0 | pair[0] = s1; |
836 | 0 | pair[1] = s2; |
837 | 0 | node = hash_lookup(hash, pair); |
838 | 0 | return (node == NULL) ? NULL : (struct state *) hnode_get(node); |
839 | 0 | } |
840 | | |
841 | 0 | static void state_triple_free(state_triple_hash *hash) { |
842 | 0 | if (hash != NULL) { |
843 | 0 | hash_free_nodes(hash); |
844 | 0 | hash_destroy(hash); |
845 | 0 | } |
846 | 0 | } |
847 | | |
848 | | /* |
849 | | * State operations |
850 | | */ |
851 | | ATTRIBUTE_RETURN_CHECK |
852 | 0 | static int mark_reachable(struct fa *fa) { |
853 | 0 | struct state_set *worklist = state_set_init(-1, S_NONE); |
854 | 0 | int result = -1; |
855 | |
|
856 | 0 | E(worklist == NULL); |
857 | | |
858 | 0 | list_for_each(s, fa->initial) { |
859 | 0 | s->reachable = 0; |
860 | 0 | } |
861 | 0 | fa->initial->reachable = 1; |
862 | |
|
863 | 0 | for (struct state *s = fa->initial; |
864 | 0 | s != NULL; |
865 | 0 | s = state_set_pop(worklist)) { |
866 | 0 | for_each_trans(t, s) { |
867 | 0 | if (! t->to->reachable) { |
868 | 0 | t->to->reachable = 1; |
869 | 0 | F(state_set_push(worklist, t->to)); |
870 | 0 | } |
871 | 0 | } |
872 | 0 | } |
873 | 0 | result = 0; |
874 | |
|
875 | 0 | error: |
876 | 0 | state_set_free(worklist); |
877 | 0 | return result; |
878 | 0 | } |
879 | | |
880 | | /* Return all reachable states. As a sideeffect, all states have their |
881 | | REACHABLE flag set appropriately. |
882 | | */ |
883 | 0 | static struct state_set *fa_states(struct fa *fa) { |
884 | 0 | struct state_set *visited = state_set_init(-1, S_NONE); |
885 | 0 | int r; |
886 | |
|
887 | 0 | r = mark_reachable(fa); |
888 | 0 | E(visited == NULL || r < 0); |
889 | | |
890 | 0 | list_for_each(s, fa->initial) { |
891 | 0 | if (s->reachable) |
892 | 0 | F(state_set_push(visited, s)); |
893 | 0 | } |
894 | 0 | return visited; |
895 | 0 | error: |
896 | 0 | state_set_free(visited); |
897 | 0 | return NULL; |
898 | 0 | } |
899 | | |
900 | | /* Return all reachable accepting states. As a sideeffect, all states have |
901 | | their REACHABLE flag set appropriately. |
902 | | */ |
903 | 0 | static struct state_set *fa_accept_states(struct fa *fa) { |
904 | 0 | struct state_set *accept = state_set_init(-1, S_NONE); |
905 | 0 | int r; |
906 | |
|
907 | 0 | E(accept == NULL); |
908 | | |
909 | 0 | r = mark_reachable(fa); |
910 | 0 | E(r < 0); |
911 | | |
912 | 0 | list_for_each(s, fa->initial) { |
913 | 0 | if (s->reachable && s->accept) |
914 | 0 | F(state_set_push(accept, s)); |
915 | 0 | } |
916 | 0 | return accept; |
917 | 0 | error: |
918 | 0 | state_set_free(accept); |
919 | 0 | return NULL; |
920 | 0 | } |
921 | | |
922 | | /* Mark all live states, i.e. states from which an accepting state can be |
923 | | reached. All states have their REACHABLE and LIVE flags set |
924 | | appropriately. |
925 | | */ |
926 | | ATTRIBUTE_RETURN_CHECK |
927 | 0 | static int mark_live(struct fa *fa) { |
928 | 0 | int changed; |
929 | |
|
930 | 0 | F(mark_reachable(fa)); |
931 | | |
932 | 0 | list_for_each(s, fa->initial) { |
933 | 0 | s->live = s->reachable && s->accept; |
934 | 0 | } |
935 | |
|
936 | 0 | do { |
937 | 0 | changed = 0; |
938 | 0 | list_for_each(s, fa->initial) { |
939 | 0 | if (! s->live && s->reachable) { |
940 | 0 | for_each_trans(t, s) { |
941 | 0 | if (t->to->live) { |
942 | 0 | s->live = 1; |
943 | 0 | changed = 1; |
944 | 0 | break; |
945 | 0 | } |
946 | 0 | } |
947 | 0 | } |
948 | 0 | } |
949 | 0 | } while (changed); |
950 | 0 | return 0; |
951 | 0 | error: |
952 | 0 | return -1; |
953 | 0 | } |
954 | | |
955 | | /* |
956 | | * Reverse an automaton in place. Change FA so that it accepts the |
957 | | * language that is the reverse of the input automaton. |
958 | | * |
959 | | * Returns a list of the new initial states of the automaton. The list must |
960 | | * be freed by the caller. |
961 | | */ |
962 | 0 | static struct state_set *fa_reverse(struct fa *fa) { |
963 | 0 | struct state_set *all = NULL; |
964 | 0 | struct state_set *accept = NULL; |
965 | 0 | int r; |
966 | |
|
967 | 0 | all = fa_states(fa); |
968 | 0 | E(all == NULL); |
969 | 0 | accept = fa_accept_states(fa); |
970 | 0 | E(accept == NULL); |
971 | | |
972 | 0 | F(state_set_init_data(all)); |
973 | | |
974 | | /* Reverse all transitions */ |
975 | 0 | int *tused; |
976 | 0 | F(ALLOC_N(tused, all->used)); |
977 | 0 | for (int i=0; i < all->used; i++) { |
978 | 0 | all->data[i] = all->states[i]->trans; |
979 | 0 | tused[i] = all->states[i]->tused; |
980 | 0 | all->states[i]->trans = NULL; |
981 | 0 | all->states[i]->tsize = 0; |
982 | 0 | all->states[i]->tused = 0; |
983 | 0 | } |
984 | 0 | for (int i=0; i < all->used; i++) { |
985 | 0 | struct state *s = all->states[i]; |
986 | 0 | struct trans *t = all->data[i]; |
987 | 0 | s->accept = 0; |
988 | 0 | for (int j=0; j < tused[i]; j++) { |
989 | 0 | r = add_new_trans(t[j].to, s, t[j].min, t[j].max); |
990 | 0 | if (r < 0) |
991 | 0 | goto error; |
992 | 0 | } |
993 | 0 | free(t); |
994 | 0 | } |
995 | 0 | free(tused); |
996 | | |
997 | | /* Make new initial and final states */ |
998 | 0 | struct state *s = add_state(fa, 0); |
999 | 0 | E(s == NULL); |
1000 | | |
1001 | 0 | fa->initial->accept = 1; |
1002 | 0 | set_initial(fa, s); |
1003 | 0 | for (int i=0; i < accept->used; i++) { |
1004 | 0 | r = add_epsilon_trans(s, accept->states[i]); |
1005 | 0 | if (r < 0) |
1006 | 0 | goto error; |
1007 | 0 | } |
1008 | | |
1009 | 0 | fa->deterministic = 0; |
1010 | 0 | fa->minimal = 0; |
1011 | 0 | state_set_free(all); |
1012 | 0 | return accept; |
1013 | 0 | error: |
1014 | 0 | state_set_free(all); |
1015 | 0 | state_set_free(accept); |
1016 | 0 | return NULL; |
1017 | 0 | } |
1018 | | |
1019 | | /* |
1020 | | * Return a sorted array of all interval start points in FA. The returned |
1021 | | * array is a string (null terminated) |
1022 | | */ |
1023 | 0 | static uchar* start_points(struct fa *fa, int *npoints) { |
1024 | 0 | char pointset[UCHAR_NUM]; |
1025 | 0 | uchar *points = NULL; |
1026 | |
|
1027 | 0 | F(mark_reachable(fa)); |
1028 | 0 | MEMZERO(pointset, UCHAR_NUM); |
1029 | 0 | list_for_each(s, fa->initial) { |
1030 | 0 | if (! s->reachable) |
1031 | 0 | continue; |
1032 | 0 | pointset[0] = 1; |
1033 | 0 | for_each_trans(t, s) { |
1034 | 0 | pointset[t->min] = 1; |
1035 | 0 | if (t->max < UCHAR_MAX) |
1036 | 0 | pointset[t->max+1] = 1; |
1037 | 0 | } |
1038 | 0 | } |
1039 | |
|
1040 | 0 | *npoints = 0; |
1041 | 0 | for(int i=0; i < UCHAR_NUM; *npoints += pointset[i], i++); |
1042 | |
|
1043 | 0 | F(ALLOC_N(points, *npoints+1)); |
1044 | 0 | for (int i=0, n=0; i < UCHAR_NUM; i++) { |
1045 | 0 | if (pointset[i]) |
1046 | 0 | points[n++] = (uchar) i; |
1047 | 0 | } |
1048 | |
|
1049 | 0 | return points; |
1050 | 0 | error: |
1051 | 0 | free(points); |
1052 | 0 | return NULL; |
1053 | 0 | } |
1054 | | |
1055 | | /* |
1056 | | * Operations on STATE_SET_HASH |
1057 | | */ |
1058 | | static int state_set_hash_contains(state_set_hash *smap, |
1059 | 0 | struct state_set *set) { |
1060 | 0 | return hash_lookup(smap, set) != NULL; |
1061 | 0 | } |
1062 | | |
1063 | | /* |
1064 | | * Find the set in SMAP that has the same states as SET. If the two are |
1065 | | * different, i.e. they point to different memory locations, free SET and |
1066 | | * return the set found in SMAP |
1067 | | */ |
1068 | | static struct state_set *state_set_hash_uniq(state_set_hash *smap, |
1069 | 0 | struct state_set *set) { |
1070 | 0 | hnode_t *node = hash_lookup(smap, set); |
1071 | 0 | const struct state_set *orig_set = hnode_getkey(node); |
1072 | 0 | if (orig_set != set) { |
1073 | 0 | state_set_free(set); |
1074 | 0 | } |
1075 | 0 | return (struct state_set *) orig_set; |
1076 | 0 | } |
1077 | | |
1078 | | static struct state *state_set_hash_get_state(state_set_hash *smap, |
1079 | 0 | struct state_set *set) { |
1080 | 0 | hnode_t *node = hash_lookup(smap, set); |
1081 | 0 | return (struct state *) hnode_get(node); |
1082 | 0 | } |
1083 | | |
1084 | 0 | static hash_val_t set_hash(const void *key) { |
1085 | 0 | hash_val_t hash = 0; |
1086 | 0 | const struct state_set *set = key; |
1087 | |
|
1088 | 0 | for (int i = 0; i < set->used; i++) { |
1089 | 0 | hash += set->states[i]->hash; |
1090 | 0 | } |
1091 | 0 | return hash; |
1092 | 0 | } |
1093 | | |
1094 | 0 | static int set_cmp(const void *key1, const void *key2) { |
1095 | 0 | const struct state_set *set1 = key1; |
1096 | 0 | const struct state_set *set2 = key2; |
1097 | |
|
1098 | 0 | return state_set_equal(set1, set2) ? 0 : 1; |
1099 | 0 | } |
1100 | | |
1101 | 0 | static void set_destroy(hnode_t *node, ATTRIBUTE_UNUSED void *ctx) { |
1102 | 0 | struct state_set *set = (struct state_set *) hnode_getkey(node); |
1103 | 0 | state_set_free(set); |
1104 | 0 | free(node); |
1105 | 0 | } |
1106 | | |
1107 | | ATTRIBUTE_RETURN_CHECK |
1108 | | static int state_set_hash_add(state_set_hash **smap, |
1109 | 0 | struct state_set *set, struct fa *fa) { |
1110 | 0 | if (*smap == NULL) { |
1111 | 0 | *smap = hash_create(HASHCOUNT_T_MAX, set_cmp, set_hash); |
1112 | 0 | E(*smap == NULL); |
1113 | 0 | hash_set_allocator(*smap, NULL, set_destroy, NULL); |
1114 | 0 | } |
1115 | 0 | struct state *s = add_state(fa, 0); |
1116 | 0 | E(s == NULL); |
1117 | 0 | F(hash_alloc_insert(*smap, set, s)); |
1118 | 0 | return 0; |
1119 | 0 | error: |
1120 | 0 | return -1; |
1121 | 0 | } |
1122 | | |
1123 | | static void state_set_hash_free(state_set_hash *smap, |
1124 | 0 | struct state_set *protect) { |
1125 | 0 | if (protect != NULL) { |
1126 | 0 | hnode_t *node = hash_lookup(smap, protect); |
1127 | 0 | hash_delete(smap, node); |
1128 | 0 | hnode_getkey(node) = NULL; |
1129 | 0 | set_destroy(node, NULL); |
1130 | 0 | } |
1131 | 0 | hash_free_nodes(smap); |
1132 | 0 | hash_destroy(smap); |
1133 | 0 | } |
1134 | | |
1135 | | static int state_set_list_add(struct state_set_list **list, |
1136 | 0 | struct state_set *set) { |
1137 | 0 | struct state_set_list *elt; |
1138 | 0 | if (ALLOC(elt) < 0) |
1139 | 0 | return -1; |
1140 | 0 | elt->set = set; |
1141 | 0 | list_cons(*list, elt); |
1142 | 0 | return 0; |
1143 | 0 | } |
1144 | | |
1145 | 0 | static struct state_set *state_set_list_pop(struct state_set_list **list) { |
1146 | 0 | struct state_set_list *elt = *list; |
1147 | 0 | struct state_set *set = elt->set; |
1148 | |
|
1149 | 0 | *list = elt->next; |
1150 | 0 | free(elt); |
1151 | 0 | return set; |
1152 | 0 | } |
1153 | | |
1154 | | /* Compare transitions lexicographically by (to, min, reverse max) */ |
1155 | 0 | static int trans_to_cmp(const void *v1, const void *v2) { |
1156 | 0 | const struct trans *t1 = v1; |
1157 | 0 | const struct trans *t2 = v2; |
1158 | |
|
1159 | 0 | if (t1->to != t2->to) { |
1160 | 0 | return (t1->to < t2->to) ? -1 : 1; |
1161 | 0 | } |
1162 | 0 | if (t1->min < t2->min) |
1163 | 0 | return -1; |
1164 | 0 | if (t1->min > t2->min) |
1165 | 0 | return 1; |
1166 | 0 | if (t1->max > t2->max) |
1167 | 0 | return -1; |
1168 | 0 | return (t1->max < t2->max) ? 1 : 0; |
1169 | 0 | } |
1170 | | |
1171 | | /* Compare transitions lexicographically by (min, reverse max, to) */ |
1172 | 0 | static int trans_intv_cmp(const void *v1, const void *v2) { |
1173 | 0 | const struct trans *t1 = v1; |
1174 | 0 | const struct trans *t2 = v2; |
1175 | |
|
1176 | 0 | if (t1->min < t2->min) |
1177 | 0 | return -1; |
1178 | 0 | if (t1->min > t2->min) |
1179 | 0 | return 1; |
1180 | 0 | if (t1->max > t2->max) |
1181 | 0 | return -1; |
1182 | 0 | if (t1->max < t2->max) |
1183 | 0 | return 1; |
1184 | 0 | if (t1->to != t2->to) { |
1185 | 0 | return (t1->to < t2->to) ? -1 : 1; |
1186 | 0 | } |
1187 | 0 | return 0; |
1188 | 0 | } |
1189 | | |
1190 | | /* |
1191 | | * Reduce an automaton by combining overlapping and adjacent edge intervals |
1192 | | * with the same destination. |
1193 | | */ |
1194 | 0 | static void reduce(struct fa *fa) { |
1195 | 0 | list_for_each(s, fa->initial) { |
1196 | 0 | if (s->tused == 0) |
1197 | 0 | continue; |
1198 | | |
1199 | 0 | qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp); |
1200 | 0 | int i=0, j=1; |
1201 | 0 | struct trans *t = s->trans; |
1202 | 0 | while (j < s->tused) { |
1203 | 0 | if (t[i].to == t[j].to && t[j].min <= t[i].max + 1) { |
1204 | 0 | if (t[j].max > t[i].max) |
1205 | 0 | t[i].max = t[j].max; |
1206 | 0 | j += 1; |
1207 | 0 | } else { |
1208 | 0 | i += 1; |
1209 | 0 | if (i < j) |
1210 | 0 | memmove(s->trans + i, s->trans + j, |
1211 | 0 | sizeof(*s->trans) * (s->tused - j)); |
1212 | 0 | s->tused -= j - i; |
1213 | 0 | j = i + 1; |
1214 | 0 | } |
1215 | 0 | } |
1216 | 0 | s->tused = i+1; |
1217 | | /* Shrink if we use less than half the allocated size */ |
1218 | 0 | if (s->tsize > array_initial_size && 2*s->tused < s->tsize) { |
1219 | 0 | int r; |
1220 | 0 | r = REALLOC_N(s->trans, s->tsize); |
1221 | 0 | if (r == 0) |
1222 | 0 | s->tsize = s->tused; |
1223 | 0 | } |
1224 | 0 | } |
1225 | 0 | } |
1226 | | |
1227 | | /* |
1228 | | * Remove dead transitions from an FA; a transition is dead is it does not |
1229 | | * lead to a live state. This also removes any states that are not |
1230 | | * reachable any longer from FA->INITIAL. |
1231 | | * |
1232 | | * Returns the same FA as a convenience |
1233 | | */ |
1234 | | |
1235 | 0 | static void collect_trans(struct fa *fa) { |
1236 | 0 | list_for_each(s, fa->initial) { |
1237 | 0 | if (! s->live) { |
1238 | 0 | free_trans(s); |
1239 | 0 | } else { |
1240 | 0 | int i=0; |
1241 | 0 | while (i < s->tused) { |
1242 | 0 | if (! s->trans[i].to->live) { |
1243 | 0 | s->tused -= 1; |
1244 | 0 | memmove(s->trans + i, s->trans + s->tused, |
1245 | 0 | sizeof(*s->trans)); |
1246 | 0 | } else { |
1247 | 0 | i += 1; |
1248 | 0 | } |
1249 | 0 | } |
1250 | 0 | } |
1251 | 0 | } |
1252 | 0 | } |
1253 | | |
1254 | 0 | static void collect_dead_states(struct fa *fa) { |
1255 | | /* Remove all dead states and free their storage */ |
1256 | 0 | for (struct state *s = fa->initial; s->next != NULL; ) { |
1257 | 0 | if (! s->next->live) { |
1258 | 0 | struct state *del = s->next; |
1259 | 0 | s->next = del->next; |
1260 | 0 | free_trans(del); |
1261 | 0 | free(del); |
1262 | 0 | } else { |
1263 | 0 | s = s->next; |
1264 | 0 | } |
1265 | 0 | } |
1266 | 0 | } |
1267 | | |
1268 | 0 | static int collect(struct fa *fa) { |
1269 | 0 | F(mark_live(fa)); |
1270 | | |
1271 | 0 | if (! fa->initial->live) { |
1272 | | /* This automaton accepts nothing, make it the canonical |
1273 | | * epsilon automaton |
1274 | | */ |
1275 | 0 | list_for_each(s, fa->initial) { |
1276 | 0 | free_trans(s); |
1277 | 0 | } |
1278 | 0 | list_free(fa->initial->next); |
1279 | 0 | fa->deterministic = 1; |
1280 | 0 | } else { |
1281 | 0 | collect_trans(fa); |
1282 | 0 | collect_dead_states(fa); |
1283 | 0 | } |
1284 | 0 | reduce(fa); |
1285 | 0 | return 0; |
1286 | 0 | error: |
1287 | 0 | return -1; |
1288 | 0 | } |
1289 | | |
1290 | 0 | static void swap_initial(struct fa *fa) { |
1291 | 0 | struct state *s = fa->initial; |
1292 | 0 | if (s->next != NULL) { |
1293 | 0 | fa->initial = s->next; |
1294 | 0 | s->next = fa->initial->next; |
1295 | 0 | fa->initial->next = s; |
1296 | 0 | } |
1297 | 0 | } |
1298 | | |
1299 | | /* |
1300 | | * Make a finite automaton deterministic using the given set of initial |
1301 | | * states with the subset construction. This also eliminates dead states |
1302 | | * and transitions and reduces and orders the transitions for each state |
1303 | | */ |
1304 | 0 | static int determinize(struct fa *fa, struct state_set *ini) { |
1305 | 0 | int npoints; |
1306 | 0 | int make_ini = (ini == NULL); |
1307 | 0 | const uchar *points = NULL; |
1308 | 0 | state_set_hash *newstate = NULL; |
1309 | 0 | struct state_set_list *worklist = NULL; |
1310 | 0 | int ret = 0; |
1311 | |
|
1312 | 0 | if (fa->deterministic) |
1313 | 0 | return 0; |
1314 | | |
1315 | 0 | points = start_points(fa, &npoints); |
1316 | 0 | E(points == NULL); |
1317 | 0 | if (make_ini) { |
1318 | 0 | ini = state_set_init(-1, S_NONE); |
1319 | 0 | if (ini == NULL || state_set_push(ini, fa->initial) < 0) { |
1320 | 0 | state_set_free(ini); |
1321 | 0 | goto error; |
1322 | 0 | } |
1323 | 0 | } |
1324 | | |
1325 | 0 | F(state_set_list_add(&worklist, ini)); |
1326 | 0 | F(state_set_hash_add(&newstate, ini, fa)); |
1327 | | // Make the new state the initial state |
1328 | 0 | swap_initial(fa); |
1329 | 0 | while (worklist != NULL) { |
1330 | 0 | struct state_set *sset = state_set_list_pop(&worklist); |
1331 | 0 | struct state *r = state_set_hash_get_state(newstate, sset); |
1332 | 0 | for (int q=0; q < sset->used; q++) { |
1333 | 0 | r->accept |= sset->states[q]->accept; |
1334 | 0 | } |
1335 | 0 | for (int n=0; n < npoints; n++) { |
1336 | 0 | struct state_set *pset = state_set_init(-1, S_SORTED); |
1337 | 0 | E(pset == NULL); |
1338 | 0 | for(int q=0 ; q < sset->used; q++) { |
1339 | 0 | for_each_trans(t, sset->states[q]) { |
1340 | 0 | if (t->min <= points[n] && points[n] <= t->max) { |
1341 | 0 | F(state_set_add(pset, t->to)); |
1342 | 0 | } |
1343 | 0 | } |
1344 | 0 | } |
1345 | 0 | if (!state_set_hash_contains(newstate, pset)) { |
1346 | 0 | F(state_set_list_add(&worklist, pset)); |
1347 | 0 | F(state_set_hash_add(&newstate, pset, fa)); |
1348 | 0 | } |
1349 | 0 | pset = state_set_hash_uniq(newstate, pset); |
1350 | |
|
1351 | 0 | struct state *q = state_set_hash_get_state(newstate, pset); |
1352 | 0 | uchar min = points[n]; |
1353 | 0 | uchar max = UCHAR_MAX; |
1354 | 0 | if (n+1 < npoints) |
1355 | 0 | max = points[n+1] - 1; |
1356 | 0 | if (add_new_trans(r, q, min, max) < 0) |
1357 | 0 | goto error; |
1358 | 0 | } |
1359 | 0 | } |
1360 | 0 | fa->deterministic = 1; |
1361 | |
|
1362 | 0 | done: |
1363 | 0 | if (newstate) |
1364 | 0 | state_set_hash_free(newstate, make_ini ? NULL : ini); |
1365 | 0 | free((void *) points); |
1366 | 0 | if (collect(fa) < 0) |
1367 | 0 | ret = -1; |
1368 | 0 | return ret; |
1369 | 0 | error: |
1370 | 0 | ret = -1; |
1371 | 0 | goto done; |
1372 | 0 | } |
1373 | | |
1374 | | /* |
1375 | | * Minimization. As a sideeffect of minimization, the transitions are |
1376 | | * reduced and ordered. |
1377 | | */ |
1378 | | |
1379 | 0 | static struct state *step(struct state *s, uchar c) { |
1380 | 0 | for_each_trans(t, s) { |
1381 | 0 | if (t->min <= c && c <= t->max) |
1382 | 0 | return t->to; |
1383 | 0 | } |
1384 | 0 | return NULL; |
1385 | 0 | } |
1386 | | |
1387 | | struct state_list { |
1388 | | struct state_list_node *first; |
1389 | | struct state_list_node *last; |
1390 | | unsigned int size; |
1391 | | }; |
1392 | | |
1393 | | struct state_list_node { |
1394 | | struct state_list *sl; |
1395 | | struct state_list_node *next; |
1396 | | struct state_list_node *prev; |
1397 | | struct state *state; |
1398 | | }; |
1399 | | |
1400 | | static struct state_list_node *state_list_add(struct state_list *sl, |
1401 | 0 | struct state *s) { |
1402 | 0 | struct state_list_node *n; |
1403 | |
|
1404 | 0 | if (ALLOC(n) < 0) |
1405 | 0 | return NULL; |
1406 | | |
1407 | 0 | n->state = s; |
1408 | 0 | n->sl = sl; |
1409 | |
|
1410 | 0 | if (sl->size++ == 0) { |
1411 | 0 | sl->first = n; |
1412 | 0 | sl->last = n; |
1413 | 0 | } else { |
1414 | 0 | sl->last->next = n; |
1415 | 0 | n->prev = sl->last; |
1416 | 0 | sl->last = n; |
1417 | 0 | } |
1418 | 0 | return n; |
1419 | 0 | } |
1420 | | |
1421 | 0 | static void state_list_remove(struct state_list_node *n) { |
1422 | 0 | struct state_list *sl = n->sl; |
1423 | 0 | sl->size -= 1; |
1424 | 0 | if (sl->first == n) |
1425 | 0 | sl->first = n->next; |
1426 | 0 | else |
1427 | 0 | n->prev->next = n->next; |
1428 | 0 | if (sl->last == n) |
1429 | 0 | sl->last = n->prev; |
1430 | 0 | else |
1431 | 0 | n->next->prev = n->prev; |
1432 | |
|
1433 | 0 | free(n); |
1434 | 0 | } |
1435 | | |
1436 | 0 | static void state_list_free(struct state_list *sl) { |
1437 | 0 | if (sl) |
1438 | 0 | list_free(sl->first); |
1439 | 0 | free(sl); |
1440 | 0 | } |
1441 | | |
1442 | | /* The linear index of element (q,c) in an NSTATES * NSIGMA matrix */ |
1443 | 0 | #define INDEX(q, c) (q * nsigma + c) |
1444 | | |
1445 | 0 | static int minimize_hopcroft(struct fa *fa) { |
1446 | 0 | struct state_set *states = NULL; |
1447 | 0 | uchar *sigma = NULL; |
1448 | 0 | struct state_set **reverse = NULL; |
1449 | 0 | bitset *reverse_nonempty = NULL; |
1450 | 0 | struct state_set **partition = NULL; |
1451 | 0 | unsigned int *block = NULL; |
1452 | 0 | struct state_list **active = NULL; |
1453 | 0 | struct state_list_node **active2 = NULL; |
1454 | 0 | int *pending = NULL; |
1455 | 0 | bitset *pending2 = NULL; |
1456 | 0 | struct state_set *split = NULL; |
1457 | 0 | bitset *split2 = NULL; |
1458 | 0 | int *refine = NULL; |
1459 | 0 | bitset *refine2 = NULL; |
1460 | 0 | struct state_set **splitblock = NULL; |
1461 | 0 | struct state_set *newstates = NULL; |
1462 | 0 | int *nsnum = NULL; |
1463 | 0 | int *nsind = NULL; |
1464 | 0 | int result = -1; |
1465 | 0 | unsigned int nstates = 0; |
1466 | 0 | int nsigma = 0; |
1467 | |
|
1468 | 0 | F(determinize(fa, NULL)); |
1469 | | |
1470 | | /* Total automaton, nothing to do */ |
1471 | 0 | if (fa->initial->tused == 1 |
1472 | 0 | && fa->initial->trans[0].to == fa->initial |
1473 | 0 | && fa->initial->trans[0].min == UCHAR_MIN |
1474 | 0 | && fa->initial->trans[0].max == UCHAR_MAX) |
1475 | 0 | return 0; |
1476 | | |
1477 | 0 | F(totalize(fa)); |
1478 | | |
1479 | | /* make arrays for numbered states and effective alphabet */ |
1480 | 0 | states = state_set_init(-1, S_NONE); |
1481 | 0 | E(states == NULL); |
1482 | | |
1483 | 0 | list_for_each(s, fa->initial) { |
1484 | 0 | F(state_set_push(states, s)); |
1485 | 0 | } |
1486 | 0 | nstates = states->used; |
1487 | |
|
1488 | 0 | sigma = start_points(fa, &nsigma); |
1489 | 0 | E(sigma == NULL); |
1490 | | |
1491 | | /* initialize data structures */ |
1492 | | |
1493 | | /* An ss->used x nsigma matrix of lists of states */ |
1494 | 0 | F(ALLOC_N(reverse, nstates * nsigma)); |
1495 | 0 | reverse_nonempty = bitset_init(nstates * nsigma); |
1496 | 0 | E(reverse_nonempty == NULL); |
1497 | 0 | F(ALLOC_N(partition, nstates)); |
1498 | 0 | F(ALLOC_N(block, nstates)); |
1499 | | |
1500 | 0 | F(ALLOC_N(active, nstates * nsigma)); |
1501 | 0 | F(ALLOC_N(active2, nstates * nsigma)); |
1502 | | |
1503 | | /* PENDING is an array of pairs of ints. The i'th pair is stored in |
1504 | | * PENDING[2*i] and PENDING[2*i + 1]. There are NPENDING pairs in |
1505 | | * PENDING at any time. SPENDING is the maximum number of pairs |
1506 | | * allocated for PENDING. |
1507 | | */ |
1508 | 0 | size_t npending = 0, spending = 0; |
1509 | 0 | pending2 = bitset_init(nstates * nsigma); |
1510 | 0 | E(pending2 == NULL); |
1511 | | |
1512 | 0 | split = state_set_init(-1, S_NONE); |
1513 | 0 | split2 = bitset_init(nstates); |
1514 | 0 | E(split == NULL || split2 == NULL); |
1515 | | |
1516 | 0 | F(ALLOC_N(refine, nstates)); |
1517 | 0 | refine2 = bitset_init(nstates); |
1518 | 0 | E(refine2 == NULL); |
1519 | | |
1520 | 0 | F(ALLOC_N(splitblock, nstates)); |
1521 | | |
1522 | 0 | for (int q = 0; q < nstates; q++) { |
1523 | 0 | splitblock[q] = state_set_init(-1, S_NONE); |
1524 | 0 | partition[q] = state_set_init(-1, S_NONE); |
1525 | 0 | E(splitblock[q] == NULL || partition[q] == NULL); |
1526 | 0 | for (int x = 0; x < nsigma; x++) { |
1527 | 0 | reverse[INDEX(q, x)] = state_set_init(-1, S_NONE); |
1528 | 0 | E(reverse[INDEX(q, x)] == NULL); |
1529 | 0 | F(ALLOC_N(active[INDEX(q, x)], 1)); |
1530 | 0 | } |
1531 | 0 | } |
1532 | | |
1533 | | /* find initial partition and reverse edges */ |
1534 | 0 | for (int q = 0; q < nstates; q++) { |
1535 | 0 | struct state *qq = states->states[q]; |
1536 | 0 | int j; |
1537 | 0 | if (qq->accept) |
1538 | 0 | j = 0; |
1539 | 0 | else |
1540 | 0 | j = 1; |
1541 | 0 | F(state_set_push(partition[j], qq)); |
1542 | 0 | block[q] = j; |
1543 | 0 | for (int x = 0; x < nsigma; x++) { |
1544 | 0 | uchar y = sigma[x]; |
1545 | 0 | struct state *p = step(qq, y); |
1546 | 0 | assert(p != NULL); |
1547 | 0 | int pn = state_set_index(states, p); |
1548 | 0 | assert(pn >= 0); |
1549 | 0 | F(state_set_push(reverse[INDEX(pn, x)], qq)); |
1550 | 0 | bitset_set(reverse_nonempty, INDEX(pn, x)); |
1551 | 0 | } |
1552 | 0 | } |
1553 | | |
1554 | | /* initialize active sets */ |
1555 | 0 | for (int j = 0; j <= 1; j++) |
1556 | 0 | for (int x = 0; x < nsigma; x++) |
1557 | 0 | for (int q = 0; q < partition[j]->used; q++) { |
1558 | 0 | struct state *qq = partition[j]->states[q]; |
1559 | 0 | int qn = state_set_index(states, qq); |
1560 | 0 | if (bitset_get(reverse_nonempty, INDEX(qn, x))) { |
1561 | 0 | active2[INDEX(qn, x)] = |
1562 | 0 | state_list_add(active[INDEX(j, x)], qq); |
1563 | 0 | E(active2[INDEX(qn, x)] == NULL); |
1564 | 0 | } |
1565 | 0 | } |
1566 | | |
1567 | | /* initialize pending */ |
1568 | 0 | F(ALLOC_N(pending, 2*nsigma)); |
1569 | 0 | npending = nsigma; |
1570 | 0 | spending = nsigma; |
1571 | 0 | for (int x = 0; x < nsigma; x++) { |
1572 | 0 | int a0 = active[INDEX(0,x)]->size; |
1573 | 0 | int a1 = active[INDEX(1,x)]->size; |
1574 | 0 | int j; |
1575 | 0 | if (a0 <= a1) |
1576 | 0 | j = 0; |
1577 | 0 | else |
1578 | 0 | j = 1; |
1579 | 0 | pending[2*x] = j; |
1580 | 0 | pending[2*x+1] = x; |
1581 | 0 | bitset_set(pending2, INDEX(j, x)); |
1582 | 0 | } |
1583 | | |
1584 | | /* process pending until fixed point */ |
1585 | 0 | int k = 2; |
1586 | 0 | while (npending-- > 0) { |
1587 | 0 | int p = pending[2*npending]; |
1588 | 0 | int x = pending[2*npending+1]; |
1589 | 0 | bitset_clr(pending2, INDEX(p, x)); |
1590 | 0 | int ref = 0; |
1591 | | /* find states that need to be split off their blocks */ |
1592 | 0 | struct state_list *sh = active[INDEX(p,x)]; |
1593 | 0 | for (struct state_list_node *m = sh->first; m != NULL; m = m->next) { |
1594 | 0 | int q = state_set_index(states, m->state); |
1595 | 0 | struct state_set *rev = reverse[INDEX(q, x)]; |
1596 | 0 | for (int r =0; r < rev->used; r++) { |
1597 | 0 | struct state *rs = rev->states[r]; |
1598 | 0 | int s = state_set_index(states, rs); |
1599 | 0 | if (! bitset_get(split2, s)) { |
1600 | 0 | bitset_set(split2, s); |
1601 | 0 | F(state_set_push(split, rs)); |
1602 | 0 | int j = block[s]; |
1603 | 0 | F(state_set_push(splitblock[j], rs)); |
1604 | 0 | if (!bitset_get(refine2, j)) { |
1605 | 0 | bitset_set(refine2, j); |
1606 | 0 | refine[ref++] = j; |
1607 | 0 | } |
1608 | 0 | } |
1609 | 0 | } |
1610 | 0 | } |
1611 | | // refine blocks |
1612 | 0 | for(int rr=0; rr < ref; rr++) { |
1613 | 0 | int j = refine[rr]; |
1614 | 0 | struct state_set *sp = splitblock[j]; |
1615 | 0 | if (sp->used < partition[j]->used) { |
1616 | 0 | struct state_set *b1 = partition[j]; |
1617 | 0 | struct state_set *b2 = partition[k]; |
1618 | 0 | for (int s = 0; s < sp->used; s++) { |
1619 | 0 | state_set_remove(b1, sp->states[s]); |
1620 | 0 | F(state_set_push(b2, sp->states[s])); |
1621 | 0 | int snum = state_set_index(states, sp->states[s]); |
1622 | 0 | block[snum] = k; |
1623 | 0 | for (int c = 0; c < nsigma; c++) { |
1624 | 0 | struct state_list_node *sn = active2[INDEX(snum, c)]; |
1625 | 0 | if (sn != NULL && sn->sl == active[INDEX(j,c)]) { |
1626 | 0 | state_list_remove(sn); |
1627 | 0 | active2[INDEX(snum, c)] = |
1628 | 0 | state_list_add(active[INDEX(k, c)], |
1629 | 0 | sp->states[s]); |
1630 | 0 | E(active2[INDEX(snum, c)] == NULL); |
1631 | 0 | } |
1632 | 0 | } |
1633 | 0 | } |
1634 | | // update pending |
1635 | 0 | for (int c = 0; c < nsigma; c++) { |
1636 | 0 | int aj = active[INDEX(j, c)]->size; |
1637 | 0 | int ak = active[INDEX(k, c)]->size; |
1638 | 0 | if (npending + 1 > spending) { |
1639 | 0 | spending *= 2; |
1640 | 0 | F(REALLOC_N(pending, 2 * spending)); |
1641 | 0 | } |
1642 | 0 | pending[2*npending + 1] = c; |
1643 | 0 | if (!bitset_get(pending2, INDEX(j, c)) |
1644 | 0 | && 0 < aj && aj <= ak) { |
1645 | 0 | bitset_set(pending2, INDEX(j, c)); |
1646 | 0 | pending[2*npending] = j; |
1647 | 0 | } else { |
1648 | 0 | bitset_set(pending2, INDEX(k, c)); |
1649 | 0 | pending[2*npending] = k; |
1650 | 0 | } |
1651 | 0 | npending += 1; |
1652 | 0 | } |
1653 | 0 | k++; |
1654 | 0 | } |
1655 | 0 | for (int s = 0; s < sp->used; s++) { |
1656 | 0 | int snum = state_set_index(states, sp->states[s]); |
1657 | 0 | bitset_clr(split2, snum); |
1658 | 0 | } |
1659 | 0 | bitset_clr(refine2, j); |
1660 | 0 | sp->used = 0; |
1661 | 0 | } |
1662 | 0 | split->used = 0; |
1663 | 0 | } |
1664 | | |
1665 | | /* make a new state for each equivalence class, set initial state */ |
1666 | 0 | newstates = state_set_init(k, S_NONE); |
1667 | 0 | E(newstates == NULL); |
1668 | 0 | F(ALLOC_N(nsnum, k)); |
1669 | 0 | F(ALLOC_N(nsind, nstates)); |
1670 | | |
1671 | 0 | for (int n = 0; n < k; n++) { |
1672 | 0 | struct state *s = make_state(); |
1673 | 0 | E(s == NULL); |
1674 | 0 | newstates->states[n] = s; |
1675 | 0 | struct state_set *partn = partition[n]; |
1676 | 0 | for (int q=0; q < partn->used; q++) { |
1677 | 0 | struct state *qs = partn->states[q]; |
1678 | 0 | int qnum = state_set_index(states, qs); |
1679 | 0 | if (qs == fa->initial) |
1680 | 0 | s->live = 1; /* Abuse live to flag the new initial state */ |
1681 | 0 | nsnum[n] = qnum; /* select representative */ |
1682 | 0 | nsind[qnum] = n; /* and point from partition to new state */ |
1683 | 0 | } |
1684 | 0 | } |
1685 | | |
1686 | | /* build transitions and set acceptance */ |
1687 | 0 | for (int n = 0; n < k; n++) { |
1688 | 0 | struct state *s = newstates->states[n]; |
1689 | 0 | s->accept = states->states[nsnum[n]]->accept; |
1690 | 0 | for_each_trans(t, states->states[nsnum[n]]) { |
1691 | 0 | int toind = state_set_index(states, t->to); |
1692 | 0 | struct state *nto = newstates->states[nsind[toind]]; |
1693 | 0 | F(add_new_trans(s, nto, t->min, t->max)); |
1694 | 0 | } |
1695 | 0 | } |
1696 | | |
1697 | | /* Get rid of old states and transitions and turn NEWTSTATES into |
1698 | | a linked list */ |
1699 | 0 | gut(fa); |
1700 | 0 | for (int n=0; n < k; n++) |
1701 | 0 | if (newstates->states[n]->live) { |
1702 | 0 | struct state *ini = newstates->states[n]; |
1703 | 0 | newstates->states[n] = newstates->states[0]; |
1704 | 0 | newstates->states[0] = ini; |
1705 | 0 | } |
1706 | 0 | for (int n=0; n < k-1; n++) |
1707 | 0 | newstates->states[n]->next = newstates->states[n+1]; |
1708 | 0 | fa->initial = newstates->states[0]; |
1709 | |
|
1710 | 0 | result = 0; |
1711 | | |
1712 | | /* clean up */ |
1713 | 0 | done: |
1714 | 0 | free(nsind); |
1715 | 0 | free(nsnum); |
1716 | 0 | state_set_free(states); |
1717 | 0 | free(sigma); |
1718 | 0 | bitset_free(reverse_nonempty); |
1719 | 0 | free(block); |
1720 | 0 | for (int i=0; i < nstates*nsigma; i++) { |
1721 | 0 | if (reverse) |
1722 | 0 | state_set_free(reverse[i]); |
1723 | 0 | if (active) |
1724 | 0 | state_list_free(active[i]); |
1725 | 0 | } |
1726 | 0 | free(reverse); |
1727 | 0 | free(active); |
1728 | 0 | free(active2); |
1729 | 0 | free(pending); |
1730 | 0 | bitset_free(pending2); |
1731 | 0 | state_set_free(split); |
1732 | 0 | bitset_free(split2); |
1733 | 0 | free(refine); |
1734 | 0 | bitset_free(refine2); |
1735 | 0 | for (int q=0; q < nstates; q++) { |
1736 | 0 | if (splitblock) |
1737 | 0 | state_set_free(splitblock[q]); |
1738 | 0 | if (partition) |
1739 | 0 | state_set_free(partition[q]); |
1740 | 0 | } |
1741 | 0 | free(splitblock); |
1742 | 0 | free(partition); |
1743 | 0 | state_set_free(newstates); |
1744 | |
|
1745 | 0 | if (collect(fa) < 0) |
1746 | 0 | result = -1; |
1747 | 0 | return result; |
1748 | 0 | error: |
1749 | 0 | result = -1; |
1750 | 0 | goto done; |
1751 | 0 | } |
1752 | | |
1753 | 0 | static int minimize_brzozowski(struct fa *fa) { |
1754 | 0 | struct state_set *set; |
1755 | | |
1756 | | /* Minimize using Brzozowski's algorithm */ |
1757 | 0 | set = fa_reverse(fa); |
1758 | 0 | E(set == NULL); |
1759 | 0 | F(determinize(fa, set)); |
1760 | 0 | state_set_free(set); |
1761 | |
|
1762 | 0 | set = fa_reverse(fa); |
1763 | 0 | E(set == NULL); |
1764 | 0 | F(determinize(fa, set)); |
1765 | 0 | state_set_free(set); |
1766 | 0 | return 0; |
1767 | 0 | error: |
1768 | 0 | return -1; |
1769 | 0 | } |
1770 | | |
1771 | 0 | int fa_minimize(struct fa *fa) { |
1772 | 0 | int r; |
1773 | |
|
1774 | 0 | if (fa == NULL) |
1775 | 0 | return -1; |
1776 | 0 | if (fa->minimal) |
1777 | 0 | return 0; |
1778 | | |
1779 | 0 | if (fa_minimization_algorithm == FA_MIN_BRZOZOWSKI) { |
1780 | 0 | r = minimize_brzozowski(fa); |
1781 | 0 | } else { |
1782 | 0 | r = minimize_hopcroft(fa); |
1783 | 0 | } |
1784 | |
|
1785 | 0 | if (r == 0) |
1786 | 0 | fa->minimal = 1; |
1787 | 0 | return r; |
1788 | 0 | } |
1789 | | |
1790 | | /* |
1791 | | * Construction of fa |
1792 | | */ |
1793 | | |
1794 | 0 | static struct fa *fa_make_empty(void) { |
1795 | 0 | struct fa *fa; |
1796 | |
|
1797 | 0 | if (ALLOC(fa) < 0) |
1798 | 0 | return NULL; |
1799 | 0 | if (add_state(fa, 0) == NULL) { |
1800 | 0 | fa_free(fa); |
1801 | 0 | return NULL; |
1802 | 0 | } |
1803 | | /* Even though, technically, this fa is both minimal and deterministic, |
1804 | | * this function is also used to allocate new fa's which are then modified |
1805 | | * further. Rather than risk erroneously marking such an fa as minimal |
1806 | | * and deterministic, we do not do that here and take the minor hit if |
1807 | | * that should ever need to be determined for an actual empty fa |
1808 | | */ |
1809 | 0 | return fa; |
1810 | 0 | } |
1811 | | |
1812 | 0 | static struct fa *fa_make_epsilon(void) { |
1813 | 0 | struct fa *fa = fa_make_empty(); |
1814 | 0 | if (fa) { |
1815 | 0 | fa->initial->accept = 1; |
1816 | 0 | fa->deterministic= 1; |
1817 | 0 | fa->minimal = 1; |
1818 | 0 | } |
1819 | 0 | return fa; |
1820 | 0 | } |
1821 | | |
1822 | 0 | static struct fa *fa_make_char(uchar c) { |
1823 | 0 | struct fa *fa = fa_make_empty(); |
1824 | 0 | if (! fa) |
1825 | 0 | return NULL; |
1826 | | |
1827 | 0 | struct state *s = fa->initial; |
1828 | 0 | struct state *t = add_state(fa, 1); |
1829 | 0 | int r; |
1830 | |
|
1831 | 0 | if (t == NULL) |
1832 | 0 | goto error; |
1833 | | |
1834 | 0 | r = add_new_trans(s, t, c, c); |
1835 | 0 | if (r < 0) |
1836 | 0 | goto error; |
1837 | 0 | fa->deterministic = 1; |
1838 | 0 | fa->minimal = 1; |
1839 | 0 | return fa; |
1840 | 0 | error: |
1841 | 0 | fa_free(fa); |
1842 | 0 | return NULL; |
1843 | 0 | } |
1844 | | |
1845 | 0 | struct fa *fa_make_basic(unsigned int basic) { |
1846 | 0 | int r; |
1847 | |
|
1848 | 0 | if (basic == FA_EMPTY) { |
1849 | 0 | return fa_make_empty(); |
1850 | 0 | } else if (basic == FA_EPSILON) { |
1851 | 0 | return fa_make_epsilon(); |
1852 | 0 | } else if (basic == FA_TOTAL) { |
1853 | 0 | struct fa *fa = fa_make_epsilon(); |
1854 | 0 | r = add_new_trans(fa->initial, fa->initial, UCHAR_MIN, UCHAR_MAX); |
1855 | 0 | if (r < 0) { |
1856 | 0 | fa_free(fa); |
1857 | 0 | fa = NULL; |
1858 | 0 | } |
1859 | 0 | return fa; |
1860 | 0 | } |
1861 | 0 | return NULL; |
1862 | 0 | } |
1863 | | |
1864 | 0 | int fa_is_basic(struct fa *fa, unsigned int basic) { |
1865 | 0 | if (basic == FA_EMPTY) { |
1866 | 0 | return ! fa->initial->accept && fa->initial->tused == 0; |
1867 | 0 | } else if (basic == FA_EPSILON) { |
1868 | 0 | return fa->initial->accept && fa->initial->tused == 0; |
1869 | 0 | } else if (basic == FA_TOTAL) { |
1870 | 0 | if (! fa->initial->accept) |
1871 | 0 | return 0; |
1872 | 0 | if (fa->nocase) { |
1873 | 0 | if (fa->initial->tused != 2) |
1874 | 0 | return 0; |
1875 | 0 | struct trans *t1 = fa->initial->trans; |
1876 | 0 | struct trans *t2 = fa->initial->trans + 1; |
1877 | 0 | if (t1->to != fa->initial || t2->to != fa->initial) |
1878 | 0 | return 0; |
1879 | 0 | if (t2->max != UCHAR_MAX) { |
1880 | 0 | t1 = t2; |
1881 | 0 | t2 = fa->initial->trans; |
1882 | 0 | } |
1883 | 0 | return (t1->min == UCHAR_MIN && t1->max == 'A' - 1 && |
1884 | 0 | t2->min == 'Z' + 1 && t2->max == UCHAR_MAX); |
1885 | 0 | } else { |
1886 | 0 | struct trans *t = fa->initial->trans; |
1887 | 0 | return fa->initial->tused == 1 && |
1888 | 0 | t->to == fa->initial && |
1889 | 0 | t->min == UCHAR_MIN && t->max == UCHAR_MAX; |
1890 | 0 | } |
1891 | 0 | } |
1892 | 0 | return 0; |
1893 | 0 | } |
1894 | | |
1895 | 0 | static struct fa *fa_clone(struct fa *fa) { |
1896 | 0 | struct fa *result = NULL; |
1897 | 0 | struct state_set *set = state_set_init(-1, S_DATA|S_SORTED); |
1898 | 0 | int r; |
1899 | |
|
1900 | 0 | if (fa == NULL || set == NULL || ALLOC(result) < 0) |
1901 | 0 | goto error; |
1902 | | |
1903 | 0 | result->deterministic = fa->deterministic; |
1904 | 0 | result->minimal = fa->minimal; |
1905 | 0 | result->nocase = fa->nocase; |
1906 | 0 | list_for_each(s, fa->initial) { |
1907 | 0 | int i = state_set_push(set, s); |
1908 | 0 | E(i < 0); |
1909 | | |
1910 | 0 | struct state *q = add_state(result, s->accept); |
1911 | 0 | if (q == NULL) |
1912 | 0 | goto error; |
1913 | 0 | set->data[i] = q; |
1914 | 0 | q->live = s->live; |
1915 | 0 | q->reachable = s->reachable; |
1916 | 0 | } |
1917 | 0 | for (int i=0; i < set->used; i++) { |
1918 | 0 | struct state *s = set->states[i]; |
1919 | 0 | struct state *sc = set->data[i]; |
1920 | 0 | for_each_trans(t, s) { |
1921 | 0 | int to = state_set_index(set, t->to); |
1922 | 0 | assert(to >= 0); |
1923 | 0 | struct state *toc = set->data[to]; |
1924 | 0 | r = add_new_trans(sc, toc, t->min, t->max); |
1925 | 0 | if (r < 0) |
1926 | 0 | goto error; |
1927 | 0 | } |
1928 | 0 | } |
1929 | 0 | state_set_free(set); |
1930 | 0 | return result; |
1931 | 0 | error: |
1932 | 0 | state_set_free(set); |
1933 | 0 | fa_free(result); |
1934 | 0 | return NULL; |
1935 | 0 | } |
1936 | | |
1937 | | static int case_expand(struct fa *fa); |
1938 | | |
1939 | | /* Compute FA1|FA2 and set FA1 to that automaton. FA2 is freed */ |
1940 | | ATTRIBUTE_RETURN_CHECK |
1941 | 0 | static int union_in_place(struct fa *fa1, struct fa **fa2) { |
1942 | 0 | struct state *s; |
1943 | 0 | int r; |
1944 | |
|
1945 | 0 | if (fa1->nocase != (*fa2)->nocase) { |
1946 | 0 | if (case_expand(fa1) < 0) |
1947 | 0 | return -1; |
1948 | 0 | if (case_expand(*fa2) < 0) |
1949 | 0 | return -1; |
1950 | 0 | } |
1951 | | |
1952 | 0 | s = add_state(fa1, 0); |
1953 | 0 | if (s == NULL) |
1954 | 0 | return -1; |
1955 | 0 | r = add_epsilon_trans(s, fa1->initial); |
1956 | 0 | if (r < 0) |
1957 | 0 | return -1; |
1958 | 0 | r = add_epsilon_trans(s, (*fa2)->initial); |
1959 | 0 | if (r < 0) |
1960 | 0 | return -1; |
1961 | | |
1962 | 0 | fa1->deterministic = 0; |
1963 | 0 | fa1->minimal = 0; |
1964 | 0 | fa_merge(fa1, fa2); |
1965 | |
|
1966 | 0 | set_initial(fa1, s); |
1967 | |
|
1968 | 0 | return 0; |
1969 | 0 | } |
1970 | | |
1971 | 0 | struct fa *fa_union(struct fa *fa1, struct fa *fa2) { |
1972 | 0 | fa1 = fa_clone(fa1); |
1973 | 0 | fa2 = fa_clone(fa2); |
1974 | 0 | if (fa1 == NULL || fa2 == NULL) |
1975 | 0 | goto error; |
1976 | | |
1977 | 0 | F(union_in_place(fa1, &fa2)); |
1978 | | |
1979 | 0 | return fa1; |
1980 | 0 | error: |
1981 | 0 | fa_free(fa1); |
1982 | 0 | fa_free(fa2); |
1983 | 0 | return NULL; |
1984 | 0 | } |
1985 | | |
1986 | | /* Concat FA2 onto FA1; frees FA2 and changes FA1 to FA1.FA2 */ |
1987 | | ATTRIBUTE_RETURN_CHECK |
1988 | 0 | static int concat_in_place(struct fa *fa1, struct fa **fa2) { |
1989 | 0 | int r; |
1990 | |
|
1991 | 0 | if (fa1->nocase != (*fa2)->nocase) { |
1992 | 0 | if (case_expand(fa1) < 0) |
1993 | 0 | return -1; |
1994 | 0 | if (case_expand(*fa2) < 0) |
1995 | 0 | return -1; |
1996 | 0 | } |
1997 | | |
1998 | 0 | list_for_each(s, fa1->initial) { |
1999 | 0 | if (s->accept) { |
2000 | 0 | s->accept = 0; |
2001 | 0 | r = add_epsilon_trans(s, (*fa2)->initial); |
2002 | 0 | if (r < 0) |
2003 | 0 | return -1; |
2004 | 0 | } |
2005 | 0 | } |
2006 | | |
2007 | 0 | fa1->deterministic = 0; |
2008 | 0 | fa1->minimal = 0; |
2009 | 0 | fa_merge(fa1, fa2); |
2010 | |
|
2011 | 0 | return 0; |
2012 | 0 | } |
2013 | | |
2014 | 0 | struct fa *fa_concat(struct fa *fa1, struct fa *fa2) { |
2015 | 0 | fa1 = fa_clone(fa1); |
2016 | 0 | fa2 = fa_clone(fa2); |
2017 | |
|
2018 | 0 | if (fa1 == NULL || fa2 == NULL) |
2019 | 0 | goto error; |
2020 | | |
2021 | 0 | F(concat_in_place(fa1, &fa2)); |
2022 | | |
2023 | 0 | F(collect(fa1)); |
2024 | | |
2025 | 0 | return fa1; |
2026 | | |
2027 | 0 | error: |
2028 | 0 | fa_free(fa1); |
2029 | 0 | fa_free(fa2); |
2030 | 0 | return NULL; |
2031 | 0 | } |
2032 | | |
2033 | 0 | static struct fa *fa_make_char_set(bitset *cset, int negate) { |
2034 | 0 | struct fa *fa = fa_make_empty(); |
2035 | 0 | if (!fa) |
2036 | 0 | return NULL; |
2037 | | |
2038 | 0 | struct state *s = fa->initial; |
2039 | 0 | struct state *t = add_state(fa, 1); |
2040 | 0 | int from = 0; |
2041 | 0 | int r; |
2042 | |
|
2043 | 0 | if (t == NULL) |
2044 | 0 | goto error; |
2045 | | |
2046 | 0 | while (from <= UCHAR_MAX) { |
2047 | 0 | while (from <= UCHAR_MAX && bitset_get(cset, from) == negate) |
2048 | 0 | from += 1; |
2049 | 0 | if (from > UCHAR_MAX) |
2050 | 0 | break; |
2051 | 0 | int to = from; |
2052 | 0 | while (to < UCHAR_MAX && (bitset_get(cset, to + 1) == !negate)) |
2053 | 0 | to += 1; |
2054 | 0 | r = add_new_trans(s, t, from, to); |
2055 | 0 | if (r < 0) |
2056 | 0 | goto error; |
2057 | 0 | from = to + 1; |
2058 | 0 | } |
2059 | | |
2060 | 0 | fa->deterministic = 1; |
2061 | 0 | fa->minimal = 1; |
2062 | 0 | return fa; |
2063 | | |
2064 | 0 | error: |
2065 | 0 | fa_free(fa); |
2066 | 0 | return NULL; |
2067 | 0 | } |
2068 | | |
2069 | 0 | static struct fa *fa_star(struct fa *fa) { |
2070 | 0 | struct state *s; |
2071 | 0 | int r; |
2072 | |
|
2073 | 0 | fa = fa_clone(fa); |
2074 | 0 | if (fa == NULL) |
2075 | 0 | return NULL; |
2076 | | |
2077 | 0 | s = add_state(fa, 1); |
2078 | 0 | if (s == NULL) |
2079 | 0 | goto error; |
2080 | | |
2081 | 0 | r = add_epsilon_trans(s, fa->initial); |
2082 | 0 | if (r < 0) |
2083 | 0 | goto error; |
2084 | | |
2085 | 0 | set_initial(fa, s); |
2086 | 0 | list_for_each(p, fa->initial->next) { |
2087 | 0 | if (p->accept) { |
2088 | 0 | r = add_epsilon_trans(p, s); |
2089 | 0 | if (r < 0) |
2090 | 0 | goto error; |
2091 | 0 | } |
2092 | 0 | } |
2093 | 0 | fa->deterministic = 0; |
2094 | 0 | fa->minimal = 0; |
2095 | |
|
2096 | 0 | return fa; |
2097 | | |
2098 | 0 | error: |
2099 | 0 | fa_free(fa); |
2100 | 0 | return NULL; |
2101 | 0 | } |
2102 | | |
2103 | | /* Form the automaton (FA){N}; FA is not modified */ |
2104 | 0 | static struct fa *repeat(struct fa *fa, int n) { |
2105 | 0 | if (n == 0) { |
2106 | 0 | return fa_make_epsilon(); |
2107 | 0 | } else if (n == 1) { |
2108 | 0 | return fa_clone(fa); |
2109 | 0 | } else { |
2110 | 0 | struct fa *cfa = fa_clone(fa); |
2111 | 0 | if (cfa == NULL) |
2112 | 0 | return NULL; |
2113 | 0 | while (n > 1) { |
2114 | 0 | struct fa *tfa = fa_clone(fa); |
2115 | 0 | if (tfa == NULL) { |
2116 | 0 | fa_free(cfa); |
2117 | 0 | return NULL; |
2118 | 0 | } |
2119 | 0 | if (concat_in_place(cfa, &tfa) < 0) { |
2120 | 0 | fa_free(cfa); |
2121 | 0 | fa_free(tfa); |
2122 | 0 | return NULL; |
2123 | 0 | } |
2124 | 0 | n -= 1; |
2125 | 0 | } |
2126 | 0 | return cfa; |
2127 | 0 | } |
2128 | 0 | } |
2129 | | |
2130 | 0 | struct fa *fa_iter(struct fa *fa, int min, int max) { |
2131 | 0 | int r; |
2132 | |
|
2133 | 0 | if (min < 0) |
2134 | 0 | min = 0; |
2135 | |
|
2136 | 0 | if (min > max && max != -1) { |
2137 | 0 | return fa_make_empty(); |
2138 | 0 | } |
2139 | 0 | if (max == -1) { |
2140 | 0 | struct fa *sfa = fa_star(fa); |
2141 | 0 | if (min == 0) |
2142 | 0 | return sfa; |
2143 | 0 | if (! sfa) |
2144 | 0 | return NULL; |
2145 | 0 | struct fa *cfa = repeat(fa, min); |
2146 | 0 | if (! cfa) { |
2147 | 0 | fa_free(sfa); |
2148 | 0 | return NULL; |
2149 | 0 | } |
2150 | 0 | if (concat_in_place(cfa, &sfa) < 0) { |
2151 | 0 | fa_free(sfa); |
2152 | 0 | fa_free(cfa); |
2153 | 0 | return NULL; |
2154 | 0 | } |
2155 | 0 | return cfa; |
2156 | 0 | } else { |
2157 | 0 | struct fa *cfa = NULL; |
2158 | |
|
2159 | 0 | max -= min; |
2160 | 0 | cfa = repeat(fa, min); |
2161 | 0 | if (cfa == NULL) |
2162 | 0 | return NULL; |
2163 | 0 | if (max > 0) { |
2164 | 0 | struct fa *cfa2 = fa_clone(fa); |
2165 | 0 | if (cfa2 == NULL) { |
2166 | 0 | fa_free(cfa); |
2167 | 0 | return NULL; |
2168 | 0 | } |
2169 | 0 | while (max > 1) { |
2170 | 0 | struct fa *cfa3 = fa_clone(fa); |
2171 | 0 | if (cfa3 == NULL) { |
2172 | 0 | fa_free(cfa); |
2173 | 0 | fa_free(cfa2); |
2174 | 0 | return NULL; |
2175 | 0 | } |
2176 | 0 | list_for_each(s, cfa3->initial) { |
2177 | 0 | if (s->accept) { |
2178 | 0 | r = add_epsilon_trans(s, cfa2->initial); |
2179 | 0 | if (r < 0) { |
2180 | 0 | fa_free(cfa); |
2181 | 0 | fa_free(cfa2); |
2182 | 0 | fa_free(cfa3); |
2183 | 0 | return NULL; |
2184 | 0 | } |
2185 | 0 | } |
2186 | 0 | } |
2187 | 0 | fa_merge(cfa3, &cfa2); |
2188 | 0 | cfa2 = cfa3; |
2189 | 0 | max -= 1; |
2190 | 0 | } |
2191 | 0 | list_for_each(s, cfa->initial) { |
2192 | 0 | if (s->accept) { |
2193 | 0 | r = add_epsilon_trans(s, cfa2->initial); |
2194 | 0 | if (r < 0) { |
2195 | 0 | fa_free(cfa); |
2196 | 0 | fa_free(cfa2); |
2197 | 0 | return NULL; |
2198 | 0 | } |
2199 | 0 | } |
2200 | 0 | } |
2201 | 0 | fa_merge(cfa, &cfa2); |
2202 | 0 | cfa->deterministic = 0; |
2203 | 0 | cfa->minimal = 0; |
2204 | 0 | } |
2205 | 0 | if (collect(cfa) < 0) { |
2206 | 0 | fa_free(cfa); |
2207 | 0 | cfa = NULL; |
2208 | 0 | } |
2209 | 0 | return cfa; |
2210 | 0 | } |
2211 | 0 | } |
2212 | | |
2213 | 0 | static void sort_transition_intervals(struct fa *fa) { |
2214 | 0 | list_for_each(s, fa->initial) { |
2215 | 0 | qsort(s->trans, s->tused, sizeof(*s->trans), trans_intv_cmp); |
2216 | 0 | } |
2217 | 0 | } |
2218 | | |
2219 | 0 | struct fa *fa_intersect(struct fa *fa1, struct fa *fa2) { |
2220 | 0 | int ret; |
2221 | 0 | struct fa *fa = NULL; |
2222 | 0 | struct state_set *worklist = NULL; |
2223 | 0 | state_triple_hash *newstates = NULL; |
2224 | |
|
2225 | 0 | if (fa1 == fa2) |
2226 | 0 | return fa_clone(fa1); |
2227 | | |
2228 | 0 | if (fa_is_basic(fa1, FA_EMPTY) || fa_is_basic(fa2, FA_EMPTY)) |
2229 | 0 | return fa_make_empty(); |
2230 | | |
2231 | 0 | if (fa1->nocase != fa2->nocase) { |
2232 | 0 | F(case_expand(fa1)); |
2233 | 0 | F(case_expand(fa2)); |
2234 | 0 | } |
2235 | | |
2236 | 0 | fa = fa_make_empty(); |
2237 | 0 | worklist = state_set_init(-1, S_NONE); |
2238 | 0 | newstates = state_triple_init(); |
2239 | 0 | if (fa == NULL || worklist == NULL || newstates == NULL) |
2240 | 0 | goto error; |
2241 | | |
2242 | 0 | sort_transition_intervals(fa1); |
2243 | 0 | sort_transition_intervals(fa2); |
2244 | |
|
2245 | 0 | F(state_set_push(worklist, fa1->initial)); |
2246 | 0 | F(state_set_push(worklist, fa2->initial)); |
2247 | 0 | F(state_set_push(worklist, fa->initial)); |
2248 | 0 | F(state_triple_push(newstates, |
2249 | 0 | fa1->initial, fa2->initial, fa->initial)); |
2250 | 0 | while (worklist->used) { |
2251 | 0 | struct state *s = state_set_pop(worklist); |
2252 | 0 | struct state *p2 = state_set_pop(worklist); |
2253 | 0 | struct state *p1 = state_set_pop(worklist); |
2254 | 0 | s->accept = p1->accept && p2->accept; |
2255 | |
|
2256 | 0 | struct trans *t1 = p1->trans; |
2257 | 0 | struct trans *t2 = p2->trans; |
2258 | 0 | for (int n1 = 0, b2 = 0; n1 < p1->tused; n1++) { |
2259 | 0 | while (b2 < p2->tused && t2[b2].max < t1[n1].min) |
2260 | 0 | b2++; |
2261 | 0 | for (int n2 = b2; |
2262 | 0 | n2 < p2->tused && t1[n1].max >= t2[n2].min; |
2263 | 0 | n2++) { |
2264 | 0 | if (t2[n2].max >= t1[n1].min) { |
2265 | 0 | struct state *r = state_triple_thd(newstates, |
2266 | 0 | t1[n1].to, t2[n2].to); |
2267 | 0 | if (r == NULL) { |
2268 | 0 | r = add_state(fa, 0); |
2269 | 0 | E(r == NULL); |
2270 | 0 | F(state_set_push(worklist, t1[n1].to)); |
2271 | 0 | F(state_set_push(worklist, t2[n2].to)); |
2272 | 0 | F(state_set_push(worklist, r)); |
2273 | 0 | F(state_triple_push(newstates, |
2274 | 0 | t1[n1].to, t2[n2].to, r)); |
2275 | 0 | } |
2276 | 0 | char min = t1[n1].min > t2[n2].min |
2277 | 0 | ? t1[n1].min : t2[n2].min; |
2278 | 0 | char max = t1[n1].max < t2[n2].max |
2279 | 0 | ? t1[n1].max : t2[n2].max; |
2280 | 0 | ret = add_new_trans(s, r, min, max); |
2281 | 0 | if (ret < 0) |
2282 | 0 | goto error; |
2283 | 0 | } |
2284 | 0 | } |
2285 | 0 | } |
2286 | 0 | } |
2287 | 0 | fa->deterministic = fa1->deterministic && fa2->deterministic; |
2288 | 0 | fa->nocase = fa1->nocase && fa2->nocase; |
2289 | 0 | done: |
2290 | 0 | state_set_free(worklist); |
2291 | 0 | state_triple_free(newstates); |
2292 | 0 | if (fa != NULL) { |
2293 | 0 | if (collect(fa) < 0) { |
2294 | 0 | fa_free(fa); |
2295 | 0 | fa = NULL; |
2296 | 0 | } |
2297 | 0 | } |
2298 | |
|
2299 | 0 | return fa; |
2300 | 0 | error: |
2301 | 0 | fa_free(fa); |
2302 | 0 | fa = NULL; |
2303 | 0 | goto done; |
2304 | 0 | } |
2305 | | |
2306 | 0 | int fa_contains(struct fa *fa1, struct fa *fa2) { |
2307 | 0 | int result = 0; |
2308 | 0 | struct state_set *worklist = NULL; /* List of pairs of states */ |
2309 | 0 | struct state_set *visited = NULL; /* List of pairs of states */ |
2310 | |
|
2311 | 0 | if (fa1 == NULL || fa2 == NULL) |
2312 | 0 | return -1; |
2313 | | |
2314 | 0 | if (fa1 == fa2) |
2315 | 0 | return 1; |
2316 | | |
2317 | 0 | F(determinize(fa2, NULL)); |
2318 | 0 | sort_transition_intervals(fa1); |
2319 | 0 | sort_transition_intervals(fa2); |
2320 | |
|
2321 | 0 | F(state_pair_push(&worklist, fa1->initial, fa2->initial)); |
2322 | 0 | F(state_pair_push(&visited, fa1->initial, fa2->initial)); |
2323 | 0 | while (worklist->used) { |
2324 | 0 | struct state *p1, *p2; |
2325 | 0 | void *v2; |
2326 | 0 | p1 = state_set_pop_data(worklist, &v2); |
2327 | 0 | p2 = v2; |
2328 | |
|
2329 | 0 | if (p1->accept && !p2->accept) |
2330 | 0 | goto done; |
2331 | | |
2332 | 0 | struct trans *t1 = p1->trans; |
2333 | 0 | struct trans *t2 = p2->trans; |
2334 | 0 | for(int n1 = 0, b2 = 0; n1 < p1->tused; n1++) { |
2335 | 0 | while (b2 < p2->tused && t2[b2].max < t1[n1].min) |
2336 | 0 | b2++; |
2337 | 0 | int min1 = t1[n1].min, max1 = t1[n1].max; |
2338 | 0 | for (int n2 = b2; |
2339 | 0 | n2 < p2->tused && t1[n1].max >= t2[n2].min; |
2340 | 0 | n2++) { |
2341 | 0 | if (t2[n2].min > min1) |
2342 | 0 | goto done; |
2343 | 0 | if (t2[n2].max < CHAR_MAX) |
2344 | 0 | min1 = t2[n2].max + 1; |
2345 | 0 | else { |
2346 | 0 | min1 = UCHAR_MAX; |
2347 | 0 | max1 = UCHAR_MIN; |
2348 | 0 | } |
2349 | 0 | if (state_pair_find(visited, t1[n1].to, t2[n2].to) == -1) { |
2350 | 0 | F(state_pair_push(&worklist, t1[n1].to, t2[n2].to)); |
2351 | 0 | F(state_pair_push(&visited, t1[n1].to, t2[n2].to)); |
2352 | 0 | } |
2353 | 0 | } |
2354 | 0 | if (min1 <= max1) |
2355 | 0 | goto done; |
2356 | 0 | } |
2357 | 0 | } |
2358 | | |
2359 | 0 | result = 1; |
2360 | 0 | done: |
2361 | 0 | state_set_free(worklist); |
2362 | 0 | state_set_free(visited); |
2363 | 0 | return result; |
2364 | 0 | error: |
2365 | 0 | result = -1; |
2366 | 0 | goto done; |
2367 | 0 | } |
2368 | | |
2369 | | static int add_crash_trans(struct fa *fa, struct state *s, struct state *crash, |
2370 | 0 | int min, int max) { |
2371 | 0 | int result; |
2372 | |
|
2373 | 0 | if (fa->nocase) { |
2374 | | /* Never transition on anything in [A-Z] */ |
2375 | 0 | if (min > 'Z' || max < 'A') { |
2376 | 0 | result = add_new_trans(s, crash, min, max); |
2377 | 0 | } else if (min >= 'A' && max <= 'Z') { |
2378 | 0 | result = 0; |
2379 | 0 | } else if (max <= 'Z') { |
2380 | | /* min < 'A' */ |
2381 | 0 | result = add_new_trans(s, crash, min, 'A' - 1); |
2382 | 0 | } else if (min >= 'A') { |
2383 | | /* max > 'Z' */ |
2384 | 0 | result = add_new_trans(s, crash, 'Z' + 1, max); |
2385 | 0 | } else { |
2386 | | /* min < 'A' && max > 'Z' */ |
2387 | 0 | result = add_new_trans(s, crash, min, 'A' - 1); |
2388 | 0 | if (result == 0) |
2389 | 0 | result = add_new_trans(s, crash, 'Z' + 1, max); |
2390 | 0 | } |
2391 | 0 | } else { |
2392 | 0 | result = add_new_trans(s, crash, min, max); |
2393 | 0 | } |
2394 | 0 | return result; |
2395 | 0 | } |
2396 | | |
2397 | 0 | static int totalize(struct fa *fa) { |
2398 | 0 | int r; |
2399 | 0 | struct state *crash = add_state(fa, 0); |
2400 | |
|
2401 | 0 | E(crash == NULL); |
2402 | 0 | F(mark_reachable(fa)); |
2403 | 0 | sort_transition_intervals(fa); |
2404 | |
|
2405 | 0 | r = add_crash_trans(fa, crash, crash, UCHAR_MIN, UCHAR_MAX); |
2406 | 0 | if (r < 0) |
2407 | 0 | return -1; |
2408 | | |
2409 | 0 | list_for_each(s, fa->initial) { |
2410 | 0 | int next = UCHAR_MIN; |
2411 | 0 | int tused = s->tused; |
2412 | 0 | for (int i=0; i < tused; i++) { |
2413 | 0 | uchar min = s->trans[i].min, max = s->trans[i].max; |
2414 | 0 | if (min > next) { |
2415 | 0 | r = add_crash_trans(fa, s, crash, next, min - 1); |
2416 | 0 | if (r < 0) |
2417 | 0 | return -1; |
2418 | 0 | } |
2419 | 0 | if (max + 1 > next) |
2420 | 0 | next = max + 1; |
2421 | 0 | } |
2422 | 0 | if (next <= UCHAR_MAX) { |
2423 | 0 | r = add_crash_trans(fa, s, crash, next, UCHAR_MAX); |
2424 | 0 | if (r < 0) |
2425 | 0 | return -1; |
2426 | 0 | } |
2427 | 0 | } |
2428 | 0 | return 0; |
2429 | 0 | error: |
2430 | 0 | return -1; |
2431 | 0 | } |
2432 | | |
2433 | 0 | struct fa *fa_complement(struct fa *fa) { |
2434 | 0 | fa = fa_clone(fa); |
2435 | 0 | E(fa == NULL); |
2436 | 0 | F(determinize(fa, NULL)); |
2437 | 0 | F(totalize(fa)); |
2438 | 0 | list_for_each(s, fa->initial) |
2439 | 0 | s->accept = ! s->accept; |
2440 | |
|
2441 | 0 | F(collect(fa)); |
2442 | 0 | return fa; |
2443 | 0 | error: |
2444 | 0 | fa_free(fa); |
2445 | 0 | return NULL; |
2446 | 0 | } |
2447 | | |
2448 | 0 | struct fa *fa_minus(struct fa *fa1, struct fa *fa2) { |
2449 | 0 | if (fa1 == NULL || fa2 == NULL) |
2450 | 0 | return NULL; |
2451 | | |
2452 | 0 | if (fa_is_basic(fa1, FA_EMPTY) || fa1 == fa2) |
2453 | 0 | return fa_make_empty(); |
2454 | 0 | if (fa_is_basic(fa2, FA_EMPTY)) |
2455 | 0 | return fa_clone(fa1); |
2456 | | |
2457 | 0 | struct fa *cfa2 = fa_complement(fa2); |
2458 | 0 | if (cfa2 == NULL) |
2459 | 0 | return NULL; |
2460 | | |
2461 | 0 | struct fa *result = fa_intersect(fa1, cfa2); |
2462 | 0 | fa_free(cfa2); |
2463 | 0 | return result; |
2464 | 0 | } |
2465 | | |
2466 | 0 | static int accept_to_accept(struct fa *fa) { |
2467 | 0 | int r; |
2468 | 0 | struct state *s = add_state(fa, 0); |
2469 | 0 | if (s == NULL) |
2470 | 0 | return -1; |
2471 | | |
2472 | 0 | F(mark_reachable(fa)); |
2473 | 0 | list_for_each(a, fa->initial) { |
2474 | 0 | if (a->accept && a->reachable) { |
2475 | 0 | r = add_epsilon_trans(s, a); |
2476 | 0 | if (r < 0) |
2477 | 0 | return -1; |
2478 | 0 | } |
2479 | 0 | } |
2480 | | |
2481 | 0 | set_initial(fa, s); |
2482 | 0 | fa->deterministic = fa->minimal = 0; |
2483 | 0 | return 0; |
2484 | 0 | error: |
2485 | 0 | return -1; |
2486 | 0 | } |
2487 | | |
2488 | 0 | struct fa *fa_overlap(struct fa *fa1, struct fa *fa2) { |
2489 | 0 | struct fa *fa = NULL, *eps = NULL, *result = NULL; |
2490 | 0 | struct state_set *map = NULL; |
2491 | |
|
2492 | 0 | if (fa1 == NULL || fa2 == NULL) |
2493 | 0 | return NULL; |
2494 | | |
2495 | 0 | fa1 = fa_clone(fa1); |
2496 | 0 | fa2 = fa_clone(fa2); |
2497 | 0 | if (fa1 == NULL || fa2 == NULL) |
2498 | 0 | goto error; |
2499 | | |
2500 | 0 | if (determinize(fa1, NULL) < 0) |
2501 | 0 | goto error; |
2502 | 0 | if (accept_to_accept(fa1) < 0) |
2503 | 0 | goto error; |
2504 | | |
2505 | 0 | map = fa_reverse(fa2); |
2506 | 0 | state_set_free(map); |
2507 | 0 | if (determinize(fa2, NULL) < 0) |
2508 | 0 | goto error; |
2509 | 0 | if (accept_to_accept(fa2) < 0) |
2510 | 0 | goto error; |
2511 | 0 | map = fa_reverse(fa2); |
2512 | 0 | state_set_free(map); |
2513 | 0 | if (determinize(fa2, NULL) < 0) |
2514 | 0 | goto error; |
2515 | | |
2516 | 0 | fa = fa_intersect(fa1, fa2); |
2517 | 0 | if (fa == NULL) |
2518 | 0 | goto error; |
2519 | | |
2520 | 0 | eps = fa_make_epsilon(); |
2521 | 0 | if (eps == NULL) |
2522 | 0 | goto error; |
2523 | | |
2524 | 0 | result = fa_minus(fa, eps); |
2525 | 0 | if (result == NULL) |
2526 | 0 | goto error; |
2527 | | |
2528 | 0 | error: |
2529 | 0 | fa_free(fa1); |
2530 | 0 | fa_free(fa2); |
2531 | 0 | fa_free(fa); |
2532 | 0 | fa_free(eps); |
2533 | 0 | return result; |
2534 | 0 | } |
2535 | | |
2536 | 0 | int fa_equals(struct fa *fa1, struct fa *fa2) { |
2537 | 0 | if (fa1 == NULL || fa2 == NULL) |
2538 | 0 | return -1; |
2539 | | |
2540 | | /* fa_contains(fa1, fa2) && fa_contains(fa2, fa1) with error checking */ |
2541 | 0 | int c1 = fa_contains(fa1, fa2); |
2542 | 0 | if (c1 < 0) |
2543 | 0 | return -1; |
2544 | 0 | if (c1 == 0) |
2545 | 0 | return 0; |
2546 | 0 | return fa_contains(fa2, fa1); |
2547 | 0 | } |
2548 | | |
2549 | 0 | static unsigned int chr_score(char c) { |
2550 | 0 | if (isalpha(c)) { |
2551 | 0 | return 2; |
2552 | 0 | } else if (isalnum(c)) { |
2553 | 0 | return 3; |
2554 | 0 | } else if (isprint(c)) { |
2555 | 0 | return 7; |
2556 | 0 | } else if (c == '\0') { |
2557 | 0 | return 10000; |
2558 | 0 | } else { |
2559 | 0 | return 100; |
2560 | 0 | } |
2561 | 0 | } |
2562 | | |
2563 | 0 | static unsigned int str_score(const struct re_str *str) { |
2564 | 0 | unsigned int score = 0; |
2565 | 0 | for (int i=0; i < str->len; i++) { |
2566 | 0 | score += chr_score(str->rx[i]); |
2567 | 0 | } |
2568 | 0 | return score; |
2569 | 0 | } |
2570 | | |
2571 | | /* See if we get a better string for DST by appending C to SRC. If DST is |
2572 | | * NULL or empty, always use SRC + C |
2573 | | */ |
2574 | | static struct re_str *string_extend(struct re_str *dst, |
2575 | | const struct re_str *src, |
2576 | 0 | char c) { |
2577 | 0 | if (dst == NULL |
2578 | 0 | || dst->len == 0 |
2579 | 0 | || str_score(src) + chr_score(c) < str_score(dst)) { |
2580 | 0 | int slen = src->len; |
2581 | 0 | if (dst == NULL) |
2582 | 0 | dst = make_re_str(NULL); |
2583 | 0 | if (dst == NULL) |
2584 | 0 | return NULL; |
2585 | 0 | if (REALLOC_N(dst->rx, slen+2) < 0) { |
2586 | 0 | free(dst); |
2587 | 0 | return NULL; |
2588 | 0 | } |
2589 | 0 | memcpy(dst->rx, src->rx, slen); |
2590 | 0 | dst->rx[slen] = c; |
2591 | 0 | dst->rx[slen + 1] = '\0'; |
2592 | 0 | dst->len = slen + 1; |
2593 | 0 | } |
2594 | 0 | return dst; |
2595 | 0 | } |
2596 | | |
2597 | 0 | static char pick_char(struct trans *t) { |
2598 | 0 | for (int c = t->min; c <= t->max; c++) |
2599 | 0 | if (isalpha(c)) return c; |
2600 | 0 | for (int c = t->min; c <= t->max; c++) |
2601 | 0 | if (isalnum(c)) return c; |
2602 | 0 | for (int c = t->min; c <= t->max; c++) |
2603 | 0 | if (isprint(c)) return c; |
2604 | 0 | return t->max; |
2605 | 0 | } |
2606 | | |
2607 | | /* Generate an example string for FA. Traverse all transitions and record |
2608 | | * at each turn the "best" word found for that state. |
2609 | | */ |
2610 | 0 | int fa_example(struct fa *fa, char **example, size_t *example_len) { |
2611 | 0 | struct re_str *word = NULL; |
2612 | 0 | struct state_set *path = NULL, *worklist = NULL; |
2613 | 0 | struct re_str *str = NULL; |
2614 | |
|
2615 | 0 | *example = NULL; |
2616 | 0 | *example_len = 0; |
2617 | | |
2618 | | /* Sort to avoid any ambiguity because of reordering of transitions */ |
2619 | 0 | sort_transition_intervals(fa); |
2620 | | |
2621 | | /* Map from state to string */ |
2622 | 0 | path = state_set_init(-1, S_DATA|S_SORTED); |
2623 | 0 | str = make_re_str(""); |
2624 | 0 | if (path == NULL || str == NULL) |
2625 | 0 | goto error; |
2626 | 0 | F(state_set_push_data(path, fa->initial, str)); |
2627 | 0 | str = NULL; |
2628 | | |
2629 | | /* List of states still to visit */ |
2630 | 0 | worklist = state_set_init(-1, S_NONE); |
2631 | 0 | if (worklist == NULL) |
2632 | 0 | goto error; |
2633 | 0 | F(state_set_push(worklist, fa->initial)); |
2634 | | |
2635 | 0 | while (worklist->used) { |
2636 | 0 | struct state *s = state_set_pop(worklist); |
2637 | 0 | struct re_str *ps = state_set_find_data(path, s); |
2638 | 0 | for_each_trans(t, s) { |
2639 | 0 | char c = pick_char(t); |
2640 | 0 | int toind = state_set_index(path, t->to); |
2641 | 0 | if (toind == -1) { |
2642 | 0 | struct re_str *w = string_extend(NULL, ps, c); |
2643 | 0 | E(w == NULL); |
2644 | 0 | F(state_set_push(worklist, t->to)); |
2645 | 0 | F(state_set_push_data(path, t->to, w)); |
2646 | 0 | } else { |
2647 | 0 | path->data[toind] = string_extend(path->data[toind], ps, c); |
2648 | 0 | } |
2649 | 0 | } |
2650 | 0 | } |
2651 | | |
2652 | 0 | for (int i=0; i < path->used; i++) { |
2653 | 0 | struct state *p = path->states[i]; |
2654 | 0 | struct re_str *ps = path->data[i]; |
2655 | 0 | if (p->accept && |
2656 | 0 | (word == NULL || word->len == 0 |
2657 | 0 | || (ps->len > 0 && str_score(word) > str_score(ps)))) { |
2658 | 0 | free_re_str(word); |
2659 | 0 | word = ps; |
2660 | 0 | } else { |
2661 | 0 | free_re_str(ps); |
2662 | 0 | } |
2663 | 0 | } |
2664 | 0 | state_set_free(path); |
2665 | 0 | state_set_free(worklist); |
2666 | 0 | if (word != NULL) { |
2667 | 0 | *example_len = word->len; |
2668 | 0 | *example = word->rx; |
2669 | 0 | free(word); |
2670 | 0 | } |
2671 | 0 | return 0; |
2672 | 0 | error: |
2673 | 0 | state_set_free(path); |
2674 | 0 | state_set_free(worklist); |
2675 | 0 | free_re_str(word); |
2676 | 0 | free_re_str(str); |
2677 | 0 | return -1; |
2678 | 0 | } |
2679 | | |
2680 | | struct enum_intl { |
2681 | | int limit; |
2682 | | int nwords; |
2683 | | char **words; |
2684 | | char *buf; |
2685 | | size_t bsize; |
2686 | | }; |
2687 | | |
2688 | 0 | static int fa_enumerate_intl(struct state *s, struct enum_intl *ei, int pos) { |
2689 | 0 | int result = -1; |
2690 | |
|
2691 | 0 | if (ei->bsize <= pos + 1) { |
2692 | 0 | ei->bsize *= 2; |
2693 | 0 | F(REALLOC_N(ei->buf, ei->bsize)); |
2694 | 0 | } |
2695 | | |
2696 | 0 | ei->buf[pos] = '\0'; |
2697 | 0 | for_each_trans(t, s) { |
2698 | 0 | if (t->to->visited) |
2699 | 0 | return -2; |
2700 | 0 | t->to->visited = 1; |
2701 | 0 | for (int i=t->min; i <= t->max; i++) { |
2702 | 0 | ei->buf[pos] = i; |
2703 | 0 | if (t->to->accept) { |
2704 | 0 | if (ei->nwords >= ei->limit) |
2705 | 0 | return -2; |
2706 | 0 | ei->words[ei->nwords] = strdup(ei->buf); |
2707 | 0 | E(ei->words[ei->nwords] == NULL); |
2708 | 0 | ei->nwords += 1; |
2709 | 0 | } |
2710 | 0 | result = fa_enumerate_intl(t->to, ei, pos+1); |
2711 | 0 | E(result < 0); |
2712 | 0 | } |
2713 | 0 | t->to->visited = 0; |
2714 | 0 | } |
2715 | 0 | ei->buf[pos] = '\0'; |
2716 | 0 | result = 0; |
2717 | 0 | error: |
2718 | 0 | return result; |
2719 | 0 | } |
2720 | | |
2721 | 0 | int fa_enumerate(struct fa *fa, int limit, char ***words) { |
2722 | 0 | struct enum_intl ei; |
2723 | 0 | int result = -1; |
2724 | |
|
2725 | 0 | *words = NULL; |
2726 | 0 | MEMZERO(&ei, 1); |
2727 | 0 | ei.bsize = 8; /* Arbitrary initial size */ |
2728 | 0 | ei.limit = limit; |
2729 | 0 | F(ALLOC_N(ei.words, limit)); |
2730 | 0 | F(ALLOC_N(ei.buf, ei.bsize)); |
2731 | | |
2732 | | /* We use the visited bit to track which states we already visited |
2733 | | * during the construction of a word to detect loops */ |
2734 | 0 | list_for_each(s, fa->initial) |
2735 | 0 | s->visited = 0; |
2736 | 0 | fa->initial->visited = 1; |
2737 | 0 | if (fa->initial->accept) { |
2738 | 0 | if (ei.nwords >= limit) |
2739 | 0 | return -2; |
2740 | 0 | ei.words[0] = strdup(""); |
2741 | 0 | E(ei.words[0] == NULL); |
2742 | 0 | ei.nwords = 1; |
2743 | 0 | } |
2744 | 0 | result = fa_enumerate_intl(fa->initial, &ei, 0); |
2745 | 0 | E(result < 0); |
2746 | | |
2747 | 0 | result = ei.nwords; |
2748 | 0 | *words = ei.words; |
2749 | 0 | ei.words = NULL; |
2750 | 0 | done: |
2751 | 0 | free(ei.buf); |
2752 | 0 | return result; |
2753 | | |
2754 | 0 | error: |
2755 | 0 | for (int i=0; i < ei.nwords; i++) |
2756 | 0 | free(ei.words[i]); |
2757 | 0 | free(ei.words); |
2758 | 0 | goto done; |
2759 | 0 | } |
2760 | | |
2761 | | /* Expand the automaton FA by replacing every transition s(c) -> p from |
2762 | | * state s to p on character c by two transitions s(X) -> r, r(c) -> p via |
2763 | | * a new state r. |
2764 | | * If ADD_MARKER is true, also add for each original state s a new a loop |
2765 | | * s(Y) -> q and q(X) -> s through a new state q. |
2766 | | * |
2767 | | * The end result is that an automaton accepting "a|ab" is turned into one |
2768 | | * accepting "Xa|XaXb" if add_marker is false and "(YX)*Xa|(YX)*Xa(YX)*Xb" |
2769 | | * when add_marker is true. |
2770 | | * |
2771 | | * The returned automaton is a copy of FA, FA is not modified. |
2772 | | */ |
2773 | | static struct fa *expand_alphabet(struct fa *fa, int add_marker, |
2774 | 0 | char X, char Y) { |
2775 | 0 | int ret; |
2776 | |
|
2777 | 0 | fa = fa_clone(fa); |
2778 | 0 | if (fa == NULL) |
2779 | 0 | return NULL; |
2780 | | |
2781 | 0 | F(mark_reachable(fa)); |
2782 | 0 | list_for_each(p, fa->initial) { |
2783 | 0 | if (! p->reachable) |
2784 | 0 | continue; |
2785 | | |
2786 | 0 | struct state *r = add_state(fa, 0); |
2787 | 0 | if (r == NULL) |
2788 | 0 | goto error; |
2789 | 0 | r->trans = p->trans; |
2790 | 0 | r->tused = p->tused; |
2791 | 0 | r->tsize = p->tsize; |
2792 | 0 | p->trans = NULL; |
2793 | 0 | p->tused = p->tsize = 0; |
2794 | 0 | ret = add_new_trans(p, r, X, X); |
2795 | 0 | if (ret < 0) |
2796 | 0 | goto error; |
2797 | 0 | if (add_marker) { |
2798 | 0 | struct state *q = add_state(fa, 0); |
2799 | 0 | ret = add_new_trans(p, q, Y, Y); |
2800 | 0 | if (ret < 0) |
2801 | 0 | goto error; |
2802 | 0 | ret = add_new_trans(q, p, X, X); |
2803 | 0 | if (ret < 0) |
2804 | 0 | goto error; |
2805 | 0 | } |
2806 | 0 | } |
2807 | 0 | return fa; |
2808 | 0 | error: |
2809 | 0 | fa_free(fa); |
2810 | 0 | return NULL; |
2811 | 0 | } |
2812 | | |
2813 | 0 | static bitset *alphabet(struct fa *fa) { |
2814 | 0 | bitset *bs = bitset_init(UCHAR_NUM); |
2815 | |
|
2816 | 0 | if (bs == NULL) |
2817 | 0 | return NULL; |
2818 | | |
2819 | 0 | list_for_each(s, fa->initial) { |
2820 | 0 | for (int i=0; i < s->tused; i++) { |
2821 | 0 | for (uint c = s->trans[i].min; c <= s->trans[i].max; c++) |
2822 | 0 | bitset_set(bs, c); |
2823 | 0 | } |
2824 | 0 | } |
2825 | 0 | return bs; |
2826 | 0 | } |
2827 | | |
2828 | 0 | static bitset *last_chars(struct fa *fa) { |
2829 | 0 | bitset *bs = bitset_init(UCHAR_NUM); |
2830 | |
|
2831 | 0 | if (bs == NULL) |
2832 | 0 | return NULL; |
2833 | | |
2834 | 0 | list_for_each(s, fa->initial) { |
2835 | 0 | for (int i=0; i < s->tused; i++) { |
2836 | 0 | if (s->trans[i].to->accept) { |
2837 | 0 | for (uint c = s->trans[i].min; c <= s->trans[i].max; c++) |
2838 | 0 | bitset_set(bs, c); |
2839 | 0 | } |
2840 | 0 | } |
2841 | 0 | } |
2842 | 0 | return bs; |
2843 | 0 | } |
2844 | | |
2845 | 0 | static bitset *first_chars(struct fa *fa) { |
2846 | 0 | bitset *bs = bitset_init(UCHAR_NUM); |
2847 | 0 | struct state *s = fa->initial; |
2848 | |
|
2849 | 0 | if (bs == NULL) |
2850 | 0 | return NULL; |
2851 | | |
2852 | 0 | for (int i=0; i < s->tused; i++) { |
2853 | 0 | for (uint c = s->trans[i].min; c <= s->trans[i].max; c++) |
2854 | 0 | bitset_set(bs, c); |
2855 | 0 | } |
2856 | 0 | return bs; |
2857 | 0 | } |
2858 | | |
2859 | | /* Return 1 if F1 and F2 are known to be unambiguously concatenable |
2860 | | * according to simple heuristics. Return 0 if they need to be checked |
2861 | | * further to decide ambiguity |
2862 | | * Return -1 if an allocation fails |
2863 | | */ |
2864 | 0 | static int is_splittable(struct fa *fa1, struct fa *fa2) { |
2865 | 0 | bitset *alpha1 = NULL; |
2866 | 0 | bitset *alpha2 = NULL; |
2867 | 0 | bitset *last1 = NULL; |
2868 | 0 | bitset *first2 = NULL; |
2869 | 0 | bool result = -1; |
2870 | |
|
2871 | 0 | alpha2 = alphabet(fa2); |
2872 | 0 | last1 = last_chars(fa1); |
2873 | 0 | if (alpha2 == NULL || last1 == NULL) |
2874 | 0 | goto done; |
2875 | 0 | if (bitset_disjoint(last1, alpha2, UCHAR_NUM)) { |
2876 | 0 | result = 1; |
2877 | 0 | goto done; |
2878 | 0 | } |
2879 | | |
2880 | 0 | alpha1 = alphabet(fa1); |
2881 | 0 | first2 = first_chars(fa2); |
2882 | 0 | if (alpha1 == NULL || first2 == NULL) |
2883 | 0 | goto done; |
2884 | 0 | if (bitset_disjoint(first2, alpha1, UCHAR_NUM)) { |
2885 | 0 | result = 1; |
2886 | 0 | goto done; |
2887 | 0 | } |
2888 | 0 | result = 0; |
2889 | 0 | done: |
2890 | 0 | bitset_free(alpha1); |
2891 | 0 | bitset_free(alpha2); |
2892 | 0 | bitset_free(last1); |
2893 | 0 | bitset_free(first2); |
2894 | 0 | return result; |
2895 | 0 | } |
2896 | | |
2897 | | /* This algorithm is due to Anders Moeller, and can be found in class |
2898 | | * AutomatonOperations in dk.brics.grammar |
2899 | | */ |
2900 | | int fa_ambig_example(struct fa *fa1, struct fa *fa2, |
2901 | | char **upv, size_t *upv_len, |
2902 | 0 | char **pv, char **v) { |
2903 | 0 | static const char X = '\001'; |
2904 | 0 | static const char Y = '\002'; |
2905 | 0 | char *result = NULL, *s = NULL; |
2906 | 0 | size_t result_len = 0; |
2907 | 0 | int ret = -1, r; |
2908 | 0 | struct fa *mp = NULL, *ms = NULL, *sp = NULL, *ss = NULL, *amb = NULL; |
2909 | 0 | struct fa *a1f = NULL, *a1t = NULL, *a2f = NULL, *a2t = NULL; |
2910 | 0 | struct fa *b1 = NULL, *b2 = NULL; |
2911 | |
|
2912 | 0 | *upv = NULL; |
2913 | 0 | *upv_len = 0; |
2914 | 0 | if (pv != NULL) |
2915 | 0 | *pv = NULL; |
2916 | 0 | if (v != NULL) |
2917 | 0 | *v = NULL; |
2918 | |
|
2919 | 0 | r = is_splittable(fa1, fa2); |
2920 | 0 | if (r < 0) |
2921 | 0 | goto error; |
2922 | 0 | if (r == 1) |
2923 | 0 | return 0; |
2924 | | |
2925 | 0 | #define Xs "\001" |
2926 | 0 | #define Ys "\002" |
2927 | 0 | #define MPs Ys Xs "(" Xs "(.|\n))+" |
2928 | 0 | #define MSs Ys Xs "(" Xs "(.|\n))*" |
2929 | 0 | #define SPs "(" Xs "(.|\n))+" Ys Xs |
2930 | 0 | #define SSs "(" Xs "(.|\n))*" Ys Xs |
2931 | | /* These could become static constants */ |
2932 | 0 | r = fa_compile( MPs, strlen(MPs), &mp); |
2933 | 0 | if (r != REG_NOERROR) |
2934 | 0 | goto error; |
2935 | 0 | r = fa_compile( MSs, strlen(MSs), &ms); |
2936 | 0 | if (r != REG_NOERROR) |
2937 | 0 | goto error; |
2938 | 0 | r = fa_compile( SPs, strlen(SPs), &sp); |
2939 | 0 | if (r != REG_NOERROR) |
2940 | 0 | goto error; |
2941 | 0 | r = fa_compile( SSs, strlen(SSs), &ss); |
2942 | 0 | if (r != REG_NOERROR) |
2943 | 0 | goto error; |
2944 | 0 | #undef SSs |
2945 | 0 | #undef SPs |
2946 | 0 | #undef MSs |
2947 | 0 | #undef MPs |
2948 | 0 | #undef Xs |
2949 | 0 | #undef Ys |
2950 | | |
2951 | 0 | a1f = expand_alphabet(fa1, 0, X, Y); |
2952 | 0 | a1t = expand_alphabet(fa1, 1, X, Y); |
2953 | 0 | a2f = expand_alphabet(fa2, 0, X, Y); |
2954 | 0 | a2t = expand_alphabet(fa2, 1, X, Y); |
2955 | 0 | if (a1f == NULL || a1t == NULL || a2f == NULL || a2t == NULL) |
2956 | 0 | goto error; |
2957 | | |
2958 | | /* Compute b1 = ((a1f . mp) & a1t) . ms */ |
2959 | 0 | if (concat_in_place(a1f, &mp) < 0) |
2960 | 0 | goto error; |
2961 | 0 | b1 = fa_intersect(a1f, a1t); |
2962 | 0 | if (b1 == NULL) |
2963 | 0 | goto error; |
2964 | 0 | if (concat_in_place(b1, &ms) < 0) |
2965 | 0 | goto error; |
2966 | 0 | if (fa_is_basic(b1, FA_EMPTY)) { |
2967 | | /* We are done - amb which we take an example from below |
2968 | | * will be empty, and there can therefore not be an ambiguity */ |
2969 | 0 | ret = 0; |
2970 | 0 | goto done; |
2971 | 0 | } |
2972 | | |
2973 | | /* Compute b2 = ss . ((sp . a2f) & a2t) */ |
2974 | 0 | if (concat_in_place(sp, &a2f) < 0) |
2975 | 0 | goto error; |
2976 | 0 | b2 = fa_intersect(sp, a2t); |
2977 | 0 | if (b2 == NULL) |
2978 | 0 | goto error; |
2979 | 0 | if (concat_in_place(ss, &b2) < 0) |
2980 | 0 | goto error; |
2981 | 0 | b2 = ss; |
2982 | 0 | ss = NULL; |
2983 | | |
2984 | | /* The automaton we are really interested in */ |
2985 | 0 | amb = fa_intersect(b1, b2); |
2986 | 0 | if (amb == NULL) |
2987 | 0 | goto error; |
2988 | | |
2989 | 0 | size_t s_len = 0; |
2990 | 0 | r = fa_example(amb, &s, &s_len); |
2991 | 0 | if (r < 0) |
2992 | 0 | goto error; |
2993 | | |
2994 | 0 | if (s != NULL) { |
2995 | 0 | char *t; |
2996 | 0 | result_len = (s_len-1)/2 - 1; |
2997 | 0 | F(ALLOC_N(result, result_len + 1)); |
2998 | 0 | t = result; |
2999 | 0 | int i = 0; |
3000 | 0 | for (i=0; s[2*i] == X; i++) { |
3001 | 0 | assert((t - result) < result_len); |
3002 | 0 | *t++ = s[2*i + 1]; |
3003 | 0 | } |
3004 | 0 | if (pv != NULL) |
3005 | 0 | *pv = t; |
3006 | 0 | i += 1; |
3007 | |
|
3008 | 0 | for ( ;s[2*i] == X; i++) { |
3009 | 0 | assert((t - result) < result_len); |
3010 | 0 | *t++ = s[2*i + 1]; |
3011 | 0 | } |
3012 | 0 | if (v != NULL) |
3013 | 0 | *v = t; |
3014 | 0 | i += 1; |
3015 | |
|
3016 | 0 | for (; 2*i+1 < s_len; i++) { |
3017 | 0 | assert((t - result) < result_len); |
3018 | 0 | *t++ = s[2*i + 1]; |
3019 | 0 | } |
3020 | 0 | } |
3021 | 0 | ret = 0; |
3022 | |
|
3023 | 0 | done: |
3024 | | /* Clean up intermediate automata */ |
3025 | 0 | fa_free(mp); |
3026 | 0 | fa_free(ms); |
3027 | 0 | fa_free(ss); |
3028 | 0 | fa_free(sp); |
3029 | 0 | fa_free(a1f); |
3030 | 0 | fa_free(a1t); |
3031 | 0 | fa_free(a2f); |
3032 | 0 | fa_free(a2t); |
3033 | 0 | fa_free(b1); |
3034 | 0 | fa_free(b2); |
3035 | 0 | fa_free(amb); |
3036 | |
|
3037 | 0 | FREE(s); |
3038 | 0 | *upv = result; |
3039 | 0 | if (result != NULL) |
3040 | 0 | *upv_len = result_len; |
3041 | 0 | return ret; |
3042 | 0 | error: |
3043 | 0 | FREE(result); |
3044 | 0 | ret = -1; |
3045 | 0 | goto done; |
3046 | 0 | } |
3047 | | |
3048 | | /* |
3049 | | * Construct an fa from a regular expression |
3050 | | */ |
3051 | 0 | static struct fa *fa_from_re(struct re *re) { |
3052 | 0 | struct fa *result = NULL; |
3053 | |
|
3054 | 0 | switch(re->type) { |
3055 | 0 | case UNION: |
3056 | 0 | { |
3057 | 0 | result = fa_from_re(re->exp1); |
3058 | 0 | if (result == NULL) |
3059 | 0 | goto error; |
3060 | 0 | struct fa *fa2 = fa_from_re(re->exp2); |
3061 | 0 | if (fa2 == NULL) |
3062 | 0 | goto error; |
3063 | 0 | if (union_in_place(result, &fa2) < 0) |
3064 | 0 | goto error; |
3065 | 0 | } |
3066 | 0 | break; |
3067 | 0 | case CONCAT: |
3068 | 0 | { |
3069 | 0 | result = fa_from_re(re->exp1); |
3070 | 0 | if (result == NULL) |
3071 | 0 | goto error; |
3072 | 0 | struct fa *fa2 = fa_from_re(re->exp2); |
3073 | 0 | if (fa2 == NULL) |
3074 | 0 | goto error; |
3075 | 0 | if (concat_in_place(result, &fa2) < 0) |
3076 | 0 | goto error; |
3077 | 0 | } |
3078 | 0 | break; |
3079 | 0 | case CSET: |
3080 | 0 | result = fa_make_char_set(re->cset, re->negate); |
3081 | 0 | break; |
3082 | 0 | case ITER: |
3083 | 0 | { |
3084 | 0 | struct fa *fa = fa_from_re(re->exp); |
3085 | 0 | if (fa == NULL) |
3086 | 0 | goto error; |
3087 | 0 | result = fa_iter(fa, re->min, re->max); |
3088 | 0 | fa_free(fa); |
3089 | 0 | } |
3090 | 0 | break; |
3091 | 0 | case EPSILON: |
3092 | 0 | result = fa_make_epsilon(); |
3093 | 0 | break; |
3094 | 0 | case CHAR: |
3095 | 0 | result = fa_make_char(re->c); |
3096 | 0 | break; |
3097 | 0 | default: |
3098 | 0 | assert(0); |
3099 | 0 | break; |
3100 | 0 | } |
3101 | 0 | return result; |
3102 | 0 | error: |
3103 | 0 | fa_free(result); |
3104 | 0 | return NULL; |
3105 | 0 | } |
3106 | | |
3107 | 0 | static void free_re(struct re *re) { |
3108 | 0 | if (re == NULL) |
3109 | 0 | return; |
3110 | 0 | assert(re->ref == 0); |
3111 | | |
3112 | 0 | if (re->type == UNION || re->type == CONCAT) { |
3113 | 0 | re_unref(re->exp1); |
3114 | 0 | re_unref(re->exp2); |
3115 | 0 | } else if (re->type == ITER) { |
3116 | 0 | re_unref(re->exp); |
3117 | 0 | } else if (re->type == CSET) { |
3118 | 0 | bitset_free(re->cset); |
3119 | 0 | } |
3120 | 0 | free(re); |
3121 | 0 | } |
3122 | | |
3123 | 0 | int fa_compile(const char *regexp, size_t size, struct fa **fa) { |
3124 | 0 | struct re *re = NULL; |
3125 | 0 | struct re_parse parse; |
3126 | |
|
3127 | 0 | *fa = NULL; |
3128 | |
|
3129 | 0 | parse.rx = regexp; |
3130 | 0 | parse.rend = regexp + size; |
3131 | 0 | parse.error = REG_NOERROR; |
3132 | |
|
3133 | 0 | re = parse_regexp(&parse); |
3134 | 0 | if (re == NULL) |
3135 | 0 | return parse.error; |
3136 | | |
3137 | 0 | *fa = fa_from_re(re); |
3138 | 0 | re_unref(re); |
3139 | | |
3140 | 0 | if (*fa == NULL || collect(*fa) < 0) |
3141 | 0 | parse.error = REG_ESPACE; |
3142 | 0 | return parse.error; |
3143 | 0 | } |
3144 | | |
3145 | | /* We represent a case-insensitive FA by using only transitions on |
3146 | | * lower-case letters. |
3147 | | */ |
3148 | 0 | int fa_nocase(struct fa *fa) { |
3149 | 0 | if (fa == NULL || fa->nocase) |
3150 | 0 | return 0; |
3151 | | |
3152 | 0 | fa->nocase = 1; |
3153 | 0 | list_for_each(s, fa->initial) { |
3154 | 0 | int tused = s->tused; |
3155 | | /* For every transition on characters in [A-Z] add a corresponding |
3156 | | * transition on [a-z]; remove any portion covering [A-Z] */ |
3157 | 0 | for (int i=0; i < tused; i++) { |
3158 | 0 | struct trans *t = s->trans + i; |
3159 | 0 | int lc_min = t->min < 'A' ? 'a' : tolower(t->min); |
3160 | 0 | int lc_max = t->max > 'Z' ? 'z' : tolower(t->max); |
3161 | |
|
3162 | 0 | if (t->min > 'Z' || t->max < 'A') |
3163 | 0 | continue; |
3164 | 0 | if (t->min >= 'A' && t->max <= 'Z') { |
3165 | 0 | t->min = tolower(t->min); |
3166 | 0 | t->max = tolower(t->max); |
3167 | 0 | } else if (t->max <= 'Z') { |
3168 | | /* t->min < 'A' */ |
3169 | 0 | t->max = 'A' - 1; |
3170 | 0 | F(add_new_trans(s, t->to, lc_min, lc_max)); |
3171 | 0 | } else if (t->min >= 'A') { |
3172 | | /* t->max > 'Z' */ |
3173 | 0 | t->min = 'Z' + 1; |
3174 | 0 | F(add_new_trans(s, t->to, lc_min, lc_max)); |
3175 | 0 | } else { |
3176 | | /* t->min < 'A' && t->max > 'Z' */ |
3177 | 0 | F(add_new_trans(s, t->to, 'Z' + 1, t->max)); |
3178 | 0 | s->trans[i].max = 'A' - 1; |
3179 | 0 | F(add_new_trans(s, s->trans[i].to, lc_min, lc_max)); |
3180 | 0 | } |
3181 | 0 | } |
3182 | 0 | } |
3183 | 0 | F(collect(fa)); |
3184 | 0 | return 0; |
3185 | 0 | error: |
3186 | 0 | return -1; |
3187 | 0 | } |
3188 | | |
3189 | 0 | int fa_is_nocase(struct fa *fa) { |
3190 | 0 | return fa->nocase; |
3191 | 0 | } |
3192 | | |
3193 | | /* If FA is case-insensitive, turn it into a case-sensitive automaton by |
3194 | | * adding transitions on upper-case letters for each existing transition on |
3195 | | * lower-case letters */ |
3196 | 0 | static int case_expand(struct fa *fa) { |
3197 | 0 | if (! fa->nocase) |
3198 | 0 | return 0; |
3199 | | |
3200 | 0 | fa->nocase = 0; |
3201 | 0 | list_for_each(s, fa->initial) { |
3202 | 0 | int tused = s->tused; |
3203 | | /* For every transition on characters in [a-z] add a corresponding |
3204 | | * transition on [A-Z] */ |
3205 | 0 | for (int i=0; i < tused; i++) { |
3206 | 0 | struct trans *t = s->trans + i; |
3207 | 0 | int lc_min = t->min < 'a' ? 'A' : toupper(t->min); |
3208 | 0 | int lc_max = t->max > 'z' ? 'Z' : toupper(t->max); |
3209 | |
|
3210 | 0 | if (t->min > 'z' || t->max < 'a') |
3211 | 0 | continue; |
3212 | 0 | F(add_new_trans(s, t->to, lc_min, lc_max)); |
3213 | 0 | } |
3214 | 0 | } |
3215 | 0 | F(collect(fa)); |
3216 | 0 | return 0; |
3217 | 0 | error: |
3218 | 0 | return -1; |
3219 | 0 | } |
3220 | | |
3221 | | /* |
3222 | | * Regular expression parser |
3223 | | */ |
3224 | | |
3225 | 0 | static struct re *make_re(enum re_type type) { |
3226 | 0 | struct re *re; |
3227 | 0 | if (make_ref(re) == 0) |
3228 | 0 | re->type = type; |
3229 | 0 | return re; |
3230 | 0 | } |
3231 | | |
3232 | 0 | static struct re *make_re_rep(struct re *exp, int min, int max) { |
3233 | 0 | struct re *re = make_re(ITER); |
3234 | 0 | if (re) { |
3235 | 0 | re->exp = exp; |
3236 | 0 | re->min = min; |
3237 | 0 | re->max = max; |
3238 | 0 | } else { |
3239 | 0 | re_unref(exp); |
3240 | 0 | } |
3241 | 0 | return re; |
3242 | 0 | } |
3243 | | |
3244 | | static struct re *make_re_binop(enum re_type type, struct re *exp1, |
3245 | 0 | struct re *exp2) { |
3246 | 0 | struct re *re = make_re(type); |
3247 | 0 | if (re) { |
3248 | 0 | re->exp1 = exp1; |
3249 | 0 | re->exp2 = exp2; |
3250 | 0 | } else { |
3251 | 0 | re_unref(exp1); |
3252 | 0 | re_unref(exp2); |
3253 | 0 | } |
3254 | 0 | return re; |
3255 | 0 | } |
3256 | | |
3257 | 0 | static struct re *make_re_char(uchar c) { |
3258 | 0 | struct re *re = make_re(CHAR); |
3259 | 0 | if (re) |
3260 | 0 | re->c = c; |
3261 | 0 | return re; |
3262 | 0 | } |
3263 | | |
3264 | 0 | static struct re *make_re_char_set(bool negate, bool no_ranges) { |
3265 | 0 | struct re *re = make_re(CSET); |
3266 | 0 | if (re) { |
3267 | 0 | re->negate = negate; |
3268 | 0 | re->no_ranges = no_ranges; |
3269 | 0 | re->cset = bitset_init(UCHAR_NUM); |
3270 | 0 | if (re->cset == NULL) |
3271 | 0 | re_unref(re); |
3272 | 0 | } |
3273 | 0 | return re; |
3274 | 0 | } |
3275 | | |
3276 | 0 | static bool more(struct re_parse *parse) { |
3277 | 0 | return parse->rx < parse->rend; |
3278 | 0 | } |
3279 | | |
3280 | 0 | static bool match(struct re_parse *parse, char m) { |
3281 | 0 | if (!more(parse)) |
3282 | 0 | return false; |
3283 | 0 | if (*parse->rx == m) { |
3284 | 0 | parse->rx += 1; |
3285 | 0 | return true; |
3286 | 0 | } |
3287 | 0 | return false; |
3288 | 0 | } |
3289 | | |
3290 | 0 | static bool peek(struct re_parse *parse, const char *chars) { |
3291 | 0 | return *parse->rx != '\0' && strchr(chars, *parse->rx) != NULL; |
3292 | 0 | } |
3293 | | |
3294 | 0 | static bool next(struct re_parse *parse, char *c) { |
3295 | 0 | if (!more(parse)) |
3296 | 0 | return false; |
3297 | 0 | *c = *parse->rx; |
3298 | 0 | parse->rx += 1; |
3299 | 0 | return true; |
3300 | 0 | } |
3301 | | |
3302 | 0 | static bool parse_char(struct re_parse *parse, int quoted, char *c) { |
3303 | 0 | if (!more(parse)) |
3304 | 0 | return false; |
3305 | 0 | if (quoted && *parse->rx == '\\') { |
3306 | 0 | parse->rx += 1; |
3307 | 0 | return next(parse, c); |
3308 | 0 | } else { |
3309 | 0 | return next(parse, c); |
3310 | 0 | } |
3311 | 0 | } |
3312 | | |
3313 | 0 | static void add_re_char(struct re *re, uchar from, uchar to) { |
3314 | 0 | assert(re->type == CSET); |
3315 | 0 | for (unsigned int c = from; c <= to; c++) |
3316 | 0 | bitset_set(re->cset, c); |
3317 | 0 | } |
3318 | | |
3319 | 0 | static void parse_char_class(struct re_parse *parse, struct re *re) { |
3320 | 0 | if (! more(parse)) { |
3321 | 0 | parse->error = REG_EBRACK; |
3322 | 0 | goto error; |
3323 | 0 | } |
3324 | 0 | char from, to; |
3325 | 0 | parse_char(parse, 0, &from); |
3326 | 0 | to = from; |
3327 | 0 | if (match(parse, '-')) { |
3328 | 0 | if (! more(parse)) { |
3329 | 0 | parse->error = REG_EBRACK; |
3330 | 0 | goto error; |
3331 | 0 | } |
3332 | 0 | if (peek(parse, "]")) { |
3333 | 0 | if (from > to) { |
3334 | 0 | parse->error = REG_ERANGE; |
3335 | 0 | goto error; |
3336 | 0 | } |
3337 | 0 | add_re_char(re, from, to); |
3338 | 0 | add_re_char(re, '-', '-'); |
3339 | 0 | return; |
3340 | 0 | } else if (!parse_char(parse, 0, &to)) { |
3341 | 0 | parse->error = REG_ERANGE; |
3342 | 0 | goto error; |
3343 | 0 | } |
3344 | 0 | } |
3345 | 0 | if (from > to) { |
3346 | 0 | parse->error = REG_ERANGE; |
3347 | 0 | goto error; |
3348 | 0 | } |
3349 | 0 | add_re_char(re, from, to); |
3350 | 0 | error: |
3351 | 0 | return; |
3352 | 0 | } |
3353 | | |
3354 | 0 | static struct re *parse_simple_exp(struct re_parse *parse) { |
3355 | 0 | struct re *re = NULL; |
3356 | |
|
3357 | 0 | if (match(parse, '[')) { |
3358 | 0 | bool negate = match(parse, '^'); |
3359 | 0 | re = make_re_char_set(negate, parse->no_ranges); |
3360 | 0 | if (re == NULL) { |
3361 | 0 | parse->error = REG_ESPACE; |
3362 | 0 | goto error; |
3363 | 0 | } |
3364 | 0 | parse_char_class(parse, re); |
3365 | 0 | if (parse->error != REG_NOERROR) |
3366 | 0 | goto error; |
3367 | 0 | while (more(parse) && ! peek(parse, "]")) { |
3368 | 0 | parse_char_class(parse, re); |
3369 | 0 | if (parse->error != REG_NOERROR) |
3370 | 0 | goto error; |
3371 | 0 | } |
3372 | 0 | if (! match(parse, ']')) { |
3373 | 0 | parse->error = REG_EBRACK; |
3374 | 0 | goto error; |
3375 | 0 | } |
3376 | 0 | } else if (match(parse, '(')) { |
3377 | 0 | if (match(parse, ')')) { |
3378 | 0 | re = make_re(EPSILON); |
3379 | 0 | if (re == NULL) { |
3380 | 0 | parse->error = REG_ESPACE; |
3381 | 0 | goto error; |
3382 | 0 | } |
3383 | 0 | } else { |
3384 | 0 | re = parse_regexp(parse); |
3385 | 0 | if (re == NULL) |
3386 | 0 | goto error; |
3387 | 0 | if (! match(parse, ')')) { |
3388 | 0 | parse->error = REG_EPAREN; |
3389 | 0 | goto error; |
3390 | 0 | } |
3391 | 0 | } |
3392 | 0 | } else if (match(parse, '.')) { |
3393 | 0 | re = make_re_char_set(1, parse->no_ranges); |
3394 | 0 | if (re == NULL) { |
3395 | 0 | parse->error = REG_ESPACE; |
3396 | 0 | goto error; |
3397 | 0 | } |
3398 | 0 | add_re_char(re, '\n', '\n'); |
3399 | 0 | } else if (more(parse)) { |
3400 | 0 | char c; |
3401 | 0 | if (!parse_char(parse, 1, &c)) { |
3402 | 0 | parse->error = REG_EESCAPE; |
3403 | 0 | goto error; |
3404 | 0 | } |
3405 | 0 | re = make_re_char(c); |
3406 | 0 | if (re == NULL) { |
3407 | 0 | parse->error = REG_ESPACE; |
3408 | 0 | goto error; |
3409 | 0 | } |
3410 | 0 | } else { |
3411 | 0 | re = make_re(EPSILON); |
3412 | 0 | if (re == NULL) { |
3413 | 0 | parse->error = REG_ESPACE; |
3414 | 0 | goto error; |
3415 | 0 | } |
3416 | 0 | } |
3417 | 0 | return re; |
3418 | 0 | error: |
3419 | 0 | re_unref(re); |
3420 | 0 | return NULL; |
3421 | 0 | } |
3422 | | |
3423 | 0 | static int parse_int(struct re_parse *parse) { |
3424 | 0 | const char *lim; |
3425 | 0 | char *end; |
3426 | 0 | size_t used; |
3427 | 0 | long l; |
3428 | | |
3429 | | /* We need to be careful that strtoul will never access |
3430 | | * memory beyond parse->rend |
3431 | | */ |
3432 | 0 | for (lim = parse->rx; lim < parse->rend && *lim >= '0' && *lim <= '9'; |
3433 | 0 | lim++); |
3434 | 0 | if (lim < parse->rend) { |
3435 | 0 | l = strtoul(parse->rx, &end, 10); |
3436 | 0 | used = end - parse->rx; |
3437 | 0 | } else { |
3438 | 0 | char *s = strndup(parse->rx, parse->rend - parse->rx); |
3439 | 0 | if (s == NULL) { |
3440 | 0 | parse->error = REG_ESPACE; |
3441 | 0 | return -1; |
3442 | 0 | } |
3443 | 0 | l = strtoul(s, &end, 10); |
3444 | 0 | used = end - s; |
3445 | 0 | free(s); |
3446 | 0 | } |
3447 | | |
3448 | 0 | if (used == 0) |
3449 | 0 | return -1; |
3450 | 0 | parse->rx += used; |
3451 | 0 | if ((l<0) || (l > INT_MAX)) { |
3452 | 0 | parse->error = REG_BADBR; |
3453 | 0 | return -1; |
3454 | 0 | } |
3455 | 0 | return (int) l; |
3456 | 0 | } |
3457 | | |
3458 | 0 | static struct re *parse_repeated_exp(struct re_parse *parse) { |
3459 | 0 | struct re *re = parse_simple_exp(parse); |
3460 | 0 | if (re == NULL) |
3461 | 0 | goto error; |
3462 | 0 | if (match(parse, '?')) { |
3463 | 0 | re = make_re_rep(re, 0, 1); |
3464 | 0 | } else if (match(parse, '*')) { |
3465 | 0 | re = make_re_rep(re, 0, -1); |
3466 | 0 | } else if (match(parse, '+')) { |
3467 | 0 | re = make_re_rep(re, 1, -1); |
3468 | 0 | } else if (match(parse, '{')) { |
3469 | 0 | int min, max; |
3470 | 0 | min = parse_int(parse); |
3471 | 0 | if (min == -1) |
3472 | 0 | goto error; |
3473 | 0 | if (match(parse, ',')) { |
3474 | 0 | max = parse_int(parse); |
3475 | 0 | if (max == -1) |
3476 | 0 | max = -1; /* If it's not an int, it means 'unbounded' */ |
3477 | 0 | if (! match(parse, '}')) { |
3478 | 0 | parse->error = REG_EBRACE; |
3479 | 0 | goto error; |
3480 | 0 | } |
3481 | 0 | } else if (match(parse, '}')) { |
3482 | 0 | max = min; |
3483 | 0 | } else { |
3484 | 0 | parse->error = REG_EBRACE; |
3485 | 0 | goto error; |
3486 | 0 | } |
3487 | 0 | if (min > max && max != -1) { |
3488 | 0 | parse->error = REG_BADBR; |
3489 | 0 | goto error; |
3490 | 0 | } |
3491 | 0 | re = make_re_rep(re, min, max); |
3492 | 0 | } |
3493 | 0 | if (re == NULL) |
3494 | 0 | parse->error = REG_ESPACE; |
3495 | 0 | return re; |
3496 | 0 | error: |
3497 | 0 | re_unref(re); |
3498 | 0 | return NULL; |
3499 | 0 | } |
3500 | | |
3501 | 0 | static struct re *parse_concat_exp(struct re_parse *parse) { |
3502 | 0 | struct re *re = parse_repeated_exp(parse); |
3503 | 0 | if (re == NULL) |
3504 | 0 | goto error; |
3505 | | |
3506 | 0 | if (more(parse) && ! peek(parse, ")|")) { |
3507 | 0 | struct re *re2 = parse_concat_exp(parse); |
3508 | 0 | if (re2 == NULL) |
3509 | 0 | goto error; |
3510 | 0 | re = make_re_binop(CONCAT, re, re2); |
3511 | 0 | if (re == NULL) { |
3512 | 0 | parse->error = REG_ESPACE; |
3513 | 0 | goto error; |
3514 | 0 | } |
3515 | 0 | } |
3516 | 0 | return re; |
3517 | | |
3518 | 0 | error: |
3519 | 0 | re_unref(re); |
3520 | 0 | return NULL; |
3521 | 0 | } |
3522 | | |
3523 | 0 | static struct re *parse_regexp(struct re_parse *parse) { |
3524 | 0 | struct re *re = NULL; |
3525 | | |
3526 | | /* Something like (|r) */ |
3527 | 0 | if (peek(parse, "|")) |
3528 | 0 | re = make_re(EPSILON); |
3529 | 0 | else |
3530 | 0 | re = parse_concat_exp(parse); |
3531 | 0 | if (re == NULL) |
3532 | 0 | goto error; |
3533 | | |
3534 | 0 | if (match(parse, '|')) { |
3535 | 0 | struct re *re2 = NULL; |
3536 | | /* Something like (r|) */ |
3537 | 0 | if (peek(parse, ")")) |
3538 | 0 | re2 = make_re(EPSILON); |
3539 | 0 | else |
3540 | 0 | re2 = parse_regexp(parse); |
3541 | 0 | if (re2 == NULL) |
3542 | 0 | goto error; |
3543 | | |
3544 | 0 | re = make_re_binop(UNION, re, re2); |
3545 | 0 | if (re == NULL) { |
3546 | 0 | parse->error = REG_ESPACE; |
3547 | 0 | goto error; |
3548 | 0 | } |
3549 | 0 | } |
3550 | 0 | return re; |
3551 | | |
3552 | 0 | error: |
3553 | 0 | re_unref(re); |
3554 | 0 | return NULL; |
3555 | 0 | } |
3556 | | |
3557 | | /* |
3558 | | * Convert a STRUCT RE to a string. Code is complicated by the fact that |
3559 | | * we try to be clever and avoid unneeded parens and concatenation with |
3560 | | * epsilon etc. |
3561 | | */ |
3562 | | static int re_as_string(const struct re *re, struct re_str *str); |
3563 | | |
3564 | 0 | static int re_binop_count(enum re_type type, const struct re *re) { |
3565 | 0 | assert(type == CONCAT || type == UNION); |
3566 | 0 | if (re->type == type) { |
3567 | 0 | return re_binop_count(type, re->exp1) + re_binop_count(type, re->exp2); |
3568 | 0 | } else { |
3569 | 0 | return 1; |
3570 | 0 | } |
3571 | 0 | } |
3572 | | |
3573 | | static int re_binop_store(enum re_type type, const struct re *re, |
3574 | 0 | const struct re **list) { |
3575 | 0 | int pos = 0; |
3576 | 0 | if (type == re->type) { |
3577 | 0 | pos = re_binop_store(type, re->exp1, list); |
3578 | 0 | pos += re_binop_store(type, re->exp2, list + pos); |
3579 | 0 | } else { |
3580 | 0 | list[0] = re; |
3581 | 0 | pos = 1; |
3582 | 0 | } |
3583 | 0 | return pos; |
3584 | 0 | } |
3585 | | |
3586 | 0 | static int re_union_as_string(const struct re *re, struct re_str *str) { |
3587 | 0 | assert(re->type == UNION); |
3588 | | |
3589 | 0 | int result = -1; |
3590 | 0 | const struct re **res = NULL; |
3591 | 0 | struct re_str *strings = NULL; |
3592 | 0 | int nre = 0, r; |
3593 | |
|
3594 | 0 | nre = re_binop_count(re->type, re); |
3595 | 0 | r = ALLOC_N(res, nre); |
3596 | 0 | if (r < 0) |
3597 | 0 | goto done; |
3598 | | |
3599 | 0 | re_binop_store(re->type, re, res); |
3600 | |
|
3601 | 0 | r = ALLOC_N(strings, nre); |
3602 | 0 | if (r < 0) |
3603 | 0 | goto error; |
3604 | | |
3605 | 0 | str->len = 0; |
3606 | 0 | for (int i=0; i < nre; i++) { |
3607 | 0 | if (re_as_string(res[i], strings + i) < 0) |
3608 | 0 | goto error; |
3609 | 0 | str->len += strings[i].len; |
3610 | 0 | } |
3611 | 0 | str->len += nre-1; |
3612 | |
|
3613 | 0 | r = re_str_alloc(str); |
3614 | 0 | if (r < 0) |
3615 | 0 | goto error; |
3616 | | |
3617 | 0 | char *p = str->rx; |
3618 | 0 | for (int i=0; i < nre; i++) { |
3619 | 0 | if (i>0) |
3620 | 0 | *p++ = '|'; |
3621 | 0 | memcpy(p, strings[i].rx, strings[i].len); |
3622 | 0 | p += strings[i].len; |
3623 | 0 | } |
3624 | |
|
3625 | 0 | result = 0; |
3626 | 0 | done: |
3627 | 0 | free(res); |
3628 | 0 | if (strings != NULL) { |
3629 | 0 | for (int i=0; i < nre; i++) |
3630 | 0 | release_re_str(strings + i); |
3631 | 0 | } |
3632 | 0 | free(strings); |
3633 | 0 | return result; |
3634 | 0 | error: |
3635 | 0 | release_re_str(str); |
3636 | 0 | result = -1; |
3637 | 0 | goto done; |
3638 | 0 | } |
3639 | | |
3640 | | ATTRIBUTE_PURE |
3641 | 0 | static int re_needs_parens_in_concat(const struct re *re) { |
3642 | 0 | return (re->type != CHAR && re->type != CSET && re->type != ITER); |
3643 | 0 | } |
3644 | | |
3645 | 0 | static int re_concat_as_string(const struct re *re, struct re_str *str) { |
3646 | 0 | assert(re->type == CONCAT); |
3647 | | |
3648 | 0 | const struct re **res = NULL; |
3649 | 0 | struct re_str *strings = NULL; |
3650 | 0 | int nre = 0, r; |
3651 | 0 | int result = -1; |
3652 | |
|
3653 | 0 | nre = re_binop_count(re->type, re); |
3654 | 0 | r = ALLOC_N(res, nre); |
3655 | 0 | if (r < 0) |
3656 | 0 | goto error; |
3657 | 0 | re_binop_store(re->type, re, res); |
3658 | |
|
3659 | 0 | r = ALLOC_N(strings, nre); |
3660 | 0 | if (r < 0) |
3661 | 0 | goto error; |
3662 | | |
3663 | 0 | str->len = 0; |
3664 | 0 | for (int i=0; i < nre; i++) { |
3665 | 0 | if (res[i]->type == EPSILON) |
3666 | 0 | continue; |
3667 | 0 | if (re_as_string(res[i], strings + i) < 0) |
3668 | 0 | goto error; |
3669 | 0 | str->len += strings[i].len; |
3670 | 0 | if (re_needs_parens_in_concat(res[i])) |
3671 | 0 | str->len += 2; |
3672 | 0 | } |
3673 | | |
3674 | 0 | r = re_str_alloc(str); |
3675 | 0 | if (r < 0) |
3676 | 0 | goto error; |
3677 | | |
3678 | 0 | char *p = str->rx; |
3679 | 0 | for (int i=0; i < nre; i++) { |
3680 | 0 | if (res[i]->type == EPSILON) |
3681 | 0 | continue; |
3682 | 0 | if (re_needs_parens_in_concat(res[i])) |
3683 | 0 | *p++ = '('; |
3684 | 0 | p = memcpy(p, strings[i].rx, strings[i].len); |
3685 | 0 | p += strings[i].len; |
3686 | 0 | if (re_needs_parens_in_concat(res[i])) |
3687 | 0 | *p++ = ')'; |
3688 | 0 | } |
3689 | |
|
3690 | 0 | result = 0; |
3691 | 0 | done: |
3692 | 0 | free(res); |
3693 | 0 | if (strings != NULL) { |
3694 | 0 | for (int i=0; i < nre; i++) |
3695 | 0 | release_re_str(strings + i); |
3696 | 0 | } |
3697 | 0 | free(strings); |
3698 | 0 | return result; |
3699 | 0 | error: |
3700 | 0 | release_re_str(str); |
3701 | 0 | result = -1; |
3702 | 0 | goto done; |
3703 | 0 | } |
3704 | | |
3705 | 0 | static bool cset_contains(const struct re *cset, int c) { |
3706 | 0 | return bitset_get(cset->cset, c) != cset->negate; |
3707 | 0 | } |
3708 | | |
3709 | 0 | static int re_cset_as_string(const struct re *re, struct re_str *str) { |
3710 | 0 | const uchar rbrack = ']'; |
3711 | 0 | const uchar dash = '-'; |
3712 | 0 | const uchar nul = '\0'; |
3713 | |
|
3714 | 0 | static const char *const empty_set = "[]"; |
3715 | 0 | static const char *const total_set = "(.|\n)"; |
3716 | 0 | static const char *const not_newline = "."; |
3717 | |
|
3718 | 0 | char *s; |
3719 | 0 | int from, to, negate; |
3720 | 0 | size_t len; |
3721 | 0 | int incl_rbrack, incl_dash; |
3722 | 0 | int r; |
3723 | |
|
3724 | 0 | str->len = strlen(empty_set); |
3725 | | |
3726 | | /* We can not include NUL explicitly in a CSET since we use ordinary |
3727 | | NUL delimited strings to represent them. That means that we need to |
3728 | | use negated representation if NUL is to be included (and vice versa) |
3729 | | */ |
3730 | 0 | negate = cset_contains(re, nul); |
3731 | 0 | if (negate) { |
3732 | 0 | for (from = UCHAR_MIN; |
3733 | 0 | from <= UCHAR_MAX && cset_contains(re, from); |
3734 | 0 | from += 1); |
3735 | 0 | if (from > UCHAR_MAX) { |
3736 | | /* Special case: the set matches every character */ |
3737 | 0 | str->rx = strdup(total_set); |
3738 | 0 | goto done; |
3739 | 0 | } |
3740 | 0 | if (from == '\n') { |
3741 | 0 | for (from += 1; |
3742 | 0 | from <= UCHAR_MAX && cset_contains(re, from); |
3743 | 0 | from += 1); |
3744 | 0 | if (from > UCHAR_MAX) { |
3745 | | /* Special case: the set matches everything but '\n' */ |
3746 | 0 | str->rx = strdup(not_newline); |
3747 | 0 | goto done; |
3748 | 0 | } |
3749 | 0 | } |
3750 | 0 | } |
3751 | | |
3752 | | /* See if ']' and '-' will be explicitly included in the character set |
3753 | | (INCL_RBRACK, INCL_DASH) As we loop over the character set, we reset |
3754 | | these flags if they are in the set, but not mentioned explicitly |
3755 | | */ |
3756 | 0 | incl_rbrack = cset_contains(re, rbrack) != negate; |
3757 | 0 | incl_dash = cset_contains(re, dash) != negate; |
3758 | |
|
3759 | 0 | if (re->no_ranges) { |
3760 | 0 | for (from = UCHAR_MIN; from <= UCHAR_MAX; from++) |
3761 | 0 | if (cset_contains(re, from) != negate) |
3762 | 0 | str->len += 1; |
3763 | 0 | } else { |
3764 | 0 | for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) { |
3765 | 0 | while (from <= UCHAR_MAX && cset_contains(re, from) == negate) |
3766 | 0 | from += 1; |
3767 | 0 | if (from > UCHAR_MAX) |
3768 | 0 | break; |
3769 | 0 | for (to = from; |
3770 | 0 | to < UCHAR_MAX && (cset_contains(re, to+1) != negate); |
3771 | 0 | to++); |
3772 | |
|
3773 | 0 | if (to == from && (from == rbrack || from == dash)) |
3774 | 0 | continue; |
3775 | 0 | if (from == rbrack || from == dash) |
3776 | 0 | from += 1; |
3777 | 0 | if (to == rbrack || to == dash) |
3778 | 0 | to -= 1; |
3779 | |
|
3780 | 0 | len = (to == from) ? 1 : ((to == from + 1) ? 2 : 3); |
3781 | |
|
3782 | 0 | if (from < rbrack && rbrack < to) |
3783 | 0 | incl_rbrack = 0; |
3784 | 0 | if (from < dash && dash < to) |
3785 | 0 | incl_dash = 0; |
3786 | 0 | str->len += len; |
3787 | 0 | } |
3788 | 0 | str->len += incl_rbrack + incl_dash; |
3789 | 0 | } |
3790 | 0 | if (negate) |
3791 | 0 | str->len += 1; /* For the ^ */ |
3792 | |
|
3793 | 0 | r = re_str_alloc(str); |
3794 | 0 | if (r < 0) |
3795 | 0 | goto error; |
3796 | | |
3797 | 0 | s = str->rx; |
3798 | 0 | *s++ = '['; |
3799 | 0 | if (negate) |
3800 | 0 | *s++ = '^'; |
3801 | 0 | if (incl_rbrack) |
3802 | 0 | *s++ = rbrack; |
3803 | |
|
3804 | 0 | if (re->no_ranges) { |
3805 | 0 | for (from = UCHAR_MIN; from <= UCHAR_MAX; from++) { |
3806 | 0 | if (from == rbrack || from == dash) |
3807 | 0 | continue; |
3808 | 0 | if (cset_contains(re, from) != negate) |
3809 | 0 | *s++ = from; |
3810 | 0 | } |
3811 | 0 | } else { |
3812 | 0 | for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) { |
3813 | 0 | while (from <= UCHAR_MAX && cset_contains(re, from) == negate) |
3814 | 0 | from += 1; |
3815 | 0 | if (from > UCHAR_MAX) |
3816 | 0 | break; |
3817 | 0 | for (to = from; |
3818 | 0 | to < UCHAR_MAX && (cset_contains(re, to+1) != negate); |
3819 | 0 | to++); |
3820 | |
|
3821 | 0 | if (to == from && (from == rbrack || from == dash)) |
3822 | 0 | continue; |
3823 | 0 | if (from == rbrack || from == dash) |
3824 | 0 | from += 1; |
3825 | 0 | if (to == rbrack || to == dash) |
3826 | 0 | to -= 1; |
3827 | |
|
3828 | 0 | if (to == from) { |
3829 | 0 | *s++ = from; |
3830 | 0 | } else if (to == from + 1) { |
3831 | 0 | *s++ = from; |
3832 | 0 | *s++ = to; |
3833 | 0 | } else { |
3834 | 0 | *s++ = from; |
3835 | 0 | *s++ = '-'; |
3836 | 0 | *s++ = to; |
3837 | 0 | } |
3838 | 0 | } |
3839 | 0 | } |
3840 | 0 | if (incl_dash) |
3841 | 0 | *s++ = dash; |
3842 | |
|
3843 | 0 | *s = ']'; |
3844 | 0 | done: |
3845 | 0 | if (str->rx == NULL) |
3846 | 0 | goto error; |
3847 | 0 | str->len = strlen(str->rx); |
3848 | 0 | return 0; |
3849 | 0 | error: |
3850 | 0 | release_re_str(str); |
3851 | 0 | return -1; |
3852 | 0 | } |
3853 | | |
3854 | 0 | static int re_iter_as_string(const struct re *re, struct re_str *str) { |
3855 | 0 | const char *quant = NULL; |
3856 | 0 | char *iter = NULL; |
3857 | 0 | int r, result = -1; |
3858 | |
|
3859 | 0 | if (re_as_string(re->exp, str) < 0) |
3860 | 0 | return -1; |
3861 | | |
3862 | 0 | if (re->min == 0 && re->max == -1) { |
3863 | 0 | quant = "*"; |
3864 | 0 | } else if (re->min == 1 && re->max == -1) { |
3865 | 0 | quant = "+"; |
3866 | 0 | } else if (re->min == 0 && re->max == 1) { |
3867 | 0 | quant = "?"; |
3868 | 0 | } else if (re->max == -1) { |
3869 | 0 | r = asprintf(&iter, "{%d,}", re->min); |
3870 | 0 | if (r < 0) |
3871 | 0 | return -1; |
3872 | 0 | quant = iter; |
3873 | 0 | } else { |
3874 | 0 | r = asprintf(&iter, "{%d,%d}", re->min, re->max); |
3875 | 0 | if (r < 0) |
3876 | 0 | return -1; |
3877 | 0 | quant = iter; |
3878 | 0 | } |
3879 | | |
3880 | 0 | if (re->exp->type == CHAR || re->exp->type == CSET) { |
3881 | 0 | if (REALLOC_N(str->rx, str->len + strlen(quant) + 1) < 0) |
3882 | 0 | goto error; |
3883 | 0 | strcpy(str->rx + str->len, quant); |
3884 | 0 | str->len += strlen(quant); |
3885 | 0 | } else { |
3886 | | /* Format '(' + str->rx ')' + quant */ |
3887 | 0 | if (REALLOC_N(str->rx, str->len + strlen(quant) + 1 + 2) < 0) |
3888 | 0 | goto error; |
3889 | 0 | memmove(str->rx + 1, str->rx, str->len); |
3890 | 0 | str->rx[0] = '('; |
3891 | 0 | str->rx[str->len + 1] = ')'; |
3892 | 0 | str->len += 2; |
3893 | 0 | strcpy(str->rx + str->len, quant); |
3894 | 0 | str->len += strlen(quant); |
3895 | 0 | } |
3896 | | |
3897 | 0 | result = 0; |
3898 | 0 | done: |
3899 | 0 | FREE(iter); |
3900 | 0 | return result; |
3901 | 0 | error: |
3902 | 0 | release_re_str(str); |
3903 | 0 | goto done; |
3904 | 0 | } |
3905 | | |
3906 | 0 | static int re_as_string(const struct re *re, struct re_str *str) { |
3907 | | /* Characters that must be escaped */ |
3908 | 0 | static const char * const special_chars = ".()[]{}*|+?\\^$"; |
3909 | 0 | int result = 0; |
3910 | |
|
3911 | 0 | switch(re->type) { |
3912 | 0 | case UNION: |
3913 | 0 | result = re_union_as_string(re, str); |
3914 | 0 | break; |
3915 | 0 | case CONCAT: |
3916 | 0 | result = re_concat_as_string(re, str); |
3917 | 0 | break; |
3918 | 0 | case CSET: |
3919 | 0 | result = re_cset_as_string(re, str); |
3920 | 0 | break; |
3921 | 0 | case CHAR: |
3922 | 0 | if (re->c == '\0' || strchr(special_chars, re->c) == NULL) { |
3923 | 0 | if (ALLOC_N(str->rx, 2) < 0) |
3924 | 0 | goto error; |
3925 | 0 | str->rx[0] = re->c; |
3926 | 0 | str->len = 1; |
3927 | 0 | } else { |
3928 | 0 | if (ALLOC_N(str->rx, 3) < 0) |
3929 | 0 | goto error; |
3930 | 0 | str->rx[0] = '\\'; |
3931 | 0 | str->rx[1] = re->c; |
3932 | 0 | str->len = strlen(str->rx); |
3933 | 0 | } |
3934 | 0 | break; |
3935 | 0 | case ITER: |
3936 | 0 | result = re_iter_as_string(re, str); |
3937 | 0 | break; |
3938 | 0 | case EPSILON: |
3939 | 0 | if (ALLOC_N(str->rx, 3) < 0) |
3940 | 0 | goto error; |
3941 | 0 | strcpy(str->rx, "()"); |
3942 | 0 | str->len = strlen(str->rx); |
3943 | 0 | break; |
3944 | 0 | default: |
3945 | 0 | assert(0); |
3946 | 0 | abort(); |
3947 | 0 | break; |
3948 | 0 | } |
3949 | 0 | return result; |
3950 | 0 | error: |
3951 | 0 | release_re_str(str); |
3952 | 0 | return -1; |
3953 | 0 | } |
3954 | | |
3955 | 0 | static int convert_trans_to_re(struct state *s) { |
3956 | 0 | struct re *re = NULL; |
3957 | 0 | size_t nto = 1; |
3958 | 0 | struct trans *trans = NULL; |
3959 | 0 | int r; |
3960 | |
|
3961 | 0 | if (s->tused == 0) |
3962 | 0 | return 0; |
3963 | | |
3964 | 0 | qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp); |
3965 | 0 | for (int i = 0; i < s->tused - 1; i++) { |
3966 | 0 | if (s->trans[i].to != s->trans[i+1].to) |
3967 | 0 | nto += 1; |
3968 | 0 | } |
3969 | 0 | r = ALLOC_N(trans, nto); |
3970 | 0 | if (r < 0) |
3971 | 0 | goto error; |
3972 | | |
3973 | 0 | struct state *to = s->trans[0].to; |
3974 | 0 | int tind = 0; |
3975 | 0 | for_each_trans(t, s) { |
3976 | 0 | if (t->to != to) { |
3977 | 0 | trans[tind].to = to; |
3978 | 0 | trans[tind].re = re; |
3979 | 0 | tind += 1; |
3980 | 0 | re = NULL; |
3981 | 0 | to = t->to; |
3982 | 0 | } |
3983 | 0 | if (re == NULL) { |
3984 | 0 | re = make_re_char_set(0, 0); |
3985 | 0 | if (re == NULL) |
3986 | 0 | goto error; |
3987 | 0 | } |
3988 | 0 | add_re_char(re, t->min, t->max); |
3989 | 0 | } |
3990 | 0 | assert(nto == tind + 1); |
3991 | 0 | trans[tind].to = to; |
3992 | 0 | trans[tind].re = re; |
3993 | | |
3994 | | /* Simplify CSETs with a single char to a CHAR */ |
3995 | 0 | for (int t=0; t < nto; t++) { |
3996 | 0 | int cnt = 0; |
3997 | 0 | uchar chr = UCHAR_MIN; |
3998 | 0 | for (int c = 0; c < UCHAR_NUM; c++) { |
3999 | 0 | if (bitset_get(trans[t].re->cset, c)) { |
4000 | 0 | cnt += 1; |
4001 | 0 | chr = c; |
4002 | 0 | } |
4003 | 0 | } |
4004 | 0 | if (cnt == 1) { |
4005 | 0 | re_unref(trans[t].re); |
4006 | 0 | trans[t].re = make_re_char(chr); |
4007 | 0 | if (trans[t].re == NULL) |
4008 | 0 | goto error; |
4009 | 0 | } |
4010 | 0 | } |
4011 | 0 | free_trans(s); |
4012 | 0 | s->trans = trans; |
4013 | 0 | s->tused = s->tsize = nto; |
4014 | 0 | return 0; |
4015 | | |
4016 | 0 | error: |
4017 | 0 | if (trans) |
4018 | 0 | for (int i=0; i < nto; i++) |
4019 | 0 | unref(trans[i].re, re); |
4020 | 0 | free(trans); |
4021 | 0 | return -1; |
4022 | 0 | } |
4023 | | |
4024 | | ATTRIBUTE_RETURN_CHECK |
4025 | | static int add_new_re_trans(struct state *s1, struct state *s2, |
4026 | 0 | struct re *re) { |
4027 | 0 | int r; |
4028 | 0 | r = add_new_trans(s1, s2, 0, 0); |
4029 | 0 | if (r < 0) |
4030 | 0 | return -1; |
4031 | 0 | last_trans(s1)->re = re; |
4032 | 0 | return 0; |
4033 | 0 | } |
4034 | | |
4035 | | /* Add the regular expression R1 . LOOP* . R2 to the transition |
4036 | | from S1 to S2. */ |
4037 | | static int re_collapse_trans(struct state *s1, struct state *s2, |
4038 | 0 | struct re *r1, struct re *loop, struct re *r2) { |
4039 | 0 | struct re *re = NULL; |
4040 | |
|
4041 | 0 | if (loop->type != EPSILON) { |
4042 | 0 | loop = make_re_rep(ref(loop), 0, -1); |
4043 | 0 | if (loop == NULL) |
4044 | 0 | goto error; |
4045 | 0 | } |
4046 | | |
4047 | 0 | if (r1->type == EPSILON) { |
4048 | 0 | if (loop->type == EPSILON) { |
4049 | 0 | re = ref(r2); |
4050 | 0 | } else { |
4051 | 0 | re = make_re_binop(CONCAT, loop, ref(r2)); |
4052 | 0 | } |
4053 | 0 | } else { |
4054 | 0 | if (loop->type == EPSILON) { |
4055 | 0 | if (r2->type == EPSILON) { |
4056 | 0 | re = ref(r1); |
4057 | 0 | } else { |
4058 | 0 | re = make_re_binop(CONCAT, ref(r1), ref(r2)); |
4059 | 0 | } |
4060 | 0 | } else { |
4061 | 0 | re = make_re_binop(CONCAT, ref(r1), loop); |
4062 | 0 | if (re != NULL && r2->type != EPSILON) { |
4063 | 0 | re = make_re_binop(CONCAT, re, ref(r2)); |
4064 | 0 | } |
4065 | 0 | } |
4066 | 0 | } |
4067 | 0 | if (re == NULL) |
4068 | 0 | goto error; |
4069 | | |
4070 | 0 | struct trans *t = NULL; |
4071 | 0 | for (t = s1->trans; t <= last_trans(s1) && t->to != s2; t += 1); |
4072 | 0 | if (t > last_trans(s1)) { |
4073 | 0 | if (add_new_re_trans(s1, s2, re) < 0) |
4074 | 0 | goto error; |
4075 | 0 | } else { |
4076 | 0 | if (t->re == NULL) { |
4077 | 0 | t->re = re; |
4078 | 0 | } else { |
4079 | 0 | t->re = make_re_binop(UNION, re, t->re); |
4080 | 0 | if (t->re == NULL) |
4081 | 0 | goto error; |
4082 | 0 | } |
4083 | 0 | } |
4084 | 0 | return 0; |
4085 | 0 | error: |
4086 | | // FIXME: make sure we don't leak loop |
4087 | 0 | return -1; |
4088 | 0 | } |
4089 | | |
4090 | 0 | static int convert_strings(struct fa *fa) { |
4091 | 0 | struct state_set *worklist = state_set_init(-1, S_NONE); |
4092 | 0 | int result = -1; |
4093 | |
|
4094 | 0 | E(worklist == NULL); |
4095 | | |
4096 | | /* Abuse hash to count indegree, reachable to mark visited states */ |
4097 | 0 | list_for_each(s, fa->initial) { |
4098 | 0 | s->hash = 0; |
4099 | 0 | s->reachable = 0; |
4100 | 0 | } |
4101 | |
|
4102 | 0 | list_for_each(s, fa->initial) { |
4103 | 0 | for_each_trans(t, s) { |
4104 | 0 | t->to->hash += 1; |
4105 | 0 | } |
4106 | 0 | } |
4107 | |
|
4108 | 0 | for (struct state *s = fa->initial; |
4109 | 0 | s != NULL; |
4110 | 0 | s = state_set_pop(worklist)) { |
4111 | 0 | for (int i=0; i < s->tused; i++) { |
4112 | 0 | struct trans *t = s->trans + i; |
4113 | 0 | struct state *to = t->to; |
4114 | 0 | while (to->hash == 1 && to->tused == 1 && ! to->accept) { |
4115 | 0 | if (t->re == NULL) { |
4116 | 0 | t->re = to->trans->re; |
4117 | 0 | to->trans->re = NULL; |
4118 | 0 | } else { |
4119 | 0 | t->re = make_re_binop(CONCAT, t->re, to->trans->re); |
4120 | 0 | if (t->re == NULL) |
4121 | 0 | goto error; |
4122 | 0 | } |
4123 | 0 | t->to = to->trans->to; |
4124 | 0 | to->tused = 0; |
4125 | 0 | to->hash -= 1; |
4126 | 0 | to = t->to; |
4127 | 0 | for (int j=0; j < s->tused; j++) { |
4128 | 0 | if (j != i && s->trans[j].to == to) { |
4129 | | /* Combine transitions i and j; remove trans j */ |
4130 | 0 | t->re = make_re_binop(UNION, t->re, s->trans[j].re); |
4131 | 0 | if (t->re == NULL) |
4132 | 0 | goto error; |
4133 | 0 | memmove(s->trans + j, s->trans + j + 1, |
4134 | 0 | sizeof(s->trans[j]) * (s->tused - j - 1)); |
4135 | 0 | to->hash -= 1; |
4136 | 0 | s->tused -= 1; |
4137 | 0 | if (j < i) { |
4138 | 0 | i = i - 1; |
4139 | 0 | t = s->trans + i; |
4140 | 0 | } |
4141 | 0 | } |
4142 | 0 | } |
4143 | 0 | } |
4144 | | |
4145 | 0 | if (! to->reachable) { |
4146 | 0 | to->reachable = 1; |
4147 | 0 | F(state_set_push(worklist, to)); |
4148 | 0 | } |
4149 | 0 | } |
4150 | 0 | } |
4151 | | |
4152 | 0 | for (struct state *s = fa->initial; s->next != NULL; ) { |
4153 | 0 | if (s->next->hash == 0 && s->next->tused == 0) { |
4154 | 0 | struct state *del = s->next; |
4155 | 0 | s->next = del->next; |
4156 | 0 | free(del->trans); |
4157 | 0 | free(del); |
4158 | 0 | } else { |
4159 | 0 | s = s->next; |
4160 | 0 | } |
4161 | 0 | } |
4162 | 0 | result = 0; |
4163 | |
|
4164 | 0 | error: |
4165 | 0 | state_set_free(worklist); |
4166 | 0 | return result; |
4167 | 0 | } |
4168 | | |
4169 | | /* Convert an FA to a regular expression. |
4170 | | * The strategy is the following: |
4171 | | * (1) For all states S1 and S2, convert the transitions between them |
4172 | | * into one transition whose regexp is a CSET |
4173 | | * (2) Add a new initial state INI and a new final state FIN to the automaton, |
4174 | | * a transition from INI to the old initial state of FA, and a transition |
4175 | | * from all accepting states of FA to FIN; the regexp on those transitions |
4176 | | * matches only the empty word |
4177 | | * (3) Eliminate states S (except for INI and FIN) one by one: |
4178 | | * Let LOOP the regexp for the transition S -> S if it exists, epsilon |
4179 | | * otherwise. |
4180 | | * For all S1, S2 different from S with S1 -> S -> S2 |
4181 | | * Let R1 the regexp of S1 -> S |
4182 | | * R2 the regexp of S -> S2 |
4183 | | * R3 the regexp S1 -> S2 (or epsilon if no such transition) |
4184 | | * set the regexp on the transition S1 -> S2 to |
4185 | | * R1 . (LOOP)* . R2 | R3 |
4186 | | * (4) The regexp for the whole FA can now be found as the regexp of |
4187 | | * the transition INI -> FIN |
4188 | | * (5) Convert that STRUCT RE to a string with RE_AS_STRING |
4189 | | */ |
4190 | 0 | int fa_as_regexp(struct fa *fa, char **regexp, size_t *regexp_len) { |
4191 | 0 | int r; |
4192 | 0 | struct state *fin = NULL, *ini = NULL; |
4193 | 0 | struct re *eps = NULL; |
4194 | |
|
4195 | 0 | *regexp = NULL; |
4196 | 0 | *regexp_len = 0; |
4197 | 0 | fa = fa_clone(fa); |
4198 | 0 | if (fa == NULL) |
4199 | 0 | goto error; |
4200 | | |
4201 | 0 | eps = make_re(EPSILON); |
4202 | 0 | if (eps == NULL) |
4203 | 0 | goto error; |
4204 | | |
4205 | 0 | fin = add_state(fa,1); |
4206 | 0 | if (fin == NULL) |
4207 | 0 | goto error; |
4208 | | |
4209 | 0 | fa->trans_re = 1; |
4210 | |
|
4211 | 0 | list_for_each(s, fa->initial) { |
4212 | 0 | r = convert_trans_to_re(s); |
4213 | 0 | if (r < 0) |
4214 | 0 | goto error; |
4215 | 0 | if (s->accept && s != fin) { |
4216 | 0 | r = add_new_re_trans(s, fin, ref(eps)); |
4217 | 0 | if (r < 0) |
4218 | 0 | goto error; |
4219 | 0 | s->accept = 0; |
4220 | 0 | } |
4221 | 0 | } |
4222 | | |
4223 | 0 | ini = add_state(fa, 0); |
4224 | 0 | if (ini == NULL) |
4225 | 0 | goto error; |
4226 | | |
4227 | 0 | r = add_new_re_trans(ini, fa->initial, ref(eps)); |
4228 | 0 | if (r < 0) |
4229 | 0 | goto error; |
4230 | 0 | set_initial(fa, ini); |
4231 | |
|
4232 | 0 | convert_strings(fa); |
4233 | |
|
4234 | 0 | list_for_each(s, fa->initial->next) { |
4235 | 0 | if (s == fin) |
4236 | 0 | continue; |
4237 | | /* Eliminate S */ |
4238 | 0 | struct re *loop = eps; |
4239 | 0 | for_each_trans(t, s) { |
4240 | 0 | if (t->to == s) |
4241 | 0 | loop = t->re; |
4242 | 0 | } |
4243 | 0 | list_for_each(s1, fa->initial) { |
4244 | 0 | if (s == s1) |
4245 | 0 | continue; |
4246 | 0 | for (int t1 = 0; t1 < s1->tused; t1++) { |
4247 | 0 | if (s1->trans[t1].to == s) { |
4248 | 0 | for (int t = 0; t < s->tused; t++) { |
4249 | 0 | if (s->trans[t].to == s) |
4250 | 0 | continue; |
4251 | 0 | r = re_collapse_trans(s1, s->trans[t].to, |
4252 | 0 | s1->trans[t1].re, |
4253 | 0 | loop, |
4254 | 0 | s->trans[t].re); |
4255 | 0 | if (r < 0) |
4256 | 0 | goto error; |
4257 | 0 | } |
4258 | 0 | } |
4259 | 0 | } |
4260 | 0 | } |
4261 | 0 | } |
4262 | | |
4263 | 0 | re_unref(eps); |
4264 | | |
4265 | 0 | for_each_trans(t, fa->initial) { |
4266 | 0 | if (t->to == fin) { |
4267 | 0 | struct re_str str; |
4268 | 0 | MEMZERO(&str, 1); |
4269 | 0 | if (re_as_string(t->re, &str) < 0) |
4270 | 0 | goto error; |
4271 | 0 | *regexp = str.rx; |
4272 | 0 | *regexp_len = str.len; |
4273 | 0 | } |
4274 | 0 | } |
4275 | | |
4276 | 0 | list_for_each(s, fa->initial) { |
4277 | 0 | for_each_trans(t, s) { |
4278 | 0 | unref(t->re, re); |
4279 | 0 | } |
4280 | 0 | } |
4281 | 0 | fa_free(fa); |
4282 | |
|
4283 | 0 | return 0; |
4284 | 0 | error: |
4285 | 0 | fa_free(fa); |
4286 | 0 | re_unref(eps); |
4287 | 0 | return -1; |
4288 | 0 | } |
4289 | | |
4290 | 0 | static int re_restrict_alphabet(struct re *re, uchar from, uchar to) { |
4291 | 0 | int r1, r2; |
4292 | 0 | int result = 0; |
4293 | |
|
4294 | 0 | switch(re->type) { |
4295 | 0 | case UNION: |
4296 | 0 | case CONCAT: |
4297 | 0 | r1 = re_restrict_alphabet(re->exp1, from, to); |
4298 | 0 | r2 = re_restrict_alphabet(re->exp2, from, to); |
4299 | 0 | result = (r1 != 0) ? r1 : r2; |
4300 | 0 | break; |
4301 | 0 | case CSET: |
4302 | 0 | if (re->negate) { |
4303 | 0 | re->negate = 0; |
4304 | 0 | bitset_negate(re->cset, UCHAR_NUM); |
4305 | 0 | } |
4306 | 0 | for (int i=from; i <= to; i++) |
4307 | 0 | bitset_clr(re->cset, i); |
4308 | 0 | break; |
4309 | 0 | case CHAR: |
4310 | 0 | if (from <= re->c && re->c <= to) |
4311 | 0 | result = -1; |
4312 | 0 | break; |
4313 | 0 | case ITER: |
4314 | 0 | result = re_restrict_alphabet(re->exp, from, to); |
4315 | 0 | break; |
4316 | 0 | case EPSILON: |
4317 | 0 | break; |
4318 | 0 | default: |
4319 | 0 | assert(0); |
4320 | 0 | abort(); |
4321 | 0 | break; |
4322 | 0 | } |
4323 | 0 | return result; |
4324 | 0 | } |
4325 | | |
4326 | | int fa_restrict_alphabet(const char *regexp, size_t regexp_len, |
4327 | | char **newregexp, size_t *newregexp_len, |
4328 | 0 | char from, char to) { |
4329 | 0 | int result; |
4330 | 0 | struct re *re = NULL; |
4331 | 0 | struct re_parse parse; |
4332 | 0 | struct re_str str; |
4333 | |
|
4334 | 0 | *newregexp = NULL; |
4335 | 0 | MEMZERO(&parse, 1); |
4336 | 0 | parse.rx = regexp; |
4337 | 0 | parse.rend = regexp + regexp_len; |
4338 | 0 | parse.error = REG_NOERROR; |
4339 | 0 | re = parse_regexp(&parse); |
4340 | 0 | if (parse.error != REG_NOERROR) |
4341 | 0 | return parse.error; |
4342 | | |
4343 | 0 | result = re_restrict_alphabet(re, from, to); |
4344 | 0 | if (result != 0) { |
4345 | 0 | result = -2; |
4346 | 0 | goto done; |
4347 | 0 | } |
4348 | | |
4349 | 0 | MEMZERO(&str, 1); |
4350 | 0 | result = re_as_string(re, &str); |
4351 | 0 | *newregexp = str.rx; |
4352 | 0 | *newregexp_len = str.len; |
4353 | 0 | done: |
4354 | 0 | re_unref(re); |
4355 | 0 | return result; |
4356 | 0 | } |
4357 | | |
4358 | | int fa_expand_char_ranges(const char *regexp, size_t regexp_len, |
4359 | 0 | char **newregexp, size_t *newregexp_len) { |
4360 | 0 | int result; |
4361 | 0 | struct re *re = NULL; |
4362 | 0 | struct re_parse parse; |
4363 | 0 | struct re_str str; |
4364 | |
|
4365 | 0 | *newregexp = NULL; |
4366 | 0 | MEMZERO(&parse, 1); |
4367 | 0 | parse.rx = regexp; |
4368 | 0 | parse.rend = regexp + regexp_len; |
4369 | 0 | parse.error = REG_NOERROR; |
4370 | 0 | parse.no_ranges = 1; |
4371 | 0 | re = parse_regexp(&parse); |
4372 | 0 | if (parse.error != REG_NOERROR) |
4373 | 0 | return parse.error; |
4374 | | |
4375 | 0 | MEMZERO(&str, 1); |
4376 | 0 | result = re_as_string(re, &str); |
4377 | 0 | *newregexp = str.rx; |
4378 | 0 | *newregexp_len = str.len; |
4379 | 0 | re_unref(re); |
4380 | 0 | return result; |
4381 | 0 | } |
4382 | | |
4383 | | /* Expand regexp so that it is case-insensitive in a case-sensitive match. |
4384 | | * |
4385 | | * Return 1 when a change was made, -1 when an allocation failed, and 0 |
4386 | | * when no change was made. |
4387 | | */ |
4388 | 0 | static int re_case_expand(struct re *re) { |
4389 | 0 | int result = 0, r1, r2; |
4390 | |
|
4391 | 0 | switch(re->type) { |
4392 | 0 | case UNION: |
4393 | 0 | case CONCAT: |
4394 | 0 | r1 = re_case_expand(re->exp1); |
4395 | 0 | r2 = re_case_expand(re->exp2); |
4396 | 0 | result = (r1 != 0) ? r1 : r2; |
4397 | 0 | break; |
4398 | 0 | case CSET: |
4399 | 0 | for (int c = 'A'; c <= 'Z'; c++) |
4400 | 0 | if (bitset_get(re->cset, c)) { |
4401 | 0 | result = 1; |
4402 | 0 | bitset_set(re->cset, tolower(c)); |
4403 | 0 | } |
4404 | 0 | for (int c = 'a'; c <= 'z'; c++) |
4405 | 0 | if (bitset_get(re->cset, c)) { |
4406 | 0 | result = 1; |
4407 | 0 | bitset_set(re->cset, toupper(c)); |
4408 | 0 | } |
4409 | 0 | break; |
4410 | 0 | case CHAR: |
4411 | 0 | if (isalpha(re->c)) { |
4412 | 0 | int c = re->c; |
4413 | 0 | re->type = CSET; |
4414 | 0 | re->negate = false; |
4415 | 0 | re->no_ranges = 0; |
4416 | 0 | re->cset = bitset_init(UCHAR_NUM); |
4417 | 0 | if (re->cset == NULL) |
4418 | 0 | return -1; |
4419 | 0 | bitset_set(re->cset, tolower(c)); |
4420 | 0 | bitset_set(re->cset, toupper(c)); |
4421 | 0 | result = 1; |
4422 | 0 | } |
4423 | 0 | break; |
4424 | 0 | case ITER: |
4425 | 0 | result = re_case_expand(re->exp); |
4426 | 0 | break; |
4427 | 0 | case EPSILON: |
4428 | 0 | break; |
4429 | 0 | default: |
4430 | 0 | assert(0); |
4431 | 0 | abort(); |
4432 | 0 | break; |
4433 | 0 | } |
4434 | 0 | return result; |
4435 | 0 | } |
4436 | | |
4437 | | int fa_expand_nocase(const char *regexp, size_t regexp_len, |
4438 | 0 | char **newregexp, size_t *newregexp_len) { |
4439 | 0 | int result, r; |
4440 | 0 | struct re *re = NULL; |
4441 | 0 | struct re_parse parse; |
4442 | 0 | struct re_str str; |
4443 | |
|
4444 | 0 | *newregexp = NULL; |
4445 | 0 | MEMZERO(&parse, 1); |
4446 | 0 | parse.rx = regexp; |
4447 | 0 | parse.rend = regexp + regexp_len; |
4448 | 0 | parse.error = REG_NOERROR; |
4449 | 0 | re = parse_regexp(&parse); |
4450 | 0 | if (parse.error != REG_NOERROR) |
4451 | 0 | return parse.error; |
4452 | | |
4453 | 0 | r = re_case_expand(re); |
4454 | 0 | if (r < 0) { |
4455 | 0 | re_unref(re); |
4456 | 0 | return REG_ESPACE; |
4457 | 0 | } |
4458 | | |
4459 | 0 | if (r == 1) { |
4460 | 0 | MEMZERO(&str, 1); |
4461 | 0 | result = re_as_string(re, &str); |
4462 | 0 | *newregexp = str.rx; |
4463 | 0 | *newregexp_len = str.len; |
4464 | 0 | } else { |
4465 | 0 | *newregexp = strndup(regexp, regexp_len); |
4466 | 0 | *newregexp_len = regexp_len; |
4467 | 0 | result = (*newregexp == NULL) ? REG_ESPACE : REG_NOERROR; |
4468 | 0 | } |
4469 | 0 | re_unref(re); |
4470 | 0 | return result; |
4471 | 0 | } |
4472 | | |
4473 | 0 | static void print_char(FILE *out, uchar c) { |
4474 | | /* We escape '/' as '\\/' since dot chokes on bare slashes in labels; |
4475 | | Also, a space ' ' is shown as '\s' */ |
4476 | 0 | static const char *const escape_from = " \n\t\v\b\r\f\a/\0"; |
4477 | 0 | static const char *const escape_to = "sntvbrfa/0"; |
4478 | 0 | char *p = strchr(escape_from, c); |
4479 | 0 | if (p != NULL) { |
4480 | 0 | int i = p - escape_from; |
4481 | 0 | fprintf(out, "\\\\%c", escape_to[i]); |
4482 | 0 | } else if (! isprint(c)) { |
4483 | 0 | fprintf(out, "\\\\0%03o", (unsigned char) c); |
4484 | 0 | } else if (c == '"') { |
4485 | 0 | fprintf(out, "\\\""); |
4486 | 0 | } else { |
4487 | 0 | fputc(c, out); |
4488 | 0 | } |
4489 | 0 | } |
4490 | | |
4491 | 0 | void fa_dot(FILE *out, struct fa *fa) { |
4492 | 0 | fprintf(out, "digraph {\n rankdir=LR;"); |
4493 | 0 | list_for_each(s, fa->initial) { |
4494 | 0 | if (s->accept) { |
4495 | 0 | fprintf(out, "\"%p\" [shape=doublecircle];\n", s); |
4496 | 0 | } else { |
4497 | 0 | fprintf(out, "\"%p\" [shape=circle];\n", s); |
4498 | 0 | } |
4499 | 0 | } |
4500 | 0 | fprintf(out, "%s -> \"%p\";\n", fa->deterministic ? "dfa" : "nfa", |
4501 | 0 | fa->initial); |
4502 | |
|
4503 | 0 | struct re_str str; |
4504 | 0 | MEMZERO(&str, 1); |
4505 | 0 | list_for_each(s, fa->initial) { |
4506 | 0 | for_each_trans(t, s) { |
4507 | 0 | fprintf(out, "\"%p\" -> \"%p\" [ label = \"", s, t->to); |
4508 | 0 | if (fa->trans_re) { |
4509 | 0 | re_as_string(t->re, &str); |
4510 | 0 | for (int i=0; i < str.len; i++) { |
4511 | 0 | print_char(out, str.rx[i]); |
4512 | 0 | } |
4513 | 0 | release_re_str(&str); |
4514 | 0 | } else { |
4515 | 0 | print_char(out, t->min); |
4516 | 0 | if (t->min != t->max) { |
4517 | 0 | fputc('-', out); |
4518 | 0 | print_char(out, t->max); |
4519 | 0 | } |
4520 | 0 | } |
4521 | 0 | fprintf(out, "\" ];\n"); |
4522 | 0 | } |
4523 | 0 | } |
4524 | 0 | fprintf(out, "}\n"); |
4525 | 0 | } |
4526 | | |
4527 | 0 | int fa_json(FILE *out, struct fa *fa) { |
4528 | 0 | hash_val_t *list_hashes = NULL; |
4529 | 0 | int list_size = 100; |
4530 | 0 | int num_states = 0; |
4531 | 0 | int it; |
4532 | 0 | char first = true; |
4533 | 0 | int result = -1; |
4534 | |
|
4535 | 0 | fprintf(out,"{\n\t\"final\": ["); |
4536 | |
|
4537 | 0 | F(ALLOC_N(list_hashes, list_size)); |
4538 | | |
4539 | 0 | list_for_each(s, fa->initial) { |
4540 | 0 | if (num_states == list_size - 1){ |
4541 | 0 | list_size += list_size; |
4542 | 0 | F(REALLOC_N(list_hashes, list_size)); |
4543 | 0 | } |
4544 | | // Store hash value |
4545 | 0 | list_hashes[num_states] = s->hash; |
4546 | | // We use the hashes to map states to Z_{num_states} |
4547 | 0 | s->hash = num_states++; |
4548 | 0 | if (s->accept) { |
4549 | 0 | if (first) { |
4550 | 0 | fprintf(out,"%ld", s->hash); |
4551 | 0 | first = false; |
4552 | 0 | } else { |
4553 | 0 | fprintf(out, ", %ld", s->hash); |
4554 | 0 | } |
4555 | 0 | } |
4556 | 0 | } |
4557 | | |
4558 | 0 | fprintf(out, "],\n\t\"deterministic\": %d,\n\t\"transitions\": [\n", |
4559 | 0 | fa->deterministic ? 1 : 0); |
4560 | |
|
4561 | 0 | first = true; |
4562 | 0 | list_for_each(s, fa->initial) { |
4563 | 0 | for_each_trans(t, s) { |
4564 | 0 | if (!first) |
4565 | 0 | fprintf(out, ",\n"); |
4566 | 0 | first = false; |
4567 | 0 | fprintf(out, "\t\t{ \"from\": %ld, \"to\": %ld, \"on\": \"", |
4568 | 0 | s->hash, t->to->hash); |
4569 | 0 | print_char(out, t->min); |
4570 | 0 | if (t->min != t->max) { |
4571 | 0 | fputc('-', out); |
4572 | 0 | print_char(out, t->max); |
4573 | 0 | } |
4574 | 0 | fprintf(out, "\" }"); |
4575 | 0 | } |
4576 | 0 | } |
4577 | |
|
4578 | 0 | fprintf(out,"\n\t]\n}"); |
4579 | 0 | result = 0; |
4580 | |
|
4581 | 0 | error: |
4582 | | // Restoring hash values to leave the FA structure untouched. That is |
4583 | | // only needed if we actually copied hashes, indicated by num_states |
4584 | | // being non-zero |
4585 | 0 | if (num_states > 0) { |
4586 | 0 | it = 0; |
4587 | 0 | list_for_each(s, fa->initial) { |
4588 | 0 | s->hash = list_hashes[it++]; |
4589 | 0 | } |
4590 | 0 | } |
4591 | 0 | free(list_hashes); |
4592 | 0 | return result; |
4593 | 0 | } |
4594 | | |
4595 | 0 | bool fa_is_deterministic(struct fa *fa) { |
4596 | 0 | return fa->deterministic; |
4597 | 0 | } |
4598 | | |
4599 | 0 | struct state *fa_state_initial(struct fa *fa) { |
4600 | 0 | return fa->initial; |
4601 | 0 | } |
4602 | | |
4603 | 0 | bool fa_state_is_accepting(struct state *st) { |
4604 | 0 | return st->accept; |
4605 | 0 | } |
4606 | | |
4607 | 0 | struct state* fa_state_next(struct state *st) { |
4608 | 0 | return st->next; |
4609 | 0 | } |
4610 | | |
4611 | 0 | size_t fa_state_num_trans(struct state *st) { |
4612 | 0 | return st->tused; |
4613 | 0 | } |
4614 | | |
4615 | | int fa_state_trans(struct state *st, size_t i, |
4616 | 0 | struct state **to, unsigned char *min, unsigned char *max) { |
4617 | 0 | if (st->tused <= i) |
4618 | 0 | return -1; |
4619 | | |
4620 | 0 | (*to) = st->trans[i].to; |
4621 | 0 | (*min) = st->trans[i].min; |
4622 | 0 | (*max) = st->trans[i].max; |
4623 | 0 | return 0; |
4624 | 0 | } |
4625 | | |
4626 | | /* |
4627 | | * Local variables: |
4628 | | * indent-tabs-mode: nil |
4629 | | * c-indent-level: 4 |
4630 | | * c-basic-offset: 4 |
4631 | | * tab-width: 4 |
4632 | | * End: |
4633 | | */ |