Coverage Report

Created: 2023-09-23 07:09

/src/augeas/src/fa.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * fa.c: finite automata
3
 *
4
 * Copyright (C) 2007-2016 David Lutterkort
5
 *
6
 * This library is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * This library is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this library; if not, write to the Free Software
18
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
19
 *
20
 * Author: David Lutterkort <dlutter@redhat.com>
21
 */
22
23
/*
24
 * This implementation follows closely the Java dk.brics.automaton package
25
 * by Anders Moeller. The project's website is
26
 * http://www.brics.dk/automaton/.
27
 *
28
 * It is by no means a complete reimplementation of that package; only a
29
 * subset of what Automaton provides is implemented here.
30
 */
31
32
#include <config.h>
33
#include <limits.h>
34
#include <ctype.h>
35
#include <stdbool.h>
36
37
#include "internal.h"
38
#include "memory.h"
39
#include "ref.h"
40
#include "hash.h"
41
#include "fa.h"
42
43
0
#define UCHAR_NUM (UCHAR_MAX+1)
44
0
#define UCHAR_MIN 0
45
typedef unsigned char uchar;
46
47
0
#define E(cond) if (cond) goto error
48
0
#define F(expr) if ((expr) < 0) goto error
49
50
/* Which algorithm to use in FA_MINIMIZE */
51
int fa_minimization_algorithm = FA_MIN_HOPCROFT;
52
53
/* A finite automaton. INITIAL is both the initial state and the head of
54
 * the list of all states. Any state that is allocated for this automaton
55
 * is put on this list. Dead/unreachable states are cleared from the list
56
 * at opportune times (e.g., during minimization) It's poor man's garbage
57
 * collection
58
 *
59
 * Normally, transitions are on a character range [min..max]; in
60
 * fa_as_regexp, we store regexps on transitions in the re field of each
61
 * transition. TRANS_RE indicates that we do that, and is used by fa_dot to
62
 * produce proper graphs of an automaton transitioning on regexps.
63
 *
64
 * For case-insensitive regexps (nocase == 1), the FA never has transitions
65
 * on uppercase letters [A-Z], effectively removing these letters from the
66
 * alphabet.
67
 */
68
struct fa {
69
    struct state *initial;
70
    int           deterministic : 1;
71
    int           minimal : 1;
72
    unsigned int  nocase : 1;
73
    int           trans_re : 1;
74
};
75
76
/* A state in a finite automaton. Transitions are never shared between
77
   states so that we can free the list when we need to free the state */
78
struct state {
79
    struct state *next;
80
    hash_val_t    hash;
81
    unsigned int  accept : 1;
82
    unsigned int  live : 1;
83
    unsigned int  reachable : 1;
84
    unsigned int  visited : 1;   /* Used in various places to track progress */
85
    /* Array of transitions. The TUSED first entries are used, the array
86
       has allocated room for TSIZE */
87
    size_t        tused;
88
    size_t        tsize;
89
    struct trans *trans;
90
};
91
92
/* A transition. If the input has a character in the inclusive
93
 * range [MIN, MAX], move to TO
94
 */
95
struct trans {
96
    struct state *to;
97
    union {
98
        struct {
99
            uchar         min;
100
            uchar         max;
101
        };
102
        struct re *re;
103
    };
104
};
105
106
/*
107
 * Bitsets
108
 */
109
0
#define UINT_BIT (sizeof(unsigned int) * CHAR_BIT)
110
111
typedef unsigned int bitset;
112
113
0
static bitset *bitset_init(size_t nbits) {
114
0
    bitset *bs;
115
0
    if (ALLOC_N(bs, (nbits + UINT_BIT) / UINT_BIT) == -1)
116
0
        return NULL;
117
0
    return bs;
118
0
}
119
120
0
static inline void bitset_clr(bitset *bs, unsigned int bit) {
121
0
    bs[bit/UINT_BIT] &= ~(1 << (bit % UINT_BIT));
122
0
}
123
124
0
static inline void bitset_set(bitset *bs, unsigned int bit) {
125
0
    bs[bit/UINT_BIT] |= 1 << (bit % UINT_BIT);
126
0
}
127
128
ATTRIBUTE_PURE
129
0
static inline int bitset_get(const bitset * const bs, unsigned int bit) {
130
0
    return (bs[bit/UINT_BIT] >> bit % UINT_BIT) & 1;
131
0
}
132
133
ATTRIBUTE_PURE
134
static inline bool bitset_disjoint(const bitset *const bs1,
135
                                   const bitset *const bs2,
136
0
                                   size_t nbits) {
137
0
    for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++) {
138
0
        if (bs1[i] & bs2[i])
139
0
            return false;
140
0
    }
141
0
    return true;
142
0
}
143
144
0
static void bitset_free(bitset *bs) {
145
0
    free(bs);
146
0
}
147
148
0
static void bitset_negate(bitset *bs, size_t nbits) {
149
0
    for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++)
150
0
        bs[i] = ~ bs[i];
151
0
}
152
153
/*
154
 * Representation of a parsed regular expression. The regular expression is
155
 * parsed according to the following grammar by PARSE_REGEXP:
156
 *
157
 * regexp:        concat_exp ('|' regexp)?
158
 * concat_exp:    repeated_exp concat_exp?
159
 * repeated_exp:  simple_exp
160
 *              | simple_exp '*'
161
 *              | simple_exp '+'
162
 *              | simple_exp '?'
163
 *              | simple_exp '{' INT (',' INT)? '}'
164
 * simple_exp:  char_class
165
 *            | '.'
166
 *            |  '(' regexp ')'
167
 *            |  CHAR
168
 * char_class: '[' char_exp+ ']'
169
 *           | '[' '^' char_exp+ ']'
170
 * char_exp:   CHAR '-' CHAR
171
 *           | CHAR
172
 */
173
174
enum re_type {
175
    UNION,
176
    CONCAT,
177
    CSET,
178
    CHAR,
179
    ITER,
180
    EPSILON
181
};
182
183
0
#define re_unref(r) unref(r, re)
184
185
struct re {
186
    ref_t        ref;
187
    enum re_type type;
188
    union {
189
        struct {                  /* UNION, CONCAT */
190
            struct re *exp1;
191
            struct re *exp2;
192
        };
193
        struct {                  /* CSET */
194
            bool    negate;
195
            bitset *cset;
196
            /* Whether we can use character ranges when converting back
197
             * to a string */
198
            unsigned int no_ranges:1;
199
        };
200
        struct {                  /* CHAR */
201
            uchar c;
202
        };
203
        struct {                  /* ITER */
204
            struct re *exp;
205
            int min;
206
            int max;
207
        };
208
    };
209
};
210
211
/* Used to keep state of the regex parse; RX may contain NUL's */
212
struct re_parse {
213
    const char *rx;          /* Current position in regex */
214
    const char *rend;        /* Last char of rx+ 1 */
215
    int         error;       /* error code */
216
    /* Whether new CSET's should have the no_ranges flag set */
217
    unsigned int no_ranges:1;
218
};
219
220
/* String with explicit length, used when converting re to string */
221
struct re_str {
222
    char  *rx;
223
    size_t len;
224
};
225
226
static struct re *parse_regexp(struct re_parse *parse);
227
228
/* A map from a set of states to a state. */
229
typedef hash_t state_set_hash;
230
231
static hash_val_t ptr_hash(const void *p);
232
233
static const int array_initial_size = 4;
234
static const int array_max_expansion   = 128;
235
236
enum state_set_init_flags {
237
    S_NONE   = 0,
238
    S_SORTED = (1 << 0),
239
    S_DATA   = (1 << 1)
240
};
241
242
struct state_set {
243
    size_t            size;
244
    size_t            used;
245
    unsigned int      sorted : 1;
246
    unsigned int      with_data : 1;
247
    struct state    **states;
248
    void            **data;
249
};
250
251
struct state_set_list {
252
    struct state_set_list *next;
253
    struct state_set      *set;
254
};
255
256
/* Clean up FA by removing dead transitions and states and reducing
257
 * transitions. Unreachable states are freed. The return value is the same
258
 * as FA; returning it is merely a convenience.
259
 *
260
 * Only automata in this state should be returned to the user
261
 */
262
ATTRIBUTE_RETURN_CHECK
263
static int collect(struct fa *fa);
264
265
ATTRIBUTE_RETURN_CHECK
266
static int totalize(struct fa *fa);
267
268
/* Print an FA into a (fairly) fixed file if the environment variable
269
 * FA_DOT_DIR is set. This code is only used for debugging
270
 */
271
#define FA_DOT_DIR "FA_DOT_DIR"
272
273
ATTRIBUTE_UNUSED
274
0
static void fa_dot_debug(struct fa *fa, const char *tag) {
275
0
    const char *dot_dir;
276
0
    static int count = 0;
277
0
    int r;
278
0
    char *fname;
279
0
    FILE *fp;
280
0
281
0
    if ((dot_dir = getenv(FA_DOT_DIR)) == NULL)
282
0
        return;
283
0
284
0
    r = asprintf(&fname, "%s/fa_%02d_%s.dot", dot_dir, count++, tag);
285
0
    if (r == -1)
286
0
        return;
287
0
288
0
    fp = fopen(fname, "w");
289
0
    if (fp == NULL) {
290
0
        free(fname);
291
0
        return;
292
0
    }
293
0
294
0
    fa_dot(fp, fa);
295
0
    fclose(fp);
296
0
    free(fname);
297
0
}
298
299
0
static void print_char_set(struct re *set) {
300
0
    int from, to;
301
0
302
0
    if (set->negate)
303
0
        printf("[^");
304
0
    else
305
0
        printf("[");
306
0
    for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
307
0
        while (bitset_get(set->cset, from) == set->negate)
308
0
            from += 1;
309
0
        if (from > UCHAR_MAX)
310
0
            break;
311
0
        for (to = from;
312
0
             to < UCHAR_MAX && (bitset_get(set->cset, to+1) == !set->negate);
313
0
             to++);
314
0
        if (to == from) {
315
0
            printf("%c", from);
316
0
        } else {
317
0
            printf("%c-%c", from, to);
318
0
        }
319
0
    }
320
0
    printf("]");
321
0
}
322
323
ATTRIBUTE_UNUSED
324
0
static void print_re(struct re *re) {
325
0
    switch(re->type) {
326
0
    case UNION:
327
0
        print_re(re->exp1);
328
0
        printf("|");
329
0
        print_re(re->exp2);
330
0
        break;
331
0
    case CONCAT:
332
0
        print_re(re->exp1);
333
0
        printf(".");
334
0
        print_re(re->exp2);
335
0
        break;
336
0
    case CSET:
337
0
        print_char_set(re);
338
0
        break;
339
0
    case CHAR:
340
0
        printf("%c", re->c);
341
0
        break;
342
0
    case ITER:
343
0
        printf("(");
344
0
        print_re(re->exp);
345
0
        printf("){%d,%d}", re->min, re->max);
346
0
        break;
347
0
    case EPSILON:
348
0
        printf("<>");
349
0
        break;
350
0
    default:
351
0
        printf("(**)");
352
0
        break;
353
0
    }
354
0
}
355
356
/*
357
 * struct re_str
358
 */
359
0
static void release_re_str(struct re_str *str) {
360
0
    if (str == NULL)
361
0
        return;
362
0
    FREE(str->rx);
363
0
    str->len = 0;
364
0
}
365
366
0
static void free_re_str(struct re_str *str) {
367
0
    if (str == NULL)
368
0
        return;
369
0
    release_re_str(str);
370
0
    FREE(str);
371
0
}
372
373
0
static struct re_str *make_re_str(const char *s) {
374
0
    struct re_str *str;
375
376
0
    if (ALLOC(str) < 0)
377
0
        return NULL;
378
0
    if (s != NULL) {
379
0
        str->rx = strdup(s);
380
0
        str->len = strlen(s);
381
0
        if (str->rx == NULL) {
382
0
            FREE(str);
383
0
            return NULL;
384
0
        }
385
0
    }
386
0
    return str;
387
0
}
388
389
0
static int re_str_alloc(struct re_str *str) {
390
0
    return ALLOC_N(str->rx, str->len + 1);
391
0
}
392
393
/*
394
 * Memory management
395
 */
396
397
0
static void free_trans(struct state *s) {
398
0
    free(s->trans);
399
0
    s->trans = NULL;
400
0
    s->tused = s->tsize = 0;
401
0
}
402
403
0
static void gut(struct fa *fa) {
404
0
    list_for_each(s, fa->initial) {
405
0
        free_trans(s);
406
0
    }
407
0
    list_free(fa->initial);
408
0
    fa->initial = NULL;
409
0
}
410
411
0
void fa_free(struct fa *fa) {
412
0
    if (fa == NULL)
413
0
        return;
414
0
    gut(fa);
415
0
    free(fa);
416
0
}
417
418
0
static struct state *make_state(void) {
419
0
    struct state *s;
420
0
    if (ALLOC(s) == -1)
421
0
        return NULL;
422
0
    s->hash = ptr_hash(s);
423
0
    return s;
424
0
}
425
426
0
static struct state *add_state(struct fa *fa, int accept) {
427
0
    struct state *s = make_state();
428
0
    if (s) {
429
0
        s->accept = accept;
430
0
        if (fa->initial == NULL) {
431
0
            fa->initial = s;
432
0
        } else {
433
0
            list_cons(fa->initial->next, s);
434
0
        }
435
0
    }
436
0
    return s;
437
0
}
438
439
0
#define last_trans(s)  ((s)->trans + (s)->tused - 1)
440
441
#define for_each_trans(t, s)                                            \
442
0
    for (struct trans *t = (s)->trans;                                  \
443
0
         (t - (s)->trans) < (s)->tused;                                 \
444
0
         t++)
445
446
ATTRIBUTE_RETURN_CHECK
447
static int add_new_trans(struct state *from, struct state *to,
448
0
                         uchar min, uchar max) {
449
0
    assert(to != NULL);
450
451
0
    if (from->tused == from->tsize) {
452
0
        size_t tsize = from->tsize;
453
0
        if (tsize == 0)
454
0
            tsize = array_initial_size;
455
0
        else if (from->tsize > array_max_expansion)
456
0
            tsize += array_max_expansion;
457
0
        else
458
0
            tsize *= 2;
459
0
        if (REALLOC_N(from->trans, tsize) == -1)
460
0
            return -1;
461
0
        from->tsize = tsize;
462
0
    }
463
0
    from->trans[from->tused].to  = to;
464
0
    from->trans[from->tused].min = min;
465
0
    from->trans[from->tused].max = max;
466
0
    from->tused += 1;
467
0
    return 0;
468
0
}
469
470
ATTRIBUTE_RETURN_CHECK
471
static int add_epsilon_trans(struct state *from,
472
0
                             struct state *to) {
473
0
    int r;
474
0
    from->accept |= to->accept;
475
0
    for_each_trans(t, to) {
476
0
        r = add_new_trans(from, t->to, t->min, t->max);
477
0
        if (r < 0)
478
0
            return -1;
479
0
    }
480
0
    return 0;
481
0
}
482
483
0
static void set_initial(struct fa *fa, struct state *s) {
484
0
    list_remove(s, fa->initial);
485
0
    list_cons(fa->initial, s);
486
0
}
487
488
/* Merge automaton FA2 into FA1. This simply adds FA2's states to FA1
489
   and then frees FA2. It has no influence on the language accepted by FA1
490
*/
491
0
static void fa_merge(struct fa *fa1, struct fa **fa2) {
492
0
    list_append(fa1->initial, (*fa2)->initial);
493
0
    free(*fa2);
494
0
    *fa2 = NULL;
495
0
}
496
497
/*
498
 * Operations on STATE_SET
499
 */
500
0
static void state_set_free(struct state_set *set) {
501
0
    if (set == NULL)
502
0
        return;
503
0
    free(set->states);
504
0
    free(set->data);
505
0
    free(set);
506
0
}
507
508
0
static int state_set_init_data(struct state_set *set) {
509
0
    set->with_data = 1;
510
0
    if (set->data == NULL)
511
0
        return ALLOC_N(set->data, set->size);
512
0
    else
513
0
        return 0;
514
0
}
515
516
/* Create a new STATE_SET with an initial size of SIZE. If SIZE is -1, use
517
   the default size ARRAY_INITIAL_SIZE. FLAGS is a bitmask indicating
518
   some options:
519
   - S_SORTED: keep the states in the set sorted by their address, and use
520
     binary search for lookups. If it is not set, entries are kept in the
521
     order in which they are added and lookups scan linearly through the
522
     set of states.
523
   - S_DATA: allocate the DATA array in the set, and keep its size in sync
524
     with the size of the STATES array.
525
*/
526
0
static struct state_set *state_set_init(int size, int flags) {
527
0
    struct state_set *set = NULL;
528
529
0
    F(ALLOC(set));
530
531
0
    set->sorted = (flags & S_SORTED) ? 1 : 0;
532
0
    set->with_data = (flags & S_DATA) ? 1 : 0;
533
0
    if (size > 0) {
534
0
        set->size = size;
535
0
        F(ALLOC_N(set->states, set->size));
536
0
        if (set->with_data)
537
0
            F(state_set_init_data(set));
538
0
    }
539
0
    return set;
540
0
 error:
541
0
    state_set_free(set);
542
0
    return NULL;
543
0
}
544
545
ATTRIBUTE_RETURN_CHECK
546
0
static int state_set_expand(struct state_set *set) {
547
0
    if (set->size == 0)
548
0
        set->size = array_initial_size;
549
0
    else if (set->size > array_max_expansion)
550
0
        set->size += array_max_expansion;
551
0
    else
552
0
        set->size *= 2;
553
0
    if (REALLOC_N(set->states, set->size) < 0)
554
0
        goto error;
555
0
    if (set->with_data)
556
0
        if (REALLOC_N(set->data, set->size) < 0)
557
0
            goto error;
558
0
    return 0;
559
0
 error:
560
    /* We do this to provoke a SEGV as early as possible */
561
0
    FREE(set->states);
562
0
    FREE(set->data);
563
0
    return -1;
564
0
}
565
566
/* Return the index where S belongs in SET->STATES to keep it sorted.  S
567
   may not be in SET->STATES. The returned index is in the interval [0
568
   .. SET->USED], with the latter indicating that S is larger than all
569
   values in SET->STATES
570
*/
571
0
static int state_set_pos(const struct state_set *set, const struct state *s) {
572
0
    int l = 0, h = set->used;
573
0
    while (l < h) {
574
0
        int m = (l + h)/2;
575
0
        if (set->states[m] > s)
576
0
            h = m;
577
0
        else if (set->states[m] < s)
578
0
            l = m + 1;
579
0
        else
580
0
            return m;
581
0
    }
582
0
    return l;
583
0
}
584
585
ATTRIBUTE_RETURN_CHECK
586
0
static int state_set_push(struct state_set *set, struct state *s) {
587
0
    if (set->size == set->used)
588
0
        if (state_set_expand(set) < 0)
589
0
            return -1;
590
0
    if (set->sorted) {
591
0
        int p = state_set_pos(set, s);
592
0
        if (set->size == set->used)
593
0
            if (state_set_expand(set) < 0)
594
0
                return -1;
595
0
        while (p < set->used && set->states[p] <= s)
596
0
            p += 1;
597
0
        if (p < set->used) {
598
0
            memmove(set->states + p + 1, set->states + p,
599
0
                    sizeof(*set->states) * (set->used - p));
600
0
            if (set->data != NULL)
601
0
                memmove(set->data + p + 1, set->data + p,
602
0
                        sizeof(*set->data) * (set->used - p));
603
0
        }
604
0
        set->states[p] = s;
605
0
        set->used += 1;
606
0
        return p;
607
0
    } else {
608
0
        set->states[set->used++] = s;
609
0
        return set->used - 1;
610
0
    }
611
0
}
612
613
ATTRIBUTE_RETURN_CHECK
614
static int state_set_push_data(struct state_set *set, struct state *s,
615
0
                               void *d) {
616
0
    int i = state_set_push(set, s);
617
0
    if (i == -1)
618
0
        return -1;
619
0
    set->data[i] = d;
620
0
    return i;
621
0
}
622
623
static int state_set_index(const struct state_set *set,
624
0
                           const struct state *s) {
625
0
    if (set->sorted) {
626
0
        int p = state_set_pos(set, s);
627
0
        return (p < set->used && set->states[p] == s) ? p : -1;
628
0
    } else {
629
0
        for (int i=0; i < set->used; i++) {
630
0
            if (set->states[i] == s)
631
0
                return i;
632
0
        }
633
0
    }
634
0
    return -1;
635
0
}
636
637
static void state_set_remove(struct state_set *set,
638
0
                             const struct state *s) {
639
0
   if (set->sorted) {
640
0
       int p = state_set_index(set, s);
641
0
       if (p == -1) return;
642
0
       memmove(set->states + p, set->states + p + 1,
643
0
               sizeof(*set->states) * (set->used - p - 1));
644
0
       if (set->data != NULL)
645
0
           memmove(set->data + p, set->data + p + 1,
646
0
                   sizeof(*set->data) * (set->used - p - 1));
647
0
   } else {
648
0
       int p = state_set_index(set, s);
649
0
       if (p >= 0) {
650
0
           set->states[p] = set->states[--set->used];
651
0
       }
652
0
   }
653
0
}
654
655
/* Only add S if it's not in SET yet. Return 1 if S was added, 0 if it was
656
   already in the set and -1 on error. */
657
ATTRIBUTE_RETURN_CHECK
658
0
static int state_set_add(struct state_set *set, struct state *s) {
659
0
    if (set->sorted) {
660
0
        int p = state_set_pos(set, s);
661
0
        if (p < set->used && set->states[p] == s)
662
0
            return 0;
663
0
        if (set->size == set->used)
664
0
            if (state_set_expand(set) < 0)
665
0
                return -1;
666
0
        while (p < set->used && set->states[p] <= s)
667
0
            p += 1;
668
0
        if (p < set->used) {
669
0
            memmove(set->states + p + 1, set->states + p,
670
0
                    sizeof(*set->states) * (set->used - p));
671
0
            if (set->data != NULL)
672
0
                memmove(set->data + p + 1, set->data + p,
673
0
                        sizeof(*set->data) * (set->used - p));
674
0
        }
675
0
        set->states[p] = s;
676
0
        set->used += 1;
677
0
    } else {
678
0
        if (state_set_index(set, s) >= 0)
679
0
            return 0;
680
0
        if (state_set_push(set, s) < 0)
681
0
            goto error;
682
0
    }
683
0
    return 1;
684
0
 error:
685
    /* We do this to provoke a SEGV as early as possible */
686
0
    FREE(set->states);
687
0
    FREE(set->data);
688
0
    return -1;
689
0
}
690
691
0
static struct state *state_set_pop(struct state_set *set) {
692
0
    struct state *s = NULL;
693
0
    if (set->used > 0)
694
0
        s = set->states[--set->used];
695
0
    return s;
696
0
}
697
698
0
static struct state *state_set_pop_data(struct state_set *set, void **d) {
699
0
    struct state *s = NULL;
700
0
    s = state_set_pop(set);
701
0
    *d = set->data[set->used];
702
0
    return s;
703
0
}
704
705
0
static void *state_set_find_data(struct state_set *set, struct state *s) {
706
0
    int i = state_set_index(set, s);
707
0
    if (i >= 0)
708
0
        return set->data[i];
709
0
    else
710
0
        return NULL;
711
0
}
712
713
static int state_set_equal(const struct state_set *set1,
714
0
                           const struct state_set *set2) {
715
0
    if (set1->used != set2->used)
716
0
        return 0;
717
0
    if (set1->sorted && set2->sorted) {
718
0
        for (int i = 0; i < set1->used; i++)
719
0
            if (set1->states[i] != set2->states[i])
720
0
                return 0;
721
0
        return 1;
722
0
    } else {
723
0
        for (int i=0; i < set1->used; i++)
724
0
            if (state_set_index(set2, set1->states[i]) == -1)
725
0
                return 0;
726
0
        return 1;
727
0
    }
728
0
}
729
730
#if 0
731
static void state_set_compact(struct state_set *set) {
732
    while (set->used > 0 && set->states[set->used] == NULL)
733
        set->used -= 1;
734
    for (int i=0; i < set->used; i++) {
735
        if (set->states[i] == NULL) {
736
            set->used -= 1;
737
            set->states[i] = set->states[set->used];
738
            if (set->data)
739
                set->data[i] = set->data[set->used];
740
        }
741
        while (set->used > 0 && set->states[set->used] == NULL)
742
            set->used -= 1;
743
    }
744
}
745
#endif
746
747
/* Add an entry (FST, SND) to SET. FST is stored in SET->STATES, and SND is
748
   stored in SET->DATA at the same index.
749
*/
750
ATTRIBUTE_RETURN_CHECK
751
static int state_pair_push(struct state_set **set,
752
0
                           struct state *fst, struct state *snd) {
753
0
    if (*set == NULL)
754
0
        *set = state_set_init(-1, S_DATA);
755
0
    if (*set == NULL)
756
0
        return -1;
757
0
    int i = state_set_push(*set, fst);
758
0
    if (i == -1)
759
0
        return -1;
760
0
    (*set)->data[i] = snd;
761
762
0
    return 0;
763
0
}
764
765
/* Return the index of the pair (FST, SND) in SET, or -1 if SET contains no
766
   such pair.
767
 */
768
static int state_pair_find(struct state_set *set, struct state *fst,
769
0
                           struct state *snd) {
770
0
    for (int i=0; i < set->used; i++)
771
0
        if (set->states[i] == fst && set->data[i] == snd)
772
0
            return i;
773
0
    return -1;
774
0
}
775
776
/* Jenkins' hash for void* */
777
0
static hash_val_t ptr_hash(const void *p) {
778
0
    hash_val_t hash = 0;
779
0
    char *c = (char *) &p;
780
0
    for (int i=0; i < sizeof(p); i++) {
781
0
        hash += c[i];
782
0
        hash += (hash << 10);
783
0
        hash ^= (hash >> 6);
784
0
    }
785
0
    hash += (hash << 3);
786
0
    hash ^= (hash >> 11);
787
0
    hash += (hash << 15);
788
0
    return hash;
789
0
}
790
791
typedef hash_t state_triple_hash;
792
793
0
static hash_val_t pair_hash(const void *key) {
794
0
    register struct state *const *pair = key;
795
0
    return pair[0]->hash + pair[1]->hash;
796
0
}
797
798
0
static int pair_cmp(const void *key1, const void *key2) {
799
0
    return memcmp(key1, key2, 2*sizeof(struct state *));
800
0
}
801
802
0
static void state_triple_node_free(hnode_t *node, ATTRIBUTE_UNUSED void *ctx) {
803
0
    free((void *) hnode_getkey(node));
804
0
    free(node);
805
0
}
806
807
0
static state_triple_hash *state_triple_init(void) {
808
0
    state_triple_hash *hash;
809
810
0
    hash = hash_create(HASHCOUNT_T_MAX, pair_cmp, pair_hash);
811
0
    if (hash == NULL)
812
0
        return NULL;
813
0
    hash_set_allocator(hash, NULL, state_triple_node_free, NULL);
814
0
    return hash;
815
0
}
816
817
ATTRIBUTE_RETURN_CHECK
818
static int state_triple_push(state_triple_hash *hash,
819
                             struct state *s1,
820
                             struct state *s2,
821
0
                             struct state *s3) {
822
0
    struct state **pair;
823
0
    if (ALLOC_N(pair, 2) < 0)
824
0
        return -1;
825
0
    pair[0] = s1;
826
0
    pair[1] = s2;
827
0
    return hash_alloc_insert(hash, pair, s3);
828
0
}
829
830
static struct state * state_triple_thd(state_triple_hash *hash,
831
                                       struct state *s1,
832
0
                                       struct state *s2) {
833
0
    struct state *pair[2];
834
0
    hnode_t *node;
835
0
    pair[0] = s1;
836
0
    pair[1] = s2;
837
0
    node = hash_lookup(hash, pair);
838
0
    return (node == NULL) ? NULL : (struct state *) hnode_get(node);
839
0
}
840
841
0
static void state_triple_free(state_triple_hash *hash) {
842
0
    if (hash != NULL) {
843
0
        hash_free_nodes(hash);
844
0
        hash_destroy(hash);
845
0
    }
846
0
}
847
848
/*
849
 * State operations
850
 */
851
ATTRIBUTE_RETURN_CHECK
852
0
static int mark_reachable(struct fa *fa) {
853
0
    struct state_set *worklist = state_set_init(-1, S_NONE);
854
0
    int result = -1;
855
856
0
    E(worklist == NULL);
857
858
0
    list_for_each(s, fa->initial) {
859
0
        s->reachable = 0;
860
0
    }
861
0
    fa->initial->reachable = 1;
862
863
0
    for (struct state *s = fa->initial;
864
0
         s != NULL;
865
0
         s = state_set_pop(worklist)) {
866
0
        for_each_trans(t, s) {
867
0
            if (! t->to->reachable) {
868
0
                t->to->reachable = 1;
869
0
                F(state_set_push(worklist, t->to));
870
0
            }
871
0
        }
872
0
    }
873
0
    result = 0;
874
875
0
 error:
876
0
    state_set_free(worklist);
877
0
    return result;
878
0
}
879
880
/* Return all reachable states. As a sideeffect, all states have their
881
   REACHABLE flag set appropriately.
882
 */
883
0
static struct state_set *fa_states(struct fa *fa) {
884
0
    struct state_set *visited = state_set_init(-1, S_NONE);
885
0
    int r;
886
887
0
    r = mark_reachable(fa);
888
0
    E(visited == NULL || r < 0);
889
890
0
    list_for_each(s, fa->initial) {
891
0
        if (s->reachable)
892
0
            F(state_set_push(visited, s));
893
0
    }
894
0
    return visited;
895
0
 error:
896
0
    state_set_free(visited);
897
0
    return NULL;
898
0
}
899
900
/* Return all reachable accepting states. As a sideeffect, all states have
901
   their REACHABLE flag set appropriately.
902
 */
903
0
static struct state_set *fa_accept_states(struct fa *fa) {
904
0
    struct state_set *accept = state_set_init(-1, S_NONE);
905
0
    int r;
906
907
0
    E(accept == NULL);
908
909
0
    r = mark_reachable(fa);
910
0
    E(r < 0);
911
912
0
    list_for_each(s, fa->initial) {
913
0
        if (s->reachable && s->accept)
914
0
            F(state_set_push(accept, s));
915
0
    }
916
0
    return accept;
917
0
 error:
918
0
    state_set_free(accept);
919
0
    return NULL;
920
0
}
921
922
/* Mark all live states, i.e. states from which an accepting state can be
923
   reached. All states have their REACHABLE and LIVE flags set
924
   appropriately.
925
 */
926
ATTRIBUTE_RETURN_CHECK
927
0
static int mark_live(struct fa *fa) {
928
0
    int changed;
929
930
0
    F(mark_reachable(fa));
931
932
0
    list_for_each(s, fa->initial) {
933
0
        s->live = s->reachable && s->accept;
934
0
    }
935
936
0
    do {
937
0
        changed = 0;
938
0
        list_for_each(s, fa->initial) {
939
0
            if (! s->live && s->reachable) {
940
0
                for_each_trans(t, s) {
941
0
                    if (t->to->live) {
942
0
                        s->live = 1;
943
0
                        changed = 1;
944
0
                        break;
945
0
                    }
946
0
                }
947
0
            }
948
0
        }
949
0
    } while (changed);
950
0
    return 0;
951
0
 error:
952
0
    return -1;
953
0
}
954
955
/*
956
 * Reverse an automaton in place. Change FA so that it accepts the
957
 * language that is the reverse of the input automaton.
958
 *
959
 * Returns a list of the new initial states of the automaton. The list must
960
 * be freed by the caller.
961
 */
962
0
static struct state_set *fa_reverse(struct fa *fa) {
963
0
    struct state_set *all = NULL;
964
0
    struct state_set *accept = NULL;
965
0
    int r;
966
967
0
    all = fa_states(fa);
968
0
    E(all == NULL);
969
0
    accept = fa_accept_states(fa);
970
0
    E(accept == NULL);
971
972
0
    F(state_set_init_data(all));
973
974
    /* Reverse all transitions */
975
0
    int *tused;
976
0
    F(ALLOC_N(tused, all->used));
977
0
    for (int i=0; i < all->used; i++) {
978
0
        all->data[i] = all->states[i]->trans;
979
0
        tused[i] = all->states[i]->tused;
980
0
        all->states[i]->trans = NULL;
981
0
        all->states[i]->tsize = 0;
982
0
        all->states[i]->tused = 0;
983
0
    }
984
0
    for (int i=0; i < all->used; i++) {
985
0
        struct state *s = all->states[i];
986
0
        struct trans *t = all->data[i];
987
0
        s->accept = 0;
988
0
        for (int j=0; j < tused[i]; j++) {
989
0
            r = add_new_trans(t[j].to, s, t[j].min, t[j].max);
990
0
            if (r < 0)
991
0
                goto error;
992
0
        }
993
0
        free(t);
994
0
    }
995
0
    free(tused);
996
997
    /* Make new initial and final states */
998
0
    struct state *s = add_state(fa, 0);
999
0
    E(s == NULL);
1000
1001
0
    fa->initial->accept = 1;
1002
0
    set_initial(fa, s);
1003
0
    for (int i=0; i < accept->used; i++) {
1004
0
        r = add_epsilon_trans(s, accept->states[i]);
1005
0
        if (r < 0)
1006
0
            goto error;
1007
0
    }
1008
1009
0
    fa->deterministic = 0;
1010
0
    fa->minimal = 0;
1011
0
    state_set_free(all);
1012
0
    return accept;
1013
0
 error:
1014
0
    state_set_free(all);
1015
0
    state_set_free(accept);
1016
0
    return NULL;
1017
0
}
1018
1019
/*
1020
 * Return a sorted array of all interval start points in FA. The returned
1021
 * array is a string (null terminated)
1022
 */
1023
0
static uchar* start_points(struct fa *fa, int *npoints) {
1024
0
    char pointset[UCHAR_NUM];
1025
0
    uchar *points = NULL;
1026
1027
0
    F(mark_reachable(fa));
1028
0
    MEMZERO(pointset, UCHAR_NUM);
1029
0
    list_for_each(s, fa->initial) {
1030
0
        if (! s->reachable)
1031
0
            continue;
1032
0
        pointset[0] = 1;
1033
0
        for_each_trans(t, s) {
1034
0
            pointset[t->min] = 1;
1035
0
            if (t->max < UCHAR_MAX)
1036
0
                pointset[t->max+1] = 1;
1037
0
        }
1038
0
    }
1039
1040
0
    *npoints = 0;
1041
0
    for(int i=0; i < UCHAR_NUM; *npoints += pointset[i], i++);
1042
1043
0
    F(ALLOC_N(points, *npoints+1));
1044
0
    for (int i=0, n=0; i < UCHAR_NUM; i++) {
1045
0
        if (pointset[i])
1046
0
            points[n++] = (uchar) i;
1047
0
    }
1048
1049
0
    return points;
1050
0
 error:
1051
0
    free(points);
1052
0
    return NULL;
1053
0
}
1054
1055
/*
1056
 * Operations on STATE_SET_HASH
1057
 */
1058
static int state_set_hash_contains(state_set_hash *smap,
1059
0
                               struct state_set *set) {
1060
0
    return hash_lookup(smap, set) != NULL;
1061
0
}
1062
1063
/*
1064
 * Find the set in SMAP that has the same states as SET. If the two are
1065
 * different, i.e. they point to different memory locations, free SET and
1066
 * return the set found in SMAP
1067
 */
1068
static struct state_set *state_set_hash_uniq(state_set_hash *smap,
1069
0
                                             struct state_set *set) {
1070
0
    hnode_t *node = hash_lookup(smap, set);
1071
0
    const struct state_set *orig_set = hnode_getkey(node);
1072
0
    if (orig_set != set) {
1073
0
        state_set_free(set);
1074
0
    }
1075
0
    return (struct state_set *) orig_set;
1076
0
}
1077
1078
static struct state *state_set_hash_get_state(state_set_hash *smap,
1079
0
                                             struct state_set *set) {
1080
0
    hnode_t *node = hash_lookup(smap, set);
1081
0
    return (struct state *) hnode_get(node);
1082
0
}
1083
1084
0
static hash_val_t set_hash(const void *key) {
1085
0
    hash_val_t hash = 0;
1086
0
    const struct state_set *set = key;
1087
1088
0
    for (int i = 0; i < set->used; i++) {
1089
0
        hash += set->states[i]->hash;
1090
0
    }
1091
0
    return hash;
1092
0
}
1093
1094
0
static int set_cmp(const void *key1, const void *key2) {
1095
0
    const struct state_set *set1 = key1;
1096
0
    const struct state_set *set2 = key2;
1097
1098
0
    return state_set_equal(set1, set2) ? 0 : 1;
1099
0
}
1100
1101
0
static void set_destroy(hnode_t *node, ATTRIBUTE_UNUSED void *ctx) {
1102
0
    struct state_set *set = (struct state_set *) hnode_getkey(node);
1103
0
    state_set_free(set);
1104
0
    free(node);
1105
0
}
1106
1107
ATTRIBUTE_RETURN_CHECK
1108
static int state_set_hash_add(state_set_hash **smap,
1109
0
                              struct state_set *set, struct fa *fa) {
1110
0
    if (*smap == NULL) {
1111
0
        *smap = hash_create(HASHCOUNT_T_MAX, set_cmp, set_hash);
1112
0
        E(*smap == NULL);
1113
0
        hash_set_allocator(*smap, NULL, set_destroy, NULL);
1114
0
    }
1115
0
    struct state *s = add_state(fa, 0);
1116
0
    E(s == NULL);
1117
0
    F(hash_alloc_insert(*smap, set, s));
1118
0
    return 0;
1119
0
 error:
1120
0
    return -1;
1121
0
}
1122
1123
static void state_set_hash_free(state_set_hash *smap,
1124
0
                                struct state_set *protect) {
1125
0
    if (protect != NULL) {
1126
0
        hnode_t *node = hash_lookup(smap, protect);
1127
0
        hash_delete(smap, node);
1128
0
        hnode_getkey(node) = NULL;
1129
0
        set_destroy(node, NULL);
1130
0
    }
1131
0
    hash_free_nodes(smap);
1132
0
    hash_destroy(smap);
1133
0
}
1134
1135
static int state_set_list_add(struct state_set_list **list,
1136
0
                              struct state_set *set) {
1137
0
    struct state_set_list *elt;
1138
0
    if (ALLOC(elt) < 0)
1139
0
        return -1;
1140
0
    elt->set = set;
1141
0
    list_cons(*list, elt);
1142
0
    return 0;
1143
0
}
1144
1145
0
static struct state_set *state_set_list_pop(struct state_set_list **list) {
1146
0
    struct state_set_list *elt = *list;
1147
0
    struct state_set      *set = elt->set;
1148
1149
0
    *list = elt->next;
1150
0
    free(elt);
1151
0
    return set;
1152
0
}
1153
1154
/* Compare transitions lexicographically by (to, min, reverse max) */
1155
0
static int trans_to_cmp(const void *v1, const void *v2) {
1156
0
    const struct trans *t1 = v1;
1157
0
    const struct trans *t2 = v2;
1158
1159
0
    if (t1->to != t2->to) {
1160
0
        return (t1->to < t2->to) ? -1 : 1;
1161
0
    }
1162
0
    if (t1->min < t2->min)
1163
0
        return -1;
1164
0
    if (t1->min > t2->min)
1165
0
        return 1;
1166
0
    if (t1->max > t2->max)
1167
0
        return -1;
1168
0
    return (t1->max < t2->max) ? 1 : 0;
1169
0
}
1170
1171
/* Compare transitions lexicographically by (min, reverse max, to) */
1172
0
static int trans_intv_cmp(const void *v1, const void *v2) {
1173
0
    const struct trans *t1 = v1;
1174
0
    const struct trans *t2 = v2;
1175
1176
0
    if (t1->min < t2->min)
1177
0
        return -1;
1178
0
    if (t1->min > t2->min)
1179
0
        return 1;
1180
0
    if (t1->max > t2->max)
1181
0
        return -1;
1182
0
    if (t1->max < t2->max)
1183
0
        return 1;
1184
0
    if (t1->to != t2->to) {
1185
0
        return (t1->to < t2->to) ? -1 : 1;
1186
0
    }
1187
0
    return 0;
1188
0
}
1189
1190
/*
1191
 * Reduce an automaton by combining overlapping and adjacent edge intervals
1192
 * with the same destination.
1193
 */
1194
0
static void reduce(struct fa *fa) {
1195
0
    list_for_each(s, fa->initial) {
1196
0
        if (s->tused == 0)
1197
0
            continue;
1198
1199
0
        qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp);
1200
0
        int i=0, j=1;
1201
0
        struct trans *t = s->trans;
1202
0
        while (j < s->tused) {
1203
0
            if (t[i].to == t[j].to && t[j].min <= t[i].max + 1) {
1204
0
                if (t[j].max > t[i].max)
1205
0
                    t[i].max = t[j].max;
1206
0
                j += 1;
1207
0
            } else {
1208
0
                i += 1;
1209
0
                if (i < j)
1210
0
                    memmove(s->trans + i, s->trans + j,
1211
0
                            sizeof(*s->trans) * (s->tused - j));
1212
0
                s->tused -= j - i;
1213
0
                j = i + 1;
1214
0
            }
1215
0
        }
1216
0
        s->tused = i+1;
1217
        /* Shrink if we use less than half the allocated size */
1218
0
        if (s->tsize > array_initial_size && 2*s->tused < s->tsize) {
1219
0
            int r;
1220
0
            r = REALLOC_N(s->trans, s->tsize);
1221
0
            if (r == 0)
1222
0
                s->tsize = s->tused;
1223
0
        }
1224
0
    }
1225
0
}
1226
1227
/*
1228
 * Remove dead transitions from an FA; a transition is dead is it does not
1229
 * lead to a live state. This also removes any states that are not
1230
 * reachable any longer from FA->INITIAL.
1231
 *
1232
 * Returns the same FA as a convenience
1233
 */
1234
1235
0
static void collect_trans(struct fa *fa) {
1236
0
    list_for_each(s, fa->initial) {
1237
0
        if (! s->live) {
1238
0
            free_trans(s);
1239
0
        } else {
1240
0
            int i=0;
1241
0
            while (i < s->tused) {
1242
0
                if (! s->trans[i].to->live) {
1243
0
                    s->tused -= 1;
1244
0
                    memmove(s->trans + i, s->trans + s->tused,
1245
0
                            sizeof(*s->trans));
1246
0
                } else {
1247
0
                    i += 1;
1248
0
                }
1249
0
            }
1250
0
        }
1251
0
    }
1252
0
}
1253
1254
0
static void collect_dead_states(struct fa *fa) {
1255
    /* Remove all dead states and free their storage */
1256
0
    for (struct state *s = fa->initial; s->next != NULL; ) {
1257
0
        if (! s->next->live) {
1258
0
            struct state *del = s->next;
1259
0
            s->next = del->next;
1260
0
            free_trans(del);
1261
0
            free(del);
1262
0
        } else {
1263
0
            s = s->next;
1264
0
        }
1265
0
    }
1266
0
}
1267
1268
0
static int collect(struct fa *fa) {
1269
0
    F(mark_live(fa));
1270
1271
0
    if (! fa->initial->live) {
1272
        /* This automaton accepts nothing, make it the canonical
1273
         * epsilon automaton
1274
         */
1275
0
        list_for_each(s, fa->initial) {
1276
0
            free_trans(s);
1277
0
        }
1278
0
        list_free(fa->initial->next);
1279
0
        fa->deterministic = 1;
1280
0
    } else {
1281
0
        collect_trans(fa);
1282
0
        collect_dead_states(fa);
1283
0
    }
1284
0
    reduce(fa);
1285
0
    return 0;
1286
0
 error:
1287
0
    return -1;
1288
0
}
1289
1290
0
static void swap_initial(struct fa *fa) {
1291
0
    struct state *s = fa->initial;
1292
0
    if (s->next != NULL) {
1293
0
        fa->initial = s->next;
1294
0
        s->next = fa->initial->next;
1295
0
        fa->initial->next = s;
1296
0
    }
1297
0
}
1298
1299
/*
1300
 * Make a finite automaton deterministic using the given set of initial
1301
 * states with the subset construction. This also eliminates dead states
1302
 * and transitions and reduces and orders the transitions for each state
1303
 */
1304
0
static int determinize(struct fa *fa, struct state_set *ini) {
1305
0
    int npoints;
1306
0
    int make_ini = (ini == NULL);
1307
0
    const uchar *points = NULL;
1308
0
    state_set_hash *newstate = NULL;
1309
0
    struct state_set_list *worklist = NULL;
1310
0
    int ret = 0;
1311
1312
0
    if (fa->deterministic)
1313
0
        return 0;
1314
1315
0
    points = start_points(fa, &npoints);
1316
0
    E(points == NULL);
1317
0
    if (make_ini) {
1318
0
        ini = state_set_init(-1, S_NONE);
1319
0
        if (ini == NULL || state_set_push(ini, fa->initial) < 0) {
1320
0
            state_set_free(ini);
1321
0
            goto error;
1322
0
        }
1323
0
    }
1324
1325
0
    F(state_set_list_add(&worklist, ini));
1326
0
    F(state_set_hash_add(&newstate, ini, fa));
1327
    // Make the new state the initial state
1328
0
    swap_initial(fa);
1329
0
    while (worklist != NULL) {
1330
0
        struct state_set *sset = state_set_list_pop(&worklist);
1331
0
        struct state *r = state_set_hash_get_state(newstate, sset);
1332
0
        for (int q=0; q < sset->used; q++) {
1333
0
            r->accept |= sset->states[q]->accept;
1334
0
        }
1335
0
        for (int n=0; n < npoints; n++) {
1336
0
            struct state_set *pset = state_set_init(-1, S_SORTED);
1337
0
            E(pset == NULL);
1338
0
            for(int q=0 ; q < sset->used; q++) {
1339
0
                for_each_trans(t, sset->states[q]) {
1340
0
                    if (t->min <= points[n] && points[n] <= t->max) {
1341
0
                        F(state_set_add(pset, t->to));
1342
0
                    }
1343
0
                }
1344
0
            }
1345
0
            if (!state_set_hash_contains(newstate, pset)) {
1346
0
                F(state_set_list_add(&worklist, pset));
1347
0
                F(state_set_hash_add(&newstate, pset, fa));
1348
0
            }
1349
0
            pset = state_set_hash_uniq(newstate, pset);
1350
1351
0
            struct state *q = state_set_hash_get_state(newstate, pset);
1352
0
            uchar min = points[n];
1353
0
            uchar max = UCHAR_MAX;
1354
0
            if (n+1 < npoints)
1355
0
                max = points[n+1] - 1;
1356
0
            if (add_new_trans(r, q, min, max) < 0)
1357
0
                goto error;
1358
0
        }
1359
0
    }
1360
0
    fa->deterministic = 1;
1361
1362
0
 done:
1363
0
    if (newstate)
1364
0
        state_set_hash_free(newstate, make_ini ? NULL : ini);
1365
0
    free((void *) points);
1366
0
    if (collect(fa) < 0)
1367
0
        ret = -1;
1368
0
    return ret;
1369
0
 error:
1370
0
    ret = -1;
1371
0
    goto done;
1372
0
}
1373
1374
/*
1375
 * Minimization. As a sideeffect of minimization, the transitions are
1376
 * reduced and ordered.
1377
 */
1378
1379
0
static struct state *step(struct state *s, uchar c) {
1380
0
    for_each_trans(t, s) {
1381
0
        if (t->min <= c && c <= t->max)
1382
0
            return t->to;
1383
0
    }
1384
0
    return NULL;
1385
0
}
1386
1387
struct state_list {
1388
    struct state_list_node *first;
1389
    struct state_list_node *last;
1390
    unsigned int            size;
1391
};
1392
1393
struct state_list_node {
1394
    struct state_list      *sl;
1395
    struct state_list_node *next;
1396
    struct state_list_node *prev;
1397
    struct state           *state;
1398
};
1399
1400
static struct state_list_node *state_list_add(struct state_list *sl,
1401
0
                                              struct state *s) {
1402
0
    struct state_list_node *n;
1403
1404
0
    if (ALLOC(n) < 0)
1405
0
        return NULL;
1406
1407
0
    n->state = s;
1408
0
    n->sl = sl;
1409
1410
0
    if (sl->size++ == 0) {
1411
0
        sl->first = n;
1412
0
        sl->last = n;
1413
0
    } else {
1414
0
        sl->last->next = n;
1415
0
        n->prev = sl->last;
1416
0
        sl->last = n;
1417
0
    }
1418
0
    return n;
1419
0
}
1420
1421
0
static void state_list_remove(struct state_list_node *n) {
1422
0
    struct state_list *sl = n->sl;
1423
0
    sl->size -= 1;
1424
0
    if (sl->first == n)
1425
0
        sl->first = n->next;
1426
0
    else
1427
0
        n->prev->next = n->next;
1428
0
    if (sl->last == n)
1429
0
        sl->last = n->prev;
1430
0
    else
1431
0
        n->next->prev = n->prev;
1432
1433
0
    free(n);
1434
0
}
1435
1436
0
static void state_list_free(struct state_list *sl) {
1437
0
    if (sl)
1438
0
        list_free(sl->first);
1439
0
    free(sl);
1440
0
}
1441
1442
/* The linear index of element (q,c) in an NSTATES * NSIGMA matrix */
1443
0
#define INDEX(q, c) (q * nsigma + c)
1444
1445
0
static int minimize_hopcroft(struct fa *fa) {
1446
0
    struct state_set *states = NULL;
1447
0
    uchar *sigma = NULL;
1448
0
    struct state_set **reverse = NULL;
1449
0
    bitset *reverse_nonempty = NULL;
1450
0
    struct state_set **partition = NULL;
1451
0
    unsigned int *block = NULL;
1452
0
    struct state_list **active = NULL;
1453
0
    struct state_list_node **active2 = NULL;
1454
0
    int *pending = NULL;
1455
0
    bitset *pending2 = NULL;
1456
0
    struct state_set *split = NULL;
1457
0
    bitset *split2 = NULL;
1458
0
    int *refine = NULL;
1459
0
    bitset *refine2 = NULL;
1460
0
    struct state_set **splitblock = NULL;
1461
0
    struct state_set *newstates = NULL;
1462
0
    int *nsnum = NULL;
1463
0
    int *nsind = NULL;
1464
0
    int result = -1;
1465
0
    unsigned int nstates = 0;
1466
0
    int nsigma = 0;
1467
1468
0
    F(determinize(fa, NULL));
1469
1470
    /* Total automaton, nothing to do */
1471
0
    if (fa->initial->tused == 1
1472
0
        && fa->initial->trans[0].to == fa->initial
1473
0
        && fa->initial->trans[0].min == UCHAR_MIN
1474
0
        && fa->initial->trans[0].max == UCHAR_MAX)
1475
0
        return 0;
1476
1477
0
    F(totalize(fa));
1478
1479
    /* make arrays for numbered states and effective alphabet */
1480
0
    states = state_set_init(-1, S_NONE);
1481
0
    E(states == NULL);
1482
1483
0
    list_for_each(s, fa->initial) {
1484
0
        F(state_set_push(states, s));
1485
0
    }
1486
0
    nstates = states->used;
1487
1488
0
    sigma = start_points(fa, &nsigma);
1489
0
    E(sigma == NULL);
1490
1491
    /* initialize data structures */
1492
1493
    /* An ss->used x nsigma matrix of lists of states */
1494
0
    F(ALLOC_N(reverse, nstates * nsigma));
1495
0
    reverse_nonempty = bitset_init(nstates * nsigma);
1496
0
    E(reverse_nonempty == NULL);
1497
0
    F(ALLOC_N(partition, nstates));
1498
0
    F(ALLOC_N(block, nstates));
1499
1500
0
    F(ALLOC_N(active, nstates * nsigma));
1501
0
    F(ALLOC_N(active2, nstates * nsigma));
1502
1503
    /* PENDING is an array of pairs of ints. The i'th pair is stored in
1504
     * PENDING[2*i] and PENDING[2*i + 1]. There are NPENDING pairs in
1505
     * PENDING at any time. SPENDING is the maximum number of pairs
1506
     * allocated for PENDING.
1507
     */
1508
0
    size_t npending = 0, spending = 0;
1509
0
    pending2 = bitset_init(nstates * nsigma);
1510
0
    E(pending2 == NULL);
1511
1512
0
    split = state_set_init(-1, S_NONE);
1513
0
    split2 = bitset_init(nstates);
1514
0
    E(split == NULL || split2 == NULL);
1515
1516
0
    F(ALLOC_N(refine, nstates));
1517
0
    refine2 = bitset_init(nstates);
1518
0
    E(refine2 == NULL);
1519
1520
0
    F(ALLOC_N(splitblock, nstates));
1521
1522
0
    for (int q = 0; q < nstates; q++) {
1523
0
        splitblock[q] = state_set_init(-1, S_NONE);
1524
0
        partition[q] = state_set_init(-1, S_NONE);
1525
0
        E(splitblock[q] == NULL || partition[q] == NULL);
1526
0
        for (int x = 0; x < nsigma; x++) {
1527
0
            reverse[INDEX(q, x)] = state_set_init(-1, S_NONE);
1528
0
            E(reverse[INDEX(q, x)] == NULL);
1529
0
            F(ALLOC_N(active[INDEX(q, x)], 1));
1530
0
        }
1531
0
    }
1532
1533
    /* find initial partition and reverse edges */
1534
0
    for (int q = 0; q < nstates; q++) {
1535
0
        struct state *qq = states->states[q];
1536
0
        int j;
1537
0
        if (qq->accept)
1538
0
            j = 0;
1539
0
        else
1540
0
            j = 1;
1541
0
        F(state_set_push(partition[j], qq));
1542
0
        block[q] = j;
1543
0
        for (int x = 0; x < nsigma; x++) {
1544
0
            uchar y = sigma[x];
1545
0
            struct state *p = step(qq, y);
1546
0
            assert(p != NULL);
1547
0
            int pn = state_set_index(states, p);
1548
0
            assert(pn >= 0);
1549
0
            F(state_set_push(reverse[INDEX(pn, x)], qq));
1550
0
            bitset_set(reverse_nonempty, INDEX(pn, x));
1551
0
        }
1552
0
    }
1553
1554
    /* initialize active sets */
1555
0
    for (int j = 0; j <= 1; j++)
1556
0
        for (int x = 0; x < nsigma; x++)
1557
0
            for (int q = 0; q < partition[j]->used; q++) {
1558
0
                struct state *qq = partition[j]->states[q];
1559
0
                int qn = state_set_index(states, qq);
1560
0
                if (bitset_get(reverse_nonempty, INDEX(qn, x))) {
1561
0
                    active2[INDEX(qn, x)] =
1562
0
                        state_list_add(active[INDEX(j, x)], qq);
1563
0
                    E(active2[INDEX(qn, x)] == NULL);
1564
0
                }
1565
0
            }
1566
1567
    /* initialize pending */
1568
0
    F(ALLOC_N(pending, 2*nsigma));
1569
0
    npending = nsigma;
1570
0
    spending = nsigma;
1571
0
    for (int x = 0; x < nsigma; x++) {
1572
0
        int a0 = active[INDEX(0,x)]->size;
1573
0
        int a1 = active[INDEX(1,x)]->size;
1574
0
        int j;
1575
0
        if (a0 <= a1)
1576
0
            j = 0;
1577
0
        else
1578
0
            j = 1;
1579
0
        pending[2*x] = j;
1580
0
        pending[2*x+1] = x;
1581
0
        bitset_set(pending2, INDEX(j, x));
1582
0
    }
1583
1584
    /* process pending until fixed point */
1585
0
    int k = 2;
1586
0
    while (npending-- > 0) {
1587
0
        int p = pending[2*npending];
1588
0
        int x = pending[2*npending+1];
1589
0
        bitset_clr(pending2, INDEX(p, x));
1590
0
        int ref = 0;
1591
        /* find states that need to be split off their blocks */
1592
0
        struct state_list *sh = active[INDEX(p,x)];
1593
0
        for (struct state_list_node *m = sh->first; m != NULL; m = m->next) {
1594
0
            int q = state_set_index(states, m->state);
1595
0
            struct state_set *rev = reverse[INDEX(q, x)];
1596
0
            for (int r =0; r < rev->used; r++) {
1597
0
                struct state *rs = rev->states[r];
1598
0
                int s = state_set_index(states, rs);
1599
0
                if (! bitset_get(split2, s)) {
1600
0
                    bitset_set(split2, s);
1601
0
                    F(state_set_push(split, rs));
1602
0
                    int j = block[s];
1603
0
                    F(state_set_push(splitblock[j], rs));
1604
0
                    if (!bitset_get(refine2, j)) {
1605
0
                        bitset_set(refine2, j);
1606
0
                        refine[ref++] = j;
1607
0
                    }
1608
0
                }
1609
0
            }
1610
0
        }
1611
        // refine blocks
1612
0
        for(int rr=0; rr < ref; rr++) {
1613
0
            int j = refine[rr];
1614
0
            struct state_set *sp = splitblock[j];
1615
0
            if (sp->used < partition[j]->used) {
1616
0
                struct state_set *b1 = partition[j];
1617
0
                struct state_set *b2 = partition[k];
1618
0
                for (int s = 0; s < sp->used; s++) {
1619
0
                    state_set_remove(b1, sp->states[s]);
1620
0
                    F(state_set_push(b2, sp->states[s]));
1621
0
                    int snum = state_set_index(states, sp->states[s]);
1622
0
                    block[snum] = k;
1623
0
                    for (int c = 0; c < nsigma; c++) {
1624
0
                        struct state_list_node *sn = active2[INDEX(snum, c)];
1625
0
                        if (sn != NULL && sn->sl == active[INDEX(j,c)]) {
1626
0
                            state_list_remove(sn);
1627
0
                            active2[INDEX(snum, c)] =
1628
0
                                state_list_add(active[INDEX(k, c)],
1629
0
                                               sp->states[s]);
1630
0
                            E(active2[INDEX(snum, c)] == NULL);
1631
0
                        }
1632
0
                    }
1633
0
                }
1634
                // update pending
1635
0
                for (int c = 0; c < nsigma; c++) {
1636
0
                    int aj = active[INDEX(j, c)]->size;
1637
0
                    int ak = active[INDEX(k, c)]->size;
1638
0
                    if (npending + 1 > spending) {
1639
0
                        spending *= 2;
1640
0
                        F(REALLOC_N(pending, 2 * spending));
1641
0
                    }
1642
0
                    pending[2*npending + 1] = c;
1643
0
                    if (!bitset_get(pending2, INDEX(j, c))
1644
0
                        && 0 < aj && aj <= ak) {
1645
0
                        bitset_set(pending2, INDEX(j, c));
1646
0
                        pending[2*npending] = j;
1647
0
                    } else {
1648
0
                        bitset_set(pending2, INDEX(k, c));
1649
0
                        pending[2*npending] = k;
1650
0
                    }
1651
0
                    npending += 1;
1652
0
                }
1653
0
                k++;
1654
0
            }
1655
0
            for (int s = 0; s < sp->used; s++) {
1656
0
                int snum = state_set_index(states, sp->states[s]);
1657
0
                bitset_clr(split2, snum);
1658
0
            }
1659
0
            bitset_clr(refine2, j);
1660
0
            sp->used = 0;
1661
0
        }
1662
0
        split->used = 0;
1663
0
    }
1664
1665
    /* make a new state for each equivalence class, set initial state */
1666
0
    newstates = state_set_init(k, S_NONE);
1667
0
    E(newstates == NULL);
1668
0
    F(ALLOC_N(nsnum, k));
1669
0
    F(ALLOC_N(nsind, nstates));
1670
1671
0
    for (int n = 0; n < k; n++) {
1672
0
        struct state *s = make_state();
1673
0
        E(s == NULL);
1674
0
        newstates->states[n] = s;
1675
0
        struct state_set *partn = partition[n];
1676
0
        for (int q=0; q < partn->used; q++) {
1677
0
            struct state *qs = partn->states[q];
1678
0
            int qnum = state_set_index(states, qs);
1679
0
            if (qs == fa->initial)
1680
0
                s->live = 1;     /* Abuse live to flag the new initial state */
1681
0
            nsnum[n] = qnum;     /* select representative */
1682
0
            nsind[qnum] = n;     /* and point from partition to new state */
1683
0
        }
1684
0
    }
1685
1686
    /* build transitions and set acceptance */
1687
0
    for (int n = 0; n < k; n++) {
1688
0
        struct state *s = newstates->states[n];
1689
0
        s->accept = states->states[nsnum[n]]->accept;
1690
0
        for_each_trans(t, states->states[nsnum[n]]) {
1691
0
            int toind = state_set_index(states, t->to);
1692
0
            struct state *nto = newstates->states[nsind[toind]];
1693
0
            F(add_new_trans(s, nto, t->min, t->max));
1694
0
        }
1695
0
    }
1696
1697
    /* Get rid of old states and transitions and turn NEWTSTATES into
1698
       a linked list */
1699
0
    gut(fa);
1700
0
    for (int n=0; n < k; n++)
1701
0
        if (newstates->states[n]->live) {
1702
0
            struct state *ini = newstates->states[n];
1703
0
            newstates->states[n] = newstates->states[0];
1704
0
            newstates->states[0] = ini;
1705
0
        }
1706
0
    for (int n=0; n < k-1; n++)
1707
0
        newstates->states[n]->next = newstates->states[n+1];
1708
0
    fa->initial = newstates->states[0];
1709
1710
0
    result = 0;
1711
1712
    /* clean up */
1713
0
 done:
1714
0
    free(nsind);
1715
0
    free(nsnum);
1716
0
    state_set_free(states);
1717
0
    free(sigma);
1718
0
    bitset_free(reverse_nonempty);
1719
0
    free(block);
1720
0
    for (int i=0; i < nstates*nsigma; i++) {
1721
0
        if (reverse)
1722
0
            state_set_free(reverse[i]);
1723
0
        if (active)
1724
0
            state_list_free(active[i]);
1725
0
    }
1726
0
    free(reverse);
1727
0
    free(active);
1728
0
    free(active2);
1729
0
    free(pending);
1730
0
    bitset_free(pending2);
1731
0
    state_set_free(split);
1732
0
    bitset_free(split2);
1733
0
    free(refine);
1734
0
    bitset_free(refine2);
1735
0
    for (int q=0; q < nstates; q++) {
1736
0
        if (splitblock)
1737
0
            state_set_free(splitblock[q]);
1738
0
        if (partition)
1739
0
            state_set_free(partition[q]);
1740
0
    }
1741
0
    free(splitblock);
1742
0
    free(partition);
1743
0
    state_set_free(newstates);
1744
1745
0
    if (collect(fa) < 0)
1746
0
        result = -1;
1747
0
    return result;
1748
0
 error:
1749
0
    result = -1;
1750
0
    goto done;
1751
0
}
1752
1753
0
static int minimize_brzozowski(struct fa *fa) {
1754
0
    struct state_set *set;
1755
1756
    /* Minimize using Brzozowski's algorithm */
1757
0
    set = fa_reverse(fa);
1758
0
    E(set == NULL);
1759
0
    F(determinize(fa, set));
1760
0
    state_set_free(set);
1761
1762
0
    set = fa_reverse(fa);
1763
0
    E(set == NULL);
1764
0
    F(determinize(fa, set));
1765
0
    state_set_free(set);
1766
0
    return 0;
1767
0
 error:
1768
0
    return -1;
1769
0
}
1770
1771
0
int fa_minimize(struct fa *fa) {
1772
0
    int r;
1773
1774
0
    if (fa == NULL)
1775
0
        return -1;
1776
0
    if (fa->minimal)
1777
0
        return 0;
1778
1779
0
    if (fa_minimization_algorithm == FA_MIN_BRZOZOWSKI) {
1780
0
        r = minimize_brzozowski(fa);
1781
0
    } else {
1782
0
        r = minimize_hopcroft(fa);
1783
0
    }
1784
1785
0
    if (r == 0)
1786
0
        fa->minimal = 1;
1787
0
    return r;
1788
0
}
1789
1790
/*
1791
 * Construction of fa
1792
 */
1793
1794
0
static struct fa *fa_make_empty(void) {
1795
0
    struct fa *fa;
1796
1797
0
    if (ALLOC(fa) < 0)
1798
0
        return NULL;
1799
0
    if (add_state(fa, 0) == NULL) {
1800
0
        fa_free(fa);
1801
0
        return NULL;
1802
0
    }
1803
    /* Even though, technically, this fa is both minimal and deterministic,
1804
     * this function is also used to allocate new fa's which are then modified
1805
     * further. Rather than risk erroneously marking such an fa as minimal
1806
     * and deterministic, we do not do that here and take the minor hit if
1807
     * that should ever need to be determined for an actual empty fa
1808
     */
1809
0
    return fa;
1810
0
}
1811
1812
0
static struct fa *fa_make_epsilon(void) {
1813
0
    struct fa *fa = fa_make_empty();
1814
0
    if (fa) {
1815
0
        fa->initial->accept = 1;
1816
0
        fa->deterministic= 1;
1817
0
        fa->minimal = 1;
1818
0
    }
1819
0
    return fa;
1820
0
}
1821
1822
0
static struct fa *fa_make_char(uchar c) {
1823
0
    struct fa *fa = fa_make_empty();
1824
0
    if (! fa)
1825
0
        return NULL;
1826
1827
0
    struct state *s = fa->initial;
1828
0
    struct state *t = add_state(fa, 1);
1829
0
    int r;
1830
1831
0
    if (t == NULL)
1832
0
        goto error;
1833
1834
0
    r = add_new_trans(s, t, c, c);
1835
0
    if (r < 0)
1836
0
        goto error;
1837
0
    fa->deterministic = 1;
1838
0
    fa->minimal = 1;
1839
0
    return fa;
1840
0
 error:
1841
0
    fa_free(fa);
1842
0
    return NULL;
1843
0
}
1844
1845
0
struct fa *fa_make_basic(unsigned int basic) {
1846
0
    int r;
1847
1848
0
    if (basic == FA_EMPTY) {
1849
0
        return fa_make_empty();
1850
0
    } else if (basic == FA_EPSILON) {
1851
0
        return fa_make_epsilon();
1852
0
    } else if (basic == FA_TOTAL) {
1853
0
        struct fa *fa = fa_make_epsilon();
1854
0
        r = add_new_trans(fa->initial, fa->initial, UCHAR_MIN, UCHAR_MAX);
1855
0
        if (r < 0) {
1856
0
            fa_free(fa);
1857
0
            fa = NULL;
1858
0
        }
1859
0
        return fa;
1860
0
    }
1861
0
    return NULL;
1862
0
}
1863
1864
0
int fa_is_basic(struct fa *fa, unsigned int basic) {
1865
0
    if (basic == FA_EMPTY) {
1866
0
        return ! fa->initial->accept && fa->initial->tused == 0;
1867
0
    } else if (basic == FA_EPSILON) {
1868
0
        return fa->initial->accept && fa->initial->tused == 0;
1869
0
    } else if (basic == FA_TOTAL) {
1870
0
        if (! fa->initial->accept)
1871
0
            return 0;
1872
0
        if (fa->nocase) {
1873
0
            if (fa->initial->tused != 2)
1874
0
                return 0;
1875
0
            struct trans *t1 = fa->initial->trans;
1876
0
            struct trans *t2 = fa->initial->trans + 1;
1877
0
            if (t1->to != fa->initial || t2->to != fa->initial)
1878
0
                return 0;
1879
0
            if (t2->max != UCHAR_MAX) {
1880
0
                t1 = t2;
1881
0
                t2 = fa->initial->trans;
1882
0
            }
1883
0
            return (t1->min == UCHAR_MIN && t1->max == 'A' - 1 &&
1884
0
                    t2->min == 'Z' + 1 && t2->max == UCHAR_MAX);
1885
0
        } else {
1886
0
            struct trans *t = fa->initial->trans;
1887
0
            return fa->initial->tused == 1 &&
1888
0
                t->to == fa->initial &&
1889
0
                t->min == UCHAR_MIN && t->max == UCHAR_MAX;
1890
0
        }
1891
0
    }
1892
0
    return 0;
1893
0
}
1894
1895
0
static struct fa *fa_clone(struct fa *fa) {
1896
0
    struct fa *result = NULL;
1897
0
    struct state_set *set = state_set_init(-1, S_DATA|S_SORTED);
1898
0
    int r;
1899
1900
0
    if (fa == NULL || set == NULL || ALLOC(result) < 0)
1901
0
        goto error;
1902
1903
0
    result->deterministic = fa->deterministic;
1904
0
    result->minimal = fa->minimal;
1905
0
    result->nocase = fa->nocase;
1906
0
    list_for_each(s, fa->initial) {
1907
0
        int i = state_set_push(set, s);
1908
0
        E(i < 0);
1909
1910
0
        struct state *q = add_state(result, s->accept);
1911
0
        if (q == NULL)
1912
0
            goto error;
1913
0
        set->data[i] = q;
1914
0
        q->live = s->live;
1915
0
        q->reachable = s->reachable;
1916
0
    }
1917
0
    for (int i=0; i < set->used; i++) {
1918
0
        struct state *s = set->states[i];
1919
0
        struct state *sc = set->data[i];
1920
0
        for_each_trans(t, s) {
1921
0
            int to = state_set_index(set, t->to);
1922
0
            assert(to >= 0);
1923
0
            struct state *toc = set->data[to];
1924
0
            r = add_new_trans(sc, toc, t->min, t->max);
1925
0
            if (r < 0)
1926
0
                goto error;
1927
0
        }
1928
0
    }
1929
0
    state_set_free(set);
1930
0
    return result;
1931
0
 error:
1932
0
    state_set_free(set);
1933
0
    fa_free(result);
1934
0
    return NULL;
1935
0
}
1936
1937
static int case_expand(struct fa *fa);
1938
1939
/* Compute FA1|FA2 and set FA1 to that automaton. FA2 is freed */
1940
ATTRIBUTE_RETURN_CHECK
1941
0
static int union_in_place(struct fa *fa1, struct fa **fa2) {
1942
0
    struct state *s;
1943
0
    int r;
1944
1945
0
    if (fa1->nocase != (*fa2)->nocase) {
1946
0
        if (case_expand(fa1) < 0)
1947
0
            return -1;
1948
0
        if (case_expand(*fa2) < 0)
1949
0
            return -1;
1950
0
    }
1951
1952
0
    s = add_state(fa1, 0);
1953
0
    if (s == NULL)
1954
0
        return -1;
1955
0
    r = add_epsilon_trans(s, fa1->initial);
1956
0
    if (r < 0)
1957
0
        return -1;
1958
0
    r = add_epsilon_trans(s, (*fa2)->initial);
1959
0
    if (r < 0)
1960
0
        return -1;
1961
1962
0
    fa1->deterministic = 0;
1963
0
    fa1->minimal = 0;
1964
0
    fa_merge(fa1, fa2);
1965
1966
0
    set_initial(fa1, s);
1967
1968
0
    return 0;
1969
0
}
1970
1971
0
struct fa *fa_union(struct fa *fa1, struct fa *fa2) {
1972
0
    fa1 = fa_clone(fa1);
1973
0
    fa2 = fa_clone(fa2);
1974
0
    if (fa1 == NULL || fa2 == NULL)
1975
0
        goto error;
1976
1977
0
    F(union_in_place(fa1, &fa2));
1978
1979
0
    return fa1;
1980
0
 error:
1981
0
    fa_free(fa1);
1982
0
    fa_free(fa2);
1983
0
    return NULL;
1984
0
}
1985
1986
/* Concat FA2 onto FA1; frees FA2 and changes FA1 to FA1.FA2 */
1987
ATTRIBUTE_RETURN_CHECK
1988
0
static int concat_in_place(struct fa *fa1, struct fa **fa2) {
1989
0
    int r;
1990
1991
0
    if (fa1->nocase != (*fa2)->nocase) {
1992
0
        if (case_expand(fa1) < 0)
1993
0
            return -1;
1994
0
        if (case_expand(*fa2) < 0)
1995
0
            return -1;
1996
0
    }
1997
1998
0
    list_for_each(s, fa1->initial) {
1999
0
        if (s->accept) {
2000
0
            s->accept = 0;
2001
0
            r = add_epsilon_trans(s, (*fa2)->initial);
2002
0
            if (r < 0)
2003
0
                return -1;
2004
0
        }
2005
0
    }
2006
2007
0
    fa1->deterministic = 0;
2008
0
    fa1->minimal = 0;
2009
0
    fa_merge(fa1, fa2);
2010
2011
0
    return 0;
2012
0
}
2013
2014
0
struct fa *fa_concat(struct fa *fa1, struct fa *fa2) {
2015
0
    fa1 = fa_clone(fa1);
2016
0
    fa2 = fa_clone(fa2);
2017
2018
0
    if (fa1 == NULL || fa2 == NULL)
2019
0
        goto error;
2020
2021
0
    F(concat_in_place(fa1, &fa2));
2022
2023
0
    F(collect(fa1));
2024
2025
0
    return fa1;
2026
2027
0
 error:
2028
0
    fa_free(fa1);
2029
0
    fa_free(fa2);
2030
0
    return NULL;
2031
0
}
2032
2033
0
static struct fa *fa_make_char_set(bitset *cset, int negate) {
2034
0
    struct fa *fa = fa_make_empty();
2035
0
    if (!fa)
2036
0
        return NULL;
2037
2038
0
    struct state *s = fa->initial;
2039
0
    struct state *t = add_state(fa, 1);
2040
0
    int from = 0;
2041
0
    int r;
2042
2043
0
    if (t == NULL)
2044
0
        goto error;
2045
2046
0
    while (from <= UCHAR_MAX) {
2047
0
        while (from <= UCHAR_MAX && bitset_get(cset, from) == negate)
2048
0
            from += 1;
2049
0
        if (from > UCHAR_MAX)
2050
0
            break;
2051
0
        int to = from;
2052
0
        while (to < UCHAR_MAX && (bitset_get(cset, to + 1) == !negate))
2053
0
            to += 1;
2054
0
        r = add_new_trans(s, t, from, to);
2055
0
        if (r < 0)
2056
0
            goto error;
2057
0
        from = to + 1;
2058
0
    }
2059
2060
0
    fa->deterministic = 1;
2061
0
    fa->minimal = 1;
2062
0
    return fa;
2063
2064
0
 error:
2065
0
    fa_free(fa);
2066
0
    return NULL;
2067
0
}
2068
2069
0
static struct fa *fa_star(struct fa *fa) {
2070
0
    struct state *s;
2071
0
    int r;
2072
2073
0
    fa = fa_clone(fa);
2074
0
    if (fa == NULL)
2075
0
        return NULL;
2076
2077
0
    s = add_state(fa, 1);
2078
0
    if (s == NULL)
2079
0
        goto error;
2080
2081
0
    r = add_epsilon_trans(s, fa->initial);
2082
0
    if (r < 0)
2083
0
        goto error;
2084
2085
0
    set_initial(fa, s);
2086
0
    list_for_each(p, fa->initial->next) {
2087
0
        if (p->accept) {
2088
0
            r = add_epsilon_trans(p, s);
2089
0
            if (r < 0)
2090
0
                goto error;
2091
0
        }
2092
0
    }
2093
0
    fa->deterministic = 0;
2094
0
    fa->minimal = 0;
2095
2096
0
    return fa;
2097
2098
0
 error:
2099
0
    fa_free(fa);
2100
0
    return NULL;
2101
0
}
2102
2103
/* Form the automaton (FA){N}; FA is not modified */
2104
0
static struct fa *repeat(struct fa *fa, int n) {
2105
0
    if (n == 0) {
2106
0
        return fa_make_epsilon();
2107
0
    } else if (n == 1) {
2108
0
        return fa_clone(fa);
2109
0
    } else {
2110
0
        struct fa *cfa = fa_clone(fa);
2111
0
        if (cfa == NULL)
2112
0
            return NULL;
2113
0
        while (n > 1) {
2114
0
            struct fa *tfa = fa_clone(fa);
2115
0
            if (tfa == NULL) {
2116
0
                fa_free(cfa);
2117
0
                return NULL;
2118
0
            }
2119
0
            if (concat_in_place(cfa, &tfa) < 0) {
2120
0
                fa_free(cfa);
2121
0
                fa_free(tfa);
2122
0
                return NULL;
2123
0
            }
2124
0
            n -= 1;
2125
0
        }
2126
0
        return cfa;
2127
0
    }
2128
0
}
2129
2130
0
struct fa *fa_iter(struct fa *fa, int min, int max) {
2131
0
    int r;
2132
2133
0
    if (min < 0)
2134
0
        min = 0;
2135
2136
0
    if (min > max && max != -1) {
2137
0
        return fa_make_empty();
2138
0
    }
2139
0
    if (max == -1) {
2140
0
        struct fa *sfa = fa_star(fa);
2141
0
        if (min == 0)
2142
0
            return sfa;
2143
0
        if (! sfa)
2144
0
            return NULL;
2145
0
        struct fa *cfa = repeat(fa, min);
2146
0
        if (! cfa) {
2147
0
            fa_free(sfa);
2148
0
            return NULL;
2149
0
        }
2150
0
        if (concat_in_place(cfa, &sfa) < 0) {
2151
0
            fa_free(sfa);
2152
0
            fa_free(cfa);
2153
0
            return NULL;
2154
0
        }
2155
0
        return cfa;
2156
0
    } else {
2157
0
        struct fa *cfa = NULL;
2158
2159
0
        max -= min;
2160
0
        cfa = repeat(fa, min);
2161
0
        if (cfa == NULL)
2162
0
            return NULL;
2163
0
        if (max > 0) {
2164
0
            struct fa *cfa2 = fa_clone(fa);
2165
0
            if (cfa2 == NULL) {
2166
0
                fa_free(cfa);
2167
0
                return NULL;
2168
0
            }
2169
0
            while (max > 1) {
2170
0
                struct fa *cfa3 = fa_clone(fa);
2171
0
                if (cfa3 == NULL) {
2172
0
                    fa_free(cfa);
2173
0
                    fa_free(cfa2);
2174
0
                    return NULL;
2175
0
                }
2176
0
                list_for_each(s, cfa3->initial) {
2177
0
                    if (s->accept) {
2178
0
                        r = add_epsilon_trans(s, cfa2->initial);
2179
0
                        if (r < 0) {
2180
0
                            fa_free(cfa);
2181
0
                            fa_free(cfa2);
2182
0
                            fa_free(cfa3);
2183
0
                            return NULL;
2184
0
                        }
2185
0
                    }
2186
0
                }
2187
0
                fa_merge(cfa3, &cfa2);
2188
0
                cfa2 = cfa3;
2189
0
                max -= 1;
2190
0
            }
2191
0
            list_for_each(s, cfa->initial) {
2192
0
                if (s->accept) {
2193
0
                    r = add_epsilon_trans(s, cfa2->initial);
2194
0
                    if (r < 0) {
2195
0
                        fa_free(cfa);
2196
0
                        fa_free(cfa2);
2197
0
                        return NULL;
2198
0
                    }
2199
0
                }
2200
0
            }
2201
0
            fa_merge(cfa, &cfa2);
2202
0
            cfa->deterministic = 0;
2203
0
            cfa->minimal = 0;
2204
0
        }
2205
0
        if (collect(cfa) < 0) {
2206
0
            fa_free(cfa);
2207
0
            cfa = NULL;
2208
0
        }
2209
0
        return cfa;
2210
0
    }
2211
0
}
2212
2213
0
static void sort_transition_intervals(struct fa *fa) {
2214
0
    list_for_each(s, fa->initial) {
2215
0
        qsort(s->trans, s->tused, sizeof(*s->trans), trans_intv_cmp);
2216
0
    }
2217
0
}
2218
2219
0
struct fa *fa_intersect(struct fa *fa1, struct fa *fa2) {
2220
0
    int ret;
2221
0
    struct fa *fa = NULL;
2222
0
    struct state_set *worklist = NULL;
2223
0
    state_triple_hash *newstates = NULL;
2224
2225
0
    if (fa1 == fa2)
2226
0
        return fa_clone(fa1);
2227
2228
0
    if (fa_is_basic(fa1, FA_EMPTY) || fa_is_basic(fa2, FA_EMPTY))
2229
0
        return fa_make_empty();
2230
2231
0
    if (fa1->nocase != fa2->nocase) {
2232
0
        F(case_expand(fa1));
2233
0
        F(case_expand(fa2));
2234
0
    }
2235
2236
0
    fa = fa_make_empty();
2237
0
    worklist = state_set_init(-1, S_NONE);
2238
0
    newstates = state_triple_init();
2239
0
    if (fa == NULL || worklist == NULL || newstates == NULL)
2240
0
        goto error;
2241
2242
0
    sort_transition_intervals(fa1);
2243
0
    sort_transition_intervals(fa2);
2244
2245
0
    F(state_set_push(worklist, fa1->initial));
2246
0
    F(state_set_push(worklist, fa2->initial));
2247
0
    F(state_set_push(worklist, fa->initial));
2248
0
    F(state_triple_push(newstates,
2249
0
                         fa1->initial, fa2->initial, fa->initial));
2250
0
    while (worklist->used) {
2251
0
        struct state *s  = state_set_pop(worklist);
2252
0
        struct state *p2 = state_set_pop(worklist);
2253
0
        struct state *p1 = state_set_pop(worklist);
2254
0
        s->accept = p1->accept && p2->accept;
2255
2256
0
        struct trans *t1 = p1->trans;
2257
0
        struct trans *t2 = p2->trans;
2258
0
        for (int n1 = 0, b2 = 0; n1 < p1->tused; n1++) {
2259
0
            while (b2 < p2->tused && t2[b2].max < t1[n1].min)
2260
0
                b2++;
2261
0
            for (int n2 = b2;
2262
0
                 n2 < p2->tused && t1[n1].max >= t2[n2].min;
2263
0
                 n2++) {
2264
0
                if (t2[n2].max >= t1[n1].min) {
2265
0
                    struct state *r = state_triple_thd(newstates,
2266
0
                                                       t1[n1].to, t2[n2].to);
2267
0
                    if (r == NULL) {
2268
0
                        r = add_state(fa, 0);
2269
0
                        E(r == NULL);
2270
0
                        F(state_set_push(worklist, t1[n1].to));
2271
0
                        F(state_set_push(worklist, t2[n2].to));
2272
0
                        F(state_set_push(worklist, r));
2273
0
                        F(state_triple_push(newstates,
2274
0
                                             t1[n1].to, t2[n2].to, r));
2275
0
                    }
2276
0
                    char min = t1[n1].min > t2[n2].min
2277
0
                        ? t1[n1].min : t2[n2].min;
2278
0
                    char max = t1[n1].max < t2[n2].max
2279
0
                        ? t1[n1].max : t2[n2].max;
2280
0
                    ret = add_new_trans(s, r, min, max);
2281
0
                    if (ret < 0)
2282
0
                        goto error;
2283
0
                }
2284
0
            }
2285
0
        }
2286
0
    }
2287
0
    fa->deterministic = fa1->deterministic && fa2->deterministic;
2288
0
    fa->nocase = fa1->nocase && fa2->nocase;
2289
0
 done:
2290
0
    state_set_free(worklist);
2291
0
    state_triple_free(newstates);
2292
0
    if (fa != NULL) {
2293
0
        if (collect(fa) < 0) {
2294
0
            fa_free(fa);
2295
0
            fa = NULL;
2296
0
        }
2297
0
    }
2298
2299
0
    return fa;
2300
0
 error:
2301
0
    fa_free(fa);
2302
0
    fa = NULL;
2303
0
    goto done;
2304
0
}
2305
2306
0
int fa_contains(struct fa *fa1, struct fa *fa2) {
2307
0
    int result = 0;
2308
0
    struct state_set *worklist = NULL;  /* List of pairs of states */
2309
0
    struct state_set *visited = NULL;   /* List of pairs of states */
2310
2311
0
    if (fa1 == NULL || fa2 == NULL)
2312
0
        return -1;
2313
2314
0
    if (fa1 == fa2)
2315
0
        return 1;
2316
2317
0
    F(determinize(fa2, NULL));
2318
0
    sort_transition_intervals(fa1);
2319
0
    sort_transition_intervals(fa2);
2320
2321
0
    F(state_pair_push(&worklist, fa1->initial, fa2->initial));
2322
0
    F(state_pair_push(&visited, fa1->initial, fa2->initial));
2323
0
    while (worklist->used) {
2324
0
        struct state *p1, *p2;
2325
0
        void *v2;
2326
0
        p1 = state_set_pop_data(worklist, &v2);
2327
0
        p2 = v2;
2328
2329
0
        if (p1->accept && !p2->accept)
2330
0
            goto done;
2331
2332
0
        struct trans *t1 = p1->trans;
2333
0
        struct trans *t2 = p2->trans;
2334
0
        for(int n1 = 0, b2 = 0; n1 < p1->tused; n1++) {
2335
0
            while (b2 < p2->tused && t2[b2].max < t1[n1].min)
2336
0
                b2++;
2337
0
            int min1 = t1[n1].min, max1 = t1[n1].max;
2338
0
            for (int n2 = b2;
2339
0
                 n2 < p2->tused && t1[n1].max >= t2[n2].min;
2340
0
                 n2++) {
2341
0
                if (t2[n2].min > min1)
2342
0
                    goto done;
2343
0
                if (t2[n2].max < CHAR_MAX)
2344
0
                    min1 = t2[n2].max + 1;
2345
0
                else {
2346
0
                    min1 = UCHAR_MAX;
2347
0
                    max1 = UCHAR_MIN;
2348
0
                }
2349
0
                if (state_pair_find(visited, t1[n1].to, t2[n2].to) == -1) {
2350
0
                    F(state_pair_push(&worklist, t1[n1].to, t2[n2].to));
2351
0
                    F(state_pair_push(&visited, t1[n1].to, t2[n2].to));
2352
0
                }
2353
0
            }
2354
0
            if (min1 <= max1)
2355
0
                goto done;
2356
0
        }
2357
0
    }
2358
2359
0
    result = 1;
2360
0
 done:
2361
0
    state_set_free(worklist);
2362
0
    state_set_free(visited);
2363
0
    return result;
2364
0
 error:
2365
0
    result = -1;
2366
0
    goto done;
2367
0
}
2368
2369
static int add_crash_trans(struct fa *fa, struct state *s, struct state *crash,
2370
0
                           int min, int max) {
2371
0
    int result;
2372
2373
0
    if (fa->nocase) {
2374
        /* Never transition on anything in [A-Z] */
2375
0
        if (min > 'Z' || max < 'A') {
2376
0
            result = add_new_trans(s, crash, min, max);
2377
0
        } else if (min >= 'A' && max <= 'Z') {
2378
0
            result = 0;
2379
0
        } else if (max <= 'Z') {
2380
            /* min < 'A' */
2381
0
            result = add_new_trans(s, crash, min, 'A' - 1);
2382
0
        } else if (min >= 'A') {
2383
            /* max > 'Z' */
2384
0
            result = add_new_trans(s, crash, 'Z' + 1, max);
2385
0
        } else {
2386
            /* min < 'A' && max > 'Z' */
2387
0
            result = add_new_trans(s, crash, min, 'A' - 1);
2388
0
            if (result == 0)
2389
0
                result = add_new_trans(s, crash, 'Z' + 1, max);
2390
0
        }
2391
0
    } else {
2392
0
        result = add_new_trans(s, crash, min, max);
2393
0
    }
2394
0
    return result;
2395
0
}
2396
2397
0
static int totalize(struct fa *fa) {
2398
0
    int r;
2399
0
    struct state *crash = add_state(fa, 0);
2400
2401
0
    E(crash == NULL);
2402
0
    F(mark_reachable(fa));
2403
0
    sort_transition_intervals(fa);
2404
2405
0
    r = add_crash_trans(fa, crash, crash, UCHAR_MIN, UCHAR_MAX);
2406
0
    if (r < 0)
2407
0
        return -1;
2408
2409
0
    list_for_each(s, fa->initial) {
2410
0
        int next = UCHAR_MIN;
2411
0
        int tused = s->tused;
2412
0
        for (int i=0; i < tused; i++) {
2413
0
            uchar min = s->trans[i].min, max = s->trans[i].max;
2414
0
            if (min > next) {
2415
0
                r = add_crash_trans(fa, s, crash, next, min - 1);
2416
0
                if (r < 0)
2417
0
                    return -1;
2418
0
            }
2419
0
            if (max + 1 > next)
2420
0
                next = max + 1;
2421
0
        }
2422
0
        if (next <= UCHAR_MAX) {
2423
0
            r = add_crash_trans(fa, s, crash, next, UCHAR_MAX);
2424
0
            if (r < 0)
2425
0
                return -1;
2426
0
        }
2427
0
    }
2428
0
    return 0;
2429
0
 error:
2430
0
    return -1;
2431
0
}
2432
2433
0
struct fa *fa_complement(struct fa *fa) {
2434
0
    fa = fa_clone(fa);
2435
0
    E(fa == NULL);
2436
0
    F(determinize(fa, NULL));
2437
0
    F(totalize(fa));
2438
0
    list_for_each(s, fa->initial)
2439
0
        s->accept = ! s->accept;
2440
2441
0
    F(collect(fa));
2442
0
    return fa;
2443
0
 error:
2444
0
    fa_free(fa);
2445
0
    return NULL;
2446
0
}
2447
2448
0
struct fa *fa_minus(struct fa *fa1, struct fa *fa2) {
2449
0
    if (fa1 == NULL || fa2 == NULL)
2450
0
        return NULL;
2451
2452
0
    if (fa_is_basic(fa1, FA_EMPTY) || fa1 == fa2)
2453
0
        return fa_make_empty();
2454
0
    if (fa_is_basic(fa2, FA_EMPTY))
2455
0
        return fa_clone(fa1);
2456
2457
0
    struct fa *cfa2 = fa_complement(fa2);
2458
0
    if (cfa2 == NULL)
2459
0
        return NULL;
2460
2461
0
    struct fa *result = fa_intersect(fa1, cfa2);
2462
0
    fa_free(cfa2);
2463
0
    return result;
2464
0
}
2465
2466
0
static int accept_to_accept(struct fa *fa) {
2467
0
    int r;
2468
0
    struct state *s = add_state(fa, 0);
2469
0
    if (s == NULL)
2470
0
        return -1;
2471
2472
0
    F(mark_reachable(fa));
2473
0
    list_for_each(a, fa->initial) {
2474
0
        if (a->accept && a->reachable) {
2475
0
            r = add_epsilon_trans(s, a);
2476
0
            if (r < 0)
2477
0
                return -1;
2478
0
        }
2479
0
    }
2480
2481
0
    set_initial(fa, s);
2482
0
    fa->deterministic = fa->minimal = 0;
2483
0
    return 0;
2484
0
 error:
2485
0
    return -1;
2486
0
}
2487
2488
0
struct fa *fa_overlap(struct fa *fa1, struct fa *fa2) {
2489
0
    struct fa *fa = NULL, *eps = NULL, *result = NULL;
2490
0
    struct state_set *map = NULL;
2491
2492
0
    if (fa1 == NULL || fa2 == NULL)
2493
0
        return NULL;
2494
2495
0
    fa1 = fa_clone(fa1);
2496
0
    fa2 = fa_clone(fa2);
2497
0
    if (fa1 == NULL || fa2 == NULL)
2498
0
        goto error;
2499
2500
0
    if (determinize(fa1, NULL) < 0)
2501
0
        goto error;
2502
0
    if (accept_to_accept(fa1) < 0)
2503
0
        goto error;
2504
2505
0
    map = fa_reverse(fa2);
2506
0
    state_set_free(map);
2507
0
    if (determinize(fa2, NULL) < 0)
2508
0
        goto error;
2509
0
    if (accept_to_accept(fa2) < 0)
2510
0
        goto error;
2511
0
    map = fa_reverse(fa2);
2512
0
    state_set_free(map);
2513
0
    if (determinize(fa2, NULL) < 0)
2514
0
        goto error;
2515
2516
0
    fa = fa_intersect(fa1, fa2);
2517
0
    if (fa == NULL)
2518
0
        goto error;
2519
2520
0
    eps = fa_make_epsilon();
2521
0
    if (eps == NULL)
2522
0
        goto error;
2523
2524
0
    result = fa_minus(fa, eps);
2525
0
    if (result == NULL)
2526
0
        goto error;
2527
2528
0
 error:
2529
0
    fa_free(fa1);
2530
0
    fa_free(fa2);
2531
0
    fa_free(fa);
2532
0
    fa_free(eps);
2533
0
    return result;
2534
0
}
2535
2536
0
int fa_equals(struct fa *fa1, struct fa *fa2) {
2537
0
    if (fa1 == NULL || fa2 == NULL)
2538
0
        return -1;
2539
2540
    /* fa_contains(fa1, fa2) && fa_contains(fa2, fa1) with error checking */
2541
0
    int c1 = fa_contains(fa1, fa2);
2542
0
    if (c1 < 0)
2543
0
        return -1;
2544
0
    if (c1 == 0)
2545
0
        return 0;
2546
0
    return fa_contains(fa2, fa1);
2547
0
}
2548
2549
0
static unsigned int chr_score(char c) {
2550
0
    if (isalpha(c)) {
2551
0
        return 2;
2552
0
    } else if (isalnum(c)) {
2553
0
        return 3;
2554
0
    } else if (isprint(c)) {
2555
0
        return 7;
2556
0
    } else if (c == '\0') {
2557
0
        return 10000;
2558
0
    } else {
2559
0
        return 100;
2560
0
    }
2561
0
}
2562
2563
0
static unsigned int str_score(const struct re_str *str) {
2564
0
    unsigned int score = 0;
2565
0
    for (int i=0; i < str->len; i++) {
2566
0
        score += chr_score(str->rx[i]);
2567
0
    }
2568
0
    return score;
2569
0
}
2570
2571
/* See if we get a better string for DST by appending C to SRC. If DST is
2572
 * NULL or empty, always use SRC + C
2573
 */
2574
static struct re_str *string_extend(struct re_str *dst,
2575
                                    const struct re_str *src,
2576
0
                                    char c) {
2577
0
    if (dst == NULL
2578
0
        || dst->len == 0
2579
0
        || str_score(src) + chr_score(c) < str_score(dst)) {
2580
0
        int slen = src->len;
2581
0
        if (dst == NULL)
2582
0
            dst = make_re_str(NULL);
2583
0
        if (dst == NULL)
2584
0
            return NULL;
2585
0
        if (REALLOC_N(dst->rx, slen+2) < 0) {
2586
0
            free(dst);
2587
0
            return NULL;
2588
0
        }
2589
0
        memcpy(dst->rx, src->rx, slen);
2590
0
        dst->rx[slen] = c;
2591
0
        dst->rx[slen + 1] = '\0';
2592
0
        dst->len = slen + 1;
2593
0
    }
2594
0
    return dst;
2595
0
}
2596
2597
0
static char pick_char(struct trans *t) {
2598
0
    for (int c = t->min; c <= t->max; c++)
2599
0
        if (isalpha(c)) return c;
2600
0
    for (int c = t->min; c <= t->max; c++)
2601
0
        if (isalnum(c)) return c;
2602
0
    for (int c = t->min; c <= t->max; c++)
2603
0
        if (isprint(c)) return c;
2604
0
    return t->max;
2605
0
}
2606
2607
/* Generate an example string for FA. Traverse all transitions and record
2608
 * at each turn the "best" word found for that state.
2609
 */
2610
0
int fa_example(struct fa *fa, char **example, size_t *example_len) {
2611
0
    struct re_str *word = NULL;
2612
0
    struct state_set *path = NULL, *worklist = NULL;
2613
0
    struct re_str *str = NULL;
2614
2615
0
    *example = NULL;
2616
0
    *example_len = 0;
2617
2618
    /* Sort to avoid any ambiguity because of reordering of transitions */
2619
0
    sort_transition_intervals(fa);
2620
2621
    /* Map from state to string */
2622
0
    path = state_set_init(-1, S_DATA|S_SORTED);
2623
0
    str = make_re_str("");
2624
0
    if (path == NULL || str == NULL)
2625
0
        goto error;
2626
0
    F(state_set_push_data(path, fa->initial, str));
2627
0
    str = NULL;
2628
2629
    /* List of states still to visit */
2630
0
    worklist = state_set_init(-1, S_NONE);
2631
0
    if (worklist == NULL)
2632
0
        goto error;
2633
0
    F(state_set_push(worklist, fa->initial));
2634
2635
0
    while (worklist->used) {
2636
0
        struct state *s = state_set_pop(worklist);
2637
0
        struct re_str *ps = state_set_find_data(path, s);
2638
0
        for_each_trans(t, s) {
2639
0
            char c = pick_char(t);
2640
0
            int toind = state_set_index(path, t->to);
2641
0
            if (toind == -1) {
2642
0
                struct re_str *w = string_extend(NULL, ps, c);
2643
0
                E(w == NULL);
2644
0
                F(state_set_push(worklist, t->to));
2645
0
                F(state_set_push_data(path, t->to, w));
2646
0
            } else {
2647
0
                path->data[toind] = string_extend(path->data[toind], ps, c);
2648
0
            }
2649
0
        }
2650
0
    }
2651
2652
0
    for (int i=0; i < path->used; i++) {
2653
0
        struct state *p = path->states[i];
2654
0
        struct re_str *ps = path->data[i];
2655
0
        if (p->accept &&
2656
0
            (word == NULL || word->len == 0
2657
0
             || (ps->len > 0 && str_score(word) > str_score(ps)))) {
2658
0
            free_re_str(word);
2659
0
            word = ps;
2660
0
        } else {
2661
0
            free_re_str(ps);
2662
0
        }
2663
0
    }
2664
0
    state_set_free(path);
2665
0
    state_set_free(worklist);
2666
0
    if (word != NULL) {
2667
0
        *example_len = word->len;
2668
0
        *example = word->rx;
2669
0
        free(word);
2670
0
    }
2671
0
    return 0;
2672
0
 error:
2673
0
    state_set_free(path);
2674
0
    state_set_free(worklist);
2675
0
    free_re_str(word);
2676
0
    free_re_str(str);
2677
0
    return -1;
2678
0
}
2679
2680
struct enum_intl {
2681
    int       limit;
2682
    int       nwords;
2683
    char    **words;
2684
    char     *buf;
2685
    size_t    bsize;
2686
};
2687
2688
0
static int fa_enumerate_intl(struct state *s, struct enum_intl *ei, int pos) {
2689
0
    int result = -1;
2690
2691
0
    if (ei->bsize <= pos + 1) {
2692
0
        ei->bsize *= 2;
2693
0
        F(REALLOC_N(ei->buf, ei->bsize));
2694
0
    }
2695
2696
0
    ei->buf[pos] = '\0';
2697
0
    for_each_trans(t, s) {
2698
0
        if (t->to->visited)
2699
0
            return -2;
2700
0
        t->to->visited = 1;
2701
0
        for (int i=t->min; i <= t->max; i++) {
2702
0
            ei->buf[pos] = i;
2703
0
            if (t->to->accept) {
2704
0
                if (ei->nwords >= ei->limit)
2705
0
                    return -2;
2706
0
                ei->words[ei->nwords] = strdup(ei->buf);
2707
0
                E(ei->words[ei->nwords] == NULL);
2708
0
                ei->nwords += 1;
2709
0
            }
2710
0
            result = fa_enumerate_intl(t->to, ei, pos+1);
2711
0
            E(result < 0);
2712
0
        }
2713
0
        t->to->visited = 0;
2714
0
    }
2715
0
    ei->buf[pos] = '\0';
2716
0
    result = 0;
2717
0
 error:
2718
0
    return result;
2719
0
}
2720
2721
0
int fa_enumerate(struct fa *fa, int limit, char ***words) {
2722
0
    struct enum_intl ei;
2723
0
    int result = -1;
2724
2725
0
    *words = NULL;
2726
0
    MEMZERO(&ei, 1);
2727
0
    ei.bsize = 8;                    /* Arbitrary initial size */
2728
0
    ei.limit = limit;
2729
0
    F(ALLOC_N(ei.words, limit));
2730
0
    F(ALLOC_N(ei.buf, ei.bsize));
2731
2732
    /* We use the visited bit to track which states we already visited
2733
     * during the construction of a word to detect loops */
2734
0
    list_for_each(s, fa->initial)
2735
0
        s->visited = 0;
2736
0
    fa->initial->visited = 1;
2737
0
    if (fa->initial->accept) {
2738
0
        if (ei.nwords >= limit)
2739
0
            return -2;
2740
0
        ei.words[0] = strdup("");
2741
0
        E(ei.words[0] == NULL);
2742
0
        ei.nwords = 1;
2743
0
    }
2744
0
    result = fa_enumerate_intl(fa->initial, &ei, 0);
2745
0
    E(result < 0);
2746
2747
0
    result = ei.nwords;
2748
0
    *words = ei.words;
2749
0
    ei.words = NULL;
2750
0
 done:
2751
0
    free(ei.buf);
2752
0
    return result;
2753
2754
0
 error:
2755
0
    for (int i=0; i < ei.nwords; i++)
2756
0
        free(ei.words[i]);
2757
0
    free(ei.words);
2758
0
    goto done;
2759
0
}
2760
2761
/* Expand the automaton FA by replacing every transition s(c) -> p from
2762
 * state s to p on character c by two transitions s(X) -> r, r(c) -> p via
2763
 * a new state r.
2764
 * If ADD_MARKER is true, also add for each original state s a new a loop
2765
 * s(Y) -> q and q(X) -> s through a new state q.
2766
 *
2767
 * The end result is that an automaton accepting "a|ab" is turned into one
2768
 * accepting "Xa|XaXb" if add_marker is false and "(YX)*Xa|(YX)*Xa(YX)*Xb"
2769
 * when add_marker is true.
2770
 *
2771
 * The returned automaton is a copy of FA, FA is not modified.
2772
 */
2773
static struct fa *expand_alphabet(struct fa *fa, int add_marker,
2774
0
                                  char X, char Y) {
2775
0
    int ret;
2776
2777
0
    fa = fa_clone(fa);
2778
0
    if (fa == NULL)
2779
0
        return NULL;
2780
2781
0
    F(mark_reachable(fa));
2782
0
    list_for_each(p, fa->initial) {
2783
0
        if (! p->reachable)
2784
0
            continue;
2785
2786
0
        struct state *r = add_state(fa, 0);
2787
0
        if (r == NULL)
2788
0
            goto error;
2789
0
        r->trans = p->trans;
2790
0
        r->tused = p->tused;
2791
0
        r->tsize = p->tsize;
2792
0
        p->trans = NULL;
2793
0
        p->tused = p->tsize = 0;
2794
0
        ret = add_new_trans(p, r, X, X);
2795
0
        if (ret < 0)
2796
0
            goto error;
2797
0
        if (add_marker) {
2798
0
            struct state *q = add_state(fa, 0);
2799
0
            ret = add_new_trans(p, q, Y, Y);
2800
0
            if (ret < 0)
2801
0
                goto error;
2802
0
            ret = add_new_trans(q, p, X, X);
2803
0
            if (ret < 0)
2804
0
                goto error;
2805
0
        }
2806
0
    }
2807
0
    return fa;
2808
0
 error:
2809
0
    fa_free(fa);
2810
0
    return NULL;
2811
0
}
2812
2813
0
static bitset *alphabet(struct fa *fa) {
2814
0
    bitset *bs = bitset_init(UCHAR_NUM);
2815
2816
0
    if (bs == NULL)
2817
0
        return NULL;
2818
2819
0
    list_for_each(s, fa->initial) {
2820
0
        for (int i=0; i < s->tused; i++) {
2821
0
            for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2822
0
                bitset_set(bs, c);
2823
0
        }
2824
0
    }
2825
0
    return bs;
2826
0
}
2827
2828
0
static bitset *last_chars(struct fa *fa) {
2829
0
    bitset *bs = bitset_init(UCHAR_NUM);
2830
2831
0
    if (bs == NULL)
2832
0
        return NULL;
2833
2834
0
    list_for_each(s, fa->initial) {
2835
0
        for (int i=0; i < s->tused; i++) {
2836
0
            if (s->trans[i].to->accept) {
2837
0
                for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2838
0
                    bitset_set(bs, c);
2839
0
            }
2840
0
        }
2841
0
    }
2842
0
    return bs;
2843
0
}
2844
2845
0
static bitset *first_chars(struct fa *fa) {
2846
0
    bitset *bs = bitset_init(UCHAR_NUM);
2847
0
    struct state *s = fa->initial;
2848
2849
0
    if (bs == NULL)
2850
0
        return NULL;
2851
2852
0
    for (int i=0; i < s->tused; i++) {
2853
0
        for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2854
0
            bitset_set(bs, c);
2855
0
    }
2856
0
    return bs;
2857
0
}
2858
2859
/* Return 1 if F1 and F2 are known to be unambiguously concatenable
2860
 * according to simple heuristics. Return 0 if they need to be checked
2861
 * further to decide ambiguity
2862
 * Return -1 if an allocation fails
2863
 */
2864
0
static int is_splittable(struct fa *fa1, struct fa *fa2) {
2865
0
    bitset *alpha1 = NULL;
2866
0
    bitset *alpha2 = NULL;
2867
0
    bitset *last1 = NULL;
2868
0
    bitset *first2 = NULL;
2869
0
    bool result = -1;
2870
2871
0
    alpha2 = alphabet(fa2);
2872
0
    last1 = last_chars(fa1);
2873
0
    if (alpha2 == NULL || last1 == NULL)
2874
0
        goto done;
2875
0
    if (bitset_disjoint(last1, alpha2, UCHAR_NUM)) {
2876
0
        result = 1;
2877
0
        goto done;
2878
0
    }
2879
2880
0
    alpha1 = alphabet(fa1);
2881
0
    first2 = first_chars(fa2);
2882
0
    if (alpha1 == NULL || first2 == NULL)
2883
0
        goto done;
2884
0
    if (bitset_disjoint(first2, alpha1, UCHAR_NUM)) {
2885
0
        result = 1;
2886
0
        goto done;
2887
0
    }
2888
0
    result = 0;
2889
0
 done:
2890
0
    bitset_free(alpha1);
2891
0
    bitset_free(alpha2);
2892
0
    bitset_free(last1);
2893
0
    bitset_free(first2);
2894
0
    return result;
2895
0
}
2896
2897
/* This algorithm is due to Anders Moeller, and can be found in class
2898
 * AutomatonOperations in dk.brics.grammar
2899
 */
2900
int fa_ambig_example(struct fa *fa1, struct fa *fa2,
2901
                     char **upv, size_t *upv_len,
2902
0
                     char **pv, char **v) {
2903
0
    static const char X = '\001';
2904
0
    static const char Y = '\002';
2905
0
    char *result = NULL, *s = NULL;
2906
0
    size_t result_len = 0;
2907
0
    int ret = -1, r;
2908
0
    struct fa *mp = NULL, *ms = NULL, *sp = NULL, *ss = NULL, *amb = NULL;
2909
0
    struct fa *a1f = NULL, *a1t = NULL, *a2f = NULL, *a2t = NULL;
2910
0
    struct fa *b1 = NULL, *b2 = NULL;
2911
2912
0
    *upv = NULL;
2913
0
    *upv_len = 0;
2914
0
    if (pv != NULL)
2915
0
        *pv = NULL;
2916
0
    if (v != NULL)
2917
0
        *v = NULL;
2918
2919
0
    r = is_splittable(fa1, fa2);
2920
0
    if (r < 0)
2921
0
        goto error;
2922
0
    if (r == 1)
2923
0
        return 0;
2924
2925
0
#define Xs "\001"
2926
0
#define Ys "\002"
2927
0
#define MPs Ys Xs "(" Xs "(.|\n))+"
2928
0
#define MSs Ys Xs "(" Xs "(.|\n))*"
2929
0
#define SPs "(" Xs "(.|\n))+" Ys Xs
2930
0
#define SSs "(" Xs "(.|\n))*" Ys Xs
2931
    /* These could become static constants */
2932
0
    r = fa_compile( MPs, strlen(MPs), &mp);
2933
0
    if (r != REG_NOERROR)
2934
0
        goto error;
2935
0
    r = fa_compile( MSs, strlen(MSs), &ms);
2936
0
    if (r != REG_NOERROR)
2937
0
        goto error;
2938
0
    r = fa_compile( SPs, strlen(SPs), &sp);
2939
0
    if (r != REG_NOERROR)
2940
0
        goto error;
2941
0
    r = fa_compile( SSs, strlen(SSs), &ss);
2942
0
    if (r != REG_NOERROR)
2943
0
        goto error;
2944
0
#undef SSs
2945
0
#undef SPs
2946
0
#undef MSs
2947
0
#undef MPs
2948
0
#undef Xs
2949
0
#undef Ys
2950
2951
0
    a1f = expand_alphabet(fa1, 0, X, Y);
2952
0
    a1t = expand_alphabet(fa1, 1, X, Y);
2953
0
    a2f = expand_alphabet(fa2, 0, X, Y);
2954
0
    a2t = expand_alphabet(fa2, 1, X, Y);
2955
0
    if (a1f == NULL || a1t == NULL || a2f == NULL || a2t == NULL)
2956
0
        goto error;
2957
2958
    /* Compute b1 = ((a1f . mp) & a1t) . ms */
2959
0
    if (concat_in_place(a1f, &mp) < 0)
2960
0
        goto error;
2961
0
    b1 = fa_intersect(a1f, a1t);
2962
0
    if (b1 == NULL)
2963
0
        goto error;
2964
0
    if (concat_in_place(b1, &ms) < 0)
2965
0
        goto error;
2966
0
    if (fa_is_basic(b1, FA_EMPTY)) {
2967
        /* We are done - amb which we take an example from below
2968
         * will be empty, and there can therefore not be an ambiguity */
2969
0
        ret = 0;
2970
0
        goto done;
2971
0
    }
2972
2973
    /* Compute b2 = ss . ((sp . a2f) & a2t) */
2974
0
    if (concat_in_place(sp, &a2f) < 0)
2975
0
        goto error;
2976
0
    b2 = fa_intersect(sp, a2t);
2977
0
    if (b2 == NULL)
2978
0
        goto error;
2979
0
    if (concat_in_place(ss, &b2) < 0)
2980
0
        goto error;
2981
0
    b2 = ss;
2982
0
    ss = NULL;
2983
2984
    /* The automaton we are really interested in */
2985
0
    amb = fa_intersect(b1, b2);
2986
0
    if (amb == NULL)
2987
0
        goto error;
2988
2989
0
    size_t s_len = 0;
2990
0
    r = fa_example(amb, &s, &s_len);
2991
0
    if (r < 0)
2992
0
        goto error;
2993
2994
0
    if (s != NULL) {
2995
0
        char *t;
2996
0
        result_len = (s_len-1)/2 - 1;
2997
0
        F(ALLOC_N(result, result_len + 1));
2998
0
        t = result;
2999
0
        int i = 0;
3000
0
        for (i=0; s[2*i] == X; i++) {
3001
0
            assert((t - result) < result_len);
3002
0
            *t++ = s[2*i + 1];
3003
0
        }
3004
0
        if (pv != NULL)
3005
0
            *pv = t;
3006
0
        i += 1;
3007
3008
0
        for ( ;s[2*i] == X; i++) {
3009
0
            assert((t - result) < result_len);
3010
0
            *t++ = s[2*i + 1];
3011
0
        }
3012
0
        if (v != NULL)
3013
0
            *v = t;
3014
0
        i += 1;
3015
3016
0
        for (; 2*i+1 < s_len; i++) {
3017
0
            assert((t - result) < result_len);
3018
0
            *t++ = s[2*i + 1];
3019
0
        }
3020
0
    }
3021
0
    ret = 0;
3022
3023
0
 done:
3024
    /* Clean up intermediate automata */
3025
0
    fa_free(mp);
3026
0
    fa_free(ms);
3027
0
    fa_free(ss);
3028
0
    fa_free(sp);
3029
0
    fa_free(a1f);
3030
0
    fa_free(a1t);
3031
0
    fa_free(a2f);
3032
0
    fa_free(a2t);
3033
0
    fa_free(b1);
3034
0
    fa_free(b2);
3035
0
    fa_free(amb);
3036
3037
0
    FREE(s);
3038
0
    *upv = result;
3039
0
    if (result != NULL)
3040
0
        *upv_len = result_len;
3041
0
    return ret;
3042
0
 error:
3043
0
    FREE(result);
3044
0
    ret = -1;
3045
0
    goto done;
3046
0
}
3047
3048
/*
3049
 * Construct an fa from a regular expression
3050
 */
3051
0
static struct fa *fa_from_re(struct re *re) {
3052
0
    struct fa *result = NULL;
3053
3054
0
    switch(re->type) {
3055
0
    case UNION:
3056
0
        {
3057
0
            result = fa_from_re(re->exp1);
3058
0
            if (result == NULL)
3059
0
                goto error;
3060
0
            struct fa *fa2 = fa_from_re(re->exp2);
3061
0
            if (fa2 == NULL)
3062
0
                goto error;
3063
0
            if (union_in_place(result, &fa2) < 0)
3064
0
                goto error;
3065
0
        }
3066
0
        break;
3067
0
    case CONCAT:
3068
0
        {
3069
0
            result = fa_from_re(re->exp1);
3070
0
            if (result == NULL)
3071
0
                goto error;
3072
0
            struct fa *fa2 = fa_from_re(re->exp2);
3073
0
            if (fa2 == NULL)
3074
0
                goto error;
3075
0
            if (concat_in_place(result, &fa2) < 0)
3076
0
                goto error;
3077
0
        }
3078
0
        break;
3079
0
    case CSET:
3080
0
        result = fa_make_char_set(re->cset, re->negate);
3081
0
        break;
3082
0
    case ITER:
3083
0
        {
3084
0
            struct fa *fa = fa_from_re(re->exp);
3085
0
            if (fa == NULL)
3086
0
                goto error;
3087
0
            result = fa_iter(fa, re->min, re->max);
3088
0
            fa_free(fa);
3089
0
        }
3090
0
        break;
3091
0
    case EPSILON:
3092
0
        result = fa_make_epsilon();
3093
0
        break;
3094
0
    case CHAR:
3095
0
        result = fa_make_char(re->c);
3096
0
        break;
3097
0
    default:
3098
0
        assert(0);
3099
0
        break;
3100
0
    }
3101
0
    return result;
3102
0
 error:
3103
0
    fa_free(result);
3104
0
    return NULL;
3105
0
}
3106
3107
0
static void free_re(struct re *re) {
3108
0
    if (re == NULL)
3109
0
        return;
3110
0
    assert(re->ref == 0);
3111
3112
0
    if (re->type == UNION || re->type == CONCAT) {
3113
0
        re_unref(re->exp1);
3114
0
        re_unref(re->exp2);
3115
0
    } else if (re->type == ITER) {
3116
0
        re_unref(re->exp);
3117
0
    } else if (re->type == CSET) {
3118
0
        bitset_free(re->cset);
3119
0
    }
3120
0
    free(re);
3121
0
}
3122
3123
0
int fa_compile(const char *regexp, size_t size, struct fa **fa) {
3124
0
    struct re *re = NULL;
3125
0
    struct re_parse parse;
3126
3127
0
    *fa = NULL;
3128
3129
0
    parse.rx = regexp;
3130
0
    parse.rend = regexp + size;
3131
0
    parse.error = REG_NOERROR;
3132
3133
0
    re = parse_regexp(&parse);
3134
0
    if (re == NULL)
3135
0
        return parse.error;
3136
3137
0
    *fa = fa_from_re(re);
3138
0
    re_unref(re);
3139
3140
0
    if (*fa == NULL || collect(*fa) < 0)
3141
0
        parse.error = REG_ESPACE;
3142
0
    return parse.error;
3143
0
}
3144
3145
/* We represent a case-insensitive FA by using only transitions on
3146
 * lower-case letters.
3147
 */
3148
0
int fa_nocase(struct fa *fa) {
3149
0
    if (fa == NULL || fa->nocase)
3150
0
        return 0;
3151
3152
0
    fa->nocase = 1;
3153
0
    list_for_each(s, fa->initial) {
3154
0
        int tused = s->tused;
3155
        /* For every transition on characters in [A-Z] add a corresponding
3156
         * transition on [a-z]; remove any portion covering [A-Z] */
3157
0
        for (int i=0; i < tused; i++) {
3158
0
            struct trans *t = s->trans + i;
3159
0
            int lc_min = t->min < 'A' ? 'a' : tolower(t->min);
3160
0
            int lc_max = t->max > 'Z' ? 'z' : tolower(t->max);
3161
3162
0
            if (t->min > 'Z' || t->max < 'A')
3163
0
                continue;
3164
0
            if (t->min >= 'A' && t->max <= 'Z') {
3165
0
                t->min = tolower(t->min);
3166
0
                t->max = tolower(t->max);
3167
0
            } else if (t->max <= 'Z') {
3168
                /* t->min < 'A' */
3169
0
                t->max = 'A' - 1;
3170
0
                F(add_new_trans(s, t->to, lc_min, lc_max));
3171
0
            } else if (t->min >= 'A') {
3172
                /* t->max > 'Z' */
3173
0
                t->min = 'Z' + 1;
3174
0
                F(add_new_trans(s, t->to, lc_min, lc_max));
3175
0
            } else {
3176
                /* t->min < 'A' && t->max > 'Z' */
3177
0
                F(add_new_trans(s, t->to, 'Z' + 1, t->max));
3178
0
                s->trans[i].max = 'A' - 1;
3179
0
                F(add_new_trans(s, s->trans[i].to, lc_min, lc_max));
3180
0
            }
3181
0
        }
3182
0
    }
3183
0
    F(collect(fa));
3184
0
    return 0;
3185
0
 error:
3186
0
    return -1;
3187
0
}
3188
3189
0
int fa_is_nocase(struct fa *fa) {
3190
0
    return fa->nocase;
3191
0
}
3192
3193
/* If FA is case-insensitive, turn it into a case-sensitive automaton by
3194
 * adding transitions on upper-case letters for each existing transition on
3195
 * lower-case letters */
3196
0
static int case_expand(struct fa *fa) {
3197
0
    if (! fa->nocase)
3198
0
        return 0;
3199
3200
0
    fa->nocase = 0;
3201
0
    list_for_each(s, fa->initial) {
3202
0
        int tused = s->tused;
3203
        /* For every transition on characters in [a-z] add a corresponding
3204
         * transition on [A-Z] */
3205
0
        for (int i=0; i < tused; i++) {
3206
0
            struct trans *t = s->trans + i;
3207
0
            int lc_min = t->min < 'a' ? 'A' : toupper(t->min);
3208
0
            int lc_max = t->max > 'z' ? 'Z' : toupper(t->max);
3209
3210
0
            if (t->min > 'z' || t->max < 'a')
3211
0
                continue;
3212
0
            F(add_new_trans(s, t->to, lc_min, lc_max));
3213
0
        }
3214
0
    }
3215
0
    F(collect(fa));
3216
0
    return 0;
3217
0
 error:
3218
0
    return -1;
3219
0
}
3220
3221
/*
3222
 * Regular expression parser
3223
 */
3224
3225
0
static struct re *make_re(enum re_type type) {
3226
0
    struct re *re;
3227
0
    if (make_ref(re) == 0)
3228
0
        re->type = type;
3229
0
    return re;
3230
0
}
3231
3232
0
static struct re *make_re_rep(struct re *exp, int min, int max) {
3233
0
    struct re *re = make_re(ITER);
3234
0
    if (re) {
3235
0
        re->exp = exp;
3236
0
        re->min = min;
3237
0
        re->max = max;
3238
0
    } else {
3239
0
        re_unref(exp);
3240
0
    }
3241
0
    return re;
3242
0
}
3243
3244
static struct re *make_re_binop(enum re_type type, struct re *exp1,
3245
0
                                struct re *exp2) {
3246
0
    struct re *re = make_re(type);
3247
0
    if (re) {
3248
0
        re->exp1 = exp1;
3249
0
        re->exp2 = exp2;
3250
0
    } else {
3251
0
        re_unref(exp1);
3252
0
        re_unref(exp2);
3253
0
    }
3254
0
    return re;
3255
0
}
3256
3257
0
static struct re *make_re_char(uchar c) {
3258
0
    struct re *re = make_re(CHAR);
3259
0
    if (re)
3260
0
        re->c = c;
3261
0
    return re;
3262
0
}
3263
3264
0
static struct re *make_re_char_set(bool negate, bool no_ranges) {
3265
0
    struct re *re = make_re(CSET);
3266
0
    if (re) {
3267
0
        re->negate = negate;
3268
0
        re->no_ranges = no_ranges;
3269
0
        re->cset = bitset_init(UCHAR_NUM);
3270
0
        if (re->cset == NULL)
3271
0
            re_unref(re);
3272
0
    }
3273
0
    return re;
3274
0
}
3275
3276
0
static bool more(struct re_parse *parse) {
3277
0
    return parse->rx < parse->rend;
3278
0
}
3279
3280
0
static bool match(struct re_parse *parse, char m) {
3281
0
    if (!more(parse))
3282
0
        return false;
3283
0
    if (*parse->rx == m) {
3284
0
        parse->rx += 1;
3285
0
        return true;
3286
0
    }
3287
0
    return false;
3288
0
}
3289
3290
0
static bool peek(struct re_parse *parse, const char *chars) {
3291
0
    return *parse->rx != '\0' && strchr(chars, *parse->rx) != NULL;
3292
0
}
3293
3294
0
static bool next(struct re_parse *parse, char *c) {
3295
0
    if (!more(parse))
3296
0
        return false;
3297
0
    *c = *parse->rx;
3298
0
    parse->rx += 1;
3299
0
    return true;
3300
0
}
3301
3302
0
static bool parse_char(struct re_parse *parse, int quoted, char *c) {
3303
0
    if (!more(parse))
3304
0
        return false;
3305
0
    if (quoted && *parse->rx == '\\') {
3306
0
        parse->rx += 1;
3307
0
        return next(parse, c);
3308
0
    } else {
3309
0
        return next(parse, c);
3310
0
    }
3311
0
}
3312
3313
0
static void add_re_char(struct re *re, uchar from, uchar to) {
3314
0
    assert(re->type == CSET);
3315
0
    for (unsigned int c = from; c <= to; c++)
3316
0
        bitset_set(re->cset, c);
3317
0
}
3318
3319
0
static void parse_char_class(struct re_parse *parse, struct re *re) {
3320
0
    if (! more(parse)) {
3321
0
        parse->error = REG_EBRACK;
3322
0
        goto error;
3323
0
    }
3324
0
    char from, to;
3325
0
    parse_char(parse, 0, &from);
3326
0
    to = from;
3327
0
    if (match(parse, '-')) {
3328
0
        if (! more(parse)) {
3329
0
            parse->error = REG_EBRACK;
3330
0
            goto error;
3331
0
        }
3332
0
        if (peek(parse, "]")) {
3333
0
            if (from > to) {
3334
0
                parse->error = REG_ERANGE;
3335
0
                goto error;
3336
0
            }
3337
0
            add_re_char(re, from, to);
3338
0
            add_re_char(re, '-', '-');
3339
0
            return;
3340
0
        } else if (!parse_char(parse, 0, &to)) {
3341
0
            parse->error = REG_ERANGE;
3342
0
            goto error;
3343
0
        }
3344
0
    }
3345
0
    if (from > to) {
3346
0
        parse->error = REG_ERANGE;
3347
0
        goto error;
3348
0
    }
3349
0
    add_re_char(re, from, to);
3350
0
 error:
3351
0
    return;
3352
0
}
3353
3354
0
static struct re *parse_simple_exp(struct re_parse *parse) {
3355
0
    struct re *re = NULL;
3356
3357
0
    if (match(parse, '[')) {
3358
0
        bool negate = match(parse, '^');
3359
0
        re = make_re_char_set(negate, parse->no_ranges);
3360
0
        if (re == NULL) {
3361
0
            parse->error = REG_ESPACE;
3362
0
            goto error;
3363
0
        }
3364
0
        parse_char_class(parse, re);
3365
0
        if (parse->error != REG_NOERROR)
3366
0
            goto error;
3367
0
        while (more(parse) && ! peek(parse, "]")) {
3368
0
            parse_char_class(parse, re);
3369
0
            if (parse->error != REG_NOERROR)
3370
0
                goto error;
3371
0
        }
3372
0
        if (! match(parse, ']')) {
3373
0
            parse->error = REG_EBRACK;
3374
0
            goto error;
3375
0
        }
3376
0
    } else if (match(parse, '(')) {
3377
0
        if (match(parse, ')')) {
3378
0
            re = make_re(EPSILON);
3379
0
            if (re == NULL) {
3380
0
                parse->error = REG_ESPACE;
3381
0
                goto error;
3382
0
            }
3383
0
        } else {
3384
0
            re = parse_regexp(parse);
3385
0
            if (re == NULL)
3386
0
                goto error;
3387
0
            if (! match(parse, ')')) {
3388
0
                parse->error = REG_EPAREN;
3389
0
                goto error;
3390
0
            }
3391
0
        }
3392
0
    } else if (match(parse, '.')) {
3393
0
        re = make_re_char_set(1, parse->no_ranges);
3394
0
        if (re == NULL) {
3395
0
            parse->error = REG_ESPACE;
3396
0
            goto error;
3397
0
        }
3398
0
        add_re_char(re, '\n', '\n');
3399
0
    } else if (more(parse)) {
3400
0
        char c;
3401
0
        if (!parse_char(parse, 1, &c)) {
3402
0
            parse->error = REG_EESCAPE;
3403
0
            goto error;
3404
0
        }
3405
0
        re = make_re_char(c);
3406
0
        if (re == NULL) {
3407
0
            parse->error = REG_ESPACE;
3408
0
            goto error;
3409
0
        }
3410
0
    } else {
3411
0
        re = make_re(EPSILON);
3412
0
        if (re == NULL) {
3413
0
            parse->error = REG_ESPACE;
3414
0
            goto error;
3415
0
        }
3416
0
    }
3417
0
    return re;
3418
0
 error:
3419
0
    re_unref(re);
3420
0
    return NULL;
3421
0
}
3422
3423
0
static int parse_int(struct re_parse *parse) {
3424
0
    const char *lim;
3425
0
    char *end;
3426
0
    size_t used;
3427
0
    long l;
3428
3429
    /* We need to be careful that strtoul will never access
3430
     * memory beyond parse->rend
3431
     */
3432
0
    for (lim = parse->rx; lim < parse->rend && *lim >= '0' && *lim <= '9';
3433
0
         lim++);
3434
0
    if (lim < parse->rend) {
3435
0
        l = strtoul(parse->rx, &end, 10);
3436
0
        used = end - parse->rx;
3437
0
    } else {
3438
0
        char *s = strndup(parse->rx, parse->rend - parse->rx);
3439
0
        if (s == NULL) {
3440
0
            parse->error = REG_ESPACE;
3441
0
            return -1;
3442
0
        }
3443
0
        l = strtoul(s, &end, 10);
3444
0
        used = end - s;
3445
0
        free(s);
3446
0
    }
3447
3448
0
    if (used == 0)
3449
0
        return -1;
3450
0
    parse->rx += used;
3451
0
    if ((l<0) || (l > INT_MAX)) {
3452
0
        parse->error = REG_BADBR;
3453
0
        return -1;
3454
0
    }
3455
0
    return (int) l;
3456
0
}
3457
3458
0
static struct re *parse_repeated_exp(struct re_parse *parse) {
3459
0
    struct re *re = parse_simple_exp(parse);
3460
0
    if (re == NULL)
3461
0
        goto error;
3462
0
    if (match(parse, '?')) {
3463
0
        re = make_re_rep(re, 0, 1);
3464
0
    } else if (match(parse, '*')) {
3465
0
        re = make_re_rep(re, 0, -1);
3466
0
    } else if (match(parse, '+')) {
3467
0
        re = make_re_rep(re, 1, -1);
3468
0
    } else if (match(parse, '{')) {
3469
0
        int min, max;
3470
0
        min = parse_int(parse);
3471
0
        if (min == -1)
3472
0
            goto error;
3473
0
        if (match(parse, ',')) {
3474
0
            max = parse_int(parse);
3475
0
            if (max == -1)
3476
0
                max = -1;      /* If it's not an int, it means 'unbounded' */
3477
0
            if (! match(parse, '}')) {
3478
0
                parse->error = REG_EBRACE;
3479
0
                goto error;
3480
0
            }
3481
0
        } else if (match(parse, '}')) {
3482
0
            max = min;
3483
0
        } else {
3484
0
            parse->error = REG_EBRACE;
3485
0
            goto error;
3486
0
        }
3487
0
        if (min > max && max != -1) {
3488
0
            parse->error = REG_BADBR;
3489
0
            goto error;
3490
0
        }
3491
0
        re = make_re_rep(re, min, max);
3492
0
    }
3493
0
    if (re == NULL)
3494
0
        parse->error = REG_ESPACE;
3495
0
    return re;
3496
0
 error:
3497
0
    re_unref(re);
3498
0
    return NULL;
3499
0
}
3500
3501
0
static struct re *parse_concat_exp(struct re_parse *parse) {
3502
0
    struct re *re = parse_repeated_exp(parse);
3503
0
    if (re == NULL)
3504
0
        goto error;
3505
3506
0
    if (more(parse) && ! peek(parse, ")|")) {
3507
0
        struct re *re2 = parse_concat_exp(parse);
3508
0
        if (re2 == NULL)
3509
0
            goto error;
3510
0
        re = make_re_binop(CONCAT, re, re2);
3511
0
        if (re == NULL) {
3512
0
            parse->error = REG_ESPACE;
3513
0
            goto error;
3514
0
        }
3515
0
    }
3516
0
    return re;
3517
3518
0
 error:
3519
0
    re_unref(re);
3520
0
    return NULL;
3521
0
}
3522
3523
0
static struct re *parse_regexp(struct re_parse *parse) {
3524
0
    struct re *re = NULL;
3525
3526
    /* Something like (|r) */
3527
0
    if (peek(parse, "|"))
3528
0
        re = make_re(EPSILON);
3529
0
    else
3530
0
        re = parse_concat_exp(parse);
3531
0
    if (re == NULL)
3532
0
        goto error;
3533
3534
0
    if (match(parse, '|')) {
3535
0
        struct re *re2 = NULL;
3536
        /* Something like (r|) */
3537
0
        if (peek(parse, ")"))
3538
0
            re2 = make_re(EPSILON);
3539
0
        else
3540
0
            re2 = parse_regexp(parse);
3541
0
        if (re2 == NULL)
3542
0
            goto error;
3543
3544
0
        re = make_re_binop(UNION, re, re2);
3545
0
        if (re == NULL) {
3546
0
            parse->error = REG_ESPACE;
3547
0
            goto error;
3548
0
        }
3549
0
    }
3550
0
    return re;
3551
3552
0
 error:
3553
0
    re_unref(re);
3554
0
    return NULL;
3555
0
}
3556
3557
/*
3558
 * Convert a STRUCT RE to a string. Code is complicated by the fact that
3559
 * we try to be clever and avoid unneeded parens and concatenation with
3560
 * epsilon etc.
3561
 */
3562
static int re_as_string(const struct re *re, struct re_str *str);
3563
3564
0
static int re_binop_count(enum re_type type, const struct re *re) {
3565
0
    assert(type == CONCAT || type == UNION);
3566
0
    if (re->type == type) {
3567
0
        return re_binop_count(type, re->exp1) + re_binop_count(type, re->exp2);
3568
0
    } else {
3569
0
        return 1;
3570
0
    }
3571
0
}
3572
3573
static int re_binop_store(enum re_type type, const struct re *re,
3574
0
                          const struct re **list) {
3575
0
    int pos = 0;
3576
0
    if (type == re->type) {
3577
0
        pos = re_binop_store(type, re->exp1, list);
3578
0
        pos += re_binop_store(type, re->exp2, list + pos);
3579
0
    } else {
3580
0
        list[0] = re;
3581
0
        pos = 1;
3582
0
    }
3583
0
    return pos;
3584
0
}
3585
3586
0
static int re_union_as_string(const struct re *re, struct re_str *str) {
3587
0
    assert(re->type == UNION);
3588
3589
0
    int result = -1;
3590
0
    const struct re **res = NULL;
3591
0
    struct re_str *strings = NULL;
3592
0
    int nre = 0, r;
3593
3594
0
    nre = re_binop_count(re->type, re);
3595
0
    r = ALLOC_N(res, nre);
3596
0
    if (r < 0)
3597
0
        goto done;
3598
3599
0
    re_binop_store(re->type, re, res);
3600
3601
0
    r = ALLOC_N(strings, nre);
3602
0
    if (r < 0)
3603
0
        goto error;
3604
3605
0
    str->len = 0;
3606
0
    for (int i=0; i < nre; i++) {
3607
0
        if (re_as_string(res[i], strings + i) < 0)
3608
0
            goto error;
3609
0
        str->len += strings[i].len;
3610
0
    }
3611
0
    str->len += nre-1;
3612
3613
0
    r = re_str_alloc(str);
3614
0
    if (r < 0)
3615
0
        goto error;
3616
3617
0
    char *p = str->rx;
3618
0
    for (int i=0; i < nre; i++) {
3619
0
        if (i>0)
3620
0
            *p++ = '|';
3621
0
        memcpy(p, strings[i].rx, strings[i].len);
3622
0
        p += strings[i].len;
3623
0
    }
3624
3625
0
    result = 0;
3626
0
 done:
3627
0
    free(res);
3628
0
    if (strings != NULL) {
3629
0
        for (int i=0; i < nre; i++)
3630
0
            release_re_str(strings + i);
3631
0
    }
3632
0
    free(strings);
3633
0
    return result;
3634
0
 error:
3635
0
    release_re_str(str);
3636
0
    result = -1;
3637
0
    goto done;
3638
0
}
3639
3640
ATTRIBUTE_PURE
3641
0
static int re_needs_parens_in_concat(const struct re *re) {
3642
0
    return (re->type != CHAR && re->type != CSET && re->type != ITER);
3643
0
}
3644
3645
0
static int re_concat_as_string(const struct re *re, struct re_str *str) {
3646
0
    assert(re->type == CONCAT);
3647
3648
0
    const struct re **res = NULL;
3649
0
    struct re_str *strings = NULL;
3650
0
    int nre = 0, r;
3651
0
    int result = -1;
3652
3653
0
    nre = re_binop_count(re->type, re);
3654
0
    r = ALLOC_N(res, nre);
3655
0
    if (r < 0)
3656
0
        goto error;
3657
0
    re_binop_store(re->type, re, res);
3658
3659
0
    r = ALLOC_N(strings, nre);
3660
0
    if (r < 0)
3661
0
        goto error;
3662
3663
0
    str->len = 0;
3664
0
    for (int i=0; i < nre; i++) {
3665
0
        if (res[i]->type == EPSILON)
3666
0
            continue;
3667
0
        if (re_as_string(res[i], strings + i) < 0)
3668
0
            goto error;
3669
0
        str->len += strings[i].len;
3670
0
        if (re_needs_parens_in_concat(res[i]))
3671
0
            str->len += 2;
3672
0
    }
3673
3674
0
    r = re_str_alloc(str);
3675
0
    if (r < 0)
3676
0
        goto error;
3677
3678
0
    char *p = str->rx;
3679
0
    for (int i=0; i < nre; i++) {
3680
0
        if (res[i]->type == EPSILON)
3681
0
            continue;
3682
0
        if (re_needs_parens_in_concat(res[i]))
3683
0
            *p++ = '(';
3684
0
        p = memcpy(p, strings[i].rx, strings[i].len);
3685
0
        p += strings[i].len;
3686
0
        if (re_needs_parens_in_concat(res[i]))
3687
0
            *p++ = ')';
3688
0
    }
3689
3690
0
    result = 0;
3691
0
 done:
3692
0
    free(res);
3693
0
    if (strings != NULL) {
3694
0
        for (int i=0; i < nre; i++)
3695
0
            release_re_str(strings + i);
3696
0
    }
3697
0
    free(strings);
3698
0
    return result;
3699
0
 error:
3700
0
    release_re_str(str);
3701
0
    result = -1;
3702
0
    goto done;
3703
0
}
3704
3705
0
static bool cset_contains(const struct re *cset, int c) {
3706
0
    return bitset_get(cset->cset, c) != cset->negate;
3707
0
}
3708
3709
0
static int re_cset_as_string(const struct re *re, struct re_str *str) {
3710
0
    const uchar rbrack = ']';
3711
0
    const uchar dash = '-';
3712
0
    const uchar nul = '\0';
3713
3714
0
    static const char *const empty_set = "[]";
3715
0
    static const char *const total_set = "(.|\n)";
3716
0
    static const char *const not_newline = ".";
3717
3718
0
    char *s;
3719
0
    int from, to, negate;
3720
0
    size_t len;
3721
0
    int incl_rbrack, incl_dash;
3722
0
    int r;
3723
3724
0
    str->len = strlen(empty_set);
3725
3726
    /* We can not include NUL explicitly in a CSET since we use ordinary
3727
       NUL delimited strings to represent them. That means that we need to
3728
       use negated representation if NUL is to be included (and vice versa)
3729
    */
3730
0
    negate = cset_contains(re, nul);
3731
0
    if (negate) {
3732
0
        for (from = UCHAR_MIN;
3733
0
             from <= UCHAR_MAX && cset_contains(re, from);
3734
0
             from += 1);
3735
0
        if (from > UCHAR_MAX) {
3736
            /* Special case: the set matches every character */
3737
0
            str->rx = strdup(total_set);
3738
0
            goto done;
3739
0
        }
3740
0
        if (from == '\n') {
3741
0
            for (from += 1;
3742
0
                 from <= UCHAR_MAX && cset_contains(re, from);
3743
0
                 from += 1);
3744
0
            if (from > UCHAR_MAX) {
3745
                /* Special case: the set matches everything but '\n' */
3746
0
                str->rx = strdup(not_newline);
3747
0
                goto done;
3748
0
            }
3749
0
        }
3750
0
    }
3751
3752
    /* See if ']' and '-' will be explicitly included in the character set
3753
       (INCL_RBRACK, INCL_DASH) As we loop over the character set, we reset
3754
       these flags if they are in the set, but not mentioned explicitly
3755
    */
3756
0
    incl_rbrack = cset_contains(re, rbrack) != negate;
3757
0
    incl_dash = cset_contains(re, dash) != negate;
3758
3759
0
    if (re->no_ranges) {
3760
0
        for (from = UCHAR_MIN; from <= UCHAR_MAX; from++)
3761
0
            if (cset_contains(re, from) != negate)
3762
0
                str->len += 1;
3763
0
    } else {
3764
0
        for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
3765
0
            while (from <= UCHAR_MAX && cset_contains(re, from) == negate)
3766
0
                from += 1;
3767
0
            if (from > UCHAR_MAX)
3768
0
                break;
3769
0
            for (to = from;
3770
0
                 to < UCHAR_MAX && (cset_contains(re, to+1) != negate);
3771
0
                 to++);
3772
3773
0
            if (to == from && (from == rbrack || from == dash))
3774
0
                continue;
3775
0
            if (from == rbrack || from == dash)
3776
0
                from += 1;
3777
0
            if (to == rbrack || to == dash)
3778
0
                to -= 1;
3779
3780
0
            len = (to == from) ? 1 : ((to == from + 1) ? 2 : 3);
3781
3782
0
            if (from < rbrack && rbrack < to)
3783
0
                incl_rbrack = 0;
3784
0
            if (from < dash && dash < to)
3785
0
                incl_dash = 0;
3786
0
            str->len += len;
3787
0
        }
3788
0
        str->len += incl_rbrack + incl_dash;
3789
0
    }
3790
0
    if (negate)
3791
0
        str->len += 1;        /* For the ^ */
3792
3793
0
    r = re_str_alloc(str);
3794
0
    if (r < 0)
3795
0
        goto error;
3796
3797
0
    s = str->rx;
3798
0
    *s++ = '[';
3799
0
    if (negate)
3800
0
        *s++ = '^';
3801
0
    if (incl_rbrack)
3802
0
        *s++ = rbrack;
3803
3804
0
    if (re->no_ranges) {
3805
0
        for (from = UCHAR_MIN; from <= UCHAR_MAX; from++) {
3806
0
            if (from == rbrack || from == dash)
3807
0
                continue;
3808
0
            if (cset_contains(re, from) != negate)
3809
0
                *s++ = from;
3810
0
        }
3811
0
    } else {
3812
0
        for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
3813
0
            while (from <= UCHAR_MAX && cset_contains(re, from) == negate)
3814
0
                from += 1;
3815
0
            if (from > UCHAR_MAX)
3816
0
                break;
3817
0
            for (to = from;
3818
0
                 to < UCHAR_MAX && (cset_contains(re, to+1) != negate);
3819
0
                 to++);
3820
3821
0
            if (to == from && (from == rbrack || from == dash))
3822
0
                continue;
3823
0
            if (from == rbrack || from == dash)
3824
0
                from += 1;
3825
0
            if (to == rbrack || to == dash)
3826
0
                to -= 1;
3827
3828
0
            if (to == from) {
3829
0
                *s++ = from;
3830
0
            } else if (to == from + 1) {
3831
0
                *s++ = from;
3832
0
                *s++ = to;
3833
0
            } else {
3834
0
                *s++ = from;
3835
0
                *s++ = '-';
3836
0
                *s++ = to;
3837
0
            }
3838
0
        }
3839
0
    }
3840
0
    if (incl_dash)
3841
0
        *s++ = dash;
3842
3843
0
    *s = ']';
3844
0
 done:
3845
0
    if (str->rx == NULL)
3846
0
        goto error;
3847
0
    str->len = strlen(str->rx);
3848
0
    return 0;
3849
0
 error:
3850
0
    release_re_str(str);
3851
0
    return -1;
3852
0
}
3853
3854
0
static int re_iter_as_string(const struct re *re, struct re_str *str) {
3855
0
    const char *quant = NULL;
3856
0
    char *iter = NULL;
3857
0
    int r, result = -1;
3858
3859
0
    if (re_as_string(re->exp, str) < 0)
3860
0
        return -1;
3861
3862
0
    if (re->min == 0 && re->max == -1) {
3863
0
        quant = "*";
3864
0
    } else if (re->min == 1 && re->max == -1) {
3865
0
        quant = "+";
3866
0
    } else if (re->min == 0 && re->max == 1) {
3867
0
        quant = "?";
3868
0
    } else if (re->max == -1) {
3869
0
        r = asprintf(&iter, "{%d,}", re->min);
3870
0
        if (r < 0)
3871
0
            return -1;
3872
0
        quant = iter;
3873
0
    } else {
3874
0
        r = asprintf(&iter, "{%d,%d}", re->min, re->max);
3875
0
        if (r < 0)
3876
0
            return -1;
3877
0
        quant = iter;
3878
0
    }
3879
3880
0
    if (re->exp->type == CHAR || re->exp->type == CSET) {
3881
0
        if (REALLOC_N(str->rx, str->len + strlen(quant) + 1) < 0)
3882
0
            goto error;
3883
0
        strcpy(str->rx + str->len, quant);
3884
0
        str->len += strlen(quant);
3885
0
    } else {
3886
        /* Format '(' + str->rx ')' + quant */
3887
0
        if (REALLOC_N(str->rx, str->len + strlen(quant) + 1 + 2) < 0)
3888
0
            goto error;
3889
0
        memmove(str->rx + 1, str->rx, str->len);
3890
0
        str->rx[0] = '(';
3891
0
        str->rx[str->len + 1] = ')';
3892
0
        str->len += 2;
3893
0
        strcpy(str->rx + str->len, quant);
3894
0
        str->len += strlen(quant);
3895
0
    }
3896
3897
0
    result = 0;
3898
0
 done:
3899
0
    FREE(iter);
3900
0
    return result;
3901
0
 error:
3902
0
    release_re_str(str);
3903
0
    goto done;
3904
0
}
3905
3906
0
static int re_as_string(const struct re *re, struct re_str *str) {
3907
    /* Characters that must be escaped */
3908
0
    static const char * const special_chars = ".()[]{}*|+?\\^$";
3909
0
    int result = 0;
3910
3911
0
    switch(re->type) {
3912
0
    case UNION:
3913
0
        result = re_union_as_string(re, str);
3914
0
        break;
3915
0
    case CONCAT:
3916
0
        result = re_concat_as_string(re, str);
3917
0
        break;
3918
0
    case CSET:
3919
0
        result = re_cset_as_string(re, str);
3920
0
        break;
3921
0
    case CHAR:
3922
0
        if (re->c == '\0' || strchr(special_chars, re->c) == NULL) {
3923
0
            if (ALLOC_N(str->rx, 2) < 0)
3924
0
                goto error;
3925
0
            str->rx[0] = re->c;
3926
0
            str->len = 1;
3927
0
        } else {
3928
0
            if (ALLOC_N(str->rx, 3) < 0)
3929
0
                goto error;
3930
0
            str->rx[0] = '\\';
3931
0
            str->rx[1] = re->c;
3932
0
            str->len = strlen(str->rx);
3933
0
        }
3934
0
        break;
3935
0
    case ITER:
3936
0
        result = re_iter_as_string(re, str);
3937
0
        break;
3938
0
    case EPSILON:
3939
0
        if (ALLOC_N(str->rx, 3) < 0)
3940
0
            goto error;
3941
0
        strcpy(str->rx, "()");
3942
0
        str->len = strlen(str->rx);
3943
0
        break;
3944
0
    default:
3945
0
        assert(0);
3946
0
        abort();
3947
0
        break;
3948
0
    }
3949
0
    return result;
3950
0
 error:
3951
0
    release_re_str(str);
3952
0
    return -1;
3953
0
}
3954
3955
0
static int convert_trans_to_re(struct state *s) {
3956
0
    struct re *re = NULL;
3957
0
    size_t nto = 1;
3958
0
    struct trans *trans = NULL;
3959
0
    int r;
3960
3961
0
    if (s->tused == 0)
3962
0
        return 0;
3963
3964
0
    qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp);
3965
0
    for (int i = 0; i < s->tused - 1; i++) {
3966
0
        if (s->trans[i].to != s->trans[i+1].to)
3967
0
            nto += 1;
3968
0
    }
3969
0
    r = ALLOC_N(trans, nto);
3970
0
    if (r < 0)
3971
0
        goto error;
3972
3973
0
    struct state *to = s->trans[0].to;
3974
0
    int tind = 0;
3975
0
    for_each_trans(t, s) {
3976
0
        if (t->to != to) {
3977
0
            trans[tind].to = to;
3978
0
            trans[tind].re = re;
3979
0
            tind += 1;
3980
0
            re = NULL;
3981
0
            to = t->to;
3982
0
        }
3983
0
        if (re == NULL) {
3984
0
            re = make_re_char_set(0, 0);
3985
0
            if (re == NULL)
3986
0
                goto error;
3987
0
        }
3988
0
        add_re_char(re, t->min, t->max);
3989
0
    }
3990
0
    assert(nto == tind + 1);
3991
0
    trans[tind].to = to;
3992
0
    trans[tind].re = re;
3993
3994
    /* Simplify CSETs with a single char to a CHAR */
3995
0
    for (int t=0; t < nto; t++) {
3996
0
        int cnt = 0;
3997
0
        uchar chr = UCHAR_MIN;
3998
0
        for (int c = 0; c < UCHAR_NUM; c++) {
3999
0
            if (bitset_get(trans[t].re->cset, c)) {
4000
0
                cnt += 1;
4001
0
                chr = c;
4002
0
            }
4003
0
        }
4004
0
        if (cnt == 1) {
4005
0
            re_unref(trans[t].re);
4006
0
            trans[t].re = make_re_char(chr);
4007
0
            if (trans[t].re == NULL)
4008
0
                goto error;
4009
0
        }
4010
0
    }
4011
0
    free_trans(s);
4012
0
    s->trans = trans;
4013
0
    s->tused = s->tsize = nto;
4014
0
    return 0;
4015
4016
0
 error:
4017
0
    if (trans)
4018
0
        for (int i=0; i < nto; i++)
4019
0
            unref(trans[i].re, re);
4020
0
    free(trans);
4021
0
    return -1;
4022
0
}
4023
4024
ATTRIBUTE_RETURN_CHECK
4025
static int add_new_re_trans(struct state *s1, struct state *s2,
4026
0
                            struct re *re) {
4027
0
    int r;
4028
0
    r = add_new_trans(s1, s2, 0, 0);
4029
0
    if (r < 0)
4030
0
        return -1;
4031
0
    last_trans(s1)->re = re;
4032
0
    return 0;
4033
0
}
4034
4035
/* Add the regular expression R1 . LOOP* . R2 to the transition
4036
   from S1 to S2. */
4037
static int re_collapse_trans(struct state *s1, struct state *s2,
4038
0
                             struct re *r1, struct re *loop, struct re *r2) {
4039
0
    struct re *re = NULL;
4040
4041
0
    if (loop->type != EPSILON) {
4042
0
        loop = make_re_rep(ref(loop), 0, -1);
4043
0
        if (loop == NULL)
4044
0
            goto error;
4045
0
    }
4046
4047
0
    if (r1->type == EPSILON) {
4048
0
        if (loop->type == EPSILON) {
4049
0
            re = ref(r2);
4050
0
        } else {
4051
0
            re = make_re_binop(CONCAT, loop, ref(r2));
4052
0
        }
4053
0
    } else {
4054
0
        if (loop->type == EPSILON) {
4055
0
            if (r2->type == EPSILON) {
4056
0
                re = ref(r1);
4057
0
            } else {
4058
0
                re = make_re_binop(CONCAT, ref(r1), ref(r2));
4059
0
            }
4060
0
        } else {
4061
0
            re = make_re_binop(CONCAT, ref(r1), loop);
4062
0
            if (re != NULL && r2->type != EPSILON) {
4063
0
                re = make_re_binop(CONCAT, re, ref(r2));
4064
0
            }
4065
0
        }
4066
0
    }
4067
0
    if (re == NULL)
4068
0
        goto error;
4069
4070
0
    struct trans *t = NULL;
4071
0
    for (t = s1->trans; t <= last_trans(s1) && t->to != s2; t += 1);
4072
0
    if (t > last_trans(s1)) {
4073
0
        if (add_new_re_trans(s1, s2, re) < 0)
4074
0
            goto error;
4075
0
    } else {
4076
0
        if (t->re == NULL) {
4077
0
            t->re = re;
4078
0
        } else {
4079
0
            t->re = make_re_binop(UNION, re, t->re);
4080
0
            if (t->re == NULL)
4081
0
                goto error;
4082
0
        }
4083
0
    }
4084
0
    return 0;
4085
0
 error:
4086
    // FIXME: make sure we don't leak loop
4087
0
    return -1;
4088
0
}
4089
4090
0
static int convert_strings(struct fa *fa) {
4091
0
    struct state_set *worklist = state_set_init(-1, S_NONE);
4092
0
    int result = -1;
4093
4094
0
    E(worklist == NULL);
4095
4096
    /* Abuse hash to count indegree, reachable to mark visited states */
4097
0
    list_for_each(s, fa->initial) {
4098
0
        s->hash = 0;
4099
0
        s->reachable = 0;
4100
0
    }
4101
4102
0
    list_for_each(s, fa->initial) {
4103
0
        for_each_trans(t, s) {
4104
0
            t->to->hash += 1;
4105
0
        }
4106
0
    }
4107
4108
0
    for (struct state *s = fa->initial;
4109
0
         s != NULL;
4110
0
         s = state_set_pop(worklist)) {
4111
0
        for (int i=0; i < s->tused; i++) {
4112
0
            struct trans *t = s->trans + i;
4113
0
            struct state *to = t->to;
4114
0
            while (to->hash == 1 && to->tused == 1 && ! to->accept) {
4115
0
                if (t->re == NULL) {
4116
0
                    t->re = to->trans->re;
4117
0
                    to->trans->re = NULL;
4118
0
                } else {
4119
0
                    t->re = make_re_binop(CONCAT, t->re, to->trans->re);
4120
0
                    if (t->re == NULL)
4121
0
                        goto error;
4122
0
                }
4123
0
                t->to = to->trans->to;
4124
0
                to->tused = 0;
4125
0
                to->hash -= 1;
4126
0
                to = t->to;
4127
0
                for (int j=0; j < s->tused; j++) {
4128
0
                    if (j != i && s->trans[j].to == to) {
4129
                        /* Combine transitions i and j; remove trans j */
4130
0
                        t->re = make_re_binop(UNION, t->re, s->trans[j].re);
4131
0
                        if (t->re == NULL)
4132
0
                            goto error;
4133
0
                        memmove(s->trans + j, s->trans + j + 1,
4134
0
                                sizeof(s->trans[j]) * (s->tused - j - 1));
4135
0
                        to->hash -= 1;
4136
0
                        s->tused -= 1;
4137
0
                        if (j < i) {
4138
0
                            i = i - 1;
4139
0
                            t = s->trans + i;
4140
0
                        }
4141
0
                    }
4142
0
                }
4143
0
            }
4144
4145
0
            if (! to->reachable) {
4146
0
                to->reachable = 1;
4147
0
                F(state_set_push(worklist, to));
4148
0
            }
4149
0
        }
4150
0
    }
4151
4152
0
    for (struct state *s = fa->initial; s->next != NULL; ) {
4153
0
        if (s->next->hash == 0 && s->next->tused == 0) {
4154
0
            struct state *del = s->next;
4155
0
            s->next = del->next;
4156
0
            free(del->trans);
4157
0
            free(del);
4158
0
        } else {
4159
0
            s = s->next;
4160
0
        }
4161
0
    }
4162
0
    result = 0;
4163
4164
0
 error:
4165
0
    state_set_free(worklist);
4166
0
    return result;
4167
0
}
4168
4169
/* Convert an FA to a regular expression.
4170
 * The strategy is the following:
4171
 * (1) For all states S1 and S2, convert the transitions between them
4172
 *     into one transition whose regexp is a CSET
4173
 * (2) Add a new initial state INI and a new final state FIN to the automaton,
4174
 *     a transition from INI to the old initial state of FA, and a transition
4175
 *     from all accepting states of FA to FIN; the regexp on those transitions
4176
 *     matches only the empty word
4177
 * (3) Eliminate states S (except for INI and FIN) one by one:
4178
 *     Let LOOP the regexp for the transition S -> S if it exists, epsilon
4179
 *     otherwise.
4180
 *     For all S1, S2 different from S with S1 -> S -> S2
4181
 *       Let R1 the regexp of S1 -> S
4182
 *           R2 the regexp of S -> S2
4183
 *           R3 the regexp S1 -> S2 (or epsilon if no such transition)
4184
 *        set the regexp on the transition S1 -> S2 to
4185
 *          R1 . (LOOP)* . R2 | R3
4186
 * (4) The regexp for the whole FA can now be found as the regexp of
4187
 *     the transition INI -> FIN
4188
 * (5) Convert that STRUCT RE to a string with RE_AS_STRING
4189
 */
4190
0
int fa_as_regexp(struct fa *fa, char **regexp, size_t *regexp_len) {
4191
0
    int r;
4192
0
    struct state *fin = NULL, *ini = NULL;
4193
0
    struct re *eps = NULL;
4194
4195
0
    *regexp = NULL;
4196
0
    *regexp_len = 0;
4197
0
    fa = fa_clone(fa);
4198
0
    if (fa == NULL)
4199
0
        goto error;
4200
4201
0
    eps = make_re(EPSILON);
4202
0
    if (eps == NULL)
4203
0
        goto error;
4204
4205
0
    fin = add_state(fa,1);
4206
0
    if (fin == NULL)
4207
0
        goto error;
4208
4209
0
    fa->trans_re = 1;
4210
4211
0
    list_for_each(s, fa->initial) {
4212
0
        r = convert_trans_to_re(s);
4213
0
        if (r < 0)
4214
0
            goto error;
4215
0
        if (s->accept && s != fin) {
4216
0
            r = add_new_re_trans(s, fin, ref(eps));
4217
0
            if (r < 0)
4218
0
                goto error;
4219
0
            s->accept = 0;
4220
0
        }
4221
0
    }
4222
4223
0
    ini = add_state(fa, 0);
4224
0
    if (ini == NULL)
4225
0
        goto error;
4226
4227
0
    r = add_new_re_trans(ini, fa->initial, ref(eps));
4228
0
    if (r < 0)
4229
0
        goto error;
4230
0
    set_initial(fa, ini);
4231
4232
0
    convert_strings(fa);
4233
4234
0
    list_for_each(s, fa->initial->next) {
4235
0
        if (s == fin)
4236
0
            continue;
4237
        /* Eliminate S */
4238
0
        struct re *loop = eps;
4239
0
        for_each_trans(t, s) {
4240
0
            if (t->to == s)
4241
0
                loop = t->re;
4242
0
        }
4243
0
        list_for_each(s1, fa->initial) {
4244
0
            if (s == s1)
4245
0
                continue;
4246
0
            for (int t1 = 0; t1 < s1->tused; t1++) {
4247
0
                if (s1->trans[t1].to == s) {
4248
0
                    for (int t = 0; t < s->tused; t++) {
4249
0
                        if (s->trans[t].to == s)
4250
0
                            continue;
4251
0
                        r = re_collapse_trans(s1, s->trans[t].to,
4252
0
                                              s1->trans[t1].re,
4253
0
                                              loop,
4254
0
                                              s->trans[t].re);
4255
0
                        if (r < 0)
4256
0
                            goto error;
4257
0
                    }
4258
0
                }
4259
0
            }
4260
0
        }
4261
0
    }
4262
4263
0
    re_unref(eps);
4264
4265
0
    for_each_trans(t, fa->initial) {
4266
0
        if (t->to == fin) {
4267
0
            struct re_str str;
4268
0
            MEMZERO(&str, 1);
4269
0
            if (re_as_string(t->re, &str) < 0)
4270
0
                goto error;
4271
0
            *regexp = str.rx;
4272
0
            *regexp_len = str.len;
4273
0
        }
4274
0
    }
4275
4276
0
    list_for_each(s, fa->initial) {
4277
0
        for_each_trans(t, s) {
4278
0
            unref(t->re, re);
4279
0
        }
4280
0
    }
4281
0
    fa_free(fa);
4282
4283
0
    return 0;
4284
0
 error:
4285
0
    fa_free(fa);
4286
0
    re_unref(eps);
4287
0
    return -1;
4288
0
}
4289
4290
0
static int re_restrict_alphabet(struct re *re, uchar from, uchar to) {
4291
0
    int r1, r2;
4292
0
    int result = 0;
4293
4294
0
    switch(re->type) {
4295
0
    case UNION:
4296
0
    case CONCAT:
4297
0
        r1 = re_restrict_alphabet(re->exp1, from, to);
4298
0
        r2 = re_restrict_alphabet(re->exp2, from, to);
4299
0
        result = (r1 != 0) ? r1 : r2;
4300
0
        break;
4301
0
    case CSET:
4302
0
        if (re->negate) {
4303
0
            re->negate = 0;
4304
0
            bitset_negate(re->cset, UCHAR_NUM);
4305
0
        }
4306
0
        for (int i=from; i <= to; i++)
4307
0
            bitset_clr(re->cset, i);
4308
0
        break;
4309
0
    case CHAR:
4310
0
        if (from <= re->c && re->c <= to)
4311
0
            result = -1;
4312
0
        break;
4313
0
    case ITER:
4314
0
        result = re_restrict_alphabet(re->exp, from, to);
4315
0
        break;
4316
0
    case EPSILON:
4317
0
        break;
4318
0
    default:
4319
0
        assert(0);
4320
0
        abort();
4321
0
        break;
4322
0
    }
4323
0
    return result;
4324
0
}
4325
4326
int fa_restrict_alphabet(const char *regexp, size_t regexp_len,
4327
                         char **newregexp, size_t *newregexp_len,
4328
0
                         char from, char to) {
4329
0
    int result;
4330
0
    struct re *re = NULL;
4331
0
    struct re_parse parse;
4332
0
    struct re_str str;
4333
4334
0
    *newregexp = NULL;
4335
0
    MEMZERO(&parse, 1);
4336
0
    parse.rx = regexp;
4337
0
    parse.rend = regexp + regexp_len;
4338
0
    parse.error = REG_NOERROR;
4339
0
    re = parse_regexp(&parse);
4340
0
    if (parse.error != REG_NOERROR)
4341
0
        return parse.error;
4342
4343
0
    result = re_restrict_alphabet(re, from, to);
4344
0
    if (result != 0) {
4345
0
        result = -2;
4346
0
        goto done;
4347
0
    }
4348
4349
0
    MEMZERO(&str, 1);
4350
0
    result = re_as_string(re, &str);
4351
0
    *newregexp = str.rx;
4352
0
    *newregexp_len = str.len;
4353
0
 done:
4354
0
    re_unref(re);
4355
0
    return result;
4356
0
}
4357
4358
int fa_expand_char_ranges(const char *regexp, size_t regexp_len,
4359
0
                          char **newregexp, size_t *newregexp_len) {
4360
0
    int result;
4361
0
    struct re *re = NULL;
4362
0
    struct re_parse parse;
4363
0
    struct re_str str;
4364
4365
0
    *newregexp = NULL;
4366
0
    MEMZERO(&parse, 1);
4367
0
    parse.rx = regexp;
4368
0
    parse.rend = regexp + regexp_len;
4369
0
    parse.error = REG_NOERROR;
4370
0
    parse.no_ranges = 1;
4371
0
    re = parse_regexp(&parse);
4372
0
    if (parse.error != REG_NOERROR)
4373
0
        return parse.error;
4374
4375
0
    MEMZERO(&str, 1);
4376
0
    result = re_as_string(re, &str);
4377
0
    *newregexp = str.rx;
4378
0
    *newregexp_len = str.len;
4379
0
    re_unref(re);
4380
0
    return result;
4381
0
}
4382
4383
/* Expand regexp so that it is case-insensitive in a case-sensitive match.
4384
 *
4385
 * Return 1 when a change was made, -1 when an allocation failed, and 0
4386
 * when no change was made.
4387
 */
4388
0
static int re_case_expand(struct re *re) {
4389
0
    int result = 0, r1, r2;
4390
4391
0
    switch(re->type) {
4392
0
    case UNION:
4393
0
    case CONCAT:
4394
0
        r1 = re_case_expand(re->exp1);
4395
0
        r2 = re_case_expand(re->exp2);
4396
0
        result = (r1 != 0) ? r1 : r2;
4397
0
        break;
4398
0
    case CSET:
4399
0
        for (int c = 'A'; c <= 'Z'; c++)
4400
0
            if (bitset_get(re->cset, c)) {
4401
0
                result = 1;
4402
0
                bitset_set(re->cset, tolower(c));
4403
0
            }
4404
0
        for (int c = 'a'; c <= 'z'; c++)
4405
0
            if (bitset_get(re->cset, c)) {
4406
0
                result = 1;
4407
0
                bitset_set(re->cset, toupper(c));
4408
0
            }
4409
0
        break;
4410
0
    case CHAR:
4411
0
        if (isalpha(re->c)) {
4412
0
            int c = re->c;
4413
0
            re->type = CSET;
4414
0
            re->negate = false;
4415
0
            re->no_ranges = 0;
4416
0
            re->cset = bitset_init(UCHAR_NUM);
4417
0
            if (re->cset == NULL)
4418
0
                return -1;
4419
0
            bitset_set(re->cset, tolower(c));
4420
0
            bitset_set(re->cset, toupper(c));
4421
0
            result = 1;
4422
0
        }
4423
0
        break;
4424
0
    case ITER:
4425
0
        result = re_case_expand(re->exp);
4426
0
        break;
4427
0
    case EPSILON:
4428
0
        break;
4429
0
    default:
4430
0
        assert(0);
4431
0
        abort();
4432
0
        break;
4433
0
    }
4434
0
    return result;
4435
0
}
4436
4437
int fa_expand_nocase(const char *regexp, size_t regexp_len,
4438
0
                     char **newregexp, size_t *newregexp_len) {
4439
0
    int result, r;
4440
0
    struct re *re = NULL;
4441
0
    struct re_parse parse;
4442
0
    struct re_str str;
4443
4444
0
    *newregexp = NULL;
4445
0
    MEMZERO(&parse, 1);
4446
0
    parse.rx = regexp;
4447
0
    parse.rend = regexp + regexp_len;
4448
0
    parse.error = REG_NOERROR;
4449
0
    re = parse_regexp(&parse);
4450
0
    if (parse.error != REG_NOERROR)
4451
0
        return parse.error;
4452
4453
0
    r = re_case_expand(re);
4454
0
    if (r < 0) {
4455
0
        re_unref(re);
4456
0
        return REG_ESPACE;
4457
0
    }
4458
4459
0
    if (r == 1) {
4460
0
        MEMZERO(&str, 1);
4461
0
        result = re_as_string(re, &str);
4462
0
        *newregexp = str.rx;
4463
0
        *newregexp_len = str.len;
4464
0
    } else {
4465
0
        *newregexp = strndup(regexp, regexp_len);
4466
0
        *newregexp_len = regexp_len;
4467
0
        result = (*newregexp == NULL) ? REG_ESPACE : REG_NOERROR;
4468
0
    }
4469
0
    re_unref(re);
4470
0
    return result;
4471
0
}
4472
4473
0
static void print_char(FILE *out, uchar c) {
4474
    /* We escape '/' as '\\/' since dot chokes on bare slashes in labels;
4475
       Also, a space ' ' is shown as '\s' */
4476
0
    static const char *const escape_from = " \n\t\v\b\r\f\a/\0";
4477
0
    static const char *const escape_to = "sntvbrfa/0";
4478
0
    char *p = strchr(escape_from, c);
4479
0
    if (p != NULL) {
4480
0
        int i = p - escape_from;
4481
0
        fprintf(out, "\\\\%c", escape_to[i]);
4482
0
    } else if (! isprint(c)) {
4483
0
        fprintf(out, "\\\\0%03o", (unsigned char) c);
4484
0
    } else if (c == '"') {
4485
0
        fprintf(out, "\\\"");
4486
0
    } else {
4487
0
        fputc(c, out);
4488
0
    }
4489
0
}
4490
4491
0
void fa_dot(FILE *out, struct fa *fa) {
4492
0
    fprintf(out, "digraph {\n  rankdir=LR;");
4493
0
    list_for_each(s, fa->initial) {
4494
0
        if (s->accept) {
4495
0
            fprintf(out, "\"%p\" [shape=doublecircle];\n", s);
4496
0
        } else {
4497
0
            fprintf(out, "\"%p\" [shape=circle];\n", s);
4498
0
        }
4499
0
    }
4500
0
    fprintf(out, "%s -> \"%p\";\n", fa->deterministic ? "dfa" : "nfa",
4501
0
            fa->initial);
4502
4503
0
    struct re_str str;
4504
0
    MEMZERO(&str, 1);
4505
0
    list_for_each(s, fa->initial) {
4506
0
        for_each_trans(t, s) {
4507
0
            fprintf(out, "\"%p\" -> \"%p\" [ label = \"", s, t->to);
4508
0
            if (fa->trans_re) {
4509
0
                re_as_string(t->re, &str);
4510
0
                for (int i=0; i < str.len; i++) {
4511
0
                    print_char(out, str.rx[i]);
4512
0
                }
4513
0
                release_re_str(&str);
4514
0
            } else {
4515
0
                print_char(out, t->min);
4516
0
                if (t->min != t->max) {
4517
0
                    fputc('-', out);
4518
0
                    print_char(out, t->max);
4519
0
                }
4520
0
            }
4521
0
            fprintf(out, "\" ];\n");
4522
0
        }
4523
0
    }
4524
0
    fprintf(out, "}\n");
4525
0
}
4526
4527
0
int fa_json(FILE *out, struct fa *fa) {
4528
0
    hash_val_t *list_hashes = NULL;
4529
0
    int list_size = 100;
4530
0
    int num_states = 0;
4531
0
    int it;
4532
0
    char first = true;
4533
0
    int result = -1;
4534