Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * jmt.c: Earley parser for lenses based on Jim/Mandelbaum transducers |
3 | | * |
4 | | * Copyright (C) 2009-2016 David Lutterkort |
5 | | * |
6 | | * This library is free software; you can redistribute it and/or |
7 | | * modify it under the terms of the GNU Lesser General Public |
8 | | * License as published by the Free Software Foundation; either |
9 | | * version 2.1 of the License, or (at your option) any later version. |
10 | | * |
11 | | * This library is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | | * Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public |
17 | | * License along with this library; if not, write to the Free Software |
18 | | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
19 | | * |
20 | | * Author: David Lutterkort <lutter@redhat.com> |
21 | | */ |
22 | | |
23 | | #include <config.h> |
24 | | |
25 | | #include "jmt.h" |
26 | | #include "internal.h" |
27 | | #include "memory.h" |
28 | | #include "errcode.h" |
29 | | |
30 | | /* This is an implementation of the Earley parser described in the paper |
31 | | * "Efficient Earley Parsing with Regular Right-hand Sides" by Trever Jim |
32 | | * and Yitzhak Mandelbaum. |
33 | | * |
34 | | * Since we only deal in lenses, they are both terminals and nonterminals: |
35 | | * recursive lenses (those with lens->recursive set) are nonterminals, and |
36 | | * non-recursive lenses are terminals, with the exception of non-recursive |
37 | | * lenses that match the empty word. Lenses are also the semantic actions |
38 | | * attached to a SCAN or COMPLETE during recosntruction of the parse |
39 | | * tree. Our SCAN makes sure that we only ever scan nonempty words - that |
40 | | * means that for nonrecursive lenses which match the empty word (e.g., del |
41 | | * [ \t]* "") we need to make sure we get a COMPLETE when the lens matched |
42 | | * the empty word. |
43 | | * |
44 | | * That is achieved by treating such a lens t as the construct (t|T) with |
45 | | * T := eps. |
46 | | */ |
47 | | |
48 | | /* |
49 | | * Data structures for the Jim/Mandelbaum parser |
50 | | */ |
51 | | |
52 | 0 | #define IND_MAX UINT32_MAX |
53 | | |
54 | | struct array { |
55 | | size_t elem_size; |
56 | | ind_t used; |
57 | | ind_t size; |
58 | | void *data; |
59 | | }; |
60 | | |
61 | | #define array_elem(arr, ind, typ) \ |
62 | 0 | (((typ *) (arr).data) + ind) |
63 | | |
64 | | #define array_for_each(i, arr) \ |
65 | 0 | for(ind_t i = 0; i < (arr).used; i++) |
66 | | |
67 | | #define array_each_elem(elt, arr, typ) \ |
68 | 0 | for(typ *elt = (typ *) (arr).data + 0; \ |
69 | 0 | elt - (typ *) (arr).data < (arr).used; \ |
70 | 0 | elt++) |
71 | | |
72 | 0 | static void array_init(struct array *arr, size_t elem_size) { |
73 | 0 | MEMZERO(arr, 1); |
74 | 0 | arr->elem_size = elem_size; |
75 | 0 | } |
76 | | |
77 | | ATTRIBUTE_RETURN_CHECK |
78 | 0 | static int array_add(struct array *arr, ind_t *ind) { |
79 | 0 | if (arr->used >= arr->size) { |
80 | 0 | int r; |
81 | 0 | ind_t expand = arr->size; |
82 | 0 | if (expand < 8) |
83 | 0 | expand = 8; |
84 | 0 | r = mem_realloc_n(&(arr->data), arr->elem_size, arr->size + expand); |
85 | 0 | if (r < 0) |
86 | 0 | return -1; |
87 | 0 | memset((char *) arr->data + arr->elem_size*arr->size, 0, |
88 | 0 | arr->elem_size * expand); |
89 | 0 | arr->size += expand; |
90 | 0 | } |
91 | 0 | *ind = arr->used; |
92 | 0 | arr->used += 1; |
93 | 0 | return 0; |
94 | 0 | } |
95 | | |
96 | | /* Insert a new entry into the array at index IND. Shift all entries from |
97 | | * IND on back by one. */ |
98 | | ATTRIBUTE_RETURN_CHECK |
99 | 0 | static int array_insert(struct array *arr, ind_t ind) { |
100 | 0 | ind_t last; |
101 | |
|
102 | 0 | if (array_add(arr, &last) < 0) |
103 | 0 | return -1; |
104 | | |
105 | 0 | if (ind >= last) |
106 | 0 | return 0; |
107 | | |
108 | 0 | memmove((char *) arr->data + arr->elem_size*(ind+1), |
109 | 0 | (char *) arr->data + arr->elem_size*ind, |
110 | 0 | arr->elem_size * (arr->used - ind - 1)); |
111 | 0 | memset((char *) arr->data + arr->elem_size*ind, 0, arr->elem_size); |
112 | 0 | return 0; |
113 | 0 | } |
114 | | |
115 | 0 | static void array_release(struct array *arr) { |
116 | 0 | if (arr != NULL) { |
117 | 0 | free(arr->data); |
118 | 0 | arr->used = arr->size = 0; |
119 | 0 | } |
120 | 0 | } |
121 | | |
122 | 0 | static void array_remove(struct array *arr, ind_t ind) { |
123 | 0 | char *data = arr->data; |
124 | 0 | memmove(data + ind * arr->elem_size, |
125 | 0 | data + (ind + 1) * arr->elem_size, |
126 | 0 | (arr->used - ind - 1) * arr->elem_size); |
127 | 0 | arr->used -= 1; |
128 | 0 | } |
129 | | |
130 | | ATTRIBUTE_RETURN_CHECK |
131 | 0 | static int array_join(struct array *dst, struct array *src) { |
132 | 0 | int r; |
133 | |
|
134 | 0 | if (dst->elem_size != src->elem_size) |
135 | 0 | return -1; |
136 | | |
137 | 0 | r = mem_realloc_n(&(dst->data), dst->elem_size, |
138 | 0 | dst->used + src->used); |
139 | 0 | if (r < 0) |
140 | 0 | return -1; |
141 | | |
142 | 0 | memcpy(((char *) dst->data) + dst->used * dst->elem_size, |
143 | 0 | src->data, |
144 | 0 | src->used * src->elem_size); |
145 | 0 | dst->used += src->used; |
146 | 0 | dst->size = dst->used; |
147 | 0 | return 0; |
148 | 0 | } |
149 | | |
150 | | /* Special lens indices - these don't reference lenses, but |
151 | | * some pseudo actions. We stash them at the top of the range |
152 | | * of IND_T, with LENS_MAX giving us the maximal lens index for |
153 | | * a real lens |
154 | | */ |
155 | | enum trans_op { |
156 | | EPS = IND_MAX, |
157 | | CALL = EPS - 1, |
158 | | LENS_MAX = CALL - 1 |
159 | | }; |
160 | | |
161 | | struct trans { |
162 | | struct state *to; |
163 | | ind_t lens; |
164 | | }; |
165 | | |
166 | | struct state { |
167 | | struct state *next; /* Linked list for memory management */ |
168 | | struct array trans; /* Array of struct trans */ |
169 | | ind_t nret; /* Number of returned lenses */ |
170 | | ind_t *ret; /* The returned lenses */ |
171 | | ind_t num; /* Counter for num of new states */ |
172 | | unsigned int reachable : 1; |
173 | | unsigned int live : 1; |
174 | | }; |
175 | | |
176 | | /* Sets of states used in determinizing the NFA to a DFA */ |
177 | | struct nfa_state { |
178 | | struct state *state; /* The new state in the DFA */ |
179 | | struct array set; /* Set of (struct state *) in the NFA, sorted */ |
180 | | }; |
181 | | |
182 | | /* For recursive lenses (nonterminals), the mapping of nonterminal to |
183 | | * state. We also store nonrecursive lenses here; in that case, the state |
184 | | * will be NULL */ |
185 | | struct jmt_lens { |
186 | | struct lens *lens; |
187 | | struct state *state; |
188 | | }; |
189 | | |
190 | | /* A Jim/Mandelbaum transducer */ |
191 | | struct jmt { |
192 | | struct error *error; |
193 | | struct array lenses; /* Array of struct jmt_lens */ |
194 | | struct state *start; |
195 | | ind_t lens; /* The start symbol of the grammar */ |
196 | | ind_t state_count; |
197 | | }; |
198 | | |
199 | | enum item_reason { |
200 | | R_ROOT = 1, |
201 | | R_COMPLETE = R_ROOT << 1, |
202 | | R_PREDICT = R_COMPLETE << 1, |
203 | | R_SCAN = R_PREDICT << 1 |
204 | | }; |
205 | | |
206 | | /* The reason an item was added for parse reconstruction; can be R_SCAN, |
207 | | * R_COMPLETE, R_PREDICT or R_COMPLETE|R_PREDICT */ |
208 | | struct link { |
209 | | enum item_reason reason; |
210 | | ind_t lens; /* R_COMPLETE, R_SCAN */ |
211 | | ind_t from_set; /* R_COMPLETE, R_PREDICT, R_SCAN */ |
212 | | ind_t from_item; /* R_COMPLETE, R_PREDICT, R_SCAN */ |
213 | | ind_t to_item; /* R_COMPLETE */ |
214 | | ind_t caller; /* number of a state */ |
215 | | }; |
216 | | |
217 | | struct item { |
218 | | /* The 'classical' Earley item (state, parent) */ |
219 | | struct state *state; |
220 | | ind_t parent; |
221 | | /* Backlinks to why item was added */ |
222 | | ind_t nlinks; |
223 | | struct link *links; |
224 | | }; |
225 | | |
226 | | struct item_set { |
227 | | struct array items; |
228 | | }; |
229 | | |
230 | | struct jmt_parse { |
231 | | struct jmt *jmt; |
232 | | struct error *error; |
233 | | const char *text; |
234 | | ind_t nsets; |
235 | | struct item_set **sets; |
236 | | }; |
237 | | |
238 | | #define for_each_item(it, set) \ |
239 | 0 | array_each_elem(it, (set)->items, struct item) |
240 | | |
241 | | #define for_each_trans(t, s) \ |
242 | 0 | array_each_elem(t, (s)->trans, struct trans) |
243 | | |
244 | | #define parse_state(parse, ind) \ |
245 | | array_elem(parse->states, ind, struct state) |
246 | | |
247 | | static struct item *set_item(struct jmt_parse *parse, ind_t set, |
248 | 0 | ind_t item) { |
249 | 0 | ensure(parse->sets[set] != NULL, parse); |
250 | 0 | ensure(item < parse->sets[set]->items.used, parse); |
251 | 0 | return array_elem(parse->sets[set]->items, item, struct item); |
252 | 0 | error: |
253 | 0 | return NULL; |
254 | 0 | } |
255 | | |
256 | | static struct state *item_state(struct jmt_parse *parse, ind_t set, |
257 | 0 | ind_t item) { |
258 | 0 | return set_item(parse, set, item)->state; |
259 | 0 | } |
260 | | |
261 | 0 | static struct trans *state_trans(struct state *state, ind_t t) { |
262 | 0 | return array_elem(state->trans, t, struct trans); |
263 | 0 | } |
264 | | |
265 | 0 | static ind_t item_parent(struct jmt_parse *parse, ind_t set, ind_t item) { |
266 | 0 | return set_item(parse, set, item)->parent; |
267 | 0 | } |
268 | | |
269 | 0 | static bool is_return(const struct state *s) { |
270 | 0 | return s->nret > 0; |
271 | 0 | } |
272 | | |
273 | 0 | static struct lens *lens_of_parse(struct jmt_parse *parse, ind_t lens) { |
274 | 0 | return array_elem(parse->jmt->lenses, lens, struct jmt_lens)->lens; |
275 | 0 | } |
276 | | |
277 | | /* |
278 | | * The parser |
279 | | */ |
280 | | |
281 | | /* |
282 | | * Manipulate the Earley graph. We denote edges in the graph as |
283 | | * [j, (s,i)] -> [k, item_k] => [l, item_l] |
284 | | * to indicate that we are adding item (s,i) to E_j and record that its |
285 | | * child is the item with index item_k in E_k and its sibling is item |
286 | | * item_l in E_l. |
287 | | */ |
288 | | |
289 | | /* Add item (s, k) to E_j. Note that the item was caused by action reason |
290 | | * using lens starting at from_item in E_{from_set} |
291 | | * |
292 | | * [j, (s,k)] -> [from_set, from_item] => [j, to_item] |
293 | | */ |
294 | | static ind_t parse_add_item(struct jmt_parse *parse, ind_t j, |
295 | | struct state *s, ind_t k, |
296 | | enum item_reason reason, ind_t lens, |
297 | | ind_t from_set, |
298 | | ind_t from_item, ind_t to_item, |
299 | 0 | ind_t caller) { |
300 | |
|
301 | 0 | int r; |
302 | 0 | struct item_set *set = parse->sets[j]; |
303 | 0 | struct item *item = NULL; |
304 | 0 | ind_t result = IND_MAX; |
305 | |
|
306 | 0 | ensure(from_item == EPS || from_item < parse->sets[from_set]->items.used, |
307 | 0 | parse); |
308 | 0 | ensure(to_item == EPS || to_item < parse->sets[j]->items.used, |
309 | 0 | parse); |
310 | |
|
311 | 0 | if (set == NULL) { |
312 | 0 | r = ALLOC(parse->sets[j]); |
313 | 0 | ERR_NOMEM(r < 0, parse); |
314 | 0 | array_init(&parse->sets[j]->items, sizeof(struct item)); |
315 | 0 | set = parse->sets[j]; |
316 | 0 | } |
317 | | |
318 | 0 | for (ind_t i=0; i < set->items.used; i++) { |
319 | 0 | if (item_state(parse, j, i) == s |
320 | 0 | && item_parent(parse, j, i) == k) { |
321 | 0 | result = i; |
322 | 0 | item = set_item(parse, j, i); |
323 | 0 | break; |
324 | 0 | } |
325 | 0 | } |
326 | |
|
327 | 0 | if (result == IND_MAX) { |
328 | 0 | r = array_add(&set->items, &result); |
329 | 0 | ERR_NOMEM(r < 0, parse); |
330 | |
|
331 | 0 | item = set_item(parse, j, result); |
332 | 0 | item->state = s; |
333 | 0 | item->parent = k; |
334 | 0 | } |
335 | | |
336 | 0 | for (ind_t i = 0; i < item->nlinks; i++) { |
337 | 0 | struct link *lnk = item->links + i; |
338 | 0 | if (lnk->reason == reason && lnk->lens == lens |
339 | 0 | && lnk->from_set == from_set && lnk->from_item == from_item |
340 | 0 | && lnk->to_item == to_item && lnk->caller == caller) |
341 | 0 | return result; |
342 | 0 | } |
343 | | |
344 | 0 | r = REALLOC_N(item->links, item->nlinks + 1); |
345 | 0 | ERR_NOMEM(r < 0, parse); |
346 | |
|
347 | 0 | struct link *lnk = item->links+ item->nlinks; |
348 | 0 | item->nlinks += 1; |
349 | |
|
350 | 0 | lnk->reason = reason; |
351 | 0 | lnk->lens = lens; |
352 | 0 | lnk->from_set = from_set; |
353 | 0 | lnk->from_item = from_item; |
354 | 0 | lnk->to_item = to_item; |
355 | 0 | lnk->caller = caller; |
356 | 0 | error: |
357 | 0 | return result; |
358 | 0 | } |
359 | | |
360 | | /* Add item (s, i) to set E_j and record that it was added because |
361 | | * a parse of nonterminal lens, starting at itemk in E_k, was completed |
362 | | * by item item in E_j |
363 | | * |
364 | | * [j, (s,i)] -> [j, item] => [k, itemk] |
365 | | */ |
366 | | static void parse_add_complete(struct jmt_parse *parse, ind_t j, |
367 | | struct state *s, ind_t i, |
368 | | ind_t k, ind_t itemk, |
369 | | ind_t lens, |
370 | 0 | ind_t item) { |
371 | 0 | parse_add_item(parse, j, s, i, R_COMPLETE, lens, k, itemk, item, IND_MAX); |
372 | 0 | } |
373 | | |
374 | | /* Same as parse_add_complete, but mark the item also as a predict |
375 | | * |
376 | | * [j, (s,i)] -> [j, item] => [k, itemk] |
377 | | */ |
378 | | static void parse_add_predict_complete(struct jmt_parse *parse, ind_t j, |
379 | | struct state *s, ind_t i, |
380 | | ind_t k, ind_t itemk, |
381 | | ind_t lens, |
382 | 0 | ind_t item, ind_t caller) { |
383 | 0 | parse_add_item(parse, j, s, i, R_COMPLETE|R_PREDICT, |
384 | 0 | lens, k, itemk, item, caller); |
385 | 0 | } |
386 | | |
387 | | /* Add item (s, j) to E_j and record that it was added because of a |
388 | | * prediction from item from in E_j |
389 | | */ |
390 | | static ind_t parse_add_predict(struct jmt_parse *parse, ind_t j, |
391 | 0 | struct state *s, ind_t from) { |
392 | |
|
393 | 0 | ensure(from < parse->sets[j]->items.used, parse); |
394 | 0 | struct state *t = item_state(parse, j, from); |
395 | |
|
396 | 0 | return parse_add_item(parse, j, s, j, R_PREDICT, EPS, j, from, EPS, |
397 | 0 | t->num); |
398 | 0 | error: |
399 | 0 | return IND_MAX; |
400 | 0 | } |
401 | | |
402 | | /* Add item (s,i) to E_j and record that it was added because of scanning |
403 | | * with lens starting from item item in E_k. |
404 | | * |
405 | | * [j, (s,i)] -> [k, item] |
406 | | */ |
407 | | static void parse_add_scan(struct jmt_parse *parse, ind_t j, |
408 | | struct state *s, ind_t i, |
409 | 0 | ind_t lens, ind_t k, ind_t item) { |
410 | 0 | ensure(item < parse->sets[k]->items.used, parse); |
411 | |
|
412 | 0 | parse_add_item(parse, j, s, i, R_SCAN, lens, k, item, EPS, IND_MAX); |
413 | 0 | error: |
414 | 0 | return; |
415 | 0 | } |
416 | | |
417 | | ATTRIBUTE_PURE |
418 | 0 | static bool is_complete(const struct link *lnk) { |
419 | 0 | return lnk->reason & R_COMPLETE; |
420 | 0 | } |
421 | | |
422 | | ATTRIBUTE_PURE |
423 | 0 | static bool is_predict(const struct link *lnk) { |
424 | 0 | return lnk->reason & R_PREDICT; |
425 | 0 | } |
426 | | |
427 | | ATTRIBUTE_PURE |
428 | 0 | static bool is_scan(const struct link *lnk) { |
429 | 0 | return lnk->reason & R_SCAN; |
430 | 0 | } |
431 | | |
432 | | ATTRIBUTE_PURE |
433 | 0 | static bool is_last_sibling(const struct link *lnk) { |
434 | 0 | if (is_complete(lnk)) |
435 | 0 | return false; |
436 | 0 | return lnk->reason & (R_PREDICT|R_ROOT); |
437 | 0 | } |
438 | | |
439 | | ATTRIBUTE_PURE |
440 | 0 | static bool returns(const struct state *s, ind_t l) { |
441 | 0 | for (ind_t i = 0; i < s->nret; i++) |
442 | 0 | if (s->ret[i] == l) |
443 | 0 | return true; |
444 | 0 | return false; |
445 | 0 | } |
446 | | |
447 | 0 | static void state_add_return(struct jmt *jmt, struct state *s, ind_t l) { |
448 | 0 | int r; |
449 | |
|
450 | 0 | if (s == NULL || returns(s, l)) |
451 | 0 | return; |
452 | | |
453 | 0 | r = REALLOC_N(s->ret, s->nret + 1); |
454 | 0 | ERR_NOMEM(r < 0, jmt); |
455 | 0 | s->ret[s->nret] = l; |
456 | 0 | s->nret += 1; |
457 | 0 | error: |
458 | 0 | return; |
459 | 0 | } |
460 | | |
461 | | static void state_merge_returns(struct jmt *jmt, struct state *dst, |
462 | 0 | const struct state *src) { |
463 | 0 | for (ind_t l = 0; l < src->nret; l++) |
464 | 0 | state_add_return(jmt, dst, src->ret[l]); |
465 | 0 | } |
466 | | |
467 | | static void nncomplete(struct jmt_parse *parse, ind_t j, |
468 | 0 | struct state *t, ind_t k, ind_t item) { |
469 | |
|
470 | 0 | for (ind_t itemk = 0; itemk < parse->sets[k]->items.used; itemk++) { |
471 | 0 | struct state *u = item_state(parse, k, itemk); |
472 | 0 | for_each_trans(y, u) { |
473 | 0 | if (returns(t, y->lens)) { |
474 | 0 | ind_t parent = item_parent(parse, k, itemk); |
475 | 0 | parse_add_complete(parse, j, |
476 | 0 | y->to, parent, |
477 | 0 | k, itemk, y->lens, item); |
478 | 0 | } |
479 | 0 | } |
480 | 0 | } |
481 | 0 | } |
482 | | |
483 | | /* NCALLER for (t, i) in E_j, which has index item in E_j, and t -> s a |
484 | | * call in the transducer and s a return. The item (s,j) has index pred in |
485 | | * E_j |
486 | | */ |
487 | | static void ncaller(struct jmt_parse *parse, ind_t j, ind_t item, |
488 | 0 | struct state *t, ind_t i, struct state *s, ind_t pred) { |
489 | 0 | for_each_trans(u, t) { |
490 | 0 | if (returns(s, u->lens)) { |
491 | | /* [j, (u->to, i)] -> [j, pred] => [j, item] */ |
492 | 0 | parse_add_predict_complete(parse, j, |
493 | 0 | u->to, i, |
494 | 0 | j, item, u->lens, |
495 | 0 | pred, t->num); |
496 | 0 | } |
497 | 0 | } |
498 | 0 | } |
499 | | |
500 | | /* NCALLEE for (t, parent) in E_j, which has index item in E_j, and t -> s |
501 | | * a call in the transducer and s a return. The item (s,j) has index pred |
502 | | * in E_j |
503 | | */ |
504 | | static void ncallee(struct jmt_parse *parse, ind_t j, ATTRIBUTE_UNUSED ind_t item, |
505 | 0 | ATTRIBUTE_UNUSED struct state *t, ATTRIBUTE_UNUSED ind_t parent, struct state *s, ind_t pred) { |
506 | 0 | for_each_trans(u, s) { |
507 | 0 | if (returns(s, u->lens)) { |
508 | | /* [j, (u->to, j)] -> [j, item] => [j, item] */ |
509 | 0 | parse_add_predict_complete(parse, j, |
510 | 0 | u->to, j, |
511 | 0 | j, pred, u->lens, |
512 | 0 | pred, t->num); |
513 | 0 | } |
514 | 0 | } |
515 | 0 | } |
516 | | |
517 | | static struct jmt_parse *parse_init(struct jmt *jmt, |
518 | 0 | const char *text, size_t text_len) { |
519 | 0 | int r; |
520 | 0 | struct jmt_parse *parse; |
521 | |
|
522 | 0 | r = ALLOC(parse); |
523 | 0 | ERR_NOMEM(r < 0, jmt); |
524 | |
|
525 | 0 | parse->jmt = jmt; |
526 | 0 | parse->error = jmt->error; |
527 | 0 | parse->text = text; |
528 | 0 | parse->nsets = text_len + 1; |
529 | 0 | r = ALLOC_N(parse->sets, parse->nsets); |
530 | 0 | ERR_NOMEM(r < 0, jmt); |
531 | 0 | return parse; |
532 | 0 | error: |
533 | 0 | if (parse != NULL) |
534 | 0 | free(parse->sets); |
535 | 0 | free(parse); |
536 | 0 | return NULL; |
537 | 0 | } |
538 | | |
539 | 0 | void jmt_free_parse(struct jmt_parse *parse) { |
540 | 0 | if (parse == NULL) |
541 | 0 | return; |
542 | 0 | for (int i=0; i < parse->nsets; i++) { |
543 | 0 | struct item_set *set = parse->sets[i]; |
544 | 0 | if (set != NULL) { |
545 | 0 | array_each_elem(x, set->items, struct item) |
546 | 0 | free(x->links); |
547 | 0 | array_release(&set->items); |
548 | 0 | free(set); |
549 | 0 | } |
550 | 0 | } |
551 | 0 | free(parse->sets); |
552 | 0 | free(parse); |
553 | 0 | } |
554 | | |
555 | | static struct state *lens_state(struct jmt *jmt, ind_t l); |
556 | | |
557 | 0 | static void flens(FILE *fp, ind_t l) { |
558 | 0 | if (l == 0) |
559 | 0 | fprintf(fp, "%c", 'S'); |
560 | 0 | else if (l < 'S' - 'A') |
561 | 0 | fprintf(fp, "%c", 'A' + l - 1); |
562 | 0 | else if (l <= 'Z' - 'A') |
563 | 0 | fprintf(fp, "%c", 'A' + l); |
564 | 0 | else |
565 | 0 | fprintf(fp, "%u", l); |
566 | 0 | } |
567 | | |
568 | | static void parse_dot_link(FILE *fp, struct jmt_parse *parse, |
569 | 0 | ind_t k, struct item *x, struct link *lnk) { |
570 | 0 | char *lens_label = NULL; |
571 | 0 | if (is_complete(lnk) || is_scan(lnk)) { |
572 | 0 | struct state *sA = lens_state(parse->jmt, lnk->lens); |
573 | 0 | int r; |
574 | |
|
575 | 0 | if (sA == NULL) |
576 | 0 | r = xasprintf(&lens_label, "<%d>", lnk->lens); |
577 | 0 | else |
578 | 0 | r = xasprintf(&lens_label, "%d", lnk->lens); |
579 | 0 | if (r < 0) { |
580 | 0 | fprintf(fp, "// Internal error generating lens_label\n"); |
581 | 0 | return; |
582 | 0 | } |
583 | 0 | } |
584 | 0 | fprintf(fp, " n%d_%d_%d [ label = \"(%d, %d)\"];\n", |
585 | 0 | k, x->state->num, x->parent, x->state->num, x->parent); |
586 | 0 | if (is_complete(lnk)) { |
587 | 0 | struct item *y = set_item(parse, k, lnk->to_item); |
588 | 0 | const char *pred = is_predict(lnk) ? "p" : ""; |
589 | 0 | fprintf(fp, " n%d_%d_%d -> n%s%d_%d_%d [ style = dashed ];\n", |
590 | 0 | k, x->state->num, x->parent, |
591 | 0 | pred, k, y->state->num, y->parent); |
592 | 0 | if (is_predict(lnk)) { |
593 | 0 | fprintf(fp, " n%s%d_%d_%d [ label = \"\" ];\n", |
594 | 0 | pred, k, y->state->num, y->parent); |
595 | 0 | fprintf(fp, " n%s%d_%d_%d -> n%d_%d_%d [ style = bold ];\n", |
596 | 0 | pred, k, y->state->num, y->parent, |
597 | 0 | k, y->state->num, y->parent); |
598 | 0 | } |
599 | 0 | y = set_item(parse, lnk->from_set, lnk->from_item); |
600 | 0 | fprintf(fp, |
601 | 0 | " n%d_%d_%d -> n%d_%d_%d [ style = dashed, label = \"", |
602 | 0 | k, x->state->num, x->parent, |
603 | 0 | lnk->from_set, y->state->num, y->parent); |
604 | 0 | flens(fp, lnk->lens); |
605 | 0 | fprintf(fp, "\" ];\n"); |
606 | 0 | } else if (is_scan(lnk)) { |
607 | 0 | struct item *y = |
608 | 0 | set_item(parse, lnk->from_set, lnk->from_item); |
609 | 0 | fprintf(fp, |
610 | 0 | " n%d_%d_%d -> n%d_%d_%d [ label = \"", |
611 | 0 | k, x->state->num, x->parent, |
612 | 0 | lnk->from_set, y->state->num, y->parent); |
613 | 0 | for (ind_t i=lnk->from_set; i < k; i++) |
614 | 0 | fprintf(fp, "%c", parse->text[i]); |
615 | 0 | fprintf(fp, "\" ];\n"); |
616 | |
|
617 | 0 | } else if (is_predict(lnk)) { |
618 | 0 | struct item *y = |
619 | 0 | set_item(parse, lnk->from_set, lnk->from_item); |
620 | 0 | fprintf(fp, |
621 | 0 | " n%d_%d_%d -> n%d_%d_%d [ style = bold ];\n", |
622 | 0 | k, x->state->num, x->parent, |
623 | 0 | lnk->from_set, y->state->num, y->parent); |
624 | |
|
625 | 0 | } |
626 | 0 | free(lens_label); |
627 | 0 | } |
628 | | |
629 | 0 | static void parse_dot(struct jmt_parse *parse, const char *fname) { |
630 | 0 | FILE *fp = debug_fopen("%s", fname); |
631 | 0 | if (fp == NULL) |
632 | 0 | return; |
633 | | |
634 | 0 | fprintf(fp, "digraph \"jmt_parse\" {\n"); |
635 | 0 | fprintf(fp, " rankdir = RL;\n"); |
636 | 0 | for (int k=0; k < parse->nsets; k++) { |
637 | 0 | struct item_set *set = parse->sets[k]; |
638 | |
|
639 | 0 | if (set == NULL) |
640 | 0 | continue; |
641 | | |
642 | 0 | fprintf(fp, " subgraph \"cluster_E_%d\" {\n", k); |
643 | 0 | fprintf(fp, " rankdir=RL;\n rank=same;\n"); |
644 | 0 | fprintf(fp, " title%d [ label=\"E%d\", shape=plaintext ]\n", k, k); |
645 | 0 | for_each_item(x, set) { |
646 | 0 | for (int i=0; i < x->nlinks; i++) { |
647 | 0 | struct link *lnk = x->links + i; |
648 | 0 | parse_dot_link(fp, parse, k, x, lnk); |
649 | 0 | } |
650 | 0 | } |
651 | 0 | fprintf(fp, "}\n"); |
652 | 0 | } |
653 | 0 | fprintf(fp, "}\n"); |
654 | 0 | fclose(fp); |
655 | 0 | } |
656 | | |
657 | | struct jmt_parse * |
658 | | jmt_parse(struct jmt *jmt, const char *text, size_t text_len) |
659 | 0 | { |
660 | 0 | struct jmt_parse *parse = NULL; |
661 | |
|
662 | 0 | parse = parse_init(jmt, text, text_len); |
663 | 0 | ERR_BAIL(jmt); |
664 | | |
665 | | /* INIT */ |
666 | 0 | parse_add_item(parse, 0, jmt->start, 0, R_ROOT, EPS, EPS, EPS, EPS, |
667 | 0 | jmt->lens); |
668 | | /* NINIT */ |
669 | 0 | if (is_return(jmt->start)) { |
670 | 0 | for_each_trans(x, jmt->start) { |
671 | 0 | if (returns(jmt->start, x->lens)) |
672 | 0 | parse_add_predict_complete(parse, 0, x->to, 0, |
673 | 0 | 0, 0, x->lens, 0, 0); |
674 | 0 | } |
675 | 0 | } |
676 | |
|
677 | 0 | for (int j=0; j <= text_len; j++) { |
678 | 0 | struct item_set *set = parse->sets[j]; |
679 | 0 | if (set == NULL) |
680 | 0 | continue; |
681 | | |
682 | 0 | for (int item=0; item < set->items.used; item++) { |
683 | 0 | struct state *t = item_state(parse, j, item); |
684 | 0 | ind_t i = item_parent(parse, j, item); |
685 | |
|
686 | 0 | if (is_return(t) && i != j) { |
687 | | /* NNCOMPLETE */ |
688 | 0 | nncomplete(parse, j, t, i, item); |
689 | 0 | } |
690 | |
|
691 | 0 | for_each_trans(x, t) { |
692 | 0 | if (x->lens == CALL) { |
693 | | /* PREDICT */ |
694 | 0 | ind_t pred = parse_add_predict(parse, j, x->to, item); |
695 | 0 | ERR_BAIL(parse); |
696 | 0 | if (is_return(x->to)) { |
697 | | /* NCALLER */ |
698 | 0 | ncaller(parse, j, item, t, i, x->to, pred); |
699 | | /* NCALLEE */ |
700 | 0 | ncallee(parse, j, item, t, i, x->to, pred); |
701 | 0 | } |
702 | 0 | } else { |
703 | 0 | int count; |
704 | 0 | struct lens *lens = lens_of_parse(parse, x->lens); |
705 | 0 | struct state *sA = lens_state(parse->jmt, x->lens); |
706 | 0 | if (! lens->recursive && sA == NULL) { |
707 | | /* SCAN, terminal */ |
708 | | // FIXME: We really need to find every k so that |
709 | | // text[j..k] matches lens->ctype, not just one |
710 | 0 | count = regexp_match(lens->ctype, text, text_len, j, NULL); |
711 | 0 | if (count > 0) { |
712 | 0 | parse_add_scan(parse, j+count, |
713 | 0 | x->to, i, |
714 | 0 | x->lens, j, item); |
715 | 0 | } |
716 | 0 | } |
717 | 0 | } |
718 | 0 | } |
719 | 0 | } |
720 | 0 | } |
721 | 0 | if (debugging("cf.jmt.parse")) |
722 | 0 | parse_dot(parse, "jmt_parse.dot"); |
723 | 0 | return parse; |
724 | 0 | error: |
725 | 0 | jmt_free_parse(parse); |
726 | 0 | return NULL; |
727 | 0 | } |
728 | | |
729 | | /* |
730 | | * Reconstruction of the parse tree |
731 | | */ |
732 | | |
733 | | static void |
734 | | build_nullable(struct jmt_parse *parse, ind_t pos, |
735 | 0 | struct jmt_visitor *visitor, struct lens *lens, int lvl) { |
736 | 0 | if (! lens->recursive) { |
737 | 0 | if (visitor->terminal != NULL) { |
738 | 0 | (*visitor->terminal)(lens, pos, pos, visitor->data); |
739 | 0 | ERR_BAIL(parse); |
740 | 0 | } |
741 | 0 | } else { |
742 | 0 | if (visitor->enter != NULL) { |
743 | 0 | (*visitor->enter)(lens, pos, pos, visitor->data); |
744 | 0 | ERR_BAIL(parse); |
745 | 0 | } |
746 | | |
747 | 0 | switch(lens->tag) { |
748 | 0 | case L_REC: |
749 | 0 | build_nullable(parse, pos, visitor, lens->body, lvl+1); |
750 | 0 | break; |
751 | 0 | case L_CONCAT: |
752 | 0 | for (int i=0; i < lens->nchildren; i++) |
753 | 0 | build_nullable(parse, pos, visitor, lens->children[i], lvl+1); |
754 | 0 | break; |
755 | 0 | case L_UNION: |
756 | 0 | for (int i=0; i < lens->nchildren; i++) |
757 | 0 | if (lens->children[i]->ctype_nullable) |
758 | 0 | build_nullable(parse, pos, visitor, |
759 | 0 | lens->children[i], lvl+1); |
760 | 0 | break; |
761 | 0 | case L_SUBTREE: |
762 | 0 | case L_SQUARE: |
763 | 0 | build_nullable(parse, pos, visitor, lens->child, lvl+1); |
764 | 0 | break; |
765 | 0 | case L_STAR: |
766 | 0 | case L_MAYBE: |
767 | 0 | break; |
768 | 0 | default: |
769 | 0 | BUG_ON(true, parse, "Unexpected lens tag %d", lens->tag); |
770 | 0 | } |
771 | | |
772 | 0 | if (visitor->exit != NULL) { |
773 | 0 | (*visitor->exit)(lens, pos, pos, visitor->data); |
774 | 0 | ERR_BAIL(parse); |
775 | 0 | } |
776 | 0 | } |
777 | 0 | error: |
778 | 0 | return; |
779 | 0 | } |
780 | | |
781 | | static void build_trace(const char *msg, ind_t start, ind_t end, |
782 | 0 | struct item *x, int lvl) { |
783 | 0 | for (int i=0; i < lvl; i++) putc(' ', stderr); |
784 | 0 | if (x != NULL) { |
785 | 0 | printf("%s %d..%d: (%d, %d) %d %s%s%s\n", msg, |
786 | 0 | start, end, x->state->num, |
787 | 0 | x->parent, x->links->lens, |
788 | 0 | is_complete(x->links) ? "c" : "", |
789 | 0 | is_predict(x->links) ? "p" : "", |
790 | 0 | is_scan(x->links) ? "s" : ""); |
791 | 0 | } else { |
792 | 0 | printf("%s %d..%d\n", msg, start, end); |
793 | 0 | } |
794 | 0 | } |
795 | | |
796 | 0 | static int add_sibling(struct array *siblings, ind_t lnk) { |
797 | 0 | int r; |
798 | 0 | ind_t ind; |
799 | |
|
800 | 0 | r = array_add(siblings, &ind); |
801 | 0 | if (r < 0) |
802 | 0 | return -1; |
803 | 0 | *array_elem(*siblings, ind, ind_t) = lnk; |
804 | 0 | return 0; |
805 | 0 | } |
806 | | |
807 | | /* Return true if CALLER is a possible caller for the link LNK which starts |
808 | | * at item X. |
809 | | * |
810 | | * FIXME: We can get rid of the caller field on a link if we distinguish |
811 | | * between NCALLER and NCALLEE in the Earley graph, rather than collapse |
812 | | * them both into links with reason PREDICT|COMPLETE |
813 | | */ |
814 | 0 | static bool is_caller(struct item *x, struct link *lnk, ind_t caller) { |
815 | 0 | if (lnk->reason & R_ROOT) |
816 | 0 | return caller == lnk->caller; |
817 | | |
818 | 0 | if (! is_predict(lnk)) |
819 | 0 | return false; |
820 | | |
821 | 0 | if (is_complete(lnk)) { |
822 | | /* NCALLER: caller == t |
823 | | * NCALLEE: caller == s */ |
824 | 0 | return caller == lnk->caller; |
825 | 0 | } |
826 | | /* PREDICT: caller == t || caller == s */ |
827 | 0 | return caller == lnk->caller || caller == x->state->num; |
828 | 0 | } |
829 | | |
830 | | /* Traverse the siblings of x and check that the callee's set of callers |
831 | | * contains CALLER. When a path ending with a call from CALLER exists, |
832 | | * record the number of the corresponding link for each item in |
833 | | * SIBLINGS. The links are recorded in left-to-right order, i.e. the number |
834 | | * of the first link to follow is in the last entry in SIBLINGS. |
835 | | * |
836 | | * Returns 0 if there is a path ending in a callee of CALLER. Return -1 if |
837 | | * there is none. Return -2 if there are multiple such paths. Return -3 for |
838 | | * any other error. |
839 | | */ |
840 | | static int filter_siblings(struct jmt_visitor *visitor, struct lens *lens, |
841 | | ind_t k, ind_t item, ind_t caller, |
842 | 0 | struct array *siblings) { |
843 | 0 | struct jmt_parse *parse = visitor->parse; |
844 | 0 | struct item *x = set_item(parse, k, item); |
845 | 0 | ind_t nlast = 0; |
846 | 0 | int r; |
847 | |
|
848 | 0 | for (ind_t lnk = 0; lnk < x->nlinks; lnk++) |
849 | 0 | if (is_last_sibling(x->links + lnk)) |
850 | 0 | nlast += 1; |
851 | |
|
852 | 0 | if (nlast > 0 && nlast < x->nlinks) |
853 | 0 | goto ambig; |
854 | | |
855 | 0 | if (nlast == x->nlinks) { |
856 | 0 | for (ind_t lnk = 0; lnk < x->nlinks; lnk++) { |
857 | 0 | if (is_caller(x, x->links + lnk, caller)) { |
858 | 0 | siblings->used = 0; |
859 | 0 | r = add_sibling(siblings, lnk); |
860 | 0 | if (r < 0) { |
861 | 0 | ERR_REPORT(parse, AUG_ENOMEM, NULL); |
862 | 0 | return -3; |
863 | 0 | } |
864 | 0 | return 0; |
865 | 0 | } |
866 | 0 | } |
867 | 0 | return -1; |
868 | 0 | } else { |
869 | | /* nlast == 0 */ |
870 | 0 | ind_t found = IND_MAX; |
871 | 0 | for (ind_t lnk = 0; lnk < x->nlinks; lnk++) { |
872 | 0 | struct link *l = x->links + lnk; |
873 | 0 | r = filter_siblings(visitor, lens, |
874 | 0 | l->from_set, l->from_item, caller, |
875 | 0 | siblings); |
876 | 0 | if (r == -1) |
877 | 0 | continue; |
878 | 0 | if (r == 0) { |
879 | 0 | if (found != IND_MAX) |
880 | 0 | goto ambig; |
881 | 0 | else |
882 | 0 | found = lnk; |
883 | 0 | } else { |
884 | 0 | return r; |
885 | 0 | } |
886 | 0 | } |
887 | 0 | if (found == IND_MAX) { |
888 | 0 | return -1; |
889 | 0 | } else { |
890 | 0 | r = add_sibling(siblings, found); |
891 | 0 | if (r < 0) { |
892 | 0 | ERR_REPORT(parse, AUG_ENOMEM, NULL); |
893 | 0 | return -3; |
894 | 0 | } |
895 | 0 | return 0; |
896 | 0 | } |
897 | 0 | } |
898 | 0 | ambig: |
899 | 0 | (*visitor->error)(lens, visitor->data, k, |
900 | 0 | "Ambiguous parse: %d links in state (%d, %d) in E_%d", |
901 | 0 | x->nlinks, x->state->num, x->parent, k); |
902 | 0 | return -2; |
903 | 0 | } |
904 | | |
905 | | static void visit_enter(struct jmt_visitor *visitor, struct lens *lens, |
906 | | size_t start, size_t end, |
907 | 0 | struct item *x, int lvl) { |
908 | 0 | if (debugging("cf.jmt.visit")) |
909 | 0 | build_trace("{", start, end, x, lvl); |
910 | 0 | if (visitor->enter != NULL) |
911 | 0 | (*visitor->enter)(lens, start, end, visitor->data); |
912 | 0 | } |
913 | | |
914 | | static void visit_exit(struct jmt_visitor *visitor, struct lens *lens, |
915 | | size_t start, size_t end, |
916 | 0 | struct item *x, int lvl) { |
917 | 0 | if (debugging("cf.jmt.visit")) |
918 | 0 | build_trace("}", start, end, x, lvl); |
919 | 0 | if (visitor->exit != NULL) |
920 | 0 | (*visitor->exit)(lens, start, end, visitor->data); |
921 | 0 | } |
922 | | |
923 | | static int |
924 | | build_children(struct jmt_parse *parse, ind_t k, ind_t item, |
925 | | struct jmt_visitor *visitor, int lvl, ind_t caller); |
926 | | |
927 | | static int |
928 | | build_tree(struct jmt_parse *parse, ind_t k, ind_t item, struct lens *lens, |
929 | 0 | struct jmt_visitor *visitor, int lvl) { |
930 | 0 | struct item *x = set_item(parse, k, item); |
931 | 0 | ind_t start = x->links->from_set; |
932 | 0 | ind_t end = k; |
933 | 0 | struct item *old_x = x; |
934 | |
|
935 | 0 | if (start == end) { |
936 | | /* This completion corresponds to a nullable nonterminal |
937 | | * that match epsilon. Reconstruct the full parse tree |
938 | | * for matching epsilon */ |
939 | 0 | if (debugging("cf.jmt.visit")) |
940 | 0 | build_trace("N", x->links->from_set, k, x, lvl); |
941 | 0 | build_nullable(parse, start, visitor, lens, lvl); |
942 | 0 | return end; |
943 | 0 | } |
944 | | |
945 | 0 | ensure(is_complete(x->links), parse); |
946 | |
|
947 | 0 | visit_enter(visitor, lens, start, end, x, lvl); |
948 | 0 | ERR_BAIL(parse); |
949 | | |
950 | | /* x is a completion item. (k, x->to_item) is its first child in the |
951 | | * parse tree. */ |
952 | 0 | if (! is_predict(x->links)) { |
953 | 0 | struct link *lnk = x->links; |
954 | 0 | struct item *sib = set_item(parse, lnk->from_set, lnk->from_item); |
955 | 0 | ind_t caller = sib->state->num; |
956 | |
|
957 | 0 | item = lnk->to_item; |
958 | 0 | x = set_item(parse, k, item); |
959 | 0 | build_children(parse, k, item, visitor, lvl, caller); |
960 | 0 | ERR_BAIL(parse); |
961 | 0 | } |
962 | | |
963 | 0 | visit_exit(visitor, lens, start, end, old_x, lvl); |
964 | 0 | ERR_BAIL(parse); |
965 | 0 | error: |
966 | 0 | return end; |
967 | 0 | } |
968 | | |
969 | | static int |
970 | | build_children(struct jmt_parse *parse, ind_t k, ind_t item, |
971 | 0 | struct jmt_visitor *visitor, int lvl, ind_t caller) { |
972 | 0 | struct item *x = set_item(parse, k, item); |
973 | 0 | struct lens *lens = lens_of_parse(parse, x->links->lens); |
974 | 0 | struct array siblings; |
975 | 0 | ind_t end = k; |
976 | 0 | int r; |
977 | |
|
978 | 0 | array_init(&siblings, sizeof(ind_t)); |
979 | 0 | r = filter_siblings(visitor, lens, k, item, caller, &siblings); |
980 | 0 | if (r < 0) |
981 | 0 | goto error; |
982 | | |
983 | | /* x the first item in a list of siblings; visit items (x->from_set, |
984 | | * x->from_item) in order, which will visit x and its siblings in the |
985 | | * parse tree from right to left */ |
986 | 0 | for (ind_t i = siblings.used - 1; i > 0; i--) { |
987 | 0 | ind_t lnk = *array_elem(siblings, i, ind_t); |
988 | 0 | struct lens *sub = lens_of_parse(parse, x->links[lnk].lens); |
989 | 0 | if (sub->recursive) { |
990 | 0 | build_tree(parse, k, item, sub, visitor, lvl+1); |
991 | 0 | ERR_BAIL(parse); |
992 | 0 | } else { |
993 | 0 | if (debugging("cf.jmt.visit")) |
994 | 0 | build_trace("T", x->links->from_set, k, x, lvl+1); |
995 | 0 | if (visitor->terminal != NULL) { |
996 | 0 | (*visitor->terminal)(sub, |
997 | 0 | x->links->from_set, k, visitor->data); |
998 | 0 | ERR_BAIL(parse); |
999 | 0 | } |
1000 | 0 | } |
1001 | 0 | k = x->links[lnk].from_set; |
1002 | 0 | item = x->links[lnk].from_item; |
1003 | 0 | x = set_item(parse, k, item); |
1004 | 0 | } |
1005 | 0 | error: |
1006 | 0 | array_release(&siblings); |
1007 | 0 | return end; |
1008 | 0 | } |
1009 | | |
1010 | 0 | int jmt_visit(struct jmt_visitor *visitor, size_t *len) { |
1011 | 0 | struct jmt_parse *parse = visitor->parse; |
1012 | 0 | ind_t k = parse->nsets - 1; /* Current Earley set */ |
1013 | 0 | ind_t item; |
1014 | 0 | struct item_set *set = parse->sets[k]; |
1015 | |
|
1016 | 0 | if (set == NULL) |
1017 | 0 | goto noparse; |
1018 | | |
1019 | 0 | for (item = 0; item < set->items.used; item++) { |
1020 | 0 | struct item *x = set_item(parse, k, item); |
1021 | 0 | if (x->parent == 0 && returns(x->state, parse->jmt->lens)) { |
1022 | 0 | for (ind_t i = 0; i < x->nlinks; i++) { |
1023 | 0 | if (is_complete(x->links + i) || is_scan(x->links + i)) { |
1024 | 0 | if (debugging("cf.jmt.visit")) |
1025 | 0 | printf("visit: found (%d, %d) in E_%d\n", |
1026 | 0 | x->state->num, x->parent, k); |
1027 | 0 | goto found; |
1028 | 0 | } |
1029 | 0 | } |
1030 | 0 | } |
1031 | 0 | } |
1032 | 0 | found: |
1033 | 0 | if (item >= parse->sets[k]->items.used) |
1034 | 0 | goto noparse; |
1035 | 0 | struct lens *lens = lens_of_parse(parse, parse->jmt->lens); |
1036 | |
|
1037 | 0 | visit_enter(visitor, lens, 0, k, NULL, 0); |
1038 | 0 | ERR_BAIL(parse); |
1039 | |
|
1040 | 0 | *len = build_children(parse, k, item, visitor, 0, |
1041 | 0 | parse->jmt->start->num); |
1042 | 0 | ERR_BAIL(parse); |
1043 | |
|
1044 | 0 | visit_exit(visitor, lens, 0, k, NULL, 0); |
1045 | 0 | ERR_BAIL(parse); |
1046 | 0 | return 1; |
1047 | 0 | error: |
1048 | 0 | return -1; |
1049 | 0 | noparse: |
1050 | 0 | for (; k > 0; k--) |
1051 | 0 | if (parse->sets[k] != NULL) break; |
1052 | 0 | *len = k; |
1053 | 0 | return 0; |
1054 | 0 | } |
1055 | | |
1056 | | /* |
1057 | | * Build the automaton |
1058 | | */ |
1059 | | |
1060 | | |
1061 | 0 | static struct state *make_state(struct jmt *jmt) { |
1062 | 0 | struct state *s; |
1063 | 0 | int r; |
1064 | |
|
1065 | 0 | r = ALLOC(s); |
1066 | 0 | ERR_NOMEM(r < 0, jmt); |
1067 | 0 | s->num = jmt->state_count++; |
1068 | 0 | array_init(&s->trans, sizeof(struct trans)); |
1069 | 0 | if (jmt->start != NULL) |
1070 | 0 | list_cons(jmt->start->next, s); |
1071 | 0 | else |
1072 | 0 | jmt->start = s; |
1073 | 0 | return s; |
1074 | 0 | error: |
1075 | 0 | return NULL; |
1076 | 0 | } |
1077 | | |
1078 | 0 | static ind_t add_lens(struct jmt *jmt, struct lens *lens) { |
1079 | 0 | int r; |
1080 | 0 | ind_t l; |
1081 | 0 | struct state *sA = NULL; |
1082 | 0 | int nullable = 0; |
1083 | |
|
1084 | 0 | r = array_add(&jmt->lenses, &l); |
1085 | 0 | ERR_NOMEM(r < 0, jmt); |
1086 | 0 | ERR_NOMEM(l == IND_MAX, jmt); |
1087 | |
|
1088 | 0 | if (! lens->recursive) |
1089 | 0 | nullable = regexp_matches_empty(lens->ctype); |
1090 | |
|
1091 | 0 | array_elem(jmt->lenses, l, struct jmt_lens)->lens = lens; |
1092 | | /* A nonrecursive lens that matches epsilon is both a terminal |
1093 | | * and a nonterminal */ |
1094 | 0 | if (lens->recursive || nullable) { |
1095 | 0 | sA = make_state(jmt); |
1096 | 0 | ERR_NOMEM(sA == NULL, jmt); |
1097 | 0 | array_elem(jmt->lenses, l, struct jmt_lens)->state = sA; |
1098 | 0 | if (! lens->recursive) { |
1099 | | /* Add lens again, so that l refers to the nonterminal T |
1100 | | * for the lens, and l+1 refers to the terminal t for it */ |
1101 | 0 | ind_t m; |
1102 | 0 | r = array_add(&jmt->lenses, &m); |
1103 | 0 | ERR_NOMEM(r < 0, jmt); |
1104 | 0 | ERR_NOMEM(m == IND_MAX, jmt); |
1105 | |
|
1106 | 0 | array_elem(jmt->lenses, m, struct jmt_lens)->lens = lens; |
1107 | 0 | } |
1108 | 0 | } |
1109 | | |
1110 | 0 | if (debugging("cf.jmt")) { |
1111 | 0 | if (sA == NULL) { |
1112 | 0 | char *s = format_lens(lens); |
1113 | 0 | printf("add_lens: "); |
1114 | 0 | print_regexp(stdout, lens->ctype); |
1115 | 0 | printf(" %s\n", s); |
1116 | 0 | free(s); |
1117 | 0 | } else { |
1118 | 0 | char *s = format_lens(lens); |
1119 | 0 | printf("add_lens: "); |
1120 | 0 | flens(stdout, l); |
1121 | 0 | printf(" %u %s\n", sA->num, s); |
1122 | 0 | if (nullable) { |
1123 | 0 | printf("add_lens: // %s\n", s); |
1124 | 0 | } |
1125 | 0 | free(s); |
1126 | 0 | } |
1127 | 0 | } |
1128 | |
|
1129 | 0 | return l; |
1130 | 0 | error: |
1131 | 0 | return IND_MAX; |
1132 | 0 | } |
1133 | | |
1134 | | static struct trans * |
1135 | | add_new_trans(struct jmt *jmt, |
1136 | 0 | struct state *from, struct state *to, ind_t lens) { |
1137 | 0 | struct trans *t; |
1138 | 0 | ind_t i; |
1139 | 0 | int r; |
1140 | |
|
1141 | 0 | if (from == NULL || to == NULL) |
1142 | 0 | return NULL; |
1143 | | |
1144 | 0 | r = array_add(&from->trans, &i); |
1145 | 0 | ERR_NOMEM(r < 0, jmt); |
1146 | 0 | t = array_elem(from->trans, i, struct trans); |
1147 | 0 | t->to = to; |
1148 | 0 | t->lens = lens; |
1149 | 0 | return t; |
1150 | 0 | error: |
1151 | 0 | return NULL; |
1152 | 0 | } |
1153 | | |
1154 | | static struct trans * |
1155 | 0 | add_eps_trans(struct jmt *jmt, struct state *from, struct state *to) { |
1156 | 0 | return add_new_trans(jmt, from, to, EPS); |
1157 | 0 | } |
1158 | | |
1159 | 0 | static struct lens *lens_of_jmt(struct jmt *jmt, ind_t l) { |
1160 | 0 | return array_elem(jmt->lenses, l, struct jmt_lens)->lens; |
1161 | 0 | } |
1162 | | |
1163 | 0 | static ind_t lens_index(struct jmt *jmt, struct lens *lens) { |
1164 | 0 | array_for_each(i, jmt->lenses) |
1165 | 0 | if (lens_of_jmt(jmt, i) == lens) |
1166 | 0 | return i; |
1167 | 0 | return IND_MAX; |
1168 | 0 | } |
1169 | | |
1170 | 0 | static struct state *lens_state(struct jmt *jmt, ind_t l) { |
1171 | 0 | return array_elem(jmt->lenses, l, struct jmt_lens)->state; |
1172 | 0 | } |
1173 | | |
1174 | 0 | static void print_lens_symbol(FILE *fp, struct jmt *jmt, struct lens *lens) { |
1175 | 0 | ind_t l = lens_index(jmt, lens); |
1176 | 0 | struct state *sA = lens_state(jmt, l); |
1177 | |
|
1178 | 0 | if (sA == NULL) |
1179 | 0 | print_regexp(fp, lens->ctype); |
1180 | 0 | else |
1181 | 0 | flens(fp, l); |
1182 | 0 | } |
1183 | | |
1184 | 0 | static void print_grammar(struct jmt *jmt, struct lens *lens) { |
1185 | 0 | ind_t l = lens_index(jmt, lens); |
1186 | 0 | struct state *sA = lens_state(jmt, l); |
1187 | |
|
1188 | 0 | if (sA == NULL || (lens->tag == L_REC && lens->rec_internal)) |
1189 | 0 | return; |
1190 | | |
1191 | 0 | printf(" "); |
1192 | 0 | print_lens_symbol(stdout, jmt, lens); |
1193 | 0 | printf(" := "); |
1194 | |
|
1195 | 0 | if (! lens->recursive) { |
1196 | | /* Nullable regexps */ |
1197 | 0 | print_regexp(stdout, lens->ctype); |
1198 | 0 | printf("\n"); |
1199 | 0 | return; |
1200 | 0 | } |
1201 | | |
1202 | 0 | switch (lens->tag) { |
1203 | 0 | case L_CONCAT: |
1204 | 0 | print_lens_symbol(stdout, jmt, lens->children[0]); |
1205 | 0 | for (int i=1; i < lens->nchildren; i++) { |
1206 | 0 | printf(" . "); |
1207 | 0 | print_lens_symbol(stdout, jmt, lens->children[i]); |
1208 | 0 | } |
1209 | 0 | printf("\n"); |
1210 | 0 | for (int i=0; i < lens->nchildren; i++) |
1211 | 0 | print_grammar(jmt, lens->children[i]); |
1212 | 0 | break; |
1213 | 0 | case L_UNION: |
1214 | 0 | print_lens_symbol(stdout, jmt, lens->children[0]); |
1215 | 0 | for (int i=1; i < lens->nchildren; i++) { |
1216 | 0 | printf(" | "); |
1217 | 0 | print_lens_symbol(stdout, jmt, lens->children[i]); |
1218 | 0 | } |
1219 | 0 | printf("\n"); |
1220 | 0 | for (int i=0; i < lens->nchildren; i++) |
1221 | 0 | print_grammar(jmt, lens->children[i]); |
1222 | 0 | break; |
1223 | 0 | case L_SUBTREE: |
1224 | 0 | print_lens_symbol(stdout, jmt, lens->child); |
1225 | 0 | printf("\n"); |
1226 | 0 | print_grammar(jmt, lens->child); |
1227 | 0 | break; |
1228 | 0 | case L_STAR: |
1229 | 0 | print_lens_symbol(stdout, jmt, lens->child); |
1230 | 0 | printf("*\n"); |
1231 | 0 | print_grammar(jmt, lens->child); |
1232 | 0 | break; |
1233 | 0 | case L_MAYBE: |
1234 | 0 | print_lens_symbol(stdout, jmt, lens->child); |
1235 | 0 | printf("?\n"); |
1236 | 0 | print_grammar(jmt, lens->child); |
1237 | 0 | break; |
1238 | 0 | case L_REC: |
1239 | 0 | print_lens_symbol(stdout, jmt, lens->body); |
1240 | 0 | printf("\n"); |
1241 | 0 | print_grammar(jmt, lens->body); |
1242 | 0 | break; |
1243 | 0 | case L_SQUARE: |
1244 | 0 | print_lens_symbol(stdout, jmt, lens->child); |
1245 | 0 | printf("\n"); |
1246 | 0 | print_grammar(jmt, lens->child); |
1247 | 0 | break; |
1248 | 0 | default: |
1249 | 0 | BUG_ON(true, jmt, "Unexpected lens tag %d", lens->tag); |
1250 | 0 | break; |
1251 | 0 | } |
1252 | 0 | error: |
1253 | 0 | return; |
1254 | 0 | } |
1255 | | |
1256 | 0 | static void print_grammar_top(struct jmt *jmt, struct lens *lens) { |
1257 | 0 | printf("Grammar:\n"); |
1258 | 0 | print_grammar(jmt, lens); |
1259 | 0 | if (lens->tag == L_REC) { |
1260 | 0 | printf(" "); |
1261 | 0 | print_lens_symbol(stdout, jmt, lens->alias); |
1262 | 0 | printf(" := "); |
1263 | 0 | print_lens_symbol(stdout, jmt, lens->alias->body); |
1264 | 0 | printf("\n"); |
1265 | 0 | } |
1266 | 0 | } |
1267 | | |
1268 | 0 | static void index_lenses(struct jmt *jmt, struct lens *lens) { |
1269 | 0 | ind_t l; |
1270 | |
|
1271 | 0 | l = lens_index(jmt, lens); |
1272 | 0 | if (l == IND_MAX) { |
1273 | 0 | l = add_lens(jmt, lens); |
1274 | 0 | ERR_BAIL(jmt); |
1275 | 0 | } |
1276 | | |
1277 | 0 | if (! lens->recursive) |
1278 | 0 | return; |
1279 | | |
1280 | 0 | switch (lens->tag) { |
1281 | 0 | case L_CONCAT: |
1282 | 0 | case L_UNION: |
1283 | 0 | for (int i=0; i < lens->nchildren; i++) |
1284 | 0 | index_lenses(jmt, lens->children[i]); |
1285 | 0 | break; |
1286 | 0 | case L_SUBTREE: |
1287 | 0 | case L_STAR: |
1288 | 0 | case L_MAYBE: |
1289 | 0 | case L_SQUARE: |
1290 | 0 | index_lenses(jmt, lens->child); |
1291 | 0 | break; |
1292 | 0 | case L_REC: |
1293 | 0 | if (! lens->rec_internal) |
1294 | 0 | index_lenses(jmt, lens->body); |
1295 | 0 | break; |
1296 | 0 | default: |
1297 | 0 | BUG_ON(true, jmt, "Unexpected lens tag %d", lens->tag); |
1298 | 0 | break; |
1299 | 0 | } |
1300 | 0 | error: |
1301 | 0 | return; |
1302 | 0 | } |
1303 | | |
1304 | | static void thompson(struct jmt *jmt, struct lens *lens, |
1305 | 0 | struct state **s, struct state **f) { |
1306 | 0 | ind_t l = lens_index(jmt, lens); |
1307 | 0 | struct state *sA = lens_state(jmt, l); |
1308 | 0 | ensure(l < jmt->lenses.used, jmt); |
1309 | |
|
1310 | 0 | *s = make_state(jmt); |
1311 | 0 | *f = make_state(jmt); |
1312 | 0 | ERR_BAIL(jmt); |
1313 | |
|
1314 | 0 | if (lens->recursive) { |
1315 | | /* A nonterminal */ |
1316 | 0 | add_new_trans(jmt, *s, *f, l); |
1317 | 0 | add_new_trans(jmt, *s, sA, CALL); |
1318 | 0 | } else if (sA == NULL) { |
1319 | | /* A terminal that never matches epsilon */ |
1320 | 0 | add_new_trans(jmt, *s, *f, l); |
1321 | 0 | } else { |
1322 | | /* A terminal that matches epsilon */ |
1323 | 0 | add_new_trans(jmt, *s, *f, l); |
1324 | 0 | add_new_trans(jmt, *s, sA, CALL); |
1325 | 0 | add_new_trans(jmt, *s, *f, l+1); |
1326 | 0 | } |
1327 | 0 | error: |
1328 | 0 | return; |
1329 | 0 | } |
1330 | | |
1331 | | static void conv(struct jmt *jmt, struct lens *lens, |
1332 | | struct state **s, struct state **e, |
1333 | 0 | struct state **f) { |
1334 | 0 | ind_t l = lens_index(jmt, lens); |
1335 | 0 | ensure(l < jmt->lenses.used, jmt); |
1336 | 0 | struct state *sA = lens_state(jmt, l); |
1337 | |
|
1338 | 0 | *s = NULL; |
1339 | 0 | *e = NULL; |
1340 | 0 | *f = NULL; |
1341 | |
|
1342 | 0 | if (lens->recursive) { |
1343 | | /* A nonterminal */ |
1344 | 0 | *s = make_state(jmt); |
1345 | 0 | *f = make_state(jmt); |
1346 | 0 | ERR_BAIL(jmt); |
1347 | 0 | add_new_trans(jmt, *s, *f, l); |
1348 | 0 | ERR_BAIL(jmt); |
1349 | 0 | ensure(sA != NULL, jmt); |
1350 | 0 | add_new_trans(jmt, *s, sA, EPS); |
1351 | 0 | ERR_BAIL(jmt); |
1352 | 0 | } else if (sA == NULL) { |
1353 | | /* A terminal that never matches epsilon */ |
1354 | 0 | *s = make_state(jmt); |
1355 | 0 | *f = make_state(jmt); |
1356 | 0 | ERR_BAIL(jmt); |
1357 | 0 | add_new_trans(jmt, *s, *f, l); |
1358 | 0 | ERR_BAIL(jmt); |
1359 | 0 | } else { |
1360 | | /* A terminal that matches epsilon */ |
1361 | 0 | *s = make_state(jmt); |
1362 | 0 | *f = make_state(jmt); |
1363 | 0 | ERR_BAIL(jmt); |
1364 | 0 | add_new_trans(jmt, *s, *f, l); |
1365 | 0 | add_new_trans(jmt, *s, *f, l+1); |
1366 | 0 | add_new_trans(jmt, *s, sA, EPS); |
1367 | 0 | ERR_BAIL(jmt); |
1368 | 0 | } |
1369 | 0 | error: |
1370 | 0 | return; |
1371 | 0 | } |
1372 | | |
1373 | | static void conv_concat(struct jmt *jmt, struct lens *lens, |
1374 | | struct state **s, struct state **e, |
1375 | 0 | struct state **f) { |
1376 | 0 | struct state *s2, *f2, *e2; |
1377 | |
|
1378 | 0 | conv(jmt, lens->children[0], &s2, &e2, &f2); |
1379 | 0 | *s = make_state(jmt); |
1380 | 0 | add_new_trans(jmt, *s, s2, EPS); |
1381 | |
|
1382 | 0 | for (int i=1; i < lens->nchildren; i++) { |
1383 | 0 | struct state *s3, *e3, *f3, *scall, *fcall; |
1384 | 0 | conv(jmt, lens->children[i], &s3, &e3, &f3); |
1385 | 0 | thompson(jmt, lens->children[i], &scall, &fcall); |
1386 | 0 | ERR_BAIL(jmt); |
1387 | 0 | add_eps_trans(jmt, f2, scall); |
1388 | 0 | add_eps_trans(jmt, e2, s3); |
1389 | 0 | *f = make_state(jmt); |
1390 | 0 | add_eps_trans(jmt, f3, *f); |
1391 | 0 | add_eps_trans(jmt, fcall, *f); |
1392 | 0 | *e = make_state(jmt); |
1393 | 0 | add_eps_trans(jmt, e3, *e); |
1394 | 0 | f2 = *f; |
1395 | 0 | e2 = *e; |
1396 | 0 | } |
1397 | 0 | error: |
1398 | 0 | return; |
1399 | 0 | } |
1400 | | |
1401 | | static void conv_union(struct jmt *jmt, struct lens *lens, |
1402 | | struct state **s, struct state **e, |
1403 | 0 | struct state **f) { |
1404 | |
|
1405 | 0 | *s = make_state(jmt); |
1406 | 0 | *e = make_state(jmt); |
1407 | 0 | *f = make_state(jmt); |
1408 | 0 | ERR_BAIL(jmt); |
1409 | |
|
1410 | 0 | for (int i = 0; i < lens->nchildren; i++) { |
1411 | 0 | struct state *s2, *e2, *f2; |
1412 | |
|
1413 | 0 | conv(jmt, lens->children[i], &s2, &e2, &f2); |
1414 | 0 | ERR_BAIL(jmt); |
1415 | |
|
1416 | 0 | add_eps_trans(jmt, *s, s2); |
1417 | 0 | add_eps_trans(jmt, e2, *e); |
1418 | 0 | add_eps_trans(jmt, f2, *f); |
1419 | 0 | } |
1420 | | |
1421 | 0 | error: |
1422 | 0 | return; |
1423 | 0 | } |
1424 | | |
1425 | | static void conv_star(struct jmt *jmt, struct lens *lens, |
1426 | | struct state **s, struct state **e, |
1427 | 0 | struct state **f) { |
1428 | |
|
1429 | 0 | *s = make_state(jmt); |
1430 | 0 | *e = make_state(jmt); |
1431 | 0 | *f = make_state(jmt); |
1432 | 0 | ERR_BAIL(jmt); |
1433 | |
|
1434 | 0 | struct state *si, *ei, *fi, *scall, *fcall; |
1435 | 0 | conv(jmt, lens->child, &si, &ei, &fi); |
1436 | 0 | thompson(jmt, lens->child, &scall, &fcall); |
1437 | 0 | ERR_BAIL(jmt); |
1438 | |
|
1439 | 0 | add_eps_trans(jmt, *s, si); |
1440 | 0 | add_eps_trans(jmt, ei, si); |
1441 | 0 | add_eps_trans(jmt, *s, *e); |
1442 | 0 | add_eps_trans(jmt, ei, *e); |
1443 | 0 | add_eps_trans(jmt, fi, scall); |
1444 | 0 | add_eps_trans(jmt, fcall, scall); |
1445 | 0 | add_eps_trans(jmt, fi, *f); |
1446 | 0 | add_eps_trans(jmt, fcall, *f); |
1447 | 0 | ERR_BAIL(jmt); |
1448 | |
|
1449 | 0 | error: |
1450 | 0 | return; |
1451 | 0 | } |
1452 | | |
1453 | 0 | static void conv_rhs(struct jmt *jmt, ind_t l) { |
1454 | 0 | struct lens *lens = lens_of_jmt(jmt, l); |
1455 | 0 | struct state *s = NULL, *e = NULL, *f = NULL; |
1456 | 0 | struct state *sA = lens_state(jmt, l); |
1457 | |
|
1458 | 0 | if (! lens->recursive) { |
1459 | | /* Nothing to do for terminals that do not match epsilon */ |
1460 | 0 | if (sA != NULL) |
1461 | 0 | state_add_return(jmt, sA, l); |
1462 | 0 | return; |
1463 | 0 | } |
1464 | | |
1465 | | /* All other nonterminals/recursive lenses */ |
1466 | | |
1467 | | /* Maintain P1 */ |
1468 | 0 | if (lens->ctype_nullable) |
1469 | 0 | state_add_return(jmt, sA, l); |
1470 | |
|
1471 | 0 | switch (lens->tag) { |
1472 | 0 | case L_REC: |
1473 | 0 | conv(jmt, lens->body, &s, &e, &f); |
1474 | 0 | break; |
1475 | 0 | case L_CONCAT: |
1476 | 0 | conv_concat(jmt, lens, &s, &e, &f); |
1477 | 0 | break; |
1478 | 0 | case L_UNION: |
1479 | 0 | conv_union(jmt, lens, &s, &e, &f); |
1480 | 0 | break; |
1481 | 0 | case L_SUBTREE: |
1482 | 0 | conv(jmt, lens->child, &s, &e, &f); |
1483 | 0 | break; |
1484 | 0 | case L_STAR: |
1485 | 0 | conv_star(jmt, lens, &s, &e, &f); |
1486 | 0 | break; |
1487 | 0 | case L_MAYBE: |
1488 | 0 | conv(jmt, lens->child, &s, &e, &f); |
1489 | 0 | add_new_trans(jmt, s, e, EPS); |
1490 | 0 | break; |
1491 | 0 | case L_SQUARE: |
1492 | 0 | conv(jmt, lens->child, &s, &e, &f); |
1493 | 0 | break; |
1494 | 0 | default: |
1495 | 0 | BUG_ON(true, jmt, "Unexpected lens tag %d", lens->tag); |
1496 | 0 | } |
1497 | | |
1498 | 0 | ensure(sA != NULL, jmt); |
1499 | |
|
1500 | 0 | add_eps_trans(jmt, sA, s); |
1501 | 0 | state_add_return(jmt, e, l); |
1502 | 0 | state_add_return(jmt, f, l); |
1503 | |
|
1504 | 0 | error: |
1505 | 0 | return; |
1506 | 0 | } |
1507 | | |
1508 | | ATTRIBUTE_RETURN_CHECK |
1509 | 0 | static int push_state(struct array *worklist, struct state *s) { |
1510 | 0 | int r; |
1511 | 0 | ind_t ind; |
1512 | |
|
1513 | 0 | r = array_add(worklist, &ind); |
1514 | 0 | if (r < 0) |
1515 | 0 | return -1; |
1516 | 0 | *array_elem(*worklist, ind, struct state *) = s; |
1517 | 0 | return 0; |
1518 | 0 | } |
1519 | | |
1520 | 0 | static struct state *pop_state(struct array *worklist) { |
1521 | 0 | if (worklist->used > 0) { |
1522 | 0 | worklist->used -= 1; |
1523 | 0 | return *array_elem(*worklist, worklist->used, struct state *); |
1524 | 0 | } else { |
1525 | 0 | return NULL; |
1526 | 0 | } |
1527 | 0 | } |
1528 | | |
1529 | 0 | static void free_state(struct state *s) { |
1530 | 0 | if (s == NULL) |
1531 | 0 | return; |
1532 | 0 | free(s->ret); |
1533 | 0 | array_release(&s->trans); |
1534 | 0 | free(s); |
1535 | 0 | } |
1536 | | |
1537 | 0 | static void collect(struct jmt *jmt) { |
1538 | 0 | struct array worklist; |
1539 | 0 | size_t count, removed; |
1540 | 0 | int r; |
1541 | |
|
1542 | 0 | count = 0; |
1543 | 0 | list_for_each(s, jmt->start) { |
1544 | 0 | s->live = 0; |
1545 | 0 | s->reachable = 0; |
1546 | 0 | count += 1; |
1547 | 0 | } |
1548 | |
|
1549 | 0 | array_init(&worklist, sizeof(struct state *)); |
1550 | 0 | jmt->start->reachable = 1; |
1551 | 0 | for (struct state *s = jmt->start; |
1552 | 0 | s != NULL; |
1553 | 0 | s = pop_state(&worklist)) { |
1554 | 0 | for_each_trans(t, s) { |
1555 | 0 | if (! t->to->reachable) { |
1556 | 0 | t->to->reachable = 1; |
1557 | 0 | r = push_state(&worklist, t->to); |
1558 | 0 | ERR_NOMEM(r < 0, jmt); |
1559 | 0 | } |
1560 | 0 | } |
1561 | 0 | } |
1562 | | |
1563 | 0 | list_for_each(s, jmt->start) |
1564 | 0 | if (s->reachable && is_return(s)) |
1565 | 0 | s->live = 1; |
1566 | |
|
1567 | 0 | bool changed; |
1568 | 0 | do { |
1569 | 0 | changed = false; |
1570 | 0 | list_for_each(s, jmt->start) { |
1571 | 0 | if (! s->live && s->reachable) { |
1572 | 0 | for_each_trans(t, s) { |
1573 | 0 | if (t->lens != CALL && t->to->live) { |
1574 | 0 | s->live = 1; |
1575 | 0 | changed = true; |
1576 | 0 | break; |
1577 | 0 | } |
1578 | 0 | } |
1579 | 0 | } |
1580 | 0 | } |
1581 | 0 | } while (changed); |
1582 | |
|
1583 | 0 | list_for_each(s, jmt->start) { |
1584 | 0 | if (s->live && s->reachable) { |
1585 | 0 | for (ind_t i = 0; i < s->trans.used; ) { |
1586 | 0 | struct trans *t = state_trans(s, i); |
1587 | 0 | if (! (t->to->live && t->to->reachable)) |
1588 | 0 | array_remove(&s->trans, i); |
1589 | 0 | else |
1590 | 0 | i += 1; |
1591 | 0 | } |
1592 | 0 | } |
1593 | 0 | } |
1594 | |
|
1595 | 0 | removed = 0; |
1596 | 0 | for (struct state *s = jmt->start; |
1597 | 0 | s->next != NULL; ) { |
1598 | 0 | struct state *p = s->next; |
1599 | 0 | if (p->live && p->reachable) { |
1600 | 0 | s = p; |
1601 | 0 | } else { |
1602 | 0 | s->next = p->next; |
1603 | 0 | free_state(p); |
1604 | 0 | removed += 1; |
1605 | 0 | } |
1606 | 0 | } |
1607 | |
|
1608 | 0 | error: |
1609 | 0 | array_release(&worklist); |
1610 | 0 | return; |
1611 | 0 | } |
1612 | | |
1613 | 0 | static void dedup(struct state *s) { |
1614 | 0 | array_for_each(i, s->trans) { |
1615 | 0 | struct trans *t = state_trans(s, i); |
1616 | 0 | for (ind_t j = i+1; j < s->trans.used;) { |
1617 | 0 | struct trans *u = state_trans(s, j); |
1618 | 0 | if (t->to == u->to && t->lens == u->lens) |
1619 | 0 | array_remove(&s->trans, j); |
1620 | 0 | else |
1621 | 0 | j += 1; |
1622 | 0 | } |
1623 | 0 | } |
1624 | 0 | } |
1625 | | |
1626 | 0 | static void unepsilon(struct jmt *jmt) { |
1627 | 0 | int r; |
1628 | |
|
1629 | 0 | if (debugging("cf.jmt.build")) |
1630 | 0 | jmt_dot(jmt, "jmt_10_raw.dot"); |
1631 | 0 | collect(jmt); |
1632 | | |
1633 | | /* Get rid of epsilon transitions */ |
1634 | 0 | bool changed; |
1635 | 0 | do { |
1636 | 0 | changed = false; |
1637 | 0 | list_for_each(s, jmt->start) { |
1638 | 0 | array_for_each(i , s->trans) { |
1639 | 0 | struct trans *t = state_trans(s, i); |
1640 | 0 | if (t->lens == EPS) { |
1641 | 0 | struct state *to = t->to; |
1642 | 0 | array_remove(&s->trans, i); |
1643 | 0 | r = array_join(&s->trans, &to->trans); |
1644 | 0 | ERR_NOMEM(r < 0, jmt); |
1645 | 0 | state_merge_returns(jmt, s, to); |
1646 | 0 | dedup(s); |
1647 | 0 | changed = true; |
1648 | 0 | } |
1649 | 0 | } |
1650 | 0 | } |
1651 | 0 | } while (changed); |
1652 | | |
1653 | 0 | collect(jmt); |
1654 | 0 | if (debugging("cf.jmt.build")) |
1655 | 0 | jmt_dot(jmt, "jmt_20_uneps.dot"); |
1656 | 0 | error: |
1657 | 0 | return; |
1658 | 0 | } |
1659 | | |
1660 | 0 | static bool is_deterministic(struct jmt *jmt) { |
1661 | 0 | list_for_each(s, jmt->start) { |
1662 | 0 | array_for_each(i, s->trans) { |
1663 | 0 | struct trans *t = state_trans(s, i); |
1664 | 0 | for (ind_t j = i+1; j < s->trans.used; j++) { |
1665 | 0 | struct trans *u = state_trans(s, j); |
1666 | 0 | if (t->lens == u->lens) |
1667 | 0 | return false; |
1668 | 0 | } |
1669 | 0 | } |
1670 | 0 | } |
1671 | 0 | return true; |
1672 | 0 | } |
1673 | | |
1674 | 0 | static void free_nfa_state(struct nfa_state *s) { |
1675 | 0 | if (s == NULL) |
1676 | 0 | return; |
1677 | 0 | array_release(&s->set); |
1678 | 0 | free(s); |
1679 | 0 | } |
1680 | | |
1681 | 0 | static struct nfa_state *make_nfa_state(struct jmt *jmt) { |
1682 | 0 | struct nfa_state *result = NULL; |
1683 | 0 | int r; |
1684 | |
|
1685 | 0 | r = ALLOC(result); |
1686 | 0 | ERR_NOMEM(r < 0, jmt); |
1687 | |
|
1688 | 0 | array_init(&result->set, sizeof(struct state *)); |
1689 | |
|
1690 | 0 | return result; |
1691 | 0 | error: |
1692 | 0 | FREE(result); |
1693 | 0 | return NULL; |
1694 | 0 | } |
1695 | | |
1696 | | static ind_t nfa_state_add(struct jmt *jmt, struct nfa_state *nfa, |
1697 | 0 | struct state *s) { |
1698 | 0 | ind_t i; |
1699 | 0 | int r; |
1700 | |
|
1701 | 0 | array_for_each(j, nfa->set) { |
1702 | 0 | struct state *q = *array_elem(nfa->set, j, struct state *); |
1703 | 0 | if (q == s) |
1704 | 0 | return j; |
1705 | 0 | } |
1706 | | |
1707 | | /* Keep the list of states sorted */ |
1708 | 0 | i = nfa->set.used; |
1709 | 0 | for (int j=0; j + 1 < nfa->set.used; j++) { |
1710 | 0 | if (s < *array_elem(nfa->set, j, struct state *)) { |
1711 | 0 | i = j; |
1712 | 0 | break; |
1713 | 0 | } |
1714 | 0 | } |
1715 | 0 | r = array_insert(&nfa->set, i); |
1716 | 0 | ERR_NOMEM(r < 0, jmt); |
1717 | |
|
1718 | 0 | *array_elem(nfa->set, i, struct state *) = s; |
1719 | 0 | return i; |
1720 | 0 | error: |
1721 | 0 | return IND_MAX; |
1722 | 0 | } |
1723 | | |
1724 | 0 | static bool nfa_same_set(struct nfa_state *s1, struct nfa_state *s2) { |
1725 | 0 | if (s1->set.used != s2->set.used) |
1726 | 0 | return false; |
1727 | | |
1728 | 0 | array_for_each(i, s1->set) { |
1729 | 0 | struct state *q1 = *array_elem(s1->set, i, struct state *); |
1730 | 0 | struct state *q2 = *array_elem(s2->set, i, struct state *); |
1731 | 0 | if (q1 != q2) |
1732 | 0 | return false; |
1733 | 0 | } |
1734 | | |
1735 | 0 | return true; |
1736 | 0 | } |
1737 | | |
1738 | | static struct nfa_state *nfa_uniq(struct jmt *jmt, struct array *newstates, |
1739 | 0 | struct nfa_state *s) { |
1740 | 0 | ind_t ind; |
1741 | 0 | int r; |
1742 | |
|
1743 | 0 | array_each_elem(q, *newstates, struct nfa_state *) { |
1744 | 0 | if (nfa_same_set(s, *q)) { |
1745 | 0 | if (s == *q) |
1746 | 0 | return s; |
1747 | 0 | free_nfa_state(s); |
1748 | 0 | return *q; |
1749 | 0 | } |
1750 | 0 | } |
1751 | | |
1752 | 0 | r = array_add(newstates, &ind); |
1753 | 0 | ERR_NOMEM(r < 0, jmt); |
1754 | 0 | *array_elem(*newstates, ind, struct nfa_state *) = s; |
1755 | |
|
1756 | 0 | if (s->state == NULL) { |
1757 | 0 | s->state = make_state(jmt); |
1758 | 0 | ERR_BAIL(jmt); |
1759 | 0 | } |
1760 | | |
1761 | | /* This makes looking at pictures easier */ |
1762 | 0 | if (s->set.used == 1) |
1763 | 0 | s->state->num = (*array_elem(s->set, 0, struct state *))->num; |
1764 | |
|
1765 | 0 | return s; |
1766 | | |
1767 | 0 | error: |
1768 | 0 | return NULL; |
1769 | 0 | } |
1770 | | |
1771 | | static void det_target(struct jmt *jmt, struct array *newstates, |
1772 | 0 | struct nfa_state *nfas, ind_t l) { |
1773 | 0 | struct nfa_state *to = NULL; |
1774 | |
|
1775 | 0 | array_each_elem(s, nfas->set, struct state *) { |
1776 | 0 | for_each_trans(t, *s) { |
1777 | 0 | if (t->lens == l) { |
1778 | 0 | if (to == NULL) { |
1779 | 0 | to = make_nfa_state(jmt); |
1780 | 0 | ERR_RET(jmt); |
1781 | 0 | } |
1782 | 0 | nfa_state_add(jmt, to, t->to); |
1783 | 0 | ERR_RET(jmt); |
1784 | 0 | } |
1785 | 0 | } |
1786 | 0 | } |
1787 | | |
1788 | 0 | if (to != NULL) { |
1789 | 0 | to = nfa_uniq(jmt, newstates, to); |
1790 | 0 | ERR_RET(jmt); |
1791 | 0 | add_new_trans(jmt, nfas->state, to->state, l); |
1792 | 0 | } |
1793 | 0 | } |
1794 | | |
1795 | 0 | static void determinize(struct jmt *jmt) { |
1796 | 0 | struct nfa_state *ini = NULL; |
1797 | 0 | struct array *newstates = NULL; |
1798 | 0 | int r; |
1799 | 0 | ind_t ind, nlenses; |
1800 | |
|
1801 | 0 | if (is_deterministic(jmt)) |
1802 | 0 | return; |
1803 | | |
1804 | 0 | r = ALLOC(newstates); |
1805 | 0 | ERR_NOMEM(r < 0, jmt); |
1806 | 0 | array_init(newstates, sizeof(struct nfa_state *)); |
1807 | |
|
1808 | 0 | nlenses = jmt->lenses.used; |
1809 | | |
1810 | | /* The initial state consists of just the start state */ |
1811 | 0 | ini = make_nfa_state(jmt); |
1812 | 0 | ERR_BAIL(jmt); |
1813 | 0 | nfa_state_add(jmt, ini, jmt->start); |
1814 | 0 | ERR_BAIL(jmt); |
1815 | | |
1816 | | /* Make a new initial state */ |
1817 | 0 | ini->state = make_state(jmt); |
1818 | 0 | ini->state->num = jmt->start->num; /* Makes lokking at pictures easier */ |
1819 | 0 | ERR_BAIL(jmt); |
1820 | 0 | jmt->start->next = ini->state->next; |
1821 | 0 | ini->state->next = jmt->start; |
1822 | 0 | jmt->start = ini->state; |
1823 | |
|
1824 | 0 | r = array_add(newstates, &ind); |
1825 | 0 | ERR_NOMEM(r < 0, jmt); |
1826 | |
|
1827 | 0 | *array_elem(*newstates, ind, struct nfa_state *) = ini; |
1828 | 0 | ini = NULL; |
1829 | |
|
1830 | 0 | for (ind_t i = 0; i < newstates->used; i++) { |
1831 | 0 | struct nfa_state *nfas = *array_elem(*newstates, i, struct nfa_state *); |
1832 | |
|
1833 | 0 | for (int j = 0; j < nfas->set.used; j++) { |
1834 | 0 | struct state *s = *array_elem(nfas->set, j, struct state *); |
1835 | 0 | state_merge_returns(jmt, nfas->state, s); |
1836 | 0 | } |
1837 | |
|
1838 | 0 | for (ind_t l = 0; l < nlenses; l++) { |
1839 | 0 | det_target(jmt, newstates, nfas, l); |
1840 | 0 | ERR_BAIL(jmt); |
1841 | 0 | } |
1842 | 0 | det_target(jmt, newstates, nfas, CALL); |
1843 | 0 | ERR_BAIL(jmt); |
1844 | 0 | } |
1845 | | |
1846 | 0 | collect(jmt); |
1847 | |
|
1848 | 0 | done: |
1849 | 0 | if (newstates) { |
1850 | 0 | array_each_elem(s, *newstates, struct nfa_state *) { |
1851 | 0 | free_nfa_state(*s); |
1852 | 0 | } |
1853 | 0 | array_release(newstates); |
1854 | 0 | FREE(newstates); |
1855 | 0 | } |
1856 | 0 | free_nfa_state(ini); |
1857 | 0 | return; |
1858 | 0 | error: |
1859 | 0 | goto done; |
1860 | 0 | } |
1861 | | |
1862 | 0 | struct jmt *jmt_build(struct lens *lens) { |
1863 | 0 | struct jmt *jmt = NULL; |
1864 | 0 | int r; |
1865 | |
|
1866 | 0 | r = ALLOC(jmt); |
1867 | 0 | ERR_NOMEM(r < 0, lens->info); |
1868 | |
|
1869 | 0 | jmt->error = lens->info->error; |
1870 | 0 | array_init(&jmt->lenses, sizeof(struct jmt_lens)); |
1871 | |
|
1872 | 0 | index_lenses(jmt, lens); |
1873 | |
|
1874 | 0 | if (debugging("cf.jmt")) |
1875 | 0 | print_grammar_top(jmt, lens); |
1876 | |
|
1877 | 0 | for (ind_t i=0; i < jmt->lenses.used; i++) { |
1878 | 0 | conv_rhs(jmt, i); |
1879 | 0 | ERR_BAIL(jmt); |
1880 | 0 | } |
1881 | | |
1882 | 0 | unepsilon(jmt); |
1883 | 0 | ERR_BAIL(jmt); |
1884 | |
|
1885 | 0 | determinize(jmt); |
1886 | 0 | ERR_BAIL(jmt); |
1887 | |
|
1888 | 0 | if (debugging("cf.jmt.build")) |
1889 | 0 | jmt_dot(jmt, "jmt_30_dfa.dot"); |
1890 | |
|
1891 | 0 | return jmt; |
1892 | 0 | error: |
1893 | 0 | jmt_free(jmt); |
1894 | 0 | return NULL; |
1895 | 0 | } |
1896 | | |
1897 | 0 | void jmt_free(struct jmt *jmt) { |
1898 | 0 | if (jmt == NULL) |
1899 | 0 | return; |
1900 | 0 | array_release(&jmt->lenses); |
1901 | 0 | struct state *s = jmt->start; |
1902 | 0 | while (s != NULL) { |
1903 | 0 | struct state *del = s; |
1904 | 0 | s = del->next; |
1905 | 0 | free_state(del); |
1906 | 0 | } |
1907 | 0 | free(jmt); |
1908 | 0 | } |
1909 | | |
1910 | 0 | void jmt_dot(struct jmt *jmt, const char *fname) { |
1911 | 0 | FILE *fp = debug_fopen("%s", fname); |
1912 | 0 | if (fp == NULL) |
1913 | 0 | return; |
1914 | | |
1915 | 0 | fprintf(fp, "digraph \"jmt\" {\n"); |
1916 | 0 | fprintf(fp, " rankdir = LR;\n"); |
1917 | 0 | list_for_each(s, jmt->start) { |
1918 | 0 | if (is_return(s)) { |
1919 | 0 | fprintf(fp, " %u [ shape = doublecircle, label = \"%u (", |
1920 | 0 | s->num, s->num); |
1921 | 0 | flens(fp, s->ret[0]); |
1922 | 0 | for (ind_t i = 1; i < s->nret; i++) { |
1923 | 0 | fprintf(fp, ", "); |
1924 | 0 | flens(fp, s->ret[i]); |
1925 | 0 | } |
1926 | 0 | fprintf(fp, ")\" ];\n"); |
1927 | 0 | } |
1928 | 0 | for_each_trans(t, s) { |
1929 | 0 | fprintf(fp, " %u -> %u ", s->num, t->to->num); |
1930 | 0 | if (t->lens == EPS) |
1931 | 0 | fprintf(fp, ";\n"); |
1932 | 0 | else if (t->lens == CALL) |
1933 | 0 | fprintf(fp, "[ label = \"call\" ];\n"); |
1934 | 0 | else if (lens_state(jmt, t->lens) == NULL) { |
1935 | 0 | struct lens *lens = lens_of_jmt(jmt, t->lens); |
1936 | 0 | fprintf(fp, "[ label = \""); |
1937 | 0 | print_regexp(fp, lens->ctype); |
1938 | 0 | fprintf(fp, "\" ];\n"); |
1939 | 0 | } else { |
1940 | 0 | fprintf(fp, "[ label = \""); |
1941 | 0 | flens(fp, t->lens); |
1942 | 0 | fprintf(fp, "\" ];\n"); |
1943 | 0 | } |
1944 | 0 | } |
1945 | 0 | } |
1946 | 0 | fprintf(fp, "}\n"); |
1947 | 0 | fclose(fp); |
1948 | 0 | } |
1949 | | |
1950 | | /* |
1951 | | * Local variables: |
1952 | | * indent-tabs-mode: nil |
1953 | | * c-indent-level: 4 |
1954 | | * c-basic-offset: 4 |
1955 | | * tab-width: 4 |
1956 | | * End: |
1957 | | */ |