/src/mupdf/thirdparty/mujs/regexp.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include <stdlib.h> |
2 | | #include <stdio.h> |
3 | | #include <string.h> |
4 | | #include <setjmp.h> |
5 | | #include <limits.h> |
6 | | |
7 | | #include "regexp.h" |
8 | | #include "utf.h" |
9 | | |
10 | 0 | #define emit regemit |
11 | 0 | #define next regnext |
12 | 0 | #define accept regaccept |
13 | | |
14 | 0 | #define nelem(a) (int)(sizeof (a) / sizeof (a)[0]) |
15 | | |
16 | 0 | #define REPINF 255 |
17 | | #ifndef REG_MAXPROG |
18 | 0 | #define REG_MAXPROG (32 << 10) |
19 | | #endif |
20 | | #ifndef REG_MAXREC |
21 | 0 | #define REG_MAXREC 1024 |
22 | | #endif |
23 | | #ifndef REG_MAXSPAN |
24 | | #define REG_MAXSPAN 64 |
25 | | #endif |
26 | | #ifndef REG_MAXCLASS |
27 | | #define REG_MAXCLASS 16 |
28 | | #endif |
29 | | |
30 | | typedef struct Reclass Reclass; |
31 | | typedef struct Renode Renode; |
32 | | typedef struct Reinst Reinst; |
33 | | typedef struct Rethread Rethread; |
34 | | |
35 | | struct Reclass { |
36 | | Rune *end; |
37 | | Rune spans[REG_MAXSPAN]; |
38 | | }; |
39 | | |
40 | | struct Reprog { |
41 | | Reinst *start, *end; |
42 | | int flags; |
43 | | int nsub; |
44 | | Reclass cclass[REG_MAXCLASS]; |
45 | | }; |
46 | | |
47 | | struct cstate { |
48 | | Reprog *prog; |
49 | | Renode *pstart, *pend; |
50 | | |
51 | | const char *source; |
52 | | int ncclass; |
53 | | int nsub; |
54 | | Renode *sub[REG_MAXSUB]; |
55 | | |
56 | | int lookahead; |
57 | | Rune yychar; |
58 | | Reclass *yycc; |
59 | | int yymin, yymax; |
60 | | |
61 | | const char *error; |
62 | | jmp_buf kaboom; |
63 | | }; |
64 | | |
65 | | static void die(struct cstate *g, const char *message) |
66 | 0 | { |
67 | 0 | g->error = message; |
68 | 0 | longjmp(g->kaboom, 1); |
69 | 0 | } |
70 | | |
71 | | static int canon(Rune c) |
72 | 0 | { |
73 | 0 | Rune u = toupperrune(c); |
74 | 0 | if (c >= 128 && u < 128) |
75 | 0 | return c; |
76 | 0 | return u; |
77 | 0 | } |
78 | | |
79 | | /* Scan */ |
80 | | |
81 | | enum { |
82 | | L_CHAR = 256, |
83 | | L_CCLASS, /* character class */ |
84 | | L_NCCLASS, /* negative character class */ |
85 | | L_NC, /* "(?:" no capture */ |
86 | | L_PLA, /* "(?=" positive lookahead */ |
87 | | L_NLA, /* "(?!" negative lookahead */ |
88 | | L_WORD, /* "\b" word boundary */ |
89 | | L_NWORD, /* "\B" non-word boundary */ |
90 | | L_REF, /* "\1" back-reference */ |
91 | | L_COUNT, /* {M,N} */ |
92 | | }; |
93 | | |
94 | | static int hex(struct cstate *g, int c) |
95 | 0 | { |
96 | 0 | if (c >= '0' && c <= '9') return c - '0'; |
97 | 0 | if (c >= 'a' && c <= 'f') return c - 'a' + 0xA; |
98 | 0 | if (c >= 'A' && c <= 'F') return c - 'A' + 0xA; |
99 | 0 | die(g, "invalid escape sequence"); |
100 | 0 | return 0; |
101 | 0 | } |
102 | | |
103 | | static int dec(struct cstate *g, int c) |
104 | 0 | { |
105 | 0 | if (c >= '0' && c <= '9') return c - '0'; |
106 | 0 | die(g, "invalid quantifier"); |
107 | 0 | return 0; |
108 | 0 | } |
109 | | |
110 | 0 | #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|-0123456789" |
111 | | |
112 | | static int isunicodeletter(int c) |
113 | 0 | { |
114 | 0 | return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c); |
115 | 0 | } |
116 | | |
117 | | static int nextrune(struct cstate *g) |
118 | 0 | { |
119 | 0 | if (!*g->source) { |
120 | 0 | g->yychar = EOF; |
121 | 0 | return 0; |
122 | 0 | } |
123 | 0 | g->source += chartorune(&g->yychar, g->source); |
124 | 0 | if (g->yychar == '\\') { |
125 | 0 | if (!*g->source) |
126 | 0 | die(g, "unterminated escape sequence"); |
127 | 0 | g->source += chartorune(&g->yychar, g->source); |
128 | 0 | switch (g->yychar) { |
129 | 0 | case 'f': g->yychar = '\f'; return 0; |
130 | 0 | case 'n': g->yychar = '\n'; return 0; |
131 | 0 | case 'r': g->yychar = '\r'; return 0; |
132 | 0 | case 't': g->yychar = '\t'; return 0; |
133 | 0 | case 'v': g->yychar = '\v'; return 0; |
134 | 0 | case 'c': |
135 | 0 | if (!g->source[0]) |
136 | 0 | die(g, "unterminated escape sequence"); |
137 | 0 | g->yychar = (*g->source++) & 31; |
138 | 0 | return 0; |
139 | 0 | case 'x': |
140 | 0 | if (!g->source[0] || !g->source[1]) |
141 | 0 | die(g, "unterminated escape sequence"); |
142 | 0 | g->yychar = hex(g, *g->source++) << 4; |
143 | 0 | g->yychar += hex(g, *g->source++); |
144 | 0 | if (g->yychar == 0) { |
145 | 0 | g->yychar = '0'; |
146 | 0 | return 1; |
147 | 0 | } |
148 | 0 | return 0; |
149 | 0 | case 'u': |
150 | 0 | if (!g->source[0] || !g->source[1] || !g->source[2] || !g->source[3]) |
151 | 0 | die(g, "unterminated escape sequence"); |
152 | 0 | g->yychar = hex(g, *g->source++) << 12; |
153 | 0 | g->yychar += hex(g, *g->source++) << 8; |
154 | 0 | g->yychar += hex(g, *g->source++) << 4; |
155 | 0 | g->yychar += hex(g, *g->source++); |
156 | 0 | if (g->yychar == 0) { |
157 | 0 | g->yychar = '0'; |
158 | 0 | return 1; |
159 | 0 | } |
160 | 0 | return 0; |
161 | 0 | case 0: |
162 | 0 | g->yychar = '0'; |
163 | 0 | return 1; |
164 | 0 | } |
165 | 0 | if (strchr(ESCAPES, g->yychar)) |
166 | 0 | return 1; |
167 | 0 | if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */ |
168 | 0 | die(g, "invalid escape character"); |
169 | 0 | return 0; |
170 | 0 | } |
171 | 0 | return 0; |
172 | 0 | } |
173 | | |
174 | | static int lexcount(struct cstate *g) |
175 | 0 | { |
176 | 0 | g->yychar = *g->source++; |
177 | |
|
178 | 0 | g->yymin = dec(g, g->yychar); |
179 | 0 | g->yychar = *g->source++; |
180 | 0 | while (g->yychar != ',' && g->yychar != '}') { |
181 | 0 | g->yymin = g->yymin * 10 + dec(g, g->yychar); |
182 | 0 | g->yychar = *g->source++; |
183 | 0 | if (g->yymin >= REPINF) |
184 | 0 | die(g, "numeric overflow"); |
185 | 0 | } |
186 | |
|
187 | 0 | if (g->yychar == ',') { |
188 | 0 | g->yychar = *g->source++; |
189 | 0 | if (g->yychar == '}') { |
190 | 0 | g->yymax = REPINF; |
191 | 0 | } else { |
192 | 0 | g->yymax = dec(g, g->yychar); |
193 | 0 | g->yychar = *g->source++; |
194 | 0 | while (g->yychar != '}') { |
195 | 0 | g->yymax = g->yymax * 10 + dec(g, g->yychar); |
196 | 0 | g->yychar = *g->source++; |
197 | 0 | if (g->yymax >= REPINF) |
198 | 0 | die(g, "numeric overflow"); |
199 | 0 | } |
200 | 0 | } |
201 | 0 | } else { |
202 | 0 | g->yymax = g->yymin; |
203 | 0 | } |
204 | |
|
205 | 0 | return L_COUNT; |
206 | 0 | } |
207 | | |
208 | | static void newcclass(struct cstate *g) |
209 | 0 | { |
210 | 0 | if (g->ncclass >= nelem(g->prog->cclass)) |
211 | 0 | die(g, "too many character classes"); |
212 | 0 | g->yycc = g->prog->cclass + g->ncclass++; |
213 | 0 | g->yycc->end = g->yycc->spans; |
214 | 0 | } |
215 | | |
216 | | static void addrange(struct cstate *g, Rune a, Rune b) |
217 | 0 | { |
218 | 0 | if (a > b) |
219 | 0 | die(g, "invalid character class range"); |
220 | 0 | if (g->yycc->end + 2 >= g->yycc->spans + nelem(g->yycc->spans)) |
221 | 0 | die(g, "too many character class ranges"); |
222 | 0 | *g->yycc->end++ = a; |
223 | 0 | *g->yycc->end++ = b; |
224 | 0 | } |
225 | | |
226 | | static void addranges_d(struct cstate *g) |
227 | 0 | { |
228 | 0 | addrange(g, '0', '9'); |
229 | 0 | } |
230 | | |
231 | | static void addranges_D(struct cstate *g) |
232 | 0 | { |
233 | 0 | addrange(g, 0, '0'-1); |
234 | 0 | addrange(g, '9'+1, 0xFFFF); |
235 | 0 | } |
236 | | |
237 | | static void addranges_s(struct cstate *g) |
238 | 0 | { |
239 | 0 | addrange(g, 0x9, 0xD); |
240 | 0 | addrange(g, 0x20, 0x20); |
241 | 0 | addrange(g, 0xA0, 0xA0); |
242 | 0 | addrange(g, 0x2028, 0x2029); |
243 | 0 | addrange(g, 0xFEFF, 0xFEFF); |
244 | 0 | } |
245 | | |
246 | | static void addranges_S(struct cstate *g) |
247 | 0 | { |
248 | 0 | addrange(g, 0, 0x9-1); |
249 | 0 | addrange(g, 0xD+1, 0x20-1); |
250 | 0 | addrange(g, 0x20+1, 0xA0-1); |
251 | 0 | addrange(g, 0xA0+1, 0x2028-1); |
252 | 0 | addrange(g, 0x2029+1, 0xFEFF-1); |
253 | 0 | addrange(g, 0xFEFF+1, 0xFFFF); |
254 | 0 | } |
255 | | |
256 | | static void addranges_w(struct cstate *g) |
257 | 0 | { |
258 | 0 | addrange(g, '0', '9'); |
259 | 0 | addrange(g, 'A', 'Z'); |
260 | 0 | addrange(g, '_', '_'); |
261 | 0 | addrange(g, 'a', 'z'); |
262 | 0 | } |
263 | | |
264 | | static void addranges_W(struct cstate *g) |
265 | 0 | { |
266 | 0 | addrange(g, 0, '0'-1); |
267 | 0 | addrange(g, '9'+1, 'A'-1); |
268 | 0 | addrange(g, 'Z'+1, '_'-1); |
269 | 0 | addrange(g, '_'+1, 'a'-1); |
270 | 0 | addrange(g, 'z'+1, 0xFFFF); |
271 | 0 | } |
272 | | |
273 | | static int lexclass(struct cstate *g) |
274 | 0 | { |
275 | 0 | int type = L_CCLASS; |
276 | 0 | int quoted, havesave, havedash; |
277 | 0 | Rune save = 0; |
278 | |
|
279 | 0 | newcclass(g); |
280 | |
|
281 | 0 | quoted = nextrune(g); |
282 | 0 | if (!quoted && g->yychar == '^') { |
283 | 0 | type = L_NCCLASS; |
284 | 0 | quoted = nextrune(g); |
285 | 0 | } |
286 | |
|
287 | 0 | havesave = havedash = 0; |
288 | 0 | for (;;) { |
289 | 0 | if (g->yychar == EOF) |
290 | 0 | die(g, "unterminated character class"); |
291 | 0 | if (!quoted && g->yychar == ']') |
292 | 0 | break; |
293 | | |
294 | 0 | if (!quoted && g->yychar == '-') { |
295 | 0 | if (havesave) { |
296 | 0 | if (havedash) { |
297 | 0 | addrange(g, save, '-'); |
298 | 0 | havesave = havedash = 0; |
299 | 0 | } else { |
300 | 0 | havedash = 1; |
301 | 0 | } |
302 | 0 | } else { |
303 | 0 | save = '-'; |
304 | 0 | havesave = 1; |
305 | 0 | } |
306 | 0 | } else if (quoted && strchr("DSWdsw", g->yychar)) { |
307 | 0 | if (havesave) { |
308 | 0 | addrange(g, save, save); |
309 | 0 | if (havedash) |
310 | 0 | addrange(g, '-', '-'); |
311 | 0 | } |
312 | 0 | switch (g->yychar) { |
313 | 0 | case 'd': addranges_d(g); break; |
314 | 0 | case 's': addranges_s(g); break; |
315 | 0 | case 'w': addranges_w(g); break; |
316 | 0 | case 'D': addranges_D(g); break; |
317 | 0 | case 'S': addranges_S(g); break; |
318 | 0 | case 'W': addranges_W(g); break; |
319 | 0 | } |
320 | 0 | havesave = havedash = 0; |
321 | 0 | } else { |
322 | 0 | if (quoted) { |
323 | 0 | if (g->yychar == 'b') |
324 | 0 | g->yychar = '\b'; |
325 | 0 | else if (g->yychar == '0') |
326 | 0 | g->yychar = 0; |
327 | | /* else identity escape */ |
328 | 0 | } |
329 | 0 | if (havesave) { |
330 | 0 | if (havedash) { |
331 | 0 | addrange(g, save, g->yychar); |
332 | 0 | havesave = havedash = 0; |
333 | 0 | } else { |
334 | 0 | addrange(g, save, save); |
335 | 0 | save = g->yychar; |
336 | 0 | } |
337 | 0 | } else { |
338 | 0 | save = g->yychar; |
339 | 0 | havesave = 1; |
340 | 0 | } |
341 | 0 | } |
342 | | |
343 | 0 | quoted = nextrune(g); |
344 | 0 | } |
345 | | |
346 | 0 | if (havesave) { |
347 | 0 | addrange(g, save, save); |
348 | 0 | if (havedash) |
349 | 0 | addrange(g, '-', '-'); |
350 | 0 | } |
351 | |
|
352 | 0 | return type; |
353 | 0 | } |
354 | | |
355 | | static int lex(struct cstate *g) |
356 | 0 | { |
357 | 0 | int quoted = nextrune(g); |
358 | 0 | if (quoted) { |
359 | 0 | switch (g->yychar) { |
360 | 0 | case 'b': return L_WORD; |
361 | 0 | case 'B': return L_NWORD; |
362 | 0 | case 'd': newcclass(g); addranges_d(g); return L_CCLASS; |
363 | 0 | case 's': newcclass(g); addranges_s(g); return L_CCLASS; |
364 | 0 | case 'w': newcclass(g); addranges_w(g); return L_CCLASS; |
365 | 0 | case 'D': newcclass(g); addranges_d(g); return L_NCCLASS; |
366 | 0 | case 'S': newcclass(g); addranges_s(g); return L_NCCLASS; |
367 | 0 | case 'W': newcclass(g); addranges_w(g); return L_NCCLASS; |
368 | 0 | case '0': g->yychar = 0; return L_CHAR; |
369 | 0 | } |
370 | 0 | if (g->yychar >= '0' && g->yychar <= '9') { |
371 | 0 | g->yychar -= '0'; |
372 | 0 | if (*g->source >= '0' && *g->source <= '9') |
373 | 0 | g->yychar = g->yychar * 10 + *g->source++ - '0'; |
374 | 0 | return L_REF; |
375 | 0 | } |
376 | 0 | return L_CHAR; |
377 | 0 | } |
378 | | |
379 | 0 | switch (g->yychar) { |
380 | 0 | case EOF: |
381 | 0 | case '$': case ')': case '*': case '+': |
382 | 0 | case '.': case '?': case '^': case '|': |
383 | 0 | return g->yychar; |
384 | 0 | } |
385 | | |
386 | 0 | if (g->yychar == '{') |
387 | 0 | return lexcount(g); |
388 | 0 | if (g->yychar == '[') |
389 | 0 | return lexclass(g); |
390 | 0 | if (g->yychar == '(') { |
391 | 0 | if (g->source[0] == '?') { |
392 | 0 | if (g->source[1] == ':') { |
393 | 0 | g->source += 2; |
394 | 0 | return L_NC; |
395 | 0 | } |
396 | 0 | if (g->source[1] == '=') { |
397 | 0 | g->source += 2; |
398 | 0 | return L_PLA; |
399 | 0 | } |
400 | 0 | if (g->source[1] == '!') { |
401 | 0 | g->source += 2; |
402 | 0 | return L_NLA; |
403 | 0 | } |
404 | 0 | } |
405 | 0 | return '('; |
406 | 0 | } |
407 | | |
408 | 0 | return L_CHAR; |
409 | 0 | } |
410 | | |
411 | | /* Parse */ |
412 | | |
413 | | enum { |
414 | | P_CAT, P_ALT, P_REP, |
415 | | P_BOL, P_EOL, P_WORD, P_NWORD, |
416 | | P_PAR, P_PLA, P_NLA, |
417 | | P_ANY, P_CHAR, P_CCLASS, P_NCCLASS, |
418 | | P_REF, |
419 | | }; |
420 | | |
421 | | struct Renode { |
422 | | unsigned char type; |
423 | | unsigned char ng, m, n; |
424 | | Rune c; |
425 | | Reclass *cc; |
426 | | Renode *x; |
427 | | Renode *y; |
428 | | }; |
429 | | |
430 | | static Renode *newnode(struct cstate *g, int type) |
431 | 0 | { |
432 | 0 | Renode *node = g->pend++; |
433 | 0 | node->type = type; |
434 | 0 | node->cc = NULL; |
435 | 0 | node->c = 0; |
436 | 0 | node->ng = 0; |
437 | 0 | node->m = 0; |
438 | 0 | node->n = 0; |
439 | 0 | node->x = node->y = NULL; |
440 | 0 | return node; |
441 | 0 | } |
442 | | |
443 | | static int empty(Renode *node) |
444 | 0 | { |
445 | 0 | if (!node) return 1; |
446 | 0 | switch (node->type) { |
447 | 0 | default: return 1; |
448 | 0 | case P_CAT: return empty(node->x) && empty(node->y); |
449 | 0 | case P_ALT: return empty(node->x) || empty(node->y); |
450 | 0 | case P_REP: return empty(node->x) || node->m == 0; |
451 | 0 | case P_PAR: return empty(node->x); |
452 | 0 | case P_REF: return empty(node->x); |
453 | 0 | case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0; |
454 | 0 | } |
455 | 0 | } |
456 | | |
457 | | static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max) |
458 | 0 | { |
459 | 0 | Renode *rep = newnode(g, P_REP); |
460 | 0 | if (max == REPINF && empty(atom)) |
461 | 0 | die(g, "infinite loop matching the empty string"); |
462 | 0 | rep->ng = ng; |
463 | 0 | rep->m = min; |
464 | 0 | rep->n = max; |
465 | 0 | rep->x = atom; |
466 | 0 | return rep; |
467 | 0 | } |
468 | | |
469 | | static void next(struct cstate *g) |
470 | 0 | { |
471 | 0 | g->lookahead = lex(g); |
472 | 0 | } |
473 | | |
474 | | static int accept(struct cstate *g, int t) |
475 | 0 | { |
476 | 0 | if (g->lookahead == t) { |
477 | 0 | next(g); |
478 | 0 | return 1; |
479 | 0 | } |
480 | 0 | return 0; |
481 | 0 | } |
482 | | |
483 | | static Renode *parsealt(struct cstate *g); |
484 | | |
485 | | static Renode *parseatom(struct cstate *g) |
486 | 0 | { |
487 | 0 | Renode *atom; |
488 | 0 | if (g->lookahead == L_CHAR) { |
489 | 0 | atom = newnode(g, P_CHAR); |
490 | 0 | atom->c = g->yychar; |
491 | 0 | next(g); |
492 | 0 | return atom; |
493 | 0 | } |
494 | 0 | if (g->lookahead == L_CCLASS) { |
495 | 0 | atom = newnode(g, P_CCLASS); |
496 | 0 | atom->cc = g->yycc; |
497 | 0 | next(g); |
498 | 0 | return atom; |
499 | 0 | } |
500 | 0 | if (g->lookahead == L_NCCLASS) { |
501 | 0 | atom = newnode(g, P_NCCLASS); |
502 | 0 | atom->cc = g->yycc; |
503 | 0 | next(g); |
504 | 0 | return atom; |
505 | 0 | } |
506 | 0 | if (g->lookahead == L_REF) { |
507 | 0 | atom = newnode(g, P_REF); |
508 | 0 | if (g->yychar == 0 || g->yychar >= g->nsub || !g->sub[g->yychar]) |
509 | 0 | die(g, "invalid back-reference"); |
510 | 0 | atom->n = g->yychar; |
511 | 0 | atom->x = g->sub[g->yychar]; |
512 | 0 | next(g); |
513 | 0 | return atom; |
514 | 0 | } |
515 | 0 | if (accept(g, '.')) |
516 | 0 | return newnode(g, P_ANY); |
517 | 0 | if (accept(g, '(')) { |
518 | 0 | atom = newnode(g, P_PAR); |
519 | 0 | if (g->nsub == REG_MAXSUB) |
520 | 0 | die(g, "too many captures"); |
521 | 0 | atom->n = g->nsub++; |
522 | 0 | atom->x = parsealt(g); |
523 | 0 | g->sub[atom->n] = atom; |
524 | 0 | if (!accept(g, ')')) |
525 | 0 | die(g, "unmatched '('"); |
526 | 0 | return atom; |
527 | 0 | } |
528 | 0 | if (accept(g, L_NC)) { |
529 | 0 | atom = parsealt(g); |
530 | 0 | if (!accept(g, ')')) |
531 | 0 | die(g, "unmatched '('"); |
532 | 0 | return atom; |
533 | 0 | } |
534 | 0 | if (accept(g, L_PLA)) { |
535 | 0 | atom = newnode(g, P_PLA); |
536 | 0 | atom->x = parsealt(g); |
537 | 0 | if (!accept(g, ')')) |
538 | 0 | die(g, "unmatched '('"); |
539 | 0 | return atom; |
540 | 0 | } |
541 | 0 | if (accept(g, L_NLA)) { |
542 | 0 | atom = newnode(g, P_NLA); |
543 | 0 | atom->x = parsealt(g); |
544 | 0 | if (!accept(g, ')')) |
545 | 0 | die(g, "unmatched '('"); |
546 | 0 | return atom; |
547 | 0 | } |
548 | 0 | die(g, "syntax error"); |
549 | 0 | return NULL; |
550 | 0 | } |
551 | | |
552 | | static Renode *parserep(struct cstate *g) |
553 | 0 | { |
554 | 0 | Renode *atom; |
555 | |
|
556 | 0 | if (accept(g, '^')) return newnode(g, P_BOL); |
557 | 0 | if (accept(g, '$')) return newnode(g, P_EOL); |
558 | 0 | if (accept(g, L_WORD)) return newnode(g, P_WORD); |
559 | 0 | if (accept(g, L_NWORD)) return newnode(g, P_NWORD); |
560 | | |
561 | 0 | atom = parseatom(g); |
562 | 0 | if (g->lookahead == L_COUNT) { |
563 | 0 | int min = g->yymin, max = g->yymax; |
564 | 0 | next(g); |
565 | 0 | if (max < min) |
566 | 0 | die(g, "invalid quantifier"); |
567 | 0 | return newrep(g, atom, accept(g, '?'), min, max); |
568 | 0 | } |
569 | 0 | if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF); |
570 | 0 | if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF); |
571 | 0 | if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1); |
572 | 0 | return atom; |
573 | 0 | } |
574 | | |
575 | | static Renode *parsecat(struct cstate *g) |
576 | 0 | { |
577 | 0 | Renode *cat, *head, **tail; |
578 | 0 | if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') { |
579 | | /* Build a right-leaning tree by splicing in new 'cat' at the tail. */ |
580 | 0 | head = parserep(g); |
581 | 0 | tail = &head; |
582 | 0 | while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') { |
583 | 0 | cat = newnode(g, P_CAT); |
584 | 0 | cat->x = *tail; |
585 | 0 | cat->y = parserep(g); |
586 | 0 | *tail = cat; |
587 | 0 | tail = &cat->y; |
588 | 0 | } |
589 | 0 | return head; |
590 | 0 | } |
591 | 0 | return NULL; |
592 | 0 | } |
593 | | |
594 | | static Renode *parsealt(struct cstate *g) |
595 | 0 | { |
596 | 0 | Renode *alt, *x; |
597 | 0 | alt = parsecat(g); |
598 | 0 | while (accept(g, '|')) { |
599 | 0 | x = alt; |
600 | 0 | alt = newnode(g, P_ALT); |
601 | 0 | alt->x = x; |
602 | 0 | alt->y = parsecat(g); |
603 | 0 | } |
604 | 0 | return alt; |
605 | 0 | } |
606 | | |
607 | | /* Compile */ |
608 | | |
609 | | enum { |
610 | | I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA, |
611 | | I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF, |
612 | | I_BOL, I_EOL, I_WORD, I_NWORD, |
613 | | I_LPAR, I_RPAR |
614 | | }; |
615 | | |
616 | | struct Reinst { |
617 | | unsigned char opcode; |
618 | | unsigned char n; |
619 | | Rune c; |
620 | | Reclass *cc; |
621 | | Reinst *x; |
622 | | Reinst *y; |
623 | | }; |
624 | | |
625 | | static int count(struct cstate *g, Renode *node, int depth) |
626 | 0 | { |
627 | 0 | int min, max, n; |
628 | 0 | if (!node) return 0; |
629 | 0 | if (++depth > REG_MAXREC) die(g, "stack overflow"); |
630 | 0 | switch (node->type) { |
631 | 0 | default: return 1; |
632 | 0 | case P_CAT: return count(g, node->x, depth) + count(g, node->y, depth); |
633 | 0 | case P_ALT: return count(g, node->x, depth) + count(g, node->y, depth) + 2; |
634 | 0 | case P_REP: |
635 | 0 | min = node->m; |
636 | 0 | max = node->n; |
637 | 0 | if (min == max) n = count(g, node->x, depth) * min; |
638 | 0 | else if (max < REPINF) n = count(g, node->x, depth) * max + (max - min); |
639 | 0 | else n = count(g, node->x, depth) * (min + 1) + 2; |
640 | 0 | if (n < 0 || n > REG_MAXPROG) die(g, "program too large"); |
641 | 0 | return n; |
642 | 0 | case P_PAR: return count(g, node->x, depth) + 2; |
643 | 0 | case P_PLA: return count(g, node->x, depth) + 2; |
644 | 0 | case P_NLA: return count(g, node->x, depth) + 2; |
645 | 0 | } |
646 | 0 | } |
647 | | |
648 | | static Reinst *emit(Reprog *prog, int opcode) |
649 | 0 | { |
650 | 0 | Reinst *inst = prog->end++; |
651 | 0 | inst->opcode = opcode; |
652 | 0 | inst->n = 0; |
653 | 0 | inst->c = 0; |
654 | 0 | inst->cc = NULL; |
655 | 0 | inst->x = inst->y = NULL; |
656 | 0 | return inst; |
657 | 0 | } |
658 | | |
659 | | static void compile(Reprog *prog, Renode *node) |
660 | 0 | { |
661 | 0 | Reinst *inst, *split, *jump; |
662 | 0 | int i; |
663 | |
|
664 | 0 | loop: |
665 | 0 | if (!node) |
666 | 0 | return; |
667 | | |
668 | 0 | switch (node->type) { |
669 | 0 | case P_CAT: |
670 | 0 | compile(prog, node->x); |
671 | 0 | node = node->y; |
672 | 0 | goto loop; |
673 | | |
674 | 0 | case P_ALT: |
675 | 0 | split = emit(prog, I_SPLIT); |
676 | 0 | compile(prog, node->x); |
677 | 0 | jump = emit(prog, I_JUMP); |
678 | 0 | compile(prog, node->y); |
679 | 0 | split->x = split + 1; |
680 | 0 | split->y = jump + 1; |
681 | 0 | jump->x = prog->end; |
682 | 0 | break; |
683 | | |
684 | 0 | case P_REP: |
685 | 0 | inst = NULL; /* silence compiler warning. assert(node->m > 0). */ |
686 | 0 | for (i = 0; i < node->m; ++i) { |
687 | 0 | inst = prog->end; |
688 | 0 | compile(prog, node->x); |
689 | 0 | } |
690 | 0 | if (node->m == node->n) |
691 | 0 | break; |
692 | 0 | if (node->n < REPINF) { |
693 | 0 | for (i = node->m; i < node->n; ++i) { |
694 | 0 | split = emit(prog, I_SPLIT); |
695 | 0 | compile(prog, node->x); |
696 | 0 | if (node->ng) { |
697 | 0 | split->y = split + 1; |
698 | 0 | split->x = prog->end; |
699 | 0 | } else { |
700 | 0 | split->x = split + 1; |
701 | 0 | split->y = prog->end; |
702 | 0 | } |
703 | 0 | } |
704 | 0 | } else if (node->m == 0) { |
705 | 0 | split = emit(prog, I_SPLIT); |
706 | 0 | compile(prog, node->x); |
707 | 0 | jump = emit(prog, I_JUMP); |
708 | 0 | if (node->ng) { |
709 | 0 | split->y = split + 1; |
710 | 0 | split->x = prog->end; |
711 | 0 | } else { |
712 | 0 | split->x = split + 1; |
713 | 0 | split->y = prog->end; |
714 | 0 | } |
715 | 0 | jump->x = split; |
716 | 0 | } else { |
717 | 0 | split = emit(prog, I_SPLIT); |
718 | 0 | if (node->ng) { |
719 | 0 | split->y = inst; |
720 | 0 | split->x = prog->end; |
721 | 0 | } else { |
722 | 0 | split->x = inst; |
723 | 0 | split->y = prog->end; |
724 | 0 | } |
725 | 0 | } |
726 | 0 | break; |
727 | | |
728 | 0 | case P_BOL: emit(prog, I_BOL); break; |
729 | 0 | case P_EOL: emit(prog, I_EOL); break; |
730 | 0 | case P_WORD: emit(prog, I_WORD); break; |
731 | 0 | case P_NWORD: emit(prog, I_NWORD); break; |
732 | | |
733 | 0 | case P_PAR: |
734 | 0 | inst = emit(prog, I_LPAR); |
735 | 0 | inst->n = node->n; |
736 | 0 | compile(prog, node->x); |
737 | 0 | inst = emit(prog, I_RPAR); |
738 | 0 | inst->n = node->n; |
739 | 0 | break; |
740 | 0 | case P_PLA: |
741 | 0 | split = emit(prog, I_PLA); |
742 | 0 | compile(prog, node->x); |
743 | 0 | emit(prog, I_END); |
744 | 0 | split->x = split + 1; |
745 | 0 | split->y = prog->end; |
746 | 0 | break; |
747 | 0 | case P_NLA: |
748 | 0 | split = emit(prog, I_NLA); |
749 | 0 | compile(prog, node->x); |
750 | 0 | emit(prog, I_END); |
751 | 0 | split->x = split + 1; |
752 | 0 | split->y = prog->end; |
753 | 0 | break; |
754 | | |
755 | 0 | case P_ANY: |
756 | 0 | emit(prog, I_ANY); |
757 | 0 | break; |
758 | 0 | case P_CHAR: |
759 | 0 | inst = emit(prog, I_CHAR); |
760 | 0 | inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c; |
761 | 0 | break; |
762 | 0 | case P_CCLASS: |
763 | 0 | inst = emit(prog, I_CCLASS); |
764 | 0 | inst->cc = node->cc; |
765 | 0 | break; |
766 | 0 | case P_NCCLASS: |
767 | 0 | inst = emit(prog, I_NCCLASS); |
768 | 0 | inst->cc = node->cc; |
769 | 0 | break; |
770 | 0 | case P_REF: |
771 | 0 | inst = emit(prog, I_REF); |
772 | 0 | inst->n = node->n; |
773 | 0 | break; |
774 | 0 | } |
775 | 0 | } |
776 | | |
777 | | #ifdef TEST |
778 | | static void dumpnode(Renode *node) |
779 | | { |
780 | | Rune *p; |
781 | | if (!node) { printf("Empty"); return; } |
782 | | switch (node->type) { |
783 | | case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break; |
784 | | case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break; |
785 | | case P_REP: |
786 | | printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n); |
787 | | dumpnode(node->x); |
788 | | printf(")"); |
789 | | break; |
790 | | case P_BOL: printf("Bol"); break; |
791 | | case P_EOL: printf("Eol"); break; |
792 | | case P_WORD: printf("Word"); break; |
793 | | case P_NWORD: printf("NotWord"); break; |
794 | | case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break; |
795 | | case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break; |
796 | | case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break; |
797 | | case P_ANY: printf("Any"); break; |
798 | | case P_CHAR: printf("Char(%c)", node->c); break; |
799 | | case P_CCLASS: |
800 | | printf("Class("); |
801 | | for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]); |
802 | | printf(")"); |
803 | | break; |
804 | | case P_NCCLASS: |
805 | | printf("NotClass("); |
806 | | for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]); |
807 | | printf(")"); |
808 | | break; |
809 | | case P_REF: printf("Ref(%d)", node->n); break; |
810 | | } |
811 | | } |
812 | | |
813 | | static void dumpcclass(Reclass *cc) { |
814 | | Rune *p; |
815 | | for (p = cc->spans; p < cc->end; p += 2) { |
816 | | if (p[0] > 32 && p[0] < 127) |
817 | | printf(" %c", p[0]); |
818 | | else |
819 | | printf(" \\x%02x", p[0]); |
820 | | if (p[1] > 32 && p[1] < 127) |
821 | | printf("-%c", p[1]); |
822 | | else |
823 | | printf("-\\x%02x", p[1]); |
824 | | } |
825 | | putchar('\n'); |
826 | | } |
827 | | |
828 | | static void dumpprog(Reprog *prog) |
829 | | { |
830 | | Reinst *inst; |
831 | | int i; |
832 | | for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) { |
833 | | printf("% 5d: ", i); |
834 | | switch (inst->opcode) { |
835 | | case I_END: puts("end"); break; |
836 | | case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break; |
837 | | case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; |
838 | | case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; |
839 | | case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; |
840 | | case I_ANY: puts("any"); break; |
841 | | case I_ANYNL: puts("anynl"); break; |
842 | | case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break; |
843 | | case I_CCLASS: printf("cclass"); dumpcclass(inst->cc); break; |
844 | | case I_NCCLASS: printf("ncclass"); dumpcclass(inst->cc); break; |
845 | | case I_REF: printf("ref %d\n", inst->n); break; |
846 | | case I_BOL: puts("bol"); break; |
847 | | case I_EOL: puts("eol"); break; |
848 | | case I_WORD: puts("word"); break; |
849 | | case I_NWORD: puts("nword"); break; |
850 | | case I_LPAR: printf("lpar %d\n", inst->n); break; |
851 | | case I_RPAR: printf("rpar %d\n", inst->n); break; |
852 | | } |
853 | | } |
854 | | } |
855 | | #endif |
856 | | |
857 | | Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx, |
858 | | const char *pattern, int cflags, const char **errorp) |
859 | 0 | { |
860 | 0 | struct cstate g; |
861 | 0 | Renode *node; |
862 | 0 | Reinst *split, *jump; |
863 | 0 | int i, n; |
864 | |
|
865 | 0 | g.pstart = NULL; |
866 | 0 | g.prog = NULL; |
867 | |
|
868 | 0 | if (setjmp(g.kaboom)) { |
869 | 0 | if (errorp) *errorp = g.error; |
870 | 0 | alloc(ctx, g.pstart, 0); |
871 | 0 | alloc(ctx, g.prog, 0); |
872 | 0 | return NULL; |
873 | 0 | } |
874 | | |
875 | 0 | g.prog = alloc(ctx, NULL, sizeof (Reprog)); |
876 | 0 | if (!g.prog) |
877 | 0 | die(&g, "cannot allocate regular expression"); |
878 | 0 | n = strlen(pattern) * 2; |
879 | 0 | if (n > REG_MAXPROG) |
880 | 0 | die(&g, "program too large"); |
881 | 0 | if (n > 0) { |
882 | 0 | g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n); |
883 | 0 | if (!g.pstart) |
884 | 0 | die(&g, "cannot allocate regular expression parse list"); |
885 | 0 | } |
886 | |
|
887 | 0 | g.source = pattern; |
888 | 0 | g.ncclass = 0; |
889 | 0 | g.nsub = 1; |
890 | 0 | for (i = 0; i < REG_MAXSUB; ++i) |
891 | 0 | g.sub[i] = 0; |
892 | |
|
893 | 0 | g.prog->flags = cflags; |
894 | |
|
895 | 0 | next(&g); |
896 | 0 | node = parsealt(&g); |
897 | 0 | if (g.lookahead == ')') |
898 | 0 | die(&g, "unmatched ')'"); |
899 | 0 | if (g.lookahead != EOF) |
900 | 0 | die(&g, "syntax error"); |
901 | |
|
902 | | #ifdef TEST |
903 | | dumpnode(node); |
904 | | putchar('\n'); |
905 | | #endif |
906 | |
|
907 | 0 | n = 6 + count(&g, node, 0); |
908 | 0 | if (n < 0 || n > REG_MAXPROG) |
909 | 0 | die(&g, "program too large"); |
910 | |
|
911 | 0 | g.prog->nsub = g.nsub; |
912 | 0 | g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst)); |
913 | 0 | if (!g.prog->start) |
914 | 0 | die(&g, "cannot allocate regular expression instruction list"); |
915 | |
|
916 | 0 | split = emit(g.prog, I_SPLIT); |
917 | 0 | split->x = split + 3; |
918 | 0 | split->y = split + 1; |
919 | 0 | emit(g.prog, I_ANYNL); |
920 | 0 | jump = emit(g.prog, I_JUMP); |
921 | 0 | jump->x = split; |
922 | 0 | emit(g.prog, I_LPAR); |
923 | 0 | compile(g.prog, node); |
924 | 0 | emit(g.prog, I_RPAR); |
925 | 0 | emit(g.prog, I_END); |
926 | |
|
927 | | #ifdef TEST |
928 | | dumpprog(g.prog); |
929 | | #endif |
930 | |
|
931 | 0 | alloc(ctx, g.pstart, 0); |
932 | |
|
933 | 0 | if (errorp) *errorp = NULL; |
934 | 0 | return g.prog; |
935 | 0 | } |
936 | | |
937 | | void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog) |
938 | 0 | { |
939 | 0 | if (prog) { |
940 | 0 | alloc(ctx, prog->start, 0); |
941 | 0 | alloc(ctx, prog, 0); |
942 | 0 | } |
943 | 0 | } |
944 | | |
945 | | static void *default_alloc(void *ctx, void *p, int n) |
946 | 0 | { |
947 | 0 | if (n == 0) { |
948 | 0 | free(p); |
949 | 0 | return NULL; |
950 | 0 | } |
951 | 0 | return realloc(p, (size_t)n); |
952 | 0 | } |
953 | | |
954 | | Reprog *regcomp(const char *pattern, int cflags, const char **errorp) |
955 | 0 | { |
956 | 0 | return regcompx(default_alloc, NULL, pattern, cflags, errorp); |
957 | 0 | } |
958 | | |
959 | | void regfree(Reprog *prog) |
960 | 0 | { |
961 | 0 | regfreex(default_alloc, NULL, prog); |
962 | 0 | } |
963 | | |
964 | | /* Match */ |
965 | | |
966 | | static int isnewline(int c) |
967 | 0 | { |
968 | 0 | return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029; |
969 | 0 | } |
970 | | |
971 | | static int iswordchar(int c) |
972 | 0 | { |
973 | 0 | return c == '_' || |
974 | 0 | (c >= 'a' && c <= 'z') || |
975 | 0 | (c >= 'A' && c <= 'Z') || |
976 | 0 | (c >= '0' && c <= '9'); |
977 | 0 | } |
978 | | |
979 | | static int incclass(Reclass *cc, Rune c) |
980 | 0 | { |
981 | 0 | Rune *p; |
982 | 0 | for (p = cc->spans; p < cc->end; p += 2) |
983 | 0 | if (p[0] <= c && c <= p[1]) |
984 | 0 | return 1; |
985 | 0 | return 0; |
986 | 0 | } |
987 | | |
988 | | static int incclasscanon(Reclass *cc, Rune c) |
989 | 0 | { |
990 | 0 | Rune *p, r; |
991 | 0 | for (p = cc->spans; p < cc->end; p += 2) |
992 | 0 | for (r = p[0]; r <= p[1]; ++r) |
993 | 0 | if (c == canon(r)) |
994 | 0 | return 1; |
995 | 0 | return 0; |
996 | 0 | } |
997 | | |
998 | | static int strncmpcanon(const char *a, const char *b, int n) |
999 | 0 | { |
1000 | 0 | Rune ra, rb; |
1001 | 0 | int c; |
1002 | 0 | while (n--) { |
1003 | 0 | if (!*a) return -1; |
1004 | 0 | if (!*b) return 1; |
1005 | 0 | a += chartorune(&ra, a); |
1006 | 0 | b += chartorune(&rb, b); |
1007 | 0 | c = canon(ra) - canon(rb); |
1008 | 0 | if (c) |
1009 | 0 | return c; |
1010 | 0 | } |
1011 | 0 | return 0; |
1012 | 0 | } |
1013 | | |
1014 | | static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out, int depth) |
1015 | 0 | { |
1016 | 0 | Resub scratch; |
1017 | 0 | int result; |
1018 | 0 | int i; |
1019 | 0 | Rune c; |
1020 | | |
1021 | | /* stack overflow */ |
1022 | 0 | if (depth > REG_MAXREC) |
1023 | 0 | return -1; |
1024 | | |
1025 | 0 | for (;;) { |
1026 | 0 | switch (pc->opcode) { |
1027 | 0 | case I_END: |
1028 | 0 | return 0; |
1029 | 0 | case I_JUMP: |
1030 | 0 | pc = pc->x; |
1031 | 0 | break; |
1032 | 0 | case I_SPLIT: |
1033 | 0 | scratch = *out; |
1034 | 0 | result = match(pc->x, sp, bol, flags, &scratch, depth+1); |
1035 | 0 | if (result == -1) |
1036 | 0 | return -1; |
1037 | 0 | if (result == 0) { |
1038 | 0 | *out = scratch; |
1039 | 0 | return 0; |
1040 | 0 | } |
1041 | 0 | pc = pc->y; |
1042 | 0 | break; |
1043 | | |
1044 | 0 | case I_PLA: |
1045 | 0 | result = match(pc->x, sp, bol, flags, out, depth+1); |
1046 | 0 | if (result == -1) |
1047 | 0 | return -1; |
1048 | 0 | if (result == 1) |
1049 | 0 | return 1; |
1050 | 0 | pc = pc->y; |
1051 | 0 | break; |
1052 | 0 | case I_NLA: |
1053 | 0 | scratch = *out; |
1054 | 0 | result = match(pc->x, sp, bol, flags, &scratch, depth+1); |
1055 | 0 | if (result == -1) |
1056 | 0 | return -1; |
1057 | 0 | if (result == 0) |
1058 | 0 | return 1; |
1059 | 0 | pc = pc->y; |
1060 | 0 | break; |
1061 | | |
1062 | 0 | case I_ANYNL: |
1063 | 0 | if (!*sp) return 1; |
1064 | 0 | sp += chartorune(&c, sp); |
1065 | 0 | pc = pc + 1; |
1066 | 0 | break; |
1067 | 0 | case I_ANY: |
1068 | 0 | if (!*sp) return 1; |
1069 | 0 | sp += chartorune(&c, sp); |
1070 | 0 | if (isnewline(c)) |
1071 | 0 | return 1; |
1072 | 0 | pc = pc + 1; |
1073 | 0 | break; |
1074 | 0 | case I_CHAR: |
1075 | 0 | if (!*sp) return 1; |
1076 | 0 | sp += chartorune(&c, sp); |
1077 | 0 | if (flags & REG_ICASE) |
1078 | 0 | c = canon(c); |
1079 | 0 | if (c != pc->c) |
1080 | 0 | return 1; |
1081 | 0 | pc = pc + 1; |
1082 | 0 | break; |
1083 | 0 | case I_CCLASS: |
1084 | 0 | if (!*sp) return 1; |
1085 | 0 | sp += chartorune(&c, sp); |
1086 | 0 | if (flags & REG_ICASE) { |
1087 | 0 | if (!incclasscanon(pc->cc, canon(c))) |
1088 | 0 | return 1; |
1089 | 0 | } else { |
1090 | 0 | if (!incclass(pc->cc, c)) |
1091 | 0 | return 1; |
1092 | 0 | } |
1093 | 0 | pc = pc + 1; |
1094 | 0 | break; |
1095 | 0 | case I_NCCLASS: |
1096 | 0 | if (!*sp) return 1; |
1097 | 0 | sp += chartorune(&c, sp); |
1098 | 0 | if (flags & REG_ICASE) { |
1099 | 0 | if (incclasscanon(pc->cc, canon(c))) |
1100 | 0 | return 1; |
1101 | 0 | } else { |
1102 | 0 | if (incclass(pc->cc, c)) |
1103 | 0 | return 1; |
1104 | 0 | } |
1105 | 0 | pc = pc + 1; |
1106 | 0 | break; |
1107 | 0 | case I_REF: |
1108 | 0 | i = out->sub[pc->n].ep - out->sub[pc->n].sp; |
1109 | 0 | if (flags & REG_ICASE) { |
1110 | 0 | if (strncmpcanon(sp, out->sub[pc->n].sp, i)) |
1111 | 0 | return 1; |
1112 | 0 | } else { |
1113 | 0 | if (strncmp(sp, out->sub[pc->n].sp, i)) |
1114 | 0 | return 1; |
1115 | 0 | } |
1116 | 0 | if (i > 0) |
1117 | 0 | sp += i; |
1118 | 0 | pc = pc + 1; |
1119 | 0 | break; |
1120 | | |
1121 | 0 | case I_BOL: |
1122 | 0 | if (sp == bol && !(flags & REG_NOTBOL)) { |
1123 | 0 | pc = pc + 1; |
1124 | 0 | break; |
1125 | 0 | } |
1126 | 0 | if (flags & REG_NEWLINE) { |
1127 | 0 | if (sp > bol && isnewline(sp[-1])) { |
1128 | 0 | pc = pc + 1; |
1129 | 0 | break; |
1130 | 0 | } |
1131 | 0 | } |
1132 | 0 | return 1; |
1133 | 0 | case I_EOL: |
1134 | 0 | if (*sp == 0) { |
1135 | 0 | pc = pc + 1; |
1136 | 0 | break; |
1137 | 0 | } |
1138 | 0 | if (flags & REG_NEWLINE) { |
1139 | 0 | if (isnewline(*sp)) { |
1140 | 0 | pc = pc + 1; |
1141 | 0 | break; |
1142 | 0 | } |
1143 | 0 | } |
1144 | 0 | return 1; |
1145 | 0 | case I_WORD: |
1146 | 0 | i = sp > bol && iswordchar(sp[-1]); |
1147 | 0 | i ^= iswordchar(sp[0]); |
1148 | 0 | if (!i) |
1149 | 0 | return 1; |
1150 | 0 | pc = pc + 1; |
1151 | 0 | break; |
1152 | 0 | case I_NWORD: |
1153 | 0 | i = sp > bol && iswordchar(sp[-1]); |
1154 | 0 | i ^= iswordchar(sp[0]); |
1155 | 0 | if (i) |
1156 | 0 | return 1; |
1157 | 0 | pc = pc + 1; |
1158 | 0 | break; |
1159 | | |
1160 | 0 | case I_LPAR: |
1161 | 0 | out->sub[pc->n].sp = sp; |
1162 | 0 | pc = pc + 1; |
1163 | 0 | break; |
1164 | 0 | case I_RPAR: |
1165 | 0 | out->sub[pc->n].ep = sp; |
1166 | 0 | pc = pc + 1; |
1167 | 0 | break; |
1168 | 0 | default: |
1169 | 0 | return 1; |
1170 | 0 | } |
1171 | 0 | } |
1172 | 0 | } |
1173 | | |
1174 | | int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags) |
1175 | 0 | { |
1176 | 0 | Resub scratch; |
1177 | 0 | int i; |
1178 | |
|
1179 | 0 | if (!sub) |
1180 | 0 | sub = &scratch; |
1181 | |
|
1182 | 0 | sub->nsub = prog->nsub; |
1183 | 0 | for (i = 0; i < REG_MAXSUB; ++i) |
1184 | 0 | sub->sub[i].sp = sub->sub[i].ep = NULL; |
1185 | |
|
1186 | 0 | return match(prog->start, sp, sp, prog->flags | eflags, sub, 0); |
1187 | 0 | } |
1188 | | |
1189 | | #ifdef TEST |
1190 | | int main(int argc, char **argv) |
1191 | | { |
1192 | | const char *error; |
1193 | | const char *s; |
1194 | | Reprog *p; |
1195 | | Resub m; |
1196 | | int i; |
1197 | | |
1198 | | if (argc > 1) { |
1199 | | p = regcomp(argv[1], 0, &error); |
1200 | | if (!p) { |
1201 | | fprintf(stderr, "regcomp: %s\n", error); |
1202 | | return 1; |
1203 | | } |
1204 | | |
1205 | | if (argc > 2) { |
1206 | | s = argv[2]; |
1207 | | printf("nsub = %d\n", p->nsub); |
1208 | | if (!regexec(p, s, &m, 0)) { |
1209 | | for (i = 0; i < m.nsub; ++i) { |
1210 | | int n = m.sub[i].ep - m.sub[i].sp; |
1211 | | if (n > 0) |
1212 | | printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp); |
1213 | | else |
1214 | | printf("match %d: n=0 ''\n", i); |
1215 | | } |
1216 | | } else { |
1217 | | printf("no match\n"); |
1218 | | } |
1219 | | } |
1220 | | } |
1221 | | |
1222 | | return 0; |
1223 | | } |
1224 | | #endif |