/src/mupdf/thirdparty/mujs/regexp.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include <stdlib.h> |
2 | | #include <stdio.h> |
3 | | #include <string.h> |
4 | | #include <setjmp.h> |
5 | | #include <limits.h> |
6 | | |
7 | | #include "regexp.h" |
8 | | #include "utf.h" |
9 | | |
10 | 0 | #define emit regemit |
11 | 0 | #define next regnext |
12 | 0 | #define accept regaccept |
13 | | |
14 | 0 | #define nelem(a) (int)(sizeof (a) / sizeof (a)[0]) |
15 | | |
16 | 0 | #define REPINF 255 |
17 | | #ifndef REG_MAXPROG |
18 | 0 | #define REG_MAXPROG (32 << 10) |
19 | | #endif |
20 | | #ifndef REG_MAXREC |
21 | 0 | #define REG_MAXREC 1024 |
22 | | #endif |
23 | | #ifndef REG_MAXSPAN |
24 | | #define REG_MAXSPAN 64 |
25 | | #endif |
26 | | #ifndef REG_MAXCLASS |
27 | 0 | #define REG_MAXCLASS 128 |
28 | | #endif |
29 | | |
30 | | typedef struct Reclass Reclass; |
31 | | typedef struct Renode Renode; |
32 | | typedef struct Reinst Reinst; |
33 | | typedef struct Rethread Rethread; |
34 | | |
35 | | struct Reclass { |
36 | | Rune *end; |
37 | | Rune spans[REG_MAXSPAN]; |
38 | | }; |
39 | | |
40 | | struct Reprog { |
41 | | Reinst *start, *end; |
42 | | Reclass *cclass; |
43 | | int flags; |
44 | | int nsub; |
45 | | }; |
46 | | |
47 | | struct cstate { |
48 | | Reprog *prog; |
49 | | Renode *pstart, *pend; |
50 | | |
51 | | const char *source; |
52 | | int ncclass; |
53 | | int nsub; |
54 | | Renode *sub[REG_MAXSUB]; |
55 | | |
56 | | int lookahead; |
57 | | Rune yychar; |
58 | | Reclass *yycc; |
59 | | int yymin, yymax; |
60 | | |
61 | | const char *error; |
62 | | jmp_buf kaboom; |
63 | | |
64 | | Reclass cclass[REG_MAXCLASS]; |
65 | | }; |
66 | | |
67 | | static void die(struct cstate *g, const char *message) |
68 | 0 | { |
69 | 0 | g->error = message; |
70 | 0 | longjmp(g->kaboom, 1); |
71 | 0 | } |
72 | | |
73 | | static int canon(Rune c) |
74 | 0 | { |
75 | 0 | Rune u = toupperrune(c); |
76 | 0 | if (c >= 128 && u < 128) |
77 | 0 | return c; |
78 | 0 | return u; |
79 | 0 | } |
80 | | |
81 | | /* Scan */ |
82 | | |
83 | | enum { |
84 | | L_CHAR = 256, |
85 | | L_CCLASS, /* character class */ |
86 | | L_NCCLASS, /* negative character class */ |
87 | | L_NC, /* "(?:" no capture */ |
88 | | L_PLA, /* "(?=" positive lookahead */ |
89 | | L_NLA, /* "(?!" negative lookahead */ |
90 | | L_WORD, /* "\b" word boundary */ |
91 | | L_NWORD, /* "\B" non-word boundary */ |
92 | | L_REF, /* "\1" back-reference */ |
93 | | L_COUNT, /* {M,N} */ |
94 | | }; |
95 | | |
96 | | static int hex(struct cstate *g, int c) |
97 | 0 | { |
98 | 0 | if (c >= '0' && c <= '9') return c - '0'; |
99 | 0 | if (c >= 'a' && c <= 'f') return c - 'a' + 0xA; |
100 | 0 | if (c >= 'A' && c <= 'F') return c - 'A' + 0xA; |
101 | 0 | die(g, "invalid escape sequence"); |
102 | 0 | return 0; |
103 | 0 | } |
104 | | |
105 | | static int dec(struct cstate *g, int c) |
106 | 0 | { |
107 | 0 | if (c >= '0' && c <= '9') return c - '0'; |
108 | 0 | die(g, "invalid quantifier"); |
109 | 0 | return 0; |
110 | 0 | } |
111 | | |
112 | 0 | #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|-0123456789" |
113 | | |
114 | | static int isunicodeletter(int c) |
115 | 0 | { |
116 | 0 | return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c); |
117 | 0 | } |
118 | | |
119 | | static int nextrune(struct cstate *g) |
120 | 0 | { |
121 | 0 | if (!*g->source) { |
122 | 0 | g->yychar = EOF; |
123 | 0 | return 0; |
124 | 0 | } |
125 | 0 | g->source += chartorune(&g->yychar, g->source); |
126 | 0 | if (g->yychar == '\\') { |
127 | 0 | if (!*g->source) |
128 | 0 | die(g, "unterminated escape sequence"); |
129 | 0 | g->source += chartorune(&g->yychar, g->source); |
130 | 0 | switch (g->yychar) { |
131 | 0 | case 'f': g->yychar = '\f'; return 0; |
132 | 0 | case 'n': g->yychar = '\n'; return 0; |
133 | 0 | case 'r': g->yychar = '\r'; return 0; |
134 | 0 | case 't': g->yychar = '\t'; return 0; |
135 | 0 | case 'v': g->yychar = '\v'; return 0; |
136 | 0 | case 'c': |
137 | 0 | if (!g->source[0]) |
138 | 0 | die(g, "unterminated escape sequence"); |
139 | 0 | g->yychar = (*g->source++) & 31; |
140 | 0 | return 0; |
141 | 0 | case 'x': |
142 | 0 | if (!g->source[0] || !g->source[1]) |
143 | 0 | die(g, "unterminated escape sequence"); |
144 | 0 | g->yychar = hex(g, *g->source++) << 4; |
145 | 0 | g->yychar += hex(g, *g->source++); |
146 | 0 | if (g->yychar == 0) { |
147 | 0 | g->yychar = '0'; |
148 | 0 | return 1; |
149 | 0 | } |
150 | 0 | return 0; |
151 | 0 | case 'u': |
152 | 0 | if (!g->source[0] || !g->source[1] || !g->source[2] || !g->source[3]) |
153 | 0 | die(g, "unterminated escape sequence"); |
154 | 0 | g->yychar = hex(g, *g->source++) << 12; |
155 | 0 | g->yychar += hex(g, *g->source++) << 8; |
156 | 0 | g->yychar += hex(g, *g->source++) << 4; |
157 | 0 | g->yychar += hex(g, *g->source++); |
158 | 0 | if (g->yychar == 0) { |
159 | 0 | g->yychar = '0'; |
160 | 0 | return 1; |
161 | 0 | } |
162 | 0 | return 0; |
163 | 0 | case 0: |
164 | 0 | g->yychar = '0'; |
165 | 0 | return 1; |
166 | 0 | } |
167 | 0 | if (strchr(ESCAPES, g->yychar)) |
168 | 0 | return 1; |
169 | 0 | if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */ |
170 | 0 | die(g, "invalid escape character"); |
171 | 0 | return 0; |
172 | 0 | } |
173 | 0 | return 0; |
174 | 0 | } |
175 | | |
176 | | static int lexcount(struct cstate *g) |
177 | 0 | { |
178 | 0 | g->yychar = *g->source++; |
179 | |
|
180 | 0 | g->yymin = dec(g, g->yychar); |
181 | 0 | g->yychar = *g->source++; |
182 | 0 | while (g->yychar != ',' && g->yychar != '}') { |
183 | 0 | g->yymin = g->yymin * 10 + dec(g, g->yychar); |
184 | 0 | g->yychar = *g->source++; |
185 | 0 | if (g->yymin >= REPINF) |
186 | 0 | die(g, "numeric overflow"); |
187 | 0 | } |
188 | |
|
189 | 0 | if (g->yychar == ',') { |
190 | 0 | g->yychar = *g->source++; |
191 | 0 | if (g->yychar == '}') { |
192 | 0 | g->yymax = REPINF; |
193 | 0 | } else { |
194 | 0 | g->yymax = dec(g, g->yychar); |
195 | 0 | g->yychar = *g->source++; |
196 | 0 | while (g->yychar != '}') { |
197 | 0 | g->yymax = g->yymax * 10 + dec(g, g->yychar); |
198 | 0 | g->yychar = *g->source++; |
199 | 0 | if (g->yymax >= REPINF) |
200 | 0 | die(g, "numeric overflow"); |
201 | 0 | } |
202 | 0 | } |
203 | 0 | } else { |
204 | 0 | g->yymax = g->yymin; |
205 | 0 | } |
206 | |
|
207 | 0 | return L_COUNT; |
208 | 0 | } |
209 | | |
210 | | static void newcclass(struct cstate *g) |
211 | 0 | { |
212 | 0 | if (g->ncclass >= REG_MAXCLASS) |
213 | 0 | die(g, "too many character classes"); |
214 | 0 | g->yycc = g->cclass + g->ncclass++; |
215 | 0 | g->yycc->end = g->yycc->spans; |
216 | 0 | } |
217 | | |
218 | | static void addrange(struct cstate *g, Rune a, Rune b) |
219 | 0 | { |
220 | 0 | Reclass *cc = g->yycc; |
221 | 0 | Rune *p; |
222 | |
|
223 | 0 | if (a > b) |
224 | 0 | die(g, "invalid character class range"); |
225 | | |
226 | | /* extend existing ranges if they overlap */ |
227 | 0 | for (p = cc->spans; p < cc->end; p += 2) { |
228 | | /* completely inside old range */ |
229 | 0 | if (a >= p[0] && b <= p[1]) |
230 | 0 | return; |
231 | | |
232 | | /* completely swallows old range */ |
233 | 0 | if (a < p[0] && b >= p[1]) { |
234 | 0 | p[0] = a; |
235 | 0 | p[1] = b; |
236 | 0 | return; |
237 | 0 | } |
238 | | |
239 | | /* extend at start */ |
240 | 0 | if (b >= p[0] - 1 && b <= p[1] && a < p[0]) { |
241 | 0 | p[0] = a; |
242 | 0 | return; |
243 | 0 | } |
244 | | |
245 | | /* extend at end */ |
246 | 0 | if (a >= p[0] && a <= p[1] + 1 && b > p[1]) { |
247 | 0 | p[1] = b; |
248 | 0 | return; |
249 | 0 | } |
250 | 0 | } |
251 | | |
252 | 0 | if (cc->end + 2 >= cc->spans + nelem(cc->spans)) |
253 | 0 | die(g, "too many character class ranges"); |
254 | 0 | *cc->end++ = a; |
255 | 0 | *cc->end++ = b; |
256 | 0 | } |
257 | | |
258 | | static void addranges_d(struct cstate *g) |
259 | 0 | { |
260 | 0 | addrange(g, '0', '9'); |
261 | 0 | } |
262 | | |
263 | | static void addranges_D(struct cstate *g) |
264 | 0 | { |
265 | 0 | addrange(g, 0, '0'-1); |
266 | 0 | addrange(g, '9'+1, 0xFFFF); |
267 | 0 | } |
268 | | |
269 | | static void addranges_s(struct cstate *g) |
270 | 0 | { |
271 | 0 | addrange(g, 0x9, 0xD); |
272 | 0 | addrange(g, 0x20, 0x20); |
273 | 0 | addrange(g, 0xA0, 0xA0); |
274 | 0 | addrange(g, 0x2028, 0x2029); |
275 | 0 | addrange(g, 0xFEFF, 0xFEFF); |
276 | 0 | } |
277 | | |
278 | | static void addranges_S(struct cstate *g) |
279 | 0 | { |
280 | 0 | addrange(g, 0, 0x9-1); |
281 | 0 | addrange(g, 0xD+1, 0x20-1); |
282 | 0 | addrange(g, 0x20+1, 0xA0-1); |
283 | 0 | addrange(g, 0xA0+1, 0x2028-1); |
284 | 0 | addrange(g, 0x2029+1, 0xFEFF-1); |
285 | 0 | addrange(g, 0xFEFF+1, 0xFFFF); |
286 | 0 | } |
287 | | |
288 | | static void addranges_w(struct cstate *g) |
289 | 0 | { |
290 | 0 | addrange(g, '0', '9'); |
291 | 0 | addrange(g, 'A', 'Z'); |
292 | 0 | addrange(g, '_', '_'); |
293 | 0 | addrange(g, 'a', 'z'); |
294 | 0 | } |
295 | | |
296 | | static void addranges_W(struct cstate *g) |
297 | 0 | { |
298 | 0 | addrange(g, 0, '0'-1); |
299 | 0 | addrange(g, '9'+1, 'A'-1); |
300 | 0 | addrange(g, 'Z'+1, '_'-1); |
301 | 0 | addrange(g, '_'+1, 'a'-1); |
302 | 0 | addrange(g, 'z'+1, 0xFFFF); |
303 | 0 | } |
304 | | |
305 | | static int lexclass(struct cstate *g) |
306 | 0 | { |
307 | 0 | int type = L_CCLASS; |
308 | 0 | int quoted, havesave, havedash; |
309 | 0 | Rune save = 0; |
310 | |
|
311 | 0 | newcclass(g); |
312 | |
|
313 | 0 | quoted = nextrune(g); |
314 | 0 | if (!quoted && g->yychar == '^') { |
315 | 0 | type = L_NCCLASS; |
316 | 0 | quoted = nextrune(g); |
317 | 0 | } |
318 | |
|
319 | 0 | havesave = havedash = 0; |
320 | 0 | for (;;) { |
321 | 0 | if (g->yychar == EOF) |
322 | 0 | die(g, "unterminated character class"); |
323 | 0 | if (!quoted && g->yychar == ']') |
324 | 0 | break; |
325 | | |
326 | 0 | if (!quoted && g->yychar == '-') { |
327 | 0 | if (havesave) { |
328 | 0 | if (havedash) { |
329 | 0 | addrange(g, save, '-'); |
330 | 0 | havesave = havedash = 0; |
331 | 0 | } else { |
332 | 0 | havedash = 1; |
333 | 0 | } |
334 | 0 | } else { |
335 | 0 | save = '-'; |
336 | 0 | havesave = 1; |
337 | 0 | } |
338 | 0 | } else if (quoted && strchr("DSWdsw", g->yychar)) { |
339 | 0 | if (havesave) { |
340 | 0 | addrange(g, save, save); |
341 | 0 | if (havedash) |
342 | 0 | addrange(g, '-', '-'); |
343 | 0 | } |
344 | 0 | switch (g->yychar) { |
345 | 0 | case 'd': addranges_d(g); break; |
346 | 0 | case 's': addranges_s(g); break; |
347 | 0 | case 'w': addranges_w(g); break; |
348 | 0 | case 'D': addranges_D(g); break; |
349 | 0 | case 'S': addranges_S(g); break; |
350 | 0 | case 'W': addranges_W(g); break; |
351 | 0 | } |
352 | 0 | havesave = havedash = 0; |
353 | 0 | } else { |
354 | 0 | if (quoted) { |
355 | 0 | if (g->yychar == 'b') |
356 | 0 | g->yychar = '\b'; |
357 | 0 | else if (g->yychar == '0') |
358 | 0 | g->yychar = 0; |
359 | | /* else identity escape */ |
360 | 0 | } |
361 | 0 | if (havesave) { |
362 | 0 | if (havedash) { |
363 | 0 | addrange(g, save, g->yychar); |
364 | 0 | havesave = havedash = 0; |
365 | 0 | } else { |
366 | 0 | addrange(g, save, save); |
367 | 0 | save = g->yychar; |
368 | 0 | } |
369 | 0 | } else { |
370 | 0 | save = g->yychar; |
371 | 0 | havesave = 1; |
372 | 0 | } |
373 | 0 | } |
374 | | |
375 | 0 | quoted = nextrune(g); |
376 | 0 | } |
377 | | |
378 | 0 | if (havesave) { |
379 | 0 | addrange(g, save, save); |
380 | 0 | if (havedash) |
381 | 0 | addrange(g, '-', '-'); |
382 | 0 | } |
383 | |
|
384 | 0 | return type; |
385 | 0 | } |
386 | | |
387 | | static int lex(struct cstate *g) |
388 | 0 | { |
389 | 0 | int quoted = nextrune(g); |
390 | 0 | if (quoted) { |
391 | 0 | switch (g->yychar) { |
392 | 0 | case 'b': return L_WORD; |
393 | 0 | case 'B': return L_NWORD; |
394 | 0 | case 'd': newcclass(g); addranges_d(g); return L_CCLASS; |
395 | 0 | case 's': newcclass(g); addranges_s(g); return L_CCLASS; |
396 | 0 | case 'w': newcclass(g); addranges_w(g); return L_CCLASS; |
397 | 0 | case 'D': newcclass(g); addranges_d(g); return L_NCCLASS; |
398 | 0 | case 'S': newcclass(g); addranges_s(g); return L_NCCLASS; |
399 | 0 | case 'W': newcclass(g); addranges_w(g); return L_NCCLASS; |
400 | 0 | case '0': g->yychar = 0; return L_CHAR; |
401 | 0 | } |
402 | 0 | if (g->yychar >= '0' && g->yychar <= '9') { |
403 | 0 | g->yychar -= '0'; |
404 | 0 | if (*g->source >= '0' && *g->source <= '9') |
405 | 0 | g->yychar = g->yychar * 10 + *g->source++ - '0'; |
406 | 0 | return L_REF; |
407 | 0 | } |
408 | 0 | return L_CHAR; |
409 | 0 | } |
410 | | |
411 | 0 | switch (g->yychar) { |
412 | 0 | case EOF: |
413 | 0 | case '$': case ')': case '*': case '+': |
414 | 0 | case '.': case '?': case '^': case '|': |
415 | 0 | return g->yychar; |
416 | 0 | } |
417 | | |
418 | 0 | if (g->yychar == '{') |
419 | 0 | return lexcount(g); |
420 | 0 | if (g->yychar == '[') |
421 | 0 | return lexclass(g); |
422 | 0 | if (g->yychar == '(') { |
423 | 0 | if (g->source[0] == '?') { |
424 | 0 | if (g->source[1] == ':') { |
425 | 0 | g->source += 2; |
426 | 0 | return L_NC; |
427 | 0 | } |
428 | 0 | if (g->source[1] == '=') { |
429 | 0 | g->source += 2; |
430 | 0 | return L_PLA; |
431 | 0 | } |
432 | 0 | if (g->source[1] == '!') { |
433 | 0 | g->source += 2; |
434 | 0 | return L_NLA; |
435 | 0 | } |
436 | 0 | } |
437 | 0 | return '('; |
438 | 0 | } |
439 | | |
440 | 0 | return L_CHAR; |
441 | 0 | } |
442 | | |
443 | | /* Parse */ |
444 | | |
445 | | enum { |
446 | | P_CAT, P_ALT, P_REP, |
447 | | P_BOL, P_EOL, P_WORD, P_NWORD, |
448 | | P_PAR, P_PLA, P_NLA, |
449 | | P_ANY, P_CHAR, P_CCLASS, P_NCCLASS, |
450 | | P_REF, |
451 | | }; |
452 | | |
453 | | struct Renode { |
454 | | unsigned char type; |
455 | | unsigned char ng, m, n; |
456 | | Rune c; |
457 | | int cc; |
458 | | Renode *x; |
459 | | Renode *y; |
460 | | }; |
461 | | |
462 | | static Renode *newnode(struct cstate *g, int type) |
463 | 0 | { |
464 | 0 | Renode *node = g->pend++; |
465 | 0 | node->type = type; |
466 | 0 | node->cc = -1; |
467 | 0 | node->c = 0; |
468 | 0 | node->ng = 0; |
469 | 0 | node->m = 0; |
470 | 0 | node->n = 0; |
471 | 0 | node->x = node->y = NULL; |
472 | 0 | return node; |
473 | 0 | } |
474 | | |
475 | | static int empty(Renode *node) |
476 | 0 | { |
477 | 0 | if (!node) return 1; |
478 | 0 | switch (node->type) { |
479 | 0 | default: return 1; |
480 | 0 | case P_CAT: return empty(node->x) && empty(node->y); |
481 | 0 | case P_ALT: return empty(node->x) || empty(node->y); |
482 | 0 | case P_REP: return empty(node->x) || node->m == 0; |
483 | 0 | case P_PAR: return empty(node->x); |
484 | 0 | case P_REF: return empty(node->x); |
485 | 0 | case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0; |
486 | 0 | } |
487 | 0 | } |
488 | | |
489 | | static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max) |
490 | 0 | { |
491 | 0 | Renode *rep = newnode(g, P_REP); |
492 | 0 | if (max == REPINF && empty(atom)) |
493 | 0 | die(g, "infinite loop matching the empty string"); |
494 | 0 | rep->ng = ng; |
495 | 0 | rep->m = min; |
496 | 0 | rep->n = max; |
497 | 0 | rep->x = atom; |
498 | 0 | return rep; |
499 | 0 | } |
500 | | |
501 | | static void next(struct cstate *g) |
502 | 0 | { |
503 | 0 | g->lookahead = lex(g); |
504 | 0 | } |
505 | | |
506 | | static int accept(struct cstate *g, int t) |
507 | 0 | { |
508 | 0 | if (g->lookahead == t) { |
509 | 0 | next(g); |
510 | 0 | return 1; |
511 | 0 | } |
512 | 0 | return 0; |
513 | 0 | } |
514 | | |
515 | | static Renode *parsealt(struct cstate *g); |
516 | | |
517 | | static Renode *parseatom(struct cstate *g) |
518 | 0 | { |
519 | 0 | Renode *atom; |
520 | 0 | if (g->lookahead == L_CHAR) { |
521 | 0 | atom = newnode(g, P_CHAR); |
522 | 0 | atom->c = g->yychar; |
523 | 0 | next(g); |
524 | 0 | return atom; |
525 | 0 | } |
526 | 0 | if (g->lookahead == L_CCLASS) { |
527 | 0 | atom = newnode(g, P_CCLASS); |
528 | 0 | atom->cc = g->yycc - g->cclass; |
529 | 0 | next(g); |
530 | 0 | return atom; |
531 | 0 | } |
532 | 0 | if (g->lookahead == L_NCCLASS) { |
533 | 0 | atom = newnode(g, P_NCCLASS); |
534 | 0 | atom->cc = g->yycc - g->cclass; |
535 | 0 | next(g); |
536 | 0 | return atom; |
537 | 0 | } |
538 | 0 | if (g->lookahead == L_REF) { |
539 | 0 | atom = newnode(g, P_REF); |
540 | 0 | if (g->yychar == 0 || g->yychar >= g->nsub || !g->sub[g->yychar]) |
541 | 0 | die(g, "invalid back-reference"); |
542 | 0 | atom->n = g->yychar; |
543 | 0 | atom->x = g->sub[g->yychar]; |
544 | 0 | next(g); |
545 | 0 | return atom; |
546 | 0 | } |
547 | 0 | if (accept(g, '.')) |
548 | 0 | return newnode(g, P_ANY); |
549 | 0 | if (accept(g, '(')) { |
550 | 0 | atom = newnode(g, P_PAR); |
551 | 0 | if (g->nsub == REG_MAXSUB) |
552 | 0 | die(g, "too many captures"); |
553 | 0 | atom->n = g->nsub++; |
554 | 0 | atom->x = parsealt(g); |
555 | 0 | g->sub[atom->n] = atom; |
556 | 0 | if (!accept(g, ')')) |
557 | 0 | die(g, "unmatched '('"); |
558 | 0 | return atom; |
559 | 0 | } |
560 | 0 | if (accept(g, L_NC)) { |
561 | 0 | atom = parsealt(g); |
562 | 0 | if (!accept(g, ')')) |
563 | 0 | die(g, "unmatched '('"); |
564 | 0 | return atom; |
565 | 0 | } |
566 | 0 | if (accept(g, L_PLA)) { |
567 | 0 | atom = newnode(g, P_PLA); |
568 | 0 | atom->x = parsealt(g); |
569 | 0 | if (!accept(g, ')')) |
570 | 0 | die(g, "unmatched '('"); |
571 | 0 | return atom; |
572 | 0 | } |
573 | 0 | if (accept(g, L_NLA)) { |
574 | 0 | atom = newnode(g, P_NLA); |
575 | 0 | atom->x = parsealt(g); |
576 | 0 | if (!accept(g, ')')) |
577 | 0 | die(g, "unmatched '('"); |
578 | 0 | return atom; |
579 | 0 | } |
580 | 0 | die(g, "syntax error"); |
581 | 0 | return NULL; |
582 | 0 | } |
583 | | |
584 | | static Renode *parserep(struct cstate *g) |
585 | 0 | { |
586 | 0 | Renode *atom; |
587 | |
|
588 | 0 | if (accept(g, '^')) return newnode(g, P_BOL); |
589 | 0 | if (accept(g, '$')) return newnode(g, P_EOL); |
590 | 0 | if (accept(g, L_WORD)) return newnode(g, P_WORD); |
591 | 0 | if (accept(g, L_NWORD)) return newnode(g, P_NWORD); |
592 | | |
593 | 0 | atom = parseatom(g); |
594 | 0 | if (g->lookahead == L_COUNT) { |
595 | 0 | int min = g->yymin, max = g->yymax; |
596 | 0 | next(g); |
597 | 0 | if (max < min) |
598 | 0 | die(g, "invalid quantifier"); |
599 | 0 | return newrep(g, atom, accept(g, '?'), min, max); |
600 | 0 | } |
601 | 0 | if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF); |
602 | 0 | if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF); |
603 | 0 | if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1); |
604 | 0 | return atom; |
605 | 0 | } |
606 | | |
607 | | static Renode *parsecat(struct cstate *g) |
608 | 0 | { |
609 | 0 | Renode *cat, *head, **tail; |
610 | 0 | if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') { |
611 | | /* Build a right-leaning tree by splicing in new 'cat' at the tail. */ |
612 | 0 | head = parserep(g); |
613 | 0 | tail = &head; |
614 | 0 | while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') { |
615 | 0 | cat = newnode(g, P_CAT); |
616 | 0 | cat->x = *tail; |
617 | 0 | cat->y = parserep(g); |
618 | 0 | *tail = cat; |
619 | 0 | tail = &cat->y; |
620 | 0 | } |
621 | 0 | return head; |
622 | 0 | } |
623 | 0 | return NULL; |
624 | 0 | } |
625 | | |
626 | | static Renode *parsealt(struct cstate *g) |
627 | 0 | { |
628 | 0 | Renode *alt, *x; |
629 | 0 | alt = parsecat(g); |
630 | 0 | while (accept(g, '|')) { |
631 | 0 | x = alt; |
632 | 0 | alt = newnode(g, P_ALT); |
633 | 0 | alt->x = x; |
634 | 0 | alt->y = parsecat(g); |
635 | 0 | } |
636 | 0 | return alt; |
637 | 0 | } |
638 | | |
639 | | /* Compile */ |
640 | | |
641 | | enum { |
642 | | I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA, |
643 | | I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF, |
644 | | I_BOL, I_EOL, I_WORD, I_NWORD, |
645 | | I_LPAR, I_RPAR |
646 | | }; |
647 | | |
648 | | struct Reinst { |
649 | | unsigned char opcode; |
650 | | unsigned char n; |
651 | | Rune c; |
652 | | Reclass *cc; |
653 | | Reinst *x; |
654 | | Reinst *y; |
655 | | }; |
656 | | |
657 | | static int count(struct cstate *g, Renode *node, int depth) |
658 | 0 | { |
659 | 0 | int min, max, n; |
660 | 0 | if (!node) return 0; |
661 | 0 | if (++depth > REG_MAXREC) die(g, "stack overflow"); |
662 | 0 | switch (node->type) { |
663 | 0 | default: return 1; |
664 | 0 | case P_CAT: return count(g, node->x, depth) + count(g, node->y, depth); |
665 | 0 | case P_ALT: return count(g, node->x, depth) + count(g, node->y, depth) + 2; |
666 | 0 | case P_REP: |
667 | 0 | min = node->m; |
668 | 0 | max = node->n; |
669 | 0 | if (min == max) n = count(g, node->x, depth) * min; |
670 | 0 | else if (max < REPINF) n = count(g, node->x, depth) * max + (max - min); |
671 | 0 | else n = count(g, node->x, depth) * (min + 1) + 2; |
672 | 0 | if (n < 0 || n > REG_MAXPROG) die(g, "program too large"); |
673 | 0 | return n; |
674 | 0 | case P_PAR: return count(g, node->x, depth) + 2; |
675 | 0 | case P_PLA: return count(g, node->x, depth) + 2; |
676 | 0 | case P_NLA: return count(g, node->x, depth) + 2; |
677 | 0 | } |
678 | 0 | } |
679 | | |
680 | | static Reinst *emit(Reprog *prog, int opcode) |
681 | 0 | { |
682 | 0 | Reinst *inst = prog->end++; |
683 | 0 | inst->opcode = opcode; |
684 | 0 | inst->n = 0; |
685 | 0 | inst->c = 0; |
686 | 0 | inst->cc = NULL; |
687 | 0 | inst->x = inst->y = NULL; |
688 | 0 | return inst; |
689 | 0 | } |
690 | | |
691 | | static void compile(Reprog *prog, Renode *node) |
692 | 0 | { |
693 | 0 | Reinst *inst, *split, *jump; |
694 | 0 | int i; |
695 | |
|
696 | 0 | loop: |
697 | 0 | if (!node) |
698 | 0 | return; |
699 | | |
700 | 0 | switch (node->type) { |
701 | 0 | case P_CAT: |
702 | 0 | compile(prog, node->x); |
703 | 0 | node = node->y; |
704 | 0 | goto loop; |
705 | | |
706 | 0 | case P_ALT: |
707 | 0 | split = emit(prog, I_SPLIT); |
708 | 0 | compile(prog, node->x); |
709 | 0 | jump = emit(prog, I_JUMP); |
710 | 0 | compile(prog, node->y); |
711 | 0 | split->x = split + 1; |
712 | 0 | split->y = jump + 1; |
713 | 0 | jump->x = prog->end; |
714 | 0 | break; |
715 | | |
716 | 0 | case P_REP: |
717 | 0 | inst = NULL; /* silence compiler warning. assert(node->m > 0). */ |
718 | 0 | for (i = 0; i < node->m; ++i) { |
719 | 0 | inst = prog->end; |
720 | 0 | compile(prog, node->x); |
721 | 0 | } |
722 | 0 | if (node->m == node->n) |
723 | 0 | break; |
724 | 0 | if (node->n < REPINF) { |
725 | 0 | for (i = node->m; i < node->n; ++i) { |
726 | 0 | split = emit(prog, I_SPLIT); |
727 | 0 | compile(prog, node->x); |
728 | 0 | if (node->ng) { |
729 | 0 | split->y = split + 1; |
730 | 0 | split->x = prog->end; |
731 | 0 | } else { |
732 | 0 | split->x = split + 1; |
733 | 0 | split->y = prog->end; |
734 | 0 | } |
735 | 0 | } |
736 | 0 | } else if (node->m == 0) { |
737 | 0 | split = emit(prog, I_SPLIT); |
738 | 0 | compile(prog, node->x); |
739 | 0 | jump = emit(prog, I_JUMP); |
740 | 0 | if (node->ng) { |
741 | 0 | split->y = split + 1; |
742 | 0 | split->x = prog->end; |
743 | 0 | } else { |
744 | 0 | split->x = split + 1; |
745 | 0 | split->y = prog->end; |
746 | 0 | } |
747 | 0 | jump->x = split; |
748 | 0 | } else { |
749 | 0 | split = emit(prog, I_SPLIT); |
750 | 0 | if (node->ng) { |
751 | 0 | split->y = inst; |
752 | 0 | split->x = prog->end; |
753 | 0 | } else { |
754 | 0 | split->x = inst; |
755 | 0 | split->y = prog->end; |
756 | 0 | } |
757 | 0 | } |
758 | 0 | break; |
759 | | |
760 | 0 | case P_BOL: emit(prog, I_BOL); break; |
761 | 0 | case P_EOL: emit(prog, I_EOL); break; |
762 | 0 | case P_WORD: emit(prog, I_WORD); break; |
763 | 0 | case P_NWORD: emit(prog, I_NWORD); break; |
764 | | |
765 | 0 | case P_PAR: |
766 | 0 | inst = emit(prog, I_LPAR); |
767 | 0 | inst->n = node->n; |
768 | 0 | compile(prog, node->x); |
769 | 0 | inst = emit(prog, I_RPAR); |
770 | 0 | inst->n = node->n; |
771 | 0 | break; |
772 | 0 | case P_PLA: |
773 | 0 | split = emit(prog, I_PLA); |
774 | 0 | compile(prog, node->x); |
775 | 0 | emit(prog, I_END); |
776 | 0 | split->x = split + 1; |
777 | 0 | split->y = prog->end; |
778 | 0 | break; |
779 | 0 | case P_NLA: |
780 | 0 | split = emit(prog, I_NLA); |
781 | 0 | compile(prog, node->x); |
782 | 0 | emit(prog, I_END); |
783 | 0 | split->x = split + 1; |
784 | 0 | split->y = prog->end; |
785 | 0 | break; |
786 | | |
787 | 0 | case P_ANY: |
788 | 0 | emit(prog, I_ANY); |
789 | 0 | break; |
790 | 0 | case P_CHAR: |
791 | 0 | inst = emit(prog, I_CHAR); |
792 | 0 | inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c; |
793 | 0 | break; |
794 | 0 | case P_CCLASS: |
795 | 0 | inst = emit(prog, I_CCLASS); |
796 | 0 | inst->cc = prog->cclass + node->cc; |
797 | 0 | break; |
798 | 0 | case P_NCCLASS: |
799 | 0 | inst = emit(prog, I_NCCLASS); |
800 | 0 | inst->cc = prog->cclass + node->cc; |
801 | 0 | break; |
802 | 0 | case P_REF: |
803 | 0 | inst = emit(prog, I_REF); |
804 | 0 | inst->n = node->n; |
805 | 0 | break; |
806 | 0 | } |
807 | 0 | } |
808 | | |
809 | | #ifdef TEST |
810 | | static void dumpnode(struct cstate *g, Renode *node) |
811 | | { |
812 | | Rune *p; |
813 | | Reclass *cc; |
814 | | if (!node) { printf("Empty"); return; } |
815 | | switch (node->type) { |
816 | | case P_CAT: printf("Cat("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break; |
817 | | case P_ALT: printf("Alt("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break; |
818 | | case P_REP: |
819 | | printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n); |
820 | | dumpnode(g, node->x); |
821 | | printf(")"); |
822 | | break; |
823 | | case P_BOL: printf("Bol"); break; |
824 | | case P_EOL: printf("Eol"); break; |
825 | | case P_WORD: printf("Word"); break; |
826 | | case P_NWORD: printf("NotWord"); break; |
827 | | case P_PAR: printf("Par(%d,", node->n); dumpnode(g, node->x); printf(")"); break; |
828 | | case P_PLA: printf("PLA("); dumpnode(g, node->x); printf(")"); break; |
829 | | case P_NLA: printf("NLA("); dumpnode(g, node->x); printf(")"); break; |
830 | | case P_ANY: printf("Any"); break; |
831 | | case P_CHAR: printf("Char(%c)", node->c); break; |
832 | | case P_CCLASS: |
833 | | printf("Class("); |
834 | | cc = g->cclass + node->cc; |
835 | | for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]); |
836 | | printf(")"); |
837 | | break; |
838 | | case P_NCCLASS: |
839 | | printf("NotClass("); |
840 | | cc = g->cclass + node->cc; |
841 | | for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]); |
842 | | printf(")"); |
843 | | break; |
844 | | case P_REF: printf("Ref(%d)", node->n); break; |
845 | | } |
846 | | } |
847 | | |
848 | | static void dumpcclass(Reclass *cc) { |
849 | | Rune *p; |
850 | | for (p = cc->spans; p < cc->end; p += 2) { |
851 | | if (p[0] > 32 && p[0] < 127) |
852 | | printf(" %c", p[0]); |
853 | | else |
854 | | printf(" \\x%02x", p[0]); |
855 | | if (p[1] > 32 && p[1] < 127) |
856 | | printf("-%c", p[1]); |
857 | | else |
858 | | printf("-\\x%02x", p[1]); |
859 | | } |
860 | | putchar('\n'); |
861 | | } |
862 | | |
863 | | static void dumpprog(Reprog *prog) |
864 | | { |
865 | | Reinst *inst; |
866 | | int i; |
867 | | for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) { |
868 | | printf("% 5d: ", i); |
869 | | switch (inst->opcode) { |
870 | | case I_END: puts("end"); break; |
871 | | case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break; |
872 | | case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; |
873 | | case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; |
874 | | case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; |
875 | | case I_ANY: puts("any"); break; |
876 | | case I_ANYNL: puts("anynl"); break; |
877 | | case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break; |
878 | | case I_CCLASS: printf("cclass"); dumpcclass(inst->cc); break; |
879 | | case I_NCCLASS: printf("ncclass"); dumpcclass(inst->cc); break; |
880 | | case I_REF: printf("ref %d\n", inst->n); break; |
881 | | case I_BOL: puts("bol"); break; |
882 | | case I_EOL: puts("eol"); break; |
883 | | case I_WORD: puts("word"); break; |
884 | | case I_NWORD: puts("nword"); break; |
885 | | case I_LPAR: printf("lpar %d\n", inst->n); break; |
886 | | case I_RPAR: printf("rpar %d\n", inst->n); break; |
887 | | } |
888 | | } |
889 | | } |
890 | | #endif |
891 | | |
892 | | Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx, |
893 | | const char *pattern, int cflags, const char **errorp) |
894 | 0 | { |
895 | 0 | struct cstate g; |
896 | 0 | Renode *node; |
897 | 0 | Reinst *split, *jump; |
898 | 0 | int i, n; |
899 | |
|
900 | 0 | g.pstart = NULL; |
901 | 0 | g.prog = NULL; |
902 | |
|
903 | 0 | if (setjmp(g.kaboom)) { |
904 | 0 | if (errorp) *errorp = g.error; |
905 | 0 | alloc(ctx, g.pstart, 0); |
906 | 0 | if (g.prog) { |
907 | 0 | alloc(ctx, g.prog->cclass, 0); |
908 | 0 | alloc(ctx, g.prog->start, 0); |
909 | 0 | alloc(ctx, g.prog, 0); |
910 | 0 | } |
911 | 0 | return NULL; |
912 | 0 | } |
913 | | |
914 | 0 | g.prog = alloc(ctx, NULL, sizeof (Reprog)); |
915 | 0 | if (!g.prog) |
916 | 0 | die(&g, "cannot allocate regular expression"); |
917 | 0 | g.prog->start = NULL; |
918 | 0 | g.prog->cclass = NULL; |
919 | |
|
920 | 0 | n = strlen(pattern) * 2; |
921 | 0 | if (n > REG_MAXPROG) |
922 | 0 | die(&g, "program too large"); |
923 | 0 | if (n > 0) { |
924 | 0 | g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n); |
925 | 0 | if (!g.pstart) |
926 | 0 | die(&g, "cannot allocate regular expression parse list"); |
927 | 0 | } |
928 | |
|
929 | 0 | g.source = pattern; |
930 | 0 | g.ncclass = 0; |
931 | 0 | g.nsub = 1; |
932 | 0 | for (i = 0; i < REG_MAXSUB; ++i) |
933 | 0 | g.sub[i] = 0; |
934 | |
|
935 | 0 | g.prog->flags = cflags; |
936 | |
|
937 | 0 | next(&g); |
938 | 0 | node = parsealt(&g); |
939 | 0 | if (g.lookahead == ')') |
940 | 0 | die(&g, "unmatched ')'"); |
941 | 0 | if (g.lookahead != EOF) |
942 | 0 | die(&g, "syntax error"); |
943 | |
|
944 | | #ifdef TEST |
945 | | dumpnode(&g, node); |
946 | | putchar('\n'); |
947 | | #endif |
948 | |
|
949 | 0 | n = 6 + count(&g, node, 0); |
950 | 0 | if (n < 0 || n > REG_MAXPROG) |
951 | 0 | die(&g, "program too large"); |
952 | |
|
953 | 0 | g.prog->nsub = g.nsub; |
954 | 0 | g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst)); |
955 | 0 | if (!g.prog->start) |
956 | 0 | die(&g, "cannot allocate regular expression instruction list"); |
957 | |
|
958 | 0 | if (g.ncclass > 0) { |
959 | 0 | g.prog->cclass = alloc(ctx, NULL, g.ncclass * sizeof (Reclass)); |
960 | 0 | if (!g.prog->cclass) |
961 | 0 | die(&g, "cannot allocate regular expression character class list"); |
962 | 0 | memcpy(g.prog->cclass, g.cclass, g.ncclass * sizeof (Reclass)); |
963 | 0 | for (i = 0; i < g.ncclass; ++i) |
964 | 0 | g.prog->cclass[i].end = g.prog->cclass[i].spans + (g.cclass[i].end - g.cclass[i].spans); |
965 | 0 | } |
966 | |
|
967 | 0 | split = emit(g.prog, I_SPLIT); |
968 | 0 | split->x = split + 3; |
969 | 0 | split->y = split + 1; |
970 | 0 | emit(g.prog, I_ANYNL); |
971 | 0 | jump = emit(g.prog, I_JUMP); |
972 | 0 | jump->x = split; |
973 | 0 | emit(g.prog, I_LPAR); |
974 | 0 | compile(g.prog, node); |
975 | 0 | emit(g.prog, I_RPAR); |
976 | 0 | emit(g.prog, I_END); |
977 | |
|
978 | | #ifdef TEST |
979 | | dumpprog(g.prog); |
980 | | #endif |
981 | |
|
982 | 0 | alloc(ctx, g.pstart, 0); |
983 | |
|
984 | 0 | if (errorp) *errorp = NULL; |
985 | 0 | return g.prog; |
986 | 0 | } |
987 | | |
988 | | void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog) |
989 | 0 | { |
990 | 0 | if (prog) { |
991 | 0 | if (prog->cclass) |
992 | 0 | alloc(ctx, prog->cclass, 0); |
993 | 0 | alloc(ctx, prog->start, 0); |
994 | 0 | alloc(ctx, prog, 0); |
995 | 0 | } |
996 | 0 | } |
997 | | |
998 | | static void *default_alloc(void *ctx, void *p, int n) |
999 | 0 | { |
1000 | 0 | if (n == 0) { |
1001 | 0 | free(p); |
1002 | 0 | return NULL; |
1003 | 0 | } |
1004 | 0 | return realloc(p, (size_t)n); |
1005 | 0 | } |
1006 | | |
1007 | | Reprog *regcomp(const char *pattern, int cflags, const char **errorp) |
1008 | 0 | { |
1009 | 0 | return regcompx(default_alloc, NULL, pattern, cflags, errorp); |
1010 | 0 | } |
1011 | | |
1012 | | void regfree(Reprog *prog) |
1013 | 0 | { |
1014 | 0 | regfreex(default_alloc, NULL, prog); |
1015 | 0 | } |
1016 | | |
1017 | | /* Match */ |
1018 | | |
1019 | | static int isnewline(int c) |
1020 | 0 | { |
1021 | 0 | return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029; |
1022 | 0 | } |
1023 | | |
1024 | | static int iswordchar(int c) |
1025 | 0 | { |
1026 | 0 | return c == '_' || |
1027 | 0 | (c >= 'a' && c <= 'z') || |
1028 | 0 | (c >= 'A' && c <= 'Z') || |
1029 | 0 | (c >= '0' && c <= '9'); |
1030 | 0 | } |
1031 | | |
1032 | | static int incclass(Reclass *cc, Rune c) |
1033 | 0 | { |
1034 | 0 | Rune *p; |
1035 | 0 | for (p = cc->spans; p < cc->end; p += 2) |
1036 | 0 | if (p[0] <= c && c <= p[1]) |
1037 | 0 | return 1; |
1038 | 0 | return 0; |
1039 | 0 | } |
1040 | | |
1041 | | static int incclasscanon(Reclass *cc, Rune c) |
1042 | 0 | { |
1043 | 0 | Rune *p, r; |
1044 | 0 | for (p = cc->spans; p < cc->end; p += 2) |
1045 | 0 | for (r = p[0]; r <= p[1]; ++r) |
1046 | 0 | if (c == canon(r)) |
1047 | 0 | return 1; |
1048 | 0 | return 0; |
1049 | 0 | } |
1050 | | |
1051 | | static int strncmpcanon(const char *a, const char *b, int n) |
1052 | 0 | { |
1053 | 0 | Rune ra, rb; |
1054 | 0 | int c; |
1055 | 0 | while (n--) { |
1056 | 0 | if (!*a) return -1; |
1057 | 0 | if (!*b) return 1; |
1058 | 0 | a += chartorune(&ra, a); |
1059 | 0 | b += chartorune(&rb, b); |
1060 | 0 | c = canon(ra) - canon(rb); |
1061 | 0 | if (c) |
1062 | 0 | return c; |
1063 | 0 | } |
1064 | 0 | return 0; |
1065 | 0 | } |
1066 | | |
1067 | | static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out, int depth) |
1068 | 0 | { |
1069 | 0 | Resub scratch; |
1070 | 0 | int result; |
1071 | 0 | int i; |
1072 | 0 | Rune c; |
1073 | | |
1074 | | /* stack overflow */ |
1075 | 0 | if (depth > REG_MAXREC) |
1076 | 0 | return -1; |
1077 | | |
1078 | 0 | for (;;) { |
1079 | 0 | switch (pc->opcode) { |
1080 | 0 | case I_END: |
1081 | 0 | return 0; |
1082 | 0 | case I_JUMP: |
1083 | 0 | pc = pc->x; |
1084 | 0 | break; |
1085 | 0 | case I_SPLIT: |
1086 | 0 | scratch = *out; |
1087 | 0 | result = match(pc->x, sp, bol, flags, &scratch, depth+1); |
1088 | 0 | if (result == -1) |
1089 | 0 | return -1; |
1090 | 0 | if (result == 0) { |
1091 | 0 | *out = scratch; |
1092 | 0 | return 0; |
1093 | 0 | } |
1094 | 0 | pc = pc->y; |
1095 | 0 | break; |
1096 | | |
1097 | 0 | case I_PLA: |
1098 | 0 | result = match(pc->x, sp, bol, flags, out, depth+1); |
1099 | 0 | if (result == -1) |
1100 | 0 | return -1; |
1101 | 0 | if (result == 1) |
1102 | 0 | return 1; |
1103 | 0 | pc = pc->y; |
1104 | 0 | break; |
1105 | 0 | case I_NLA: |
1106 | 0 | scratch = *out; |
1107 | 0 | result = match(pc->x, sp, bol, flags, &scratch, depth+1); |
1108 | 0 | if (result == -1) |
1109 | 0 | return -1; |
1110 | 0 | if (result == 0) |
1111 | 0 | return 1; |
1112 | 0 | pc = pc->y; |
1113 | 0 | break; |
1114 | | |
1115 | 0 | case I_ANYNL: |
1116 | 0 | if (!*sp) return 1; |
1117 | 0 | sp += chartorune(&c, sp); |
1118 | 0 | pc = pc + 1; |
1119 | 0 | break; |
1120 | 0 | case I_ANY: |
1121 | 0 | if (!*sp) return 1; |
1122 | 0 | sp += chartorune(&c, sp); |
1123 | 0 | if (isnewline(c)) |
1124 | 0 | return 1; |
1125 | 0 | pc = pc + 1; |
1126 | 0 | break; |
1127 | 0 | case I_CHAR: |
1128 | 0 | if (!*sp) return 1; |
1129 | 0 | sp += chartorune(&c, sp); |
1130 | 0 | if (flags & REG_ICASE) |
1131 | 0 | c = canon(c); |
1132 | 0 | if (c != pc->c) |
1133 | 0 | return 1; |
1134 | 0 | pc = pc + 1; |
1135 | 0 | break; |
1136 | 0 | case I_CCLASS: |
1137 | 0 | if (!*sp) return 1; |
1138 | 0 | sp += chartorune(&c, sp); |
1139 | 0 | if (flags & REG_ICASE) { |
1140 | 0 | if (!incclasscanon(pc->cc, canon(c))) |
1141 | 0 | return 1; |
1142 | 0 | } else { |
1143 | 0 | if (!incclass(pc->cc, c)) |
1144 | 0 | return 1; |
1145 | 0 | } |
1146 | 0 | pc = pc + 1; |
1147 | 0 | break; |
1148 | 0 | case I_NCCLASS: |
1149 | 0 | if (!*sp) return 1; |
1150 | 0 | sp += chartorune(&c, sp); |
1151 | 0 | if (flags & REG_ICASE) { |
1152 | 0 | if (incclasscanon(pc->cc, canon(c))) |
1153 | 0 | return 1; |
1154 | 0 | } else { |
1155 | 0 | if (incclass(pc->cc, c)) |
1156 | 0 | return 1; |
1157 | 0 | } |
1158 | 0 | pc = pc + 1; |
1159 | 0 | break; |
1160 | 0 | case I_REF: |
1161 | 0 | i = out->sub[pc->n].ep - out->sub[pc->n].sp; |
1162 | 0 | if (flags & REG_ICASE) { |
1163 | 0 | if (strncmpcanon(sp, out->sub[pc->n].sp, i)) |
1164 | 0 | return 1; |
1165 | 0 | } else { |
1166 | 0 | if (strncmp(sp, out->sub[pc->n].sp, i)) |
1167 | 0 | return 1; |
1168 | 0 | } |
1169 | 0 | if (i > 0) |
1170 | 0 | sp += i; |
1171 | 0 | pc = pc + 1; |
1172 | 0 | break; |
1173 | | |
1174 | 0 | case I_BOL: |
1175 | 0 | if (sp == bol && !(flags & REG_NOTBOL)) { |
1176 | 0 | pc = pc + 1; |
1177 | 0 | break; |
1178 | 0 | } |
1179 | 0 | if (flags & REG_NEWLINE) { |
1180 | 0 | if (sp > bol && isnewline(sp[-1])) { |
1181 | 0 | pc = pc + 1; |
1182 | 0 | break; |
1183 | 0 | } |
1184 | 0 | } |
1185 | 0 | return 1; |
1186 | 0 | case I_EOL: |
1187 | 0 | if (*sp == 0) { |
1188 | 0 | pc = pc + 1; |
1189 | 0 | break; |
1190 | 0 | } |
1191 | 0 | if (flags & REG_NEWLINE) { |
1192 | 0 | if (isnewline(*sp)) { |
1193 | 0 | pc = pc + 1; |
1194 | 0 | break; |
1195 | 0 | } |
1196 | 0 | } |
1197 | 0 | return 1; |
1198 | 0 | case I_WORD: |
1199 | 0 | i = sp > bol && iswordchar(sp[-1]); |
1200 | 0 | i ^= iswordchar(sp[0]); |
1201 | 0 | if (!i) |
1202 | 0 | return 1; |
1203 | 0 | pc = pc + 1; |
1204 | 0 | break; |
1205 | 0 | case I_NWORD: |
1206 | 0 | i = sp > bol && iswordchar(sp[-1]); |
1207 | 0 | i ^= iswordchar(sp[0]); |
1208 | 0 | if (i) |
1209 | 0 | return 1; |
1210 | 0 | pc = pc + 1; |
1211 | 0 | break; |
1212 | | |
1213 | 0 | case I_LPAR: |
1214 | 0 | out->sub[pc->n].sp = sp; |
1215 | 0 | pc = pc + 1; |
1216 | 0 | break; |
1217 | 0 | case I_RPAR: |
1218 | 0 | out->sub[pc->n].ep = sp; |
1219 | 0 | pc = pc + 1; |
1220 | 0 | break; |
1221 | 0 | default: |
1222 | 0 | return 1; |
1223 | 0 | } |
1224 | 0 | } |
1225 | 0 | } |
1226 | | |
1227 | | int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags) |
1228 | 0 | { |
1229 | 0 | Resub scratch; |
1230 | 0 | int i; |
1231 | |
|
1232 | 0 | if (!sub) |
1233 | 0 | sub = &scratch; |
1234 | |
|
1235 | 0 | sub->nsub = prog->nsub; |
1236 | 0 | for (i = 0; i < REG_MAXSUB; ++i) |
1237 | 0 | sub->sub[i].sp = sub->sub[i].ep = NULL; |
1238 | |
|
1239 | 0 | return match(prog->start, sp, sp, prog->flags | eflags, sub, 0); |
1240 | 0 | } |
1241 | | |
1242 | | #ifdef TEST |
1243 | | int main(int argc, char **argv) |
1244 | | { |
1245 | | const char *error; |
1246 | | const char *s; |
1247 | | Reprog *p; |
1248 | | Resub m; |
1249 | | int i; |
1250 | | |
1251 | | if (argc > 1) { |
1252 | | p = regcomp(argv[1], 0, &error); |
1253 | | if (!p) { |
1254 | | fprintf(stderr, "regcomp: %s\n", error); |
1255 | | return 1; |
1256 | | } |
1257 | | |
1258 | | if (argc > 2) { |
1259 | | s = argv[2]; |
1260 | | printf("nsub = %d\n", p->nsub); |
1261 | | if (!regexec(p, s, &m, 0)) { |
1262 | | for (i = 0; i < m.nsub; ++i) { |
1263 | | int n = m.sub[i].ep - m.sub[i].sp; |
1264 | | if (n > 0) |
1265 | | printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp); |
1266 | | else |
1267 | | printf("match %d: n=0 ''\n", i); |
1268 | | } |
1269 | | } else { |
1270 | | printf("no match\n"); |
1271 | | } |
1272 | | } |
1273 | | } |
1274 | | |
1275 | | return 0; |
1276 | | } |
1277 | | #endif |