/src/librdkafka/src/regexp.c
Line | Count | Source |
1 | | /** |
2 | | * Copyright: public domain |
3 | | * |
4 | | * From https://github.com/ccxvii/minilibs sha |
5 | | * 875c33568b5a4aa4fb3dd0c52ea98f7f0e5ca684: |
6 | | * |
7 | | * These libraries are in the public domain (or the equivalent where that is not |
8 | | * possible). You can do anything you want with them. You have no legal |
9 | | * obligation to do anything else, although I appreciate attribution. |
10 | | */ |
11 | | |
12 | | #include "rd.h" |
13 | | |
14 | | #include <stdlib.h> |
15 | | #include <string.h> |
16 | | #include <setjmp.h> |
17 | | #include <stdio.h> |
18 | | |
19 | | #include "regexp.h" |
20 | | |
21 | 10.2k | #define nelem(a) (sizeof(a) / sizeof(a)[0]) |
22 | | |
23 | | typedef unsigned int Rune; |
24 | | |
25 | 528 | static int isalpharune(Rune c) { |
26 | | /* TODO: Add unicode support */ |
27 | 528 | return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); |
28 | 528 | } |
29 | | |
30 | 0 | static Rune toupperrune(Rune c) { |
31 | | /* TODO: Add unicode support */ |
32 | 0 | if (c >= 'a' && c <= 'z') |
33 | 0 | return c - 'a' + 'A'; |
34 | 0 | return c; |
35 | 0 | } |
36 | | |
37 | 256k | static int chartorune(Rune *r, const char *s) { |
38 | | /* TODO: Add UTF-8 decoding */ |
39 | 256k | *r = *s; |
40 | 256k | return 1; |
41 | 256k | } |
42 | | |
43 | 28.3k | #define REPINF 255 |
44 | 0 | #define MAXTHREAD 1000 |
45 | 36.3k | #define MAXSUB REG_MAXSUB |
46 | | |
47 | | typedef struct Reclass Reclass; |
48 | | typedef struct Renode Renode; |
49 | | typedef struct Reinst Reinst; |
50 | | typedef struct Rethread Rethread; |
51 | | typedef struct Restate Restate; |
52 | | |
53 | | struct Reclass { |
54 | | Rune *end; |
55 | | Rune spans[64]; |
56 | | }; |
57 | | |
58 | | struct Restate { |
59 | | Reprog *prog; |
60 | | Renode *pstart, *pend; |
61 | | |
62 | | const char *source; |
63 | | unsigned int ncclass; |
64 | | unsigned int nsub; |
65 | | Renode *sub[MAXSUB]; |
66 | | |
67 | | int lookahead; |
68 | | Rune yychar; |
69 | | Reclass *yycc; |
70 | | int yymin, yymax; |
71 | | |
72 | | const char *error; |
73 | | jmp_buf kaboom; |
74 | | }; |
75 | | |
76 | | struct Reprog { |
77 | | Reinst *start, *end; |
78 | | int flags; |
79 | | unsigned int nsub; |
80 | | Reclass cclass[16]; |
81 | | Restate g; /**< Upstream has this as a global variable */ |
82 | | }; |
83 | | |
84 | 1.14k | static void die(Restate *g, const char *message) { |
85 | 1.14k | g->error = message; |
86 | 1.14k | longjmp(g->kaboom, 1); |
87 | 1.14k | } |
88 | | |
89 | 0 | static Rune canon(Rune c) { |
90 | 0 | Rune u = toupperrune(c); |
91 | 0 | if (c >= 128 && u < 128) |
92 | 0 | return c; |
93 | 0 | return u; |
94 | 0 | } |
95 | | |
96 | | /* Scan */ |
97 | | |
98 | | enum { |
99 | | L_CHAR = 256, |
100 | | L_CCLASS, /* character class */ |
101 | | L_NCCLASS, /* negative character class */ |
102 | | L_NC, /* "(?:" no capture */ |
103 | | L_PLA, /* "(?=" positive lookahead */ |
104 | | L_NLA, /* "(?!" negative lookahead */ |
105 | | L_WORD, /* "\b" word boundary */ |
106 | | L_NWORD, /* "\B" non-word boundary */ |
107 | | L_REF, /* "\1" back-reference */ |
108 | | L_COUNT /* {M,N} */ |
109 | | }; |
110 | | |
111 | 9.37k | static int hex(Restate *g, int c) { |
112 | 9.37k | if (c >= '0' && c <= '9') |
113 | 3.96k | return c - '0'; |
114 | 5.41k | if (c >= 'a' && c <= 'f') |
115 | 1.97k | return c - 'a' + 0xA; |
116 | 3.44k | if (c >= 'A' && c <= 'F') |
117 | 3.16k | return c - 'A' + 0xA; |
118 | 276 | die(g, "invalid escape sequence"); |
119 | 276 | return 0; |
120 | 3.44k | } |
121 | | |
122 | 5.62k | static int dec(Restate *g, int c) { |
123 | 5.62k | if (c >= '0' && c <= '9') |
124 | 5.50k | return c - '0'; |
125 | 117 | die(g, "invalid quantifier"); |
126 | 117 | return 0; |
127 | 5.62k | } |
128 | | |
129 | 5.25k | #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789" |
130 | | |
131 | 245k | static int nextrune(Restate *g) { |
132 | 245k | g->source += chartorune(&g->yychar, g->source); |
133 | 245k | if (g->yychar == '\\') { |
134 | 11.1k | g->source += chartorune(&g->yychar, g->source); |
135 | 11.1k | switch (g->yychar) { |
136 | 7 | case 0: |
137 | 7 | die(g, "unterminated escape sequence"); |
138 | 1.35k | case 'f': |
139 | 1.35k | g->yychar = '\f'; |
140 | 1.35k | return 0; |
141 | 327 | case 'n': |
142 | 327 | g->yychar = '\n'; |
143 | 327 | return 0; |
144 | 203 | case 'r': |
145 | 203 | g->yychar = '\r'; |
146 | 203 | return 0; |
147 | 201 | case 't': |
148 | 201 | g->yychar = '\t'; |
149 | 201 | return 0; |
150 | 194 | case 'v': |
151 | 194 | g->yychar = '\v'; |
152 | 194 | return 0; |
153 | 378 | case 'c': |
154 | 378 | g->yychar = (*g->source++) & 31; |
155 | 378 | return 0; |
156 | 1.58k | case 'x': |
157 | 1.58k | g->yychar = hex(g, *g->source++) << 4; |
158 | 1.58k | g->yychar += hex(g, *g->source++); |
159 | 1.58k | if (g->yychar == 0) { |
160 | 496 | g->yychar = '0'; |
161 | 496 | return 1; |
162 | 496 | } |
163 | 1.09k | return 0; |
164 | 1.63k | case 'u': |
165 | 1.63k | g->yychar = hex(g, *g->source++) << 12; |
166 | 1.63k | g->yychar += hex(g, *g->source++) << 8; |
167 | 1.63k | g->yychar += hex(g, *g->source++) << 4; |
168 | 1.63k | g->yychar += hex(g, *g->source++); |
169 | 1.63k | if (g->yychar == 0) { |
170 | 533 | g->yychar = '0'; |
171 | 533 | return 1; |
172 | 533 | } |
173 | 1.09k | return 0; |
174 | 11.1k | } |
175 | 5.25k | if (strchr(ESCAPES, g->yychar)) |
176 | 4.72k | return 1; |
177 | 528 | if (isalpharune(g->yychar) || |
178 | 517 | g->yychar == '_') /* check identity escape */ |
179 | 12 | die(g, "invalid escape character"); |
180 | 528 | return 0; |
181 | 5.25k | } |
182 | 234k | return 0; |
183 | 245k | } |
184 | | |
185 | 2.03k | static int lexcount(Restate *g) { |
186 | 2.03k | g->yychar = *g->source++; |
187 | | |
188 | 2.03k | g->yymin = dec(g, g->yychar); |
189 | 2.03k | g->yychar = *g->source++; |
190 | 4.04k | while (g->yychar != ',' && g->yychar != '}') { |
191 | 2.01k | g->yymin = g->yymin * 10 + dec(g, g->yychar); |
192 | 2.01k | g->yychar = *g->source++; |
193 | 2.01k | } |
194 | 2.03k | if (g->yymin >= REPINF) |
195 | 18 | die(g, "numeric overflow"); |
196 | | |
197 | 2.03k | if (g->yychar == ',') { |
198 | 831 | g->yychar = *g->source++; |
199 | 831 | if (g->yychar == '}') { |
200 | 251 | g->yymax = REPINF; |
201 | 580 | } else { |
202 | 580 | g->yymax = dec(g, g->yychar); |
203 | 580 | g->yychar = *g->source++; |
204 | 1.57k | while (g->yychar != '}') { |
205 | 997 | g->yymax = g->yymax * 10 + dec(g, g->yychar); |
206 | 997 | g->yychar = *g->source++; |
207 | 997 | } |
208 | 580 | if (g->yymax >= REPINF) |
209 | 19 | die(g, "numeric overflow"); |
210 | 580 | } |
211 | 1.20k | } else { |
212 | 1.20k | g->yymax = g->yymin; |
213 | 1.20k | } |
214 | | |
215 | 2.03k | return L_COUNT; |
216 | 2.03k | } |
217 | | |
218 | 1.37k | static void newcclass(Restate *g) { |
219 | 1.37k | if (g->ncclass >= nelem(g->prog->cclass)) |
220 | 7 | die(g, "too many character classes"); |
221 | 1.37k | g->yycc = g->prog->cclass + g->ncclass++; |
222 | 1.37k | g->yycc->end = g->yycc->spans; |
223 | 1.37k | } |
224 | | |
225 | 8.90k | static void addrange(Restate *g, Rune a, Rune b) { |
226 | 8.90k | if (a > b) |
227 | 46 | die(g, "invalid character class range"); |
228 | 8.90k | if (g->yycc->end + 2 == g->yycc->spans + nelem(g->yycc->spans)) |
229 | 44 | die(g, "too many character class ranges"); |
230 | 8.90k | *g->yycc->end++ = a; |
231 | 8.90k | *g->yycc->end++ = b; |
232 | 8.90k | } |
233 | | |
234 | 708 | static void addranges_d(Restate *g) { |
235 | 708 | addrange(g, '0', '9'); |
236 | 708 | } |
237 | | |
238 | 278 | static void addranges_D(Restate *g) { |
239 | 278 | addrange(g, 0, '0' - 1); |
240 | 278 | addrange(g, '9' + 1, 0xFFFF); |
241 | 278 | } |
242 | | |
243 | 468 | static void addranges_s(Restate *g) { |
244 | 468 | addrange(g, 0x9, 0x9); |
245 | 468 | addrange(g, 0xA, 0xD); |
246 | 468 | addrange(g, 0x20, 0x20); |
247 | 468 | addrange(g, 0xA0, 0xA0); |
248 | 468 | addrange(g, 0x2028, 0x2029); |
249 | 468 | addrange(g, 0xFEFF, 0xFEFF); |
250 | 468 | } |
251 | | |
252 | 4 | static void addranges_S(Restate *g) { |
253 | 4 | addrange(g, 0, 0x9 - 1); |
254 | 4 | addrange(g, 0x9 + 1, 0xA - 1); |
255 | 4 | addrange(g, 0xD + 1, 0x20 - 1); |
256 | 4 | addrange(g, 0x20 + 1, 0xA0 - 1); |
257 | 4 | addrange(g, 0xA0 + 1, 0x2028 - 1); |
258 | 4 | addrange(g, 0x2029 + 1, 0xFEFF - 1); |
259 | 4 | addrange(g, 0xFEFF + 1, 0xFFFF); |
260 | 4 | } |
261 | | |
262 | 362 | static void addranges_w(Restate *g) { |
263 | 362 | addrange(g, '0', '9'); |
264 | 362 | addrange(g, 'A', 'Z'); |
265 | 362 | addrange(g, '_', '_'); |
266 | 362 | addrange(g, 'a', 'z'); |
267 | 362 | } |
268 | | |
269 | 90 | static void addranges_W(Restate *g) { |
270 | 90 | addrange(g, 0, '0' - 1); |
271 | 90 | addrange(g, '9' + 1, 'A' - 1); |
272 | 90 | addrange(g, 'Z' + 1, '_' - 1); |
273 | 90 | addrange(g, '_' + 1, 'a' - 1); |
274 | 90 | addrange(g, 'z' + 1, 0xFFFF); |
275 | 90 | } |
276 | | |
277 | 763 | static int lexclass(Restate *g) { |
278 | 763 | int type = L_CCLASS; |
279 | 763 | int quoted, havesave, havedash; |
280 | 763 | Rune save = 0; |
281 | | |
282 | 763 | newcclass(g); |
283 | | |
284 | 763 | quoted = nextrune(g); |
285 | 763 | if (!quoted && g->yychar == '^') { |
286 | 36 | type = L_NCCLASS; |
287 | 36 | quoted = nextrune(g); |
288 | 36 | } |
289 | | |
290 | 763 | havesave = havedash = 0; |
291 | 6.81k | for (;;) { |
292 | 6.81k | if (g->yychar == 0) |
293 | 235 | die(g, "unterminated character class"); |
294 | 6.81k | if (!quoted && g->yychar == ']') |
295 | 439 | break; |
296 | | |
297 | 6.37k | if (!quoted && g->yychar == '-') { |
298 | 2.05k | if (havesave) { |
299 | 1.53k | if (havedash) { |
300 | 446 | addrange(g, save, '-'); |
301 | 446 | havesave = havedash = 0; |
302 | 1.09k | } else { |
303 | 1.09k | havedash = 1; |
304 | 1.09k | } |
305 | 1.53k | } else { |
306 | 514 | save = '-'; |
307 | 514 | havesave = 1; |
308 | 514 | } |
309 | 4.32k | } else if (quoted && strchr("DSWdsw", g->yychar)) { |
310 | 1.31k | if (havesave) { |
311 | 441 | addrange(g, save, save); |
312 | 441 | if (havedash) |
313 | 195 | addrange(g, '-', '-'); |
314 | 441 | } |
315 | 1.31k | switch (g->yychar) { |
316 | 529 | case 'd': |
317 | 529 | addranges_d(g); |
318 | 529 | break; |
319 | 217 | case 's': |
320 | 217 | addranges_s(g); |
321 | 217 | break; |
322 | 190 | case 'w': |
323 | 190 | addranges_w(g); |
324 | 190 | break; |
325 | 278 | case 'D': |
326 | 278 | addranges_D(g); |
327 | 278 | break; |
328 | 4 | case 'S': |
329 | 4 | addranges_S(g); |
330 | 4 | break; |
331 | 90 | case 'W': |
332 | 90 | addranges_W(g); |
333 | 90 | break; |
334 | 1.31k | } |
335 | 1.27k | havesave = havedash = 0; |
336 | 3.01k | } else { |
337 | 3.01k | if (quoted) { |
338 | 694 | if (g->yychar == 'b') |
339 | 194 | g->yychar = '\b'; |
340 | 500 | else if (g->yychar == '0') |
341 | 217 | g->yychar = 0; |
342 | | /* else identity escape */ |
343 | 694 | } |
344 | 3.01k | if (havesave) { |
345 | 1.68k | if (havedash) { |
346 | 404 | addrange(g, save, g->yychar); |
347 | 404 | havesave = havedash = 0; |
348 | 1.28k | } else { |
349 | 1.28k | addrange(g, save, save); |
350 | 1.28k | save = g->yychar; |
351 | 1.28k | } |
352 | 1.68k | } else { |
353 | 1.32k | save = g->yychar; |
354 | 1.32k | havesave = 1; |
355 | 1.32k | } |
356 | 3.01k | } |
357 | | |
358 | 6.34k | quoted = nextrune(g); |
359 | 6.34k | } |
360 | | |
361 | 726 | if (havesave) { |
362 | 174 | addrange(g, save, save); |
363 | 174 | if (havedash) |
364 | 37 | addrange(g, '-', '-'); |
365 | 174 | } |
366 | | |
367 | 726 | return type; |
368 | 763 | } |
369 | | |
370 | 238k | static int lex(Restate *g) { |
371 | 238k | int quoted = nextrune(g); |
372 | 238k | if (quoted) { |
373 | 3.74k | switch (g->yychar) { |
374 | 201 | case 'b': |
375 | 201 | return L_WORD; |
376 | 396 | case 'B': |
377 | 396 | return L_NWORD; |
378 | 92 | case 'd': |
379 | 92 | newcclass(g); |
380 | 92 | addranges_d(g); |
381 | 92 | return L_CCLASS; |
382 | 123 | case 's': |
383 | 123 | newcclass(g); |
384 | 123 | addranges_s(g); |
385 | 123 | return L_CCLASS; |
386 | 105 | case 'w': |
387 | 105 | newcclass(g); |
388 | 105 | addranges_w(g); |
389 | 105 | return L_CCLASS; |
390 | 89 | case 'D': |
391 | 89 | newcclass(g); |
392 | 89 | addranges_d(g); |
393 | 89 | return L_NCCLASS; |
394 | 130 | case 'S': |
395 | 130 | newcclass(g); |
396 | 130 | addranges_s(g); |
397 | 130 | return L_NCCLASS; |
398 | 69 | case 'W': |
399 | 69 | newcclass(g); |
400 | 69 | addranges_w(g); |
401 | 69 | return L_NCCLASS; |
402 | 1.25k | case '0': |
403 | 1.25k | g->yychar = 0; |
404 | 1.25k | return L_CHAR; |
405 | 3.74k | } |
406 | 1.28k | if (g->yychar >= '0' && g->yychar <= '9') { |
407 | 865 | g->yychar -= '0'; |
408 | 865 | if (*g->source >= '0' && *g->source <= '9') |
409 | 200 | g->yychar = g->yychar * 10 + *g->source++ - '0'; |
410 | 865 | return L_REF; |
411 | 865 | } |
412 | 418 | return L_CHAR; |
413 | 1.28k | } |
414 | | |
415 | 234k | switch (g->yychar) { |
416 | 1.16k | case 0: |
417 | 71.2k | case '$': |
418 | 73.3k | case ')': |
419 | 75.9k | case '*': |
420 | 76.9k | case '+': |
421 | 77.9k | case '.': |
422 | 80.8k | case '?': |
423 | 105k | case '^': |
424 | 110k | case '|': |
425 | 110k | return g->yychar; |
426 | 234k | } |
427 | | |
428 | 124k | if (g->yychar == '{') |
429 | 2.03k | return lexcount(g); |
430 | 122k | if (g->yychar == '[') |
431 | 763 | return lexclass(g); |
432 | 121k | if (g->yychar == '(') { |
433 | 3.53k | if (g->source[0] == '?') { |
434 | 2.34k | if (g->source[1] == ':') { |
435 | 668 | g->source += 2; |
436 | 668 | return L_NC; |
437 | 668 | } |
438 | 1.67k | if (g->source[1] == '=') { |
439 | 671 | g->source += 2; |
440 | 671 | return L_PLA; |
441 | 671 | } |
442 | 1.00k | if (g->source[1] == '!') { |
443 | 984 | g->source += 2; |
444 | 984 | return L_NLA; |
445 | 984 | } |
446 | 1.00k | } |
447 | 1.21k | return '('; |
448 | 3.53k | } |
449 | | |
450 | 118k | return L_CHAR; |
451 | 121k | } |
452 | | |
453 | | /* Parse */ |
454 | | |
455 | | enum { |
456 | | P_CAT, |
457 | | P_ALT, |
458 | | P_REP, |
459 | | P_BOL, |
460 | | P_EOL, |
461 | | P_WORD, |
462 | | P_NWORD, |
463 | | P_PAR, |
464 | | P_PLA, |
465 | | P_NLA, |
466 | | P_ANY, |
467 | | P_CHAR, |
468 | | P_CCLASS, |
469 | | P_NCCLASS, |
470 | | P_REF |
471 | | }; |
472 | | |
473 | | struct Renode { |
474 | | unsigned char type; |
475 | | unsigned char ng, m, n; |
476 | | Rune c; |
477 | | Reclass *cc; |
478 | | Renode *x; |
479 | | Renode *y; |
480 | | }; |
481 | | |
482 | 449k | static Renode *newnode(Restate *g, int type) { |
483 | 449k | Renode *node = g->pend++; |
484 | 449k | node->type = type; |
485 | 449k | node->cc = NULL; |
486 | 449k | node->c = 0; |
487 | 449k | node->ng = 0; |
488 | 449k | node->m = 0; |
489 | 449k | node->n = 0; |
490 | 449k | node->x = node->y = NULL; |
491 | 449k | return node; |
492 | 449k | } |
493 | | |
494 | 44.5k | static int empty(Renode *node) { |
495 | 44.5k | if (!node) |
496 | 1.63k | return 1; |
497 | 42.9k | switch (node->type) { |
498 | 216 | default: |
499 | 216 | return 1; |
500 | 30.1k | case P_CAT: |
501 | 30.1k | return empty(node->x) && empty(node->y); |
502 | 911 | case P_ALT: |
503 | 911 | return empty(node->x) || empty(node->y); |
504 | 1.43k | case P_REP: |
505 | 1.43k | return empty(node->x) || node->m == 0; |
506 | 3.27k | case P_PAR: |
507 | 3.27k | return empty(node->x); |
508 | 1.75k | case P_REF: |
509 | 1.75k | return empty(node->x); |
510 | 684 | case P_ANY: |
511 | 4.77k | case P_CHAR: |
512 | 5.04k | case P_CCLASS: |
513 | 5.24k | case P_NCCLASS: |
514 | 5.24k | return 0; |
515 | 42.9k | } |
516 | 42.9k | } |
517 | | |
518 | 6.20k | static Renode *newrep(Restate *g, Renode *atom, int ng, int min, int max) { |
519 | 6.20k | Renode *rep = newnode(g, P_REP); |
520 | 6.20k | if (max == REPINF && empty(atom)) |
521 | 37 | die(g, "infinite loop matching the empty string"); |
522 | 6.20k | rep->ng = ng; |
523 | 6.20k | rep->m = min; |
524 | 6.20k | rep->n = max; |
525 | 6.20k | rep->x = atom; |
526 | 6.20k | return rep; |
527 | 6.20k | } |
528 | | |
529 | 238k | static void next(Restate *g) { |
530 | 238k | g->lookahead = lex(g); |
531 | 238k | } |
532 | | |
533 | 1.06M | static int re_accept(Restate *g, int t) { |
534 | 1.06M | if (g->lookahead == t) { |
535 | 112k | next(g); |
536 | 112k | return 1; |
537 | 112k | } |
538 | 952k | return 0; |
539 | 1.06M | } |
540 | | |
541 | | static Renode *parsealt(Restate *g); |
542 | | |
543 | 126k | static Renode *parseatom(Restate *g) { |
544 | 126k | Renode *atom; |
545 | 126k | if (g->lookahead == L_CHAR) { |
546 | 119k | atom = newnode(g, P_CHAR); |
547 | 119k | atom->c = g->yychar; |
548 | 119k | next(g); |
549 | 119k | return atom; |
550 | 119k | } |
551 | 6.58k | if (g->lookahead == L_CCLASS) { |
552 | 722 | atom = newnode(g, P_CCLASS); |
553 | 722 | atom->cc = g->yycc; |
554 | 722 | next(g); |
555 | 722 | return atom; |
556 | 722 | } |
557 | 5.86k | if (g->lookahead == L_NCCLASS) { |
558 | 315 | atom = newnode(g, P_NCCLASS); |
559 | 315 | atom->cc = g->yycc; |
560 | 315 | next(g); |
561 | 315 | return atom; |
562 | 315 | } |
563 | 5.54k | if (g->lookahead == L_REF) { |
564 | 865 | atom = newnode(g, P_REF); |
565 | 865 | if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar]) |
566 | 29 | die(g, "invalid back-reference"); |
567 | 865 | atom->n = g->yychar; |
568 | 865 | atom->x = g->sub[g->yychar]; |
569 | 865 | next(g); |
570 | 865 | return atom; |
571 | 865 | } |
572 | 4.68k | if (re_accept(g, '.')) |
573 | 1.01k | return newnode(g, P_ANY); |
574 | 3.67k | if (re_accept(g, '(')) { |
575 | 1.20k | atom = newnode(g, P_PAR); |
576 | 1.20k | if (g->nsub == MAXSUB) |
577 | 4 | die(g, "too many captures"); |
578 | 1.20k | atom->n = g->nsub++; |
579 | 1.20k | atom->x = parsealt(g); |
580 | 1.20k | g->sub[atom->n] = atom; |
581 | 1.20k | if (!re_accept(g, ')')) |
582 | 62 | die(g, "unmatched '('"); |
583 | 1.20k | return atom; |
584 | 1.20k | } |
585 | 2.46k | if (re_accept(g, L_NC)) { |
586 | 668 | atom = parsealt(g); |
587 | 668 | if (!re_accept(g, ')')) |
588 | 12 | die(g, "unmatched '('"); |
589 | 668 | return atom; |
590 | 668 | } |
591 | 1.79k | if (re_accept(g, L_PLA)) { |
592 | 671 | atom = newnode(g, P_PLA); |
593 | 671 | atom->x = parsealt(g); |
594 | 671 | if (!re_accept(g, ')')) |
595 | 9 | die(g, "unmatched '('"); |
596 | 671 | return atom; |
597 | 671 | } |
598 | 1.12k | if (re_accept(g, L_NLA)) { |
599 | 984 | atom = newnode(g, P_NLA); |
600 | 984 | atom->x = parsealt(g); |
601 | 984 | if (!re_accept(g, ')')) |
602 | 9 | die(g, "unmatched '('"); |
603 | 984 | return atom; |
604 | 984 | } |
605 | 138 | die(g, "syntax error"); |
606 | 138 | return NULL; |
607 | 1.12k | } |
608 | | |
609 | 221k | static Renode *parserep(Restate *g) { |
610 | 221k | Renode *atom; |
611 | | |
612 | 221k | if (re_accept(g, '^')) |
613 | 24.7k | return newnode(g, P_BOL); |
614 | 197k | if (re_accept(g, '$')) |
615 | 70.1k | return newnode(g, P_EOL); |
616 | 127k | if (re_accept(g, L_WORD)) |
617 | 201 | return newnode(g, P_WORD); |
618 | 126k | if (re_accept(g, L_NWORD)) |
619 | 396 | return newnode(g, P_NWORD); |
620 | | |
621 | 126k | atom = parseatom(g); |
622 | 126k | if (g->lookahead == L_COUNT) { |
623 | 1.79k | int min = g->yymin, max = g->yymax; |
624 | 1.79k | next(g); |
625 | 1.79k | if (max < min) |
626 | 5 | die(g, "invalid quantifier"); |
627 | 1.79k | return newrep(g, atom, re_accept(g, '?'), min, max); |
628 | 1.79k | } |
629 | 124k | if (re_accept(g, '*')) |
630 | 2.59k | return newrep(g, atom, re_accept(g, '?'), 0, REPINF); |
631 | 122k | if (re_accept(g, '+')) |
632 | 1.01k | return newrep(g, atom, re_accept(g, '?'), 1, REPINF); |
633 | 121k | if (re_accept(g, '?')) |
634 | 800 | return newrep(g, atom, re_accept(g, '?'), 0, 1); |
635 | 120k | return atom; |
636 | 121k | } |
637 | | |
638 | 9.46k | static Renode *parsecat(Restate *g) { |
639 | 9.46k | Renode *cat, *x; |
640 | 9.46k | if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') { |
641 | 4.58k | cat = parserep(g); |
642 | 221k | while (g->lookahead && g->lookahead != '|' && |
643 | 218k | g->lookahead != ')') { |
644 | 217k | x = cat; |
645 | 217k | cat = newnode(g, P_CAT); |
646 | 217k | cat->x = x; |
647 | 217k | cat->y = parserep(g); |
648 | 217k | } |
649 | 4.58k | return cat; |
650 | 4.58k | } |
651 | 4.88k | return NULL; |
652 | 9.46k | } |
653 | | |
654 | 5.04k | static Renode *parsealt(Restate *g) { |
655 | 5.04k | Renode *alt, *x; |
656 | 5.04k | alt = parsecat(g); |
657 | 9.46k | while (re_accept(g, '|')) { |
658 | 4.41k | x = alt; |
659 | 4.41k | alt = newnode(g, P_ALT); |
660 | 4.41k | alt->x = x; |
661 | 4.41k | alt->y = parsecat(g); |
662 | 4.41k | } |
663 | 5.04k | return alt; |
664 | 5.04k | } |
665 | | |
666 | | /* Compile */ |
667 | | |
668 | | enum { |
669 | | I_END, |
670 | | I_JUMP, |
671 | | I_SPLIT, |
672 | | I_PLA, |
673 | | I_NLA, |
674 | | I_ANYNL, |
675 | | I_ANY, |
676 | | I_CHAR, |
677 | | I_CCLASS, |
678 | | I_NCCLASS, |
679 | | I_REF, |
680 | | I_BOL, |
681 | | I_EOL, |
682 | | I_WORD, |
683 | | I_NWORD, |
684 | | I_LPAR, |
685 | | I_RPAR |
686 | | }; |
687 | | |
688 | | struct Reinst { |
689 | | unsigned char opcode; |
690 | | unsigned char n; |
691 | | Rune c; |
692 | | Reclass *cc; |
693 | | Reinst *x; |
694 | | Reinst *y; |
695 | | }; |
696 | | |
697 | 425k | static unsigned int count(Renode *node) { |
698 | 425k | unsigned int min, max; |
699 | 425k | if (!node) |
700 | 4.26k | return 0; |
701 | 421k | switch (node->type) { |
702 | 206k | default: |
703 | 206k | return 1; |
704 | 205k | case P_CAT: |
705 | 205k | return count(node->x) + count(node->y); |
706 | 3.92k | case P_ALT: |
707 | 3.92k | return count(node->x) + count(node->y) + 2; |
708 | 4.41k | case P_REP: |
709 | 4.41k | min = node->m; |
710 | 4.41k | max = node->n; |
711 | 4.41k | if (min == max) |
712 | 824 | return count(node->x) * min; |
713 | 3.59k | if (max < REPINF) |
714 | 926 | return count(node->x) * max + (max - min); |
715 | 2.66k | return count(node->x) * (min + 1) + 2; |
716 | 578 | case P_PAR: |
717 | 578 | return count(node->x) + 2; |
718 | 454 | case P_PLA: |
719 | 454 | return count(node->x) + 2; |
720 | 442 | case P_NLA: |
721 | 442 | return count(node->x) + 2; |
722 | 421k | } |
723 | 421k | } |
724 | | |
725 | 403k | static Reinst *emit(Reprog *prog, int opcode) { |
726 | 403k | Reinst *inst = prog->end++; |
727 | 403k | inst->opcode = opcode; |
728 | 403k | inst->n = 0; |
729 | 403k | inst->c = 0; |
730 | 403k | inst->cc = NULL; |
731 | 403k | inst->x = inst->y = NULL; |
732 | 403k | return inst; |
733 | 403k | } |
734 | | |
735 | 596k | static void compile(Reprog *prog, Renode *node) { |
736 | 596k | Reinst *inst, *split, *jump; |
737 | 596k | unsigned int i; |
738 | | |
739 | 596k | if (!node) |
740 | 10.5k | return; |
741 | | |
742 | 585k | switch (node->type) { |
743 | 211k | case P_CAT: |
744 | 211k | compile(prog, node->x); |
745 | 211k | compile(prog, node->y); |
746 | 211k | break; |
747 | | |
748 | 6.90k | case P_ALT: |
749 | 6.90k | split = emit(prog, I_SPLIT); |
750 | 6.90k | compile(prog, node->x); |
751 | 6.90k | jump = emit(prog, I_JUMP); |
752 | 6.90k | compile(prog, node->y); |
753 | 6.90k | split->x = split + 1; |
754 | 6.90k | split->y = jump + 1; |
755 | 6.90k | jump->x = prog->end; |
756 | 6.90k | break; |
757 | | |
758 | 8.18k | case P_REP: |
759 | 145k | for (i = 0; i < node->m; ++i) { |
760 | 137k | inst = prog->end; |
761 | 137k | compile(prog, node->x); |
762 | 137k | } |
763 | 8.18k | if (node->m == node->n) |
764 | 2.27k | break; |
765 | 5.91k | if (node->n < REPINF) { |
766 | 9.91k | for (i = node->m; i < node->n; ++i) { |
767 | 8.46k | split = emit(prog, I_SPLIT); |
768 | 8.46k | compile(prog, node->x); |
769 | 8.46k | if (node->ng) { |
770 | 433 | split->y = split + 1; |
771 | 433 | split->x = prog->end; |
772 | 8.02k | } else { |
773 | 8.02k | split->x = split + 1; |
774 | 8.02k | split->y = prog->end; |
775 | 8.02k | } |
776 | 8.46k | } |
777 | 4.45k | } else if (node->m == 0) { |
778 | 2.26k | split = emit(prog, I_SPLIT); |
779 | 2.26k | compile(prog, node->x); |
780 | 2.26k | jump = emit(prog, I_JUMP); |
781 | 2.26k | if (node->ng) { |
782 | 943 | split->y = split + 1; |
783 | 943 | split->x = prog->end; |
784 | 1.31k | } else { |
785 | 1.31k | split->x = split + 1; |
786 | 1.31k | split->y = prog->end; |
787 | 1.31k | } |
788 | 2.26k | jump->x = split; |
789 | 2.26k | } else { |
790 | 2.19k | split = emit(prog, I_SPLIT); |
791 | 2.19k | if (node->ng) { |
792 | 552 | split->y = inst; |
793 | 552 | split->x = prog->end; |
794 | 1.64k | } else { |
795 | 1.64k | split->x = inst; |
796 | 1.64k | split->y = prog->end; |
797 | 1.64k | } |
798 | 2.19k | } |
799 | 5.91k | break; |
800 | | |
801 | 25.5k | case P_BOL: |
802 | 25.5k | emit(prog, I_BOL); |
803 | 25.5k | break; |
804 | 70.3k | case P_EOL: |
805 | 70.3k | emit(prog, I_EOL); |
806 | 70.3k | break; |
807 | 564 | case P_WORD: |
808 | 564 | emit(prog, I_WORD); |
809 | 564 | break; |
810 | 727 | case P_NWORD: |
811 | 727 | emit(prog, I_NWORD); |
812 | 727 | break; |
813 | | |
814 | 8.34k | case P_PAR: |
815 | 8.34k | inst = emit(prog, I_LPAR); |
816 | 8.34k | inst->n = node->n; |
817 | 8.34k | compile(prog, node->x); |
818 | 8.34k | inst = emit(prog, I_RPAR); |
819 | 8.34k | inst->n = node->n; |
820 | 8.34k | break; |
821 | 760 | case P_PLA: |
822 | 760 | split = emit(prog, I_PLA); |
823 | 760 | compile(prog, node->x); |
824 | 760 | emit(prog, I_END); |
825 | 760 | split->x = split + 1; |
826 | 760 | split->y = prog->end; |
827 | 760 | break; |
828 | 663 | case P_NLA: |
829 | 663 | split = emit(prog, I_NLA); |
830 | 663 | compile(prog, node->x); |
831 | 663 | emit(prog, I_END); |
832 | 663 | split->x = split + 1; |
833 | 663 | split->y = prog->end; |
834 | 663 | break; |
835 | | |
836 | 4.45k | case P_ANY: |
837 | 4.45k | emit(prog, I_ANY); |
838 | 4.45k | break; |
839 | 245k | case P_CHAR: |
840 | 245k | inst = emit(prog, I_CHAR); |
841 | 245k | inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c; |
842 | 245k | break; |
843 | 716 | case P_CCLASS: |
844 | 716 | inst = emit(prog, I_CCLASS); |
845 | 716 | inst->cc = node->cc; |
846 | 716 | break; |
847 | 567 | case P_NCCLASS: |
848 | 567 | inst = emit(prog, I_NCCLASS); |
849 | 567 | inst->cc = node->cc; |
850 | 567 | break; |
851 | 675 | case P_REF: |
852 | 675 | inst = emit(prog, I_REF); |
853 | 675 | inst->n = node->n; |
854 | 675 | break; |
855 | 585k | } |
856 | 585k | } |
857 | | |
858 | | #ifdef TEST |
859 | | static void dumpnode(Renode *node) { |
860 | | Rune *p; |
861 | | if (!node) { |
862 | | printf("Empty"); |
863 | | return; |
864 | | } |
865 | | switch (node->type) { |
866 | | case P_CAT: |
867 | | printf("Cat("); |
868 | | dumpnode(node->x); |
869 | | printf(", "); |
870 | | dumpnode(node->y); |
871 | | printf(")"); |
872 | | break; |
873 | | case P_ALT: |
874 | | printf("Alt("); |
875 | | dumpnode(node->x); |
876 | | printf(", "); |
877 | | dumpnode(node->y); |
878 | | printf(")"); |
879 | | break; |
880 | | case P_REP: |
881 | | printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, |
882 | | node->n); |
883 | | dumpnode(node->x); |
884 | | printf(")"); |
885 | | break; |
886 | | case P_BOL: |
887 | | printf("Bol"); |
888 | | break; |
889 | | case P_EOL: |
890 | | printf("Eol"); |
891 | | break; |
892 | | case P_WORD: |
893 | | printf("Word"); |
894 | | break; |
895 | | case P_NWORD: |
896 | | printf("NotWord"); |
897 | | break; |
898 | | case P_PAR: |
899 | | printf("Par(%d,", node->n); |
900 | | dumpnode(node->x); |
901 | | printf(")"); |
902 | | break; |
903 | | case P_PLA: |
904 | | printf("PLA("); |
905 | | dumpnode(node->x); |
906 | | printf(")"); |
907 | | break; |
908 | | case P_NLA: |
909 | | printf("NLA("); |
910 | | dumpnode(node->x); |
911 | | printf(")"); |
912 | | break; |
913 | | case P_ANY: |
914 | | printf("Any"); |
915 | | break; |
916 | | case P_CHAR: |
917 | | printf("Char(%c)", node->c); |
918 | | break; |
919 | | case P_CCLASS: |
920 | | printf("Class("); |
921 | | for (p = node->cc->spans; p < node->cc->end; p += 2) |
922 | | printf("%02X-%02X,", p[0], p[1]); |
923 | | printf(")"); |
924 | | break; |
925 | | case P_NCCLASS: |
926 | | printf("NotClass("); |
927 | | for (p = node->cc->spans; p < node->cc->end; p += 2) |
928 | | printf("%02X-%02X,", p[0], p[1]); |
929 | | printf(")"); |
930 | | break; |
931 | | case P_REF: |
932 | | printf("Ref(%d)", node->n); |
933 | | break; |
934 | | } |
935 | | } |
936 | | |
937 | | static void dumpprog(Reprog *prog) { |
938 | | Reinst *inst; |
939 | | int i; |
940 | | for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) { |
941 | | printf("% 5d: ", i); |
942 | | switch (inst->opcode) { |
943 | | case I_END: |
944 | | puts("end"); |
945 | | break; |
946 | | case I_JUMP: |
947 | | printf("jump %d\n", (int)(inst->x - prog->start)); |
948 | | break; |
949 | | case I_SPLIT: |
950 | | printf("split %d %d\n", (int)(inst->x - prog->start), |
951 | | (int)(inst->y - prog->start)); |
952 | | break; |
953 | | case I_PLA: |
954 | | printf("pla %d %d\n", (int)(inst->x - prog->start), |
955 | | (int)(inst->y - prog->start)); |
956 | | break; |
957 | | case I_NLA: |
958 | | printf("nla %d %d\n", (int)(inst->x - prog->start), |
959 | | (int)(inst->y - prog->start)); |
960 | | break; |
961 | | case I_ANY: |
962 | | puts("any"); |
963 | | break; |
964 | | case I_ANYNL: |
965 | | puts("anynl"); |
966 | | break; |
967 | | case I_CHAR: |
968 | | printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" |
969 | | : "char U+%04X\n", |
970 | | inst->c); |
971 | | break; |
972 | | case I_CCLASS: |
973 | | puts("cclass"); |
974 | | break; |
975 | | case I_NCCLASS: |
976 | | puts("ncclass"); |
977 | | break; |
978 | | case I_REF: |
979 | | printf("ref %d\n", inst->n); |
980 | | break; |
981 | | case I_BOL: |
982 | | puts("bol"); |
983 | | break; |
984 | | case I_EOL: |
985 | | puts("eol"); |
986 | | break; |
987 | | case I_WORD: |
988 | | puts("word"); |
989 | | break; |
990 | | case I_NWORD: |
991 | | puts("nword"); |
992 | | break; |
993 | | case I_LPAR: |
994 | | printf("lpar %d\n", inst->n); |
995 | | break; |
996 | | case I_RPAR: |
997 | | printf("rpar %d\n", inst->n); |
998 | | break; |
999 | | } |
1000 | | } |
1001 | | } |
1002 | | #endif |
1003 | | |
1004 | 2.14k | Reprog *re_regcomp(const char *pattern, int cflags, const char **errorp) { |
1005 | 2.14k | Reprog *prog; |
1006 | 2.14k | Restate *g; |
1007 | 2.14k | Renode *node; |
1008 | 2.14k | Reinst *split, *jump; |
1009 | 2.14k | int i; |
1010 | 2.14k | unsigned int ncount; |
1011 | 2.14k | size_t pattern_len = strlen(pattern); |
1012 | | |
1013 | 2.14k | if (pattern_len > 10000) { |
1014 | | /* Avoid stack exhaustion in recursive parseatom() et.al. */ |
1015 | 11 | if (errorp) |
1016 | 11 | *errorp = "regexp pattern too long (max 10000)"; |
1017 | 11 | return NULL; |
1018 | 11 | } |
1019 | | |
1020 | 2.13k | prog = rd_calloc(1, sizeof(Reprog)); |
1021 | 2.13k | g = &prog->g; |
1022 | 2.13k | g->prog = prog; |
1023 | 2.13k | g->pstart = g->pend = rd_malloc(sizeof(Renode) * pattern_len * 2); |
1024 | | |
1025 | 2.13k | if (setjmp(g->kaboom)) { |
1026 | 1.14k | if (errorp) |
1027 | 1.14k | *errorp = g->error; |
1028 | 1.14k | rd_free(g->pstart); |
1029 | 1.14k | rd_free(prog); |
1030 | 1.14k | return NULL; |
1031 | 1.14k | } |
1032 | | |
1033 | 988 | g->source = pattern; |
1034 | 988 | g->ncclass = 0; |
1035 | 988 | g->nsub = 1; |
1036 | 35.1k | for (i = 0; i < MAXSUB; ++i) |
1037 | 34.1k | g->sub[i] = 0; |
1038 | | |
1039 | 988 | g->prog->flags = cflags; |
1040 | | |
1041 | 988 | next(g); |
1042 | 988 | node = parsealt(g); |
1043 | 988 | if (g->lookahead == ')') |
1044 | 19 | die(g, "unmatched ')'"); |
1045 | 988 | if (g->lookahead != 0) |
1046 | 0 | die(g, "syntax error"); |
1047 | | |
1048 | 988 | g->prog->nsub = g->nsub; |
1049 | 988 | ncount = count(node); |
1050 | 988 | if (ncount > 10000) |
1051 | 44 | die(g, "regexp graph too large"); |
1052 | 988 | g->prog->start = g->prog->end = |
1053 | 988 | rd_malloc((ncount + 6) * sizeof(Reinst)); |
1054 | | |
1055 | 988 | split = emit(g->prog, I_SPLIT); |
1056 | 988 | split->x = split + 3; |
1057 | 988 | split->y = split + 1; |
1058 | 988 | emit(g->prog, I_ANYNL); |
1059 | 988 | jump = emit(g->prog, I_JUMP); |
1060 | 988 | jump->x = split; |
1061 | 988 | emit(g->prog, I_LPAR); |
1062 | 988 | compile(g->prog, node); |
1063 | 988 | emit(g->prog, I_RPAR); |
1064 | 988 | emit(g->prog, I_END); |
1065 | | |
1066 | | #ifdef TEST |
1067 | | dumpnode(node); |
1068 | | putchar('\n'); |
1069 | | dumpprog(g->prog); |
1070 | | #endif |
1071 | | |
1072 | 988 | rd_free(g->pstart); |
1073 | | |
1074 | 988 | if (errorp) |
1075 | 988 | *errorp = NULL; |
1076 | 988 | return g->prog; |
1077 | 2.13k | } |
1078 | | |
1079 | 988 | void re_regfree(Reprog *prog) { |
1080 | 988 | if (prog) { |
1081 | 988 | rd_free(prog->start); |
1082 | 988 | rd_free(prog); |
1083 | 988 | } |
1084 | 988 | } |
1085 | | |
1086 | | /* Match */ |
1087 | | |
1088 | 0 | static int isnewline(int c) { |
1089 | 0 | return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029; |
1090 | 0 | } |
1091 | | |
1092 | 0 | static int iswordchar(int c) { |
1093 | 0 | return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || |
1094 | 0 | (c >= '0' && c <= '9'); |
1095 | 0 | } |
1096 | | |
1097 | 0 | static int incclass(Reclass *cc, Rune c) { |
1098 | 0 | Rune *p; |
1099 | 0 | for (p = cc->spans; p < cc->end; p += 2) |
1100 | 0 | if (p[0] <= c && c <= p[1]) |
1101 | 0 | return 1; |
1102 | 0 | return 0; |
1103 | 0 | } |
1104 | | |
1105 | 0 | static int incclasscanon(Reclass *cc, Rune c) { |
1106 | 0 | Rune *p, r; |
1107 | 0 | for (p = cc->spans; p < cc->end; p += 2) |
1108 | 0 | for (r = p[0]; r <= p[1]; ++r) |
1109 | 0 | if (c == canon(r)) |
1110 | 0 | return 1; |
1111 | 0 | return 0; |
1112 | 0 | } |
1113 | | |
1114 | 0 | static int strncmpcanon(const char *a, const char *b, unsigned int n) { |
1115 | 0 | Rune ra, rb; |
1116 | 0 | int c; |
1117 | 0 | while (n--) { |
1118 | 0 | if (!*a) |
1119 | 0 | return -1; |
1120 | 0 | if (!*b) |
1121 | 0 | return 1; |
1122 | 0 | a += chartorune(&ra, a); |
1123 | 0 | b += chartorune(&rb, b); |
1124 | 0 | c = canon(ra) - canon(rb); |
1125 | 0 | if (c) |
1126 | 0 | return c; |
1127 | 0 | } |
1128 | 0 | return 0; |
1129 | 0 | } |
1130 | | |
1131 | | struct Rethread { |
1132 | | Reinst *pc; |
1133 | | const char *sp; |
1134 | | Resub sub; |
1135 | | }; |
1136 | | |
1137 | 0 | static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub) { |
1138 | 0 | t->pc = pc; |
1139 | 0 | t->sp = sp; |
1140 | 0 | memcpy(&t->sub, sub, sizeof t->sub); |
1141 | 0 | } |
1142 | | |
1143 | | static int |
1144 | 0 | match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out) { |
1145 | 0 | Rethread ready[MAXTHREAD]; |
1146 | 0 | Resub scratch; |
1147 | 0 | Resub sub; |
1148 | 0 | Rune c; |
1149 | 0 | unsigned int nready; |
1150 | 0 | int i; |
1151 | | |
1152 | | /* queue initial thread */ |
1153 | 0 | spawn(ready + 0, pc, sp, out); |
1154 | 0 | nready = 1; |
1155 | | |
1156 | | /* run threads in stack order */ |
1157 | 0 | while (nready > 0) { |
1158 | 0 | --nready; |
1159 | 0 | pc = ready[nready].pc; |
1160 | 0 | sp = ready[nready].sp; |
1161 | 0 | memcpy(&sub, &ready[nready].sub, sizeof sub); |
1162 | 0 | for (;;) { |
1163 | 0 | switch (pc->opcode) { |
1164 | 0 | case I_END: |
1165 | 0 | for (i = 0; i < MAXSUB; ++i) { |
1166 | 0 | out->sub[i].sp = sub.sub[i].sp; |
1167 | 0 | out->sub[i].ep = sub.sub[i].ep; |
1168 | 0 | } |
1169 | 0 | return 1; |
1170 | 0 | case I_JUMP: |
1171 | 0 | pc = pc->x; |
1172 | 0 | continue; |
1173 | 0 | case I_SPLIT: |
1174 | 0 | if (nready >= MAXTHREAD) { |
1175 | 0 | fprintf( |
1176 | 0 | stderr, |
1177 | 0 | "regexec: backtrack overflow!\n"); |
1178 | 0 | return 0; |
1179 | 0 | } |
1180 | 0 | spawn(&ready[nready++], pc->y, sp, &sub); |
1181 | 0 | pc = pc->x; |
1182 | 0 | continue; |
1183 | | |
1184 | 0 | case I_PLA: |
1185 | 0 | if (!match(pc->x, sp, bol, flags, &sub)) |
1186 | 0 | goto dead; |
1187 | 0 | pc = pc->y; |
1188 | 0 | continue; |
1189 | 0 | case I_NLA: |
1190 | 0 | memcpy(&scratch, &sub, sizeof scratch); |
1191 | 0 | if (match(pc->x, sp, bol, flags, &scratch)) |
1192 | 0 | goto dead; |
1193 | 0 | pc = pc->y; |
1194 | 0 | continue; |
1195 | | |
1196 | 0 | case I_ANYNL: |
1197 | 0 | sp += chartorune(&c, sp); |
1198 | 0 | if (c == 0) |
1199 | 0 | goto dead; |
1200 | 0 | break; |
1201 | 0 | case I_ANY: |
1202 | 0 | sp += chartorune(&c, sp); |
1203 | 0 | if (c == 0) |
1204 | 0 | goto dead; |
1205 | 0 | if (isnewline(c)) |
1206 | 0 | goto dead; |
1207 | 0 | break; |
1208 | 0 | case I_CHAR: |
1209 | 0 | sp += chartorune(&c, sp); |
1210 | 0 | if (c == 0) |
1211 | 0 | goto dead; |
1212 | 0 | if (flags & REG_ICASE) |
1213 | 0 | c = canon(c); |
1214 | 0 | if (c != pc->c) |
1215 | 0 | goto dead; |
1216 | 0 | break; |
1217 | 0 | case I_CCLASS: |
1218 | 0 | sp += chartorune(&c, sp); |
1219 | 0 | if (c == 0) |
1220 | 0 | goto dead; |
1221 | 0 | if (flags & REG_ICASE) { |
1222 | 0 | if (!incclasscanon(pc->cc, canon(c))) |
1223 | 0 | goto dead; |
1224 | 0 | } else { |
1225 | 0 | if (!incclass(pc->cc, c)) |
1226 | 0 | goto dead; |
1227 | 0 | } |
1228 | 0 | break; |
1229 | 0 | case I_NCCLASS: |
1230 | 0 | sp += chartorune(&c, sp); |
1231 | 0 | if (c == 0) |
1232 | 0 | goto dead; |
1233 | 0 | if (flags & REG_ICASE) { |
1234 | 0 | if (incclasscanon(pc->cc, canon(c))) |
1235 | 0 | goto dead; |
1236 | 0 | } else { |
1237 | 0 | if (incclass(pc->cc, c)) |
1238 | 0 | goto dead; |
1239 | 0 | } |
1240 | 0 | break; |
1241 | 0 | case I_REF: |
1242 | 0 | i = (int)(sub.sub[pc->n].ep - |
1243 | 0 | sub.sub[pc->n].sp); |
1244 | 0 | if (flags & REG_ICASE) { |
1245 | 0 | if (strncmpcanon(sp, sub.sub[pc->n].sp, |
1246 | 0 | i)) |
1247 | 0 | goto dead; |
1248 | 0 | } else { |
1249 | 0 | if (strncmp(sp, sub.sub[pc->n].sp, i)) |
1250 | 0 | goto dead; |
1251 | 0 | } |
1252 | 0 | if (i > 0) |
1253 | 0 | sp += i; |
1254 | 0 | break; |
1255 | | |
1256 | 0 | case I_BOL: |
1257 | 0 | if (sp == bol && !(flags & REG_NOTBOL)) |
1258 | 0 | break; |
1259 | 0 | if (flags & REG_NEWLINE) |
1260 | 0 | if (sp > bol && isnewline(sp[-1])) |
1261 | 0 | break; |
1262 | 0 | goto dead; |
1263 | 0 | case I_EOL: |
1264 | 0 | if (*sp == 0) |
1265 | 0 | break; |
1266 | 0 | if (flags & REG_NEWLINE) |
1267 | 0 | if (isnewline(*sp)) |
1268 | 0 | break; |
1269 | 0 | goto dead; |
1270 | 0 | case I_WORD: |
1271 | 0 | i = sp > bol && iswordchar(sp[-1]); |
1272 | 0 | i ^= iswordchar(sp[0]); |
1273 | 0 | if (i) |
1274 | 0 | break; |
1275 | 0 | goto dead; |
1276 | 0 | case I_NWORD: |
1277 | 0 | i = sp > bol && iswordchar(sp[-1]); |
1278 | 0 | i ^= iswordchar(sp[0]); |
1279 | 0 | if (!i) |
1280 | 0 | break; |
1281 | 0 | goto dead; |
1282 | | |
1283 | 0 | case I_LPAR: |
1284 | 0 | sub.sub[pc->n].sp = sp; |
1285 | 0 | break; |
1286 | 0 | case I_RPAR: |
1287 | 0 | sub.sub[pc->n].ep = sp; |
1288 | 0 | break; |
1289 | 0 | default: |
1290 | 0 | goto dead; |
1291 | 0 | } |
1292 | 0 | pc = pc + 1; |
1293 | 0 | } |
1294 | 0 | dead:; |
1295 | 0 | } |
1296 | 0 | return 0; |
1297 | 0 | } |
1298 | | |
1299 | 0 | int re_regexec(Reprog *prog, const char *sp, Resub *sub, int eflags) { |
1300 | 0 | Resub scratch; |
1301 | 0 | int i; |
1302 | |
|
1303 | 0 | if (!sub) |
1304 | 0 | sub = &scratch; |
1305 | |
|
1306 | 0 | sub->nsub = prog->nsub; |
1307 | 0 | for (i = 0; i < MAXSUB; ++i) |
1308 | 0 | sub->sub[i].sp = sub->sub[i].ep = NULL; |
1309 | |
|
1310 | 0 | return !match(prog->start, sp, sp, prog->flags | eflags, sub); |
1311 | 0 | } |
1312 | | |
1313 | | #ifdef TEST |
1314 | | int main(int argc, char **argv) { |
1315 | | const char *error; |
1316 | | const char *s; |
1317 | | Reprog *p; |
1318 | | Resub m; |
1319 | | unsigned int i; |
1320 | | |
1321 | | if (argc > 1) { |
1322 | | p = regcomp(argv[1], 0, &error); |
1323 | | if (!p) { |
1324 | | fprintf(stderr, "regcomp: %s\n", error); |
1325 | | return 1; |
1326 | | } |
1327 | | |
1328 | | if (argc > 2) { |
1329 | | s = argv[2]; |
1330 | | printf("nsub = %d\n", p->nsub); |
1331 | | if (!regexec(p, s, &m, 0)) { |
1332 | | for (i = 0; i < m.nsub; ++i) { |
1333 | | int n = m.sub[i].ep - m.sub[i].sp; |
1334 | | if (n > 0) |
1335 | | printf( |
1336 | | "match %d: s=%d e=%d n=%d " |
1337 | | "'%.*s'\n", |
1338 | | i, (int)(m.sub[i].sp - s), |
1339 | | (int)(m.sub[i].ep - s), n, |
1340 | | n, m.sub[i].sp); |
1341 | | else |
1342 | | printf("match %d: n=0 ''\n", i); |
1343 | | } |
1344 | | } else { |
1345 | | printf("no match\n"); |
1346 | | } |
1347 | | } |
1348 | | } |
1349 | | |
1350 | | return 0; |
1351 | | } |
1352 | | #endif |