/src/gnupg/regexp/jimregexp.c
Line | Count | Source |
1 | | /* |
2 | | * vi:se ts=8: |
3 | | * |
4 | | * regcomp and regexec -- regsub and regerror are elsewhere |
5 | | * |
6 | | * Copyright (c) 1986 by University of Toronto. |
7 | | * Written by Henry Spencer. Not derived from licensed software. |
8 | | * |
9 | | * Permission is granted to anyone to use this software for any |
10 | | * purpose on any computer system, and to redistribute it freely, |
11 | | * subject to the following restrictions: |
12 | | * |
13 | | * 1. The author is not responsible for the consequences of use of |
14 | | * this software, no matter how awful, even if they arise |
15 | | * from defects in it. |
16 | | * |
17 | | * 2. The origin of this software must not be misrepresented, either |
18 | | * by explicit claim or by omission. |
19 | | * |
20 | | * 3. Altered versions must be plainly marked as such, and must not |
21 | | * be misrepresented as being the original software. |
22 | | *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, |
23 | | *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | |
24 | | *** to assist in implementing egrep. |
25 | | *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, |
26 | | *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching |
27 | | *** as in BSD grep and ex. |
28 | | *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, |
29 | | *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. |
30 | | *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, |
31 | | *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. |
32 | | *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald |
33 | | *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h |
34 | | *** was moved into regexp.h, and the include of regexp.h now uses "'s |
35 | | *** to avoid conflicting with the system regexp.h. Const, bless its |
36 | | *** soul, was removed so it can compile everywhere. The declaration |
37 | | *** of strchr() was in conflict on AIX, so it was removed (as it is |
38 | | *** happily defined in string.h). |
39 | | *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald |
40 | | *** seiwald@perforce.com, on 20 January 2000, to use function prototypes. |
41 | | *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald |
42 | | *** seiwald@perforce.com, on 05 November 2002, to const string literals. |
43 | | * |
44 | | * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au> |
45 | | * on 16 October 2010, to remove static state and add better Tcl ARE compatibility. |
46 | | * This includes counted repetitions, UTF-8 support, character classes, |
47 | | * shorthand character classes, increased number of parentheses to 100, |
48 | | * backslash escape sequences. It also removes \n as an alternative to |. |
49 | | * |
50 | | *** THIS IS AN ALTERED VERSION. It was altered to offer POSIX-like |
51 | | *** regular expression routines of regcomp/regexec/regerror/regfree, |
52 | | *** with UTF-8 support, by NIIBE Yutaka <gniibe@fsij.org> on |
53 | | *** 2020-02-14. |
54 | | * |
55 | | * Beware that some of this code is subtly aware of the way operator |
56 | | * precedence is structured in regular expressions. Serious changes in |
57 | | * regular-expression syntax might require a total rethink. |
58 | | */ |
59 | | |
60 | | #if defined(JIM_REGEXP) |
61 | | #include <stdio.h> |
62 | | #include <ctype.h> |
63 | | #include <stdlib.h> |
64 | | #include <string.h> |
65 | | |
66 | | #include "jimregexp.h" |
67 | | #include "utf8.h" |
68 | | |
69 | | #define UCHAR(c) ((unsigned char)(c)) |
70 | | |
71 | | /* An arbitrary limit, but this seems enough. Must be less than 1000. */ |
72 | 0 | #define REG_MAX_PAREN 100 |
73 | | |
74 | | /* |
75 | | * Structure for regexp "program". This is essentially a linear encoding |
76 | | * of a nondeterministic finite-state machine (aka syntax charts or |
77 | | * "railroad normal form" in parsing technology). Each node is an opcode |
78 | | * plus a "next" pointer, possibly plus an operand. "Next" pointers of |
79 | | * all nodes except BRANCH implement concatenation; a "next" pointer with |
80 | | * a BRANCH on both ends of it is connecting two alternatives. (Here we |
81 | | * have one of the subtle syntax dependencies: an individual BRANCH (as |
82 | | * opposed to a collection of them) is never concatenated with anything |
83 | | * because of operator precedence.) The operand of some types of node is |
84 | | * a literal string; for others, it is a node leading into a sub-FSM. In |
85 | | * particular, the operand of a BRANCH node is the first node of the branch. |
86 | | * (NB this is *not* a tree structure: the tail of the branch connects |
87 | | * to the thing following the set of BRANCHes.) The opcodes are: |
88 | | */ |
89 | | |
90 | | /* definition number opnd? meaning */ |
91 | 0 | #define END 0 /* no End of program. */ |
92 | 0 | #define BOL 1 /* no Match "" at beginning of line. */ |
93 | 0 | #define EOL 2 /* no Match "" at end of line. */ |
94 | 0 | #define ANY 3 /* no Match any one character. */ |
95 | 0 | #define ANYOF 4 /* str Match any character in this string. */ |
96 | 0 | #define ANYBUT 5 /* str Match any character not in this string. */ |
97 | 0 | #define BRANCH 6 /* node Match this alternative, or the next... */ |
98 | 0 | #define BACK 7 /* no Match "", "next" ptr points backward. */ |
99 | 0 | #define EXACTLY 8 /* str Match this string. */ |
100 | 0 | #define NOTHING 9 /* no Match empty string. */ |
101 | 0 | #define REP 10 /* max,min Match this (simple) thing [min,max] times. */ |
102 | 0 | #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, minimal match. */ |
103 | 0 | #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */ |
104 | 0 | #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */ |
105 | 0 | #define BOLX 14 /* no Match "" at beginning of input. */ |
106 | 0 | #define EOLX 15 /* no Match "" at end of input. */ |
107 | 0 | #define WORDA 16 /* no Match "" at wordchar, where prev is nonword */ |
108 | 0 | #define WORDZ 17 /* no Match "" at nonwordchar, where prev is word */ |
109 | | |
110 | 0 | #define OPENNC 1000 /* no Non-capturing parentheses - must be OPEN-1 */ |
111 | 0 | #define OPEN 1001 /* no Mark this point in input as start of #n. */ |
112 | | /* OPEN+1 is number 1, etc. */ |
113 | | |
114 | | /* must not be any other opts between OPEN and CLOSE */ |
115 | | |
116 | 0 | #define CLOSENC 2000 /* no Non-capturing parentheses - must be CLOSE-1 */ |
117 | 0 | #define CLOSE 2001 /* no Analogous to OPEN. */ |
118 | 0 | #define CLOSE_END (CLOSE+REG_MAX_PAREN) |
119 | | |
120 | | /* |
121 | | * The first word of the regexp internal "program" is actually this magic |
122 | | * number; the start node begins in the second word. |
123 | | */ |
124 | 0 | #define REG_MAGIC 0xFADED00D |
125 | | |
126 | | /* |
127 | | * Opcode notes: |
128 | | * |
129 | | * BRANCH The set of branches constituting a single choice are hooked |
130 | | * together with their "next" pointers, since precedence prevents |
131 | | * anything being concatenated to any individual branch. The |
132 | | * "next" pointer of the last BRANCH in a choice points to the |
133 | | * thing following the whole choice. This is also where the |
134 | | * final "next" pointer of each individual branch points; each |
135 | | * branch starts with the operand node of a BRANCH node. |
136 | | * |
137 | | * BACK Normal "next" pointers all implicitly point forward; BACK |
138 | | * exists to make loop structures possible. |
139 | | * |
140 | | * REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented |
141 | | * as either simple repeats (REP) or complex repeats (REPX). |
142 | | * These opcodes include a "min" and "max" count after the opcode. |
143 | | * This is followed by a fourth "current count" word that is |
144 | | * only used by REPX, as it implements a recursive match. |
145 | | * REPMIN and REPXMIN are identical except they implement minimal repeats. |
146 | | * |
147 | | * OPEN,CLOSE ...are numbered at compile time. |
148 | | */ |
149 | | |
150 | | /* |
151 | | * A node is one word of opcode followed by one word of "next" pointer. |
152 | | * The "next" pointer value is a positive offset from the opcode of the node |
153 | | * containing it. |
154 | | * An operand, if any, simply follows the node. (Note that much of the |
155 | | * code generation knows about this implicit relationship.) |
156 | | */ |
157 | 0 | #define OP(preg, p) (preg->program[p]) |
158 | 0 | #define NEXT(preg, p) (preg->program[p + 1]) |
159 | 0 | #define OPERAND(p) ((p) + 2) |
160 | | |
161 | | /* |
162 | | * See regmagic.h for one further detail of program structure. |
163 | | */ |
164 | | |
165 | | |
166 | | /* |
167 | | * Utility definitions. |
168 | | */ |
169 | | |
170 | 0 | #define FAIL(R,M) { (R)->err = (M); return (M); } |
171 | 0 | #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{') |
172 | 0 | #define META "^$.[()|?{+*" |
173 | | |
174 | | /* |
175 | | * Flags to be passed up and down. |
176 | | */ |
177 | 0 | #define HASWIDTH 1 /* Known never to match null string. */ |
178 | 0 | #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */ |
179 | 0 | #define SPSTART 4 /* Starts with * or +. */ |
180 | 0 | #define WORST 0 /* Worst case. */ |
181 | | |
182 | 0 | #define MAX_REP_COUNT 1000000 |
183 | | |
184 | | /* |
185 | | * Forward declarations for regcomp()'s friends. |
186 | | */ |
187 | | static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp ); |
188 | | static int regpiece(regex_t *preg, int *flagp ); |
189 | | static int regbranch(regex_t *preg, int *flagp ); |
190 | | static int regatom(regex_t *preg, int *flagp ); |
191 | | static int regnode(regex_t *preg, int op ); |
192 | | static int regnext(regex_t *preg, int p ); |
193 | | static void regc(regex_t *preg, int b ); |
194 | | static int reginsert(regex_t *preg, int op, int size, int opnd ); |
195 | | static void regtail(regex_t *preg, int p, int val); |
196 | | static void regoptail(regex_t *preg, int p, int val ); |
197 | | static int regopsize(regex_t *preg, int p ); |
198 | | |
199 | | static int reg_range_find(const int *string, int c); |
200 | | static const char *str_find(const char *string, int c, int nocase); |
201 | | static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase); |
202 | | |
203 | | /*#define DEBUG*/ |
204 | | #ifdef DEBUG |
205 | | static int regnarrate = 0; |
206 | | static void regdump(regex_t *preg); |
207 | | static const char *regprop( int op ); |
208 | | #endif |
209 | | |
210 | | |
211 | | /** |
212 | | * Returns the length of the null-terminated integer sequence. |
213 | | */ |
214 | | static int str_int_len(const int *seq) |
215 | 0 | { |
216 | 0 | int n = 0; |
217 | 0 | while (*seq++) { |
218 | 0 | n++; |
219 | 0 | } |
220 | 0 | return n; |
221 | 0 | } |
222 | | |
223 | | /* |
224 | | - regcomp - compile a regular expression into internal code |
225 | | * |
226 | | * We can't allocate space until we know how big the compiled form will be, |
227 | | * but we can't compile it (and thus know how big it is) until we've got a |
228 | | * place to put the code. So we cheat: we compile it twice, once with code |
229 | | * generation turned off and size counting turned on, and once "for real". |
230 | | * This also means that we don't allocate space until we are sure that the |
231 | | * thing really will compile successfully, and we never have to move the |
232 | | * code and thus invalidate pointers into it. (Note that it has to be in |
233 | | * one piece because free() must be able to free it all.) |
234 | | * |
235 | | * Beware that the optimization-preparation code in here knows about some |
236 | | * of the structure of the compiled regexp. |
237 | | */ |
238 | | int regcomp(regex_t *preg, const char *exp, int cflags) |
239 | 0 | { |
240 | 0 | int scan; |
241 | 0 | int longest; |
242 | 0 | unsigned len; |
243 | 0 | int flags; |
244 | |
|
245 | | #ifdef DEBUG |
246 | | fprintf(stderr, "Compiling: '%s'\n", exp); |
247 | | #endif |
248 | 0 | memset(preg, 0, sizeof(*preg)); |
249 | |
|
250 | 0 | if (exp == NULL) |
251 | 0 | FAIL(preg, REG_ERR_NULL_ARGUMENT); |
252 | | |
253 | | /* First pass: determine size, legality. */ |
254 | 0 | preg->cflags = cflags; |
255 | 0 | preg->regparse = exp; |
256 | | |
257 | | /* Allocate space. */ |
258 | 0 | preg->proglen = (strlen(exp) + 1) * 5; |
259 | 0 | preg->program = malloc(preg->proglen * sizeof(int)); |
260 | 0 | if (preg->program == NULL) |
261 | 0 | FAIL(preg, REG_ERR_NOMEM); |
262 | | |
263 | | /* Note that since we store a magic value as the first item in the program, |
264 | | * program offsets will never be 0 |
265 | | */ |
266 | 0 | regc(preg, REG_MAGIC); |
267 | 0 | if (reg(preg, 0, &flags) == 0) { |
268 | 0 | return preg->err; |
269 | 0 | } |
270 | | |
271 | | /* Small enough for pointer-storage convention? */ |
272 | 0 | if (preg->re_nsub >= REG_MAX_PAREN) /* Probably could be 65535L. */ |
273 | 0 | FAIL(preg,REG_ERR_TOO_BIG); |
274 | | |
275 | | /* Dig out information for optimizations. */ |
276 | 0 | preg->regstart = 0; /* Worst-case defaults. */ |
277 | 0 | preg->reganch = 0; |
278 | 0 | preg->regmust = 0; |
279 | 0 | preg->regmlen = 0; |
280 | 0 | scan = 1; /* First BRANCH. */ |
281 | 0 | if (OP(preg, regnext(preg, scan)) == END) { /* Only one top-level choice. */ |
282 | 0 | scan = OPERAND(scan); |
283 | | |
284 | | /* Starting-point info. */ |
285 | 0 | if (OP(preg, scan) == EXACTLY) { |
286 | 0 | preg->regstart = preg->program[OPERAND(scan)]; |
287 | 0 | } |
288 | 0 | else if (OP(preg, scan) == BOL) |
289 | 0 | preg->reganch++; |
290 | | |
291 | | /* |
292 | | * If there's something expensive in the r.e., find the |
293 | | * longest literal string that must appear and make it the |
294 | | * regmust. Resolve ties in favor of later strings, since |
295 | | * the regstart check works with the beginning of the r.e. |
296 | | * and avoiding duplication strengthens checking. Not a |
297 | | * strong reason, but sufficient in the absence of others. |
298 | | */ |
299 | 0 | if (flags&SPSTART) { |
300 | 0 | longest = 0; |
301 | 0 | len = 0; |
302 | 0 | for (; scan != 0; scan = regnext(preg, scan)) { |
303 | 0 | if (OP(preg, scan) == EXACTLY) { |
304 | 0 | int plen = str_int_len(preg->program + OPERAND(scan)); |
305 | 0 | if (plen >= len) { |
306 | 0 | longest = OPERAND(scan); |
307 | 0 | len = plen; |
308 | 0 | } |
309 | 0 | } |
310 | 0 | } |
311 | 0 | preg->regmust = longest; |
312 | 0 | preg->regmlen = len; |
313 | 0 | } |
314 | 0 | } |
315 | |
|
316 | | #ifdef DEBUG |
317 | | regdump(preg); |
318 | | #endif |
319 | |
|
320 | 0 | return 0; |
321 | 0 | } |
322 | | |
323 | | /* |
324 | | - reg - regular expression, i.e. main body or parenthesized thing |
325 | | * |
326 | | * Caller must absorb opening parenthesis. |
327 | | * |
328 | | * Combining parenthesis handling with the base level of regular expression |
329 | | * is a trifle forced, but the need to tie the tails of the branches to what |
330 | | * follows makes it hard to avoid. |
331 | | */ |
332 | | static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp ) |
333 | 0 | { |
334 | 0 | int ret; |
335 | 0 | int br; |
336 | 0 | int ender; |
337 | 0 | int parno = 0; |
338 | 0 | int flags; |
339 | |
|
340 | 0 | *flagp = HASWIDTH; /* Tentatively. */ |
341 | | |
342 | | /* Make an OPEN node, if parenthesized. */ |
343 | 0 | if (paren) { |
344 | 0 | if (preg->regparse[0] == '?' && preg->regparse[1] == ':') { |
345 | | /* non-capturing paren */ |
346 | 0 | preg->regparse += 2; |
347 | 0 | parno = -1; |
348 | 0 | } |
349 | 0 | else { |
350 | 0 | parno = ++preg->re_nsub; |
351 | 0 | } |
352 | 0 | ret = regnode(preg, OPEN+parno); |
353 | 0 | } else |
354 | 0 | ret = 0; |
355 | | |
356 | | /* Pick up the branches, linking them together. */ |
357 | 0 | br = regbranch(preg, &flags); |
358 | 0 | if (br == 0) |
359 | 0 | return 0; |
360 | 0 | if (ret != 0) |
361 | 0 | regtail(preg, ret, br); /* OPEN -> first. */ |
362 | 0 | else |
363 | 0 | ret = br; |
364 | 0 | if (!(flags&HASWIDTH)) |
365 | 0 | *flagp &= ~HASWIDTH; |
366 | 0 | *flagp |= flags&SPSTART; |
367 | 0 | while (*preg->regparse == '|') { |
368 | 0 | preg->regparse++; |
369 | 0 | br = regbranch(preg, &flags); |
370 | 0 | if (br == 0) |
371 | 0 | return 0; |
372 | 0 | regtail(preg, ret, br); /* BRANCH -> BRANCH. */ |
373 | 0 | if (!(flags&HASWIDTH)) |
374 | 0 | *flagp &= ~HASWIDTH; |
375 | 0 | *flagp |= flags&SPSTART; |
376 | 0 | } |
377 | | |
378 | | /* Make a closing node, and hook it on the end. */ |
379 | 0 | ender = regnode(preg, (paren) ? CLOSE+parno : END); |
380 | 0 | regtail(preg, ret, ender); |
381 | | |
382 | | /* Hook the tails of the branches to the closing node. */ |
383 | 0 | for (br = ret; br != 0; br = regnext(preg, br)) |
384 | 0 | regoptail(preg, br, ender); |
385 | | |
386 | | /* Check for proper termination. */ |
387 | 0 | if (paren && *preg->regparse++ != ')') { |
388 | 0 | preg->err = REG_ERR_UNMATCHED_PAREN; |
389 | 0 | return 0; |
390 | 0 | } else if (!paren && *preg->regparse != '\0') { |
391 | 0 | if (*preg->regparse == ')') { |
392 | 0 | preg->err = REG_ERR_UNMATCHED_PAREN; |
393 | 0 | return 0; |
394 | 0 | } else { |
395 | 0 | preg->err = REG_ERR_JUNK_ON_END; |
396 | 0 | return 0; |
397 | 0 | } |
398 | 0 | } |
399 | | |
400 | 0 | return(ret); |
401 | 0 | } |
402 | | |
403 | | /* |
404 | | - regbranch - one alternative of an | operator |
405 | | * |
406 | | * Implements the concatenation operator. |
407 | | */ |
408 | | static int regbranch(regex_t *preg, int *flagp ) |
409 | 0 | { |
410 | 0 | int ret; |
411 | 0 | int chain; |
412 | 0 | int latest; |
413 | 0 | int flags; |
414 | |
|
415 | 0 | *flagp = WORST; /* Tentatively. */ |
416 | |
|
417 | 0 | ret = regnode(preg, BRANCH); |
418 | 0 | chain = 0; |
419 | 0 | while (*preg->regparse != '\0' && *preg->regparse != ')' && |
420 | 0 | *preg->regparse != '|') { |
421 | 0 | latest = regpiece(preg, &flags); |
422 | 0 | if (latest == 0) |
423 | 0 | return 0; |
424 | 0 | *flagp |= flags&HASWIDTH; |
425 | 0 | if (chain == 0) {/* First piece. */ |
426 | 0 | *flagp |= flags&SPSTART; |
427 | 0 | } |
428 | 0 | else { |
429 | 0 | regtail(preg, chain, latest); |
430 | 0 | } |
431 | 0 | chain = latest; |
432 | 0 | } |
433 | 0 | if (chain == 0) /* Loop ran zero times. */ |
434 | 0 | (void) regnode(preg, NOTHING); |
435 | |
|
436 | 0 | return(ret); |
437 | 0 | } |
438 | | |
439 | | /* |
440 | | - regpiece - something followed by possible [*+?] |
441 | | * |
442 | | * Note that the branching code sequences used for ? and the general cases |
443 | | * of * and + are somewhat optimized: they use the same NOTHING node as |
444 | | * both the endmarker for their branch list and the body of the last branch. |
445 | | * It might seem that this node could be dispensed with entirely, but the |
446 | | * endmarker role is not redundant. |
447 | | */ |
448 | | static int regpiece(regex_t *preg, int *flagp) |
449 | 0 | { |
450 | 0 | int ret; |
451 | 0 | char op; |
452 | 0 | int next; |
453 | 0 | int flags; |
454 | 0 | int min; |
455 | 0 | int max; |
456 | |
|
457 | 0 | ret = regatom(preg, &flags); |
458 | 0 | if (ret == 0) |
459 | 0 | return 0; |
460 | | |
461 | 0 | op = *preg->regparse; |
462 | 0 | if (!ISMULT(op)) { |
463 | 0 | *flagp = flags; |
464 | 0 | return(ret); |
465 | 0 | } |
466 | | |
467 | 0 | if (!(flags&HASWIDTH) && op != '?') { |
468 | 0 | preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY; |
469 | 0 | return 0; |
470 | 0 | } |
471 | | |
472 | | /* Handle braces (counted repetition) by expansion */ |
473 | 0 | if (op == '{') { |
474 | 0 | char *end; |
475 | |
|
476 | 0 | min = strtoul(preg->regparse + 1, &end, 10); |
477 | 0 | if (end == preg->regparse + 1) { |
478 | 0 | preg->err = REG_ERR_BAD_COUNT; |
479 | 0 | return 0; |
480 | 0 | } |
481 | 0 | if (*end == '}') { |
482 | 0 | max = min; |
483 | 0 | } |
484 | 0 | else if (*end == '\0') { |
485 | 0 | preg->err = REG_ERR_UNMATCHED_BRACES; |
486 | 0 | return 0; |
487 | 0 | } |
488 | 0 | else { |
489 | 0 | preg->regparse = end; |
490 | 0 | max = strtoul(preg->regparse + 1, &end, 10); |
491 | 0 | if (*end != '}') { |
492 | 0 | preg->err = REG_ERR_UNMATCHED_BRACES; |
493 | 0 | return 0; |
494 | 0 | } |
495 | 0 | } |
496 | 0 | if (end == preg->regparse + 1) { |
497 | 0 | max = MAX_REP_COUNT; |
498 | 0 | } |
499 | 0 | else if (max < min || max >= 100) { |
500 | 0 | preg->err = REG_ERR_BAD_COUNT; |
501 | 0 | return 0; |
502 | 0 | } |
503 | 0 | if (min >= 100) { |
504 | 0 | preg->err = REG_ERR_BAD_COUNT; |
505 | 0 | return 0; |
506 | 0 | } |
507 | | |
508 | 0 | preg->regparse = strchr(preg->regparse, '}'); |
509 | 0 | } |
510 | 0 | else { |
511 | 0 | min = (op == '+'); |
512 | 0 | max = (op == '?' ? 1 : MAX_REP_COUNT); |
513 | 0 | } |
514 | | |
515 | 0 | if (preg->regparse[1] == '?') { |
516 | 0 | preg->regparse++; |
517 | 0 | next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret); |
518 | 0 | } |
519 | 0 | else { |
520 | 0 | next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret); |
521 | 0 | } |
522 | 0 | preg->program[ret + 2] = max; |
523 | 0 | preg->program[ret + 3] = min; |
524 | 0 | preg->program[ret + 4] = 0; |
525 | |
|
526 | 0 | *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART); |
527 | |
|
528 | 0 | if (!(flags & SIMPLE)) { |
529 | 0 | int back = regnode(preg, BACK); |
530 | 0 | regtail(preg, back, ret); |
531 | 0 | regtail(preg, next, back); |
532 | 0 | } |
533 | |
|
534 | 0 | preg->regparse++; |
535 | 0 | if (ISMULT(*preg->regparse)) { |
536 | 0 | preg->err = REG_ERR_NESTED_COUNT; |
537 | 0 | return 0; |
538 | 0 | } |
539 | | |
540 | 0 | return ret; |
541 | 0 | } |
542 | | |
543 | | /** |
544 | | * Add all characters in the inclusive range between lower and upper. |
545 | | * |
546 | | * Handles a swapped range (upper < lower). |
547 | | */ |
548 | | static void reg_addrange(regex_t *preg, int lower, int upper) |
549 | 0 | { |
550 | 0 | if (lower > upper) { |
551 | 0 | reg_addrange(preg, upper, lower); |
552 | 0 | } |
553 | | /* Add a range as length, start */ |
554 | 0 | regc(preg, upper - lower + 1); |
555 | 0 | regc(preg, lower); |
556 | 0 | } |
557 | | |
558 | | /** |
559 | | * Add a null-terminated literal string as a set of ranges. |
560 | | */ |
561 | | static void reg_addrange_str(regex_t *preg, const char *str) |
562 | 0 | { |
563 | 0 | while (*str) { |
564 | 0 | reg_addrange(preg, *str, *str); |
565 | 0 | str++; |
566 | 0 | } |
567 | 0 | } |
568 | | |
569 | | /** |
570 | | * Extracts the next unicode char from utf8. |
571 | | * |
572 | | * If 'upper' is set, converts the char to uppercase. |
573 | | */ |
574 | | static int reg_utf8_tounicode_case(const char *s, int *uc, int upper) |
575 | 0 | { |
576 | 0 | int l = utf8_tounicode(s, uc); |
577 | 0 | if (upper) { |
578 | 0 | *uc = utf8_upper(*uc); |
579 | 0 | } |
580 | 0 | return l; |
581 | 0 | } |
582 | | |
583 | | /** |
584 | | * Converts a hex digit to decimal. |
585 | | * |
586 | | * Returns -1 for an invalid hex digit. |
587 | | */ |
588 | | static int hexdigitval(int c) |
589 | 0 | { |
590 | 0 | if (c >= '0' && c <= '9') |
591 | 0 | return c - '0'; |
592 | 0 | if (c >= 'a' && c <= 'f') |
593 | 0 | return c - 'a' + 10; |
594 | 0 | if (c >= 'A' && c <= 'F') |
595 | 0 | return c - 'A' + 10; |
596 | 0 | return -1; |
597 | 0 | } |
598 | | |
599 | | /** |
600 | | * Parses up to 'n' hex digits at 's' and stores the result in *uc. |
601 | | * |
602 | | * Returns the number of hex digits parsed. |
603 | | * If there are no hex digits, returns 0 and stores nothing. |
604 | | */ |
605 | | static int parse_hex(const char *s, int n, int *uc) |
606 | 0 | { |
607 | 0 | int val = 0; |
608 | 0 | int k; |
609 | |
|
610 | 0 | for (k = 0; k < n; k++) { |
611 | 0 | int c = hexdigitval(*s++); |
612 | 0 | if (c == -1) { |
613 | 0 | break; |
614 | 0 | } |
615 | 0 | val = (val << 4) | c; |
616 | 0 | } |
617 | 0 | if (k) { |
618 | 0 | *uc = val; |
619 | 0 | } |
620 | 0 | return k; |
621 | 0 | } |
622 | | |
623 | | /** |
624 | | * Call for chars after a backlash to decode the escape sequence. |
625 | | * |
626 | | * Stores the result in *ch. |
627 | | * |
628 | | * Returns the number of bytes consumed. |
629 | | */ |
630 | | static int reg_decode_escape(const char *s, int *ch) |
631 | 0 | { |
632 | 0 | int n; |
633 | 0 | const char *s0 = s; |
634 | |
|
635 | 0 | *ch = *s++; |
636 | |
|
637 | 0 | switch (*ch) { |
638 | 0 | case 'b': *ch = '\b'; break; |
639 | 0 | case 'e': *ch = 27; break; |
640 | 0 | case 'f': *ch = '\f'; break; |
641 | 0 | case 'n': *ch = '\n'; break; |
642 | 0 | case 'r': *ch = '\r'; break; |
643 | 0 | case 't': *ch = '\t'; break; |
644 | 0 | case 'v': *ch = '\v'; break; |
645 | 0 | case 'u': |
646 | 0 | if (*s == '{') { |
647 | | /* Expect \u{NNNN} */ |
648 | 0 | n = parse_hex(s + 1, 6, ch); |
649 | 0 | if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) { |
650 | 0 | s += n + 2; |
651 | 0 | } |
652 | 0 | else { |
653 | | /* Invalid, so just treat as an escaped 'u' */ |
654 | 0 | *ch = 'u'; |
655 | 0 | } |
656 | 0 | } |
657 | 0 | else if ((n = parse_hex(s, 4, ch)) > 0) { |
658 | 0 | s += n; |
659 | 0 | } |
660 | 0 | break; |
661 | 0 | case 'U': |
662 | 0 | if ((n = parse_hex(s, 8, ch)) > 0) { |
663 | 0 | s += n; |
664 | 0 | } |
665 | 0 | break; |
666 | 0 | case 'x': |
667 | 0 | if ((n = parse_hex(s, 2, ch)) > 0) { |
668 | 0 | s += n; |
669 | 0 | } |
670 | 0 | break; |
671 | 0 | case '\0': |
672 | 0 | s--; |
673 | 0 | *ch = '\\'; |
674 | 0 | break; |
675 | 0 | } |
676 | 0 | return s - s0; |
677 | 0 | } |
678 | | |
679 | | /* |
680 | | - regatom - the lowest level |
681 | | * |
682 | | * Optimization: gobbles an entire sequence of ordinary characters so that |
683 | | * it can turn them into a single node, which is smaller to store and |
684 | | * faster to run. Backslashed characters are exceptions, each becoming a |
685 | | * separate node; the code is simpler that way and it's not worth fixing. |
686 | | */ |
687 | | static int regatom(regex_t *preg, int *flagp) |
688 | 0 | { |
689 | 0 | int ret; |
690 | 0 | int flags; |
691 | 0 | int nocase = (preg->cflags & REG_ICASE); |
692 | |
|
693 | 0 | int ch; |
694 | 0 | int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase); |
695 | |
|
696 | 0 | *flagp = WORST; /* Tentatively. */ |
697 | |
|
698 | 0 | preg->regparse += n; |
699 | 0 | switch (ch) { |
700 | | /* FIXME: these chars only have meaning at beg/end of pat? */ |
701 | 0 | case '^': |
702 | 0 | ret = regnode(preg, BOL); |
703 | 0 | break; |
704 | 0 | case '$': |
705 | 0 | ret = regnode(preg, EOL); |
706 | 0 | break; |
707 | 0 | case '.': |
708 | 0 | ret = regnode(preg, ANY); |
709 | 0 | *flagp |= HASWIDTH|SIMPLE; |
710 | 0 | break; |
711 | 0 | case '[': { |
712 | 0 | const char *pattern = preg->regparse; |
713 | |
|
714 | 0 | if (*pattern == '^') { /* Complement of range. */ |
715 | 0 | ret = regnode(preg, ANYBUT); |
716 | 0 | pattern++; |
717 | 0 | } else |
718 | 0 | ret = regnode(preg, ANYOF); |
719 | | |
720 | | /* Special case. If the first char is ']' or '-', it is part of the set */ |
721 | 0 | if (*pattern == ']' || *pattern == '-') { |
722 | 0 | reg_addrange(preg, *pattern, *pattern); |
723 | 0 | pattern++; |
724 | 0 | } |
725 | |
|
726 | 0 | while (*pattern != ']') { |
727 | | /* Is this a range? a-z */ |
728 | 0 | int start; |
729 | 0 | int end; |
730 | |
|
731 | 0 | enum { |
732 | 0 | CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER, |
733 | 0 | CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT, |
734 | 0 | CC_NUM |
735 | 0 | }; |
736 | 0 | int cc; |
737 | |
|
738 | 0 | if (!*pattern) { |
739 | 0 | preg->err = REG_ERR_UNMATCHED_BRACKET; |
740 | 0 | return 0; |
741 | 0 | } |
742 | | |
743 | 0 | pattern += reg_utf8_tounicode_case(pattern, &start, nocase); |
744 | 0 | if (start == '\\') { |
745 | | /* First check for class shorthand escapes */ |
746 | 0 | switch (*pattern) { |
747 | 0 | case 's': |
748 | 0 | pattern++; |
749 | 0 | cc = CC_SPACE; |
750 | 0 | goto cc_switch; |
751 | 0 | case 'd': |
752 | 0 | pattern++; |
753 | 0 | cc = CC_DIGIT; |
754 | 0 | goto cc_switch; |
755 | 0 | case 'w': |
756 | 0 | pattern++; |
757 | 0 | reg_addrange(preg, '_', '_'); |
758 | 0 | cc = CC_ALNUM; |
759 | 0 | goto cc_switch; |
760 | 0 | } |
761 | 0 | pattern += reg_decode_escape(pattern, &start); |
762 | 0 | if (start == 0) { |
763 | 0 | preg->err = REG_ERR_NULL_CHAR; |
764 | 0 | return 0; |
765 | 0 | } |
766 | 0 | if (start == '\\' && *pattern == 0) { |
767 | 0 | preg->err = REG_ERR_INVALID_ESCAPE; |
768 | 0 | return 0; |
769 | 0 | } |
770 | 0 | } |
771 | 0 | if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') { |
772 | | /* skip '-' */ |
773 | 0 | pattern += utf8_tounicode(pattern, &end); |
774 | 0 | pattern += reg_utf8_tounicode_case(pattern, &end, nocase); |
775 | 0 | if (end == '\\') { |
776 | 0 | pattern += reg_decode_escape(pattern, &end); |
777 | 0 | if (end == 0) { |
778 | 0 | preg->err = REG_ERR_NULL_CHAR; |
779 | 0 | return 0; |
780 | 0 | } |
781 | 0 | if (end == '\\' && *pattern == 0) { |
782 | 0 | preg->err = REG_ERR_INVALID_ESCAPE; |
783 | 0 | return 0; |
784 | 0 | } |
785 | 0 | } |
786 | | |
787 | 0 | reg_addrange(preg, start, end); |
788 | 0 | continue; |
789 | 0 | } |
790 | 0 | if (start == '[' && pattern[0] == ':') { |
791 | 0 | static const char *character_class[] = { |
792 | 0 | ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:", |
793 | 0 | ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:", |
794 | 0 | }; |
795 | |
|
796 | 0 | for (cc = 0; cc < CC_NUM; cc++) { |
797 | 0 | n = strlen(character_class[cc]); |
798 | 0 | if (strncmp(pattern, character_class[cc], n) == 0) { |
799 | 0 | if (pattern[n] != ']') { |
800 | 0 | preg->err = REG_ERR_UNMATCHED_BRACKET; |
801 | 0 | return 0; |
802 | 0 | } |
803 | | /* Found a character class */ |
804 | 0 | pattern += n + 1; |
805 | 0 | break; |
806 | 0 | } |
807 | 0 | } |
808 | 0 | if (cc != CC_NUM) { |
809 | 0 | cc_switch: |
810 | 0 | switch (cc) { |
811 | 0 | case CC_ALNUM: |
812 | 0 | reg_addrange(preg, '0', '9'); |
813 | | /* Fall through */ |
814 | 0 | case CC_ALPHA: |
815 | 0 | if ((preg->cflags & REG_ICASE) == 0) { |
816 | 0 | reg_addrange(preg, 'a', 'z'); |
817 | 0 | } |
818 | 0 | reg_addrange(preg, 'A', 'Z'); |
819 | 0 | break; |
820 | 0 | case CC_SPACE: |
821 | 0 | reg_addrange_str(preg, " \t\r\n\f\v"); |
822 | 0 | break; |
823 | 0 | case CC_BLANK: |
824 | 0 | reg_addrange_str(preg, " \t"); |
825 | 0 | break; |
826 | 0 | case CC_UPPER: |
827 | 0 | reg_addrange(preg, 'A', 'Z'); |
828 | 0 | break; |
829 | 0 | case CC_LOWER: |
830 | 0 | reg_addrange(preg, 'a', 'z'); |
831 | 0 | break; |
832 | 0 | case CC_XDIGIT: |
833 | 0 | reg_addrange(preg, 'a', 'f'); |
834 | 0 | reg_addrange(preg, 'A', 'F'); |
835 | | /* Fall through */ |
836 | 0 | case CC_DIGIT: |
837 | 0 | reg_addrange(preg, '0', '9'); |
838 | 0 | break; |
839 | 0 | case CC_CNTRL: |
840 | 0 | reg_addrange(preg, 0, 31); |
841 | 0 | reg_addrange(preg, 127, 127); |
842 | 0 | break; |
843 | 0 | case CC_PRINT: |
844 | 0 | reg_addrange(preg, ' ', '~'); |
845 | 0 | break; |
846 | 0 | case CC_GRAPH: |
847 | 0 | reg_addrange(preg, '!', '~'); |
848 | 0 | break; |
849 | 0 | case CC_PUNCT: |
850 | 0 | reg_addrange(preg, '!', '/'); |
851 | 0 | reg_addrange(preg, ':', '@'); |
852 | 0 | reg_addrange(preg, '[', '`'); |
853 | 0 | reg_addrange(preg, '{', '~'); |
854 | 0 | break; |
855 | 0 | } |
856 | 0 | continue; |
857 | 0 | } |
858 | 0 | } |
859 | | /* Not a range, so just add the char */ |
860 | 0 | reg_addrange(preg, start, start); |
861 | 0 | } |
862 | 0 | regc(preg, '\0'); |
863 | |
|
864 | 0 | if (*pattern) { |
865 | 0 | pattern++; |
866 | 0 | } |
867 | 0 | preg->regparse = pattern; |
868 | |
|
869 | 0 | *flagp |= HASWIDTH|SIMPLE; |
870 | 0 | } |
871 | 0 | break; |
872 | 0 | case '(': |
873 | 0 | ret = reg(preg, 1, &flags); |
874 | 0 | if (ret == 0) |
875 | 0 | return 0; |
876 | 0 | *flagp |= flags&(HASWIDTH|SPSTART); |
877 | 0 | break; |
878 | 0 | case '\0': |
879 | 0 | case '|': |
880 | 0 | case ')': |
881 | 0 | preg->err = REG_ERR_INTERNAL; |
882 | 0 | return 0; /* Supposed to be caught earlier. */ |
883 | 0 | case '?': |
884 | 0 | case '+': |
885 | 0 | case '*': |
886 | 0 | case '{': |
887 | 0 | preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING; |
888 | 0 | return 0; |
889 | 0 | case '\\': |
890 | 0 | ch = *preg->regparse++; |
891 | 0 | switch (ch) { |
892 | 0 | case '\0': |
893 | 0 | preg->err = REG_ERR_INVALID_ESCAPE; |
894 | 0 | return 0; |
895 | 0 | case 'A': |
896 | 0 | ret = regnode(preg, BOLX); |
897 | 0 | break; |
898 | 0 | case 'Z': |
899 | 0 | ret = regnode(preg, EOLX); |
900 | 0 | break; |
901 | 0 | case '<': |
902 | 0 | case 'm': |
903 | 0 | ret = regnode(preg, WORDA); |
904 | 0 | break; |
905 | 0 | case '>': |
906 | 0 | case 'M': |
907 | 0 | ret = regnode(preg, WORDZ); |
908 | 0 | break; |
909 | 0 | case 'd': |
910 | 0 | case 'D': |
911 | 0 | ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT); |
912 | 0 | reg_addrange(preg, '0', '9'); |
913 | 0 | regc(preg, '\0'); |
914 | 0 | *flagp |= HASWIDTH|SIMPLE; |
915 | 0 | break; |
916 | 0 | case 'w': |
917 | 0 | case 'W': |
918 | 0 | ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT); |
919 | 0 | if ((preg->cflags & REG_ICASE) == 0) { |
920 | 0 | reg_addrange(preg, 'a', 'z'); |
921 | 0 | } |
922 | 0 | reg_addrange(preg, 'A', 'Z'); |
923 | 0 | reg_addrange(preg, '0', '9'); |
924 | 0 | reg_addrange(preg, '_', '_'); |
925 | 0 | regc(preg, '\0'); |
926 | 0 | *flagp |= HASWIDTH|SIMPLE; |
927 | 0 | break; |
928 | 0 | case 's': |
929 | 0 | case 'S': |
930 | 0 | ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT); |
931 | 0 | reg_addrange_str(preg," \t\r\n\f\v"); |
932 | 0 | regc(preg, '\0'); |
933 | 0 | *flagp |= HASWIDTH|SIMPLE; |
934 | 0 | break; |
935 | | /* FIXME: Someday handle \1, \2, ... */ |
936 | 0 | default: |
937 | | /* Handle general quoted chars in exact-match routine */ |
938 | | /* Back up to include the backslash */ |
939 | 0 | preg->regparse--; |
940 | 0 | goto de_fault; |
941 | 0 | } |
942 | 0 | break; |
943 | 0 | de_fault: |
944 | 0 | default: { |
945 | | /* |
946 | | * Encode a string of characters to be matched exactly. |
947 | | */ |
948 | 0 | int added = 0; |
949 | | |
950 | | /* Back up to pick up the first char of interest */ |
951 | 0 | preg->regparse -= n; |
952 | |
|
953 | 0 | ret = regnode(preg, EXACTLY); |
954 | | |
955 | | /* Note that a META operator such as ? or * consumes the |
956 | | * preceding char. |
957 | | * Thus we must be careful to look ahead by 2 and add the |
958 | | * last char as it's own EXACTLY if necessary |
959 | | */ |
960 | | |
961 | | /* Until end of string or a META char is reached */ |
962 | 0 | while (*preg->regparse && strchr(META, *preg->regparse) == NULL) { |
963 | 0 | n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE)); |
964 | 0 | if (ch == '\\' && preg->regparse[n]) { |
965 | | /* Non-trailing backslash. |
966 | | * Is this a special escape, or a regular escape? |
967 | | */ |
968 | 0 | if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) { |
969 | | /* A special escape. All done with EXACTLY */ |
970 | 0 | break; |
971 | 0 | } |
972 | | /* Decode it. Note that we add the length for the escape |
973 | | * sequence to the length for the backlash so we can skip |
974 | | * the entire sequence, or not as required. |
975 | | */ |
976 | 0 | n += reg_decode_escape(preg->regparse + n, &ch); |
977 | 0 | if (ch == 0) { |
978 | 0 | preg->err = REG_ERR_NULL_CHAR; |
979 | 0 | return 0; |
980 | 0 | } |
981 | 0 | } |
982 | | |
983 | | /* Now we have one char 'ch' of length 'n'. |
984 | | * Check to see if the following char is a MULT |
985 | | */ |
986 | | |
987 | 0 | if (ISMULT(preg->regparse[n])) { |
988 | | /* Yes. But do we already have some EXACTLY chars? */ |
989 | 0 | if (added) { |
990 | | /* Yes, so return what we have and pick up the current char next time around */ |
991 | 0 | break; |
992 | 0 | } |
993 | | /* No, so add this single char and finish */ |
994 | 0 | regc(preg, ch); |
995 | 0 | added++; |
996 | 0 | preg->regparse += n; |
997 | 0 | break; |
998 | 0 | } |
999 | | |
1000 | | /* No, so just add this char normally */ |
1001 | 0 | regc(preg, ch); |
1002 | 0 | added++; |
1003 | 0 | preg->regparse += n; |
1004 | 0 | } |
1005 | 0 | regc(preg, '\0'); |
1006 | |
|
1007 | 0 | *flagp |= HASWIDTH; |
1008 | 0 | if (added == 1) |
1009 | 0 | *flagp |= SIMPLE; |
1010 | 0 | break; |
1011 | 0 | } |
1012 | 0 | break; |
1013 | 0 | } |
1014 | | |
1015 | 0 | return(ret); |
1016 | 0 | } |
1017 | | |
1018 | | static void reg_grow(regex_t *preg, int n) |
1019 | 0 | { |
1020 | 0 | if (preg->p + n >= preg->proglen) { |
1021 | 0 | preg->proglen = (preg->p + n) * 2; |
1022 | 0 | preg->program = realloc(preg->program, preg->proglen * sizeof(int)); |
1023 | 0 | } |
1024 | 0 | } |
1025 | | |
1026 | | /* |
1027 | | - regnode - emit a node |
1028 | | */ |
1029 | | /* Location. */ |
1030 | | static int regnode(regex_t *preg, int op) |
1031 | 0 | { |
1032 | 0 | reg_grow(preg, 2); |
1033 | | |
1034 | | /* The OP followed by a next pointer */ |
1035 | 0 | preg->program[preg->p++] = op; |
1036 | 0 | preg->program[preg->p++] = 0; |
1037 | | |
1038 | | /* Return the start of the node */ |
1039 | 0 | return preg->p - 2; |
1040 | 0 | } |
1041 | | |
1042 | | /* |
1043 | | - regc - emit (if appropriate) a byte of code |
1044 | | */ |
1045 | | static void regc(regex_t *preg, int b ) |
1046 | 0 | { |
1047 | 0 | reg_grow(preg, 1); |
1048 | 0 | preg->program[preg->p++] = b; |
1049 | 0 | } |
1050 | | |
1051 | | /* |
1052 | | - reginsert - insert an operator in front of already-emitted operand |
1053 | | * |
1054 | | * Means relocating the operand. |
1055 | | * Returns the new location of the original operand. |
1056 | | */ |
1057 | | static int reginsert(regex_t *preg, int op, int size, int opnd ) |
1058 | 0 | { |
1059 | 0 | reg_grow(preg, size); |
1060 | | |
1061 | | /* Move everything from opnd up */ |
1062 | 0 | memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd)); |
1063 | | /* Zero out the new space */ |
1064 | 0 | memset(preg->program + opnd, 0, sizeof(int) * size); |
1065 | |
|
1066 | 0 | preg->program[opnd] = op; |
1067 | |
|
1068 | 0 | preg->p += size; |
1069 | |
|
1070 | 0 | return opnd + size; |
1071 | 0 | } |
1072 | | |
1073 | | /* |
1074 | | - regtail - set the next-pointer at the end of a node chain |
1075 | | */ |
1076 | | static void regtail(regex_t *preg, int p, int val) |
1077 | 0 | { |
1078 | 0 | int scan; |
1079 | 0 | int temp; |
1080 | 0 | int offset; |
1081 | | |
1082 | | /* Find last node. */ |
1083 | 0 | scan = p; |
1084 | 0 | for (;;) { |
1085 | 0 | temp = regnext(preg, scan); |
1086 | 0 | if (temp == 0) |
1087 | 0 | break; |
1088 | 0 | scan = temp; |
1089 | 0 | } |
1090 | |
|
1091 | 0 | if (OP(preg, scan) == BACK) |
1092 | 0 | offset = scan - val; |
1093 | 0 | else |
1094 | 0 | offset = val - scan; |
1095 | |
|
1096 | 0 | preg->program[scan + 1] = offset; |
1097 | 0 | } |
1098 | | |
1099 | | /* |
1100 | | - regoptail - regtail on operand of first argument; nop if operandless |
1101 | | */ |
1102 | | |
1103 | | static void regoptail(regex_t *preg, int p, int val ) |
1104 | 0 | { |
1105 | | /* "Operandless" and "op != BRANCH" are synonymous in practice. */ |
1106 | 0 | if (p != 0 && OP(preg, p) == BRANCH) { |
1107 | 0 | regtail(preg, OPERAND(p), val); |
1108 | 0 | } |
1109 | 0 | } |
1110 | | |
1111 | | /* |
1112 | | * regexec and friends |
1113 | | */ |
1114 | | |
1115 | | /* |
1116 | | * Forwards. |
1117 | | */ |
1118 | | static int regtry(regex_t *preg, const char *string ); |
1119 | | static int regmatch(regex_t *preg, int prog); |
1120 | | static int regrepeat(regex_t *preg, int p, int max); |
1121 | | |
1122 | | /* |
1123 | | - regexec - match a regexp against a string |
1124 | | */ |
1125 | | int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags) |
1126 | 0 | { |
1127 | 0 | const char *s; |
1128 | 0 | int scan; |
1129 | | |
1130 | | /* Be paranoid... */ |
1131 | 0 | if (preg == NULL || preg->program == NULL || string == NULL) { |
1132 | 0 | return REG_ERR_NULL_ARGUMENT; |
1133 | 0 | } |
1134 | | |
1135 | | /* Check validity of program. */ |
1136 | 0 | if (*preg->program != REG_MAGIC) { |
1137 | 0 | return REG_ERR_CORRUPTED; |
1138 | 0 | } |
1139 | | |
1140 | | #ifdef DEBUG |
1141 | | fprintf(stderr, "regexec: %s\n", string); |
1142 | | regdump(preg); |
1143 | | #endif |
1144 | | |
1145 | 0 | preg->eflags = eflags; |
1146 | 0 | preg->pmatch = pmatch; |
1147 | 0 | preg->nmatch = nmatch; |
1148 | 0 | preg->start = string; /* All offsets are computed from here */ |
1149 | | |
1150 | | /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */ |
1151 | 0 | for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) { |
1152 | 0 | int op = OP(preg, scan); |
1153 | 0 | if (op == END) |
1154 | 0 | break; |
1155 | 0 | if (op == REPX || op == REPXMIN) |
1156 | 0 | preg->program[scan + 4] = 0; |
1157 | 0 | } |
1158 | | |
1159 | | /* If there is a "must appear" string, look for it. */ |
1160 | 0 | if (preg->regmust != 0) { |
1161 | 0 | s = string; |
1162 | 0 | while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) { |
1163 | 0 | if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) { |
1164 | 0 | break; |
1165 | 0 | } |
1166 | 0 | s++; |
1167 | 0 | } |
1168 | 0 | if (s == NULL) /* Not present. */ |
1169 | 0 | return REG_NOMATCH; |
1170 | 0 | } |
1171 | | |
1172 | | /* Mark beginning of line for ^ . */ |
1173 | 0 | preg->regbol = string; |
1174 | | |
1175 | | /* Simplest case: anchored match need be tried only once (maybe per line). */ |
1176 | 0 | if (preg->reganch) { |
1177 | 0 | if (eflags & REG_NOTBOL) { |
1178 | | /* This is an anchored search, but not an BOL, so possibly skip to the next line */ |
1179 | 0 | goto nextline; |
1180 | 0 | } |
1181 | 0 | while (1) { |
1182 | 0 | if (regtry(preg, string)) { |
1183 | 0 | return REG_NOERROR; |
1184 | 0 | } |
1185 | 0 | if (*string) { |
1186 | 0 | nextline: |
1187 | 0 | if (preg->cflags & REG_NEWLINE) { |
1188 | | /* Try the next anchor? */ |
1189 | 0 | string = strchr(string, '\n'); |
1190 | 0 | if (string) { |
1191 | 0 | preg->regbol = ++string; |
1192 | 0 | continue; |
1193 | 0 | } |
1194 | 0 | } |
1195 | 0 | } |
1196 | 0 | return REG_NOMATCH; |
1197 | 0 | } |
1198 | 0 | } |
1199 | | |
1200 | | /* Messy cases: unanchored match. */ |
1201 | 0 | s = string; |
1202 | 0 | if (preg->regstart != '\0') { |
1203 | | /* We know what char it must start with. */ |
1204 | 0 | while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) { |
1205 | 0 | if (regtry(preg, s)) |
1206 | 0 | return REG_NOERROR; |
1207 | 0 | s++; |
1208 | 0 | } |
1209 | 0 | } |
1210 | 0 | else |
1211 | | /* We don't -- general case. */ |
1212 | 0 | while (1) { |
1213 | 0 | if (regtry(preg, s)) |
1214 | 0 | return REG_NOERROR; |
1215 | 0 | if (*s == '\0') { |
1216 | 0 | break; |
1217 | 0 | } |
1218 | 0 | else { |
1219 | 0 | int c; |
1220 | 0 | s += utf8_tounicode(s, &c); |
1221 | 0 | } |
1222 | 0 | } |
1223 | | |
1224 | | /* Failure. */ |
1225 | 0 | return REG_NOMATCH; |
1226 | 0 | } |
1227 | | |
1228 | | /* |
1229 | | - regtry - try match at specific point |
1230 | | */ |
1231 | | /* 0 failure, 1 success */ |
1232 | | static int regtry( regex_t *preg, const char *string ) |
1233 | 0 | { |
1234 | 0 | int i; |
1235 | |
|
1236 | 0 | preg->reginput = string; |
1237 | |
|
1238 | 0 | for (i = 0; i < preg->nmatch; i++) { |
1239 | 0 | if (preg->pmatch) { |
1240 | 0 | preg->pmatch[i].rm_so = -1; |
1241 | 0 | preg->pmatch[i].rm_eo = -1; |
1242 | 0 | } |
1243 | 0 | } |
1244 | 0 | if (regmatch(preg, 1)) { |
1245 | 0 | if (preg->pmatch) { |
1246 | 0 | preg->pmatch[0].rm_so = string - preg->start; |
1247 | 0 | preg->pmatch[0].rm_eo = preg->reginput - preg->start; |
1248 | 0 | } |
1249 | 0 | return(1); |
1250 | 0 | } else |
1251 | 0 | return(0); |
1252 | 0 | } |
1253 | | |
1254 | | /** |
1255 | | * Returns bytes matched if 'pattern' is a prefix of 'string'. |
1256 | | * |
1257 | | * If 'nocase' is non-zero, does a case-insensitive match. |
1258 | | * |
1259 | | * Returns -1 on not found. |
1260 | | */ |
1261 | | static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase) |
1262 | 0 | { |
1263 | 0 | const char *s = string; |
1264 | 0 | while (proglen && *s) { |
1265 | 0 | int ch; |
1266 | 0 | int n = reg_utf8_tounicode_case(s, &ch, nocase); |
1267 | 0 | if (ch != *prog) { |
1268 | 0 | return -1; |
1269 | 0 | } |
1270 | 0 | prog++; |
1271 | 0 | s += n; |
1272 | 0 | proglen--; |
1273 | 0 | } |
1274 | 0 | if (proglen == 0) { |
1275 | 0 | return s - string; |
1276 | 0 | } |
1277 | 0 | return -1; |
1278 | 0 | } |
1279 | | |
1280 | | /** |
1281 | | * Searches for 'c' in the range 'range'. |
1282 | | * |
1283 | | * Returns 1 if found, or 0 if not. |
1284 | | */ |
1285 | | static int reg_range_find(const int *range, int c) |
1286 | 0 | { |
1287 | 0 | while (*range) { |
1288 | | /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/ |
1289 | 0 | if (c >= range[1] && c <= (range[0] + range[1] - 1)) { |
1290 | 0 | return 1; |
1291 | 0 | } |
1292 | 0 | range += 2; |
1293 | 0 | } |
1294 | 0 | return 0; |
1295 | 0 | } |
1296 | | |
1297 | | /** |
1298 | | * Search for the character 'c' in the utf-8 string 'string'. |
1299 | | * |
1300 | | * If 'nocase' is set, the 'string' is assumed to be uppercase |
1301 | | * and 'c' is converted to uppercase before matching. |
1302 | | * |
1303 | | * Returns the byte position in the string where the 'c' was found, or |
1304 | | * NULL if not found. |
1305 | | */ |
1306 | | static const char *str_find(const char *string, int c, int nocase) |
1307 | 0 | { |
1308 | 0 | if (nocase) { |
1309 | | /* The "string" should already be converted to uppercase */ |
1310 | 0 | c = utf8_upper(c); |
1311 | 0 | } |
1312 | 0 | while (*string) { |
1313 | 0 | int ch; |
1314 | 0 | int n = reg_utf8_tounicode_case(string, &ch, nocase); |
1315 | 0 | if (c == ch) { |
1316 | 0 | return string; |
1317 | 0 | } |
1318 | 0 | string += n; |
1319 | 0 | } |
1320 | 0 | return NULL; |
1321 | 0 | } |
1322 | | |
1323 | | /** |
1324 | | * Returns true if 'ch' is an end-of-line char. |
1325 | | * |
1326 | | * In REG_NEWLINE mode, \n is considered EOL in |
1327 | | * addition to \0 |
1328 | | */ |
1329 | | static int reg_iseol(regex_t *preg, int ch) |
1330 | 0 | { |
1331 | 0 | if (preg->cflags & REG_NEWLINE) { |
1332 | 0 | return ch == '\0' || ch == '\n'; |
1333 | 0 | } |
1334 | 0 | else { |
1335 | 0 | return ch == '\0'; |
1336 | 0 | } |
1337 | 0 | } |
1338 | | |
1339 | | static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin) |
1340 | 0 | { |
1341 | 0 | int nextch = '\0'; |
1342 | 0 | const char *save; |
1343 | 0 | int no; |
1344 | 0 | int c; |
1345 | |
|
1346 | 0 | int max = preg->program[scan + 2]; |
1347 | 0 | int min = preg->program[scan + 3]; |
1348 | 0 | int next = regnext(preg, scan); |
1349 | | |
1350 | | /* |
1351 | | * Lookahead to avoid useless match attempts |
1352 | | * when we know what character comes next. |
1353 | | */ |
1354 | 0 | if (OP(preg, next) == EXACTLY) { |
1355 | 0 | nextch = preg->program[OPERAND(next)]; |
1356 | 0 | } |
1357 | 0 | save = preg->reginput; |
1358 | 0 | no = regrepeat(preg, scan + 5, max); |
1359 | 0 | if (no < min) { |
1360 | 0 | return 0; |
1361 | 0 | } |
1362 | 0 | if (matchmin) { |
1363 | | /* from min up to no */ |
1364 | 0 | max = no; |
1365 | 0 | no = min; |
1366 | 0 | } |
1367 | | /* else from no down to min */ |
1368 | 0 | while (1) { |
1369 | 0 | if (matchmin) { |
1370 | 0 | if (no > max) { |
1371 | 0 | break; |
1372 | 0 | } |
1373 | 0 | } |
1374 | 0 | else { |
1375 | 0 | if (no < min) { |
1376 | 0 | break; |
1377 | 0 | } |
1378 | 0 | } |
1379 | 0 | preg->reginput = save + utf8_index(save, no); |
1380 | 0 | reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE)); |
1381 | | /* If it could work, try it. */ |
1382 | 0 | if (reg_iseol(preg, nextch) || c == nextch) { |
1383 | 0 | if (regmatch(preg, next)) { |
1384 | 0 | return(1); |
1385 | 0 | } |
1386 | 0 | } |
1387 | 0 | if (matchmin) { |
1388 | | /* Couldn't or didn't, add one more */ |
1389 | 0 | no++; |
1390 | 0 | } |
1391 | 0 | else { |
1392 | | /* Couldn't or didn't -- back up. */ |
1393 | 0 | no--; |
1394 | 0 | } |
1395 | 0 | } |
1396 | 0 | return(0); |
1397 | 0 | } |
1398 | | |
1399 | | static int regmatchrepeat(regex_t *preg, int scan, int matchmin) |
1400 | 0 | { |
1401 | 0 | int *scanpt = preg->program + scan; |
1402 | |
|
1403 | 0 | int max = scanpt[2]; |
1404 | 0 | int min = scanpt[3]; |
1405 | | |
1406 | | /* Have we reached min? */ |
1407 | 0 | if (scanpt[4] < min) { |
1408 | | /* No, so get another one */ |
1409 | 0 | scanpt[4]++; |
1410 | 0 | if (regmatch(preg, scan + 5)) { |
1411 | 0 | return 1; |
1412 | 0 | } |
1413 | 0 | scanpt[4]--; |
1414 | 0 | return 0; |
1415 | 0 | } |
1416 | 0 | if (scanpt[4] > max) { |
1417 | 0 | return 0; |
1418 | 0 | } |
1419 | | |
1420 | 0 | if (matchmin) { |
1421 | | /* minimal, so try other branch first */ |
1422 | 0 | if (regmatch(preg, regnext(preg, scan))) { |
1423 | 0 | return 1; |
1424 | 0 | } |
1425 | | /* No, so try one more */ |
1426 | 0 | scanpt[4]++; |
1427 | 0 | if (regmatch(preg, scan + 5)) { |
1428 | 0 | return 1; |
1429 | 0 | } |
1430 | 0 | scanpt[4]--; |
1431 | 0 | return 0; |
1432 | 0 | } |
1433 | | /* maximal, so try this branch again */ |
1434 | 0 | if (scanpt[4] < max) { |
1435 | 0 | scanpt[4]++; |
1436 | 0 | if (regmatch(preg, scan + 5)) { |
1437 | 0 | return 1; |
1438 | 0 | } |
1439 | 0 | scanpt[4]--; |
1440 | 0 | } |
1441 | | /* At this point we are at max with no match. Try the other branch */ |
1442 | 0 | return regmatch(preg, regnext(preg, scan)); |
1443 | 0 | } |
1444 | | |
1445 | | /* |
1446 | | - regmatch - main matching routine |
1447 | | * |
1448 | | * Conceptually the strategy is simple: check to see whether the current |
1449 | | * node matches, call self recursively to see whether the rest matches, |
1450 | | * and then act accordingly. In practice we make some effort to avoid |
1451 | | * recursion, in particular by going through "ordinary" nodes (that don't |
1452 | | * need to know whether the rest of the match failed) by a loop instead of |
1453 | | * by recursion. |
1454 | | */ |
1455 | | /* 0 failure, 1 success */ |
1456 | | static int regmatch(regex_t *preg, int prog) |
1457 | 0 | { |
1458 | 0 | int scan; /* Current node. */ |
1459 | 0 | int next; /* Next node. */ |
1460 | 0 | const char *save; |
1461 | |
|
1462 | 0 | scan = prog; |
1463 | |
|
1464 | | #ifdef DEBUG |
1465 | | if (scan != 0 && regnarrate) |
1466 | | fprintf(stderr, "%s(\n", regprop(scan)); |
1467 | | #endif |
1468 | 0 | while (scan != 0) { |
1469 | 0 | int n; |
1470 | 0 | int c; |
1471 | | #ifdef DEBUG |
1472 | | if (regnarrate) { |
1473 | | fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */ |
1474 | | } |
1475 | | #endif |
1476 | 0 | next = regnext(preg, scan); |
1477 | 0 | n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE)); |
1478 | |
|
1479 | 0 | switch (OP(preg, scan)) { |
1480 | 0 | case BOLX: |
1481 | 0 | if ((preg->eflags & REG_NOTBOL)) { |
1482 | 0 | return(0); |
1483 | 0 | } |
1484 | | /* Fall through */ |
1485 | 0 | case BOL: |
1486 | 0 | if (preg->reginput != preg->regbol) { |
1487 | 0 | return(0); |
1488 | 0 | } |
1489 | 0 | break; |
1490 | 0 | case EOLX: |
1491 | 0 | if (c != 0) { |
1492 | | /* For EOLX, only match real end of line, not newline */ |
1493 | 0 | return 0; |
1494 | 0 | } |
1495 | 0 | break; |
1496 | 0 | case EOL: |
1497 | 0 | if (!reg_iseol(preg, c)) { |
1498 | 0 | return(0); |
1499 | 0 | } |
1500 | 0 | break; |
1501 | 0 | case WORDA: |
1502 | | /* Must be looking at a letter, digit, or _ */ |
1503 | 0 | if ((!isalnum(UCHAR(c))) && c != '_') |
1504 | 0 | return(0); |
1505 | | /* Prev must be BOL or nonword */ |
1506 | 0 | if (preg->reginput > preg->regbol && |
1507 | 0 | (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_')) |
1508 | 0 | return(0); |
1509 | 0 | break; |
1510 | 0 | case WORDZ: |
1511 | | /* Can't match at BOL */ |
1512 | 0 | if (preg->reginput > preg->regbol) { |
1513 | | /* Current must be EOL or nonword */ |
1514 | 0 | if (reg_iseol(preg, c) || !(isalnum(UCHAR(c)) || c == '_')) { |
1515 | 0 | c = preg->reginput[-1]; |
1516 | | /* Previous must be word */ |
1517 | 0 | if (isalnum(UCHAR(c)) || c == '_') { |
1518 | 0 | break; |
1519 | 0 | } |
1520 | 0 | } |
1521 | 0 | } |
1522 | | /* No */ |
1523 | 0 | return(0); |
1524 | | |
1525 | 0 | case ANY: |
1526 | 0 | if (reg_iseol(preg, c)) |
1527 | 0 | return 0; |
1528 | 0 | preg->reginput += n; |
1529 | 0 | break; |
1530 | 0 | case EXACTLY: { |
1531 | 0 | int opnd; |
1532 | 0 | int len; |
1533 | 0 | int slen; |
1534 | |
|
1535 | 0 | opnd = OPERAND(scan); |
1536 | 0 | len = str_int_len(preg->program + opnd); |
1537 | |
|
1538 | 0 | slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE); |
1539 | 0 | if (slen < 0) { |
1540 | 0 | return(0); |
1541 | 0 | } |
1542 | 0 | preg->reginput += slen; |
1543 | 0 | } |
1544 | 0 | break; |
1545 | 0 | case ANYOF: |
1546 | 0 | if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) { |
1547 | 0 | return(0); |
1548 | 0 | } |
1549 | 0 | preg->reginput += n; |
1550 | 0 | break; |
1551 | 0 | case ANYBUT: |
1552 | 0 | if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) { |
1553 | 0 | return(0); |
1554 | 0 | } |
1555 | 0 | preg->reginput += n; |
1556 | 0 | break; |
1557 | 0 | case NOTHING: |
1558 | 0 | break; |
1559 | 0 | case BACK: |
1560 | 0 | break; |
1561 | 0 | case BRANCH: |
1562 | 0 | if (OP(preg, next) != BRANCH) /* No choice. */ |
1563 | 0 | next = OPERAND(scan); /* Avoid recursion. */ |
1564 | 0 | else { |
1565 | 0 | do { |
1566 | 0 | save = preg->reginput; |
1567 | 0 | if (regmatch(preg, OPERAND(scan))) { |
1568 | 0 | return(1); |
1569 | 0 | } |
1570 | 0 | preg->reginput = save; |
1571 | 0 | scan = regnext(preg, scan); |
1572 | 0 | } while (scan != 0 && OP(preg, scan) == BRANCH); |
1573 | 0 | return(0); |
1574 | | /* NOTREACHED */ |
1575 | 0 | } |
1576 | 0 | break; |
1577 | 0 | case REP: |
1578 | 0 | case REPMIN: |
1579 | 0 | return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN); |
1580 | | |
1581 | 0 | case REPX: |
1582 | 0 | case REPXMIN: |
1583 | 0 | return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN); |
1584 | | |
1585 | 0 | case END: |
1586 | 0 | return 1; /* Success! */ |
1587 | | |
1588 | 0 | case OPENNC: |
1589 | 0 | case CLOSENC: |
1590 | 0 | return regmatch(preg, next); |
1591 | | |
1592 | 0 | default: |
1593 | 0 | if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) { |
1594 | 0 | save = preg->reginput; |
1595 | 0 | if (regmatch(preg, next)) { |
1596 | 0 | if (OP(preg, scan) < CLOSE) { |
1597 | 0 | int no = OP(preg, scan) - OPEN; |
1598 | 0 | if (no < preg->nmatch && preg->pmatch && preg->pmatch[no].rm_so == -1) { |
1599 | 0 | preg->pmatch[no].rm_so = save - preg->start; |
1600 | 0 | } |
1601 | 0 | } |
1602 | 0 | else { |
1603 | 0 | int no = OP(preg, scan) - CLOSE; |
1604 | 0 | if (no < preg->nmatch && preg->pmatch && preg->pmatch[no].rm_eo == -1) { |
1605 | 0 | preg->pmatch[no].rm_eo = save - preg->start; |
1606 | 0 | } |
1607 | 0 | } |
1608 | 0 | return(1); |
1609 | 0 | } |
1610 | | /* Restore input position after failure */ |
1611 | 0 | preg->reginput = save; |
1612 | 0 | return(0); |
1613 | 0 | } |
1614 | 0 | return REG_ERR_INTERNAL; |
1615 | 0 | } |
1616 | | |
1617 | 0 | scan = next; |
1618 | 0 | } |
1619 | | |
1620 | | /* |
1621 | | * We get here only if there's trouble -- normally "case END" is |
1622 | | * the terminating point. |
1623 | | */ |
1624 | 0 | return REG_ERR_INTERNAL; |
1625 | 0 | } |
1626 | | |
1627 | | /* |
1628 | | - regrepeat - repeatedly match something simple, report how many |
1629 | | */ |
1630 | | static int regrepeat(regex_t *preg, int p, int max) |
1631 | 0 | { |
1632 | 0 | int count = 0; |
1633 | 0 | const char *scan; |
1634 | 0 | int opnd; |
1635 | 0 | int ch; |
1636 | 0 | int n; |
1637 | |
|
1638 | 0 | scan = preg->reginput; |
1639 | 0 | opnd = OPERAND(p); |
1640 | 0 | switch (OP(preg, p)) { |
1641 | 0 | case ANY: |
1642 | 0 | while (!reg_iseol(preg, *scan) && count < max) { |
1643 | 0 | count++; |
1644 | 0 | scan += utf8_charlen(*scan); |
1645 | 0 | } |
1646 | 0 | break; |
1647 | 0 | case EXACTLY: |
1648 | 0 | while (count < max) { |
1649 | 0 | n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE); |
1650 | 0 | if (preg->program[opnd] != ch) { |
1651 | 0 | break; |
1652 | 0 | } |
1653 | 0 | count++; |
1654 | 0 | scan += n; |
1655 | 0 | } |
1656 | 0 | break; |
1657 | 0 | case ANYOF: |
1658 | 0 | while (count < max) { |
1659 | 0 | n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE); |
1660 | 0 | if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) { |
1661 | 0 | break; |
1662 | 0 | } |
1663 | 0 | count++; |
1664 | 0 | scan += n; |
1665 | 0 | } |
1666 | 0 | break; |
1667 | 0 | case ANYBUT: |
1668 | 0 | while (count < max) { |
1669 | 0 | n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE); |
1670 | 0 | if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) { |
1671 | 0 | break; |
1672 | 0 | } |
1673 | 0 | count++; |
1674 | 0 | scan += n; |
1675 | 0 | } |
1676 | 0 | break; |
1677 | 0 | default: /* Oh dear. Called inappropriately. */ |
1678 | 0 | preg->err = REG_ERR_INTERNAL; |
1679 | 0 | count = 0; /* Best compromise. */ |
1680 | 0 | break; |
1681 | 0 | } |
1682 | 0 | preg->reginput = scan; |
1683 | |
|
1684 | 0 | return(count); |
1685 | 0 | } |
1686 | | |
1687 | | /* |
1688 | | - regnext - dig the "next" pointer out of a node |
1689 | | */ |
1690 | | static int regnext(regex_t *preg, int p ) |
1691 | 0 | { |
1692 | 0 | int offset; |
1693 | |
|
1694 | 0 | offset = NEXT(preg, p); |
1695 | |
|
1696 | 0 | if (offset == 0) |
1697 | 0 | return 0; |
1698 | | |
1699 | 0 | if (OP(preg, p) == BACK) |
1700 | 0 | return(p-offset); |
1701 | 0 | else |
1702 | 0 | return(p+offset); |
1703 | 0 | } |
1704 | | |
1705 | | /* |
1706 | | - regopsize - returns the size of opcode + operands at 'p' in words |
1707 | | */ |
1708 | | static int regopsize(regex_t *preg, int p ) |
1709 | 0 | { |
1710 | | /* Almost all opcodes are 2 words, but some are more */ |
1711 | 0 | switch (OP(preg, p)) { |
1712 | 0 | case REP: |
1713 | 0 | case REPMIN: |
1714 | 0 | case REPX: |
1715 | 0 | case REPXMIN: |
1716 | 0 | return 5; |
1717 | | |
1718 | 0 | case ANYOF: |
1719 | 0 | case ANYBUT: |
1720 | 0 | case EXACTLY: { |
1721 | 0 | int s = p + 2; |
1722 | 0 | while (preg->program[s++]) { |
1723 | 0 | } |
1724 | 0 | return s - p; |
1725 | 0 | } |
1726 | 0 | } |
1727 | 0 | return 2; |
1728 | 0 | } |
1729 | | |
1730 | | #if defined(DEBUG) && !defined(JIM_BOOTSTRAP) |
1731 | | |
1732 | | /* |
1733 | | - regdump - dump a regexp onto stdout in vaguely comprehensible form |
1734 | | */ |
1735 | | static void regdump(regex_t *preg) |
1736 | | { |
1737 | | int s; |
1738 | | int op = EXACTLY; /* Arbitrary non-END op. */ |
1739 | | int next; |
1740 | | char buf[MAX_UTF8_LEN + 1]; |
1741 | | |
1742 | | int i; |
1743 | | for (i = 1; i < preg->p; i++) { |
1744 | | printf("%02x ", (unsigned char)preg->program[i]); |
1745 | | if (i % 16 == 0) { |
1746 | | printf("\n"); |
1747 | | } |
1748 | | } |
1749 | | printf("\n"); |
1750 | | |
1751 | | s = 1; |
1752 | | while (op != END && s < preg->p) { /* While that wasn't END last time... */ |
1753 | | op = OP(preg, s); |
1754 | | printf("%3d: %s", s, regprop(op)); /* Where, what. */ |
1755 | | next = regnext(preg, s); |
1756 | | if (next == 0) /* Next ptr. */ |
1757 | | printf("(0)"); |
1758 | | else |
1759 | | printf("(%d)", next); |
1760 | | s += 2; |
1761 | | if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) { |
1762 | | int max = preg->program[s]; |
1763 | | int min = preg->program[s + 1]; |
1764 | | if (max == 65535) { |
1765 | | printf("{%d,*}", min); |
1766 | | } |
1767 | | else { |
1768 | | printf("{%d,%d}", min, max); |
1769 | | } |
1770 | | printf(" %d", preg->program[s + 2]); |
1771 | | s += 3; |
1772 | | } |
1773 | | else if (op == ANYOF || op == ANYBUT) { |
1774 | | /* set of ranges */ |
1775 | | |
1776 | | while (preg->program[s]) { |
1777 | | int len = preg->program[s++]; |
1778 | | int first = preg->program[s++]; |
1779 | | buf[utf8_getchars(buf, first)] = 0; |
1780 | | printf("%s", buf); |
1781 | | if (len > 1) { |
1782 | | buf[utf8_getchars(buf, first + len - 1)] = 0; |
1783 | | printf("-%s", buf); |
1784 | | } |
1785 | | } |
1786 | | s++; |
1787 | | } |
1788 | | else if (op == EXACTLY) { |
1789 | | /* Literal string, where present. */ |
1790 | | |
1791 | | while (preg->program[s]) { |
1792 | | buf[utf8_getchars(buf, preg->program[s])] = 0; |
1793 | | printf("%s", buf); |
1794 | | s++; |
1795 | | } |
1796 | | s++; |
1797 | | } |
1798 | | putchar('\n'); |
1799 | | } |
1800 | | |
1801 | | if (op == END) { |
1802 | | /* Header fields of interest. */ |
1803 | | if (preg->regstart) { |
1804 | | buf[utf8_getchars(buf, preg->regstart)] = 0; |
1805 | | printf("start '%s' ", buf); |
1806 | | } |
1807 | | if (preg->reganch) |
1808 | | printf("anchored "); |
1809 | | if (preg->regmust != 0) { |
1810 | | int i; |
1811 | | printf("must have:"); |
1812 | | for (i = 0; i < preg->regmlen; i++) { |
1813 | | putchar(preg->program[preg->regmust + i]); |
1814 | | } |
1815 | | putchar('\n'); |
1816 | | } |
1817 | | } |
1818 | | printf("\n"); |
1819 | | } |
1820 | | |
1821 | | /* |
1822 | | - regprop - printable representation of opcode |
1823 | | */ |
1824 | | static const char *regprop( int op ) |
1825 | | { |
1826 | | static char buf[50]; |
1827 | | |
1828 | | switch (op) { |
1829 | | case BOL: |
1830 | | return "BOL"; |
1831 | | case EOL: |
1832 | | return "EOL"; |
1833 | | case BOLX: |
1834 | | return "BOLX"; |
1835 | | case EOLX: |
1836 | | return "EOLX"; |
1837 | | case ANY: |
1838 | | return "ANY"; |
1839 | | case ANYOF: |
1840 | | return "ANYOF"; |
1841 | | case ANYBUT: |
1842 | | return "ANYBUT"; |
1843 | | case BRANCH: |
1844 | | return "BRANCH"; |
1845 | | case EXACTLY: |
1846 | | return "EXACTLY"; |
1847 | | case NOTHING: |
1848 | | return "NOTHING"; |
1849 | | case BACK: |
1850 | | return "BACK"; |
1851 | | case END: |
1852 | | return "END"; |
1853 | | case REP: |
1854 | | return "REP"; |
1855 | | case REPMIN: |
1856 | | return "REPMIN"; |
1857 | | case REPX: |
1858 | | return "REPX"; |
1859 | | case REPXMIN: |
1860 | | return "REPXMIN"; |
1861 | | case WORDA: |
1862 | | return "WORDA"; |
1863 | | case WORDZ: |
1864 | | return "WORDZ"; |
1865 | | case OPENNC: |
1866 | | return "OPEN"; |
1867 | | case CLOSENC: |
1868 | | return "CLOSE"; |
1869 | | default: |
1870 | | if (op >= OPEN && op < CLOSE) { |
1871 | | snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN); |
1872 | | } |
1873 | | else if (op >= CLOSE && op < CLOSE_END) { |
1874 | | snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE); |
1875 | | } |
1876 | | else { |
1877 | | snprintf(buf, sizeof(buf), "?%d?\n", op); |
1878 | | } |
1879 | | return(buf); |
1880 | | } |
1881 | | } |
1882 | | #endif /* JIM_BOOTSTRAP */ |
1883 | | |
1884 | | size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) |
1885 | 0 | { |
1886 | 0 | static const char *error_strings[] = { |
1887 | 0 | "success", |
1888 | 0 | "no match", |
1889 | 0 | "bad pattern", |
1890 | 0 | "null argument", |
1891 | 0 | "unknown error", |
1892 | 0 | "too big", |
1893 | 0 | "out of memory", |
1894 | 0 | "too many ()", |
1895 | 0 | "parentheses () not balanced", |
1896 | 0 | "braces {} not balanced", |
1897 | 0 | "invalid repetition count(s)", |
1898 | 0 | "extra characters", |
1899 | 0 | "*+ of empty atom", |
1900 | 0 | "nested count", |
1901 | 0 | "internal error", |
1902 | 0 | "count follows nothing", |
1903 | 0 | "invalid escape \\ sequence", |
1904 | 0 | "corrupted program", |
1905 | 0 | "contains null char", |
1906 | 0 | "brackets [] not balanced", |
1907 | 0 | }; |
1908 | 0 | const char *err; |
1909 | |
|
1910 | 0 | (void)preg; |
1911 | |
|
1912 | 0 | if (errcode < 0 || errcode >= REG_ERR_NUM) { |
1913 | 0 | err = "Bad error code"; |
1914 | 0 | } |
1915 | 0 | else { |
1916 | 0 | err = error_strings[errcode]; |
1917 | 0 | } |
1918 | |
|
1919 | 0 | return snprintf(errbuf, errbuf_size, "%s", err); |
1920 | 0 | } |
1921 | | |
1922 | | void regfree(regex_t *preg) |
1923 | 0 | { |
1924 | 0 | free(preg->program); |
1925 | 0 | } |
1926 | | |
1927 | | #endif |