/src/keystone/llvm/lib/Support/regcomp.c
Line | Count | Source |
1 | | /*- |
2 | | * This code is derived from OpenBSD's libc/regex, original license follows: |
3 | | * |
4 | | * Copyright (c) 1992, 1993, 1994 Henry Spencer. |
5 | | * Copyright (c) 1992, 1993, 1994 |
6 | | * The Regents of the University of California. All rights reserved. |
7 | | * |
8 | | * This code is derived from software contributed to Berkeley by |
9 | | * Henry Spencer. |
10 | | * |
11 | | * Redistribution and use in source and binary forms, with or without |
12 | | * modification, are permitted provided that the following conditions |
13 | | * are met: |
14 | | * 1. Redistributions of source code must retain the above copyright |
15 | | * notice, this list of conditions and the following disclaimer. |
16 | | * 2. Redistributions in binary form must reproduce the above copyright |
17 | | * notice, this list of conditions and the following disclaimer in the |
18 | | * documentation and/or other materials provided with the distribution. |
19 | | * 3. Neither the name of the University nor the names of its contributors |
20 | | * may be used to endorse or promote products derived from this software |
21 | | * without specific prior written permission. |
22 | | * |
23 | | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
24 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
25 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
26 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
27 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
28 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
29 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
30 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
31 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
32 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
33 | | * SUCH DAMAGE. |
34 | | * |
35 | | * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 |
36 | | */ |
37 | | |
38 | | #include <sys/types.h> |
39 | | #include <stdio.h> |
40 | | #include <string.h> |
41 | | #include <ctype.h> |
42 | | #include <limits.h> |
43 | | #include <stdlib.h> |
44 | | #include "regex_impl.h" |
45 | | |
46 | | #include "regutils.h" |
47 | | #include "regex2.h" |
48 | | |
49 | | #include "regcclass.h" |
50 | | #include "regcname.h" |
51 | | |
52 | | #include "llvm/Config/config.h" |
53 | | #if HAVE_STDINT_H |
54 | | #include <stdint.h> |
55 | | #else |
56 | | /* Pessimistically bound memory use */ |
57 | | #define SIZE_MAX UINT_MAX |
58 | | #endif |
59 | | |
60 | | /* |
61 | | * parse structure, passed up and down to avoid global variables and |
62 | | * other clumsinesses |
63 | | */ |
64 | | struct parse { |
65 | | char *next; /* next character in RE */ |
66 | | char *end; /* end of string (-> NUL normally) */ |
67 | | int error; /* has an error been seen? */ |
68 | | sop *strip; /* malloced strip */ |
69 | | sopno ssize; /* malloced strip size (allocated) */ |
70 | | sopno slen; /* malloced strip length (used) */ |
71 | | int ncsalloc; /* number of csets allocated */ |
72 | | struct re_guts *g; |
73 | 26.7k | # define NPAREN 10 /* we need to remember () 1-9 for back refs */ |
74 | | sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ |
75 | | sopno pend[NPAREN]; /* -> ) ([0] unused) */ |
76 | | }; |
77 | | |
78 | | static void p_ere(struct parse *, int); |
79 | | static void p_ere_exp(struct parse *); |
80 | | static void p_str(struct parse *); |
81 | | static void p_bre(struct parse *, int, int); |
82 | | static int p_simp_re(struct parse *, int); |
83 | | static int p_count(struct parse *); |
84 | | static void p_bracket(struct parse *); |
85 | | static void p_b_term(struct parse *, cset *); |
86 | | static void p_b_cclass(struct parse *, cset *); |
87 | | static void p_b_eclass(struct parse *, cset *); |
88 | | static char p_b_symbol(struct parse *); |
89 | | static char p_b_coll_elem(struct parse *, int); |
90 | | static char othercase(int); |
91 | | static void bothcases(struct parse *, int); |
92 | | static void ordinary(struct parse *, int); |
93 | | static void nonnewline(struct parse *); |
94 | | static void repeat(struct parse *, sopno, int, int); |
95 | | static int seterr(struct parse *, int); |
96 | | static cset *allocset(struct parse *); |
97 | | static void freeset(struct parse *, cset *); |
98 | | static int freezeset(struct parse *, cset *); |
99 | | static int firstch(struct parse *, cset *); |
100 | | static int nch(struct parse *, cset *); |
101 | | static void mcadd(struct parse *, cset *, const char *); |
102 | | static void mcinvert(struct parse *, cset *); |
103 | | static void mccase(struct parse *, cset *); |
104 | | static int isinsets(struct re_guts *, int); |
105 | | static int samesets(struct re_guts *, int, int); |
106 | | static void categorize(struct parse *, struct re_guts *); |
107 | | static sopno dupl(struct parse *, sopno, sopno); |
108 | | static void doemit(struct parse *, sop, size_t); |
109 | | static void doinsert(struct parse *, sop, size_t, sopno); |
110 | | static void dofwd(struct parse *, sopno, sop); |
111 | | static void enlarge(struct parse *, sopno); |
112 | | static void stripsnug(struct parse *, struct re_guts *); |
113 | | static void findmust(struct parse *, struct re_guts *); |
114 | | static sopno pluscount(struct parse *, struct re_guts *); |
115 | | |
116 | | static char nuls[10]; /* place to point scanner in event of error */ |
117 | | |
/*
 * macros for use with parse structure
 * BEWARE:  these know that the parse structure is named `p' !!!
 */

/* how much input is left */
#define	MORE()	(p->next < p->end)
#define	MORE2()	(p->next+1 < p->end)

/* look at input without consuming it */
#define	PEEK()	(*p->next)
#define	PEEK2()	(*(p->next+1))
#define	SEE(c)	(MORE() && PEEK() == (c))
#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))

/* consume input; EAT/EATTWO consume only on a match and yield 1 or 0 */
#define	NEXT()	(p->next++)
#define	NEXT2()	(p->next += 2)
#define	NEXTn(n)	(p->next += (n))
#define	GETNEXT()	(*p->next++)
#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)

/* error recording; REQUIRE records error e when condition co is false */
#define	SETERROR(e)	seterr(p, (e))
#define	REQUIRE(co, e)	(void)((co) || SETERROR(e))
#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))

/* positions in the strip, expressed through the used length p->slen */
#define	HERE()		(p->slen)
#define	THERE()		(p->slen - 1)
#define	THERETHERE()	(p->slen - 2)
#define	DROP(n)	(p->slen -= (n))

/* appending to and patching the strip */
#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)

/* largest permitted repetition count; INFINITY stands for "no upper bound" */
#ifdef	_POSIX2_RE_DUP_MAX
#define	DUPMAX	_POSIX2_RE_DUP_MAX
#else
#define	DUPMAX	255
#endif
#define	INFINITY	(DUPMAX + 1)

#ifndef NDEBUG
static int never = 0;		/* for use in asserts; shuts lint up */
#else
#define	never	0		/* some <assert.h>s have bugs too */
#endif
160 | | |
161 | | /* |
162 | | - llvm_regcomp - interface for parser and compilation |
163 | | */ |
164 | | int /* 0 success, otherwise REG_something */ |
165 | | llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags) |
166 | 653 | { |
167 | 653 | struct parse pa; |
168 | 653 | struct re_guts *g; |
169 | 653 | struct parse *p = &pa; |
170 | 653 | int i; |
171 | 653 | size_t len; |
172 | | #ifdef REDEBUG |
173 | | # define GOODFLAGS(f) (f) |
174 | | #else |
175 | 653 | # define GOODFLAGS(f) ((f)&~REG_DUMP) |
176 | 653 | #endif |
177 | | |
178 | 653 | cflags = GOODFLAGS(cflags); |
179 | 653 | if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) |
180 | 0 | return(REG_INVARG); |
181 | | |
182 | 653 | if (cflags®_PEND) { |
183 | 653 | if (preg->re_endp < pattern) |
184 | 0 | return(REG_INVARG); |
185 | 653 | len = preg->re_endp - pattern; |
186 | 653 | } else |
187 | 0 | len = strlen((const char *)pattern); |
188 | | |
189 | | /* do the mallocs early so failure handling is easy */ |
190 | 653 | g = (struct re_guts *)malloc(sizeof(struct re_guts) + |
191 | 653 | (NC-1)*sizeof(cat_t)); |
192 | 653 | if (g == NULL) |
193 | 0 | return(REG_ESPACE); |
194 | 653 | p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ |
195 | 653 | p->strip = (sop *)calloc(p->ssize, sizeof(sop)); |
196 | 653 | p->slen = 0; |
197 | 653 | if (p->strip == NULL) { |
198 | 0 | free((char *)g); |
199 | 0 | return(REG_ESPACE); |
200 | 0 | } |
201 | | |
202 | | /* set things up */ |
203 | 653 | p->g = g; |
204 | 653 | p->next = (char *)pattern; /* convenience; we do not modify it */ |
205 | 653 | p->end = p->next + len; |
206 | 653 | p->error = 0; |
207 | 653 | p->ncsalloc = 0; |
208 | 7.18k | for (i = 0; i < NPAREN; i++) { |
209 | 6.53k | p->pbegin[i] = 0; |
210 | 6.53k | p->pend[i] = 0; |
211 | 6.53k | } |
212 | 653 | g->csetsize = NC; |
213 | 653 | g->sets = NULL; |
214 | 653 | g->setbits = NULL; |
215 | 653 | g->ncsets = 0; |
216 | 653 | g->cflags = cflags; |
217 | 653 | g->iflags = 0; |
218 | 653 | g->nbol = 0; |
219 | 653 | g->neol = 0; |
220 | 653 | g->must = NULL; |
221 | 653 | g->mlen = 0; |
222 | 653 | g->nsub = 0; |
223 | 653 | g->ncategories = 1; /* category 0 is "everything else" */ |
224 | 653 | g->categories = &g->catspace[-(CHAR_MIN)]; |
225 | 653 | (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); |
226 | 653 | g->backrefs = 0; |
227 | | |
228 | | /* do it */ |
229 | 653 | EMIT(OEND, 0); |
230 | 653 | g->firststate = THERE(); |
231 | 653 | if (cflags®_EXTENDED) |
232 | 653 | p_ere(p, OUT); |
233 | 0 | else if (cflags®_NOSPEC) |
234 | 0 | p_str(p); |
235 | 0 | else |
236 | 0 | p_bre(p, OUT, OUT); |
237 | 653 | EMIT(OEND, 0); |
238 | 653 | g->laststate = THERE(); |
239 | | |
240 | | /* tidy up loose ends and fill things in */ |
241 | 653 | categorize(p, g); |
242 | 653 | stripsnug(p, g); |
243 | 653 | findmust(p, g); |
244 | 653 | g->nplus = pluscount(p, g); |
245 | 653 | g->magic = MAGIC2; |
246 | 653 | preg->re_nsub = g->nsub; |
247 | 653 | preg->re_g = g; |
248 | 653 | preg->re_magic = MAGIC1; |
249 | 653 | #ifndef REDEBUG |
250 | | /* not debugging, so can't rely on the assert() in llvm_regexec() */ |
251 | 653 | if (g->iflags®EX_BAD) |
252 | 0 | SETERROR(REG_ASSERT); |
253 | 653 | #endif |
254 | | |
255 | | /* win or lose, we're done */ |
256 | 653 | if (p->error != 0) /* lose */ |
257 | 0 | llvm_regfree(preg); |
258 | 653 | return(p->error); |
259 | 653 | } |
260 | | |
261 | | /* |
262 | | - p_ere - ERE parser top level, concatenation and alternation |
263 | | */ |
264 | | static void |
265 | | p_ere(struct parse *p, int stop) /* character this ERE should end at */ |
266 | 3.91k | { |
267 | 3.91k | char c; |
268 | 3.91k | sopno prevback = 0; |
269 | 3.91k | sopno prevfwd = 0; |
270 | 3.91k | sopno conc; |
271 | 3.91k | int first = 1; /* is this the first alternative? */ |
272 | | |
273 | 5.22k | for (;;) { |
274 | | /* do a bunch of concatenated expressions */ |
275 | 5.22k | conc = HERE(); |
276 | 20.2k | while (MORE() && (c = PEEK()) != '|' && c != stop) |
277 | 15.0k | p_ere_exp(p); |
278 | 5.22k | REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ |
279 | | |
280 | 5.22k | if (!EAT('|')) |
281 | 3.91k | break; /* NOTE BREAK OUT */ |
282 | | |
283 | 1.30k | if (first) { |
284 | 1.30k | INSERT(OCH_, conc); /* offset is wrong */ |
285 | 1.30k | prevfwd = conc; |
286 | 1.30k | prevback = conc; |
287 | 1.30k | first = 0; |
288 | 1.30k | } |
289 | 1.30k | ASTERN(OOR1, prevback); |
290 | 1.30k | prevback = THERE(); |
291 | 1.30k | AHEAD(prevfwd); /* fix previous offset */ |
292 | 1.30k | prevfwd = HERE(); |
293 | 1.30k | EMIT(OOR2, 0); /* offset is very wrong */ |
294 | 1.30k | } |
295 | | |
296 | 3.91k | if (!first) { /* tail-end fixups */ |
297 | 1.30k | AHEAD(prevfwd); |
298 | 1.30k | ASTERN(O_CH, prevback); |
299 | 1.30k | } |
300 | | |
301 | 3.91k | assert(!MORE() || SEE(stop)); |
302 | 3.91k | } |
303 | | |
304 | | /* |
305 | | - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op |
306 | | */ |
307 | | static void |
308 | | p_ere_exp(struct parse *p) |
309 | 15.0k | { |
310 | 15.0k | char c; |
311 | 15.0k | sopno pos; |
312 | 15.0k | int count; |
313 | 15.0k | int count2; |
314 | 15.0k | int backrefnum; |
315 | 15.0k | sopno subno; |
316 | 15.0k | int wascaret = 0; |
317 | | |
318 | 15.0k | assert(MORE()); /* caller should have ensured this */ |
319 | 15.0k | c = GETNEXT(); |
320 | | |
321 | 15.0k | pos = HERE(); |
322 | 15.0k | switch (c) { |
323 | 3.26k | case '(': |
324 | 3.26k | REQUIRE(MORE(), REG_EPAREN); |
325 | 3.26k | p->g->nsub++; |
326 | 3.26k | subno = p->g->nsub; |
327 | 3.26k | if (subno < NPAREN) |
328 | 3.26k | p->pbegin[subno] = HERE(); |
329 | 3.26k | EMIT(OLPAREN, subno); |
330 | 3.26k | if (!SEE(')')) |
331 | 3.26k | p_ere(p, ')'); |
332 | 3.26k | if (subno < NPAREN) { |
333 | 3.26k | p->pend[subno] = HERE(); |
334 | 3.26k | assert(p->pend[subno] != 0); |
335 | 3.26k | } |
336 | 3.26k | EMIT(ORPAREN, subno); |
337 | 3.26k | MUSTEAT(')', REG_EPAREN); |
338 | 3.26k | break; |
339 | 0 | #ifndef POSIX_MISTAKE |
340 | 0 | case ')': /* happens only if no current unmatched ( */ |
341 | | /* |
342 | | * You may ask, why the ifndef? Because I didn't notice |
343 | | * this until slightly too late for 1003.2, and none of the |
344 | | * other 1003.2 regular-expression reviewers noticed it at |
345 | | * all. So an unmatched ) is legal POSIX, at least until |
346 | | * we can get it fixed. |
347 | | */ |
348 | 0 | SETERROR(REG_EPAREN); |
349 | 0 | break; |
350 | 0 | #endif |
351 | 653 | case '^': |
352 | 653 | EMIT(OBOL, 0); |
353 | 653 | p->g->iflags |= USEBOL; |
354 | 653 | p->g->nbol++; |
355 | 653 | wascaret = 1; |
356 | 653 | break; |
357 | 653 | case '$': |
358 | 653 | EMIT(OEOL, 0); |
359 | 653 | p->g->iflags |= USEEOL; |
360 | 653 | p->g->neol++; |
361 | 653 | break; |
362 | 0 | case '|': |
363 | 0 | SETERROR(REG_EMPTY); |
364 | 0 | break; |
365 | 0 | case '*': |
366 | 0 | case '+': |
367 | 0 | case '?': |
368 | 0 | SETERROR(REG_BADRPT); |
369 | 0 | break; |
370 | 0 | case '.': |
371 | 0 | if (p->g->cflags®_NEWLINE) |
372 | 0 | nonnewline(p); |
373 | 0 | else |
374 | 0 | EMIT(OANY, 0); |
375 | 0 | break; |
376 | 4.57k | case '[': |
377 | 4.57k | p_bracket(p); |
378 | 4.57k | break; |
379 | 0 | case '\\': |
380 | 0 | REQUIRE(MORE(), REG_EESCAPE); |
381 | 0 | c = GETNEXT(); |
382 | 0 | if (c >= '1' && c <= '9') { |
383 | | /* \[0-9] is taken to be a back-reference to a previously specified |
384 | | * matching group. backrefnum will hold the number. The matching |
385 | | * group must exist (i.e. if \4 is found there must have been at |
386 | | * least 4 matching groups specified in the pattern previously). |
387 | | */ |
388 | 0 | backrefnum = c - '0'; |
389 | 0 | if (p->pend[backrefnum] == 0) { |
390 | 0 | SETERROR(REG_ESUBREG); |
391 | 0 | break; |
392 | 0 | } |
393 | | |
394 | | /* Make sure everything checks out and emit the sequence |
395 | | * that marks a back-reference to the parse structure. |
396 | | */ |
397 | 0 | assert(backrefnum <= p->g->nsub); |
398 | 0 | EMIT(OBACK_, backrefnum); |
399 | 0 | assert(p->pbegin[backrefnum] != 0); |
400 | 0 | assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN); |
401 | 0 | assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN); |
402 | 0 | (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]); |
403 | 0 | EMIT(O_BACK, backrefnum); |
404 | 0 | p->g->backrefs = 1; |
405 | 0 | } else { |
406 | | /* Other chars are simply themselves when escaped with a backslash. |
407 | | */ |
408 | 0 | ordinary(p, c); |
409 | 0 | } |
410 | 0 | break; |
411 | 0 | case '{': /* okay as ordinary except if digit follows */ |
412 | 0 | REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); |
413 | | /* FALLTHROUGH */ |
414 | 5.87k | default: |
415 | 5.87k | ordinary(p, c); |
416 | 5.87k | break; |
417 | 15.0k | } |
418 | | |
419 | 15.0k | if (!MORE()) |
420 | 653 | return; |
421 | 14.3k | c = PEEK(); |
422 | | /* we call { a repetition if followed by a digit */ |
423 | 14.3k | if (!( c == '*' || c == '+' || c == '?' || |
424 | 14.3k | (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) |
425 | 14.3k | return; /* no repetition, we're done */ |
426 | 0 | NEXT(); |
427 | |
|
428 | 0 | REQUIRE(!wascaret, REG_BADRPT); |
429 | 0 | switch (c) { |
430 | 0 | case '*': /* implemented as +? */ |
431 | | /* this case does not require the (y|) trick, noKLUDGE */ |
432 | 0 | INSERT(OPLUS_, pos); |
433 | 0 | ASTERN(O_PLUS, pos); |
434 | 0 | INSERT(OQUEST_, pos); |
435 | 0 | ASTERN(O_QUEST, pos); |
436 | 0 | break; |
437 | 0 | case '+': |
438 | 0 | INSERT(OPLUS_, pos); |
439 | 0 | ASTERN(O_PLUS, pos); |
440 | 0 | break; |
441 | 0 | case '?': |
442 | | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
443 | 0 | INSERT(OCH_, pos); /* offset slightly wrong */ |
444 | 0 | ASTERN(OOR1, pos); /* this one's right */ |
445 | 0 | AHEAD(pos); /* fix the OCH_ */ |
446 | 0 | EMIT(OOR2, 0); /* offset very wrong... */ |
447 | 0 | AHEAD(THERE()); /* ...so fix it */ |
448 | 0 | ASTERN(O_CH, THERETHERE()); |
449 | 0 | break; |
450 | 0 | case '{': |
451 | 0 | count = p_count(p); |
452 | 0 | if (EAT(',')) { |
453 | 0 | if (isdigit((uch)PEEK())) { |
454 | 0 | count2 = p_count(p); |
455 | 0 | REQUIRE(count <= count2, REG_BADBR); |
456 | 0 | } else /* single number with comma */ |
457 | 0 | count2 = INFINITY; |
458 | 0 | } else /* just a single number */ |
459 | 0 | count2 = count; |
460 | 0 | repeat(p, pos, count, count2); |
461 | 0 | if (!EAT('}')) { /* error heuristics */ |
462 | 0 | while (MORE() && PEEK() != '}') |
463 | 0 | NEXT(); |
464 | 0 | REQUIRE(MORE(), REG_EBRACE); |
465 | 0 | SETERROR(REG_BADBR); |
466 | 0 | } |
467 | 0 | break; |
468 | 0 | } |
469 | | |
470 | 0 | if (!MORE()) |
471 | 0 | return; |
472 | 0 | c = PEEK(); |
473 | 0 | if (!( c == '*' || c == '+' || c == '?' || |
474 | 0 | (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) |
475 | 0 | return; |
476 | 0 | SETERROR(REG_BADRPT); |
477 | 0 | } |
478 | | |
479 | | /* |
480 | | - p_str - string (no metacharacters) "parser" |
481 | | */ |
482 | | static void |
483 | | p_str(struct parse *p) |
484 | 0 | { |
485 | 0 | REQUIRE(MORE(), REG_EMPTY); |
486 | 0 | while (MORE()) |
487 | 0 | ordinary(p, GETNEXT()); |
488 | 0 | } |
489 | | |
490 | | /* |
491 | | - p_bre - BRE parser top level, anchoring and concatenation |
492 | | * Giving end1 as OUT essentially eliminates the end1/end2 check. |
493 | | * |
494 | | * This implementation is a bit of a kludge, in that a trailing $ is first |
495 | | * taken as an ordinary character and then revised to be an anchor. The |
496 | | * only undesirable side effect is that '$' gets included as a character |
497 | | * category in such cases. This is fairly harmless; not worth fixing. |
498 | | * The amount of lookahead needed to avoid this kludge is excessive. |
499 | | */ |
500 | | static void |
501 | | p_bre(struct parse *p, |
502 | | int end1, /* first terminating character */ |
503 | | int end2) /* second terminating character */ |
504 | 0 | { |
505 | 0 | sopno start = HERE(); |
506 | 0 | int first = 1; /* first subexpression? */ |
507 | 0 | int wasdollar = 0; |
508 | |
|
509 | 0 | if (EAT('^')) { |
510 | 0 | EMIT(OBOL, 0); |
511 | 0 | p->g->iflags |= USEBOL; |
512 | 0 | p->g->nbol++; |
513 | 0 | } |
514 | 0 | while (MORE() && !SEETWO(end1, end2)) { |
515 | 0 | wasdollar = p_simp_re(p, first); |
516 | 0 | first = 0; |
517 | 0 | } |
518 | 0 | if (wasdollar) { /* oops, that was a trailing anchor */ |
519 | 0 | DROP(1); |
520 | 0 | EMIT(OEOL, 0); |
521 | 0 | p->g->iflags |= USEEOL; |
522 | 0 | p->g->neol++; |
523 | 0 | } |
524 | |
|
525 | 0 | REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ |
526 | 0 | } |
527 | | |
528 | | /* |
529 | | - p_simp_re - parse a simple RE, an atom possibly followed by a repetition |
530 | | */ |
531 | | static int /* was the simple RE an unbackslashed $? */ |
532 | | p_simp_re(struct parse *p, |
533 | | int starordinary) /* is a leading * an ordinary character? */ |
534 | 0 | { |
535 | 0 | int c; |
536 | 0 | int count; |
537 | 0 | int count2; |
538 | 0 | sopno pos; |
539 | 0 | int i; |
540 | 0 | sopno subno; |
541 | 0 | # define BACKSL (1<<CHAR_BIT) |
542 | |
|
543 | 0 | pos = HERE(); /* repetition op, if any, covers from here */ |
544 | |
|
545 | 0 | assert(MORE()); /* caller should have ensured this */ |
546 | 0 | c = GETNEXT(); |
547 | 0 | if (c == '\\') { |
548 | 0 | REQUIRE(MORE(), REG_EESCAPE); |
549 | 0 | c = BACKSL | GETNEXT(); |
550 | 0 | } |
551 | 0 | switch (c) { |
552 | 0 | case '.': |
553 | 0 | if (p->g->cflags®_NEWLINE) |
554 | 0 | nonnewline(p); |
555 | 0 | else |
556 | 0 | EMIT(OANY, 0); |
557 | 0 | break; |
558 | 0 | case '[': |
559 | 0 | p_bracket(p); |
560 | 0 | break; |
561 | 0 | case BACKSL|'{': |
562 | 0 | SETERROR(REG_BADRPT); |
563 | 0 | break; |
564 | 0 | case BACKSL|'(': |
565 | 0 | p->g->nsub++; |
566 | 0 | subno = p->g->nsub; |
567 | 0 | if (subno < NPAREN) |
568 | 0 | p->pbegin[subno] = HERE(); |
569 | 0 | EMIT(OLPAREN, subno); |
570 | | /* the MORE here is an error heuristic */ |
571 | 0 | if (MORE() && !SEETWO('\\', ')')) |
572 | 0 | p_bre(p, '\\', ')'); |
573 | 0 | if (subno < NPAREN) { |
574 | 0 | p->pend[subno] = HERE(); |
575 | 0 | assert(p->pend[subno] != 0); |
576 | 0 | } |
577 | 0 | EMIT(ORPAREN, subno); |
578 | 0 | REQUIRE(EATTWO('\\', ')'), REG_EPAREN); |
579 | 0 | break; |
580 | 0 | case BACKSL|')': /* should not get here -- must be user */ |
581 | 0 | case BACKSL|'}': |
582 | 0 | SETERROR(REG_EPAREN); |
583 | 0 | break; |
584 | 0 | case BACKSL|'1': |
585 | 0 | case BACKSL|'2': |
586 | 0 | case BACKSL|'3': |
587 | 0 | case BACKSL|'4': |
588 | 0 | case BACKSL|'5': |
589 | 0 | case BACKSL|'6': |
590 | 0 | case BACKSL|'7': |
591 | 0 | case BACKSL|'8': |
592 | 0 | case BACKSL|'9': |
593 | 0 | i = (c&~BACKSL) - '0'; |
594 | 0 | assert(i < NPAREN); |
595 | 0 | if (p->pend[i] != 0) { |
596 | 0 | assert(i <= p->g->nsub); |
597 | 0 | EMIT(OBACK_, i); |
598 | 0 | assert(p->pbegin[i] != 0); |
599 | 0 | assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); |
600 | 0 | assert(OP(p->strip[p->pend[i]]) == ORPAREN); |
601 | 0 | (void) dupl(p, p->pbegin[i]+1, p->pend[i]); |
602 | 0 | EMIT(O_BACK, i); |
603 | 0 | } else |
604 | 0 | SETERROR(REG_ESUBREG); |
605 | 0 | p->g->backrefs = 1; |
606 | 0 | break; |
607 | 0 | case '*': |
608 | 0 | REQUIRE(starordinary, REG_BADRPT); |
609 | | /* FALLTHROUGH */ |
610 | 0 | default: |
611 | 0 | ordinary(p, (char)c); |
612 | 0 | break; |
613 | 0 | } |
614 | | |
615 | 0 | if (EAT('*')) { /* implemented as +? */ |
616 | | /* this case does not require the (y|) trick, noKLUDGE */ |
617 | 0 | INSERT(OPLUS_, pos); |
618 | 0 | ASTERN(O_PLUS, pos); |
619 | 0 | INSERT(OQUEST_, pos); |
620 | 0 | ASTERN(O_QUEST, pos); |
621 | 0 | } else if (EATTWO('\\', '{')) { |
622 | 0 | count = p_count(p); |
623 | 0 | if (EAT(',')) { |
624 | 0 | if (MORE() && isdigit((uch)PEEK())) { |
625 | 0 | count2 = p_count(p); |
626 | 0 | REQUIRE(count <= count2, REG_BADBR); |
627 | 0 | } else /* single number with comma */ |
628 | 0 | count2 = INFINITY; |
629 | 0 | } else /* just a single number */ |
630 | 0 | count2 = count; |
631 | 0 | repeat(p, pos, count, count2); |
632 | 0 | if (!EATTWO('\\', '}')) { /* error heuristics */ |
633 | 0 | while (MORE() && !SEETWO('\\', '}')) |
634 | 0 | NEXT(); |
635 | 0 | REQUIRE(MORE(), REG_EBRACE); |
636 | 0 | SETERROR(REG_BADBR); |
637 | 0 | } |
638 | 0 | } else if (c == '$') /* $ (but not \$) ends it */ |
639 | 0 | return(1); |
640 | | |
641 | 0 | return(0); |
642 | 0 | } |
643 | | |
644 | | /* |
645 | | - p_count - parse a repetition count |
646 | | */ |
647 | | static int /* the value */ |
648 | | p_count(struct parse *p) |
649 | 0 | { |
650 | 0 | int count = 0; |
651 | 0 | int ndigits = 0; |
652 | |
|
653 | 0 | while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { |
654 | 0 | count = count*10 + (GETNEXT() - '0'); |
655 | 0 | ndigits++; |
656 | 0 | } |
657 | |
|
658 | 0 | REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); |
659 | 0 | return(count); |
660 | 0 | } |
661 | | |
662 | | /* |
663 | | - p_bracket - parse a bracketed character list |
664 | | * |
665 | | * Note a significant property of this code: if the allocset() did SETERROR, |
666 | | * no set operations are done. |
667 | | */ |
668 | | static void |
669 | | p_bracket(struct parse *p) |
670 | 4.57k | { |
671 | 4.57k | cset *cs; |
672 | 4.57k | int invert = 0; |
673 | | |
674 | | /* Dept of Truly Sickening Special-Case Kludges */ |
675 | 4.57k | if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { |
676 | 0 | EMIT(OBOW, 0); |
677 | 0 | NEXTn(6); |
678 | 0 | return; |
679 | 0 | } |
680 | 4.57k | if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { |
681 | 0 | EMIT(OEOW, 0); |
682 | 0 | NEXTn(6); |
683 | 0 | return; |
684 | 0 | } |
685 | | |
686 | 4.57k | if ((cs = allocset(p)) == NULL) { |
687 | | /* allocset did set error status in p */ |
688 | 0 | return; |
689 | 0 | } |
690 | | |
691 | 4.57k | if (EAT('^')) |
692 | 0 | invert++; /* make note to invert set at end */ |
693 | 4.57k | if (EAT(']')) |
694 | 0 | CHadd(cs, ']'); |
695 | 4.57k | else if (EAT('-')) |
696 | 0 | CHadd(cs, '-'); |
697 | 9.14k | while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) |
698 | 4.57k | p_b_term(p, cs); |
699 | 4.57k | if (EAT('-')) |
700 | 0 | CHadd(cs, '-'); |
701 | 4.57k | MUSTEAT(']', REG_EBRACK); |
702 | | |
703 | 4.57k | if (p->error != 0) { /* don't mess things up further */ |
704 | 0 | freeset(p, cs); |
705 | 0 | return; |
706 | 0 | } |
707 | | |
708 | 4.57k | if (p->g->cflags®_ICASE) { |
709 | 0 | int i; |
710 | 0 | int ci; |
711 | |
|
712 | 0 | for (i = p->g->csetsize - 1; i >= 0; i--) |
713 | 0 | if (CHIN(cs, i) && isalpha(i)) { |
714 | 0 | ci = othercase(i); |
715 | 0 | if (ci != i) |
716 | 0 | CHadd(cs, ci); |
717 | 0 | } |
718 | 0 | if (cs->multis != NULL) |
719 | 0 | mccase(p, cs); |
720 | 0 | } |
721 | 4.57k | if (invert) { |
722 | 0 | int i; |
723 | |
|
724 | 0 | for (i = p->g->csetsize - 1; i >= 0; i--) |
725 | 0 | if (CHIN(cs, i)) |
726 | 0 | CHsub(cs, i); |
727 | 0 | else |
728 | 0 | CHadd(cs, i); |
729 | 0 | if (p->g->cflags®_NEWLINE) |
730 | 0 | CHsub(cs, '\n'); |
731 | 0 | if (cs->multis != NULL) |
732 | 0 | mcinvert(p, cs); |
733 | 0 | } |
734 | | |
735 | 4.57k | assert(cs->multis == NULL); /* xxx */ |
736 | | |
737 | 4.57k | if (nch(p, cs) == 1) { /* optimize singleton sets */ |
738 | 0 | ordinary(p, firstch(p, cs)); |
739 | 0 | freeset(p, cs); |
740 | 0 | } else |
741 | 4.57k | EMIT(OANYOF, freezeset(p, cs)); |
742 | 4.57k | } |
743 | | |
744 | | /* |
745 | | - p_b_term - parse one term of a bracketed character list |
746 | | */ |
747 | | static void |
748 | | p_b_term(struct parse *p, cset *cs) |
749 | 4.57k | { |
750 | 4.57k | char c; |
751 | 4.57k | char start, finish; |
752 | 4.57k | int i; |
753 | | |
754 | | /* classify what we've got */ |
755 | 4.57k | switch ((MORE()) ? PEEK() : '\0') { |
756 | 0 | case '[': |
757 | 0 | c = (MORE2()) ? PEEK2() : '\0'; |
758 | 0 | break; |
759 | 0 | case '-': |
760 | 0 | SETERROR(REG_ERANGE); |
761 | 0 | return; /* NOTE RETURN */ |
762 | 0 | break; |
763 | 4.57k | default: |
764 | 4.57k | c = '\0'; |
765 | 4.57k | break; |
766 | 4.57k | } |
767 | | |
768 | 4.57k | switch (c) { |
769 | 0 | case ':': /* character class */ |
770 | 0 | NEXT2(); |
771 | 0 | REQUIRE(MORE(), REG_EBRACK); |
772 | 0 | c = PEEK(); |
773 | 0 | REQUIRE(c != '-' && c != ']', REG_ECTYPE); |
774 | 0 | p_b_cclass(p, cs); |
775 | 0 | REQUIRE(MORE(), REG_EBRACK); |
776 | 0 | REQUIRE(EATTWO(':', ']'), REG_ECTYPE); |
777 | 0 | break; |
778 | 0 | case '=': /* equivalence class */ |
779 | 0 | NEXT2(); |
780 | 0 | REQUIRE(MORE(), REG_EBRACK); |
781 | 0 | c = PEEK(); |
782 | 0 | REQUIRE(c != '-' && c != ']', REG_ECOLLATE); |
783 | 0 | p_b_eclass(p, cs); |
784 | 0 | REQUIRE(MORE(), REG_EBRACK); |
785 | 0 | REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); |
786 | 0 | break; |
787 | 4.57k | default: /* symbol, ordinary character, or range */ |
788 | | /* xxx revision needed for multichar stuff */ |
789 | 4.57k | start = p_b_symbol(p); |
790 | 4.57k | if (SEE('-') && MORE2() && PEEK2() != ']') { |
791 | | /* range */ |
792 | 4.57k | NEXT(); |
793 | 4.57k | if (EAT('-')) |
794 | 0 | finish = '-'; |
795 | 4.57k | else |
796 | 4.57k | finish = p_b_symbol(p); |
797 | 4.57k | } else |
798 | 0 | finish = start; |
799 | | /* xxx what about signed chars here... */ |
800 | 4.57k | REQUIRE(start <= finish, REG_ERANGE); |
801 | 38.5k | for (i = start; i <= finish; i++) |
802 | 33.9k | CHadd(cs, i); |
803 | 4.57k | break; |
804 | 4.57k | } |
805 | 4.57k | } |
806 | | |
807 | | /* |
808 | | - p_b_cclass - parse a character-class name and deal with it |
809 | | */ |
810 | | static void |
811 | | p_b_cclass(struct parse *p, cset *cs) |
812 | 0 | { |
813 | 0 | char *sp = p->next; |
814 | 0 | struct cclass *cp; |
815 | 0 | size_t len; |
816 | 0 | const char *u; |
817 | 0 | char c; |
818 | |
|
819 | 0 | while (MORE() && isalpha((uch)PEEK())) |
820 | 0 | NEXT(); |
821 | 0 | len = p->next - sp; |
822 | 0 | for (cp = cclasses; cp->name != NULL; cp++) |
823 | 0 | if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') |
824 | 0 | break; |
825 | 0 | if (cp->name == NULL) { |
826 | | /* oops, didn't find it */ |
827 | 0 | SETERROR(REG_ECTYPE); |
828 | 0 | return; |
829 | 0 | } |
830 | | |
831 | 0 | u = cp->chars; |
832 | 0 | while ((c = *u++) != '\0') |
833 | 0 | CHadd(cs, c); |
834 | 0 | for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) |
835 | 0 | MCadd(p, cs, u); |
836 | 0 | } |
837 | | |
838 | | /* |
839 | | - p_b_eclass - parse an equivalence-class name and deal with it |
840 | | * |
841 | | * This implementation is incomplete. xxx |
842 | | */ |
843 | | static void |
844 | | p_b_eclass(struct parse *p, cset *cs) |
845 | 0 | { |
846 | 0 | char c; |
847 | |
|
848 | 0 | c = p_b_coll_elem(p, '='); |
849 | 0 | CHadd(cs, c); |
850 | 0 | } |
851 | | |
852 | | /* |
853 | | - p_b_symbol - parse a character or [..]ed multicharacter collating symbol |
854 | | */ |
855 | | static char /* value of symbol */ |
856 | | p_b_symbol(struct parse *p) |
857 | 9.14k | { |
858 | 9.14k | char value; |
859 | | |
860 | 9.14k | REQUIRE(MORE(), REG_EBRACK); |
861 | 9.14k | if (!EATTWO('[', '.')) |
862 | 9.14k | return(GETNEXT()); |
863 | | |
864 | | /* collating symbol */ |
865 | 0 | value = p_b_coll_elem(p, '.'); |
866 | 0 | REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); |
867 | 0 | return(value); |
868 | 9.14k | } |
869 | | |
870 | | /* |
871 | | - p_b_coll_elem - parse a collating-element name and look it up |
872 | | */ |
873 | | static char /* value of collating element */ |
874 | | p_b_coll_elem(struct parse *p, |
875 | | int endc) /* name ended by endc,']' */ |
876 | 0 | { |
877 | 0 | char *sp = p->next; |
878 | 0 | struct cname *cp; |
879 | 0 | int len; |
880 | |
|
881 | 0 | while (MORE() && !SEETWO(endc, ']')) |
882 | 0 | NEXT(); |
883 | 0 | if (!MORE()) { |
884 | 0 | SETERROR(REG_EBRACK); |
885 | 0 | return(0); |
886 | 0 | } |
887 | 0 | len = p->next - sp; |
888 | 0 | for (cp = cnames; cp->name != NULL; cp++) |
889 | 0 | if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') |
890 | 0 | return(cp->code); /* known name */ |
891 | 0 | if (len == 1) |
892 | 0 | return(*sp); /* single character */ |
893 | 0 | SETERROR(REG_ECOLLATE); /* neither */ |
894 | 0 | return(0); |
895 | 0 | } |
896 | | |
897 | | /* |
898 | | - othercase - return the case counterpart of an alphabetic |
899 | | */ |
900 | | static char /* if no counterpart, return ch */ |
901 | | othercase(int ch) |
902 | 0 | { |
903 | 0 | ch = (uch)ch; |
904 | 0 | assert(isalpha(ch)); |
905 | 0 | if (isupper(ch)) |
906 | 0 | return ((uch)tolower(ch)); |
907 | 0 | else if (islower(ch)) |
908 | 0 | return ((uch)toupper(ch)); |
909 | 0 | else /* peculiar, but could happen */ |
910 | 0 | return(ch); |
911 | 0 | } |
912 | | |
913 | | /* |
914 | | - bothcases - emit a dualcase version of a two-case character |
915 | | * |
916 | | * Boy, is this implementation ever a kludge... |
917 | | */ |
918 | | static void |
919 | | bothcases(struct parse *p, int ch) |
920 | 0 | { |
921 | 0 | char *oldnext = p->next; |
922 | 0 | char *oldend = p->end; |
923 | 0 | char bracket[3]; |
924 | |
|
925 | 0 | ch = (uch)ch; |
926 | 0 | assert(othercase(ch) != ch); /* p_bracket() would recurse */ |
927 | 0 | p->next = bracket; |
928 | 0 | p->end = bracket+2; |
929 | 0 | bracket[0] = ch; |
930 | 0 | bracket[1] = ']'; |
931 | 0 | bracket[2] = '\0'; |
932 | 0 | p_bracket(p); |
933 | 0 | assert(p->next == bracket+2); |
934 | 0 | p->next = oldnext; |
935 | 0 | p->end = oldend; |
936 | 0 | } |
937 | | |
938 | | /* |
939 | | - ordinary - emit an ordinary character |
940 | | */ |
941 | | static void |
942 | | ordinary(struct parse *p, int ch) |
943 | 5.87k | { |
944 | 5.87k | cat_t *cap = p->g->categories; |
945 | | |
946 | 5.87k | if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) |
947 | 0 | bothcases(p, ch); |
948 | 5.87k | else { |
949 | 5.87k | EMIT(OCHAR, (uch)ch); |
950 | 5.87k | if (cap[ch] == 0) |
951 | 2.61k | cap[ch] = p->g->ncategories++; |
952 | 5.87k | } |
953 | 5.87k | } |
954 | | |
955 | | /* |
956 | | - nonnewline - emit REG_NEWLINE version of OANY |
957 | | * |
958 | | * Boy, is this implementation ever a kludge... |
959 | | */ |
960 | | static void |
961 | | nonnewline(struct parse *p) |
962 | 0 | { |
963 | 0 | char *oldnext = p->next; |
964 | 0 | char *oldend = p->end; |
965 | 0 | char bracket[4]; |
966 | |
|
967 | 0 | p->next = bracket; |
968 | 0 | p->end = bracket+3; |
969 | 0 | bracket[0] = '^'; |
970 | 0 | bracket[1] = '\n'; |
971 | 0 | bracket[2] = ']'; |
972 | 0 | bracket[3] = '\0'; |
973 | 0 | p_bracket(p); |
974 | 0 | assert(p->next == bracket+3); |
975 | 0 | p->next = oldnext; |
976 | 0 | p->end = oldend; |
977 | 0 | } |
978 | | |
979 | | /* |
980 | | - repeat - generate code for a bounded repetition, recursively if needed |
981 | | */ |
982 | | static void |
983 | | repeat(struct parse *p, |
984 | | sopno start, /* operand from here to end of strip */ |
985 | | int from, /* repeated from this number */ |
986 | | int to) /* to this number of times (maybe INFINITY) */ |
987 | 0 | { |
988 | 0 | sopno finish = HERE(); |
989 | 0 | # define N 2 |
990 | 0 | # define INF 3 |
991 | 0 | # define REP(f, t) ((f)*8 + (t)) |
992 | 0 | # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) |
993 | 0 | sopno copy; |
994 | |
|
995 | 0 | if (p->error != 0) /* head off possible runaway recursion */ |
996 | 0 | return; |
997 | | |
998 | 0 | assert(from <= to); |
999 | |
|
1000 | 0 | switch (REP(MAP(from), MAP(to))) { |
1001 | 0 | case REP(0, 0): /* must be user doing this */ |
1002 | 0 | DROP(finish-start); /* drop the operand */ |
1003 | 0 | break; |
1004 | 0 | case REP(0, 1): /* as x{1,1}? */ |
1005 | 0 | case REP(0, N): /* as x{1,n}? */ |
1006 | 0 | case REP(0, INF): /* as x{1,}? */ |
1007 | | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
1008 | 0 | INSERT(OCH_, start); /* offset is wrong... */ |
1009 | 0 | repeat(p, start+1, 1, to); |
1010 | 0 | ASTERN(OOR1, start); |
1011 | 0 | AHEAD(start); /* ... fix it */ |
1012 | 0 | EMIT(OOR2, 0); |
1013 | 0 | AHEAD(THERE()); |
1014 | 0 | ASTERN(O_CH, THERETHERE()); |
1015 | 0 | break; |
1016 | 0 | case REP(1, 1): /* trivial case */ |
1017 | | /* done */ |
1018 | 0 | break; |
1019 | 0 | case REP(1, N): /* as x?x{1,n-1} */ |
1020 | | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
1021 | 0 | INSERT(OCH_, start); |
1022 | 0 | ASTERN(OOR1, start); |
1023 | 0 | AHEAD(start); |
1024 | 0 | EMIT(OOR2, 0); /* offset very wrong... */ |
1025 | 0 | AHEAD(THERE()); /* ...so fix it */ |
1026 | 0 | ASTERN(O_CH, THERETHERE()); |
1027 | 0 | copy = dupl(p, start+1, finish+1); |
1028 | 0 | assert(copy == finish+4); |
1029 | 0 | repeat(p, copy, 1, to-1); |
1030 | 0 | break; |
1031 | 0 | case REP(1, INF): /* as x+ */ |
1032 | 0 | INSERT(OPLUS_, start); |
1033 | 0 | ASTERN(O_PLUS, start); |
1034 | 0 | break; |
1035 | 0 | case REP(N, N): /* as xx{m-1,n-1} */ |
1036 | 0 | copy = dupl(p, start, finish); |
1037 | 0 | repeat(p, copy, from-1, to-1); |
1038 | 0 | break; |
1039 | 0 | case REP(N, INF): /* as xx{n-1,INF} */ |
1040 | 0 | copy = dupl(p, start, finish); |
1041 | 0 | repeat(p, copy, from-1, to); |
1042 | 0 | break; |
1043 | 0 | default: /* "can't happen" */ |
1044 | 0 | SETERROR(REG_ASSERT); /* just in case */ |
1045 | 0 | break; |
1046 | 0 | } |
1047 | 0 | } |
1048 | | |
1049 | | /* |
1050 | | - seterr - set an error condition |
1051 | | */ |
1052 | | static int /* useless but makes type checking happy */ |
1053 | | seterr(struct parse *p, int e) |
1054 | 0 | { |
1055 | 0 | if (p->error == 0) /* keep earliest error condition */ |
1056 | 0 | p->error = e; |
1057 | 0 | p->next = nuls; /* try to bring things to a halt */ |
1058 | 0 | p->end = nuls; |
1059 | 0 | return(0); /* make the return value well-defined */ |
1060 | 0 | } |
1061 | | |
1062 | | /* |
1063 | | - allocset - allocate a set of characters for [] |
1064 | | */ |
1065 | | static cset * |
1066 | | allocset(struct parse *p) |
1067 | 4.57k | { |
1068 | 4.57k | int no = p->g->ncsets++; |
1069 | 4.57k | size_t nc; |
1070 | 4.57k | size_t nbytes; |
1071 | 4.57k | cset *cs; |
1072 | 4.57k | size_t css = (size_t)p->g->csetsize; |
1073 | 4.57k | int i; |
1074 | | |
1075 | 4.57k | if (no >= p->ncsalloc) { /* need another column of space */ |
1076 | 653 | void *ptr; |
1077 | | |
1078 | 653 | p->ncsalloc += CHAR_BIT; |
1079 | 653 | nc = p->ncsalloc; |
1080 | 653 | if (nc > SIZE_MAX / sizeof(cset)) |
1081 | 0 | goto nomem; |
1082 | 653 | assert(nc % CHAR_BIT == 0); |
1083 | 653 | nbytes = nc / CHAR_BIT * css; |
1084 | | |
1085 | 653 | ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset)); |
1086 | 653 | if (ptr == NULL) |
1087 | 0 | goto nomem; |
1088 | 653 | p->g->sets = ptr; |
1089 | | |
1090 | 653 | ptr = (uch *)realloc((char *)p->g->setbits, nbytes); |
1091 | 653 | if (ptr == NULL) |
1092 | 0 | goto nomem; |
1093 | 653 | p->g->setbits = ptr; |
1094 | | |
1095 | 653 | for (i = 0; i < no; i++) |
1096 | 0 | p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); |
1097 | | |
1098 | 653 | (void) memset((char *)p->g->setbits + (nbytes - css), 0, css); |
1099 | 653 | } |
1100 | | /* XXX should not happen */ |
1101 | 4.57k | if (p->g->sets == NULL || p->g->setbits == NULL) |
1102 | 0 | goto nomem; |
1103 | | |
1104 | 4.57k | cs = &p->g->sets[no]; |
1105 | 4.57k | cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); |
1106 | 4.57k | cs->mask = 1 << ((no) % CHAR_BIT); |
1107 | 4.57k | cs->hash = 0; |
1108 | 4.57k | cs->smultis = 0; |
1109 | 4.57k | cs->multis = NULL; |
1110 | | |
1111 | 4.57k | return(cs); |
1112 | 0 | nomem: |
1113 | 0 | free(p->g->sets); |
1114 | 0 | p->g->sets = NULL; |
1115 | 0 | free(p->g->setbits); |
1116 | 0 | p->g->setbits = NULL; |
1117 | |
|
1118 | 0 | SETERROR(REG_ESPACE); |
1119 | | /* caller's responsibility not to do set ops */ |
1120 | 0 | return(NULL); |
1121 | 4.57k | } |
1122 | | |
1123 | | /* |
1124 | | - freeset - free a now-unused set |
1125 | | */ |
1126 | | static void |
1127 | | freeset(struct parse *p, cset *cs) |
1128 | 1.95k | { |
1129 | 1.95k | size_t i; |
1130 | 1.95k | cset *top = &p->g->sets[p->g->ncsets]; |
1131 | 1.95k | size_t css = (size_t)p->g->csetsize; |
1132 | | |
1133 | 503k | for (i = 0; i < css; i++) |
1134 | 501k | CHsub(cs, i); |
1135 | 1.95k | if (cs == top-1) /* recover only the easy case */ |
1136 | 1.95k | p->g->ncsets--; |
1137 | 1.95k | } |
1138 | | |
1139 | | /* |
1140 | | - freezeset - final processing on a set of characters |
1141 | | * |
1142 | | * The main task here is merging identical sets. This is usually a waste |
1143 | | * of time (although the hash code minimizes the overhead), but can win |
1144 | | * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash |
1145 | | * is done using addition rather than xor -- all ASCII [aA] sets xor to |
1146 | | * the same value! |
1147 | | */ |
1148 | | static int /* set number */ |
1149 | | freezeset(struct parse *p, cset *cs) |
1150 | 4.57k | { |
1151 | 4.57k | uch h = cs->hash; |
1152 | 4.57k | size_t i; |
1153 | 4.57k | cset *top = &p->g->sets[p->g->ncsets]; |
1154 | 4.57k | cset *cs2; |
1155 | 4.57k | size_t css = (size_t)p->g->csetsize; |
1156 | | |
1157 | | /* look for an earlier one which is the same */ |
1158 | 15.0k | for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) |
1159 | 12.4k | if (cs2->hash == h && cs2 != cs) { |
1160 | | /* maybe */ |
1161 | 503k | for (i = 0; i < css; i++) |
1162 | 501k | if (!!CHIN(cs2, i) != !!CHIN(cs, i)) |
1163 | 0 | break; /* no */ |
1164 | 1.95k | if (i == css) |
1165 | 1.95k | break; /* yes */ |
1166 | 1.95k | } |
1167 | | |
1168 | 4.57k | if (cs2 < top) { /* found one */ |
1169 | 1.95k | freeset(p, cs); |
1170 | 1.95k | cs = cs2; |
1171 | 1.95k | } |
1172 | | |
1173 | 4.57k | return((int)(cs - p->g->sets)); |
1174 | 4.57k | } |
1175 | | |
1176 | | /* |
1177 | | - firstch - return first character in a set (which must have at least one) |
1178 | | */ |
1179 | | static int /* character; there is no "none" value */ |
1180 | | firstch(struct parse *p, cset *cs) |
1181 | 0 | { |
1182 | 0 | size_t i; |
1183 | 0 | size_t css = (size_t)p->g->csetsize; |
1184 | |
|
1185 | 0 | for (i = 0; i < css; i++) |
1186 | 0 | if (CHIN(cs, i)) |
1187 | 0 | return((char)i); |
1188 | 0 | assert(never); |
1189 | 0 | return(0); /* arbitrary */ |
1190 | 0 | } |
1191 | | |
1192 | | /* |
1193 | | - nch - number of characters in a set |
1194 | | */ |
1195 | | static int |
1196 | | nch(struct parse *p, cset *cs) |
1197 | 4.57k | { |
1198 | 4.57k | size_t i; |
1199 | 4.57k | size_t css = (size_t)p->g->csetsize; |
1200 | 4.57k | int n = 0; |
1201 | | |
1202 | 1.17M | for (i = 0; i < css; i++) |
1203 | 1.17M | if (CHIN(cs, i)) |
1204 | 33.9k | n++; |
1205 | 4.57k | return(n); |
1206 | 4.57k | } |
1207 | | |
1208 | | /* |
1209 | | - mcadd - add a collating element to a cset |
1210 | | */ |
1211 | | static void |
1212 | | mcadd( struct parse *p, cset *cs, const char *cp) |
1213 | 0 | { |
1214 | 0 | size_t oldend = cs->smultis; |
1215 | 0 | void *np; |
1216 | |
|
1217 | 0 | cs->smultis += strlen(cp) + 1; |
1218 | 0 | np = realloc(cs->multis, cs->smultis); |
1219 | 0 | if (np == NULL) { |
1220 | 0 | if (cs->multis) |
1221 | 0 | free(cs->multis); |
1222 | 0 | cs->multis = NULL; |
1223 | 0 | SETERROR(REG_ESPACE); |
1224 | 0 | return; |
1225 | 0 | } |
1226 | 0 | cs->multis = np; |
1227 | |
|
1228 | 0 | llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1); |
1229 | 0 | } |
1230 | | |
1231 | | /* |
1232 | | - mcinvert - invert the list of collating elements in a cset |
1233 | | * |
1234 | | * This would have to know the set of possibilities. Implementation |
1235 | | * is deferred. |
1236 | | */ |
1237 | | /* ARGSUSED */ |
1238 | | static void |
1239 | | mcinvert(struct parse *p, cset *cs) |
1240 | 0 | { |
1241 | 0 | assert(cs->multis == NULL); /* xxx */ |
1242 | 0 | } |
1243 | | |
1244 | | /* |
1245 | | - mccase - add case counterparts of the list of collating elements in a cset |
1246 | | * |
1247 | | * This would have to know the set of possibilities. Implementation |
1248 | | * is deferred. |
1249 | | */ |
1250 | | /* ARGSUSED */ |
1251 | | static void |
1252 | | mccase(struct parse *p, cset *cs) |
1253 | 0 | { |
1254 | 0 | assert(cs->multis == NULL); /* xxx */ |
1255 | 0 | } |
1256 | | |
1257 | | /* |
1258 | | - isinsets - is this character in any sets? |
1259 | | */ |
1260 | | static int /* predicate */ |
1261 | | isinsets(struct re_guts *g, int c) |
1262 | 161k | { |
1263 | 161k | uch *col; |
1264 | 161k | int i; |
1265 | 161k | int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; |
1266 | 161k | unsigned uc = (uch)c; |
1267 | | |
1268 | 319k | for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) |
1269 | 161k | if (col[uc] != 0) |
1270 | 2.61k | return(1); |
1271 | 158k | return(0); |
1272 | 161k | } |
1273 | | |
1274 | | /* |
1275 | | - samesets - are these two characters in exactly the same sets? |
1276 | | */ |
1277 | | static int /* predicate */ |
1278 | | samesets(struct re_guts *g, int c1, int c2) |
1279 | 186k | { |
1280 | 186k | uch *col; |
1281 | 186k | int i; |
1282 | 186k | int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; |
1283 | 186k | unsigned uc1 = (uch)c1; |
1284 | 186k | unsigned uc2 = (uch)c2; |
1285 | | |
1286 | 189k | for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) |
1287 | 186k | if (col[uc1] != col[uc2]) |
1288 | 182k | return(0); |
1289 | 3.26k | return(1); |
1290 | 186k | } |
1291 | | |
1292 | | /* |
1293 | | - categorize - sort out character categories |
1294 | | */ |
1295 | | static void |
1296 | | categorize(struct parse *p, struct re_guts *g) |
1297 | 653 | { |
1298 | 653 | cat_t *cats = g->categories; |
1299 | 653 | int c; |
1300 | 653 | int c2; |
1301 | 653 | cat_t cat; |
1302 | | |
1303 | | /* avoid making error situations worse */ |
1304 | 653 | if (p->error != 0) |
1305 | 0 | return; |
1306 | | |
1307 | 167k | for (c = CHAR_MIN; c <= CHAR_MAX; c++) |
1308 | 167k | if (cats[c] == 0 && isinsets(g, c)) { |
1309 | 2.61k | cat = g->ncategories++; |
1310 | 2.61k | cats[c] = cat; |
1311 | 197k | for (c2 = c+1; c2 <= CHAR_MAX; c2++) |
1312 | 194k | if (cats[c2] == 0 && samesets(g, c, c2)) |
1313 | 3.26k | cats[c2] = cat; |
1314 | 2.61k | } |
1315 | 653 | } |
1316 | | |
1317 | | /* |
1318 | | - dupl - emit a duplicate of a bunch of sops |
1319 | | */ |
1320 | | static sopno /* start of duplicate */ |
1321 | | dupl(struct parse *p, |
1322 | | sopno start, /* from here */ |
1323 | | sopno finish) /* to this less one */ |
1324 | 0 | { |
1325 | 0 | sopno ret = HERE(); |
1326 | 0 | sopno len = finish - start; |
1327 | |
|
1328 | 0 | assert(finish >= start); |
1329 | 0 | if (len == 0) |
1330 | 0 | return(ret); |
1331 | 0 | enlarge(p, p->ssize + len); /* this many unexpected additions */ |
1332 | 0 | assert(p->ssize >= p->slen + len); |
1333 | 0 | (void) memmove((char *)(p->strip + p->slen), |
1334 | 0 | (char *)(p->strip + start), (size_t)len*sizeof(sop)); |
1335 | 0 | p->slen += len; |
1336 | 0 | return(ret); |
1337 | 0 | } |
1338 | | |
1339 | | /* |
1340 | | - doemit - emit a strip operator |
1341 | | * |
1342 | | * It might seem better to implement this as a macro with a function as |
1343 | | * hard-case backup, but it's just too big and messy unless there are |
1344 | | * some changes to the data structures. Maybe later. |
1345 | | */ |
1346 | | static void |
1347 | | doemit(struct parse *p, sop op, size_t opnd) |
1348 | 24.8k | { |
1349 | | /* avoid making error situations worse */ |
1350 | 24.8k | if (p->error != 0) |
1351 | 0 | return; |
1352 | | |
1353 | | /* deal with oversize operands ("can't happen", more or less) */ |
1354 | 24.8k | assert(opnd < 1<<OPSHIFT); |
1355 | | |
1356 | | /* deal with undersized strip */ |
1357 | 24.8k | if (p->slen >= p->ssize) |
1358 | 0 | enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ |
1359 | 24.8k | assert(p->slen < p->ssize); |
1360 | | |
1361 | | /* finally, it's all reduced to the easy case */ |
1362 | 24.8k | p->strip[p->slen++] = SOP(op, opnd); |
1363 | 24.8k | } |
1364 | | |
1365 | | /* |
1366 | | - doinsert - insert a sop into the strip |
1367 | | */ |
1368 | | static void |
1369 | | doinsert(struct parse *p, sop op, size_t opnd, sopno pos) |
1370 | 1.30k | { |
1371 | 1.30k | sopno sn; |
1372 | 1.30k | sop s; |
1373 | 1.30k | int i; |
1374 | | |
1375 | | /* avoid making error situations worse */ |
1376 | 1.30k | if (p->error != 0) |
1377 | 0 | return; |
1378 | | |
1379 | 1.30k | sn = HERE(); |
1380 | 1.30k | EMIT(op, opnd); /* do checks, ensure space */ |
1381 | 1.30k | assert(HERE() == sn+1); |
1382 | 1.30k | s = p->strip[sn]; |
1383 | | |
1384 | | /* adjust paren pointers */ |
1385 | 1.30k | assert(pos > 0); |
1386 | 13.0k | for (i = 1; i < NPAREN; i++) { |
1387 | 11.7k | if (p->pbegin[i] >= pos) { |
1388 | 0 | p->pbegin[i]++; |
1389 | 0 | } |
1390 | 11.7k | if (p->pend[i] >= pos) { |
1391 | 0 | p->pend[i]++; |
1392 | 0 | } |
1393 | 11.7k | } |
1394 | | |
1395 | 1.30k | memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], |
1396 | 1.30k | (HERE()-pos-1)*sizeof(sop)); |
1397 | 1.30k | p->strip[pos] = s; |
1398 | 1.30k | } |
1399 | | |
1400 | | /* |
1401 | | - dofwd - complete a forward reference |
1402 | | */ |
1403 | | static void |
1404 | | dofwd(struct parse *p, sopno pos, sop value) |
1405 | 2.61k | { |
1406 | | /* avoid making error situations worse */ |
1407 | 2.61k | if (p->error != 0) |
1408 | 0 | return; |
1409 | | |
1410 | 2.61k | assert(value < 1<<OPSHIFT); |
1411 | 2.61k | p->strip[pos] = OP(p->strip[pos]) | value; |
1412 | 2.61k | } |
1413 | | |
1414 | | /* |
1415 | | - enlarge - enlarge the strip |
1416 | | */ |
1417 | | static void |
1418 | | enlarge(struct parse *p, sopno size) |
1419 | 0 | { |
1420 | 0 | sop *sp; |
1421 | |
|
1422 | 0 | if (p->ssize >= size) |
1423 | 0 | return; |
1424 | | |
1425 | 0 | if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) { |
1426 | 0 | SETERROR(REG_ESPACE); |
1427 | 0 | return; |
1428 | 0 | } |
1429 | | |
1430 | 0 | sp = (sop *)realloc(p->strip, size*sizeof(sop)); |
1431 | 0 | if (sp == NULL) { |
1432 | 0 | SETERROR(REG_ESPACE); |
1433 | 0 | return; |
1434 | 0 | } |
1435 | 0 | p->strip = sp; |
1436 | 0 | p->ssize = size; |
1437 | 0 | } |
1438 | | |
1439 | | /* |
1440 | | - stripsnug - compact the strip |
1441 | | */ |
1442 | | static void |
1443 | | stripsnug(struct parse *p, struct re_guts *g) |
1444 | 653 | { |
1445 | 653 | g->nstates = p->slen; |
1446 | 653 | if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) { |
1447 | 0 | g->strip = p->strip; |
1448 | 0 | SETERROR(REG_ESPACE); |
1449 | 0 | return; |
1450 | 0 | } |
1451 | | |
1452 | 653 | g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); |
1453 | 653 | if (g->strip == NULL) { |
1454 | 0 | SETERROR(REG_ESPACE); |
1455 | 0 | g->strip = p->strip; |
1456 | 0 | } |
1457 | 653 | } |
1458 | | |
1459 | | /* |
1460 | | - findmust - fill in must and mlen with longest mandatory literal string |
1461 | | * |
1462 | | * This algorithm could do fancy things like analyzing the operands of | |
1463 | | * for common subsequences. Someday. This code is simple and finds most |
1464 | | * of the interesting cases. |
1465 | | * |
1466 | | * Note that must and mlen got initialized during setup. |
1467 | | */ |
1468 | | static void |
1469 | | findmust(struct parse *p, struct re_guts *g) |
1470 | 653 | { |
1471 | 653 | sop *scan; |
1472 | 653 | sop *start = 0; /* start initialized in the default case, after that */ |
1473 | 653 | sop *newstart = 0; /* newstart was initialized in the OCHAR case */ |
1474 | 653 | sopno newlen; |
1475 | 653 | sop s; |
1476 | 653 | char *cp; |
1477 | 653 | sopno i; |
1478 | | |
1479 | | /* avoid making error situations worse */ |
1480 | 653 | if (p->error != 0) |
1481 | 0 | return; |
1482 | | |
1483 | | /* find the longest OCHAR sequence in strip */ |
1484 | 653 | newlen = 0; |
1485 | 653 | scan = g->strip + 1; |
1486 | 17.6k | do { |
1487 | 17.6k | s = *scan++; |
1488 | 17.6k | switch (OP(s)) { |
1489 | 4.57k | case OCHAR: /* sequence member */ |
1490 | 4.57k | if (newlen == 0) /* new sequence */ |
1491 | 3.26k | newstart = scan - 1; |
1492 | 4.57k | newlen++; |
1493 | 4.57k | break; |
1494 | 0 | case OPLUS_: /* things that don't break one */ |
1495 | 3.26k | case OLPAREN: |
1496 | 6.53k | case ORPAREN: |
1497 | 6.53k | break; |
1498 | 0 | case OQUEST_: /* things that must be skipped */ |
1499 | 1.30k | case OCH_: |
1500 | 1.30k | scan--; |
1501 | 2.61k | do { |
1502 | 2.61k | scan += OPND(s); |
1503 | 2.61k | s = *scan; |
1504 | | /* assert() interferes w debug printouts */ |
1505 | 2.61k | if (OP(s) != O_QUEST && OP(s) != O_CH && |
1506 | 1.30k | OP(s) != OOR2) { |
1507 | 0 | g->iflags |= REGEX_BAD; |
1508 | 0 | return; |
1509 | 0 | } |
1510 | 2.61k | } while (OP(s) != O_QUEST && OP(s) != O_CH); |
1511 | | /* fallthrough */ |
1512 | 6.53k | default: /* things that break a sequence */ |
1513 | 6.53k | if (newlen > g->mlen) { /* ends one */ |
1514 | 1.30k | start = newstart; |
1515 | 1.30k | g->mlen = newlen; |
1516 | 1.30k | } |
1517 | 6.53k | newlen = 0; |
1518 | 6.53k | break; |
1519 | 17.6k | } |
1520 | 17.6k | } while (OP(s) != OEND); |
1521 | | |
1522 | 653 | if (g->mlen == 0) /* there isn't one */ |
1523 | 0 | return; |
1524 | | |
1525 | | /* turn it into a character string */ |
1526 | 653 | g->must = malloc((size_t)g->mlen + 1); |
1527 | 653 | if (g->must == NULL) { /* argh; just forget it */ |
1528 | 0 | g->mlen = 0; |
1529 | 0 | return; |
1530 | 0 | } |
1531 | 653 | cp = g->must; |
1532 | 653 | scan = start; |
1533 | 1.95k | for (i = g->mlen; i > 0; i--) { |
1534 | 1.30k | while (OP(s = *scan++) != OCHAR) |
1535 | 0 | continue; |
1536 | 1.30k | assert(cp < g->must + g->mlen); |
1537 | 1.30k | *cp++ = (char)OPND(s); |
1538 | 1.30k | } |
1539 | 653 | assert(cp == g->must + g->mlen); |
1540 | 653 | *cp++ = '\0'; /* just on general principles */ |
1541 | 653 | } |
1542 | | |
1543 | | /* |
1544 | | - pluscount - count + nesting |
1545 | | */ |
1546 | | static sopno /* nesting depth */ |
1547 | | pluscount(struct parse *p, struct re_guts *g) |
1548 | 653 | { |
1549 | 653 | sop *scan; |
1550 | 653 | sop s; |
1551 | 653 | sopno plusnest = 0; |
1552 | 653 | sopno maxnest = 0; |
1553 | | |
1554 | 653 | if (p->error != 0) |
1555 | 0 | return(0); /* there may not be an OEND */ |
1556 | | |
1557 | 653 | scan = g->strip + 1; |
1558 | 24.1k | do { |
1559 | 24.1k | s = *scan++; |
1560 | 24.1k | switch (OP(s)) { |
1561 | 0 | case OPLUS_: |
1562 | 0 | plusnest++; |
1563 | 0 | break; |
1564 | 0 | case O_PLUS: |
1565 | 0 | if (plusnest > maxnest) |
1566 | 0 | maxnest = plusnest; |
1567 | 0 | plusnest--; |
1568 | 0 | break; |
1569 | 24.1k | } |
1570 | 24.1k | } while (OP(s) != OEND); |
1571 | 653 | if (plusnest != 0) |
1572 | 0 | g->iflags |= REGEX_BAD; |
1573 | 653 | return(maxnest); |
1574 | 653 | } |