/src/glib/glib/pcre/pcre_study.c
Line | Count | Source (jump to first uncovered line) |
1 | | /************************************************* |
2 | | * Perl-Compatible Regular Expressions * |
3 | | *************************************************/ |
4 | | |
5 | | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | | and semantics are as close as possible to those of the Perl 5 language. |
7 | | |
8 | | Written by Philip Hazel |
9 | | Copyright (c) 1997-2012 University of Cambridge |
10 | | |
11 | | ----------------------------------------------------------------------------- |
12 | | Redistribution and use in source and binary forms, with or without |
13 | | modification, are permitted provided that the following conditions are met: |
14 | | |
15 | | * Redistributions of source code must retain the above copyright notice, |
16 | | this list of conditions and the following disclaimer. |
17 | | |
18 | | * Redistributions in binary form must reproduce the above copyright |
19 | | notice, this list of conditions and the following disclaimer in the |
20 | | documentation and/or other materials provided with the distribution. |
21 | | |
22 | | * Neither the name of the University of Cambridge nor the names of its |
23 | | contributors may be used to endorse or promote products derived from |
24 | | this software without specific prior written permission. |
25 | | |
26 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
27 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
28 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
29 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
30 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
31 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
32 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
33 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
34 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
35 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
36 | | POSSIBILITY OF SUCH DAMAGE. |
37 | | ----------------------------------------------------------------------------- |
38 | | */ |
39 | | |
40 | | |
41 | | /* This module contains the external function pcre_study(), along with local |
42 | | supporting functions. */ |
43 | | |
44 | | |
45 | | #include "config.h" |
46 | | |
47 | | #include "pcre_internal.h" |
48 | | |
49 | 0 | #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7)) |
50 | | |
51 | | /* Returns from set_start_bits() */ |
52 | | |
53 | | enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN }; |
54 | | |
55 | | |
56 | | |
57 | | /************************************************* |
58 | | * Find the minimum subject length for a group * |
59 | | *************************************************/ |
60 | | |
61 | | /* Scan a parenthesized group and compute the minimum length of subject that |
62 | | is needed to match it. This is a lower bound; it does not mean there is a |
63 | | string of that length that matches. In UTF8 mode, the result is in characters |
64 | | rather than bytes. |
65 | | |
66 | | Arguments: |
67 | | code pointer to start of group (the bracket) |
68 | | startcode pointer to start of the whole pattern |
69 | | options the compiling options |
70 | | int RECURSE depth |
71 | | |
72 | | Returns: the minimum length |
73 | | -1 if \C in UTF-8 mode or (*ACCEPT) was encountered |
74 | | -2 internal error (missing capturing bracket) |
75 | | -3 internal error (opcode not listed) |
76 | | */ |
77 | | |
78 | | static int |
79 | | find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options, |
80 | | int recurse_depth) |
81 | 0 | { |
82 | 0 | int length = -1; |
83 | | /* PCRE_UTF16 has the same value as PCRE_UTF8. */ |
84 | 0 | BOOL utf = (options & PCRE_UTF8) != 0; |
85 | 0 | BOOL had_recurse = FALSE; |
86 | 0 | int branchlength = 0; |
87 | 0 | pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE; |
88 | |
|
89 | 0 | if (*code == OP_CBRA || *code == OP_SCBRA || |
90 | 0 | *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE; |
91 | | |
92 | | /* Scan along the opcodes for this branch. If we get to the end of the |
93 | | branch, check the length against that of the other branches. */ |
94 | |
|
95 | 0 | for (;;) |
96 | 0 | { |
97 | 0 | int d, min; |
98 | 0 | pcre_uchar *cs, *ce; |
99 | 0 | int op = *cc; |
100 | |
|
101 | 0 | switch (op) |
102 | 0 | { |
103 | 0 | case OP_COND: |
104 | 0 | case OP_SCOND: |
105 | | |
106 | | /* If there is only one branch in a condition, the implied branch has zero |
107 | | length, so we don't add anything. This covers the DEFINE "condition" |
108 | | automatically. */ |
109 | |
|
110 | 0 | cs = cc + GET(cc, 1); |
111 | 0 | if (*cs != OP_ALT) |
112 | 0 | { |
113 | 0 | cc = cs + 1 + LINK_SIZE; |
114 | 0 | break; |
115 | 0 | } |
116 | | |
117 | | /* Otherwise we can fall through and treat it the same as any other |
118 | | subpattern. */ |
119 | | |
120 | 0 | case OP_CBRA: |
121 | 0 | case OP_SCBRA: |
122 | 0 | case OP_BRA: |
123 | 0 | case OP_SBRA: |
124 | 0 | case OP_CBRAPOS: |
125 | 0 | case OP_SCBRAPOS: |
126 | 0 | case OP_BRAPOS: |
127 | 0 | case OP_SBRAPOS: |
128 | 0 | case OP_ONCE: |
129 | 0 | case OP_ONCE_NC: |
130 | 0 | d = find_minlength(cc, startcode, options, recurse_depth); |
131 | 0 | if (d < 0) return d; |
132 | 0 | branchlength += d; |
133 | 0 | do cc += GET(cc, 1); while (*cc == OP_ALT); |
134 | 0 | cc += 1 + LINK_SIZE; |
135 | 0 | break; |
136 | | |
137 | | /* ACCEPT makes things far too complicated; we have to give up. */ |
138 | | |
139 | 0 | case OP_ACCEPT: |
140 | 0 | case OP_ASSERT_ACCEPT: |
141 | 0 | return -1; |
142 | | |
143 | | /* Reached end of a branch; if it's a ket it is the end of a nested |
144 | | call. If it's ALT it is an alternation in a nested call. If it is END it's |
145 | | the end of the outer call. All can be handled by the same code. If an |
146 | | ACCEPT was previously encountered, use the length that was in force at that |
147 | | time, and pass back the shortest ACCEPT length. */ |
148 | | |
149 | 0 | case OP_ALT: |
150 | 0 | case OP_KET: |
151 | 0 | case OP_KETRMAX: |
152 | 0 | case OP_KETRMIN: |
153 | 0 | case OP_KETRPOS: |
154 | 0 | case OP_END: |
155 | 0 | if (length < 0 || (!had_recurse && branchlength < length)) |
156 | 0 | length = branchlength; |
157 | 0 | if (op != OP_ALT) return length; |
158 | 0 | cc += 1 + LINK_SIZE; |
159 | 0 | branchlength = 0; |
160 | 0 | had_recurse = FALSE; |
161 | 0 | break; |
162 | | |
163 | | /* Skip over assertive subpatterns */ |
164 | | |
165 | 0 | case OP_ASSERT: |
166 | 0 | case OP_ASSERT_NOT: |
167 | 0 | case OP_ASSERTBACK: |
168 | 0 | case OP_ASSERTBACK_NOT: |
169 | 0 | do cc += GET(cc, 1); while (*cc == OP_ALT); |
170 | | /* Fall through */ |
171 | | |
172 | | /* Skip over things that don't match chars */ |
173 | |
|
174 | 0 | case OP_REVERSE: |
175 | 0 | case OP_CREF: |
176 | 0 | case OP_NCREF: |
177 | 0 | case OP_RREF: |
178 | 0 | case OP_NRREF: |
179 | 0 | case OP_DEF: |
180 | 0 | case OP_CALLOUT: |
181 | 0 | case OP_SOD: |
182 | 0 | case OP_SOM: |
183 | 0 | case OP_EOD: |
184 | 0 | case OP_EODN: |
185 | 0 | case OP_CIRC: |
186 | 0 | case OP_CIRCM: |
187 | 0 | case OP_DOLL: |
188 | 0 | case OP_DOLLM: |
189 | 0 | case OP_NOT_WORD_BOUNDARY: |
190 | 0 | case OP_WORD_BOUNDARY: |
191 | 0 | cc += PRIV(OP_lengths)[*cc]; |
192 | 0 | break; |
193 | | |
194 | | /* Skip over a subpattern that has a {0} or {0,x} quantifier */ |
195 | | |
196 | 0 | case OP_BRAZERO: |
197 | 0 | case OP_BRAMINZERO: |
198 | 0 | case OP_BRAPOSZERO: |
199 | 0 | case OP_SKIPZERO: |
200 | 0 | cc += PRIV(OP_lengths)[*cc]; |
201 | 0 | do cc += GET(cc, 1); while (*cc == OP_ALT); |
202 | 0 | cc += 1 + LINK_SIZE; |
203 | 0 | break; |
204 | | |
205 | | /* Handle literal characters and + repetitions */ |
206 | | |
207 | 0 | case OP_CHAR: |
208 | 0 | case OP_CHARI: |
209 | 0 | case OP_NOT: |
210 | 0 | case OP_NOTI: |
211 | 0 | case OP_PLUS: |
212 | 0 | case OP_PLUSI: |
213 | 0 | case OP_MINPLUS: |
214 | 0 | case OP_MINPLUSI: |
215 | 0 | case OP_POSPLUS: |
216 | 0 | case OP_POSPLUSI: |
217 | 0 | case OP_NOTPLUS: |
218 | 0 | case OP_NOTPLUSI: |
219 | 0 | case OP_NOTMINPLUS: |
220 | 0 | case OP_NOTMINPLUSI: |
221 | 0 | case OP_NOTPOSPLUS: |
222 | 0 | case OP_NOTPOSPLUSI: |
223 | 0 | branchlength++; |
224 | 0 | cc += 2; |
225 | 0 | #ifdef SUPPORT_UTF |
226 | 0 | if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); |
227 | 0 | #endif |
228 | 0 | break; |
229 | | |
230 | 0 | case OP_TYPEPLUS: |
231 | 0 | case OP_TYPEMINPLUS: |
232 | 0 | case OP_TYPEPOSPLUS: |
233 | 0 | branchlength++; |
234 | 0 | cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2; |
235 | 0 | break; |
236 | | |
237 | | /* Handle exact repetitions. The count is already in characters, but we |
238 | | need to skip over a multibyte character in UTF8 mode. */ |
239 | | |
240 | 0 | case OP_EXACT: |
241 | 0 | case OP_EXACTI: |
242 | 0 | case OP_NOTEXACT: |
243 | 0 | case OP_NOTEXACTI: |
244 | 0 | branchlength += GET2(cc,1); |
245 | 0 | cc += 2 + IMM2_SIZE; |
246 | 0 | #ifdef SUPPORT_UTF |
247 | 0 | if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); |
248 | 0 | #endif |
249 | 0 | break; |
250 | | |
251 | 0 | case OP_TYPEEXACT: |
252 | 0 | branchlength += GET2(cc,1); |
253 | 0 | cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP |
254 | 0 | || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); |
255 | 0 | break; |
256 | | |
257 | | /* Handle single-char non-literal matchers */ |
258 | | |
259 | 0 | case OP_PROP: |
260 | 0 | case OP_NOTPROP: |
261 | 0 | cc += 2; |
262 | | /* Fall through */ |
263 | |
|
264 | 0 | case OP_NOT_DIGIT: |
265 | 0 | case OP_DIGIT: |
266 | 0 | case OP_NOT_WHITESPACE: |
267 | 0 | case OP_WHITESPACE: |
268 | 0 | case OP_NOT_WORDCHAR: |
269 | 0 | case OP_WORDCHAR: |
270 | 0 | case OP_ANY: |
271 | 0 | case OP_ALLANY: |
272 | 0 | case OP_EXTUNI: |
273 | 0 | case OP_HSPACE: |
274 | 0 | case OP_NOT_HSPACE: |
275 | 0 | case OP_VSPACE: |
276 | 0 | case OP_NOT_VSPACE: |
277 | 0 | branchlength++; |
278 | 0 | cc++; |
279 | 0 | break; |
280 | | |
281 | | /* "Any newline" might match two characters, but it also might match just |
282 | | one. */ |
283 | | |
284 | 0 | case OP_ANYNL: |
285 | 0 | branchlength += 1; |
286 | 0 | cc++; |
287 | 0 | break; |
288 | | |
289 | | /* The single-byte matcher means we can't proceed in UTF-8 mode. (In |
290 | | non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever |
291 | | appear, but leave the code, just in case.) */ |
292 | | |
293 | 0 | case OP_ANYBYTE: |
294 | 0 | #ifdef SUPPORT_UTF |
295 | 0 | if (utf) return -1; |
296 | 0 | #endif |
297 | 0 | branchlength++; |
298 | 0 | cc++; |
299 | 0 | break; |
300 | | |
301 | | /* For repeated character types, we have to test for \p and \P, which have |
302 | | an extra two bytes of parameters. */ |
303 | | |
304 | 0 | case OP_TYPESTAR: |
305 | 0 | case OP_TYPEMINSTAR: |
306 | 0 | case OP_TYPEQUERY: |
307 | 0 | case OP_TYPEMINQUERY: |
308 | 0 | case OP_TYPEPOSSTAR: |
309 | 0 | case OP_TYPEPOSQUERY: |
310 | 0 | if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2; |
311 | 0 | cc += PRIV(OP_lengths)[op]; |
312 | 0 | break; |
313 | | |
314 | 0 | case OP_TYPEUPTO: |
315 | 0 | case OP_TYPEMINUPTO: |
316 | 0 | case OP_TYPEPOSUPTO: |
317 | 0 | if (cc[1 + IMM2_SIZE] == OP_PROP |
318 | 0 | || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2; |
319 | 0 | cc += PRIV(OP_lengths)[op]; |
320 | 0 | break; |
321 | | |
322 | | /* Check a class for variable quantification */ |
323 | | |
324 | 0 | #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 |
325 | 0 | case OP_XCLASS: |
326 | 0 | cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS]; |
327 | | /* Fall through */ |
328 | 0 | #endif |
329 | |
|
330 | 0 | case OP_CLASS: |
331 | 0 | case OP_NCLASS: |
332 | 0 | cc += PRIV(OP_lengths)[OP_CLASS]; |
333 | |
|
334 | 0 | switch (*cc) |
335 | 0 | { |
336 | 0 | case OP_CRPLUS: |
337 | 0 | case OP_CRMINPLUS: |
338 | 0 | branchlength++; |
339 | | /* Fall through */ |
340 | |
|
341 | 0 | case OP_CRSTAR: |
342 | 0 | case OP_CRMINSTAR: |
343 | 0 | case OP_CRQUERY: |
344 | 0 | case OP_CRMINQUERY: |
345 | 0 | cc++; |
346 | 0 | break; |
347 | | |
348 | 0 | case OP_CRRANGE: |
349 | 0 | case OP_CRMINRANGE: |
350 | 0 | branchlength += GET2(cc,1); |
351 | 0 | cc += 1 + 2 * IMM2_SIZE; |
352 | 0 | break; |
353 | | |
354 | 0 | default: |
355 | 0 | branchlength++; |
356 | 0 | break; |
357 | 0 | } |
358 | 0 | break; |
359 | | |
360 | | /* Backreferences and subroutine calls are treated in the same way: we find |
361 | | the minimum length for the subpattern. A recursion, however, causes an |
362 | | a flag to be set that causes the length of this branch to be ignored. The |
363 | | logic is that a recursion can only make sense if there is another |
364 | | alternation that stops the recursing. That will provide the minimum length |
365 | | (when no recursion happens). A backreference within the group that it is |
366 | | referencing behaves in the same way. |
367 | | |
368 | | If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket |
369 | | matches an empty string (by default it causes a matching failure), so in |
370 | | that case we must set the minimum length to zero. */ |
371 | | |
372 | 0 | case OP_REF: |
373 | 0 | case OP_REFI: |
374 | 0 | if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) |
375 | 0 | { |
376 | 0 | ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1)); |
377 | 0 | if (cs == NULL) return -2; |
378 | 0 | do ce += GET(ce, 1); while (*ce == OP_ALT); |
379 | 0 | if (cc > cs && cc < ce) |
380 | 0 | { |
381 | 0 | d = 0; |
382 | 0 | had_recurse = TRUE; |
383 | 0 | } |
384 | 0 | else |
385 | 0 | { |
386 | 0 | d = find_minlength(cs, startcode, options, recurse_depth); |
387 | 0 | } |
388 | 0 | } |
389 | 0 | else d = 0; |
390 | 0 | cc += 1 + IMM2_SIZE; |
391 | | |
392 | | /* Handle repeated back references */ |
393 | |
|
394 | 0 | switch (*cc) |
395 | 0 | { |
396 | 0 | case OP_CRSTAR: |
397 | 0 | case OP_CRMINSTAR: |
398 | 0 | case OP_CRQUERY: |
399 | 0 | case OP_CRMINQUERY: |
400 | 0 | min = 0; |
401 | 0 | cc++; |
402 | 0 | break; |
403 | | |
404 | 0 | case OP_CRPLUS: |
405 | 0 | case OP_CRMINPLUS: |
406 | 0 | min = 1; |
407 | 0 | cc++; |
408 | 0 | break; |
409 | | |
410 | 0 | case OP_CRRANGE: |
411 | 0 | case OP_CRMINRANGE: |
412 | 0 | min = GET2(cc, 1); |
413 | 0 | cc += 1 + 2 * IMM2_SIZE; |
414 | 0 | break; |
415 | | |
416 | 0 | default: |
417 | 0 | min = 1; |
418 | 0 | break; |
419 | 0 | } |
420 | | |
421 | 0 | branchlength += min * d; |
422 | 0 | break; |
423 | | |
424 | | /* We can easily detect direct recursion, but not mutual recursion. This is |
425 | | caught by a recursion depth count. */ |
426 | | |
427 | 0 | case OP_RECURSE: |
428 | 0 | cs = ce = (pcre_uchar *)startcode + GET(cc, 1); |
429 | 0 | do ce += GET(ce, 1); while (*ce == OP_ALT); |
430 | 0 | if ((cc > cs && cc < ce) || recurse_depth > 10) |
431 | 0 | had_recurse = TRUE; |
432 | 0 | else |
433 | 0 | { |
434 | 0 | branchlength += find_minlength(cs, startcode, options, recurse_depth + 1); |
435 | 0 | } |
436 | 0 | cc += 1 + LINK_SIZE; |
437 | 0 | break; |
438 | | |
439 | | /* Anything else does not or need not match a character. We can get the |
440 | | item's length from the table, but for those that can match zero occurrences |
441 | | of a character, we must take special action for UTF-8 characters. As it |
442 | | happens, the "NOT" versions of these opcodes are used at present only for |
443 | | ASCII characters, so they could be omitted from this list. However, in |
444 | | future that may change, so we include them here so as not to leave a |
445 | | gotcha for a future maintainer. */ |
446 | | |
447 | 0 | case OP_UPTO: |
448 | 0 | case OP_UPTOI: |
449 | 0 | case OP_NOTUPTO: |
450 | 0 | case OP_NOTUPTOI: |
451 | 0 | case OP_MINUPTO: |
452 | 0 | case OP_MINUPTOI: |
453 | 0 | case OP_NOTMINUPTO: |
454 | 0 | case OP_NOTMINUPTOI: |
455 | 0 | case OP_POSUPTO: |
456 | 0 | case OP_POSUPTOI: |
457 | 0 | case OP_NOTPOSUPTO: |
458 | 0 | case OP_NOTPOSUPTOI: |
459 | |
|
460 | 0 | case OP_STAR: |
461 | 0 | case OP_STARI: |
462 | 0 | case OP_NOTSTAR: |
463 | 0 | case OP_NOTSTARI: |
464 | 0 | case OP_MINSTAR: |
465 | 0 | case OP_MINSTARI: |
466 | 0 | case OP_NOTMINSTAR: |
467 | 0 | case OP_NOTMINSTARI: |
468 | 0 | case OP_POSSTAR: |
469 | 0 | case OP_POSSTARI: |
470 | 0 | case OP_NOTPOSSTAR: |
471 | 0 | case OP_NOTPOSSTARI: |
472 | |
|
473 | 0 | case OP_QUERY: |
474 | 0 | case OP_QUERYI: |
475 | 0 | case OP_NOTQUERY: |
476 | 0 | case OP_NOTQUERYI: |
477 | 0 | case OP_MINQUERY: |
478 | 0 | case OP_MINQUERYI: |
479 | 0 | case OP_NOTMINQUERY: |
480 | 0 | case OP_NOTMINQUERYI: |
481 | 0 | case OP_POSQUERY: |
482 | 0 | case OP_POSQUERYI: |
483 | 0 | case OP_NOTPOSQUERY: |
484 | 0 | case OP_NOTPOSQUERYI: |
485 | |
|
486 | 0 | cc += PRIV(OP_lengths)[op]; |
487 | 0 | #ifdef SUPPORT_UTF |
488 | 0 | if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); |
489 | 0 | #endif |
490 | 0 | break; |
491 | | |
492 | | /* Skip these, but we need to add in the name length. */ |
493 | | |
494 | 0 | case OP_MARK: |
495 | 0 | case OP_PRUNE_ARG: |
496 | 0 | case OP_SKIP_ARG: |
497 | 0 | case OP_THEN_ARG: |
498 | 0 | cc += PRIV(OP_lengths)[op] + cc[1]; |
499 | 0 | break; |
500 | | |
501 | | /* The remaining opcodes are just skipped over. */ |
502 | | |
503 | 0 | case OP_CLOSE: |
504 | 0 | case OP_COMMIT: |
505 | 0 | case OP_FAIL: |
506 | 0 | case OP_PRUNE: |
507 | 0 | case OP_SET_SOM: |
508 | 0 | case OP_SKIP: |
509 | 0 | case OP_THEN: |
510 | 0 | cc += PRIV(OP_lengths)[op]; |
511 | 0 | break; |
512 | | |
513 | | /* This should not occur: we list all opcodes explicitly so that when |
514 | | new ones get added they are properly considered. */ |
515 | | |
516 | 0 | default: |
517 | 0 | return -3; |
518 | 0 | } |
519 | 0 | } |
520 | | /* Control never gets here */ |
521 | 0 | } |
522 | | |
523 | | |
524 | | |
525 | | /************************************************* |
526 | | * Set a bit and maybe its alternate case * |
527 | | *************************************************/ |
528 | | |
529 | | /* Given a character, set its first byte's bit in the table, and also the |
530 | | corresponding bit for the other version of a letter if we are caseless. In |
531 | | UTF-8 mode, for characters greater than 127, we can only do the caseless thing |
532 | | when Unicode property support is available. |
533 | | |
534 | | Arguments: |
535 | | start_bits points to the bit map |
536 | | p points to the character |
537 | | caseless the caseless flag |
538 | | cd the block with char table pointers |
539 | | utf TRUE for UTF-8 / UTF-16 mode |
540 | | |
541 | | Returns: pointer after the character |
542 | | */ |
543 | | |
544 | | static const pcre_uchar * |
545 | | set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless, |
546 | | compile_data *cd, BOOL utf) |
547 | 0 | { |
548 | 0 | unsigned int c = *p; |
549 | |
|
550 | 0 | #ifdef COMPILE_PCRE8 |
551 | 0 | SET_BIT(c); |
552 | |
|
553 | 0 | #ifdef SUPPORT_UTF |
554 | 0 | if (utf && c > 127) |
555 | 0 | { |
556 | 0 | GETCHARINC(c, p); |
557 | 0 | #ifdef SUPPORT_UCP |
558 | 0 | if (caseless) |
559 | 0 | { |
560 | 0 | pcre_uchar buff[6]; |
561 | 0 | c = UCD_OTHERCASE(c); |
562 | 0 | (void)PRIV(ord2utf)(c, buff); |
563 | 0 | SET_BIT(buff[0]); |
564 | 0 | } |
565 | 0 | #endif |
566 | 0 | return p; |
567 | 0 | } |
568 | 0 | #endif |
569 | | |
570 | | /* Not UTF-8 mode, or character is less than 127. */ |
571 | | |
572 | 0 | if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); |
573 | 0 | return p + 1; |
574 | 0 | #endif |
575 | |
|
576 | | #ifdef COMPILE_PCRE16 |
577 | | if (c > 0xff) |
578 | | { |
579 | | c = 0xff; |
580 | | caseless = FALSE; |
581 | | } |
582 | | SET_BIT(c); |
583 | | |
584 | | #ifdef SUPPORT_UTF |
585 | | if (utf && c > 127) |
586 | | { |
587 | | GETCHARINC(c, p); |
588 | | #ifdef SUPPORT_UCP |
589 | | if (caseless) |
590 | | { |
591 | | c = UCD_OTHERCASE(c); |
592 | | if (c > 0xff) |
593 | | c = 0xff; |
594 | | SET_BIT(c); |
595 | | } |
596 | | #endif |
597 | | return p; |
598 | | } |
599 | | #endif |
600 | | |
601 | | if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); |
602 | | return p + 1; |
603 | | #endif |
604 | 0 | } |
605 | | |
606 | | |
607 | | |
608 | | /************************************************* |
609 | | * Set bits for a positive character type * |
610 | | *************************************************/ |
611 | | |
612 | | /* This function sets starting bits for a character type. In UTF-8 mode, we can |
613 | | only do a direct setting for bytes less than 128, as otherwise there can be |
614 | | confusion with bytes in the middle of UTF-8 characters. In a "traditional" |
615 | | environment, the tables will only recognize ASCII characters anyway, but in at |
616 | | least one Windows environment, some higher bytes bits were set in the tables. |
617 | | So we deal with that case by considering the UTF-8 encoding. |
618 | | |
619 | | Arguments: |
620 | | start_bits the starting bitmap |
621 | | cbit type the type of character wanted |
622 | | table_limit 32 for non-UTF-8; 16 for UTF-8 |
623 | | cd the block with char table pointers |
624 | | |
625 | | Returns: nothing |
626 | | */ |
627 | | |
628 | | static void |
629 | | set_type_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit, |
630 | | compile_data *cd) |
631 | 0 | { |
632 | 0 | int c; |
633 | 0 | for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type]; |
634 | 0 | #if defined SUPPORT_UTF && defined COMPILE_PCRE8 |
635 | 0 | if (table_limit == 32) return; |
636 | 0 | for (c = 128; c < 256; c++) |
637 | 0 | { |
638 | 0 | if ((cd->cbits[c/8] & (1 << (c&7))) != 0) |
639 | 0 | { |
640 | 0 | pcre_uchar buff[6]; |
641 | 0 | (void)PRIV(ord2utf)(c, buff); |
642 | 0 | SET_BIT(buff[0]); |
643 | 0 | } |
644 | 0 | } |
645 | 0 | #endif |
646 | 0 | } |
647 | | |
648 | | |
649 | | /************************************************* |
650 | | * Set bits for a negative character type * |
651 | | *************************************************/ |
652 | | |
653 | | /* This function sets starting bits for a negative character type such as \D. |
654 | | In UTF-8 mode, we can only do a direct setting for bytes less than 128, as |
655 | | otherwise there can be confusion with bytes in the middle of UTF-8 characters. |
656 | | Unlike in the positive case, where we can set appropriate starting bits for |
657 | | specific high-valued UTF-8 characters, in this case we have to set the bits for |
658 | | all high-valued characters. The lowest is 0xc2, but we overkill by starting at |
659 | | 0xc0 (192) for simplicity. |
660 | | |
661 | | Arguments: |
662 | | start_bits the starting bitmap |
663 | | cbit type the type of character wanted |
664 | | table_limit 32 for non-UTF-8; 16 for UTF-8 |
665 | | cd the block with char table pointers |
666 | | |
667 | | Returns: nothing |
668 | | */ |
669 | | |
670 | | static void |
671 | | set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit, |
672 | | compile_data *cd) |
673 | 0 | { |
674 | 0 | int c; |
675 | 0 | for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type]; |
676 | 0 | #if defined SUPPORT_UTF && defined COMPILE_PCRE8 |
677 | 0 | if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff; |
678 | 0 | #endif |
679 | 0 | } |
680 | | |
681 | | |
682 | | |
683 | | /************************************************* |
684 | | * Create bitmap of starting bytes * |
685 | | *************************************************/ |
686 | | |
687 | | /* This function scans a compiled unanchored expression recursively and |
688 | | attempts to build a bitmap of the set of possible starting bytes. As time goes |
689 | | by, we may be able to get more clever at doing this. The SSB_CONTINUE return is |
690 | | useful for parenthesized groups in patterns such as (a*)b where the group |
691 | | provides some optional starting bytes but scanning must continue at the outer |
692 | | level to find at least one mandatory byte. At the outermost level, this |
693 | | function fails unless the result is SSB_DONE. |
694 | | |
695 | | Arguments: |
696 | | code points to an expression |
697 | | start_bits points to a 32-byte table, initialized to 0 |
698 | | utf TRUE if in UTF-8 / UTF-16 mode |
699 | | cd the block with char table pointers |
700 | | |
701 | | Returns: SSB_FAIL => Failed to find any starting bytes |
702 | | SSB_DONE => Found mandatory starting bytes |
703 | | SSB_CONTINUE => Found optional starting bytes |
704 | | SSB_UNKNOWN => Hit an unrecognized opcode |
705 | | */ |
706 | | |
707 | | static int |
708 | | set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf, |
709 | | compile_data *cd) |
710 | 0 | { |
711 | 0 | int c; |
712 | 0 | int yield = SSB_DONE; |
713 | 0 | #if defined SUPPORT_UTF && defined COMPILE_PCRE8 |
714 | 0 | int table_limit = utf? 16:32; |
715 | | #else |
716 | | int table_limit = 32; |
717 | | #endif |
718 | |
|
719 | | #if 0 |
720 | | /* ========================================================================= */ |
721 | | /* The following comment and code was inserted in January 1999. In May 2006, |
722 | | when it was observed to cause compiler warnings about unused values, I took it |
723 | | out again. If anybody is still using OS/2, they will have to put it back |
724 | | manually. */ |
725 | | |
726 | | /* This next statement and the later reference to dummy are here in order to |
727 | | trick the optimizer of the IBM C compiler for OS/2 into generating correct |
728 | | code. Apparently IBM isn't going to fix the problem, and we would rather not |
729 | | disable optimization (in this module it actually makes a big difference, and |
730 | | the pcre module can use all the optimization it can get). */ |
731 | | |
732 | | volatile int dummy; |
733 | | /* ========================================================================= */ |
734 | | #endif |
735 | |
|
736 | 0 | do |
737 | 0 | { |
738 | 0 | BOOL try_next = TRUE; |
739 | 0 | const pcre_uchar *tcode = code + 1 + LINK_SIZE; |
740 | |
|
741 | 0 | if (*code == OP_CBRA || *code == OP_SCBRA || |
742 | 0 | *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE; |
743 | |
|
744 | 0 | while (try_next) /* Loop for items in this branch */ |
745 | 0 | { |
746 | 0 | int rc; |
747 | |
|
748 | 0 | switch(*tcode) |
749 | 0 | { |
750 | | /* If we reach something we don't understand, it means a new opcode has |
751 | | been created that hasn't been added to this code. Hopefully this problem |
752 | | will be discovered during testing. */ |
753 | | |
754 | 0 | default: |
755 | 0 | return SSB_UNKNOWN; |
756 | | |
757 | | /* Fail for a valid opcode that implies no starting bits. */ |
758 | | |
759 | 0 | case OP_ACCEPT: |
760 | 0 | case OP_ASSERT_ACCEPT: |
761 | 0 | case OP_ALLANY: |
762 | 0 | case OP_ANY: |
763 | 0 | case OP_ANYBYTE: |
764 | 0 | case OP_CIRC: |
765 | 0 | case OP_CIRCM: |
766 | 0 | case OP_CLOSE: |
767 | 0 | case OP_COMMIT: |
768 | 0 | case OP_COND: |
769 | 0 | case OP_CREF: |
770 | 0 | case OP_DEF: |
771 | 0 | case OP_DOLL: |
772 | 0 | case OP_DOLLM: |
773 | 0 | case OP_END: |
774 | 0 | case OP_EOD: |
775 | 0 | case OP_EODN: |
776 | 0 | case OP_EXTUNI: |
777 | 0 | case OP_FAIL: |
778 | 0 | case OP_MARK: |
779 | 0 | case OP_NCREF: |
780 | 0 | case OP_NOT: |
781 | 0 | case OP_NOTEXACT: |
782 | 0 | case OP_NOTEXACTI: |
783 | 0 | case OP_NOTI: |
784 | 0 | case OP_NOTMINPLUS: |
785 | 0 | case OP_NOTMINPLUSI: |
786 | 0 | case OP_NOTMINQUERY: |
787 | 0 | case OP_NOTMINQUERYI: |
788 | 0 | case OP_NOTMINSTAR: |
789 | 0 | case OP_NOTMINSTARI: |
790 | 0 | case OP_NOTMINUPTO: |
791 | 0 | case OP_NOTMINUPTOI: |
792 | 0 | case OP_NOTPLUS: |
793 | 0 | case OP_NOTPLUSI: |
794 | 0 | case OP_NOTPOSPLUS: |
795 | 0 | case OP_NOTPOSPLUSI: |
796 | 0 | case OP_NOTPOSQUERY: |
797 | 0 | case OP_NOTPOSQUERYI: |
798 | 0 | case OP_NOTPOSSTAR: |
799 | 0 | case OP_NOTPOSSTARI: |
800 | 0 | case OP_NOTPOSUPTO: |
801 | 0 | case OP_NOTPOSUPTOI: |
802 | 0 | case OP_NOTPROP: |
803 | 0 | case OP_NOTQUERY: |
804 | 0 | case OP_NOTQUERYI: |
805 | 0 | case OP_NOTSTAR: |
806 | 0 | case OP_NOTSTARI: |
807 | 0 | case OP_NOTUPTO: |
808 | 0 | case OP_NOTUPTOI: |
809 | 0 | case OP_NOT_HSPACE: |
810 | 0 | case OP_NOT_VSPACE: |
811 | 0 | case OP_NRREF: |
812 | 0 | case OP_PROP: |
813 | 0 | case OP_PRUNE: |
814 | 0 | case OP_PRUNE_ARG: |
815 | 0 | case OP_RECURSE: |
816 | 0 | case OP_REF: |
817 | 0 | case OP_REFI: |
818 | 0 | case OP_REVERSE: |
819 | 0 | case OP_RREF: |
820 | 0 | case OP_SCOND: |
821 | 0 | case OP_SET_SOM: |
822 | 0 | case OP_SKIP: |
823 | 0 | case OP_SKIP_ARG: |
824 | 0 | case OP_SOD: |
825 | 0 | case OP_SOM: |
826 | 0 | case OP_THEN: |
827 | 0 | case OP_THEN_ARG: |
828 | 0 | #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 |
829 | 0 | case OP_XCLASS: |
830 | 0 | #endif |
831 | 0 | return SSB_FAIL; |
832 | | |
833 | | /* We can ignore word boundary tests. */ |
834 | | |
835 | 0 | case OP_WORD_BOUNDARY: |
836 | 0 | case OP_NOT_WORD_BOUNDARY: |
837 | 0 | tcode++; |
838 | 0 | break; |
839 | | |
840 | | /* If we hit a bracket or a positive lookahead assertion, recurse to set |
841 | | bits from within the subpattern. If it can't find anything, we have to |
842 | | give up. If it finds some mandatory character(s), we are done for this |
843 | | branch. Otherwise, carry on scanning after the subpattern. */ |
844 | | |
845 | 0 | case OP_BRA: |
846 | 0 | case OP_SBRA: |
847 | 0 | case OP_CBRA: |
848 | 0 | case OP_SCBRA: |
849 | 0 | case OP_BRAPOS: |
850 | 0 | case OP_SBRAPOS: |
851 | 0 | case OP_CBRAPOS: |
852 | 0 | case OP_SCBRAPOS: |
853 | 0 | case OP_ONCE: |
854 | 0 | case OP_ONCE_NC: |
855 | 0 | case OP_ASSERT: |
856 | 0 | rc = set_start_bits(tcode, start_bits, utf, cd); |
857 | 0 | if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; |
858 | 0 | if (rc == SSB_DONE) try_next = FALSE; else |
859 | 0 | { |
860 | 0 | do tcode += GET(tcode, 1); while (*tcode == OP_ALT); |
861 | 0 | tcode += 1 + LINK_SIZE; |
862 | 0 | } |
863 | 0 | break; |
864 | | |
865 | | /* If we hit ALT or KET, it means we haven't found anything mandatory in |
866 | | this branch, though we might have found something optional. For ALT, we |
867 | | continue with the next alternative, but we have to arrange that the final |
868 | | result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET, |
869 | | return SSB_CONTINUE: if this is the top level, that indicates failure, |
870 | | but after a nested subpattern, it causes scanning to continue. */ |
871 | | |
872 | 0 | case OP_ALT: |
873 | 0 | yield = SSB_CONTINUE; |
874 | 0 | try_next = FALSE; |
875 | 0 | break; |
876 | | |
877 | 0 | case OP_KET: |
878 | 0 | case OP_KETRMAX: |
879 | 0 | case OP_KETRMIN: |
880 | 0 | case OP_KETRPOS: |
881 | 0 | return SSB_CONTINUE; |
882 | | |
883 | | /* Skip over callout */ |
884 | | |
885 | 0 | case OP_CALLOUT: |
886 | 0 | tcode += 2 + 2*LINK_SIZE; |
887 | 0 | break; |
888 | | |
889 | | /* Skip over lookbehind and negative lookahead assertions */ |
890 | | |
891 | 0 | case OP_ASSERT_NOT: |
892 | 0 | case OP_ASSERTBACK: |
893 | 0 | case OP_ASSERTBACK_NOT: |
894 | 0 | do tcode += GET(tcode, 1); while (*tcode == OP_ALT); |
895 | 0 | tcode += 1 + LINK_SIZE; |
896 | 0 | break; |
897 | | |
898 | | /* BRAZERO does the bracket, but carries on. */ |
899 | | |
900 | 0 | case OP_BRAZERO: |
901 | 0 | case OP_BRAMINZERO: |
902 | 0 | case OP_BRAPOSZERO: |
903 | 0 | rc = set_start_bits(++tcode, start_bits, utf, cd); |
904 | 0 | if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; |
905 | | /* ========================================================================= |
906 | | See the comment at the head of this function concerning the next line, |
907 | | which was an old fudge for the benefit of OS/2. |
908 | | dummy = 1; |
909 | | ========================================================================= */ |
910 | 0 | do tcode += GET(tcode,1); while (*tcode == OP_ALT); |
911 | 0 | tcode += 1 + LINK_SIZE; |
912 | 0 | break; |
913 | | |
914 | | /* SKIPZERO skips the bracket. */ |
915 | | |
916 | 0 | case OP_SKIPZERO: |
917 | 0 | tcode++; |
918 | 0 | do tcode += GET(tcode,1); while (*tcode == OP_ALT); |
919 | 0 | tcode += 1 + LINK_SIZE; |
920 | 0 | break; |
921 | | |
922 | | /* Single-char * or ? sets the bit and tries the next item */ |
923 | | |
924 | 0 | case OP_STAR: |
925 | 0 | case OP_MINSTAR: |
926 | 0 | case OP_POSSTAR: |
927 | 0 | case OP_QUERY: |
928 | 0 | case OP_MINQUERY: |
929 | 0 | case OP_POSQUERY: |
930 | 0 | tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf); |
931 | 0 | break; |
932 | | |
933 | 0 | case OP_STARI: |
934 | 0 | case OP_MINSTARI: |
935 | 0 | case OP_POSSTARI: |
936 | 0 | case OP_QUERYI: |
937 | 0 | case OP_MINQUERYI: |
938 | 0 | case OP_POSQUERYI: |
939 | 0 | tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf); |
940 | 0 | break; |
941 | | |
942 | | /* Single-char upto sets the bit and tries the next */ |
943 | | |
944 | 0 | case OP_UPTO: |
945 | 0 | case OP_MINUPTO: |
946 | 0 | case OP_POSUPTO: |
947 | 0 | tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf); |
948 | 0 | break; |
949 | | |
950 | 0 | case OP_UPTOI: |
951 | 0 | case OP_MINUPTOI: |
952 | 0 | case OP_POSUPTOI: |
953 | 0 | tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf); |
954 | 0 | break; |
955 | | |
956 | | /* At least one single char sets the bit and stops */ |
957 | | |
958 | 0 | case OP_EXACT: |
959 | 0 | tcode += IMM2_SIZE; |
960 | | /* Fall through */ |
961 | 0 | case OP_CHAR: |
962 | 0 | case OP_PLUS: |
963 | 0 | case OP_MINPLUS: |
964 | 0 | case OP_POSPLUS: |
965 | 0 | (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf); |
966 | 0 | try_next = FALSE; |
967 | 0 | break; |
968 | | |
969 | 0 | case OP_EXACTI: |
970 | 0 | tcode += IMM2_SIZE; |
971 | | /* Fall through */ |
972 | 0 | case OP_CHARI: |
973 | 0 | case OP_PLUSI: |
974 | 0 | case OP_MINPLUSI: |
975 | 0 | case OP_POSPLUSI: |
976 | 0 | (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf); |
977 | 0 | try_next = FALSE; |
978 | 0 | break; |
979 | | |
980 | | /* Special spacing and line-terminating items. These recognize specific |
981 | | lists of characters. The difference between VSPACE and ANYNL is that the |
982 | | latter can match the two-character CRLF sequence, but that is not |
983 | | relevant for finding the first character, so their code here is |
984 | | identical. */ |
985 | | |
986 | 0 | case OP_HSPACE: |
987 | 0 | SET_BIT(0x09); |
988 | 0 | SET_BIT(0x20); |
989 | 0 | #ifdef SUPPORT_UTF |
990 | 0 | if (utf) |
991 | 0 | { |
992 | 0 | #ifdef COMPILE_PCRE8 |
993 | 0 | SET_BIT(0xC2); /* For U+00A0 */ |
994 | 0 | SET_BIT(0xE1); /* For U+1680, U+180E */ |
995 | 0 | SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ |
996 | 0 | SET_BIT(0xE3); /* For U+3000 */ |
997 | 0 | #endif |
998 | | #ifdef COMPILE_PCRE16 |
999 | | SET_BIT(0xA0); |
1000 | | SET_BIT(0xFF); /* For characters > 255 */ |
1001 | | #endif |
1002 | 0 | } |
1003 | 0 | else |
1004 | 0 | #endif /* SUPPORT_UTF */ |
1005 | 0 | { |
1006 | 0 | SET_BIT(0xA0); |
1007 | | #ifdef COMPILE_PCRE16 |
1008 | | SET_BIT(0xFF); /* For characters > 255 */ |
1009 | | #endif |
1010 | 0 | } |
1011 | 0 | try_next = FALSE; |
1012 | 0 | break; |
1013 | | |
1014 | 0 | case OP_ANYNL: |
1015 | 0 | case OP_VSPACE: |
1016 | 0 | SET_BIT(0x0A); |
1017 | 0 | SET_BIT(0x0B); |
1018 | 0 | SET_BIT(0x0C); |
1019 | 0 | SET_BIT(0x0D); |
1020 | 0 | #ifdef SUPPORT_UTF |
1021 | 0 | if (utf) |
1022 | 0 | { |
1023 | 0 | #ifdef COMPILE_PCRE8 |
1024 | 0 | SET_BIT(0xC2); /* For U+0085 */ |
1025 | 0 | SET_BIT(0xE2); /* For U+2028, U+2029 */ |
1026 | 0 | #endif |
1027 | | #ifdef COMPILE_PCRE16 |
1028 | | SET_BIT(0x85); |
1029 | | SET_BIT(0xFF); /* For characters > 255 */ |
1030 | | #endif |
1031 | 0 | } |
1032 | 0 | else |
1033 | 0 | #endif /* SUPPORT_UTF */ |
1034 | 0 | { |
1035 | 0 | SET_BIT(0x85); |
1036 | | #ifdef COMPILE_PCRE16 |
1037 | | SET_BIT(0xFF); /* For characters > 255 */ |
1038 | | #endif |
1039 | 0 | } |
1040 | 0 | try_next = FALSE; |
1041 | 0 | break; |
1042 | | |
1043 | | /* Single character types set the bits and stop. Note that if PCRE_UCP |
1044 | | is set, we do not see these op codes because \d etc are converted to |
1045 | | properties. Therefore, these apply in the case when only characters less |
1046 | | than 256 are recognized to match the types. */ |
1047 | | |
1048 | 0 | case OP_NOT_DIGIT: |
1049 | 0 | set_nottype_bits(start_bits, cbit_digit, table_limit, cd); |
1050 | 0 | try_next = FALSE; |
1051 | 0 | break; |
1052 | | |
1053 | 0 | case OP_DIGIT: |
1054 | 0 | set_type_bits(start_bits, cbit_digit, table_limit, cd); |
1055 | 0 | try_next = FALSE; |
1056 | 0 | break; |
1057 | | |
1058 | | /* The cbit_space table has vertical tab as whitespace; we have to |
1059 | | ensure it is set as not whitespace. */ |
1060 | | |
1061 | 0 | case OP_NOT_WHITESPACE: |
1062 | 0 | set_nottype_bits(start_bits, cbit_space, table_limit, cd); |
1063 | 0 | start_bits[1] |= 0x08; |
1064 | 0 | try_next = FALSE; |
1065 | 0 | break; |
1066 | | |
1067 | | /* The cbit_space table has vertical tab as whitespace; we have to |
1068 | | not set it from the table. */ |
1069 | | |
1070 | 0 | case OP_WHITESPACE: |
1071 | 0 | c = start_bits[1]; /* Save in case it was already set */ |
1072 | 0 | set_type_bits(start_bits, cbit_space, table_limit, cd); |
1073 | 0 | start_bits[1] = (start_bits[1] & ~0x08) | c; |
1074 | 0 | try_next = FALSE; |
1075 | 0 | break; |
1076 | | |
1077 | 0 | case OP_NOT_WORDCHAR: |
1078 | 0 | set_nottype_bits(start_bits, cbit_word, table_limit, cd); |
1079 | 0 | try_next = FALSE; |
1080 | 0 | break; |
1081 | | |
1082 | 0 | case OP_WORDCHAR: |
1083 | 0 | set_type_bits(start_bits, cbit_word, table_limit, cd); |
1084 | 0 | try_next = FALSE; |
1085 | 0 | break; |
1086 | | |
1087 | | /* One or more character type fudges the pointer and restarts, knowing |
1088 | | it will hit a single character type and stop there. */ |
1089 | | |
1090 | 0 | case OP_TYPEPLUS: |
1091 | 0 | case OP_TYPEMINPLUS: |
1092 | 0 | case OP_TYPEPOSPLUS: |
1093 | 0 | tcode++; |
1094 | 0 | break; |
1095 | | |
1096 | 0 | case OP_TYPEEXACT: |
1097 | 0 | tcode += 1 + IMM2_SIZE; |
1098 | 0 | break; |
1099 | | |
1100 | | /* Zero or more repeats of character types set the bits and then |
1101 | | try again. */ |
1102 | | |
1103 | 0 | case OP_TYPEUPTO: |
1104 | 0 | case OP_TYPEMINUPTO: |
1105 | 0 | case OP_TYPEPOSUPTO: |
1106 | 0 | tcode += IMM2_SIZE; /* Fall through */ |
1107 | |
|
1108 | 0 | case OP_TYPESTAR: |
1109 | 0 | case OP_TYPEMINSTAR: |
1110 | 0 | case OP_TYPEPOSSTAR: |
1111 | 0 | case OP_TYPEQUERY: |
1112 | 0 | case OP_TYPEMINQUERY: |
1113 | 0 | case OP_TYPEPOSQUERY: |
1114 | 0 | switch(tcode[1]) |
1115 | 0 | { |
1116 | 0 | default: |
1117 | 0 | case OP_ANY: |
1118 | 0 | case OP_ALLANY: |
1119 | 0 | return SSB_FAIL; |
1120 | | |
1121 | 0 | case OP_HSPACE: |
1122 | 0 | SET_BIT(0x09); |
1123 | 0 | SET_BIT(0x20); |
1124 | 0 | #ifdef SUPPORT_UTF |
1125 | 0 | if (utf) |
1126 | 0 | { |
1127 | 0 | #ifdef COMPILE_PCRE8 |
1128 | 0 | SET_BIT(0xC2); /* For U+00A0 */ |
1129 | 0 | SET_BIT(0xE1); /* For U+1680, U+180E */ |
1130 | 0 | SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ |
1131 | 0 | SET_BIT(0xE3); /* For U+3000 */ |
1132 | 0 | #endif |
1133 | | #ifdef COMPILE_PCRE16 |
1134 | | SET_BIT(0xA0); |
1135 | | SET_BIT(0xFF); /* For characters > 255 */ |
1136 | | #endif |
1137 | 0 | } |
1138 | 0 | else |
1139 | 0 | #endif /* SUPPORT_UTF */ |
1140 | 0 | SET_BIT(0xA0); |
1141 | 0 | break; |
1142 | | |
1143 | 0 | case OP_ANYNL: |
1144 | 0 | case OP_VSPACE: |
1145 | 0 | SET_BIT(0x0A); |
1146 | 0 | SET_BIT(0x0B); |
1147 | 0 | SET_BIT(0x0C); |
1148 | 0 | SET_BIT(0x0D); |
1149 | 0 | #ifdef SUPPORT_UTF |
1150 | 0 | if (utf) |
1151 | 0 | { |
1152 | 0 | #ifdef COMPILE_PCRE8 |
1153 | 0 | SET_BIT(0xC2); /* For U+0085 */ |
1154 | 0 | SET_BIT(0xE2); /* For U+2028, U+2029 */ |
1155 | 0 | #endif |
1156 | | #ifdef COMPILE_PCRE16 |
1157 | | SET_BIT(0x85); |
1158 | | SET_BIT(0xFF); /* For characters > 255 */ |
1159 | | #endif |
1160 | 0 | } |
1161 | 0 | else |
1162 | 0 | #endif /* SUPPORT_UTF */ |
1163 | 0 | SET_BIT(0x85); |
1164 | 0 | break; |
1165 | | |
1166 | 0 | case OP_NOT_DIGIT: |
1167 | 0 | set_nottype_bits(start_bits, cbit_digit, table_limit, cd); |
1168 | 0 | break; |
1169 | | |
1170 | 0 | case OP_DIGIT: |
1171 | 0 | set_type_bits(start_bits, cbit_digit, table_limit, cd); |
1172 | 0 | break; |
1173 | | |
1174 | | /* The cbit_space table has vertical tab as whitespace; we have to |
1175 | | ensure it gets set as not whitespace. */ |
1176 | | |
1177 | 0 | case OP_NOT_WHITESPACE: |
1178 | 0 | set_nottype_bits(start_bits, cbit_space, table_limit, cd); |
1179 | 0 | start_bits[1] |= 0x08; |
1180 | 0 | break; |
1181 | | |
1182 | | /* The cbit_space table has vertical tab as whitespace; we have to |
1183 | | avoid setting it. */ |
1184 | | |
1185 | 0 | case OP_WHITESPACE: |
1186 | 0 | c = start_bits[1]; /* Save in case it was already set */ |
1187 | 0 | set_type_bits(start_bits, cbit_space, table_limit, cd); |
1188 | 0 | start_bits[1] = (start_bits[1] & ~0x08) | c; |
1189 | 0 | break; |
1190 | | |
1191 | 0 | case OP_NOT_WORDCHAR: |
1192 | 0 | set_nottype_bits(start_bits, cbit_word, table_limit, cd); |
1193 | 0 | break; |
1194 | | |
1195 | 0 | case OP_WORDCHAR: |
1196 | 0 | set_type_bits(start_bits, cbit_word, table_limit, cd); |
1197 | 0 | break; |
1198 | 0 | } |
1199 | | |
1200 | 0 | tcode += 2; |
1201 | 0 | break; |
1202 | | |
1203 | | /* Character class where all the information is in a bit map: set the |
1204 | | bits and either carry on or not, according to the repeat count. If it was |
1205 | | a negative class, and we are operating with UTF-8 characters, any byte |
1206 | | with a value >= 0xc4 is a potentially valid starter because it starts a |
1207 | | character with a value > 255. */ |
1208 | | |
1209 | 0 | case OP_NCLASS: |
1210 | 0 | #if defined SUPPORT_UTF && defined COMPILE_PCRE8 |
1211 | 0 | if (utf) |
1212 | 0 | { |
1213 | 0 | start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ |
1214 | 0 | memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */ |
1215 | 0 | } |
1216 | 0 | #endif |
1217 | | #ifdef COMPILE_PCRE16 |
1218 | | SET_BIT(0xFF); /* For characters > 255 */ |
1219 | | #endif |
1220 | | /* Fall through */ |
1221 | |
|
1222 | 0 | case OP_CLASS: |
1223 | 0 | { |
1224 | 0 | pcre_uint8 *map; |
1225 | 0 | tcode++; |
1226 | 0 | map = (pcre_uint8 *)tcode; |
1227 | | |
1228 | | /* In UTF-8 mode, the bits in a bit map correspond to character |
1229 | | values, not to byte values. However, the bit map we are constructing is |
1230 | | for byte values. So we have to do a conversion for characters whose |
1231 | | value is > 127. In fact, there are only two possible starting bytes for |
1232 | | characters in the range 128 - 255. */ |
1233 | |
|
1234 | 0 | #if defined SUPPORT_UTF && defined COMPILE_PCRE8 |
1235 | 0 | if (utf) |
1236 | 0 | { |
1237 | 0 | for (c = 0; c < 16; c++) start_bits[c] |= map[c]; |
1238 | 0 | for (c = 128; c < 256; c++) |
1239 | 0 | { |
1240 | 0 | if ((map[c/8] && (1 << (c&7))) != 0) |
1241 | 0 | { |
1242 | 0 | int d = (c >> 6) | 0xc0; /* Set bit for this starter */ |
1243 | 0 | start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */ |
1244 | 0 | c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */ |
1245 | 0 | } |
1246 | 0 | } |
1247 | 0 | } |
1248 | 0 | else |
1249 | 0 | #endif |
1250 | 0 | { |
1251 | | /* In non-UTF-8 mode, the two bit maps are completely compatible. */ |
1252 | 0 | for (c = 0; c < 32; c++) start_bits[c] |= map[c]; |
1253 | 0 | } |
1254 | | |
1255 | | /* Advance past the bit map, and act on what follows. For a zero |
1256 | | minimum repeat, continue; otherwise stop processing. */ |
1257 | |
|
1258 | 0 | tcode += 32 / sizeof(pcre_uchar); |
1259 | 0 | switch (*tcode) |
1260 | 0 | { |
1261 | 0 | case OP_CRSTAR: |
1262 | 0 | case OP_CRMINSTAR: |
1263 | 0 | case OP_CRQUERY: |
1264 | 0 | case OP_CRMINQUERY: |
1265 | 0 | tcode++; |
1266 | 0 | break; |
1267 | | |
1268 | 0 | case OP_CRRANGE: |
1269 | 0 | case OP_CRMINRANGE: |
1270 | 0 | if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE; |
1271 | 0 | else try_next = FALSE; |
1272 | 0 | break; |
1273 | | |
1274 | 0 | default: |
1275 | 0 | try_next = FALSE; |
1276 | 0 | break; |
1277 | 0 | } |
1278 | 0 | } |
1279 | 0 | break; /* End of bitmap class handling */ |
1280 | |
|
1281 | 0 | } /* End of switch */ |
1282 | 0 | } /* End of try_next loop */ |
1283 | | |
1284 | 0 | code += GET(code, 1); /* Advance to next branch */ |
1285 | 0 | } |
1286 | 0 | while (*code == OP_ALT); |
1287 | 0 | return yield; |
1288 | 0 | } |
1289 | | |
1290 | | |
1291 | | |
1292 | | |
1293 | | |
1294 | | /************************************************* |
1295 | | * Study a compiled expression * |
1296 | | *************************************************/ |
1297 | | |
1298 | | /* This function is handed a compiled expression that it must study to produce |
1299 | | information that will speed up the matching. It returns a pcre[16]_extra block |
1300 | | which then gets handed back to pcre_exec(). |
1301 | | |
1302 | | Arguments: |
1303 | | re points to the compiled expression |
1304 | | options contains option bits |
1305 | | errorptr points to where to place error messages; |
1306 | | set NULL unless error |
1307 | | |
1308 | | Returns: pointer to a pcre[16]_extra block, with study_data filled in and |
1309 | | the appropriate flags set; |
1310 | | NULL on error or if no optimization possible |
1311 | | */ |
1312 | | |
1313 | | #ifdef COMPILE_PCRE8 |
1314 | | PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION |
1315 | | pcre_study(const pcre *external_re, int options, const char **errorptr) |
1316 | | #else |
1317 | | PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION |
1318 | | pcre16_study(const pcre16 *external_re, int options, const char **errorptr) |
1319 | | #endif |
1320 | 0 | { |
1321 | 0 | int min; |
1322 | 0 | BOOL bits_set = FALSE; |
1323 | 0 | pcre_uint8 start_bits[32]; |
1324 | 0 | PUBL(extra) *extra = NULL; |
1325 | 0 | pcre_study_data *study; |
1326 | 0 | const pcre_uint8 *tables; |
1327 | 0 | pcre_uchar *code; |
1328 | 0 | compile_data compile_block; |
1329 | 0 | const REAL_PCRE *re = (const REAL_PCRE *)external_re; |
1330 | |
|
1331 | 0 | *errorptr = NULL; |
1332 | |
|
1333 | 0 | if (re == NULL || re->magic_number != MAGIC_NUMBER) |
1334 | 0 | { |
1335 | 0 | *errorptr = "argument is not a compiled regular expression"; |
1336 | 0 | return NULL; |
1337 | 0 | } |
1338 | | |
1339 | 0 | if ((re->flags & PCRE_MODE) == 0) |
1340 | 0 | { |
1341 | 0 | #ifdef COMPILE_PCRE8 |
1342 | 0 | *errorptr = "argument is compiled in 16 bit mode"; |
1343 | | #else |
1344 | | *errorptr = "argument is compiled in 8 bit mode"; |
1345 | | #endif |
1346 | 0 | return NULL; |
1347 | 0 | } |
1348 | | |
1349 | 0 | if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) |
1350 | 0 | { |
1351 | 0 | *errorptr = "unknown or incorrect option bit(s) set"; |
1352 | 0 | return NULL; |
1353 | 0 | } |
1354 | | |
1355 | 0 | code = (pcre_uchar *)re + re->name_table_offset + |
1356 | 0 | (re->name_count * re->name_entry_size); |
1357 | | |
1358 | | /* For an anchored pattern, or an unanchored pattern that has a first char, or |
1359 | | a multiline pattern that matches only at "line starts", there is no point in |
1360 | | seeking a list of starting bytes. */ |
1361 | |
|
1362 | 0 | if ((re->options & PCRE_ANCHORED) == 0 && |
1363 | 0 | (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0) |
1364 | 0 | { |
1365 | 0 | int rc; |
1366 | | |
1367 | | /* Set the character tables in the block that is passed around */ |
1368 | |
|
1369 | 0 | tables = re->tables; |
1370 | |
|
1371 | 0 | #ifdef COMPILE_PCRE8 |
1372 | 0 | if (tables == NULL) |
1373 | 0 | (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, |
1374 | 0 | (void *)(&tables)); |
1375 | | #else |
1376 | | if (tables == NULL) |
1377 | | (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, |
1378 | | (void *)(&tables)); |
1379 | | #endif |
1380 | |
|
1381 | 0 | compile_block.lcc = tables + lcc_offset; |
1382 | 0 | compile_block.fcc = tables + fcc_offset; |
1383 | 0 | compile_block.cbits = tables + cbits_offset; |
1384 | 0 | compile_block.ctypes = tables + ctypes_offset; |
1385 | | |
1386 | | /* See if we can find a fixed set of initial characters for the pattern. */ |
1387 | |
|
1388 | 0 | memset(start_bits, 0, 32 * sizeof(pcre_uint8)); |
1389 | 0 | rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0, |
1390 | 0 | &compile_block); |
1391 | 0 | bits_set = rc == SSB_DONE; |
1392 | 0 | if (rc == SSB_UNKNOWN) |
1393 | 0 | { |
1394 | 0 | *errorptr = "internal error: opcode not recognized"; |
1395 | 0 | return NULL; |
1396 | 0 | } |
1397 | 0 | } |
1398 | | |
1399 | | /* Find the minimum length of subject string. */ |
1400 | | |
1401 | 0 | switch(min = find_minlength(code, code, re->options, 0)) |
1402 | 0 | { |
1403 | 0 | case -2: *errorptr = "internal error: missing capturing bracket"; return NULL; |
1404 | 0 | case -3: *errorptr = "internal error: opcode not recognized"; return NULL; |
1405 | 0 | default: break; |
1406 | 0 | } |
1407 | | |
1408 | | /* If a set of starting bytes has been identified, or if the minimum length is |
1409 | | greater than zero, or if JIT optimization has been requested, get a |
1410 | | pcre[16]_extra block and a pcre_study_data block. The study data is put in the |
1411 | | latter, which is pointed to by the former, which may also get additional data |
1412 | | set later by the calling program. At the moment, the size of pcre_study_data |
1413 | | is fixed. We nevertheless save it in a field for returning via the |
1414 | | pcre_fullinfo() function so that if it becomes variable in the future, |
1415 | | we don't have to change that code. */ |
1416 | | |
1417 | 0 | if (bits_set || min > 0 |
1418 | | #ifdef SUPPORT_JIT |
1419 | | || (options & (PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1420 | | | PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE)) != 0 |
1421 | | #endif |
1422 | 0 | ) |
1423 | 0 | { |
1424 | 0 | extra = (PUBL(extra) *)(PUBL(malloc)) |
1425 | 0 | (sizeof(PUBL(extra)) + sizeof(pcre_study_data)); |
1426 | 0 | if (extra == NULL) |
1427 | 0 | { |
1428 | 0 | *errorptr = "failed to get memory"; |
1429 | 0 | return NULL; |
1430 | 0 | } |
1431 | | |
1432 | 0 | study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra))); |
1433 | 0 | extra->flags = PCRE_EXTRA_STUDY_DATA; |
1434 | 0 | extra->study_data = study; |
1435 | |
|
1436 | 0 | study->size = sizeof(pcre_study_data); |
1437 | 0 | study->flags = 0; |
1438 | | |
1439 | | /* Set the start bits always, to avoid unset memory errors if the |
1440 | | study data is written to a file, but set the flag only if any of the bits |
1441 | | are set, to save time looking when none are. */ |
1442 | |
|
1443 | 0 | if (bits_set) |
1444 | 0 | { |
1445 | 0 | study->flags |= PCRE_STUDY_MAPPED; |
1446 | 0 | memcpy(study->start_bits, start_bits, sizeof(start_bits)); |
1447 | 0 | } |
1448 | 0 | else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8)); |
1449 | |
|
1450 | | #ifdef PCRE_DEBUG |
1451 | | if (bits_set) |
1452 | | { |
1453 | | pcre_uint8 *ptr = start_bits; |
1454 | | int i; |
1455 | | |
1456 | | printf("Start bits:\n"); |
1457 | | for (i = 0; i < 32; i++) |
1458 | | printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n"); |
1459 | | } |
1460 | | #endif |
1461 | | |
1462 | | /* Always set the minlength value in the block, because the JIT compiler |
1463 | | makes use of it. However, don't set the bit unless the length is greater than |
1464 | | zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time |
1465 | | checking the zero case. */ |
1466 | |
|
1467 | 0 | if (min > 0) |
1468 | 0 | { |
1469 | 0 | study->flags |= PCRE_STUDY_MINLEN; |
1470 | 0 | study->minlength = min; |
1471 | 0 | } |
1472 | 0 | else study->minlength = 0; |
1473 | | |
1474 | | /* If JIT support was compiled and requested, attempt the JIT compilation. |
1475 | | If no starting bytes were found, and the minimum length is zero, and JIT |
1476 | | compilation fails, abandon the extra block and return NULL. */ |
1477 | |
|
1478 | | #ifdef SUPPORT_JIT |
1479 | | extra->executable_jit = NULL; |
1480 | | if ((options & PCRE_STUDY_JIT_COMPILE) != 0) |
1481 | | PRIV(jit_compile)(re, extra, JIT_COMPILE); |
1482 | | if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0) |
1483 | | PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE); |
1484 | | if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0) |
1485 | | PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE); |
1486 | | |
1487 | | if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0) |
1488 | | { |
1489 | | #ifdef COMPILE_PCRE8 |
1490 | | pcre_free_study(extra); |
1491 | | #endif |
1492 | | #ifdef COMPILE_PCRE16 |
1493 | | pcre16_free_study(extra); |
1494 | | #endif |
1495 | | extra = NULL; |
1496 | | } |
1497 | | #endif |
1498 | 0 | } |
1499 | | |
1500 | 0 | return extra; |
1501 | 0 | } |
1502 | | |
1503 | | |
1504 | | /************************************************* |
1505 | | * Free the study data * |
1506 | | *************************************************/ |
1507 | | |
1508 | | /* This function frees the memory that was obtained by pcre_study(). |
1509 | | |
1510 | | Argument: a pointer to the pcre[16]_extra block |
1511 | | Returns: nothing |
1512 | | */ |
1513 | | |
1514 | | #ifdef COMPILE_PCRE8 |
1515 | | PCRE_EXP_DEFN void |
1516 | | pcre_free_study(pcre_extra *extra) |
1517 | | #else |
1518 | | PCRE_EXP_DEFN void |
1519 | | pcre16_free_study(pcre16_extra *extra) |
1520 | | #endif |
1521 | 0 | { |
1522 | 0 | if (extra == NULL) |
1523 | 0 | return; |
1524 | | #ifdef SUPPORT_JIT |
1525 | | if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 && |
1526 | | extra->executable_jit != NULL) |
1527 | | PRIV(jit_free)(extra->executable_jit); |
1528 | | #endif |
1529 | 0 | PUBL(free)(extra); |
1530 | 0 | } |
1531 | | |
1532 | | /* End of pcre_study.c */ |