Coverage Report

Created: 2026-01-09 06:48

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gnupg/regexp/jimregexp.c
Line
Count
Source
1
/*
2
 * vi:se ts=8:
3
 *
4
 * regcomp and regexec -- regsub and regerror are elsewhere
5
 *
6
 *  Copyright (c) 1986 by University of Toronto.
7
 *  Written by Henry Spencer.  Not derived from licensed software.
8
 *
9
 *  Permission is granted to anyone to use this software for any
10
 *  purpose on any computer system, and to redistribute it freely,
11
 *  subject to the following restrictions:
12
 *
13
 *  1. The author is not responsible for the consequences of use of
14
 *    this software, no matter how awful, even if they arise
15
 *    from defects in it.
16
 *
17
 *  2. The origin of this software must not be misrepresented, either
18
 *    by explicit claim or by omission.
19
 *
20
 *  3. Altered versions must be plainly marked as such, and must not
21
 *    be misrepresented as being the original software.
22
 *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
23
 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
24
 *** to assist in implementing egrep.
25
 *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
26
 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
27
 *** as in BSD grep and ex.
28
 *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
29
 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
30
 *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
31
 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
32
 *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
33
 *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
34
 *** was moved into regexp.h, and the include of regexp.h now uses "'s
35
 *** to avoid conflicting with the system regexp.h.  Const, bless its
36
 *** soul, was removed so it can compile everywhere.  The declaration
37
 *** of strchr() was in conflict on AIX, so it was removed (as it is
38
 *** happily defined in string.h).
39
 *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
40
 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
41
 *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
42
 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
43
 *
44
 *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
45
 *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
46
 *   This includes counted repetitions, UTF-8 support, character classes,
47
 *   shorthand character classes, increased number of parentheses to 100,
48
 *   backslash escape sequences. It also removes \n as an alternative to |.
49
 *
50
 *** THIS IS AN ALTERED VERSION.  It was altered to offer POSIX-like
51
 *** regular expression routines of regcomp/regexec/regerror/regfree,
52
 *** with UTF-8 support, by NIIBE Yutaka <gniibe@fsij.org> on
53
 *** 2020-02-14.
54
 *
55
 * Beware that some of this code is subtly aware of the way operator
56
 * precedence is structured in regular expressions.  Serious changes in
57
 * regular-expression syntax might require a total rethink.
58
 */
59
60
#if defined(JIM_REGEXP)
61
#include <stdio.h>
62
#include <ctype.h>
63
#include <stdlib.h>
64
#include <string.h>
65
66
#include "jimregexp.h"
67
#include "utf8.h"
68
69
#define UCHAR(c) ((unsigned char)(c))
70
71
/* An arbitrary limit, but this seems enough. Must be less than 1000. */
72
0
#define REG_MAX_PAREN 100
73
74
/*
75
 * Structure for regexp "program".  This is essentially a linear encoding
76
 * of a nondeterministic finite-state machine (aka syntax charts or
77
 * "railroad normal form" in parsing technology).  Each node is an opcode
78
 * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
79
 * all nodes except BRANCH implement concatenation; a "next" pointer with
80
 * a BRANCH on both ends of it is connecting two alternatives.  (Here we
81
 * have one of the subtle syntax dependencies:  an individual BRANCH (as
82
 * opposed to a collection of them) is never concatenated with anything
83
 * because of operator precedence.)  The operand of some types of node is
84
 * a literal string; for others, it is a node leading into a sub-FSM.  In
85
 * particular, the operand of a BRANCH node is the first node of the branch.
86
 * (NB this is *not* a tree structure:  the tail of the branch connects
87
 * to the thing following the set of BRANCHes.)  The opcodes are:
88
 */
89
90
/* definition   number  opnd? meaning */
91
0
#define END 0  /* no End of program. */
92
0
#define BOL 1  /* no Match "" at beginning of line. */
93
0
#define EOL 2  /* no Match "" at end of line. */
94
0
#define ANY 3  /* no Match any one character. */
95
0
#define ANYOF 4  /* str  Match any character in this string. */
96
0
#define ANYBUT  5  /* str  Match any character not in this string. */
97
0
#define BRANCH  6  /* node Match this alternative, or the next... */
98
0
#define BACK  7  /* no Match "", "next" ptr points backward. */
99
0
#define EXACTLY 8  /* str  Match this string. */
100
0
#define NOTHING 9  /* no Match empty string. */
101
0
#define REP 10  /* max,min  Match this (simple) thing [min,max] times. */
102
0
#define REPMIN  11  /* max,min  Match this (simple) thing [min,max] times, minimal match. */
103
0
#define REPX  12  /* max,min  Match this (complex) thing [min,max] times. */
104
0
#define REPXMIN 13  /* max,min  Match this (complex) thing [min,max] times, minimal match. */
105
0
#define BOLX  14  /* no Match "" at beginning of input. */
106
0
#define EOLX  15  /* no Match "" at end of input. */
107
0
#define WORDA 16  /* no Match "" at wordchar, where prev is nonword */
108
0
#define WORDZ 17  /* no Match "" at nonwordchar, where prev is word */
109
110
0
#define OPENNC  1000  /* no Non-capturing parentheses - must be OPEN-1 */
111
0
#define OPEN    1001  /* no Mark this point in input as start of #n. */
112
      /*  OPEN+1 is number 1, etc. */
113
114
/* must not be any other opts between OPEN and CLOSE */
115
116
0
#define CLOSENC 2000  /* no Non-capturing parentheses - must be CLOSE-1 */
117
0
#define CLOSE 2001  /* no Analogous to OPEN. */
118
0
#define CLOSE_END (CLOSE+REG_MAX_PAREN)
119
120
/*
121
 * The first word of the regexp internal "program" is actually this magic
122
 * number; the start node begins in the second word.
123
 */
124
0
#define REG_MAGIC 0xFADED00D
125
126
/*
127
 * Opcode notes:
128
 *
129
 * BRANCH The set of branches constituting a single choice are hooked
130
 *    together with their "next" pointers, since precedence prevents
131
 *    anything being concatenated to any individual branch.  The
132
 *    "next" pointer of the last BRANCH in a choice points to the
133
 *    thing following the whole choice.  This is also where the
134
 *    final "next" pointer of each individual branch points; each
135
 *    branch starts with the operand node of a BRANCH node.
136
 *
137
 * BACK   Normal "next" pointers all implicitly point forward; BACK
138
 *    exists to make loop structures possible.
139
 *
140
 * REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented
141
 *              as either simple repeats (REP) or complex repeats (REPX).
142
 *              These opcodes include a "min" and "max" count after the opcode.
143
 *    This is followed by a fourth "current count" word that is
144
 *    only used by REPX, as it implements a recursive match.
145
 *    REPMIN and REPXMIN are identical except they implement minimal repeats.
146
 *
147
 * OPEN,CLOSE ...are numbered at compile time.
148
 */
149
150
/*
151
 * A node is one word of opcode followed by one word of "next" pointer.
152
 * The "next" pointer value is a positive offset from the opcode of the node
153
 * containing it.
154
 * An operand, if any, simply follows the node.  (Note that much of the
155
 * code generation knows about this implicit relationship.)
156
 */
157
0
#define OP(preg, p) (preg->program[p])
158
0
#define NEXT(preg, p) (preg->program[p + 1])
159
0
#define OPERAND(p)  ((p) + 2)
160
161
/*
162
 * See regmagic.h for one further detail of program structure.
163
 */
164
165
166
/*
167
 * Utility definitions.
168
 */
169
170
0
#define FAIL(R,M) { (R)->err = (M); return (M); }
171
0
#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
172
0
#define META    "^$.[()|?{+*"
173
174
/*
175
 * Flags to be passed up and down.
176
 */
177
0
#define HASWIDTH  1  /* Known never to match null string. */
178
0
#define SIMPLE    2  /* Simple enough to be STAR/PLUS operand. */
179
0
#define SPSTART   4  /* Starts with * or +. */
180
0
#define WORST   0  /* Worst case. */
181
182
0
#define MAX_REP_COUNT 1000000
183
184
/*
185
 * Forward declarations for regcomp()'s friends.
186
 */
187
static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
188
static int regpiece(regex_t *preg, int *flagp );
189
static int regbranch(regex_t *preg, int *flagp );
190
static int regatom(regex_t *preg, int *flagp );
191
static int regnode(regex_t *preg, int op );
192
static int regnext(regex_t *preg, int p );
193
static void regc(regex_t *preg, int b );
194
static int reginsert(regex_t *preg, int op, int size, int opnd );
195
static void regtail(regex_t *preg, int p, int val);
196
static void regoptail(regex_t *preg, int p, int val );
197
static int regopsize(regex_t *preg, int p );
198
199
static int reg_range_find(const int *string, int c);
200
static const char *str_find(const char *string, int c, int nocase);
201
static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
202
203
/*#define DEBUG*/
204
#ifdef DEBUG
205
static int regnarrate = 0;
206
static void regdump(regex_t *preg);
207
static const char *regprop( int op );
208
#endif
209
210
211
/**
212
 * Returns the length of the null-terminated integer sequence.
213
 */
214
static int str_int_len(const int *seq)
215
0
{
216
0
  int n = 0;
217
0
  while (*seq++) {
218
0
    n++;
219
0
  }
220
0
  return n;
221
0
}
222
223
/*
224
 - regcomp - compile a regular expression into internal code
225
 *
226
 * We can't allocate space until we know how big the compiled form will be,
227
 * but we can't compile it (and thus know how big it is) until we've got a
228
 * place to put the code.  So we cheat:  we compile it twice, once with code
229
 * generation turned off and size counting turned on, and once "for real".
230
 * This also means that we don't allocate space until we are sure that the
231
 * thing really will compile successfully, and we never have to move the
232
 * code and thus invalidate pointers into it.  (Note that it has to be in
233
 * one piece because free() must be able to free it all.)
234
 *
235
 * Beware that the optimization-preparation code in here knows about some
236
 * of the structure of the compiled regexp.
237
 */
238
int regcomp(regex_t *preg, const char *exp, int cflags)
239
0
{
240
0
  int scan;
241
0
  int longest;
242
0
  unsigned len;
243
0
  int flags;
244
245
#ifdef DEBUG
246
  fprintf(stderr, "Compiling: '%s'\n", exp);
247
#endif
248
0
  memset(preg, 0, sizeof(*preg));
249
250
0
  if (exp == NULL)
251
0
    FAIL(preg, REG_ERR_NULL_ARGUMENT);
252
253
  /* First pass: determine size, legality. */
254
0
  preg->cflags = cflags;
255
0
  preg->regparse = exp;
256
257
  /* Allocate space. */
258
0
  preg->proglen = (strlen(exp) + 1) * 5;
259
0
  preg->program = malloc(preg->proglen * sizeof(int));
260
0
  if (preg->program == NULL)
261
0
    FAIL(preg, REG_ERR_NOMEM);
262
263
  /* Note that since we store a magic value as the first item in the program,
264
   * program offsets will never be 0
265
   */
266
0
  regc(preg, REG_MAGIC);
267
0
  if (reg(preg, 0, &flags) == 0) {
268
0
    return preg->err;
269
0
  }
270
271
  /* Small enough for pointer-storage convention? */
272
0
  if (preg->re_nsub >= REG_MAX_PAREN)   /* Probably could be 65535L. */
273
0
    FAIL(preg,REG_ERR_TOO_BIG);
274
275
  /* Dig out information for optimizations. */
276
0
  preg->regstart = 0; /* Worst-case defaults. */
277
0
  preg->reganch = 0;
278
0
  preg->regmust = 0;
279
0
  preg->regmlen = 0;
280
0
  scan = 1;     /* First BRANCH. */
281
0
  if (OP(preg, regnext(preg, scan)) == END) {   /* Only one top-level choice. */
282
0
    scan = OPERAND(scan);
283
284
    /* Starting-point info. */
285
0
    if (OP(preg, scan) == EXACTLY) {
286
0
      preg->regstart = preg->program[OPERAND(scan)];
287
0
    }
288
0
    else if (OP(preg, scan) == BOL)
289
0
      preg->reganch++;
290
291
    /*
292
     * If there's something expensive in the r.e., find the
293
     * longest literal string that must appear and make it the
294
     * regmust.  Resolve ties in favor of later strings, since
295
     * the regstart check works with the beginning of the r.e.
296
     * and avoiding duplication strengthens checking.  Not a
297
     * strong reason, but sufficient in the absence of others.
298
     */
299
0
    if (flags&SPSTART) {
300
0
      longest = 0;
301
0
      len = 0;
302
0
      for (; scan != 0; scan = regnext(preg, scan)) {
303
0
        if (OP(preg, scan) == EXACTLY) {
304
0
          int plen = str_int_len(preg->program + OPERAND(scan));
305
0
          if (plen >= len) {
306
0
            longest = OPERAND(scan);
307
0
            len = plen;
308
0
          }
309
0
        }
310
0
      }
311
0
      preg->regmust = longest;
312
0
      preg->regmlen = len;
313
0
    }
314
0
  }
315
316
#ifdef DEBUG
317
  regdump(preg);
318
#endif
319
320
0
  return 0;
321
0
}
322
323
/*
324
 - reg - regular expression, i.e. main body or parenthesized thing
325
 *
326
 * Caller must absorb opening parenthesis.
327
 *
328
 * Combining parenthesis handling with the base level of regular expression
329
 * is a trifle forced, but the need to tie the tails of the branches to what
330
 * follows makes it hard to avoid.
331
 */
332
static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
333
0
{
334
0
  int ret;
335
0
  int br;
336
0
  int ender;
337
0
  int parno = 0;
338
0
  int flags;
339
340
0
  *flagp = HASWIDTH; /* Tentatively. */
341
342
  /* Make an OPEN node, if parenthesized. */
343
0
  if (paren) {
344
0
    if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
345
      /* non-capturing paren */
346
0
      preg->regparse += 2;
347
0
      parno = -1;
348
0
    }
349
0
    else {
350
0
      parno = ++preg->re_nsub;
351
0
    }
352
0
    ret = regnode(preg, OPEN+parno);
353
0
  } else
354
0
    ret = 0;
355
356
  /* Pick up the branches, linking them together. */
357
0
  br = regbranch(preg, &flags);
358
0
  if (br == 0)
359
0
    return 0;
360
0
  if (ret != 0)
361
0
    regtail(preg, ret, br); /* OPEN -> first. */
362
0
  else
363
0
    ret = br;
364
0
  if (!(flags&HASWIDTH))
365
0
    *flagp &= ~HASWIDTH;
366
0
  *flagp |= flags&SPSTART;
367
0
  while (*preg->regparse == '|') {
368
0
    preg->regparse++;
369
0
    br = regbranch(preg, &flags);
370
0
    if (br == 0)
371
0
      return 0;
372
0
    regtail(preg, ret, br); /* BRANCH -> BRANCH. */
373
0
    if (!(flags&HASWIDTH))
374
0
      *flagp &= ~HASWIDTH;
375
0
    *flagp |= flags&SPSTART;
376
0
  }
377
378
  /* Make a closing node, and hook it on the end. */
379
0
  ender = regnode(preg, (paren) ? CLOSE+parno : END);
380
0
  regtail(preg, ret, ender);
381
382
  /* Hook the tails of the branches to the closing node. */
383
0
  for (br = ret; br != 0; br = regnext(preg, br))
384
0
    regoptail(preg, br, ender);
385
386
  /* Check for proper termination. */
387
0
  if (paren && *preg->regparse++ != ')') {
388
0
    preg->err = REG_ERR_UNMATCHED_PAREN;
389
0
    return 0;
390
0
  } else if (!paren && *preg->regparse != '\0') {
391
0
    if (*preg->regparse == ')') {
392
0
      preg->err = REG_ERR_UNMATCHED_PAREN;
393
0
      return 0;
394
0
    } else {
395
0
      preg->err = REG_ERR_JUNK_ON_END;
396
0
      return 0;
397
0
    }
398
0
  }
399
400
0
  return(ret);
401
0
}
402
403
/*
404
 - regbranch - one alternative of an | operator
405
 *
406
 * Implements the concatenation operator.
407
 */
408
static int regbranch(regex_t *preg, int *flagp )
409
0
{
410
0
  int ret;
411
0
  int chain;
412
0
  int latest;
413
0
  int flags;
414
415
0
  *flagp = WORST;   /* Tentatively. */
416
417
0
  ret = regnode(preg, BRANCH);
418
0
  chain = 0;
419
0
  while (*preg->regparse != '\0' && *preg->regparse != ')' &&
420
0
         *preg->regparse != '|') {
421
0
    latest = regpiece(preg, &flags);
422
0
    if (latest == 0)
423
0
      return 0;
424
0
    *flagp |= flags&HASWIDTH;
425
0
    if (chain == 0) {/* First piece. */
426
0
      *flagp |= flags&SPSTART;
427
0
    }
428
0
    else {
429
0
      regtail(preg, chain, latest);
430
0
    }
431
0
    chain = latest;
432
0
  }
433
0
  if (chain == 0) /* Loop ran zero times. */
434
0
    (void) regnode(preg, NOTHING);
435
436
0
  return(ret);
437
0
}
438
439
/*
440
 - regpiece - something followed by possible [*+?]
441
 *
442
 * Note that the branching code sequences used for ? and the general cases
443
 * of * and + are somewhat optimized:  they use the same NOTHING node as
444
 * both the endmarker for their branch list and the body of the last branch.
445
 * It might seem that this node could be dispensed with entirely, but the
446
 * endmarker role is not redundant.
447
 */
448
static int regpiece(regex_t *preg, int *flagp)
449
0
{
450
0
  int ret;
451
0
  char op;
452
0
  int next;
453
0
  int flags;
454
0
  int min;
455
0
  int max;
456
457
0
  ret = regatom(preg, &flags);
458
0
  if (ret == 0)
459
0
    return 0;
460
461
0
  op = *preg->regparse;
462
0
  if (!ISMULT(op)) {
463
0
    *flagp = flags;
464
0
    return(ret);
465
0
  }
466
467
0
  if (!(flags&HASWIDTH) && op != '?') {
468
0
    preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
469
0
    return 0;
470
0
  }
471
472
  /* Handle braces (counted repetition) by expansion */
473
0
  if (op == '{') {
474
0
    char *end;
475
476
0
    min = strtoul(preg->regparse + 1, &end, 10);
477
0
    if (end == preg->regparse + 1) {
478
0
      preg->err = REG_ERR_BAD_COUNT;
479
0
      return 0;
480
0
    }
481
0
    if (*end == '}') {
482
0
      max = min;
483
0
    }
484
0
    else if (*end == '\0') {
485
0
      preg->err = REG_ERR_UNMATCHED_BRACES;
486
0
      return 0;
487
0
    }
488
0
    else {
489
0
      preg->regparse = end;
490
0
      max = strtoul(preg->regparse + 1, &end, 10);
491
0
      if (*end != '}') {
492
0
        preg->err = REG_ERR_UNMATCHED_BRACES;
493
0
        return 0;
494
0
      }
495
0
    }
496
0
    if (end == preg->regparse + 1) {
497
0
      max = MAX_REP_COUNT;
498
0
    }
499
0
    else if (max < min || max >= 100) {
500
0
      preg->err = REG_ERR_BAD_COUNT;
501
0
      return 0;
502
0
    }
503
0
    if (min >= 100) {
504
0
      preg->err = REG_ERR_BAD_COUNT;
505
0
      return 0;
506
0
    }
507
508
0
    preg->regparse = strchr(preg->regparse, '}');
509
0
  }
510
0
  else {
511
0
    min = (op == '+');
512
0
    max = (op == '?' ? 1 : MAX_REP_COUNT);
513
0
  }
514
515
0
  if (preg->regparse[1] == '?') {
516
0
    preg->regparse++;
517
0
    next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
518
0
  }
519
0
  else {
520
0
    next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
521
0
  }
522
0
  preg->program[ret + 2] = max;
523
0
  preg->program[ret + 3] = min;
524
0
  preg->program[ret + 4] = 0;
525
526
0
  *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
527
528
0
  if (!(flags & SIMPLE)) {
529
0
    int back = regnode(preg, BACK);
530
0
    regtail(preg, back, ret);
531
0
    regtail(preg, next, back);
532
0
  }
533
534
0
  preg->regparse++;
535
0
  if (ISMULT(*preg->regparse)) {
536
0
    preg->err = REG_ERR_NESTED_COUNT;
537
0
    return 0;
538
0
  }
539
540
0
  return ret;
541
0
}
542
543
/**
544
 * Add all characters in the inclusive range between lower and upper.
545
 *
546
 * Handles a swapped range (upper < lower).
547
 */
548
static void reg_addrange(regex_t *preg, int lower, int upper)
549
0
{
550
0
  if (lower > upper) {
551
0
    reg_addrange(preg, upper, lower);
552
0
  }
553
  /* Add a range as length, start */
554
0
  regc(preg, upper - lower + 1);
555
0
  regc(preg, lower);
556
0
}
557
558
/**
559
 * Add a null-terminated literal string as a set of ranges.
560
 */
561
static void reg_addrange_str(regex_t *preg, const char *str)
562
0
{
563
0
  while (*str) {
564
0
    reg_addrange(preg, *str, *str);
565
0
    str++;
566
0
  }
567
0
}
568
569
/**
570
 * Extracts the next unicode char from utf8.
571
 *
572
 * If 'upper' is set, converts the char to uppercase.
573
 */
574
static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
575
0
{
576
0
  int l = utf8_tounicode(s, uc);
577
0
  if (upper) {
578
0
    *uc = utf8_upper(*uc);
579
0
  }
580
0
  return l;
581
0
}
582
583
/**
584
 * Converts a hex digit to decimal.
585
 *
586
 * Returns -1 for an invalid hex digit.
587
 */
588
static int hexdigitval(int c)
589
0
{
590
0
  if (c >= '0' && c <= '9')
591
0
    return c - '0';
592
0
  if (c >= 'a' && c <= 'f')
593
0
    return c - 'a' + 10;
594
0
  if (c >= 'A' && c <= 'F')
595
0
    return c - 'A' + 10;
596
0
  return -1;
597
0
}
598
599
/**
600
 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
601
 *
602
 * Returns the number of hex digits parsed.
603
 * If there are no hex digits, returns 0 and stores nothing.
604
 */
605
static int parse_hex(const char *s, int n, int *uc)
606
0
{
607
0
  int val = 0;
608
0
  int k;
609
610
0
  for (k = 0; k < n; k++) {
611
0
    int c = hexdigitval(*s++);
612
0
    if (c == -1) {
613
0
      break;
614
0
    }
615
0
    val = (val << 4) | c;
616
0
  }
617
0
  if (k) {
618
0
    *uc = val;
619
0
  }
620
0
  return k;
621
0
}
622
623
/**
624
 * Call for chars after a backlash to decode the escape sequence.
625
 *
626
 * Stores the result in *ch.
627
 *
628
 * Returns the number of bytes consumed.
629
 */
630
static int reg_decode_escape(const char *s, int *ch)
631
0
{
632
0
  int n;
633
0
  const char *s0 = s;
634
635
0
  *ch = *s++;
636
637
0
  switch (*ch) {
638
0
    case 'b': *ch = '\b'; break;
639
0
    case 'e': *ch = 27; break;
640
0
    case 'f': *ch = '\f'; break;
641
0
    case 'n': *ch = '\n'; break;
642
0
    case 'r': *ch = '\r'; break;
643
0
    case 't': *ch = '\t'; break;
644
0
    case 'v': *ch = '\v'; break;
645
0
    case 'u':
646
0
      if (*s == '{') {
647
        /* Expect \u{NNNN} */
648
0
        n = parse_hex(s + 1, 6, ch);
649
0
        if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
650
0
          s += n + 2;
651
0
        }
652
0
        else {
653
          /* Invalid, so just treat as an escaped 'u' */
654
0
          *ch = 'u';
655
0
        }
656
0
      }
657
0
      else if ((n = parse_hex(s, 4, ch)) > 0) {
658
0
        s += n;
659
0
      }
660
0
      break;
661
0
    case 'U':
662
0
      if ((n = parse_hex(s, 8, ch)) > 0) {
663
0
        s += n;
664
0
      }
665
0
      break;
666
0
    case 'x':
667
0
      if ((n = parse_hex(s, 2, ch)) > 0) {
668
0
        s += n;
669
0
      }
670
0
      break;
671
0
    case '\0':
672
0
      s--;
673
0
      *ch = '\\';
674
0
      break;
675
0
  }
676
0
  return s - s0;
677
0
}
678
679
/*
680
 - regatom - the lowest level
681
 *
682
 * Optimization:  gobbles an entire sequence of ordinary characters so that
683
 * it can turn them into a single node, which is smaller to store and
684
 * faster to run.  Backslashed characters are exceptions, each becoming a
685
 * separate node; the code is simpler that way and it's not worth fixing.
686
 */
687
static int regatom(regex_t *preg, int *flagp)
688
0
{
689
0
  int ret;
690
0
  int flags;
691
0
  int nocase = (preg->cflags & REG_ICASE);
692
693
0
  int ch;
694
0
  int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
695
696
0
  *flagp = WORST;   /* Tentatively. */
697
698
0
  preg->regparse += n;
699
0
  switch (ch) {
700
  /* FIXME: these chars only have meaning at beg/end of pat? */
701
0
  case '^':
702
0
    ret = regnode(preg, BOL);
703
0
    break;
704
0
  case '$':
705
0
    ret = regnode(preg, EOL);
706
0
    break;
707
0
  case '.':
708
0
    ret = regnode(preg, ANY);
709
0
    *flagp |= HASWIDTH|SIMPLE;
710
0
    break;
711
0
  case '[': {
712
0
      const char *pattern = preg->regparse;
713
714
0
      if (*pattern == '^') { /* Complement of range. */
715
0
        ret = regnode(preg, ANYBUT);
716
0
        pattern++;
717
0
      } else
718
0
        ret = regnode(preg, ANYOF);
719
720
      /* Special case. If the first char is ']' or '-', it is part of the set */
721
0
      if (*pattern == ']' || *pattern == '-') {
722
0
        reg_addrange(preg, *pattern, *pattern);
723
0
        pattern++;
724
0
      }
725
726
0
      while (*pattern != ']') {
727
        /* Is this a range? a-z */
728
0
        int start;
729
0
        int end;
730
731
0
        enum {
732
0
          CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER,
733
0
          CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT,
734
0
          CC_NUM
735
0
        };
736
0
        int cc;
737
738
0
        if (!*pattern) {
739
0
          preg->err = REG_ERR_UNMATCHED_BRACKET;
740
0
          return 0;
741
0
        }
742
743
0
        pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
744
0
        if (start == '\\') {
745
          /* First check for class shorthand escapes */
746
0
          switch (*pattern) {
747
0
            case 's':
748
0
              pattern++;
749
0
              cc = CC_SPACE;
750
0
              goto cc_switch;
751
0
            case 'd':
752
0
              pattern++;
753
0
              cc = CC_DIGIT;
754
0
              goto cc_switch;
755
0
            case 'w':
756
0
              pattern++;
757
0
              reg_addrange(preg, '_', '_');
758
0
              cc = CC_ALNUM;
759
0
              goto cc_switch;
760
0
          }
761
0
          pattern += reg_decode_escape(pattern, &start);
762
0
          if (start == 0) {
763
0
            preg->err = REG_ERR_NULL_CHAR;
764
0
            return 0;
765
0
          }
766
0
          if (start == '\\' && *pattern == 0) {
767
0
            preg->err = REG_ERR_INVALID_ESCAPE;
768
0
            return 0;
769
0
          }
770
0
        }
771
0
        if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
772
          /* skip '-' */
773
0
          pattern += utf8_tounicode(pattern, &end);
774
0
          pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
775
0
          if (end == '\\') {
776
0
            pattern += reg_decode_escape(pattern, &end);
777
0
            if (end == 0) {
778
0
              preg->err = REG_ERR_NULL_CHAR;
779
0
              return 0;
780
0
            }
781
0
            if (end == '\\' && *pattern == 0) {
782
0
              preg->err = REG_ERR_INVALID_ESCAPE;
783
0
              return 0;
784
0
            }
785
0
          }
786
787
0
          reg_addrange(preg, start, end);
788
0
          continue;
789
0
        }
790
0
        if (start == '[' && pattern[0] == ':') {
791
0
          static const char *character_class[] = {
792
0
            ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
793
0
            ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
794
0
          };
795
796
0
          for (cc = 0; cc < CC_NUM; cc++) {
797
0
            n = strlen(character_class[cc]);
798
0
            if (strncmp(pattern, character_class[cc], n) == 0) {
799
0
              if (pattern[n] != ']') {
800
0
                preg->err = REG_ERR_UNMATCHED_BRACKET;
801
0
                return 0;
802
0
              }
803
              /* Found a character class */
804
0
              pattern += n + 1;
805
0
              break;
806
0
            }
807
0
          }
808
0
          if (cc != CC_NUM) {
809
0
cc_switch:
810
0
            switch (cc) {
811
0
              case CC_ALNUM:
812
0
                reg_addrange(preg, '0', '9');
813
                /* Fall through */
814
0
              case CC_ALPHA:
815
0
                if ((preg->cflags & REG_ICASE) == 0) {
816
0
                  reg_addrange(preg, 'a', 'z');
817
0
                }
818
0
                reg_addrange(preg, 'A', 'Z');
819
0
                break;
820
0
              case CC_SPACE:
821
0
                reg_addrange_str(preg, " \t\r\n\f\v");
822
0
                break;
823
0
              case CC_BLANK:
824
0
                reg_addrange_str(preg, " \t");
825
0
                break;
826
0
              case CC_UPPER:
827
0
                reg_addrange(preg, 'A', 'Z');
828
0
                break;
829
0
              case CC_LOWER:
830
0
                reg_addrange(preg, 'a', 'z');
831
0
                break;
832
0
              case CC_XDIGIT:
833
0
                reg_addrange(preg, 'a', 'f');
834
0
                reg_addrange(preg, 'A', 'F');
835
                /* Fall through */
836
0
              case CC_DIGIT:
837
0
                reg_addrange(preg, '0', '9');
838
0
                break;
839
0
              case CC_CNTRL:
840
0
                reg_addrange(preg, 0, 31);
841
0
                reg_addrange(preg, 127, 127);
842
0
                break;
843
0
              case CC_PRINT:
844
0
                reg_addrange(preg, ' ', '~');
845
0
                break;
846
0
              case CC_GRAPH:
847
0
                reg_addrange(preg, '!', '~');
848
0
                break;
849
0
              case CC_PUNCT:
850
0
                reg_addrange(preg, '!', '/');
851
0
                reg_addrange(preg, ':', '@');
852
0
                reg_addrange(preg, '[', '`');
853
0
                reg_addrange(preg, '{', '~');
854
0
                break;
855
0
            }
856
0
            continue;
857
0
          }
858
0
        }
859
        /* Not a range, so just add the char */
860
0
        reg_addrange(preg, start, start);
861
0
      }
862
0
      regc(preg, '\0');
863
864
0
      if (*pattern) {
865
0
        pattern++;
866
0
      }
867
0
      preg->regparse = pattern;
868
869
0
      *flagp |= HASWIDTH|SIMPLE;
870
0
    }
871
0
    break;
872
0
  case '(':
873
0
    ret = reg(preg, 1, &flags);
874
0
    if (ret == 0)
875
0
      return 0;
876
0
    *flagp |= flags&(HASWIDTH|SPSTART);
877
0
    break;
878
0
  case '\0':
879
0
  case '|':
880
0
  case ')':
881
0
    preg->err = REG_ERR_INTERNAL;
882
0
    return 0; /* Supposed to be caught earlier. */
883
0
  case '?':
884
0
  case '+':
885
0
  case '*':
886
0
  case '{':
887
0
    preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
888
0
    return 0;
889
0
  case '\\':
890
0
    ch = *preg->regparse++;
891
0
    switch (ch) {
892
0
    case '\0':
893
0
      preg->err = REG_ERR_INVALID_ESCAPE;
894
0
      return 0;
895
0
    case 'A':
896
0
      ret = regnode(preg, BOLX);
897
0
      break;
898
0
    case 'Z':
899
0
      ret = regnode(preg, EOLX);
900
0
      break;
901
0
    case '<':
902
0
    case 'm':
903
0
      ret = regnode(preg, WORDA);
904
0
      break;
905
0
    case '>':
906
0
    case 'M':
907
0
      ret = regnode(preg, WORDZ);
908
0
      break;
909
0
    case 'd':
910
0
    case 'D':
911
0
      ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT);
912
0
      reg_addrange(preg, '0', '9');
913
0
      regc(preg, '\0');
914
0
      *flagp |= HASWIDTH|SIMPLE;
915
0
      break;
916
0
    case 'w':
917
0
    case 'W':
918
0
      ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT);
919
0
      if ((preg->cflags & REG_ICASE) == 0) {
920
0
        reg_addrange(preg, 'a', 'z');
921
0
      }
922
0
      reg_addrange(preg, 'A', 'Z');
923
0
      reg_addrange(preg, '0', '9');
924
0
      reg_addrange(preg, '_', '_');
925
0
      regc(preg, '\0');
926
0
      *flagp |= HASWIDTH|SIMPLE;
927
0
      break;
928
0
    case 's':
929
0
    case 'S':
930
0
      ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT);
931
0
      reg_addrange_str(preg," \t\r\n\f\v");
932
0
      regc(preg, '\0');
933
0
      *flagp |= HASWIDTH|SIMPLE;
934
0
      break;
935
    /* FIXME: Someday handle \1, \2, ... */
936
0
    default:
937
      /* Handle general quoted chars in exact-match routine */
938
      /* Back up to include the backslash */
939
0
      preg->regparse--;
940
0
      goto de_fault;
941
0
    }
942
0
    break;
943
0
  de_fault:
944
0
  default: {
945
      /*
946
       * Encode a string of characters to be matched exactly.
947
       */
948
0
      int added = 0;
949
950
      /* Back up to pick up the first char of interest */
951
0
      preg->regparse -= n;
952
953
0
      ret = regnode(preg, EXACTLY);
954
955
      /* Note that a META operator such as ? or * consumes the
956
       * preceding char.
957
       * Thus we must be careful to look ahead by 2 and add the
958
       * last char as it's own EXACTLY if necessary
959
       */
960
961
      /* Until end of string or a META char is reached */
962
0
      while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
963
0
        n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
964
0
        if (ch == '\\' && preg->regparse[n]) {
965
          /* Non-trailing backslash.
966
           * Is this a special escape, or a regular escape?
967
           */
968
0
          if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) {
969
            /* A special escape. All done with EXACTLY */
970
0
            break;
971
0
          }
972
          /* Decode it. Note that we add the length for the escape
973
           * sequence to the length for the backlash so we can skip
974
           * the entire sequence, or not as required.
975
           */
976
0
          n += reg_decode_escape(preg->regparse + n, &ch);
977
0
          if (ch == 0) {
978
0
            preg->err = REG_ERR_NULL_CHAR;
979
0
            return 0;
980
0
          }
981
0
        }
982
983
        /* Now we have one char 'ch' of length 'n'.
984
         * Check to see if the following char is a MULT
985
         */
986
987
0
        if (ISMULT(preg->regparse[n])) {
988
          /* Yes. But do we already have some EXACTLY chars? */
989
0
          if (added) {
990
            /* Yes, so return what we have and pick up the current char next time around */
991
0
            break;
992
0
          }
993
          /* No, so add this single char and finish */
994
0
          regc(preg, ch);
995
0
          added++;
996
0
          preg->regparse += n;
997
0
          break;
998
0
        }
999
1000
        /* No, so just add this char normally */
1001
0
        regc(preg, ch);
1002
0
        added++;
1003
0
        preg->regparse += n;
1004
0
      }
1005
0
      regc(preg, '\0');
1006
1007
0
      *flagp |= HASWIDTH;
1008
0
      if (added == 1)
1009
0
        *flagp |= SIMPLE;
1010
0
      break;
1011
0
    }
1012
0
    break;
1013
0
  }
1014
1015
0
  return(ret);
1016
0
}
1017
1018
static void reg_grow(regex_t *preg, int n)
1019
0
{
1020
0
  if (preg->p + n >= preg->proglen) {
1021
0
    preg->proglen = (preg->p + n) * 2;
1022
0
    preg->program = realloc(preg->program, preg->proglen * sizeof(int));
1023
0
  }
1024
0
}
1025
1026
/*
1027
 - regnode - emit a node
1028
 */
1029
/* Location. */
1030
static int regnode(regex_t *preg, int op)
1031
0
{
1032
0
  reg_grow(preg, 2);
1033
1034
  /* The OP followed by a next pointer */
1035
0
  preg->program[preg->p++] = op;
1036
0
  preg->program[preg->p++] = 0;
1037
1038
  /* Return the start of the node */
1039
0
  return preg->p - 2;
1040
0
}
1041
1042
/*
1043
 - regc - emit (if appropriate) a byte of code
1044
 */
1045
static void regc(regex_t *preg, int b )
1046
0
{
1047
0
  reg_grow(preg, 1);
1048
0
  preg->program[preg->p++] = b;
1049
0
}
1050
1051
/*
1052
 - reginsert - insert an operator in front of already-emitted operand
1053
 *
1054
 * Means relocating the operand.
1055
 * Returns the new location of the original operand.
1056
 */
1057
static int reginsert(regex_t *preg, int op, int size, int opnd )
1058
0
{
1059
0
  reg_grow(preg, size);
1060
1061
  /* Move everything from opnd up */
1062
0
  memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
1063
  /* Zero out the new space */
1064
0
  memset(preg->program + opnd, 0, sizeof(int) * size);
1065
1066
0
  preg->program[opnd] = op;
1067
1068
0
  preg->p += size;
1069
1070
0
  return opnd + size;
1071
0
}
1072
1073
/*
1074
 - regtail - set the next-pointer at the end of a node chain
1075
 */
1076
static void regtail(regex_t *preg, int p, int val)
1077
0
{
1078
0
  int scan;
1079
0
  int temp;
1080
0
  int offset;
1081
1082
  /* Find last node. */
1083
0
  scan = p;
1084
0
  for (;;) {
1085
0
    temp = regnext(preg, scan);
1086
0
    if (temp == 0)
1087
0
      break;
1088
0
    scan = temp;
1089
0
  }
1090
1091
0
  if (OP(preg, scan) == BACK)
1092
0
    offset = scan - val;
1093
0
  else
1094
0
    offset = val - scan;
1095
1096
0
  preg->program[scan + 1] = offset;
1097
0
}
1098
1099
/*
1100
 - regoptail - regtail on operand of first argument; nop if operandless
1101
 */
1102
1103
static void regoptail(regex_t *preg, int p, int val )
1104
0
{
1105
  /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1106
0
  if (p != 0 && OP(preg, p) == BRANCH) {
1107
0
    regtail(preg, OPERAND(p), val);
1108
0
  }
1109
0
}
1110
1111
/*
1112
 * regexec and friends
1113
 */
1114
1115
/*
1116
 * Forwards.
1117
 */
1118
static int regtry(regex_t *preg, const char *string );
1119
static int regmatch(regex_t *preg, int prog);
1120
static int regrepeat(regex_t *preg, int p, int max);
1121
1122
/*
1123
 - regexec - match a regexp against a string
1124
 */
1125
int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1126
0
{
1127
0
  const char *s;
1128
0
  int scan;
1129
1130
  /* Be paranoid... */
1131
0
  if (preg == NULL || preg->program == NULL || string == NULL) {
1132
0
    return REG_ERR_NULL_ARGUMENT;
1133
0
  }
1134
1135
  /* Check validity of program. */
1136
0
  if (*preg->program != REG_MAGIC) {
1137
0
    return REG_ERR_CORRUPTED;
1138
0
  }
1139
1140
#ifdef DEBUG
1141
  fprintf(stderr, "regexec: %s\n", string);
1142
  regdump(preg);
1143
#endif
1144
1145
0
  preg->eflags = eflags;
1146
0
  preg->pmatch = pmatch;
1147
0
  preg->nmatch = nmatch;
1148
0
  preg->start = string; /* All offsets are computed from here */
1149
1150
  /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1151
0
  for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1152
0
    int op = OP(preg, scan);
1153
0
    if (op == END)
1154
0
      break;
1155
0
    if (op == REPX || op == REPXMIN)
1156
0
      preg->program[scan + 4] = 0;
1157
0
  }
1158
1159
  /* If there is a "must appear" string, look for it. */
1160
0
  if (preg->regmust != 0) {
1161
0
    s = string;
1162
0
    while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1163
0
      if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1164
0
        break;
1165
0
      }
1166
0
      s++;
1167
0
    }
1168
0
    if (s == NULL) /* Not present. */
1169
0
      return REG_NOMATCH;
1170
0
  }
1171
1172
  /* Mark beginning of line for ^ . */
1173
0
  preg->regbol = string;
1174
1175
  /* Simplest case:  anchored match need be tried only once (maybe per line). */
1176
0
  if (preg->reganch) {
1177
0
    if (eflags & REG_NOTBOL) {
1178
      /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1179
0
      goto nextline;
1180
0
    }
1181
0
    while (1) {
1182
0
      if (regtry(preg, string)) {
1183
0
        return REG_NOERROR;
1184
0
      }
1185
0
      if (*string) {
1186
0
nextline:
1187
0
        if (preg->cflags & REG_NEWLINE) {
1188
          /* Try the next anchor? */
1189
0
          string = strchr(string, '\n');
1190
0
          if (string) {
1191
0
            preg->regbol = ++string;
1192
0
            continue;
1193
0
          }
1194
0
        }
1195
0
      }
1196
0
      return REG_NOMATCH;
1197
0
    }
1198
0
  }
1199
1200
  /* Messy cases:  unanchored match. */
1201
0
  s = string;
1202
0
  if (preg->regstart != '\0') {
1203
    /* We know what char it must start with. */
1204
0
    while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1205
0
      if (regtry(preg, s))
1206
0
        return REG_NOERROR;
1207
0
      s++;
1208
0
    }
1209
0
  }
1210
0
  else
1211
    /* We don't -- general case. */
1212
0
    while (1) {
1213
0
      if (regtry(preg, s))
1214
0
        return REG_NOERROR;
1215
0
      if (*s == '\0') {
1216
0
        break;
1217
0
      }
1218
0
      else {
1219
0
        int c;
1220
0
        s += utf8_tounicode(s, &c);
1221
0
      }
1222
0
    }
1223
1224
  /* Failure. */
1225
0
  return REG_NOMATCH;
1226
0
}
1227
1228
/*
1229
 - regtry - try match at specific point
1230
 */
1231
      /* 0 failure, 1 success */
1232
static int regtry( regex_t *preg, const char *string )
1233
0
{
1234
0
  int i;
1235
1236
0
  preg->reginput = string;
1237
1238
0
  for (i = 0; i < preg->nmatch; i++) {
1239
0
    if (preg->pmatch) {
1240
0
      preg->pmatch[i].rm_so = -1;
1241
0
      preg->pmatch[i].rm_eo = -1;
1242
0
    }
1243
0
  }
1244
0
  if (regmatch(preg, 1)) {
1245
0
    if (preg->pmatch) {
1246
0
      preg->pmatch[0].rm_so = string - preg->start;
1247
0
      preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1248
0
    }
1249
0
    return(1);
1250
0
  } else
1251
0
    return(0);
1252
0
}
1253
1254
/**
1255
 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1256
 *
1257
 * If 'nocase' is non-zero, does a case-insensitive match.
1258
 *
1259
 * Returns -1 on not found.
1260
 */
1261
static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1262
0
{
1263
0
  const char *s = string;
1264
0
  while (proglen && *s) {
1265
0
    int ch;
1266
0
    int n = reg_utf8_tounicode_case(s, &ch, nocase);
1267
0
    if (ch != *prog) {
1268
0
      return -1;
1269
0
    }
1270
0
    prog++;
1271
0
    s += n;
1272
0
    proglen--;
1273
0
  }
1274
0
  if (proglen == 0) {
1275
0
    return s - string;
1276
0
  }
1277
0
  return -1;
1278
0
}
1279
1280
/**
1281
 * Searches for 'c' in the range 'range'.
1282
 *
1283
 * Returns 1 if found, or 0 if not.
1284
 */
1285
static int reg_range_find(const int *range, int c)
1286
0
{
1287
0
  while (*range) {
1288
    /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1289
0
    if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1290
0
      return 1;
1291
0
    }
1292
0
    range += 2;
1293
0
  }
1294
0
  return 0;
1295
0
}
1296
1297
/**
1298
 * Search for the character 'c' in the utf-8 string 'string'.
1299
 *
1300
 * If 'nocase' is set, the 'string' is assumed to be uppercase
1301
 * and 'c' is converted to uppercase before matching.
1302
 *
1303
 * Returns the byte position in the string where the 'c' was found, or
1304
 * NULL if not found.
1305
 */
1306
static const char *str_find(const char *string, int c, int nocase)
1307
0
{
1308
0
  if (nocase) {
1309
    /* The "string" should already be converted to uppercase */
1310
0
    c = utf8_upper(c);
1311
0
  }
1312
0
  while (*string) {
1313
0
    int ch;
1314
0
    int n = reg_utf8_tounicode_case(string, &ch, nocase);
1315
0
    if (c == ch) {
1316
0
      return string;
1317
0
    }
1318
0
    string += n;
1319
0
  }
1320
0
  return NULL;
1321
0
}
1322
1323
/**
1324
 * Returns true if 'ch' is an end-of-line char.
1325
 *
1326
 * In REG_NEWLINE mode, \n is considered EOL in
1327
 * addition to \0
1328
 */
1329
static int reg_iseol(regex_t *preg, int ch)
1330
0
{
1331
0
  if (preg->cflags & REG_NEWLINE) {
1332
0
    return ch == '\0' || ch == '\n';
1333
0
  }
1334
0
  else {
1335
0
    return ch == '\0';
1336
0
  }
1337
0
}
1338
1339
static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1340
0
{
1341
0
  int nextch = '\0';
1342
0
  const char *save;
1343
0
  int no;
1344
0
  int c;
1345
1346
0
  int max = preg->program[scan + 2];
1347
0
  int min = preg->program[scan + 3];
1348
0
  int next = regnext(preg, scan);
1349
1350
  /*
1351
   * Lookahead to avoid useless match attempts
1352
   * when we know what character comes next.
1353
   */
1354
0
  if (OP(preg, next) == EXACTLY) {
1355
0
    nextch = preg->program[OPERAND(next)];
1356
0
  }
1357
0
  save = preg->reginput;
1358
0
  no = regrepeat(preg, scan + 5, max);
1359
0
  if (no < min) {
1360
0
    return 0;
1361
0
  }
1362
0
  if (matchmin) {
1363
    /* from min up to no */
1364
0
    max = no;
1365
0
    no = min;
1366
0
  }
1367
  /* else from no down to min */
1368
0
  while (1) {
1369
0
    if (matchmin) {
1370
0
      if (no > max) {
1371
0
        break;
1372
0
      }
1373
0
    }
1374
0
    else {
1375
0
      if (no < min) {
1376
0
        break;
1377
0
      }
1378
0
    }
1379
0
    preg->reginput = save + utf8_index(save, no);
1380
0
    reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1381
    /* If it could work, try it. */
1382
0
    if (reg_iseol(preg, nextch) || c == nextch) {
1383
0
      if (regmatch(preg, next)) {
1384
0
        return(1);
1385
0
      }
1386
0
    }
1387
0
    if (matchmin) {
1388
      /* Couldn't or didn't, add one more */
1389
0
      no++;
1390
0
    }
1391
0
    else {
1392
      /* Couldn't or didn't -- back up. */
1393
0
      no--;
1394
0
    }
1395
0
  }
1396
0
  return(0);
1397
0
}
1398
1399
static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1400
0
{
1401
0
  int *scanpt = preg->program + scan;
1402
1403
0
  int max = scanpt[2];
1404
0
  int min = scanpt[3];
1405
1406
  /* Have we reached min? */
1407
0
  if (scanpt[4] < min) {
1408
    /* No, so get another one */
1409
0
    scanpt[4]++;
1410
0
    if (regmatch(preg, scan + 5)) {
1411
0
      return 1;
1412
0
    }
1413
0
    scanpt[4]--;
1414
0
    return 0;
1415
0
  }
1416
0
  if (scanpt[4] > max) {
1417
0
    return 0;
1418
0
  }
1419
1420
0
  if (matchmin) {
1421
    /* minimal, so try other branch first */
1422
0
    if (regmatch(preg, regnext(preg, scan))) {
1423
0
      return 1;
1424
0
    }
1425
    /* No, so try one more */
1426
0
    scanpt[4]++;
1427
0
    if (regmatch(preg, scan + 5)) {
1428
0
      return 1;
1429
0
    }
1430
0
    scanpt[4]--;
1431
0
    return 0;
1432
0
  }
1433
  /* maximal, so try this branch again */
1434
0
  if (scanpt[4] < max) {
1435
0
    scanpt[4]++;
1436
0
    if (regmatch(preg, scan + 5)) {
1437
0
      return 1;
1438
0
    }
1439
0
    scanpt[4]--;
1440
0
  }
1441
  /* At this point we are at max with no match. Try the other branch */
1442
0
  return regmatch(preg, regnext(preg, scan));
1443
0
}
1444
1445
/*
1446
 - regmatch - main matching routine
1447
 *
1448
 * Conceptually the strategy is simple:  check to see whether the current
1449
 * node matches, call self recursively to see whether the rest matches,
1450
 * and then act accordingly.  In practice we make some effort to avoid
1451
 * recursion, in particular by going through "ordinary" nodes (that don't
1452
 * need to know whether the rest of the match failed) by a loop instead of
1453
 * by recursion.
1454
 */
1455
/* 0 failure, 1 success */
1456
static int regmatch(regex_t *preg, int prog)
1457
0
{
1458
0
  int scan; /* Current node. */
1459
0
  int next;   /* Next node. */
1460
0
  const char *save;
1461
1462
0
  scan = prog;
1463
1464
#ifdef DEBUG
1465
  if (scan != 0 && regnarrate)
1466
    fprintf(stderr, "%s(\n", regprop(scan));
1467
#endif
1468
0
  while (scan != 0) {
1469
0
    int n;
1470
0
    int c;
1471
#ifdef DEBUG
1472
    if (regnarrate) {
1473
      fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1474
    }
1475
#endif
1476
0
    next = regnext(preg, scan);
1477
0
    n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1478
1479
0
    switch (OP(preg, scan)) {
1480
0
    case BOLX:
1481
0
      if ((preg->eflags & REG_NOTBOL)) {
1482
0
        return(0);
1483
0
      }
1484
      /* Fall through */
1485
0
    case BOL:
1486
0
      if (preg->reginput != preg->regbol) {
1487
0
        return(0);
1488
0
      }
1489
0
      break;
1490
0
    case EOLX:
1491
0
      if (c != 0) {
1492
        /* For EOLX, only match real end of line, not newline */
1493
0
        return 0;
1494
0
      }
1495
0
      break;
1496
0
    case EOL:
1497
0
      if (!reg_iseol(preg, c)) {
1498
0
        return(0);
1499
0
      }
1500
0
      break;
1501
0
    case WORDA:
1502
      /* Must be looking at a letter, digit, or _ */
1503
0
      if ((!isalnum(UCHAR(c))) && c != '_')
1504
0
        return(0);
1505
      /* Prev must be BOL or nonword */
1506
0
      if (preg->reginput > preg->regbol &&
1507
0
        (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1508
0
        return(0);
1509
0
      break;
1510
0
    case WORDZ:
1511
      /* Can't match at BOL */
1512
0
      if (preg->reginput > preg->regbol) {
1513
        /* Current must be EOL or nonword */
1514
0
        if (reg_iseol(preg, c) || !(isalnum(UCHAR(c)) || c == '_')) {
1515
0
          c = preg->reginput[-1];
1516
          /* Previous must be word */
1517
0
          if (isalnum(UCHAR(c)) || c == '_') {
1518
0
            break;
1519
0
          }
1520
0
        }
1521
0
      }
1522
      /* No */
1523
0
      return(0);
1524
1525
0
    case ANY:
1526
0
      if (reg_iseol(preg, c))
1527
0
        return 0;
1528
0
      preg->reginput += n;
1529
0
      break;
1530
0
    case EXACTLY: {
1531
0
        int opnd;
1532
0
        int len;
1533
0
        int slen;
1534
1535
0
        opnd = OPERAND(scan);
1536
0
        len = str_int_len(preg->program + opnd);
1537
1538
0
        slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1539
0
        if (slen < 0) {
1540
0
          return(0);
1541
0
        }
1542
0
        preg->reginput += slen;
1543
0
      }
1544
0
      break;
1545
0
    case ANYOF:
1546
0
      if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1547
0
        return(0);
1548
0
      }
1549
0
      preg->reginput += n;
1550
0
      break;
1551
0
    case ANYBUT:
1552
0
      if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1553
0
        return(0);
1554
0
      }
1555
0
      preg->reginput += n;
1556
0
      break;
1557
0
    case NOTHING:
1558
0
      break;
1559
0
    case BACK:
1560
0
      break;
1561
0
    case BRANCH:
1562
0
      if (OP(preg, next) != BRANCH)   /* No choice. */
1563
0
        next = OPERAND(scan); /* Avoid recursion. */
1564
0
      else {
1565
0
        do {
1566
0
          save = preg->reginput;
1567
0
          if (regmatch(preg, OPERAND(scan))) {
1568
0
            return(1);
1569
0
          }
1570
0
          preg->reginput = save;
1571
0
          scan = regnext(preg, scan);
1572
0
        } while (scan != 0 && OP(preg, scan) == BRANCH);
1573
0
        return(0);
1574
        /* NOTREACHED */
1575
0
      }
1576
0
      break;
1577
0
    case REP:
1578
0
    case REPMIN:
1579
0
      return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1580
1581
0
    case REPX:
1582
0
    case REPXMIN:
1583
0
      return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1584
1585
0
    case END:
1586
0
      return 1; /* Success! */
1587
1588
0
    case OPENNC:
1589
0
    case CLOSENC:
1590
0
      return regmatch(preg, next);
1591
1592
0
    default:
1593
0
      if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1594
0
        save = preg->reginput;
1595
0
        if (regmatch(preg, next)) {
1596
0
          if (OP(preg, scan) < CLOSE) {
1597
0
            int no = OP(preg, scan) - OPEN;
1598
0
            if (no < preg->nmatch && preg->pmatch && preg->pmatch[no].rm_so == -1) {
1599
0
              preg->pmatch[no].rm_so = save - preg->start;
1600
0
            }
1601
0
          }
1602
0
          else {
1603
0
            int no = OP(preg, scan) - CLOSE;
1604
0
            if (no < preg->nmatch && preg->pmatch && preg->pmatch[no].rm_eo == -1) {
1605
0
              preg->pmatch[no].rm_eo = save - preg->start;
1606
0
            }
1607
0
          }
1608
0
          return(1);
1609
0
        }
1610
        /* Restore input position after failure */
1611
0
        preg->reginput = save;
1612
0
        return(0);
1613
0
      }
1614
0
      return REG_ERR_INTERNAL;
1615
0
    }
1616
1617
0
    scan = next;
1618
0
  }
1619
1620
  /*
1621
   * We get here only if there's trouble -- normally "case END" is
1622
   * the terminating point.
1623
   */
1624
0
  return REG_ERR_INTERNAL;
1625
0
}
1626
1627
/*
1628
 - regrepeat - repeatedly match something simple, report how many
1629
 */
1630
static int regrepeat(regex_t *preg, int p, int max)
1631
0
{
1632
0
  int count = 0;
1633
0
  const char *scan;
1634
0
  int opnd;
1635
0
  int ch;
1636
0
  int n;
1637
1638
0
  scan = preg->reginput;
1639
0
  opnd = OPERAND(p);
1640
0
  switch (OP(preg, p)) {
1641
0
  case ANY:
1642
0
    while (!reg_iseol(preg, *scan) && count < max) {
1643
0
      count++;
1644
0
      scan += utf8_charlen(*scan);
1645
0
    }
1646
0
    break;
1647
0
  case EXACTLY:
1648
0
    while (count < max) {
1649
0
      n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1650
0
      if (preg->program[opnd] != ch) {
1651
0
        break;
1652
0
      }
1653
0
      count++;
1654
0
      scan += n;
1655
0
    }
1656
0
    break;
1657
0
  case ANYOF:
1658
0
    while (count < max) {
1659
0
      n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1660
0
      if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1661
0
        break;
1662
0
      }
1663
0
      count++;
1664
0
      scan += n;
1665
0
    }
1666
0
    break;
1667
0
  case ANYBUT:
1668
0
    while (count < max) {
1669
0
      n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1670
0
      if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1671
0
        break;
1672
0
      }
1673
0
      count++;
1674
0
      scan += n;
1675
0
    }
1676
0
    break;
1677
0
  default:    /* Oh dear.  Called inappropriately. */
1678
0
    preg->err = REG_ERR_INTERNAL;
1679
0
    count = 0;  /* Best compromise. */
1680
0
    break;
1681
0
  }
1682
0
  preg->reginput = scan;
1683
1684
0
  return(count);
1685
0
}
1686
1687
/*
1688
 - regnext - dig the "next" pointer out of a node
1689
 */
1690
static int regnext(regex_t *preg, int p )
1691
0
{
1692
0
  int offset;
1693
1694
0
  offset = NEXT(preg, p);
1695
1696
0
  if (offset == 0)
1697
0
    return 0;
1698
1699
0
  if (OP(preg, p) == BACK)
1700
0
    return(p-offset);
1701
0
  else
1702
0
    return(p+offset);
1703
0
}
1704
1705
/*
1706
 - regopsize - returns the size of opcode + operands at 'p' in words
1707
 */
1708
static int regopsize(regex_t *preg, int p )
1709
0
{
1710
  /* Almost all opcodes are 2 words, but some are more */
1711
0
  switch (OP(preg, p)) {
1712
0
    case REP:
1713
0
    case REPMIN:
1714
0
    case REPX:
1715
0
    case REPXMIN:
1716
0
      return 5;
1717
1718
0
    case ANYOF:
1719
0
    case ANYBUT:
1720
0
    case EXACTLY: {
1721
0
      int s = p + 2;
1722
0
      while (preg->program[s++]) {
1723
0
      }
1724
0
      return s - p;
1725
0
    }
1726
0
  }
1727
0
  return 2;
1728
0
}
1729
1730
#if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1731
1732
/*
1733
 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1734
 */
1735
static void regdump(regex_t *preg)
1736
{
1737
  int s;
1738
  int op = EXACTLY; /* Arbitrary non-END op. */
1739
  int next;
1740
  char buf[MAX_UTF8_LEN + 1];
1741
1742
  int i;
1743
  for (i = 1; i < preg->p; i++) {
1744
    printf("%02x ", (unsigned char)preg->program[i]);
1745
    if (i % 16 == 0) {
1746
      printf("\n");
1747
    }
1748
  }
1749
  printf("\n");
1750
1751
  s = 1;
1752
  while (op != END && s < preg->p) {  /* While that wasn't END last time... */
1753
    op = OP(preg, s);
1754
    printf("%3d: %s", s, regprop(op));  /* Where, what. */
1755
    next = regnext(preg, s);
1756
    if (next == 0)    /* Next ptr. */
1757
      printf("(0)");
1758
    else
1759
      printf("(%d)", next);
1760
    s += 2;
1761
    if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1762
      int max = preg->program[s];
1763
      int min = preg->program[s + 1];
1764
      if (max == 65535) {
1765
        printf("{%d,*}", min);
1766
      }
1767
      else {
1768
        printf("{%d,%d}", min, max);
1769
      }
1770
      printf(" %d", preg->program[s + 2]);
1771
      s += 3;
1772
    }
1773
    else if (op == ANYOF || op == ANYBUT) {
1774
      /* set of ranges */
1775
1776
      while (preg->program[s]) {
1777
        int len = preg->program[s++];
1778
        int first = preg->program[s++];
1779
        buf[utf8_getchars(buf, first)] = 0;
1780
        printf("%s", buf);
1781
        if (len > 1) {
1782
          buf[utf8_getchars(buf, first + len - 1)] = 0;
1783
          printf("-%s", buf);
1784
        }
1785
      }
1786
      s++;
1787
    }
1788
    else if (op == EXACTLY) {
1789
      /* Literal string, where present. */
1790
1791
      while (preg->program[s]) {
1792
        buf[utf8_getchars(buf, preg->program[s])] = 0;
1793
        printf("%s", buf);
1794
        s++;
1795
      }
1796
      s++;
1797
    }
1798
    putchar('\n');
1799
  }
1800
1801
  if (op == END) {
1802
    /* Header fields of interest. */
1803
    if (preg->regstart) {
1804
      buf[utf8_getchars(buf, preg->regstart)] = 0;
1805
      printf("start '%s' ", buf);
1806
    }
1807
    if (preg->reganch)
1808
      printf("anchored ");
1809
    if (preg->regmust != 0) {
1810
      int i;
1811
      printf("must have:");
1812
      for (i = 0; i < preg->regmlen; i++) {
1813
        putchar(preg->program[preg->regmust + i]);
1814
      }
1815
      putchar('\n');
1816
    }
1817
  }
1818
  printf("\n");
1819
}
1820
1821
/*
1822
 - regprop - printable representation of opcode
1823
 */
1824
static const char *regprop( int op )
1825
{
1826
  static char buf[50];
1827
1828
  switch (op) {
1829
  case BOL:
1830
    return "BOL";
1831
  case EOL:
1832
    return "EOL";
1833
  case BOLX:
1834
    return "BOLX";
1835
  case EOLX:
1836
    return "EOLX";
1837
  case ANY:
1838
    return "ANY";
1839
  case ANYOF:
1840
    return "ANYOF";
1841
  case ANYBUT:
1842
    return "ANYBUT";
1843
  case BRANCH:
1844
    return "BRANCH";
1845
  case EXACTLY:
1846
    return "EXACTLY";
1847
  case NOTHING:
1848
    return "NOTHING";
1849
  case BACK:
1850
    return "BACK";
1851
  case END:
1852
    return "END";
1853
  case REP:
1854
    return "REP";
1855
  case REPMIN:
1856
    return "REPMIN";
1857
  case REPX:
1858
    return "REPX";
1859
  case REPXMIN:
1860
    return "REPXMIN";
1861
  case WORDA:
1862
    return "WORDA";
1863
  case WORDZ:
1864
    return "WORDZ";
1865
  case OPENNC:
1866
    return "OPEN";
1867
  case CLOSENC:
1868
    return "CLOSE";
1869
  default:
1870
    if (op >= OPEN && op < CLOSE) {
1871
      snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1872
    }
1873
    else if (op >= CLOSE && op < CLOSE_END) {
1874
      snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1875
    }
1876
    else {
1877
      snprintf(buf, sizeof(buf), "?%d?\n", op);
1878
    }
1879
    return(buf);
1880
  }
1881
}
1882
#endif /* JIM_BOOTSTRAP */
1883
1884
size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
1885
0
{
1886
0
  static const char *error_strings[] = {
1887
0
    "success",
1888
0
    "no match",
1889
0
    "bad pattern",
1890
0
    "null argument",
1891
0
    "unknown error",
1892
0
    "too big",
1893
0
    "out of memory",
1894
0
    "too many ()",
1895
0
    "parentheses () not balanced",
1896
0
    "braces {} not balanced",
1897
0
    "invalid repetition count(s)",
1898
0
    "extra characters",
1899
0
    "*+ of empty atom",
1900
0
    "nested count",
1901
0
    "internal error",
1902
0
    "count follows nothing",
1903
0
    "invalid escape \\ sequence",
1904
0
    "corrupted program",
1905
0
    "contains null char",
1906
0
    "brackets [] not balanced",
1907
0
  };
1908
0
  const char *err;
1909
1910
0
        (void)preg;
1911
1912
0
  if (errcode < 0 || errcode >= REG_ERR_NUM) {
1913
0
    err = "Bad error code";
1914
0
  }
1915
0
  else {
1916
0
    err = error_strings[errcode];
1917
0
  }
1918
1919
0
  return snprintf(errbuf, errbuf_size, "%s", err);
1920
0
}
1921
1922
void regfree(regex_t *preg)
1923
0
{
1924
0
  free(preg->program);
1925
0
}
1926
1927
#endif