/src/gnupg/regexp/jimregexp.c

Source
/*
 * vi:se ts=8:
 *
 * regcomp and regexec -- regsub and regerror are elsewhere
 *
 *  Copyright (c) 1986 by University of Toronto.
 *  Written by Henry Spencer.  Not derived from licensed software.
 *
 *  Permission is granted to anyone to use this software for any
 *  purpose on any computer system, and to redistribute it freely,
 *  subject to the following restrictions:
 *
 *  1. The author is not responsible for the consequences of use of
 *    this software, no matter how awful, even if they arise
 *    from defects in it.
 *
 *  2. The origin of this software must not be misrepresented, either
 *    by explicit claim or by omission.
 *
 *  3. Altered versions must be plainly marked as such, and must not
 *    be misrepresented as being the original software.
 *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
 *** to assist in implementing egrep.
 *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
 *** as in BSD grep and ex.
 *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
 *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
 *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
 *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
 *** was moved into regexp.h, and the include of regexp.h now uses "'s
 *** to avoid conflicting with the system regexp.h.  Const, bless its
 *** soul, was removed so it can compile everywhere.  The declaration
 *** of strchr() was in conflict on AIX, so it was removed (as it is
 *** happily defined in string.h).
 *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
 *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
 *
 *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
 *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
 *   This includes counted repetitions, UTF-8 support, character classes,
 *   shorthand character classes, increased number of parentheses to 100,
 *   backslash escape sequences. It also removes \n as an alternative to |.
 *
 *** THIS IS AN ALTERED VERSION.  It was altered to offer POSIX-like
 *** regular expression routines of regcomp/regexec/regerror/regfree,
 *** with UTF-8 support, by NIIBE Yutaka <gniibe@fsij.org> on
 *** 2020-02-14.
 *
 * Beware that some of this code is subtly aware of the way operator
 * precedence is structured in regular expressions.  Serious changes in
 * regular-expression syntax might require a total rethink.
 */

#if defined(JIM_REGEXP)
#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#include "jimregexp.h"
#include "utf8.h"

#define UCHAR(c) ((unsigned char)(c))

/* An arbitrary limit, but this seems enough. Must be less than 1000. */
#define REG_MAX_PAREN 100

/*
 * Structure for regexp "program".  This is essentially a linear encoding
 * of a nondeterministic finite-state machine (aka syntax charts or
 * "railroad normal form" in parsing technology).  Each node is an opcode
 * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
 * all nodes except BRANCH implement concatenation; a "next" pointer with
 * a BRANCH on both ends of it is connecting two alternatives.  (Here we
 * have one of the subtle syntax dependencies:  an individual BRANCH (as
 * opposed to a collection of them) is never concatenated with anything
 * because of operator precedence.)  The operand of some types of node is
 * a literal string; for others, it is a node leading into a sub-FSM.  In
 * particular, the operand of a BRANCH node is the first node of the branch.
 * (NB this is *not* a tree structure:  the tail of the branch connects
 * to the thing following the set of BRANCHes.)  The opcodes are:
 */

/* definition   number  opnd? meaning */
#define END 0  /* no End of program. */
#define BOL 1  /* no Match "" at beginning of line. */
#define EOL 2  /* no Match "" at end of line. */
#define ANY 3  /* no Match any one character. */
#define ANYOF 4  /* str  Match any character in this string. */
#define ANYBUT  5  /* str  Match any character not in this string. */
#define BRANCH  6  /* node Match this alternative, or the next... */
#define BACK  7  /* no Match "", "next" ptr points backward. */
#define EXACTLY 8  /* str  Match this string. */
#define NOTHING 9  /* no Match empty string. */
#define REP 10  /* max,min  Match this (simple) thing [min,max] times. */
#define REPMIN  11  /* max,min  Match this (simple) thing [min,max] times, minimal match. */
#define REPX  12  /* max,min  Match this (complex) thing [min,max] times. */
#define REPXMIN 13  /* max,min  Match this (complex) thing [min,max] times, minimal match. */
#define BOLX  14  /* no Match "" at beginning of input. */
#define EOLX  15  /* no Match "" at end of input. */
#define WORDA 16  /* no Match "" at wordchar, where prev is nonword */
#define WORDZ 17  /* no Match "" at nonwordchar, where prev is word */

#define OPENNC  1000  /* no Non-capturing parentheses - must be OPEN-1 */
#define OPEN    1001  /* no Mark this point in input as start of #n. */
      /*  OPEN+1 is number 1, etc. */

/* must not be any other opts between OPEN and CLOSE */

#define CLOSENC 2000  /* no Non-capturing parentheses - must be CLOSE-1 */
#define CLOSE 2001  /* no Analogous to OPEN. */
#define CLOSE_END (CLOSE+REG_MAX_PAREN)

/*
 * The first word of the regexp internal "program" is actually this magic
 * number; the start node begins in the second word.
 */
#define REG_MAGIC 0xFADED00D

/*
 * Opcode notes:
 *
 * BRANCH The set of branches constituting a single choice are hooked
 *    together with their "next" pointers, since precedence prevents
 *    anything being concatenated to any individual branch.  The
 *    "next" pointer of the last BRANCH in a choice points to the
 *    thing following the whole choice.  This is also where the
 *    final "next" pointer of each individual branch points; each
 *    branch starts with the operand node of a BRANCH node.
 *
 * BACK   Normal "next" pointers all implicitly point forward; BACK
 *    exists to make loop structures possible.
 *
 * REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented
 *              as either simple repeats (REP) or complex repeats (REPX).
 *              These opcodes include a "min" and "max" count after the opcode.
 *    This is followed by a fourth "current count" word that is
 *    only used by REPX, as it implements a recursive match.
 *    REPMIN and REPXMIN are identical except they implement minimal repeats.
 *
 * OPEN,CLOSE ...are numbered at compile time.
 */

/*
 * A node is one word of opcode followed by one word of "next" pointer.
 * The "next" pointer value is a positive offset from the opcode of the node
 * containing it.
 * An operand, if any, simply follows the node.  (Note that much of the
 * code generation knows about this implicit relationship.)
 */
#define OP(preg, p) (preg->program[p])
#define NEXT(preg, p) (preg->program[p + 1])
#define OPERAND(p)  ((p) + 2)

/*
 * See regmagic.h for one further detail of program structure.
 */


/*
 * Utility definitions.
 */

#define FAIL(R,M) { (R)->err = (M); return (M); }
#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
#define META    "^$.[()|?{+*"

/*
 * Flags to be passed up and down.
 */
#define HASWIDTH  1  /* Known never to match null string. */
#define SIMPLE    2  /* Simple enough to be STAR/PLUS operand. */
#define SPSTART   4  /* Starts with * or +. */
#define WORST   0  /* Worst case. */

#define MAX_REP_COUNT 1000000

/*
 * Forward declarations for regcomp()'s friends.
 */
static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
static int regpiece(regex_t *preg, int *flagp );
static int regbranch(regex_t *preg, int *flagp );
static int regatom(regex_t *preg, int *flagp );
static int regnode(regex_t *preg, int op );
static int regnext(regex_t *preg, int p );
static void regc(regex_t *preg, int b );
static int reginsert(regex_t *preg, int op, int size, int opnd );
static void regtail(regex_t *preg, int p, int val);
static void regoptail(regex_t *preg, int p, int val );
static int regopsize(regex_t *preg, int p );

static int reg_range_find(const int *string, int c);
static const char *str_find(const char *string, int c, int nocase);
static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);

/*#define DEBUG*/
#ifdef DEBUG
static int regnarrate = 0;
static void regdump(regex_t *preg);
static const char *regprop( int op );
#endif


/**
 * Returns the length of the null-terminated integer sequence.
 */
static int str_int_len(const int *seq)
{
  int n = 0;
  while (*seq++) {
    n++;
  }
  return n;
}

/*
 - regcomp - compile a regular expression into internal code
 *
 * We can't allocate space until we know how big the compiled form will be,
 * but we can't compile it (and thus know how big it is) until we've got a
 * place to put the code.  So we cheat:  we compile it twice, once with code
 * generation turned off and size counting turned on, and once "for real".
 * This also means that we don't allocate space until we are sure that the
 * thing really will compile successfully, and we never have to move the
 * code and thus invalidate pointers into it.  (Note that it has to be in
 * one piece because free() must be able to free it all.)
 *
 * Beware that the optimization-preparation code in here knows about some
 * of the structure of the compiled regexp.
 */
int regcomp(regex_t *preg, const char *exp, int cflags)
{
  int scan;
  int longest;
  unsigned len;
  int flags;

#ifdef DEBUG
  fprintf(stderr, "Compiling: '%s'\n", exp);
#endif
  memset(preg, 0, sizeof(*preg));

  if (exp == NULL)
    FAIL(preg, REG_ERR_NULL_ARGUMENT);

  /* First pass: determine size, legality. */
  preg->cflags = cflags;
  preg->regparse = exp;

  /* Allocate space. */
  preg->proglen = (strlen(exp) + 1) * 5;
  preg->program = malloc(preg->proglen * sizeof(int));
  if (preg->program == NULL)
    FAIL(preg, REG_ERR_NOMEM);

  /* Note that since we store a magic value as the first item in the program,
   * program offsets will never be 0
   */
  regc(preg, REG_MAGIC);
  if (reg(preg, 0, &flags) == 0) {
    return preg->err;
  }

  /* Small enough for pointer-storage convention? */
  if (preg->re_nsub >= REG_MAX_PAREN)   /* Probably could be 65535L. */
    FAIL(preg,REG_ERR_TOO_BIG);

  /* Dig out information for optimizations. */
  preg->regstart = 0; /* Worst-case defaults. */
  preg->reganch = 0;
  preg->regmust = 0;
  preg->regmlen = 0;
  scan = 1;     /* First BRANCH. */
  if (OP(preg, regnext(preg, scan)) == END) {   /* Only one top-level choice. */
    scan = OPERAND(scan);

    /* Starting-point info. */
    if (OP(preg, scan) == EXACTLY) {
      preg->regstart = preg->program[OPERAND(scan)];
    }
    else if (OP(preg, scan) == BOL)
      preg->reganch++;

    /*
     * If there's something expensive in the r.e., find the
     * longest literal string that must appear and make it the
     * regmust.  Resolve ties in favor of later strings, since
     * the regstart check works with the beginning of the r.e.
     * and avoiding duplication strengthens checking.  Not a
     * strong reason, but sufficient in the absence of others.
     */
    if (flags&SPSTART) {
      longest = 0;
      len = 0;
      for (; scan != 0; scan = regnext(preg, scan)) {
        if (OP(preg, scan) == EXACTLY) {
          int plen = str_int_len(preg->program + OPERAND(scan));
          if (plen >= len) {
            longest = OPERAND(scan);
            len = plen;
          }
        }
      }
      preg->regmust = longest;
      preg->regmlen = len;
    }
  }

#ifdef DEBUG
  regdump(preg);
#endif

  return 0;
}

/*
 - reg - regular expression, i.e. main body or parenthesized thing
 *
 * Caller must absorb opening parenthesis.
 *
 * Combining parenthesis handling with the base level of regular expression
 * is a trifle forced, but the need to tie the tails of the branches to what
 * follows makes it hard to avoid.
 */
static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
{
  int ret;
  int br;
  int ender;
  int parno = 0;
  int flags;

  *flagp = HASWIDTH; /* Tentatively. */

  /* Make an OPEN node, if parenthesized. */
  if (paren) {
    if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
      /* non-capturing paren */
      preg->regparse += 2;
      parno = -1;
    }
    else {
      parno = ++preg->re_nsub;
    }
    ret = regnode(preg, OPEN+parno);
  } else
    ret = 0;

  /* Pick up the branches, linking them together. */
  br = regbranch(preg, &flags);
  if (br == 0)
    return 0;
  if (ret != 0)
    regtail(preg, ret, br); /* OPEN -> first. */
  else
    ret = br;
  if (!(flags&HASWIDTH))
    *flagp &= ~HASWIDTH;
  *flagp |= flags&SPSTART;
  while (*preg->regparse == '|') {
    preg->regparse++;
    br = regbranch(preg, &flags);
    if (br == 0)
      return 0;
    regtail(preg, ret, br); /* BRANCH -> BRANCH. */
    if (!(flags&HASWIDTH))
      *flagp &= ~HASWIDTH;
    *flagp |= flags&SPSTART;
  }

  /* Make a closing node, and hook it on the end. */
  ender = regnode(preg, (paren) ? CLOSE+parno : END);
  regtail(preg, ret, ender);

  /* Hook the tails of the branches to the closing node. */
  for (br = ret; br != 0; br = regnext(preg, br))
    regoptail(preg, br, ender);

  /* Check for proper termination. */
  if (paren && *preg->regparse++ != ')') {
    preg->err = REG_ERR_UNMATCHED_PAREN;
    return 0;
  } else if (!paren && *preg->regparse != '\0') {
    if (*preg->regparse == ')') {
      preg->err = REG_ERR_UNMATCHED_PAREN;
      return 0;
    } else {
      preg->err = REG_ERR_JUNK_ON_END;
      return 0;
    }
  }

  return(ret);
}

/*
 - regbranch - one alternative of an | operator
 *
 * Implements the concatenation operator.
 */
static int regbranch(regex_t *preg, int *flagp )
{
  int ret;
  int chain;
  int latest;
  int flags;

  *flagp = WORST;   /* Tentatively. */

  ret = regnode(preg, BRANCH);
  chain = 0;
  while (*preg->regparse != '\0' && *preg->regparse != ')' &&
         *preg->regparse != '|') {
    latest = regpiece(preg, &flags);
    if (latest == 0)
      return 0;
    *flagp |= flags&HASWIDTH;
    if (chain == 0) {/* First piece. */
      *flagp |= flags&SPSTART;
    }
    else {
      regtail(preg, chain, latest);
    }
    chain = latest;
  }
  if (chain == 0) /* Loop ran zero times. */
    (void) regnode(preg, NOTHING);

  return(ret);
}

/*
 - regpiece - something followed by possible [*+?]
 *
 * Note that the branching code sequences used for ? and the general cases
 * of * and + are somewhat optimized:  they use the same NOTHING node as
 * both the endmarker for their branch list and the body of the last branch.
 * It might seem that this node could be dispensed with entirely, but the
 * endmarker role is not redundant.
 */
static int regpiece(regex_t *preg, int *flagp)
{
  int ret;
  char op;
  int next;
  int flags;
  int min;
  int max;

  ret = regatom(preg, &flags);
  if (ret == 0)
    return 0;

  op = *preg->regparse;
  if (!ISMULT(op)) {
    *flagp = flags;
    return(ret);
  }

  if (!(flags&HASWIDTH) && op != '?') {
    preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
    return 0;
  }

  /* Handle braces (counted repetition) by expansion */
  if (op == '{') {
    char *end;

    min = strtoul(preg->regparse + 1, &end, 10);
    if (end == preg->regparse + 1) {
      preg->err = REG_ERR_BAD_COUNT;
      return 0;
    }
    if (*end == '}') {
      max = min;
    }
    else if (*end == '\0') {
      preg->err = REG_ERR_UNMATCHED_BRACES;
      return 0;
    }
    else {
      preg->regparse = end;
      max = strtoul(preg->regparse + 1, &end, 10);
      if (*end != '}') {
        preg->err = REG_ERR_UNMATCHED_BRACES;
        return 0;
      }
    }
    if (end == preg->regparse + 1) {
      max = MAX_REP_COUNT;
    }
    else if (max < min || max >= 100) {
      preg->err = REG_ERR_BAD_COUNT;
      return 0;
    }
    if (min >= 100) {
      preg->err = REG_ERR_BAD_COUNT;
      return 0;
    }

    preg->regparse = strchr(preg->regparse, '}');
  }
  else {
    min = (op == '+');
    max = (op == '?' ? 1 : MAX_REP_COUNT);
  }

  if (preg->regparse[1] == '?') {
    preg->regparse++;
    next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
  }
  else {
    next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
  }
  preg->program[ret + 2] = max;
  preg->program[ret + 3] = min;
  preg->program[ret + 4] = 0;

  *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);

  if (!(flags & SIMPLE)) {
    int back = regnode(preg, BACK);
    regtail(preg, back, ret);
    regtail(preg, next, back);
  }

  preg->regparse++;
  if (ISMULT(*preg->regparse)) {
    preg->err = REG_ERR_NESTED_COUNT;
    return 0;
  }

  return ret;
}

/**
 * Add all characters in the inclusive range between lower and upper.
 *
 * Handles a swapped range (upper < lower).
 */
static void reg_addrange(regex_t *preg, int lower, int upper)
{
  if (lower > upper) {
    reg_addrange(preg, upper, lower);
  }
  /* Add a range as length, start */
  regc(preg, upper - lower + 1);
  regc(preg, lower);
}

/**
 * Add a null-terminated literal string as a set of ranges.
 */
static void reg_addrange_str(regex_t *preg, const char *str)
{
  while (*str) {
    reg_addrange(preg, *str, *str);
    str++;
  }
}

/**
 * Extracts the next unicode char from utf8.
 *
 * If 'upper' is set, converts the char to uppercase.
 */
static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
{
  int l = utf8_tounicode(s, uc);
  if (upper) {
    *uc = utf8_upper(*uc);
  }
  return l;
}

/**
 * Converts a hex digit to decimal.
 *
 * Returns -1 for an invalid hex digit.
 */
static int hexdigitval(int c)
{
  if (c >= '0' && c <= '9')
    return c - '0';
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  return -1;
}

/**
 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 *
 * Returns the number of hex digits parsed.
 * If there are no hex digits, returns 0 and stores nothing.
 */
static int parse_hex(const char *s, int n, int *uc)
{
  int val = 0;
  int k;

  for (k = 0; k < n; k++) {
    int c = hexdigitval(*s++);
    if (c == -1) {
      break;
    }
    val = (val << 4) | c;
  }
  if (k) {
    *uc = val;
  }
  return k;
}

/**
 * Call for chars after a backlash to decode the escape sequence.
 *
 * Stores the result in *ch.
 *
 * Returns the number of bytes consumed.
 */
static int reg_decode_escape(const char *s, int *ch)
{
  int n;
  const char *s0 = s;

  *ch = *s++;

  switch (*ch) {
    case 'b': *ch = '\b'; break;
    case 'e': *ch = 27; break;
    case 'f': *ch = '\f'; break;
    case 'n': *ch = '\n'; break;
    case 'r': *ch = '\r'; break;
    case 't': *ch = '\t'; break;
    case 'v': *ch = '\v'; break;
    case 'u':
      if (*s == '{') {
        /* Expect \u{NNNN} */
        n = parse_hex(s + 1, 6, ch);
        if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
          s += n + 2;
        }
        else {
          /* Invalid, so just treat as an escaped 'u' */
          *ch = 'u';
        }
      }
      else if ((n = parse_hex(s, 4, ch)) > 0) {
        s += n;
      }
      break;
    case 'U':
      if ((n = parse_hex(s, 8, ch)) > 0) {
        s += n;
      }
      break;
    case 'x':
      if ((n = parse_hex(s, 2, ch)) > 0) {
        s += n;
      }
      break;
    case '\0':
      s--;
      *ch = '\\';
      break;
  }
  return s - s0;
}

/*
 - regatom - the lowest level
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Backslashed characters are exceptions, each becoming a
 * separate node; the code is simpler that way and it's not worth fixing.
 */
static int regatom(regex_t *preg, int *flagp)
{
  int ret;
  int flags;
  int nocase = (preg->cflags & REG_ICASE);

  int ch;
  int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);

  *flagp = WORST;   /* Tentatively. */

  preg->regparse += n;
  switch (ch) {
  /* FIXME: these chars only have meaning at beg/end of pat? */
  case '^':
    ret = regnode(preg, BOL);
    break;
  case '$':
    ret = regnode(preg, EOL);
    break;
  case '.':
    ret = regnode(preg, ANY);
    *flagp |= HASWIDTH|SIMPLE;
    break;
  case '[': {
      const char *pattern = preg->regparse;

      if (*pattern == '^') { /* Complement of range. */
        ret = regnode(preg, ANYBUT);
        pattern++;
      } else
        ret = regnode(preg, ANYOF);

      /* Special case. If the first char is ']' or '-', it is part of the set */
      if (*pattern == ']' || *pattern == '-') {
        reg_addrange(preg, *pattern, *pattern);
        pattern++;
      }

      while (*pattern != ']') {
        /* Is this a range? a-z */
        int start;
        int end;

        enum {
          CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER,
          CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT,
          CC_NUM
        };
        int cc;

        if (!*pattern) {
          preg->err = REG_ERR_UNMATCHED_BRACKET;
          return 0;
        }

        pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
        if (start == '\\') {
          /* First check for class shorthand escapes */
          switch (*pattern) {
            case 's':
              pattern++;
              cc = CC_SPACE;
              goto cc_switch;
            case 'd':
              pattern++;
              cc = CC_DIGIT;
              goto cc_switch;
            case 'w':
              pattern++;
              reg_addrange(preg, '_', '_');
              cc = CC_ALNUM;
              goto cc_switch;
          }
          pattern += reg_decode_escape(pattern, &start);
          if (start == 0) {
            preg->err = REG_ERR_NULL_CHAR;
            return 0;
          }
          if (start == '\\' && *pattern == 0) {
            preg->err = REG_ERR_INVALID_ESCAPE;
            return 0;
          }
        }
        if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
          /* skip '-' */
          pattern += utf8_tounicode(pattern, &end);
          pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
          if (end == '\\') {
            pattern += reg_decode_escape(pattern, &end);
            if (end == 0) {
              preg->err = REG_ERR_NULL_CHAR;
              return 0;
            }
            if (end == '\\' && *pattern == 0) {
              preg->err = REG_ERR_INVALID_ESCAPE;
              return 0;
            }
          }

          reg_addrange(preg, start, end);
          continue;
        }
        if (start == '[' && pattern[0] == ':') {
          static const char *character_class[] = {
            ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
            ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
          };

          for (cc = 0; cc < CC_NUM; cc++) {
            n = strlen(character_class[cc]);
            if (strncmp(pattern, character_class[cc], n) == 0) {
              if (pattern[n] != ']') {
                preg->err = REG_ERR_UNMATCHED_BRACKET;
                return 0;
              }
              /* Found a character class */
              pattern += n + 1;
              break;
            }
          }
          if (cc != CC_NUM) {
cc_switch:
            switch (cc) {
              case CC_ALNUM:
                reg_addrange(preg, '0', '9');
                /* Fall through */
              case CC_ALPHA:
                if ((preg->cflags & REG_ICASE) == 0) {
                  reg_addrange(preg, 'a', 'z');
                }
                reg_addrange(preg, 'A', 'Z');
                break;
              case CC_SPACE:
                reg_addrange_str(preg, " \t\r\n\f\v");
                break;
              case CC_BLANK:
                reg_addrange_str(preg, " \t");
                break;
              case CC_UPPER:
                reg_addrange(preg, 'A', 'Z');
                break;
              case CC_LOWER:
                reg_addrange(preg, 'a', 'z');
                break;
              case CC_XDIGIT:
                reg_addrange(preg, 'a', 'f');
                reg_addrange(preg, 'A', 'F');
                /* Fall through */
              case CC_DIGIT:
                reg_addrange(preg, '0', '9');
                break;
              case CC_CNTRL:
                reg_addrange(preg, 0, 31);
                reg_addrange(preg, 127, 127);
                break;
              case CC_PRINT:
                reg_addrange(preg, ' ', '~');
                break;
              case CC_GRAPH:
                reg_addrange(preg, '!', '~');
                break;
              case CC_PUNCT:
                reg_addrange(preg, '!', '/');
                reg_addrange(preg, ':', '@');
                reg_addrange(preg, '[', '`');
                reg_addrange(preg, '{', '~');
                break;
            }
            continue;
          }
        }
        /* Not a range, so just add the char */
        reg_addrange(preg, start, start);
      }
      regc(preg, '\0');

      if (*pattern) {
        pattern++;
      }
      preg->regparse = pattern;

      *flagp |= HASWIDTH|SIMPLE;
    }
    break;
  case '(':
    ret = reg(preg, 1, &flags);
    if (ret == 0)
      return 0;
    *flagp |= flags&(HASWIDTH|SPSTART);
    break;
  case '\0':
  case '|':
  case ')':
    preg->err = REG_ERR_INTERNAL;
    return 0; /* Supposed to be caught earlier. */
  case '?':
  case '+':
  case '*':
  case '{':
    preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
    return 0;
  case '\\':
    ch = *preg->regparse++;
    switch (ch) {
    case '\0':
      preg->err = REG_ERR_INVALID_ESCAPE;
      return 0;
    case 'A':
      ret = regnode(preg, BOLX);
      break;
    case 'Z':
      ret = regnode(preg, EOLX);
      break;
    case '<':
    case 'm':
      ret = regnode(preg, WORDA);
      break;
    case '>':
    case 'M':
      ret = regnode(preg, WORDZ);
      break;
    case 'd':
    case 'D':
      ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT);
      reg_addrange(preg, '0', '9');
      regc(preg, '\0');
      *flagp |= HASWIDTH|SIMPLE;
      break;
    case 'w':
    case 'W':
      ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT);
      if ((preg->cflags & REG_ICASE) == 0) {
        reg_addrange(preg, 'a', 'z');
      }
      reg_addrange(preg, 'A', 'Z');
      reg_addrange(preg, '0', '9');
      reg_addrange(preg, '_', '_');
      regc(preg, '\0');
      *flagp |= HASWIDTH|SIMPLE;
      break;
    case 's':
    case 'S':
      ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT);
      reg_addrange_str(preg," \t\r\n\f\v");
      regc(preg, '\0');
      *flagp |= HASWIDTH|SIMPLE;
      break;
    /* FIXME: Someday handle \1, \2, ... */
    default:
      /* Handle general quoted chars in exact-match routine */
      /* Back up to include the backslash */
      preg->regparse--;
      goto de_fault;
    }
    break;
  de_fault:
  default: {
      /*
       * Encode a string of characters to be matched exactly.
       */
      int added = 0;

      /* Back up to pick up the first char of interest */
      preg->regparse -= n;

      ret = regnode(preg, EXACTLY);

      /* Note that a META operator such as ? or * consumes the
       * preceding char.
       * Thus we must be careful to look ahead by 2 and add the
       * last char as it's own EXACTLY if necessary
       */

      /* Until end of string or a META char is reached */
      while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
        n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
        if (ch == '\\' && preg->regparse[n]) {
          /* Non-trailing backslash.
           * Is this a special escape, or a regular escape?
           */
          if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) {
            /* A special escape. All done with EXACTLY */
            break;
          }
          /* Decode it. Note that we add the length for the escape
           * sequence to the length for the backlash so we can skip
           * the entire sequence, or not as required.
           */
          n += reg_decode_escape(preg->regparse + n, &ch);
          if (ch == 0) {
            preg->err = REG_ERR_NULL_CHAR;
            return 0;
          }
        }

        /* Now we have one char 'ch' of length 'n'.
         * Check to see if the following char is a MULT
         */

        if (ISMULT(preg->regparse[n])) {
          /* Yes. But do we already have some EXACTLY chars? */
          if (added) {
            /* Yes, so return what we have and pick up the current char next time around */
            break;
          }
          /* No, so add this single char and finish */
          regc(preg, ch);
          added++;
          preg->regparse += n;
          break;
        }

        /* No, so just add this char normally */
        regc(preg, ch);
        added++;
        preg->regparse += n;
      }
      regc(preg, '\0');

      *flagp |= HASWIDTH;
      if (added == 1)
        *flagp |= SIMPLE;
      break;
    }
    break;
  }

  return(ret);
}

static void reg_grow(regex_t *preg, int n)
{
  if (preg->p + n >= preg->proglen) {
    preg->proglen = (preg->p + n) * 2;
    preg->program = realloc(preg->program, preg->proglen * sizeof(int));
  }
}

/*
 - regnode - emit a node
 */
/* Location. */
static int regnode(regex_t *preg, int op)
{
  reg_grow(preg, 2);

  /* The OP followed by a next pointer */
  preg->program[preg->p++] = op;
  preg->program[preg->p++] = 0;

  /* Return the start of the node */
  return preg->p - 2;
}

/*
 - regc - emit (if appropriate) a byte of code
 */
static void regc(regex_t *preg, int b )
{
  reg_grow(preg, 1);
  preg->program[preg->p++] = b;
}

/*
 - reginsert - insert an operator in front of already-emitted operand
 *
 * Means relocating the operand.
 * Returns the new location of the original operand.
 */
static int reginsert(regex_t *preg, int op, int size, int opnd )
{
  reg_grow(preg, size);

  /* Move everything from opnd up */
  memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
  /* Zero out the new space */
  memset(preg->program + opnd, 0, sizeof(int) * size);

  preg->program[opnd] = op;

  preg->p += size;

  return opnd + size;
}

/*
 - regtail - set the next-pointer at the end of a node chain
 */
static void regtail(regex_t *preg, int p, int val)
{
  int scan;
  int temp;
  int offset;

  /* Find last node. */
  scan = p;
  for (;;) {
    temp = regnext(preg, scan);
    if (temp == 0)
      break;
    scan = temp;
  }

  if (OP(preg, scan) == BACK)
    offset = scan - val;
  else
    offset = val - scan;

  preg->program[scan + 1] = offset;
}

/*
 - regoptail - regtail on operand of first argument; nop if operandless
 */

static void regoptail(regex_t *preg, int p, int val )
{
  /* "Operandless" and "op != BRANCH" are synonymous in practice. */
  if (p != 0 && OP(preg, p) == BRANCH) {
    regtail(preg, OPERAND(p), val);
  }
}

/*
 * regexec and friends
 */

/*
 * Forwards.
 */
static int regtry(regex_t *preg, const char *string );
static int regmatch(regex_t *preg, int prog);
static int regrepeat(regex_t *preg, int p, int max);

/*
 - regexec - match a regexp against a string
 */
int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
{
  const char *s;
  int scan;

  /* Be paranoid... */
  if (preg == NULL || preg->program == NULL || string == NULL) {
    return REG_ERR_NULL_ARGUMENT;
  }

  /* Check validity of program. */
  if (*preg->program != REG_MAGIC) {
    return REG_ERR_CORRUPTED;
  }

#ifdef DEBUG
  fprintf(stderr, "regexec: %s\n", string);
  regdump(preg);
#endif

  preg->eflags = eflags;
  preg->pmatch = pmatch;
  preg->nmatch = nmatch;
  preg->start = string; /* All offsets are computed from here */

  /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
  for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
    int op = OP(preg, scan);
    if (op == END)
      break;
    if (op == REPX || op == REPXMIN)
      preg->program[scan + 4] = 0;
  }

  /* If there is a "must appear" string, look for it. */
  if (preg->regmust != 0) {
    s = string;
    while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
      if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
        break;
      }
      s++;
    }
    if (s == NULL) /* Not present. */
      return REG_NOMATCH;
  }

  /* Mark beginning of line for ^ . */
  preg->regbol = string;

  /* Simplest case:  anchored match need be tried only once (maybe per line). */
  if (preg->reganch) {
    if (eflags & REG_NOTBOL) {
      /* This is an anchored search, but not an BOL, so possibly skip to the next line */
      goto nextline;
    }
    while (1) {
      if (regtry(preg, string)) {
        return REG_NOERROR;
      }
      if (*string) {
nextline:
        if (preg->cflags & REG_NEWLINE) {
          /* Try the next anchor? */
          string = strchr(string, '\n');
          if (string) {
            preg->regbol = ++string;
            continue;
          }
        }
      }
      return REG_NOMATCH;
    }
  }

  /* Messy cases:  unanchored match. */
  s = string;
  if (preg->regstart != '\0') {
    /* We know what char it must start with. */
    while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
      if (regtry(preg, s))
        return REG_NOERROR;
      s++;
    }
  }
  else
    /* We don't -- general case. */
    while (1) {
      if (regtry(preg, s))
        return REG_NOERROR;
      if (*s == '\0') {
        break;
      }
      else {
        int c;
        s += utf8_tounicode(s, &c);
      }
    }

  /* Failure. */
  return REG_NOMATCH;
}

/*
 - regtry - try match at specific point
 */
      /* 0 failure, 1 success */
static int regtry( regex_t *preg, const char *string )
{
  int i;

  preg->reginput = string;

  for (i = 0; i < preg->nmatch; i++) {
    if (preg->pmatch) {
      preg->pmatch[i].rm_so = -1;
      preg->pmatch[i].rm_eo = -1;
    }
  }
  if (regmatch(preg, 1)) {
    if (preg->pmatch) {
      preg->pmatch[0].rm_so = string - preg->start;
      preg->pmatch[0].rm_eo = preg->reginput - preg->start;
    }
    return(1);
  } else
    return(0);
}

/**
 * Returns bytes matched if 'pattern' is a prefix of 'string'.
 *
 * If 'nocase' is non-zero, does a case-insensitive match.
 *
 * Returns -1 on not found.
 */
static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
{
  const char *s = string;
  while (proglen && *s) {
    int ch;
    int n = reg_utf8_tounicode_case(s, &ch, nocase);
    if (ch != *prog) {
      return -1;
    }
    prog++;
    s += n;
    proglen--;
  }
  if (proglen == 0) {
    return s - string;
  }
  return -1;
}

/**
 * Searches for 'c' in the range 'range'.
 *
 * Returns 1 if found, or 0 if not.
 */
static int reg_range_find(const int *range, int c)
{
  while (*range) {
    /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
    if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
      return 1;
    }
    range += 2;
  }
  return 0;
}

/**
 * Search for the character 'c' in the utf-8 string 'string'.
 *
 * If 'nocase' is set, the 'string' is assumed to be uppercase
 * and 'c' is converted to uppercase before matching.
 *
 * Returns the byte position in the string where the 'c' was found, or
 * NULL if not found.
 */
static const char *str_find(const char *string, int c, int nocase)
{
  if (nocase) {
    /* The "string" should already be converted to uppercase */
    c = utf8_upper(c);
  }
  while (*string) {
    int ch;
    int n = reg_utf8_tounicode_case(string, &ch, nocase);
    if (c == ch) {
      return string;
    }
    string += n;
  }
  return NULL;
}

/**
 * Returns true if 'ch' is an end-of-line char.
 *
 * In REG_NEWLINE mode, \n is considered EOL in
 * addition to \0
 */
static int reg_iseol(regex_t *preg, int ch)
{
  if (preg->cflags & REG_NEWLINE) {
    return ch == '\0' || ch == '\n';
  }
  else {
    return ch == '\0';
  }
}

static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
{
  int nextch = '\0';
  const char *save;
  int no;
  int c;

  int max = preg->program[scan + 2];
  int min = preg->program[scan + 3];
  int next = regnext(preg, scan);

  /*
   * Lookahead to avoid useless match attempts
   * when we know what character comes next.
   */
  if (OP(preg, next) == EXACTLY) {
    nextch = preg->program[OPERAND(next)];
  }
  save = preg->reginput;
  no = regrepeat(preg, scan + 5, max);
  if (no < min) {
    return 0;
  }
  if (matchmin) {
    /* from min up to no */
    max = no;
    no = min;
  }
  /* else from no down to min */
  while (1) {
    if (matchmin) {
      if (no > max) {
        break;
      }
    }
    else {
      if (no < min) {
        break;
      }
    }
    preg->reginput = save + utf8_index(save, no);
    reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
    /* If it could work, try it. */
    if (reg_iseol(preg, nextch) || c == nextch) {
      if (regmatch(preg, next)) {
        return(1);
      }
    }
    if (matchmin) {
      /* Couldn't or didn't, add one more */
      no++;
    }
    else {
      /* Couldn't or didn't -- back up. */
      no--;
    }
  }
  return(0);
}

static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
{
  int *scanpt = preg->program + scan;

  int max = scanpt[2];
  int min = scanpt[3];

  /* Have we reached min? */
  if (scanpt[4] < min) {
    /* No, so get another one */
    scanpt[4]++;
    if (regmatch(preg, scan + 5)) {
      return 1;
    }
    scanpt[4]--;
    return 0;
  }
  if (scanpt[4] > max) {
    return 0;
  }

  if (matchmin) {
    /* minimal, so try other branch first */
    if (regmatch(preg, regnext(preg, scan))) {
      return 1;
    }
    /* No, so try one more */
    scanpt[4]++;
    if (regmatch(preg, scan + 5)) {
      return 1;
    }
    scanpt[4]--;
    return 0;
  }
  /* maximal, so try this branch again */
  if (scanpt[4] < max) {
    scanpt[4]++;
    if (regmatch(preg, scan + 5)) {
      return 1;
    }
    scanpt[4]--;
  }
  /* At this point we are at max with no match. Try the other branch */
  return regmatch(preg, regnext(preg, scan));
}

/*
 - regmatch - main matching routine
 *
 * Conceptually the strategy is simple:  check to see whether the current
 * node matches, call self recursively to see whether the rest matches,
 * and then act accordingly.  In practice we make some effort to avoid
 * recursion, in particular by going through "ordinary" nodes (that don't
 * need to know whether the rest of the match failed) by a loop instead of
 * by recursion.
 */
/* 0 failure, 1 success */
static int regmatch(regex_t *preg, int prog)
{
  int scan; /* Current node. */
  int next;   /* Next node. */
  const char *save;

  scan = prog;

#ifdef DEBUG
  if (scan != 0 && regnarrate)
    fprintf(stderr, "%s(\n", regprop(scan));
#endif
  while (scan != 0) {
    int n;
    int c;
#ifdef DEBUG
    if (regnarrate) {
      fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
    }
#endif
    next = regnext(preg, scan);
    n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));

    switch (OP(preg, scan)) {
    case BOLX:
      if ((preg->eflags & REG_NOTBOL)) {
        return(0);
      }
      /* Fall through */
    case BOL:
      if (preg->reginput != preg->regbol) {
        return(0);
      }
      break;
    case EOLX:
      if (c != 0) {
        /* For EOLX, only match real end of line, not newline */
        return 0;
      }
      break;
    case EOL:
      if (!reg_iseol(preg, c)) {
        return(0);
      }
      break;
    case WORDA:
      /* Must be looking at a letter, digit, or _ */
      if ((!isalnum(UCHAR(c))) && c != '_')
        return(0);
      /* Prev must be BOL or nonword */
      if (preg->reginput > preg->regbol &&
        (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
        return(0);
      break;
    case WORDZ:
      /* Can't match at BOL */
      if (preg->reginput > preg->regbol) {
        /* Current must be EOL or nonword */
        if (reg_iseol(preg, c) || !(isalnum(UCHAR(c)) || c == '_')) {
          c = preg->reginput[-1];
          /* Previous must be word */
          if (isalnum(UCHAR(c)) || c == '_') {
            break;
          }
        }
      }
      /* No */
      return(0);

    case ANY:
      if (reg_iseol(preg, c))
        return 0;
      preg->reginput += n;
      break;
    case EXACTLY: {
        int opnd;
        int len;
        int slen;

        opnd = OPERAND(scan);
        len = str_int_len(preg->program + opnd);

        slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
        if (slen < 0) {
          return(0);
        }
        preg->reginput += slen;
      }
      break;
    case ANYOF:
      if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
        return(0);
      }
      preg->reginput += n;
      break;
    case ANYBUT:
      if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
        return(0);
      }
      preg->reginput += n;
      break;
    case NOTHING:
      break;
    case BACK:
      break;
    case BRANCH:
      if (OP(preg, next) != BRANCH)   /* No choice. */
        next = OPERAND(scan); /* Avoid recursion. */
      else {
        do {
          save = preg->reginput;
          if (regmatch(preg, OPERAND(scan))) {
            return(1);
          }
          preg->reginput = save;
          scan = regnext(preg, scan);
        } while (scan != 0 && OP(preg, scan) == BRANCH);
        return(0);
        /* NOTREACHED */
      }
      break;
    case REP:
    case REPMIN:
      return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);

    case REPX:
    case REPXMIN:
      return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);

    case END:
      return 1; /* Success! */

    case OPENNC:
    case CLOSENC:
      return regmatch(preg, next);

    default:
      if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
        save = preg->reginput;
        if (regmatch(preg, next)) {
          if (OP(preg, scan) < CLOSE) {
            int no = OP(preg, scan) - OPEN;
            if (no < preg->nmatch && preg->pmatch && preg->pmatch[no].rm_so == -1) {
              preg->pmatch[no].rm_so = save - preg->start;
            }
          }
          else {
            int no = OP(preg, scan) - CLOSE;
            if (no < preg->nmatch && preg->pmatch && preg->pmatch[no].rm_eo == -1) {
              preg->pmatch[no].rm_eo = save - preg->start;
            }
          }
          return(1);
        }
        /* Restore input position after failure */
        preg->reginput = save;
        return(0);
      }
      return REG_ERR_INTERNAL;
    }

    scan = next;
  }

  /*
   * We get here only if there's trouble -- normally "case END" is
   * the terminating point.
   */
  return REG_ERR_INTERNAL;
}

/*
 - regrepeat - repeatedly match something simple, report how many
 */
static int regrepeat(regex_t *preg, int p, int max)
{
  int count = 0;
  const char *scan;
  int opnd;
  int ch;
  int n;

  scan = preg->reginput;
  opnd = OPERAND(p);
  switch (OP(preg, p)) {
  case ANY:
    while (!reg_iseol(preg, *scan) && count < max) {
      count++;
      scan += utf8_charlen(*scan);
    }
    break;
  case EXACTLY:
    while (count < max) {
      n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
      if (preg->program[opnd] != ch) {
        break;
      }
      count++;
      scan += n;
    }
    break;
  case ANYOF:
    while (count < max) {
      n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
      if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
        break;
      }
      count++;
      scan += n;
    }
    break;
  case ANYBUT:
    while (count < max) {
      n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
      if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
        break;
      }
      count++;
      scan += n;
    }
    break;
  default:    /* Oh dear.  Called inappropriately. */
    preg->err = REG_ERR_INTERNAL;
    count = 0;  /* Best compromise. */
    break;
  }
  preg->reginput = scan;

  return(count);
}

/*
 - regnext - dig the "next" pointer out of a node
 */
static int regnext(regex_t *preg, int p )
{
  int offset;

  offset = NEXT(preg, p);

  if (offset == 0)
    return 0;

  if (OP(preg, p) == BACK)
    return(p-offset);
  else
    return(p+offset);
}

/*
 - regopsize - returns the size of opcode + operands at 'p' in words
 */
static int regopsize(regex_t *preg, int p )
{
  /* Almost all opcodes are 2 words, but some are more */
  switch (OP(preg, p)) {
    case REP:
    case REPMIN:
    case REPX:
    case REPXMIN:
      return 5;

    case ANYOF:
    case ANYBUT:
    case EXACTLY: {
      int s = p + 2;
      while (preg->program[s++]) {
      }
      return s - p;
    }
  }
  return 2;
}

#if defined(DEBUG) && !defined(JIM_BOOTSTRAP)

/*
 - regdump - dump a regexp onto stdout in vaguely comprehensible form
 */
static void regdump(regex_t *preg)
{
  int s;
  int op = EXACTLY; /* Arbitrary non-END op. */
  int next;
  char buf[MAX_UTF8_LEN + 1];

  int i;
  for (i = 1; i < preg->p; i++) {
    printf("%02x ", (unsigned char)preg->program[i]);
    if (i % 16 == 0) {
      printf("\n");
    }
  }
  printf("\n");

  s = 1;
  while (op != END && s < preg->p) {  /* While that wasn't END last time... */
    op = OP(preg, s);
    printf("%3d: %s", s, regprop(op));  /* Where, what. */
    next = regnext(preg, s);
    if (next == 0)    /* Next ptr. */
      printf("(0)");
    else
      printf("(%d)", next);
    s += 2;
    if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
      int max = preg->program[s];
      int min = preg->program[s + 1];
      if (max == 65535) {
        printf("{%d,*}", min);
      }
      else {
        printf("{%d,%d}", min, max);
      }
      printf(" %d", preg->program[s + 2]);
      s += 3;
    }
    else if (op == ANYOF || op == ANYBUT) {
      /* set of ranges */

      while (preg->program[s]) {
        int len = preg->program[s++];
        int first = preg->program[s++];
        buf[utf8_getchars(buf, first)] = 0;
        printf("%s", buf);
        if (len > 1) {
          buf[utf8_getchars(buf, first + len - 1)] = 0;
          printf("-%s", buf);
        }
      }
      s++;
    }
    else if (op == EXACTLY) {
      /* Literal string, where present. */

      while (preg->program[s]) {
        buf[utf8_getchars(buf, preg->program[s])] = 0;
        printf("%s", buf);
        s++;
      }
      s++;
    }
    putchar('\n');
  }

  if (op == END) {
    /* Header fields of interest. */
    if (preg->regstart) {
      buf[utf8_getchars(buf, preg->regstart)] = 0;
      printf("start '%s' ", buf);
    }
    if (preg->reganch)
      printf("anchored ");
    if (preg->regmust != 0) {
      int i;
      printf("must have:");
      for (i = 0; i < preg->regmlen; i++) {
        putchar(preg->program[preg->regmust + i]);
      }
      putchar('\n');
    }
  }
  printf("\n");
}

/*
 - regprop - printable representation of opcode
 */
static const char *regprop( int op )
{
  static char buf[50];

  switch (op) {
  case BOL:
    return "BOL";
  case EOL:
    return "EOL";
  case BOLX:
    return "BOLX";
  case EOLX:
    return "EOLX";
  case ANY:
    return "ANY";
  case ANYOF:
    return "ANYOF";
  case ANYBUT:
    return "ANYBUT";
  case BRANCH:
    return "BRANCH";
  case EXACTLY:
    return "EXACTLY";
  case NOTHING:
    return "NOTHING";
  case BACK:
    return "BACK";
  case END:
    return "END";
  case REP:
    return "REP";
  case REPMIN:
    return "REPMIN";
  case REPX:
    return "REPX";
  case REPXMIN:
    return "REPXMIN";
  case WORDA:
    return "WORDA";
  case WORDZ:
    return "WORDZ";
  case OPENNC:
    return "OPEN";
  case CLOSENC:
    return "CLOSE";
  default:
    if (op >= OPEN && op < CLOSE) {
      snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
    }
    else if (op >= CLOSE && op < CLOSE_END) {
      snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
    }
    else {
      snprintf(buf, sizeof(buf), "?%d?\n", op);
    }
    return(buf);
  }
}
#endif /* JIM_BOOTSTRAP */

size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
{
  static const char *error_strings[] = {
    "success",
    "no match",
    "bad pattern",
    "null argument",
    "unknown error",
    "too big",
    "out of memory",
    "too many ()",
    "parentheses () not balanced",
    "braces {} not balanced",
    "invalid repetition count(s)",
    "extra characters",
    "*+ of empty atom",
    "nested count",
    "internal error",
    "count follows nothing",
    "invalid escape \\ sequence",
    "corrupted program",
    "contains null char",
    "brackets [] not balanced",
  };
  const char *err;

        (void)preg;

  if (errcode < 0 || errcode >= REG_ERR_NUM) {
    err = "Bad error code";
  }
  else {
    err = error_strings[errcode];
  }

  return snprintf(errbuf, errbuf_size, "%s", err);
}

void regfree(regex_t *preg)
{
  free(preg->program);
}

#endif

Coverage Report

Created: 2026-01-09 06:48