RegExp.java
/*
* Copyright (C) 1998-2018 Gerwin Klein <lsf@jflex.de>
* SPDX-License-Identifier: BSD-3-Clause
*/
package jflex.core;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import jflex.chars.Interval;
import jflex.core.unicode.CharClasses;
import jflex.core.unicode.IntCharSet;
import jflex.core.unicode.UnicodeProperties;
import jflex.exceptions.CharClassException;
import jflex.exceptions.GeneratorException;
import jflex.l10n.ErrorMessages;
import jflex.logging.Out;
import jflex.option.Options;
/**
* Stores a regular expression of rules section in a JFlex-specification.
*
* <p>This base class has no content other than its type.
*
* @author Gerwin Klein
* @version JFlex 1.10.0-SNAPSHOT
*/
public class RegExp {
/**
* The type of the regular expression. This field will be filled with values from class sym.java
* (generated by cup)
*/
int type;
/**
* Create a new regular expression of the specified type.
*
* @param type a value from the cup generated class sym.
*/
public RegExp(int type) {
this.type = type;
}
/**
* Returns a String-representation of this regular expression with the specified indentation.
*
* @param tab a String that should contain only space characters and that is inserted in front of
* standard String-representation pf this object.
* @return a {@link java.lang.String} object.
*/
public String print(String tab) {
return tab + toString();
}
@Override
public String toString() {
return "type = " + typeName();
}
/** String representation of the type of this regular expression. */
public String typeName() {
return sym.terminalNames[type];
}
/**
* Find out if this regexp is a char class or equivalent to one.
*
* @return true if the regexp is equivalent to a char class.
*/
public boolean isCharClass() {
switch (type) {
case sym.CHAR:
case sym.CHAR_I:
case sym.PRIMCLASS:
return true;
case sym.BAR:
RegExp2 binary = (RegExp2) this;
return binary.r1.isCharClass() && binary.r2.isCharClass();
default:
return false;
}
}
/**
* The approximate number of NFA states this expression will need (only works correctly after
* macro expansion and without negation)
*
* @param macros macro table for expansion
* @return a int.
*/
public int size(Macros macros) {
RegExp1 unary;
RegExp2 binary;
RegExp content;
switch (type) {
case sym.BAR:
binary = (RegExp2) this;
return binary.r1.size(macros) + binary.r2.size(macros) + 2;
case sym.CONCAT:
binary = (RegExp2) this;
return binary.r1.size(macros) + binary.r2.size(macros);
case sym.STAR:
case sym.PLUS:
unary = (RegExp1) this;
content = (RegExp) unary.content;
return content.size(macros) + 2;
case sym.QUESTION:
unary = (RegExp1) this;
content = (RegExp) unary.content;
return content.size(macros);
case sym.BANG:
unary = (RegExp1) this;
content = (RegExp) unary.content;
return content.size(macros) * content.size(macros);
// this is only a very rough estimate (worst case 2^n)
// exact size too complicated (propably requires construction)
case sym.TILDE:
unary = (RegExp1) this;
content = (RegExp) unary.content;
return content.size(macros) * content.size(macros) * 3;
// see sym.BANG
case sym.STRING:
case sym.STRING_I:
unary = (RegExp1) this;
return ((String) unary.content).length() + 1;
case sym.CHAR:
case sym.CHAR_I:
return 2;
case sym.CCLASS:
case sym.CCLASSNOT:
case sym.CCLASSOP:
case sym.PRIMCLASS:
return 2;
case sym.MACROUSE:
unary = (RegExp1) this;
return macros.getDefinition((String) unary.content).size(macros);
default:
throw new RegExpException(this);
}
}
/** Reverses a string. */
static String revString(String s) {
return new StringBuilder(s).reverse().toString();
}
/**
* Recursively convert tilde (upto) expressions into negation and star.
*
* @return new RegExp equivalent to the current one, but without upto expressions.
*/
public final RegExp resolveTilde() {
RegExp1 unary;
RegExp2 binary;
RegExp content;
switch (type) {
case sym.BAR:
binary = (RegExp2) this;
return new RegExp2(sym.BAR, binary.r1.resolveTilde(), binary.r2.resolveTilde());
case sym.CONCAT:
binary = (RegExp2) this;
return new RegExp2(sym.CONCAT, binary.r1.resolveTilde(), binary.r2.resolveTilde());
case sym.STAR:
unary = (RegExp1) this;
content = (RegExp) unary.content;
return new RegExp1(sym.STAR, content.resolveTilde());
case sym.PLUS:
unary = (RegExp1) this;
content = (RegExp) unary.content;
return new RegExp1(sym.PLUS, content.resolveTilde());
case sym.QUESTION:
unary = (RegExp1) this;
content = (RegExp) unary.content;
return new RegExp1(sym.QUESTION, content.resolveTilde());
case sym.BANG:
unary = (RegExp1) this;
content = (RegExp) unary.content;
return new RegExp1(sym.BANG, content.resolveTilde());
case sym.TILDE:
// ~a = !([^]* a [^]*) a
// uses subexpression sharing
unary = (RegExp1) this;
content = ((RegExp) unary.content).resolveTilde();
RegExp any_star = new RegExp1(sym.STAR, anyChar());
RegExp neg =
new RegExp1(
sym.BANG,
new RegExp2(sym.CONCAT, any_star, new RegExp2(sym.CONCAT, content, any_star)));
return new RegExp2(sym.CONCAT, neg, content);
case sym.STRING:
case sym.STRING_I:
case sym.CHAR:
case sym.CHAR_I:
case sym.PRIMCLASS:
unary = (RegExp1) this;
return new RegExp1(unary.type, unary.content);
default:
throw new RegExpException(this);
}
}
/**
* Returns a regexp that matches any character: {@code [^]}
*
* @return the regexp for {@code [^]}
*/
public static RegExp anyChar() {
return new RegExp1(sym.PRIMCLASS, IntCharSet.allChars());
}
/**
* Confirms that the parameter is a RegExp1 of type sym.PRIMCLASS.
*
* @param r the RegExp to check
* @throws CharClassException if r is not a RegExp1 or of type sym.PRIMCLASS.
* @return r cast to RegExp1
*/
public static RegExp1 checkPrimClass(RegExp r) {
if (!(r instanceof RegExp1 && r.type == sym.PRIMCLASS))
throw new CharClassException("Not normalised " + r);
return (RegExp1) r;
}
/**
* Performs the given set operation on the two {@link IntCharSet} parameters.
*
* @param op the operation to perform (as @{link sym} constant)
* @param l the left operator of the expression
* @param r the right operator of the expression
* @param ctxt the regular expression containing the provided operator
* @return a new {@link IntCharSet}
* @throws RegExpException for {@code ctxt} if the operator is not supported
*/
public static IntCharSet performClassOp(int op, IntCharSet l, IntCharSet r, RegExp ctxt) {
IntCharSet set;
IntCharSet intersection = l.and(r);
switch (op) {
case sym.INTERSECTION:
return intersection;
case sym.DIFFERENCE:
// IntCharSet.sub() assumes its argument is a subset, so subtract intersection
set = IntCharSet.copyOf(l);
set.sub(intersection);
return set;
case sym.SYMMETRICDIFFERENCE:
set = IntCharSet.copyOf(l);
set.add(r);
set.sub(intersection);
return set;
default:
throw new RegExpException(ctxt);
}
}
/**
* Normalise the regular expression to eliminate macro use (expand them).
*
* @return a regexp that contains no {@link sym#MACROUSE}.
*/
@SuppressWarnings("unchecked")
public final RegExp normaliseMacros(Macros m) {
RegExp1 unary;
RegExp2 binary;
RegExp content;
switch (type) {
case sym.BAR:
case sym.CONCAT:
binary = (RegExp2) this;
return new RegExp2(type, binary.r1.normaliseMacros(m), binary.r2.normaliseMacros(m));
case sym.STAR:
case sym.PLUS:
case sym.QUESTION:
case sym.BANG:
case sym.TILDE:
unary = (RegExp1) this;
content = (RegExp) unary.content;
return new RegExp1(type, content.normaliseMacros(m));
case sym.CCLASS:
case sym.CCLASSNOT:
{
unary = (RegExp1) this;
List<RegExp> contents = (List<RegExp>) unary.content;
List<RegExp> newContents = new ArrayList<RegExp>(contents.size());
for (RegExp r : contents) {
RegExp n = r.normaliseMacros(m);
newContents.add(n);
}
return new RegExp1(type, newContents);
}
case sym.CCLASSOP:
unary = (RegExp1) this;
binary = (RegExp2) unary.content;
RegExp l = binary.r1.normaliseMacros(m);
RegExp r = binary.r2.normaliseMacros(m);
return new RegExp1(type, new RegExp2(binary.type, l, r));
case sym.STRING:
case sym.STRING_I:
case sym.CHAR:
case sym.CHAR_I:
case sym.PRIMCLASS:
case sym.PRECLASS:
case sym.UNIPROPCCLASS:
unary = (RegExp1) this;
return new RegExp1(type, unary.content);
case sym.MACROUSE:
unary = (RegExp1) this;
return m.getDefinition((String) unary.content).normaliseMacros(m);
default:
throw new RegExpException(this);
}
}
/**
* Normalise the regular expression to eliminate compound character class expression (compute
* their content).
*
* @param f the spec file containing the regular expression (for error reporting)
* @param line the line number of the regular expression (for error reporting)
* @return a regexp where all char classes are primitive {@link IntCharSet} classes.
*/
@SuppressWarnings("unchecked")
public final RegExp normaliseCCLs(File f, int line) {
RegExp1 unary;
RegExp2 binary;
RegExp content;
try {
switch (type) {
case sym.BAR:
case sym.CONCAT:
binary = (RegExp2) this;
return new RegExp2(
type, binary.r1.normaliseCCLs(f, line), binary.r2.normaliseCCLs(f, line));
case sym.STAR:
case sym.PLUS:
case sym.QUESTION:
case sym.BANG:
case sym.TILDE:
unary = (RegExp1) this;
content = (RegExp) unary.content;
return new RegExp1(type, content.normaliseCCLs(f, line));
case sym.STRING:
case sym.STRING_I:
case sym.CHAR:
case sym.CHAR_I:
case sym.PRIMCLASS:
unary = (RegExp1) this;
return new RegExp1(type, unary.content);
case sym.CCLASS:
case sym.CCLASSNOT:
{
unary = (RegExp1) this;
List<RegExp> contents = (List<RegExp>) unary.content;
IntCharSet set = new IntCharSet();
for (RegExp r : contents) {
RegExp1 n = checkPrimClass(r.normaliseCCLs(f, line));
set.add((IntCharSet) n.content);
}
return new RegExp1(
sym.PRIMCLASS, type == sym.CCLASS ? set : IntCharSet.complementOf(set));
}
case sym.CCLASSOP:
unary = (RegExp1) this;
binary = (RegExp2) unary.content;
RegExp1 l = checkPrimClass(binary.r1.normaliseCCLs(f, line));
IntCharSet setl = (IntCharSet) l.content;
RegExp1 r = checkPrimClass(binary.r2.normaliseCCLs(f, line));
IntCharSet setr = (IntCharSet) r.content;
IntCharSet set = performClassOp(binary.type, setl, setr, this);
return new RegExp1(sym.PRIMCLASS, set);
default:
throw new RegExpException(this);
}
} catch (CharClassException e) {
Out.error(f, ErrorMessages.NOT_CHARCLASS, line, -1);
throw new GeneratorException(e);
}
}
/**
* Expand pre-defined character classes into primitive IntCharSet classes.
*
* @param cache memoized pre-defined character class expansions
* @param cl character class partitions
* @return the expanded regular expression
*/
@SuppressWarnings("unchecked")
public RegExp expandPreClasses(Map<Integer, IntCharSet> cache, CharClasses cl, boolean caseless) {
RegExp1 unary;
RegExp2 binary;
RegExp content;
switch (type) {
case sym.BAR:
case sym.CONCAT:
binary = (RegExp2) this;
return new RegExp2(
type,
binary.r1.expandPreClasses(cache, cl, caseless),
binary.r2.expandPreClasses(cache, cl, caseless));
case sym.STAR:
case sym.PLUS:
case sym.QUESTION:
case sym.BANG:
case sym.TILDE:
unary = (RegExp1) this;
content = (RegExp) unary.content;
return new RegExp1(type, content.expandPreClasses(cache, cl, caseless));
case sym.CCLASS:
case sym.CCLASSNOT:
{
unary = (RegExp1) this;
List<RegExp> contents = (List<RegExp>) unary.content;
List<RegExp> newContents = new ArrayList<RegExp>(contents.size());
for (RegExp r : contents) {
RegExp n = r.expandPreClasses(cache, cl, caseless);
newContents.add(n);
}
return new RegExp1(type, newContents);
}
case sym.CCLASSOP:
unary = (RegExp1) this;
binary = (RegExp2) unary.content;
RegExp l = binary.r1.expandPreClasses(cache, cl, caseless);
RegExp r = binary.r2.expandPreClasses(cache, cl, caseless);
return new RegExp1(type, new RegExp2(binary.type, l, r));
case sym.PRECLASS:
{
unary = (RegExp1) this;
IntCharSet set = getPreClass(cache, cl, (Integer) unary.content);
return new RegExp1(sym.PRIMCLASS, set);
}
case sym.UNIPROPCCLASS:
{
unary = (RegExp1) this;
IntCharSet set = cl.getUnicodeProperties().getIntCharSet((String) unary.content);
if (caseless) {
set = set.getCaseless(cl.getUnicodeProperties());
}
return new RegExp1(sym.PRIMCLASS, set);
}
case sym.STRING:
case sym.STRING_I:
case sym.CHAR:
case sym.CHAR_I:
case sym.PRIMCLASS:
unary = (RegExp1) this;
return new RegExp1(type, unary.content);
default:
throw new RegExpException(this);
}
}
/**
* Check whether a character is a member of the give char class type.
*
* @param type the type of the character class ({@link sym#JLETTERCLASS} or {@link
* sym#JLETTERDIGITCLASS})
* @param c the character to check
* @return true if the character is a member of the class
*/
private static boolean checkJPartStart(int type, int c) {
switch (type) {
case sym.JLETTERCLASS:
return Character.isJavaIdentifierStart(c);
case sym.JLETTERDIGITCLASS:
return Character.isJavaIdentifierPart(c);
default:
return false;
}
}
/**
* Compute and memoize a pre-defined character class.
*
* @param preclassCache memoized pre-defined character class expansions
* @param charClasses character class partitions
* @param type the type of the predefined character class
* @return the expanded IntCharSet for the class
*/
private static IntCharSet getPreClass(
Map<Integer, IntCharSet> preclassCache, CharClasses charClasses, int type) {
IntCharSet result = preclassCache.get(type);
if (null == result) {
UnicodeProperties unicodeProperties = charClasses.getUnicodeProperties();
switch (type) {
case sym.LETTERCLASS:
result = unicodeProperties.getIntCharSet("L");
break;
case sym.DIGITCLASS:
result = unicodeProperties.getIntCharSet("Nd");
break;
case sym.DIGITCLASSNOT:
IntCharSet digits = unicodeProperties.getIntCharSet("Nd");
result = IntCharSet.ofCharacterRange(0, unicodeProperties.getMaximumCodePoint());
result.sub(digits);
break;
case sym.UPPERCLASS:
// "Uppercase" is more than Uppercase_Letter, but older Unicode
// versions don't have this definition - check for "Uppercase",
// then fall back to Uppercase_Letter (Lu) if it does not exist.
result = unicodeProperties.getIntCharSet("Uppercase");
if (null == result) {
result = unicodeProperties.getIntCharSet("Lu");
}
break;
case sym.LOWERCLASS:
// "Lowercase" is more than Lowercase_Letter, but older Unicode
// versions don't have this definition - check for "Lowercase",
// then fall back to Lowercase_Letter (Ll) if it does not exist.
result = unicodeProperties.getIntCharSet("Lowercase");
if (null == result) {
result = unicodeProperties.getIntCharSet("Ll");
}
break;
case sym.WHITESPACECLASS:
// Although later versions do, Unicode 1.1 does not have the
// "Whitespace" definition - check for "Whitespace", then fall back
// to "Space_separator" (Zs) if it does not exist.
result = unicodeProperties.getIntCharSet("Whitespace");
if (null == result) {
result = unicodeProperties.getIntCharSet("Zs");
}
break;
case sym.WHITESPACECLASSNOT:
// Although later versions do, Unicode 1.1 does not have the
// "Whitespace" definition - check for "Whitespace", then fall back
// to "Space_separator" (Zs) if it does not exist.
IntCharSet whitespaceClass = unicodeProperties.getIntCharSet("Whitespace");
if (null == whitespaceClass) {
whitespaceClass = unicodeProperties.getIntCharSet("Zs");
}
result = IntCharSet.ofCharacterRange(0, unicodeProperties.getMaximumCodePoint());
result.sub(whitespaceClass);
break;
case sym.WORDCLASS:
{
// UTR#18: \w = [\p{alpha}\p{gc=Mark}\p{digit}\p{gc=Connector_Punctuation}]
IntCharSet alphaClass = unicodeProperties.getIntCharSet("Alphabetic");
if (null == alphaClass) {
// For Unicode 1.1, substitute "Letter" (L) for "Alphabetic".
alphaClass = unicodeProperties.getIntCharSet("L");
}
IntCharSet markClass = unicodeProperties.getIntCharSet("M");
IntCharSet digitClass = unicodeProperties.getIntCharSet("Nd");
IntCharSet connectorPunctClass = unicodeProperties.getIntCharSet("Pc");
if (null == connectorPunctClass) {
// For Unicode 1.1, substitute "_" for "Connector_Punctuation".
connectorPunctClass = IntCharSet.ofCharacter('_');
}
result = IntCharSet.copyOf(alphaClass);
result.add(markClass);
result.add(digitClass);
result.add(connectorPunctClass);
break;
}
case sym.WORDCLASSNOT:
{
// UTR#18: \W = [^\p{alpha}\p{gc=Mark}\p{digit}\p{gc=Connector_Punctuation}]
IntCharSet alphaClass = unicodeProperties.getIntCharSet("Alphabetic");
if (null == alphaClass) {
// For Unicode 1.1, substitute "Letter" (L) for "Alphabetic".
alphaClass = unicodeProperties.getIntCharSet("L");
}
IntCharSet markClass = unicodeProperties.getIntCharSet("M");
IntCharSet digitClass = unicodeProperties.getIntCharSet("Nd");
IntCharSet connectorPunctClass = unicodeProperties.getIntCharSet("Pc");
if (null == connectorPunctClass) {
// For Unicode 1.1, substitute "_" for "Connector_Punctuation".
connectorPunctClass = IntCharSet.ofCharacter('_');
}
IntCharSet wordClass = IntCharSet.copyOf(alphaClass);
wordClass.add(markClass);
wordClass.add(digitClass);
wordClass.add(connectorPunctClass);
result = IntCharSet.ofCharacterRange(0, unicodeProperties.getMaximumCodePoint());
result.sub(wordClass);
break;
}
case sym.JLETTERCLASS:
case sym.JLETTERDIGITCLASS:
result = new IntCharSet();
int c = 0;
int start = 0;
int last = charClasses.getMaxCharCode();
boolean prev, current;
prev = checkJPartStart(type, 0);
for (c = 1; c < last; c++) {
current = checkJPartStart(type, c);
if (!prev && current) start = c;
if (prev && !current) {
result.add(new Interval(start, c - 1));
}
prev = current;
}
// the last iteration is moved out of the loop to
// avoid an endless loop if last == maxCharCode and
// last+1 == 0
current = checkJPartStart(type, c);
if (!prev && current) result.add(new Interval(c, c));
if (prev && current) result.add(new Interval(start, c));
if (prev && !current) result.add(new Interval(start, c - 1));
break;
default:
throw new CharClassException("Unknown predefined char class type: " + type);
}
preclassCache.put(type, result);
}
return result;
}
/**
* Make character class partitions based on the classes mentioned in this regexp.
*
* <p>Assumption: regexp is normalised.
*/
public final void makeCCLs(CharClasses c, boolean caseless) {
RegExp1 unary;
RegExp2 binary;
RegExp content;
switch (type) {
case sym.BAR:
case sym.CONCAT:
binary = (RegExp2) this;
binary.r1.makeCCLs(c, caseless);
binary.r2.makeCCLs(c, caseless);
return;
case sym.STAR:
case sym.PLUS:
case sym.QUESTION:
case sym.BANG:
case sym.TILDE:
unary = (RegExp1) this;
content = (RegExp) unary.content;
content.makeCCLs(c, caseless);
return;
case sym.CHAR:
case sym.CHAR_I:
Integer ch = (Integer) ((RegExp1) this).content;
c.makeClass(ch, caseless);
return;
case sym.STRING:
case sym.STRING_I:
String str = (String) ((RegExp1) this).content;
c.makeClass(str, caseless);
return;
case sym.PRIMCLASS:
unary = (RegExp1) this;
IntCharSet set = (IntCharSet) unary.content;
c.makeClass(set, Options.jlex && caseless);
return;
default:
throw new CharClassException("makeCCLs: unexpected regexp " + this);
}
}
/**
* Creates a new regexp that matches the reverse text of this one.
*
* @return the reverse regexp
*/
public final RegExp rev() {
RegExp1 unary;
RegExp2 binary;
RegExp content;
switch (type) {
case sym.BAR:
binary = (RegExp2) this;
return new RegExp2(sym.BAR, binary.r1.rev(), binary.r2.rev());
case sym.CONCAT:
binary = (RegExp2) this;
return new RegExp2(sym.CONCAT, binary.r2.rev(), binary.r1.rev());
case sym.STAR:
unary = (RegExp1) this;
content = (RegExp) unary.content;
return new RegExp1(sym.STAR, content.rev());
case sym.PLUS:
unary = (RegExp1) this;
content = (RegExp) unary.content;
return new RegExp1(sym.PLUS, content.rev());
case sym.QUESTION:
unary = (RegExp1) this;
content = (RegExp) unary.content;
return new RegExp1(sym.QUESTION, content.rev());
case sym.BANG:
unary = (RegExp1) this;
content = (RegExp) unary.content;
return new RegExp1(sym.BANG, content.rev());
case sym.TILDE:
content = resolveTilde();
return content.rev();
case sym.STRING:
case sym.STRING_I:
unary = (RegExp1) this;
return new RegExp1(unary.type, revString((String) unary.content));
case sym.CHAR:
case sym.CHAR_I:
case sym.PRIMCLASS:
unary = (RegExp1) this;
return new RegExp1(unary.type, unary.content);
default:
throw new RegExpException(this);
}
}
}