NFA.java
/*
* Copyright (C) 1998-2018 Gerwin Klein <lsf@jflex.de>
* SPDX-License-Identifier: BSD-3-Clause
*/
package jflex.core;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import jflex.base.Build;
import jflex.base.IntPair;
import jflex.chars.Interval;
import jflex.core.unicode.CharClasses;
import jflex.core.unicode.IntCharSet;
import jflex.exceptions.GeneratorException;
import jflex.l10n.ErrorMessages;
import jflex.logging.Out;
import jflex.option.Options;
import jflex.state.StateSet;
import jflex.state.StateSetEnumerator;
/**
* Non-deterministic finite automata representation in JFlex.
*
* <p>Contains algorithms RegExp ��� NFA.
*
* @author Gerwin Klein
* @version JFlex 1.10.0-SNAPSHOT
*/
public final class NFA {
/**
* table[current_state][next_char] is the set of states that can be reached from current_state
* with an input next_char
*/
private StateSet[][] table;
/**
* epsilon[current_state] is the set of states that can be reached from current_state via epsilon
* edges
*/
private StateSet[] epsilon;
/** isFinal[state] == true <=> state is a final state of the NFA */
private boolean[] isFinal;
/**
* action[current_state]: the action associated with the state current_state (null, if there is no
* action for the state)
*/
private Action[] action;
/** the number of states in this NFA */
private int numStates;
/** the current maximum number of input characters */
private final int numInput;
/**
* the number of lexical States. Lexical states have the indices 0..numLexStates-1 in the
* transition table
*/
private int numLexStates;
/** estimated size of the NFA (before actual construction) */
private final int estSize;
private CharClasses classes;
private LexScan scanner;
private RegExps regExps;
// will be reused by several methods (avoids excessive object creation)
private final StateSetEnumerator states = new StateSetEnumerator();
private final StateSet tempStateSet = new StateSet();
/** Constructor for NFA. */
public NFA(int numInput, int estSize) {
this.numInput = numInput;
this.estSize = estSize;
numStates = 0;
epsilon = new StateSet[estSize];
action = new Action[estSize];
isFinal = new boolean[estSize];
table = new StateSet[estSize][numInput];
}
/**
* Construct new NFA.
*
* <p>Assumes that lookahead cases and numbers are already resolved in RegExps.
*
* @see RegExps#checkLookAheads()
* @param numInput a int.
* @param scanner a {@link LexScan} object.
* @param regExps a {@link RegExps} object.
* @param macros a {@link Macros} object.
* @param classes a {@link CharClasses} object.
*/
public NFA(int numInput, LexScan scanner, RegExps regExps, Macros macros, CharClasses classes) {
this(numInput, regExps.NFASize(macros) + 2 * scanner.states.number());
this.scanner = scanner;
this.regExps = regExps;
this.classes = classes;
numLexStates = scanner.states.number();
// ensureCapacity assumes correctly set up numStates.
int new_num = numEntryStates();
ensureCapacity(new_num);
numStates = new_num;
}
public StateSet epsilon(int i) {
return epsilon[i];
}
public int numEntryStates() {
return 2 * (numLexStates + regExps.gen_look_count);
}
public int numInput() {
return numInput;
}
public int numLexStates() {
return numLexStates;
}
public int numStates() {
return numStates;
}
/** Returns the set of states that can be reached from currentState with an input nextChar. */
public StateSet reachableStates(int currentState, int nextChar) {
return table[currentState][nextChar];
}
public StateSetEnumerator states() {
return states;
}
public StateSet tempStateSet() {
return tempStateSet;
}
/**
* Add a standalone rule that has minimum priority, fires a transition on all single input
* characters and has a "print yytext" action.
*/
public void addStandaloneRule() {
int start = numStates;
int end = numStates + 1;
for (int c = 0; c < classes.getNumClasses(); c++) addTransition(start, c, end);
for (int i = 0; i < numLexStates * 2; i++) addEpsilonTransition(i, start);
action[end] = new Action("System.out.print(yytext());", Integer.MAX_VALUE);
isFinal[end] = true;
}
/**
* Add a regexp to this NFA.
*
* @param regExpNum the number of the regexp to add.
*/
public void addRegExp(int regExpNum) {
if (Build.DEBUG)
Out.debug(
"Adding nfa for regexp " + regExpNum + " :" + Out.NL + regExps.getRegExp(regExpNum));
IntPair nfa = insertNFA(regExps.getRegExp(regExpNum));
List<Integer> lexStates = regExps.getStates(regExpNum);
if (lexStates.isEmpty()) lexStates = scanner.states.getInclusiveStates();
for (Integer stateNum : lexStates) {
if (!regExps.isBOL(regExpNum)) addEpsilonTransition(2 * stateNum, nfa.start());
addEpsilonTransition(2 * stateNum + 1, nfa.start());
}
if (regExps.getLookAhead(regExpNum) != null) {
Action a = regExps.getAction(regExpNum);
if (a != null && a.lookAhead() == Action.Kind.FINITE_CHOICE) {
insertLookAheadChoices(nfa.end(), a, regExps.getLookAhead(regExpNum));
// remove the original action from the collection: it will never
// be matched directly, only its copies will.
scanner.actions.remove(a);
} else {
RegExp r1 = regExps.getRegExp(regExpNum);
RegExp r2 = regExps.getLookAhead(regExpNum);
IntPair look = insertNFA(r2);
addEpsilonTransition(nfa.end(), look.start());
action[look.end()] = a;
isFinal[look.end()] = true;
if (a != null && a.lookAhead() == Action.Kind.GENERAL_LOOK) {
// base forward pass
IntPair forward = insertNFA(r1);
// lookahead backward pass
IntPair backward = insertNFA(r2.rev());
isFinal[forward.end()] = true;
action[forward.end()] = new Action(Action.Kind.FORWARD_ACTION);
isFinal[backward.end()] = true;
action[backward.end()] = new Action(Action.Kind.BACKWARD_ACTION);
int entry = 2 * (regExps.getLookEntry(regExpNum) + numLexStates);
addEpsilonTransition(entry, forward.start());
addEpsilonTransition(entry + 1, backward.start());
a.setEntryState(entry);
}
}
} else {
action[nfa.end()] = regExps.getAction(regExpNum);
isFinal[nfa.end()] = true;
}
}
/**
* Insert NFAs for the (finitely many) fixed length lookahead choices.
*
* @param lookAhead a lookahead of which isFiniteChoice is true
* @param baseEnd the end state of the base expression NFA
* @param a the action of the expression
* @see SemCheck#isFiniteChoice(RegExp)
*/
private void insertLookAheadChoices(int baseEnd, Action a, RegExp lookAhead) {
if (lookAhead.type == sym.BAR) {
RegExp2 r = (RegExp2) lookAhead;
insertLookAheadChoices(baseEnd, a, r.r1);
insertLookAheadChoices(baseEnd, a, r.r2);
} else {
int len = SemCheck.length(lookAhead);
if (len >= 0) {
// termination case
IntPair look = insertNFA(lookAhead);
addEpsilonTransition(baseEnd, look.start());
Action x = a.copyChoice(len);
action[look.end()] = x;
isFinal[look.end()] = true;
// add new copy to the collection of known actions such that
// it can be checked for the NEVER_MATCH warning.
scanner.actions.add(x);
} else {
// should never happen
throw new RegExpException(lookAhead);
}
}
}
/**
* Make sure the NFA can contain at least newNumStates states.
*
* @param newNumStates the minimum number of states.
*/
private void ensureCapacity(int newNumStates) {
int oldLength = epsilon.length;
if (newNumStates < oldLength) return;
int newStatesLength = Math.max(oldLength * 2, newNumStates);
boolean[] newFinal = new boolean[newStatesLength];
Action[] newAction = new Action[newStatesLength];
StateSet[][] newTable = new StateSet[newStatesLength][numInput];
StateSet[] newEpsilon = new StateSet[newStatesLength];
System.arraycopy(isFinal, 0, newFinal, 0, numStates);
System.arraycopy(action, 0, newAction, 0, numStates);
System.arraycopy(epsilon, 0, newEpsilon, 0, numStates);
System.arraycopy(table, 0, newTable, 0, numStates);
isFinal = newFinal;
action = newAction;
epsilon = newEpsilon;
table = newTable;
}
public void addTransition(int start, int input, int dest) {
Out.debug("Adding transition (" + start + ", " + input + ", " + dest + ")");
// Trying to insert a transition for a character that is not in the input
// char set. Ignore it. This can happen for case insensitive matching.
if (input == -1) return;
int maxS = Math.max(start, dest) + 1;
ensureCapacity(maxS);
if (maxS > numStates) numStates = maxS;
if (table[start][input] != null) table[start][input].addState(dest);
else table[start][input] = new StateSet(estSize, dest);
}
public void addEpsilonTransition(int start, int dest) {
int max = Math.max(start, dest) + 1;
ensureCapacity(max);
if (max > numStates) numStates = max;
if (epsilon[start] != null) epsilon[start].addState(dest);
else epsilon[start] = new StateSet(estSize, dest);
}
/**
* Returns {@code true}, iff the specified set of states contains a final state.
*
* @param set the set of states that is tested for final states.
*/
public boolean containsFinal(StateSet set) {
states.reset(set);
while (states.hasMoreElements()) if (isFinal[states.nextElement()]) return true;
return false;
}
/**
* Returns the action with highest priority in the specified set of states.
*
* @param set the set of states for which to determine the action
*/
public Action getAction(StateSet set) {
states.reset(set);
Action maxAction = null;
Out.debug("Determining action of : " + set);
while (states.hasMoreElements()) {
Action currentAction = action[states.nextElement()];
if (currentAction != null) {
if (maxAction == null) maxAction = currentAction;
else maxAction = maxAction.getHigherPriority(currentAction);
}
}
return maxAction;
}
/**
* Calculates the epsilon closure for a specified set of states.
*
* <p>The epsilon closure for set a is the set of states that can be reached by epsilon edges from
* a.
*
* @param startState the start state for the set of states to calculate the epsilon closure for
* @return the epsilon closure of the specified set of states in this NFA
*/
private StateSet closure(int startState) {
// Out.debug("Calculating closure of "+set);
StateSet notvisited = tempStateSet;
StateSet closure = new StateSet(numStates, startState);
notvisited.clear();
notvisited.addState(startState);
while (notvisited.containsElements()) {
// Out.debug("closure is now "+closure);
// Out.debug("notvisited is "+notvisited);
int state = notvisited.getAndRemoveElement();
// Out.debug("removed element "+state+" of "+notvisited);
// Out.debug("epsilon[states] = "+epsilon[state]);
notvisited.add(closure.complement(epsilon[state]));
closure.add(epsilon[state]);
}
// Out.debug("Closure is : "+closure);
return closure;
}
public void epsilonFill() {
for (int i = 0; i < numStates; i++) {
epsilon[i] = closure(i);
}
}
/**
* Calculates the set of states that can be reached from another set of states {@code start} with
* an specified input character {@code input}
*
* @param start the set of states to start from
* @param input the input character for which to search the next states
* @return the set of states that are reached from {@code start</code> via <code>input}
*/
private StateSet DFAEdge(StateSet start, int input) {
// Out.debug(String.format("Calculating DFAEdge for state set "+start+" and input U+04X"),
// input);
tempStateSet.clear();
states.reset(start);
while (states.hasMoreElements()) tempStateSet.add(table[states.nextElement()][input]);
StateSet result = new StateSet(tempStateSet);
states.reset(tempStateSet);
while (states.hasMoreElements()) result.add(epsilon[states.nextElement()]);
// Out.debug("DFAEdge is : "+result);
return result;
}
public void dumpTable() {
Out.dump(toString());
}
@Override
public String toString() {
StringBuilder result = new StringBuilder();
for (int i = 0; i < numStates; i++) {
result.append("State");
if (isFinal[i]) {
result.append("[FINAL");
String l = action[i].lookString();
if (!Objects.equals(l, "")) {
result.append(", ");
result.append(l);
}
result.append("]");
}
result.append(" ").append(i).append(Out.NL);
for (int input = 0; input < numInput; input++) {
if (table[i][input] != null && table[i][input].containsElements())
result
.append(" with ")
.append(input)
.append(" in ")
.append(table[i][input])
.append(Out.NL);
}
if (epsilon[i] != null && epsilon[i].containsElements())
result.append(" with epsilon in ").append(epsilon[i]).append(Out.NL);
}
return result.toString();
}
public void writeDot(File file) {
try {
PrintWriter writer =
new PrintWriter(
new OutputStreamWriter(new FileOutputStream(file), StandardCharsets.UTF_8));
writer.println(dotFormat());
writer.close();
} catch (IOException e) {
Out.error(ErrorMessages.FILE_WRITE, file);
throw new GeneratorException(e);
}
}
public String dotFormat() {
StringBuilder result = new StringBuilder();
result.append("digraph NFA {").append(Out.NL);
result.append("rankdir = LR").append(Out.NL);
for (int i = 0; i < numStates; i++) {
if (isFinal[i]) {
result.append(i);
result.append(" [shape = doublecircle]");
result.append(Out.NL);
}
}
for (int i = 0; i < numStates; i++) {
for (int input = 0; input < numInput; input++) {
if (table[i][input] != null) {
for (int s : table[i][input]) {
result.append(i).append(" -> ").append(s);
result
.append(" [label=\"")
.append(classes.toString(input))
.append("\"]")
.append(Out.NL);
}
}
}
if (epsilon[i] != null) {
for (int s : epsilon[i]) {
result.append(i).append(" -> ").append(s).append(" [style=dotted]").append(Out.NL);
}
}
}
result.append("}").append(Out.NL);
return result.toString();
}
// -----------------------------------------------------------------------
// Functions for constructing NFAs out of regular expressions.
private void insertLetterNFA(boolean caseless, int ch, int start, int end) {
if (caseless) {
IntCharSet set = IntCharSet.ofCharacter(ch);
IntCharSet caselessSet = set.getCaseless(scanner.getUnicodeProperties());
for (Interval interval : caselessSet.getIntervals()) {
for (int elem = interval.start; elem <= interval.end; ++elem) {
addTransition(start, classes.getClassCode(elem), end);
}
}
} else {
addTransition(start, classes.getClassCode(ch), end);
}
}
private IntPair insertStringNFA(boolean caseless, String str) {
int start = numStates;
int i = 0;
for (int pos = 0; pos < str.length(); ++i) {
int ch = str.codePointAt(pos);
if (caseless) {
IntCharSet set = IntCharSet.ofCharacter(ch);
IntCharSet caselessSet = set.getCaseless(scanner.getUnicodeProperties());
for (Interval interval : caselessSet.getIntervals()) {
for (int elem = interval.start; elem <= interval.end; ++elem) {
addTransition(i + start, classes.getClassCode(elem), i + start + 1);
}
}
} else {
addTransition(i + start, classes.getClassCode(ch), i + start + 1);
}
pos += Character.charCount(ch);
}
return IntPair.create(start, i + start);
}
private void insertClassNFA(IntCharSet set, int start, int end) {
for (int aCl : classes.getClassCodes(set, false)) {
addTransition(start, aCl, end);
}
}
/**
* Constructs an NFA accepting the complement of the language of a given NFA.
*
* <p>Converts the NFA into a DFA, then negates that DFA. Exponential state blowup possible and
* common.
*
* @param nfa the NFA to construct the complement for.
* @return a pair of integers denoting the index of start and end state of the complement NFA.
*/
private IntPair complement(IntPair nfa) {
if (Build.DEBUG) {
Out.debug("complement for " + nfa);
Out.debug("NFA is :" + Out.NL + this);
}
int dfaStart = nfa.end() + 1;
epsilonFill();
Map<StateSet, Integer> dfaStates = new HashMap<>(numStates);
List<StateSet> dfaList = new ArrayList<>(numStates);
int numDFAStates = 0;
int currentDFAState = 0;
StateSet currentState, newState;
newState = epsilon[nfa.start()];
dfaStates.put(newState, numDFAStates);
dfaList.add(newState);
if (Build.DEBUG) {
Out.debug(
"pos DFA start state is :"
+ Out.NL
+ dfaStates
+ Out.NL
+ Out.NL
+ "ordered :"
+ Out.NL
+ dfaList);
}
while (currentDFAState <= numDFAStates) {
currentState = dfaList.get(currentDFAState);
for (int input = 0; input < numInput; input++) {
newState = DFAEdge(currentState, input);
if (newState.containsElements()) {
// Out.debug("DFAEdge for input "+(int)input+" and state set "+currentState+" is
// "+newState);
// Out.debug("Looking for state set "+newState);
Integer nextDFAState = dfaStates.get(newState);
if (nextDFAState != null) {
// Out.debug("FOUND!");
addTransition(dfaStart + currentDFAState, input, dfaStart + nextDFAState);
} else {
if (Options.dump) {
Out.print("+");
// Out.debug("NOT FOUND!");
// Out.debug("Table was "+dfaStates);
}
numDFAStates++;
dfaStates.put(newState, numDFAStates);
dfaList.add(newState);
addTransition(dfaStart + currentDFAState, input, dfaStart + numDFAStates);
}
}
}
currentDFAState++;
}
// We have a dfa accepting the positive regexp.
// Now the complement:
if (Build.DEBUG) {
Out.debug("dfa finished, nfa is now :" + Out.NL + this);
}
int start = dfaStart + numDFAStates + 1;
int error = dfaStart + numDFAStates + 2;
int end = dfaStart + numDFAStates + 3;
addEpsilonTransition(start, dfaStart);
for (int i = 0; i < numInput; i++) addTransition(error, i, error);
addEpsilonTransition(error, end);
for (int s = 0; s <= numDFAStates; s++) {
currentState = dfaList.get(s);
currentDFAState = dfaStart + s;
// if it was not a final state, it is now in the complement
if (!currentState.hasElement(nfa.end())) addEpsilonTransition(currentDFAState, end);
// all inputs not present (formerly leading to an implicit error)
// now lead to an explicit (final) state accepting everything.
for (int i = 0; i < numInput; i++)
if (table[currentDFAState][i] == null) addTransition(currentDFAState, i, error);
}
// eliminate transitions that cannot reach final states
removeDead(dfaStart, end);
if (Build.DEBUG) {
Out.debug("complement finished, nfa (" + start + "," + end + ") is now :" + this);
}
return IntPair.create(start, end);
}
/**
* Find all states from (numerically) {@code start} to @{@code end} that (transitively) cannot
* reach reach {@code end}, and remove the transitions leading to those states.
*
* <p>After a complement operation, there may be dead states left over in the NFA, which could
* lead the scanning engine into a situation where it is trying to perform lookahead even though
* no final state can ever be reached.
*
* <p>Precondition: all states that potentially lead to {@code end} are within the interval @{code
* [start,end]}. This is satisfied by DFA generation in the complement operation.
*
* <p>Precondition: end state has no outgoing transitions
*
* @param start the first state from which to compute live states
* @param end the state that if it can be reached makes a state live
* @see NFA#complement(IntPair)
*/
private void removeDead(int start, int end) {
if (Build.DEBUG) {
Out.debug("removeDead (" + start + "," + end + ") " + Out.NL + this);
}
StateSet notvisited = tempStateSet;
StateSet reachable = new StateSet(numStates, start);
notvisited.clear();
notvisited.addState(start);
while (notvisited.containsElements()) {
int state = notvisited.getAndRemoveElement();
notvisited.add(reachable.complement(epsilon[state]));
reachable.add(epsilon[state]);
for (int i = 0; i < numInput; i++) {
notvisited.add(reachable.complement(table[state][i]));
reachable.add(table[state][i]);
}
}
if (Build.DEBUG) {
Out.debug("reachable states " + reachable);
}
StateSet live = new StateSet(numStates, end);
boolean changed = true;
// compute all live states
while (changed) {
changed = false;
Out.debug("live: " + live);
StateSet complement = live.complement(reachable);
if (complement == null) throw new GeneratorException(new NullPointerException(), true);
for (int s : complement) {
for (int i = 0; i < numInput; i++) {
if (table[s][i] != null) {
for (int state : table[s][i]) {
if (live.hasElement(state)) {
changed = true;
live.addState(s);
}
}
}
}
if (epsilon[s] != null) {
for (int state : epsilon[s]) {
if (live.hasElement(state)) {
changed = true;
live.addState(s);
}
}
}
}
}
if (Build.DEBUG) {
Out.debug("live states: " + live);
}
// now remove all transitions to non-live states (unless everything is live)
if (!reachable.equals(live)) {
for (int s : reachable) {
for (int i = 0; i < numInput; i++) if (table[s][i] != null) table[s][i].intersect(live);
if (epsilon[s] != null) epsilon[s].intersect(live);
}
}
if (Build.DEBUG) {
Out.debug("Removed dead states " + Out.NL + this);
}
}
/**
* Constructs a two state NFA for char class regexps, such that the NFA has
*
* <ul>
* <li>exactly one start state,
* <li>exactly one end state,
* <li>no transitions leading out of the end state,
* <li>no transitions leading into the start state.
* </ul>
*
* <p>Assumes that regExp.isCharClass(macros) == true
*
* @param regExp the regular expression to construct the NFA for
*/
private void insertCCLNFA(RegExp regExp, int start, int end) {
switch (regExp.type) {
case sym.BAR:
RegExp2 r = (RegExp2) regExp;
insertCCLNFA(r.r1, start, end);
insertCCLNFA(r.r2, start, end);
return;
case sym.PRIMCLASS:
insertClassNFA((IntCharSet) ((RegExp1) regExp).content, start, end);
return;
case sym.CHAR:
insertLetterNFA(false, (Integer) ((RegExp1) regExp).content, start, end);
return;
case sym.CHAR_I:
insertLetterNFA(true, (Integer) ((RegExp1) regExp).content, start, end);
return;
default:
throw new RegExpException(regExp);
}
}
/**
* Constructs an NFA for regExp such that the NFA has
*
* <p>exactly one start state, exactly one end state, no transitions leading out of the end state
* no transitions leading into the start state
*
* @param regExp the regular expression to construct the NFA for
* @return a pair of integers denoting the index of start and end state of the NFA.
*/
public IntPair insertNFA(RegExp regExp) {
IntPair nfa1, nfa2;
int start, end;
RegExp2 r;
if (Build.DEBUG) {
Out.debug("Inserting RegExp : " + regExp);
}
if (regExp.isCharClass()) {
start = numStates;
end = numStates + 1;
ensureCapacity(end + 1);
numStates = end + 1;
insertCCLNFA(regExp, start, end);
return IntPair.create(start, end);
}
switch (regExp.type) {
case sym.BAR:
r = (RegExp2) regExp;
nfa1 = insertNFA(r.r1);
nfa2 = insertNFA(r.r2);
start = nfa2.end() + 1;
end = nfa2.end() + 2;
addEpsilonTransition(start, nfa1.start());
addEpsilonTransition(start, nfa2.start());
addEpsilonTransition(nfa1.end(), end);
addEpsilonTransition(nfa2.end(), end);
return IntPair.create(start, end);
case sym.CONCAT:
r = (RegExp2) regExp;
nfa1 = insertNFA(r.r1);
nfa2 = insertNFA(r.r2);
addEpsilonTransition(nfa1.end(), nfa2.start());
return IntPair.create(nfa1.start(), nfa2.end());
case sym.STAR:
nfa1 = insertNFA((RegExp) ((RegExp1) regExp).content);
start = nfa1.end() + 1;
end = nfa1.end() + 2;
addEpsilonTransition(nfa1.end(), end);
addEpsilonTransition(start, nfa1.start());
addEpsilonTransition(start, end);
addEpsilonTransition(nfa1.end(), nfa1.start());
return IntPair.create(start, end);
case sym.PLUS:
nfa1 = insertNFA((RegExp) ((RegExp1) regExp).content);
start = nfa1.end() + 1;
end = nfa1.end() + 2;
addEpsilonTransition(nfa1.end(), end);
addEpsilonTransition(start, nfa1.start());
addEpsilonTransition(nfa1.end(), nfa1.start());
return IntPair.create(start, end);
case sym.QUESTION:
nfa1 = insertNFA((RegExp) ((RegExp1) regExp).content);
addEpsilonTransition(nfa1.start(), nfa1.end());
return IntPair.create(nfa1.start(), nfa1.end());
case sym.BANG:
return complement(insertNFA((RegExp) ((RegExp1) regExp).content));
case sym.TILDE:
return insertNFA(regExp.resolveTilde());
case sym.STRING:
return insertStringNFA(false, (String) ((RegExp1) regExp).content);
case sym.STRING_I:
return insertStringNFA(true, (String) ((RegExp1) regExp).content);
default:
throw new RegExpException(regExp);
}
}
}