CharClasses.java

/*
 * Copyright (C) 1998-2018  Gerwin Klein <lsf@jflex.de>
 * SPDX-License-Identifier: BSD-3-Clause
 */

package jflex.core.unicode;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import jflex.base.Pair;
import jflex.chars.Interval;
import jflex.logging.Out;

/**
 * Character Classes.
 *
 * @author Gerwin Klein
 * @version JFlex 1.10.0-SNAPSHOT
 */
public class CharClasses {

  /** debug flag (for char classes only) */
  private static final boolean DEBUG = false;

  /** for sorting disjoint IntCharSets */
  private static final Comparator<IntCharSet> INT_CHAR_SET_COMPARATOR = new IntCharSetComparator();

  /** the largest character that can be used in char classes */
  public static final int maxChar = 0x10FFFF;

  /** the char classes */
  private List<IntCharSet> classes;

  /** the largest character actually used in a specification */
  private int maxCharUsed;

  /** the @{link UnicodeProperties} the spec scanner used */
  private UnicodeProperties unicodeProps;

  /**
   * Constructs a new CharClasses object.
   *
   * @param maxCharCode the last character code to be considered. (127 for 7bit Lexers, 255 for 8bit
   *     Lexers and UnicodeProperties.getMaximumCodePoint() for Unicode Lexers).
   * @param scanner the scanner containing the UnicodeProperties instance from which caseless
   *     mappings can be obtained.
   */
  public CharClasses(int maxCharCode, ILexScan scanner) {
    init(maxCharCode, scanner.getUnicodeProperties());
  }

  /**
   * Constructs a new CharClasses object.
   *
   * @param maxCharCode the last character code to be considered. (127 for 7bit Lexers, 255 for 8bit
   *     Lexers and UnicodeProperties.getMaximumCodePoint() for Unicode Lexers).
   * @param props the UnicodeProperties instance from which caseless mappings can be obtained.
   */
  public CharClasses(int maxCharCode, UnicodeProperties props) {
    init(maxCharCode, props);
  }

  /**
   * Provides space for classes of characters from 0 to maxCharCode.
   *
   * <p>Initially all characters are in class 0.
   *
   * @param maxCharCode the last character code to be considered. (127 for 7bit Lexers, 255 for 8bit
   *     Lexers and UnicodeProperties.getMaximumCodePoint() for Unicode Lexers).
   * @param props the UnicodeProperties instance from which caseless mappings can be obtained.
   */
  private void init(int maxCharCode, UnicodeProperties props) {
    if (maxCharCode < 0) {
      throw new IllegalArgumentException("maxCharCode " + maxCharCode + " is negative.");
    } else if (maxCharCode > maxChar) {
      throw new IllegalArgumentException(
          "maxCharCode "
              + Integer.toHexString(maxCharCode)
              + " is larger than maxChar "
              + Integer.toHexString(maxChar));
    }

    maxCharUsed = maxCharCode;
    this.unicodeProps = props;
    classes = new ArrayList<>();
    classes.add(IntCharSet.ofCharacterRange(0, maxCharCode));
  }

  /**
   * Returns the greatest Unicode value of the current input character set.
   *
   * @return unicode value.
   */
  public int getMaxCharCode() {
    return maxCharUsed;
  }

  /**
   * Sets the largest Unicode value of the current input character set.
   *
   * @param maxCharCode the largest character code, used for the scanner (i.e. %7bit, %8bit, %16bit
   *     etc.)
   */
  public void setMaxCharCode(int maxCharCode) {
    if (maxCharCode < 0) {
      throw new IllegalArgumentException("maxCharCode " + maxCharCode + " is negative.");
    } else if (maxCharCode > maxChar) {
      throw new IllegalArgumentException(
          "maxCharCode "
              + Integer.toHexString(maxCharCode)
              + " is larger than maxChar "
              + Integer.toHexString(maxChar));
    }

    maxCharUsed = maxCharCode;
  }

  /**
   * Returns the current number of character classes.
   *
   * @return number of character classes.
   */
  public int getNumClasses() {
    return classes.size();
  }

  /**
   * Returns the unicode properties used by this CharClasses object.
   *
   * @return the unicode properties used by this CharClasses object.
   */
  public UnicodeProperties getUnicodeProperties() {
    return unicodeProps;
  }

  /** Returns a deep-copy list of all char class partions. */
  public List<IntCharSet> allClasses() {
    List<IntCharSet> result = new ArrayList<>();
    for (IntCharSet ccl : classes) {
      result.add(IntCharSet.copyOf(ccl));
    }
    return result;
  }

  /**
   * Updates the current partition, so that the specified set of characters gets a new character
   * class.
   *
   * <p>Characters that are elements of {@code set} are not in the same equivalence class with
   * characters that are not elements of {@code set}.
   *
   * @param set the set of characters to distinguish from the rest
   * @param caseless if true upper/lower/title case are considered equivalent
   */
  public void makeClass(IntCharSet set, boolean caseless) {
    set = IntCharSet.copyOf(set); // avoid destructively updating the original

    if (caseless) set = set.getCaseless(unicodeProps);

    if (DEBUG) {
      Out.dump("makeClass(" + set + ", " + caseless + ")");
      dump();
    }

    int oldSize = classes.size();
    for (int i = 0; i < oldSize; i++) {
      IntCharSet x = classes.get(i);

      if (Objects.equals(x, set)) return;

      IntCharSet and = x.and(set);

      if (and.containsElements()) {
        if (Objects.equals(x, and)) {
          set.sub(and);
          continue;
        } else if (Objects.equals(set, and)) {
          x.sub(and);
          classes.add(and);
          if (DEBUG) {
            Out.dump("makeClass(..) finished");
            dump();
          }
          return;
        }

        set.sub(and);
        x.sub(and);
        classes.add(and);
      }
    }

    if (DEBUG) {
      Out.dump("makeClass(..) finished");
      dump();
    }
  }

  /**
   * Returns the code of the character class the specified character belongs to.
   *
   * @param codePoint code point to get the char class for.
   * @return code of the character class, -1 if {@code codePoint} is not in the input char set.
   */
  public int getClassCode(int codePoint) {
    int i = 0;
    while (i < classes.size()) {
      IntCharSet x = classes.get(i);
      if (x.contains(codePoint)) return i;
      i++;
    }
    return -1;
  }

  /**
   * Retuns a copy of a single char class partition by code.
   *
   * @param code the code of the char class partition to return.
   * @return a copy of the char class with the specified code.
   */
  public IntCharSet getCharClass(int code) {
    return IntCharSet.copyOf(classes.get(code));
  }

  /** Dumps charclasses to the dump output stream. */
  public void dump() {
    Out.dump(toString());
  }

  /**
   * Returns a string representation of one char class
   *
   * @param theClass the index of the class to
   * @return a {@link java.lang.String} object.
   */
  public String toString(int theClass) {
    return classes.get(theClass).toString();
  }

  @Override
  public String toString() {
    StringBuilder result = new StringBuilder("CharClasses:");

    result.append(Out.NL);

    for (int i = 0; i < classes.size(); i++)
      result
          .append("class ")
          .append(i)
          .append(":")
          .append(Out.NL)
          .append(classes.get(i))
          .append(Out.NL);

    return result.toString();
  }

  /**
   * Creates a new character class for the single character {@code singleChar}.
   *
   * @param caseless if true upper/lower/title case are considered equivalent
   * @param singleChar character.
   */
  public void makeClass(int singleChar, boolean caseless) {
    makeClass(IntCharSet.ofCharacter(singleChar), caseless);
  }

  /**
   * Creates a new character class for each character of the specified String.
   *
   * @param caseless if true upper/lower/title case are considered equivalent
   * @param str the String to iterate single char class creation over.
   */
  public void makeClass(String str, boolean caseless) {
    for (int i = 0; i < str.length(); ) {
      int ch = str.codePointAt(i);
      makeClass(ch, caseless);
      i += Character.charCount(ch);
    }
  }

  /**
   * Returns an array that contains the character class codes of all characters in the specified set
   * of input characters.
   */
  public int[] getClassCodes(IntCharSet set, boolean negate) {

    if (DEBUG) {
      Out.dump("getting class codes for " + set);
      if (negate) Out.dump("[negated]");
    }

    int size = classes.size();

    // [fixme: optimize]
    int[] temp = new int[size];
    int length = 0;

    for (int i = 0; i < size; i++) {
      IntCharSet x = classes.get(i);
      if (negate) {
        if (!set.and(x).containsElements()) {
          temp[length++] = i;
          if (DEBUG) Out.dump("code " + i);
        }
      } else {
        if (set.and(x).containsElements()) {
          temp[length++] = i;
          if (DEBUG) Out.dump("code " + i);
        }
      }
    }

    int[] result = new int[length];
    System.arraycopy(temp, 0, result, 0, length);

    return result;
  }

  /**
   * Checks the invariants of this object.
   *
   * <p>All classes must be disjoint, and their union must be the entire input set.
   *
   * @return true when the invariants of this objects hold.
   */
  public boolean invariants() {
    for (int i = 0; i < classes.size(); i++)
      for (int j = i + 1; j < classes.size(); j++) {
        if (classes.get(i).and(classes.get(j)).containsElements()) {
          return false;
        }
      }

    IntCharSet union = new IntCharSet();
    for (IntCharSet i : classes) {
      union.add(i);
    }

    return IntCharSet.allChars().equals(union);
  }

  /**
   * Brings the partitions into a canonical order such that objects that implement the same
   * partitions but in different order become equal.
   *
   * <p>For example, [ {0}, {1} ] and [ {1}, {0} ] implement the same partition of the set {0,1} but
   * have different content. Different order will lead to different input assignments in the NFA and
   * DFA phases and will make otherwise equal automata look distinct.
   *
   * <p>This is not needed for correctness, but it makes the comparison of output DFAs (e.g. in the
   * test suite) for equivalence more robust.
   */
  public void normalise() {
    classes.sort(INT_CHAR_SET_COMPARATOR);
  }

  /**
   * Construct a (deep) copy of the the provided CharClasses object.
   *
   * @param c the CharClasses to copy
   * @return a deep copy of c
   */
  public static CharClasses copyOf(CharClasses c) {
    CharClasses result = new CharClasses(c.maxCharUsed, c.unicodeProps);
    result.classes = c.allClasses();
    return result;
  }

  /**
   * Returns an array of all CharClassIntervals in this char class collection.
   *
   * <p>The array is ordered by char code, i.e. {@code result[i+1].start = result[i].end+1} Each
   * CharClassInterval contains the number of the char class it belongs to.
   *
   * @return an array of all {@link CharClassInterval} in this char class collection.
   */
  public CharClassInterval[] getIntervals() {
    int i, c;
    int size = classes.size();
    int numIntervals = 0;

    for (i = 0; i < size; i++) numIntervals += classes.get(i).numIntervals();

    List<Iterator<Interval>> iterators = new ArrayList<>();
    for (IntCharSet set : classes) iterators.add(set.intervalIterator());

    CharClassInterval[] result = new CharClassInterval[numIntervals];

    i = 0;
    c = 0;
    while (i < numIntervals) {
      int code = getClassCode(c);
      Interval iv = iterators.get(code).next(); // must have enough elements
      result[i++] = new CharClassInterval(iv.start, iv.end, code);
      c = iv.end + 1;
    }

    return result;
  }

  /**
   * Computes a two-level table structure representing this CharClass object, where second-level
   * blocks are shared if equal. The hope is that this sharing happens (very) often with a large
   * number of blocks being mapped to the same character class.
   *
   * @return a pair of a top-level table, and a list of second-level blocks for this char class
   *     object.
   */
  Pair<int[], List<CMapBlock>> computeTables() {
    CharClassInterval[] intervals = getIntervals();
    int intervalIndex = 0;
    int curClass = intervals[intervalIndex].charClass;
    int codePoint = 0;

    int topLevelSize = (maxCharUsed + 1) >> CMapBlock.BLOCK_BITS;
    int[] topLevel = new int[topLevelSize];
    List<CMapBlock> blocks = new ArrayList<>();

    for (int topIndex = 0; topIndex < topLevelSize; topIndex++) {
      int[] block = new int[CMapBlock.BLOCK_SIZE];
      for (int i = 0; i < CMapBlock.BLOCK_SIZE; i++, codePoint++) {
        // if maxCharUsed doesn't align to CMapBlock.BLOCK_BITS, we leave the
        // rest of the highest block equal to 0.
        if (maxCharUsed < codePoint) break;
        if (!intervals[intervalIndex].contains(codePoint)) {
          curClass = intervals[++intervalIndex].charClass;
        }
        block[i] = curClass;
      }
      // find earliest equal block (if any)
      CMapBlock b = new CMapBlock(block);
      int idx = blocks.indexOf(b);
      if (idx < 0) {
        idx = blocks.size();
        blocks.add(b);
      }
      topLevel[topIndex] = idx;
    }
    return new Pair<int[], List<CMapBlock>>(topLevel, blocks);
  }

  /** Turn a list of second-level blocks into a flat array. */
  private static int[] flattenBlocks(List<CMapBlock> blocks) {
    int[] result = new int[blocks.size() * CMapBlock.BLOCK_SIZE];
    for (int i = 0; i < blocks.size(); i++) {
      int[] block = blocks.get(i).block;
      System.arraycopy(block, 0, result, i << CMapBlock.BLOCK_BITS, CMapBlock.BLOCK_SIZE);
    }
    return result;
  }

  /**
   * Returns a two-level table structure for this char-class object. The char class of input {@code
   * x} is {@code snd[(fst[x >> BLOCK_BITS]) | (x && BLOCK_MASK))]} where {@code BLOCK_MASK =
   * BLOCK_SIZE - 1}, and the index of the first block in the top level is guaranteed to be 0 (which
   * means the {@code fst} lookup can be skipped if {@code x <= BLOCK_MASK}).
   *
   * @see CMapBlock#BLOCK_BITS
   * @see CMapBlock#BLOCK_SIZE
   */
  public Pair<int[], int[]> getTables() {
    Pair<int[], List<CMapBlock>> p = computeTables();
    int[] shifted = new int[p.fst.length];
    for (int i = 0; i < p.fst.length; i++) {
      shifted[i] = p.fst[i] << CMapBlock.BLOCK_BITS;
    }
    return new Pair<int[], int[]>(shifted, flattenBlocks(p.snd));
  }
}