// UnicodeProperties.java

/*
 * Copyright (C) 1998-2019 Gerwin Klein <lsf@jflex.de>
 * Copyright (C) 2008-2019 Steve Rowe <sarowe@gmail.com>
 * Copyright (C) 2017-2020 Google, LLC.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */
package jflex.core.unicode;

import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import jflex.chars.Interval;

// DO NOT EDIT
// This class was automatically generated by //java/de/jflex/ucd_generator
// based on Unicode data files downloaded from unicode.org.
/**
 * Unicode properties that can be bound to a specific Unicode version.
 *
 * <p>Supported unicode versions are defined in {@link #UNICODE_VERSIONS}.
 */
public class UnicodeProperties {

  /** Human-readable list of all supported Unicode versions. */
  public static final String UNICODE_VERSIONS =
      "1.1, 1.1.5, 2, 2.0, 2.0.14, 2.1, 2.1.9, 3, 3.0, 3.0.1, 3.1, 3.1.1, 3.2, 3.2.0, 4, 4.0,"
          + " 4.0.1, 4.1, 4.1.0, 5, 5.0, 5.0.0, 5.1, 5.1.0, 5.2, 5.2.0, 6, 6.0, 6.0.0, 6.1, 6.1.0,"
          + " 6.2, 6.2.0, 6.3, 6.3.0, 7, 7.0, 7.0.0, 8, 8.0, 8.0.0, 9, 9.0, 9.0.0, 10, 10.0,"
          + " 10.0.0, 11, 11.0, 11.0.0, 12, 12.0, 12.0.0, 12.1, 12.1.0";

  /** Version bound by the no-argument constructor. */
  private static final String DEFAULT_UNICODE_VERSION = "12.1";

  /**
   * Matches a single hyphen, underscore, whitespace character, or parenthesis; these characters are
   * stripped from identifiers by {@code normalize(String)}.
   */
  private static final Pattern WORD_SEP_PATTERN = Pattern.compile("[-_\\s()]");

  /** Highest code point of the bound Unicode version; assigned in {@code bind(...)}. */
  private int maximumCodePoint;

  /**
   * Maps property values and their aliases to the corresponding character sets. Lookups normalize
   * the query first; keys coming from the data tables are stored as given (presumably already in
   * normalized form - verify against the generator).
   */
  private final Map<String, IntCharSet> propertyValueIntervals = new HashMap<>();

  /** Packed caseless match partition data; unpacked on demand by {@code initCaselessMatches()}. */
  private String caselessMatchPartitions;

  /** Number of code point slots in each record of {@link #caselessMatchPartitions}. */
  private int caselessMatchPartitionSize;

  /**
   * Caseless equivalence sets indexed by code point. Stays {@code null} until the first call to
   * {@code getCaselessMatches(int)}.
   */
  private IntCharSet[] caselessMatches;

  /**
   * Creates an instance bound to the default Unicode version.
   *
   * @throws UnsupportedUnicodeVersionException if the default version is not supported.
   */
  public UnicodeProperties() throws UnsupportedUnicodeVersionException {
    this(DEFAULT_UNICODE_VERSION);
  }

  /**
   * Creates an instance bound to the requested Unicode version and unpacks its property data.
   *
   * @param version The Unicode version whose data should be unpacked
   * @throws UnsupportedUnicodeVersionException if the requested version is not supported.
   */
  public UnicodeProperties(String version) throws UnsupportedUnicodeVersionException {
    this.init(version);
  }

  /**
   * Reports the highest code point of the Unicode version this instance is bound to.
   *
   * @return the maximum code point for the selected Unicode version.
   */
  public int getMaximumCodePoint() {
    return this.maximumCodePoint;
  }

  /**
   * Looks up the character set registered for a property, property value, or alias in the selected
   * Unicode version. The argument is normalized before the lookup.
   *
   * @param propertyValue The Unicode property or property value (or alias for one of these) to look
   *     up.
   * @return The character interval set registered under the normalized name, or null if there is
   *     none.
   */
  public IntCharSet getIntCharSet(String propertyValue) {
    String key = normalize(propertyValue);
    return propertyValueIntervals.get(key);
  }

  /**
   * Returns the set of all properties, property values, and their aliases supported by the
   * specified Unicode version.
   *
   * <p>The result is a read-only view: attempts to modify it throw {@link
   * UnsupportedOperationException}, so callers cannot accidentally remove entries from this
   * instance's internal lookup table.
   *
   * @return An unmodifiable view of all property names supported by the specified Unicode version
   */
  public Set<String> getPropertyValues() {
    // keySet() is a live view that supports removal; wrap it so the internal map stays intact.
    return Collections.unmodifiableSet(propertyValueIntervals.keySet());
  }

  /**
   * Returns a set of character intervals representing all characters that are case-insensitively
   * equivalent to the given character, including the given character itself.
   *
   * <p>The first call to this method lazily initializes the backing data. The initialization is
   * not synchronized.
   *
   * @param c The character for which to return case-insensitive equivalents.
   * @return All case-insensitively equivalent characters, or null if the given character is
   *     case-insensitively equivalent only to itself. Also null when {@code c} is negative or
   *     greater than {@link #getMaximumCodePoint()}, since no equivalents are recorded outside
   *     that range.
   */
  public IntCharSet getCaselessMatches(int c) {
    // The backing array has exactly maximumCodePoint + 1 slots; anything outside has no matches.
    if (c < 0 || c > maximumCodePoint) {
      return null;
    }
    if (null == caselessMatches) {
      initCaselessMatches();
    }
    return caselessMatches[c];
  }

  /**
   * Expands the packed caseless match records into {@link #caselessMatches}. Invoked by {@link
   * #getCaselessMatches(int)} the first time caseless data is requested.
   *
   * <p>The packed string is read as consecutive records of {@link #caselessMatchPartitionSize}
   * code points each. Every non-zero member of a record is mapped to one shared set holding all
   * non-zero members of that record.
   */
  private void initCaselessMatches() {
    caselessMatches = new IntCharSet[maximumCodePoint + 1];
    final int[] record = new int[caselessMatchPartitionSize];
    int pos = 0;
    while (pos < caselessMatchPartitions.length()) {
      IntCharSet equivalents = new IntCharSet();
      for (int slot = 0; slot < caselessMatchPartitionSize; slot++) {
        int codePoint = caselessMatchPartitions.codePointAt(pos);
        pos += Character.charCount(codePoint);
        record[slot] = codePoint;
        // Zero entries are trailing padding for records shorter than the record length.
        if (codePoint > 0) {
          equivalents.add(codePoint);
        }
      }
      if (!equivalents.containsElements()) {
        continue;
      }
      for (int slot = 0; slot < caselessMatchPartitionSize; slot++) {
        int member = record[slot];
        if (member > 0) {
          caselessMatches[member] = equivalents;
        }
      }
    }
  }

  /**
   * Based on the given version, selects and binds the corresponding Unicode data to facilitate
   * mappings from property values to character intervals.
   *
   * <p>Several spellings of a version (for example {@code "6"}, {@code "6.0"} and {@code "6.0.0"})
   * select the same data class.
   *
   * @param version The Unicode version for which to bind data
   * @throws UnsupportedUnicodeVersionException if the given version is null or not supported.
   */
  private void init(String version) throws UnsupportedUnicodeVersionException {
    // A switch on a null String throws NullPointerException; report it as an unsupported version
    // instead, which is the failure mode callers are already required to handle.
    if (null == version) {
      throw new UnsupportedUnicodeVersionException();
    }
    switch (version) {
      case "1.1":
      case "1.1.5":
        bind(
            jflex.core.unicode.data.Unicode_1_1.propertyValues,
            jflex.core.unicode.data.Unicode_1_1.intervals,
            jflex.core.unicode.data.Unicode_1_1.propertyValueAliases,
            jflex.core.unicode.data.Unicode_1_1.maximumCodePoint,
            jflex.core.unicode.data.Unicode_1_1.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_1_1.caselessMatchPartitionSize);
        break;
      case "2":
      case "2.0":
      case "2.0.14":
        bind(
            jflex.core.unicode.data.Unicode_2_0.propertyValues,
            jflex.core.unicode.data.Unicode_2_0.intervals,
            jflex.core.unicode.data.Unicode_2_0.propertyValueAliases,
            jflex.core.unicode.data.Unicode_2_0.maximumCodePoint,
            jflex.core.unicode.data.Unicode_2_0.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_2_0.caselessMatchPartitionSize);
        break;
      case "2.1":
      case "2.1.9":
        bind(
            jflex.core.unicode.data.Unicode_2_1.propertyValues,
            jflex.core.unicode.data.Unicode_2_1.intervals,
            jflex.core.unicode.data.Unicode_2_1.propertyValueAliases,
            jflex.core.unicode.data.Unicode_2_1.maximumCodePoint,
            jflex.core.unicode.data.Unicode_2_1.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_2_1.caselessMatchPartitionSize);
        break;
      case "3":
      case "3.0":
      case "3.0.1":
        bind(
            jflex.core.unicode.data.Unicode_3_0.propertyValues,
            jflex.core.unicode.data.Unicode_3_0.intervals,
            jflex.core.unicode.data.Unicode_3_0.propertyValueAliases,
            jflex.core.unicode.data.Unicode_3_0.maximumCodePoint,
            jflex.core.unicode.data.Unicode_3_0.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_3_0.caselessMatchPartitionSize);
        break;
      case "3.1":
      case "3.1.1":
        bind(
            jflex.core.unicode.data.Unicode_3_1.propertyValues,
            jflex.core.unicode.data.Unicode_3_1.intervals,
            jflex.core.unicode.data.Unicode_3_1.propertyValueAliases,
            jflex.core.unicode.data.Unicode_3_1.maximumCodePoint,
            jflex.core.unicode.data.Unicode_3_1.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_3_1.caselessMatchPartitionSize);
        break;
      case "3.2":
      case "3.2.0":
        bind(
            jflex.core.unicode.data.Unicode_3_2.propertyValues,
            jflex.core.unicode.data.Unicode_3_2.intervals,
            jflex.core.unicode.data.Unicode_3_2.propertyValueAliases,
            jflex.core.unicode.data.Unicode_3_2.maximumCodePoint,
            jflex.core.unicode.data.Unicode_3_2.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_3_2.caselessMatchPartitionSize);
        break;
      case "4":
      case "4.0":
      case "4.0.1":
        bind(
            jflex.core.unicode.data.Unicode_4_0.propertyValues,
            jflex.core.unicode.data.Unicode_4_0.intervals,
            jflex.core.unicode.data.Unicode_4_0.propertyValueAliases,
            jflex.core.unicode.data.Unicode_4_0.maximumCodePoint,
            jflex.core.unicode.data.Unicode_4_0.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_4_0.caselessMatchPartitionSize);
        break;
      case "4.1":
      case "4.1.0":
        bind(
            jflex.core.unicode.data.Unicode_4_1.propertyValues,
            jflex.core.unicode.data.Unicode_4_1.intervals,
            jflex.core.unicode.data.Unicode_4_1.propertyValueAliases,
            jflex.core.unicode.data.Unicode_4_1.maximumCodePoint,
            jflex.core.unicode.data.Unicode_4_1.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_4_1.caselessMatchPartitionSize);
        break;
      case "5":
      case "5.0":
      case "5.0.0":
        bind(
            jflex.core.unicode.data.Unicode_5_0.propertyValues,
            jflex.core.unicode.data.Unicode_5_0.intervals,
            jflex.core.unicode.data.Unicode_5_0.propertyValueAliases,
            jflex.core.unicode.data.Unicode_5_0.maximumCodePoint,
            jflex.core.unicode.data.Unicode_5_0.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_5_0.caselessMatchPartitionSize);
        break;
      case "5.1":
      case "5.1.0":
        bind(
            jflex.core.unicode.data.Unicode_5_1.propertyValues,
            jflex.core.unicode.data.Unicode_5_1.intervals,
            jflex.core.unicode.data.Unicode_5_1.propertyValueAliases,
            jflex.core.unicode.data.Unicode_5_1.maximumCodePoint,
            jflex.core.unicode.data.Unicode_5_1.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_5_1.caselessMatchPartitionSize);
        break;
      case "5.2":
      case "5.2.0":
        bind(
            jflex.core.unicode.data.Unicode_5_2.propertyValues,
            jflex.core.unicode.data.Unicode_5_2.intervals,
            jflex.core.unicode.data.Unicode_5_2.propertyValueAliases,
            jflex.core.unicode.data.Unicode_5_2.maximumCodePoint,
            jflex.core.unicode.data.Unicode_5_2.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_5_2.caselessMatchPartitionSize);
        break;
      case "6":
      case "6.0":
      case "6.0.0":
        bind(
            jflex.core.unicode.data.Unicode_6_0.propertyValues,
            jflex.core.unicode.data.Unicode_6_0.intervals,
            jflex.core.unicode.data.Unicode_6_0.propertyValueAliases,
            jflex.core.unicode.data.Unicode_6_0.maximumCodePoint,
            jflex.core.unicode.data.Unicode_6_0.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_6_0.caselessMatchPartitionSize);
        break;
      case "6.1":
      case "6.1.0":
        bind(
            jflex.core.unicode.data.Unicode_6_1.propertyValues,
            jflex.core.unicode.data.Unicode_6_1.intervals,
            jflex.core.unicode.data.Unicode_6_1.propertyValueAliases,
            jflex.core.unicode.data.Unicode_6_1.maximumCodePoint,
            jflex.core.unicode.data.Unicode_6_1.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_6_1.caselessMatchPartitionSize);
        break;
      case "6.2":
      case "6.2.0":
        bind(
            jflex.core.unicode.data.Unicode_6_2.propertyValues,
            jflex.core.unicode.data.Unicode_6_2.intervals,
            jflex.core.unicode.data.Unicode_6_2.propertyValueAliases,
            jflex.core.unicode.data.Unicode_6_2.maximumCodePoint,
            jflex.core.unicode.data.Unicode_6_2.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_6_2.caselessMatchPartitionSize);
        break;
      case "6.3":
      case "6.3.0":
        bind(
            jflex.core.unicode.data.Unicode_6_3.propertyValues,
            jflex.core.unicode.data.Unicode_6_3.intervals,
            jflex.core.unicode.data.Unicode_6_3.propertyValueAliases,
            jflex.core.unicode.data.Unicode_6_3.maximumCodePoint,
            jflex.core.unicode.data.Unicode_6_3.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_6_3.caselessMatchPartitionSize);
        break;
      case "7":
      case "7.0":
      case "7.0.0":
        bind(
            jflex.core.unicode.data.Unicode_7_0.propertyValues,
            jflex.core.unicode.data.Unicode_7_0.intervals,
            jflex.core.unicode.data.Unicode_7_0.propertyValueAliases,
            jflex.core.unicode.data.Unicode_7_0.maximumCodePoint,
            jflex.core.unicode.data.Unicode_7_0.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_7_0.caselessMatchPartitionSize);
        break;
      case "8":
      case "8.0":
      case "8.0.0":
        bind(
            jflex.core.unicode.data.Unicode_8_0.propertyValues,
            jflex.core.unicode.data.Unicode_8_0.intervals,
            jflex.core.unicode.data.Unicode_8_0.propertyValueAliases,
            jflex.core.unicode.data.Unicode_8_0.maximumCodePoint,
            jflex.core.unicode.data.Unicode_8_0.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_8_0.caselessMatchPartitionSize);
        break;
      case "9":
      case "9.0":
      case "9.0.0":
        bind(
            jflex.core.unicode.data.Unicode_9_0.propertyValues,
            jflex.core.unicode.data.Unicode_9_0.intervals,
            jflex.core.unicode.data.Unicode_9_0.propertyValueAliases,
            jflex.core.unicode.data.Unicode_9_0.maximumCodePoint,
            jflex.core.unicode.data.Unicode_9_0.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_9_0.caselessMatchPartitionSize);
        break;
      case "10":
      case "10.0":
      case "10.0.0":
        bind(
            jflex.core.unicode.data.Unicode_10_0.propertyValues,
            jflex.core.unicode.data.Unicode_10_0.intervals,
            jflex.core.unicode.data.Unicode_10_0.propertyValueAliases,
            jflex.core.unicode.data.Unicode_10_0.maximumCodePoint,
            jflex.core.unicode.data.Unicode_10_0.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_10_0.caselessMatchPartitionSize);
        break;
      case "11":
      case "11.0":
      case "11.0.0":
        bind(
            jflex.core.unicode.data.Unicode_11_0.propertyValues,
            jflex.core.unicode.data.Unicode_11_0.intervals,
            jflex.core.unicode.data.Unicode_11_0.propertyValueAliases,
            jflex.core.unicode.data.Unicode_11_0.maximumCodePoint,
            jflex.core.unicode.data.Unicode_11_0.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_11_0.caselessMatchPartitionSize);
        break;
      case "12":
      case "12.0":
      case "12.0.0":
        bind(
            jflex.core.unicode.data.Unicode_12_0.propertyValues,
            jflex.core.unicode.data.Unicode_12_0.intervals,
            jflex.core.unicode.data.Unicode_12_0.propertyValueAliases,
            jflex.core.unicode.data.Unicode_12_0.maximumCodePoint,
            jflex.core.unicode.data.Unicode_12_0.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_12_0.caselessMatchPartitionSize);
        break;
      case "12.1":
      case "12.1.0":
        bind(
            jflex.core.unicode.data.Unicode_12_1.propertyValues,
            jflex.core.unicode.data.Unicode_12_1.intervals,
            jflex.core.unicode.data.Unicode_12_1.propertyValueAliases,
            jflex.core.unicode.data.Unicode_12_1.maximumCodePoint,
            jflex.core.unicode.data.Unicode_12_1.caselessMatchPartitions,
            jflex.core.unicode.data.Unicode_12_1.caselessMatchPartitionSize);
        break;
      default:
        throw new UnsupportedUnicodeVersionException();
    }
  }

  /**
   * Stores the version-specific settings on this instance and unpacks the property data into
   * {@link #propertyValueIntervals}. Caseless match data is only stored here; it is unpacked later,
   * on first use.
   *
   * @param propertyValues The property values, in the same order as the packed data in {@code
   *     intervals}, for the selected Unicode version.
   * @param intervals The packed character intervals, one string per entry of {@code
   *     propertyValues}; each string is a sequence of (start, end) code point pairs.
   * @param propertyValueAliases Alternating alias/property value entries mapping aliases to
   *     property values, for the selected Unicode version.
   * @param maximumCodePoint The maximum code point for the selected Unicode version.
   * @param caselessMatchPartitions The packed caseless match partition data for the selected
   *     Unicode version
   * @param caselessMatchPartitionSize The partition data record length (the maximum number of
   *     elements in a caseless match partition) for the selected Unicode version.
   */
  private void bind(
      String[] propertyValues,
      String[] intervals,
      String[] propertyValueAliases,
      int maximumCodePoint,
      String caselessMatchPartitions,
      int caselessMatchPartitionSize) {
    // The caselessMatches array is built lazily, so only remember the packed form for now.
    this.caselessMatchPartitions = caselessMatchPartitions;
    this.caselessMatchPartitionSize = caselessMatchPartitionSize;
    this.maximumCodePoint = maximumCodePoint;

    for (int i = 0; i < propertyValues.length; i++) {
      String name = propertyValues[i];
      String packed = intervals[i];

      // Decode consecutive (start, end) code point pairs into one set.
      IntCharSet unpacked = new IntCharSet();
      int pos = 0;
      while (pos < packed.length()) {
        int first = packed.codePointAt(pos);
        pos += Character.charCount(first);
        int last = packed.codePointAt(pos);
        pos += Character.charCount(last);
        unpacked.add(new Interval(first, last));
      }
      propertyValueIntervals.put(name, unpacked);

      if (name.length() != 2) {
        continue;
      }
      // Two-character names also contribute to a set keyed by their first character
      // (presumably General Category groups such as "lu" -> "l"; confirm against the generator).
      String firstLetter = name.substring(0, 1);
      IntCharSet combined = propertyValueIntervals.get(firstLetter);
      if (combined == null) {
        combined = new IntCharSet();
        propertyValueIntervals.put(firstLetter, combined);
      }
      combined.add(unpacked);
    }

    // Aliases arrive as alternating alias/value entries, so the array length must be even.
    assert propertyValueAliases.length % 2 == 0;
    for (int i = 0; i + 1 < propertyValueAliases.length; i += 2) {
      String alias = propertyValueAliases[i];
      String canonical = propertyValueAliases[i + 1];
      IntCharSet target = propertyValueIntervals.get(canonical);
      // Aliases whose target value has no registered set are skipped.
      if (target != null) {
        propertyValueIntervals.put(alias, target);
      }
    }

    bindInvariantIntervals();
  }

  /**
   * Registers the two sets that do not come from the data tables: \p{ASCII}, covering 0 through
   * 0x7F, and \p{Any}, covering 0 through {@link #maximumCodePoint}. Both are stored in {@link
   * #propertyValueIntervals} under their normalized names.
   */
  private void bindInvariantIntervals() {
    propertyValueIntervals.put(normalize("ASCII"), IntCharSet.ofCharacterRange(0, 0x7F));
    propertyValueIntervals.put(normalize("Any"), IntCharSet.ofCharacterRange(0, maximumCodePoint));
  }

  /**
   * Brings an identifier into its lookup form: lower-cases it (English locale), strips every
   * whitespace character, underscore, hyphen, and parenthesis, and turns each ':' into '='.
   *
   * @param identifier The identifier to normalize; may be null
   * @return The normalized identifier, or null if the argument was null
   */
  private static String normalize(String identifier) {
    if (identifier == null) {
      return null;
    }
    String lowered = identifier.toLowerCase(Locale.ENGLISH);
    String stripped = WORD_SEP_PATTERN.matcher(lowered).replaceAll("");
    return stripped.replace(':', '=');
  }

  /**
   * Signals that a Unicode version is not supported. The exception message lists the supported
   * versions.
   */
  public static class UnsupportedUnicodeVersionException extends Exception {
    private static final long serialVersionUID = -1718158223161422981L;

    /** Message shared by all constructors. */
    private static final String MESSAGE = "Supported versions: " + UNICODE_VERSIONS;

    /** Creates the exception with the supported-versions message and no cause. */
    public UnsupportedUnicodeVersionException() {
      super(MESSAGE);
    }

    /**
     * Creates the exception with the supported-versions message and the given cause.
     *
     * @param cause The underlying cause of this exception
     */
    public UnsupportedUnicodeVersionException(Throwable cause) {
      super(MESSAGE, cause);
    }
  }
}