// UnicodeProperties.java

package org.mozilla.javascript.regexp;

import java.util.Map;
import java.util.regex.Matcher;

/**
 * Unicode properties handler built on top of the Java 11 {@link Character} class. Handles the
 * binary properties from ECMA-262, the General_Category values and the Script property.
 *
 * <p>A property (optionally with a value) is first translated by {@link #lookup(String)} into a
 * compact encoded int, which can then be tested against code points with {@link
 * #hasProperty(int, int)}.
 */
public class UnicodeProperties {
    // Binary Property Names (from ECMA-262 table-binary-unicode-properties)
    public static final byte ALPHABETIC = 1;
    public static final byte ASCII = ALPHABETIC + 1;
    public static final byte CASE_IGNORABLE = ASCII + 1;
    public static final byte ASCII_HEX_DIGIT = CASE_IGNORABLE + 1;
    public static final byte HEX_DIGIT = ASCII_HEX_DIGIT + 1;
    public static final byte ID_CONTINUE = HEX_DIGIT + 1;
    public static final byte ID_START = ID_CONTINUE + 1;
    public static final byte LOWERCASE = ID_START + 1;
    public static final byte UPPERCASE = LOWERCASE + 1;
    public static final byte WHITE_SPACE = UPPERCASE + 1;

    // Non-binary properties
    public static final byte GENERAL_CATEGORY = WHITE_SPACE + 1;
    public static final byte SCRIPT = GENERAL_CATEGORY + 1;

    // Property Values for General Category (from PropertyValueAliases.txt)
    // OTHER
    public static final byte OTHER = 1;
    public static final byte CONTROL = OTHER + 1;
    public static final byte FORMAT = CONTROL + 1;
    public static final byte UNASSIGNED = FORMAT + 1;
    public static final byte PRIVATE_USE = UNASSIGNED + 1;
    public static final byte SURROGATE = PRIVATE_USE + 1;
    public static final byte LETTER = SURROGATE + 1;
    public static final byte LOWERCASE_LETTER = LETTER + 1;
    public static final byte MODIFIER_LETTER = LOWERCASE_LETTER + 1;
    public static final byte OTHER_LETTER = MODIFIER_LETTER + 1;
    public static final byte TITLECASE_LETTER = OTHER_LETTER + 1;
    public static final byte UPPERCASE_LETTER = TITLECASE_LETTER + 1;
    public static final byte MARK = UPPERCASE_LETTER + 1;
    public static final byte SPACING_MARK = MARK + 1;
    public static final byte ENCLOSING_MARK = SPACING_MARK + 1;
    public static final byte NONSPACING_MARK = ENCLOSING_MARK + 1;
    public static final byte NUMBER = NONSPACING_MARK + 1;
    public static final byte DECIMAL_NUMBER = NUMBER + 1;
    public static final byte LETTER_NUMBER = DECIMAL_NUMBER + 1;
    public static final byte OTHER_NUMBER = LETTER_NUMBER + 1;
    public static final byte PUNCTUATION = OTHER_NUMBER + 1;
    public static final byte CONNECTOR_PUNCTUATION = PUNCTUATION + 1;
    public static final byte DASH_PUNCTUATION = CONNECTOR_PUNCTUATION + 1;
    public static final byte CLOSE_PUNCTUATION = DASH_PUNCTUATION + 1;
    public static final byte FINAL_PUNCTUATION = CLOSE_PUNCTUATION + 1;
    public static final byte INITIAL_PUNCTUATION = FINAL_PUNCTUATION + 1;
    public static final byte OTHER_PUNCTUATION = INITIAL_PUNCTUATION + 1;
    public static final byte OPEN_PUNCTUATION = OTHER_PUNCTUATION + 1;
    public static final byte SYMBOL = OPEN_PUNCTUATION + 1;
    public static final byte CURRENCY_SYMBOL = SYMBOL + 1;
    public static final byte MODIFIER_SYMBOL = CURRENCY_SYMBOL + 1;
    public static final byte MATH_SYMBOL = MODIFIER_SYMBOL + 1;
    public static final byte OTHER_SYMBOL = MATH_SYMBOL + 1;
    public static final byte SEPARATOR = OTHER_SYMBOL + 1;
    public static final byte LINE_SEPARATOR = SEPARATOR + 1;
    public static final byte PARAGRAPH_SEPARATOR = LINE_SEPARATOR + 1;
    public static final byte SPACE_SEPARATOR = PARAGRAPH_SEPARATOR + 1;

    // Binary property values
    public static final byte TRUE = SPACE_SEPARATOR + 1;
    public static final byte FALSE = TRUE + 1;

    // Property Name Map (canonical names and aliases)
    public static final Map<String, Byte> PROPERTY_NAMES =
            Map.ofEntries(
                    Map.entry("Alphabetic", ALPHABETIC),
                    Map.entry("Alpha", ALPHABETIC),
                    Map.entry("ASCII", ASCII),
                    Map.entry("Case_Ignorable", CASE_IGNORABLE),
                    Map.entry("CI", CASE_IGNORABLE),
                    Map.entry("General_Category", GENERAL_CATEGORY),
                    Map.entry("gc", GENERAL_CATEGORY),
                    Map.entry("Script", SCRIPT),
                    Map.entry("sc", SCRIPT),
                    Map.entry("ASCII_Hex_Digit", ASCII_HEX_DIGIT),
                    Map.entry("AHex", ASCII_HEX_DIGIT),
                    Map.entry("Hex_Digit", HEX_DIGIT),
                    Map.entry("Hex", HEX_DIGIT),
                    Map.entry("ID_Continue", ID_CONTINUE),
                    Map.entry("IDC", ID_CONTINUE),
                    Map.entry("ID_Start", ID_START),
                    Map.entry("IDS", ID_START),
                    Map.entry("Lowercase", LOWERCASE),
                    Map.entry("Lower", LOWERCASE),
                    Map.entry("Uppercase", UPPERCASE),
                    Map.entry("Upper", UPPERCASE),
                    Map.entry("White_Space", WHITE_SPACE),
                    Map.entry("space", WHITE_SPACE));

    // Property Value Map for General Category (canonical names and aliases)
    public static final Map<String, Byte> PROPERTY_VALUES =
            Map.<String, Byte>ofEntries(
                    Map.entry("Other", OTHER),
                    Map.entry("C", OTHER),
                    Map.entry("Control", CONTROL),
                    Map.entry("Cc", CONTROL),
                    Map.entry("cntrl", CONTROL),
                    Map.entry("Format", FORMAT),
                    Map.entry("Cf", FORMAT),
                    Map.entry("Unassigned", UNASSIGNED),
                    Map.entry("Cn", UNASSIGNED),
                    Map.entry("Private_Use", PRIVATE_USE),
                    Map.entry("Co", PRIVATE_USE),
                    Map.entry("Surrogate", SURROGATE),
                    Map.entry("Cs", SURROGATE),
                    Map.entry("Letter", LETTER),
                    Map.entry("L", LETTER),
                    Map.entry("Lowercase_Letter", LOWERCASE_LETTER),
                    Map.entry("Ll", LOWERCASE_LETTER),
                    Map.entry("Modifier_Letter", MODIFIER_LETTER),
                    Map.entry("Lm", MODIFIER_LETTER),
                    Map.entry("Other_Letter", OTHER_LETTER),
                    Map.entry("Lo", OTHER_LETTER),
                    Map.entry("Titlecase_Letter", TITLECASE_LETTER),
                    Map.entry("Lt", TITLECASE_LETTER),
                    Map.entry("Uppercase_Letter", UPPERCASE_LETTER),
                    Map.entry("Lu", UPPERCASE_LETTER),
                    Map.entry("Mark", MARK),
                    Map.entry("M", MARK),
                    Map.entry("Combining_Mark", MARK),
                    Map.entry("Spacing_Mark", SPACING_MARK),
                    Map.entry("Mc", SPACING_MARK),
                    Map.entry("Enclosing_Mark", ENCLOSING_MARK),
                    Map.entry("Me", ENCLOSING_MARK),
                    Map.entry("Nonspacing_Mark", NONSPACING_MARK),
                    Map.entry("Mn", NONSPACING_MARK),
                    Map.entry("Number", NUMBER),
                    Map.entry("N", NUMBER),
                    Map.entry("Decimal_Number", DECIMAL_NUMBER),
                    Map.entry("Nd", DECIMAL_NUMBER),
                    // "digit" is the alias of Nd (Decimal_Number), not of N (Number).
                    Map.entry("digit", DECIMAL_NUMBER),
                    Map.entry("Letter_Number", LETTER_NUMBER),
                    Map.entry("Nl", LETTER_NUMBER),
                    Map.entry("Other_Number", OTHER_NUMBER),
                    Map.entry("No", OTHER_NUMBER),
                    Map.entry("Punctuation", PUNCTUATION),
                    Map.entry("P", PUNCTUATION),
                    Map.entry("punct", PUNCTUATION),
                    Map.entry("Connector_Punctuation", CONNECTOR_PUNCTUATION),
                    Map.entry("Pc", CONNECTOR_PUNCTUATION),
                    Map.entry("Dash_Punctuation", DASH_PUNCTUATION),
                    Map.entry("Pd", DASH_PUNCTUATION),
                    Map.entry("Close_Punctuation", CLOSE_PUNCTUATION),
                    Map.entry("Pe", CLOSE_PUNCTUATION),
                    Map.entry("Final_Punctuation", FINAL_PUNCTUATION),
                    Map.entry("Pf", FINAL_PUNCTUATION),
                    Map.entry("Initial_Punctuation", INITIAL_PUNCTUATION),
                    Map.entry("Pi", INITIAL_PUNCTUATION),
                    Map.entry("Other_Punctuation", OTHER_PUNCTUATION),
                    Map.entry("Po", OTHER_PUNCTUATION),
                    Map.entry("Open_Punctuation", OPEN_PUNCTUATION),
                    Map.entry("Ps", OPEN_PUNCTUATION),
                    Map.entry("Symbol", SYMBOL),
                    Map.entry("S", SYMBOL),
                    Map.entry("Currency_Symbol", CURRENCY_SYMBOL),
                    Map.entry("Sc", CURRENCY_SYMBOL),
                    Map.entry("Modifier_Symbol", MODIFIER_SYMBOL),
                    Map.entry("Sk", MODIFIER_SYMBOL),
                    Map.entry("Math_Symbol", MATH_SYMBOL),
                    Map.entry("Sm", MATH_SYMBOL),
                    Map.entry("Other_Symbol", OTHER_SYMBOL),
                    Map.entry("So", OTHER_SYMBOL),
                    Map.entry("Separator", SEPARATOR),
                    Map.entry("Z", SEPARATOR),
                    Map.entry("Line_Separator", LINE_SEPARATOR),
                    Map.entry("Zl", LINE_SEPARATOR),
                    Map.entry("Paragraph_Separator", PARAGRAPH_SEPARATOR),
                    Map.entry("Zp", PARAGRAPH_SEPARATOR),
                    Map.entry("Space_Separator", SPACE_SEPARATOR),
                    Map.entry("Zs", SPACE_SEPARATOR));

    /**
     * Syntax of a property expression: {@code name} or {@code name=value}. Compiled once because
     * {@link #lookup(String)} may be called for every property escape that gets parsed.
     */
    private static final java.util.regex.Pattern PROPERTY_PATTERN =
            java.util.regex.Pattern.compile(
                    "^(?<propName>[a-zA-Z_]+)(?:=(?<propValue>[a-zA-Z_0-9]+))?$");

    /** Largest value that fits into the 8-bit value slot of an encoded property. */
    private static final int MAX_ENCODED_VALUE = 0xFF;

    /**
     * Looks up a property name and optionally a value and returns an encoded int. For binary
     * properties, combines the property name with TRUE. For General_Category, combines
     * General_Category with the specified value; a lone General_Category value (for example
     * {@code "Lu"}) is accepted without the {@code gc=} prefix. For Script, the value is the
     * ordinal of the matching {@link Character.UnicodeScript}.
     *
     * @param propertyOrValue Property name or property name=value pair
     * @return Encoded int combining property name and value, or -1 if the input is null, empty,
     *     malformed or does not name a supported property / value
     */
    @SuppressWarnings("EnumOrdinal") // We don't persist the ordinals; hence this is safe.
    public static int lookup(String propertyOrValue) {
        if (propertyOrValue == null || propertyOrValue.isEmpty()) {
            return -1;
        }

        Matcher m = PROPERTY_PATTERN.matcher(propertyOrValue);
        if (!m.matches()) {
            return -1;
        }

        // propName is a mandatory group, so it is always non-null after a successful match.
        String property = m.group("propName");
        String value = m.group("propValue");
        Byte propByte = PROPERTY_NAMES.get(property);

        if (value == null) {
            // A lone name: either a binary property or a General_Category value.
            if (propByte == null) {
                // Check if it's a general category value without the gc= prefix
                Byte valueByte = PROPERTY_VALUES.get(property);
                if (valueByte == null) {
                    return -1;
                }
                return encodeProperty(GENERAL_CATEGORY, valueByte);
            }

            // Non-binary properties are meaningless without a value.
            if (propByte == GENERAL_CATEGORY || propByte == SCRIPT) {
                return -1;
            }

            // It's a binary property, encode with TRUE
            return encodeProperty(propByte, TRUE);
        }

        // property=value format
        if (propByte == null) {
            return -1;
        }

        switch (propByte) {
            case GENERAL_CATEGORY:
                {
                    Byte valueByte = PROPERTY_VALUES.get(value);
                    if (valueByte == null) {
                        return -1;
                    }
                    return encodeProperty(GENERAL_CATEGORY, valueByte);
                }
            case SCRIPT:
                try {
                    int ordinal = Character.UnicodeScript.forName(value).ordinal();
                    if (ordinal > MAX_ENCODED_VALUE) {
                        // Would not round-trip through the 8-bit value slot; reject instead of
                        // silently aliasing another script.
                        return -1;
                    }
                    return encodeProperty(SCRIPT, (byte) ordinal);
                } catch (IllegalArgumentException ignored) {
                    // Unknown script name.
                    return -1;
                }
            default:
                // Binary properties don't have values
                return -1;
        }
    }

    /**
     * Encodes a property name and value into a single int. The property name is stored in bits
     * 8-15 and the value in the low 8 bits; both are treated as unsigned bytes.
     *
     * @param property Property name constant
     * @param value Property value constant
     * @return Encoded int
     */
    private static int encodeProperty(byte property, byte value) {
        return ((property & 0xFF) << 8) | (value & 0xFF);
    }

    /** Cached copy of the script enum constants, indexed by ordinal. */
    private static final Character.UnicodeScript[] UNICODE_SCRIPT_VALUES =
            Character.UnicodeScript.values();

    /**
     * Tests if a code point has a specific Unicode property.
     *
     * @param property Encoded property (from lookup method)
     * @param codePoint Character code point to test
     * @return true if the code point has the property; false if it does not or if {@code
     *     property} does not encode a known property / value
     */
    public static boolean hasProperty(int property, int codePoint) {
        byte propByte = (byte) ((property >> 8) & 0xFF);
        int valueByte = (property & 0xFF);
        // For binary properties the value slot says whether the property must hold or not.
        boolean expected = valueByte == TRUE;

        switch (propByte) {
            case ALPHABETIC:
                return Character.isAlphabetic(codePoint) == expected;

            case ASCII:
                return (codePoint <= 0x7F) == expected;

            case CASE_IGNORABLE:
                {
                    // Java doesn't have a direct method for this
                    // This is an approximation
                    int type = Character.getType(codePoint);
                    boolean caseIgnorable =
                            type == Character.MODIFIER_SYMBOL
                                    || type == Character.MODIFIER_LETTER
                                    || type == Character.NON_SPACING_MARK;
                    return caseIgnorable == expected;
                }

            case GENERAL_CATEGORY:
                return checkGeneralCategory(valueByte, Character.getType(codePoint));

            case ASCII_HEX_DIGIT:
                return isHexDigit(codePoint) == expected;

            case HEX_DIGIT:
                return isUnicodeHexDigit(codePoint) == expected;

            case ID_CONTINUE:
                return Character.isUnicodeIdentifierPart(codePoint) == expected;

            case ID_START:
                return Character.isUnicodeIdentifierStart(codePoint) == expected;

            case LOWERCASE:
                return Character.isLowerCase(codePoint) == expected;

            case UPPERCASE:
                return Character.isUpperCase(codePoint) == expected;

            case WHITE_SPACE:
                // Note: This is only a good approximation of the Unicode white space property
                return expected
                        == (Character.isSpaceChar(codePoint) || Character.isWhitespace(codePoint));

            case SCRIPT:
                // Guard the index so that an int that did not come from lookup() yields false
                // instead of an ArrayIndexOutOfBoundsException.
                return valueByte < UNICODE_SCRIPT_VALUES.length
                        && Character.UnicodeScript.of(codePoint)
                                == UNICODE_SCRIPT_VALUES[valueByte];

            default:
                return false;
        }
    }

    /**
     * Maps our property value bytes to Java's Character.getType() values.
     *
     * @param propertyValueByte one of the General_Category value constants of this class
     * @param javaCategory category returned by {@link Character#getType(int)}
     * @return true if {@code javaCategory} belongs to the requested category (or category group)
     */
    private static boolean checkGeneralCategory(int propertyValueByte, int javaCategory) {
        switch (propertyValueByte) {
            case LETTER:
                return javaCategory == Character.UPPERCASE_LETTER
                        || javaCategory == Character.LOWERCASE_LETTER
                        || javaCategory == Character.TITLECASE_LETTER
                        || javaCategory == Character.MODIFIER_LETTER
                        || javaCategory == Character.OTHER_LETTER;
            case UPPERCASE_LETTER:
                return javaCategory == Character.UPPERCASE_LETTER;
            case LOWERCASE_LETTER:
                return javaCategory == Character.LOWERCASE_LETTER;
            case TITLECASE_LETTER:
                return javaCategory == Character.TITLECASE_LETTER;
            case MODIFIER_LETTER:
                return javaCategory == Character.MODIFIER_LETTER;
            case OTHER_LETTER:
                return javaCategory == Character.OTHER_LETTER;

            case MARK:
                return javaCategory == Character.NON_SPACING_MARK
                        || javaCategory == Character.ENCLOSING_MARK
                        || javaCategory == Character.COMBINING_SPACING_MARK;
            case NONSPACING_MARK:
                return javaCategory == Character.NON_SPACING_MARK;
            case ENCLOSING_MARK:
                return javaCategory == Character.ENCLOSING_MARK;
            case SPACING_MARK:
                return javaCategory == Character.COMBINING_SPACING_MARK;

            case NUMBER:
                return javaCategory == Character.DECIMAL_DIGIT_NUMBER
                        || javaCategory == Character.LETTER_NUMBER
                        || javaCategory == Character.OTHER_NUMBER;
            case DECIMAL_NUMBER:
                return javaCategory == Character.DECIMAL_DIGIT_NUMBER;
            case LETTER_NUMBER:
                return javaCategory == Character.LETTER_NUMBER;
            case OTHER_NUMBER:
                return javaCategory == Character.OTHER_NUMBER;

            case SEPARATOR:
                return javaCategory == Character.SPACE_SEPARATOR
                        || javaCategory == Character.LINE_SEPARATOR
                        || javaCategory == Character.PARAGRAPH_SEPARATOR;
            case SPACE_SEPARATOR:
                return javaCategory == Character.SPACE_SEPARATOR;
            case LINE_SEPARATOR:
                return javaCategory == Character.LINE_SEPARATOR;
            case PARAGRAPH_SEPARATOR:
                return javaCategory == Character.PARAGRAPH_SEPARATOR;

            case OTHER:
                // General category C is the union Cc | Cf | Cn | Co | Cs. It is unrelated to
                // the "Other_*" sub-categories of letters, numbers, punctuation and symbols.
                return javaCategory == Character.CONTROL
                        || javaCategory == Character.FORMAT
                        || javaCategory == Character.UNASSIGNED
                        || javaCategory == Character.PRIVATE_USE
                        || javaCategory == Character.SURROGATE;
            case CONTROL:
                return javaCategory == Character.CONTROL;
            case FORMAT:
                return javaCategory == Character.FORMAT;
            case SURROGATE:
                return javaCategory == Character.SURROGATE;
            case PRIVATE_USE:
                return javaCategory == Character.PRIVATE_USE;
            case UNASSIGNED:
                return javaCategory == Character.UNASSIGNED;

            case PUNCTUATION:
                return javaCategory == Character.CONNECTOR_PUNCTUATION
                        || javaCategory == Character.DASH_PUNCTUATION
                        || javaCategory == Character.START_PUNCTUATION
                        || javaCategory == Character.END_PUNCTUATION
                        || javaCategory == Character.OTHER_PUNCTUATION
                        || javaCategory == Character.INITIAL_QUOTE_PUNCTUATION
                        || javaCategory == Character.FINAL_QUOTE_PUNCTUATION;
            case DASH_PUNCTUATION:
                return javaCategory == Character.DASH_PUNCTUATION;
            case OPEN_PUNCTUATION:
                return javaCategory == Character.START_PUNCTUATION;
            case CLOSE_PUNCTUATION:
                return javaCategory == Character.END_PUNCTUATION;
            case CONNECTOR_PUNCTUATION:
                return javaCategory == Character.CONNECTOR_PUNCTUATION;
            case OTHER_PUNCTUATION:
                return javaCategory == Character.OTHER_PUNCTUATION;
            case INITIAL_PUNCTUATION:
                return javaCategory == Character.INITIAL_QUOTE_PUNCTUATION;
            case FINAL_PUNCTUATION:
                return javaCategory == Character.FINAL_QUOTE_PUNCTUATION;

            case SYMBOL:
                return javaCategory == Character.MATH_SYMBOL
                        || javaCategory == Character.CURRENCY_SYMBOL
                        || javaCategory == Character.MODIFIER_SYMBOL
                        || javaCategory == Character.OTHER_SYMBOL;
            case MATH_SYMBOL:
                return javaCategory == Character.MATH_SYMBOL;
            case CURRENCY_SYMBOL:
                return javaCategory == Character.CURRENCY_SYMBOL;
            case MODIFIER_SYMBOL:
                return javaCategory == Character.MODIFIER_SYMBOL;
            case OTHER_SYMBOL:
                return javaCategory == Character.OTHER_SYMBOL;

            default:
                return false;
        }
    }

    /** Checks if a code point is an ASCII hex digit (0-9, a-f, A-F). */
    private static boolean isHexDigit(int codePoint) {
        return (codePoint >= '0' && codePoint <= '9')
                || (codePoint >= 'a' && codePoint <= 'f')
                || (codePoint >= 'A' && codePoint <= 'F');
    }

    /**
     * Checks if a code point has the Unicode Hex_Digit property: the ASCII hex digits plus their
     * fullwidth forms. Character.digit(cp, 16) is not usable here because it also accepts every
     * Unicode decimal digit (for example the Arabic-Indic digits).
     */
    private static boolean isUnicodeHexDigit(int codePoint) {
        return isHexDigit(codePoint)
                || (codePoint >= 0xFF10 && codePoint <= 0xFF19) // fullwidth 0-9
                || (codePoint >= 0xFF21 && codePoint <= 0xFF26) // fullwidth A-F
                || (codePoint >= 0xFF41 && codePoint <= 0xFF46); // fullwidth a-f
    }
}