English.java

/*
 * Copyright 2011 Atteo.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package org.atteo.evo.inflector;

/**
 * Transforms English words from singular to plural form.
 *
 * Examples:
 * <pre>
 *    English.plural("word") = "words";
 *
 *    English.plural("cat", 1) = "cat";
 *    English.plural("cat", 2) = "cats";
 * </pre>
 *
 * Based on <a href="http://www.csse.monash.edu.au/~damian/papers/HTML/Plurals.html">
 * An Algorithmic Approach to English Pluralization</a> by Damian Conway.
 */
public class English {
    /**
     * Selects which pluralization rule set to use.
     */
    public enum MODE {
        /**
         * Uses the anglicized plural forms where alternatives exist.
         */
        ENGLISH_ANGLICIZED,

        /**
         * Uses the classical plural forms where alternatives exist.
         */
        ENGLISH_CLASSICAL
    }

    private static final String[] CATEGORY_EX_ICES = {
        "codex", "murex", "silex",
    };

    private static final String[] CATEGORY_IX_ICES = {
        "radix", "helix",
    };

    private static final String[] CATEGORY_UM_A = {
        "bacterium", "agendum", "desideratum", "erratum", "stratum", "datum", "ovum", "extremum", "candelabrum",
    };

    // Always us -> i
    private static final String[] CATEGORY_US_I = {
        "alumnus", "alveolus", "bacillus", "bronchus", "locus", "nucleus", "stimulus", "meniscus", "thesaurus",
    };

    private static final String[] CATEGORY_ON_A = {
        "criterion",
        "perihelion",
        "aphelion",
        "phenomenon",
        "prolegomenon",
        "noumenon",
        "organon",
        "asyndeton",
        "hyperbaton",
    };

    private static final String[] CATEGORY_A_AE = {"alumna", "alga", "vertebra", "persona"};

    // Always o -> os
    private static final String[] CATEGORY_O_OS = {
        "albino",
        "archipelago",
        "armadillo",
        "commando",
        "crescendo",
        "fiasco",
        "ditto",
        "dynamo",
        "embryo",
        "ghetto",
        "guano",
        "inferno",
        "jumbo",
        "lumbago",
        "magneto",
        "manifesto",
        "medico",
        "octavo",
        "photo",
        "pro",
        "quarto",
        "canto",
        "lingo",
        "generalissimo",
        "stylo",
        "rhino",
        "casino",
        "auto",
        "macro",
        "zero",
        "todo"
    };

    // Classical o -> i  (normally -> os)
    private static final String[] CATEGORY_O_I = {
        "solo", "soprano", "basso", "alto", "contralto", "tempo", "piano", "virtuoso",
    };

    private static final String[] CATEGORY_EN_INA = {"stamen", "foramen", "lumen"};

    // -a to -as (anglicized) or -ata (classical)
    private static final String[] CATEGORY_A_ATA = {
        "anathema", "enema", "oedema", "bema", "enigma", "sarcoma",
        "carcinoma", "gumma", "schema", "charisma", "lemma", "soma",
        "diploma", "lymphoma", "stigma", "dogma", "magma", "stoma",
        "drama", "melisma", "trauma", "edema", "miasma"
    };

    private static final String[] CATEGORY_IS_IDES = {"iris", "clitoris"};

    // -us to -uses (anglicized) or -us (classical)
    private static final String[] CATEGORY_US_US = {
        "apparatus", "impetus", "prospectus", "cantus", "nexus", "sinus", "coitus", "plexus", "status", "hiatus"
    };

    private static final String[] CATEGORY_NONE_I = {"afreet", "afrit", "efreet"};

    private static final String[] CATEGORY_NONE_IM = {"cherub", "goy", "seraph"};

    private static final String[] CATEGORY_EX_EXES = {
        "apex", "latex", "vertex", "cortex", "pontifex", "vortex", "index", "simplex"
    };

    private static final String[] CATEGORY_IX_IXES = {"appendix"};

    private static final String[] CATEGORY_S_ES = {
        "acropolis",
        "chaos",
        "lens",
        "aegis",
        "cosmos",
        "mantis",
        "alias",
        "dais",
        "marquis",
        "asbestos",
        "digitalis",
        "metropolis",
        "atlas",
        "epidermis",
        "pathos",
        "bathos",
        "ethos",
        "pelvis",
        "bias",
        "gas",
        "polis",
        "caddis",
        "glottis",
        "rhinoceros",
        "cannabis",
        "glottis",
        "sassafras",
        "canvas",
        "ibis",
        "trellis"
    };

    private static final String[] CATEGORY_MAN_MANS = {
        "human",
        "Alabaman",
        "Bahaman",
        "Burman",
        "German",
        "Hiroshiman",
        "Liman",
        "Nakayaman",
        "Oklahoman",
        "Panaman",
        "Selman",
        "Sonaman",
        "Tacoman",
        "Yakiman",
        "Yokohaman",
        "Yuman"
    };

    private static final CompiledInflector ANGLICIZED_ENGINE = buildEngine(MODE.ENGLISH_ANGLICIZED);
    private static final CompiledInflector CLASSICAL_ENGINE = buildEngine(MODE.ENGLISH_CLASSICAL);

    private static volatile English inflector = new English();

    private final CompiledInflector engine;

    /**
     * Creates an inflector using anglicized pluralization rules.
     */
    public English() {
        this(MODE.ENGLISH_ANGLICIZED);
    }

    /**
     * Creates an inflector for the selected pluralization mode.
     *
     * @param mode pluralization rule set to use
     */
    public English(MODE mode) {
        engine = mode == MODE.ENGLISH_CLASSICAL ? CLASSICAL_ENGINE : ANGLICIZED_ENGINE;
    }

    /**
     * Returns plural form of the given word.
     *
     * @param word word in singular form
     * @return plural form of the word
     */
    public String getPlural(String word) {
        return engine.pluralize(word);
    }

    /**
     * Returns singular or plural form of the word based on count.
     *
     * @param word word in singular form
     * @param count word count
     * @return form of the word correct for given count
     */
    public String getPlural(String word, int count) {
        if (count == 1) {
            return word;
        }
        return getPlural(word);
    }

    /**
     * Returns plural form of the given word.
     *
     * For instance:
     * <pre>
     * {@code
     * English.plural("cat") == "cats";
     * }
     * </pre>
     *
     * @param word word in singular form
     * @return plural form of given word
     */
    public static String plural(String word) {
        return inflector.getPlural(word);
    }

    /**
     * Returns singular or plural form of the word based on count.
     *
     * For instance:
     * <pre>
     * {@code
     * English.plural("cat", 1) == "cat";
     * English.plural("cat", 2) == "cats";
     * }
     * </pre>
     *
     * @param word word in singular form
     * @param count word count
     * @return form of the word correct for given count
     */
    public static String plural(String word, int count) {
        return inflector.getPlural(word, count);
    }

    /**
     * Sets the global pluralization mode used by the static helper methods.
     *
     * @param mode pluralization rule set to use for static calls
     */
    public static void setMode(MODE mode) {
        var newInflector = new English(mode);
        inflector = newInflector;
    }

    private static CompiledInflector buildEngine(MODE mode) {
        CompiledInflector.Builder builder = CompiledInflector.builder();

        builder.addIdentityCategory(new String[] {
            "fish",
            "ois",
            "sheep",
            "deer",
            "pox",
            "itis",
            "bison",
            "flounder",
            "pliers",
            "bream",
            "gallows",
            "proceedings",
            "breeches",
            "graffiti",
            "rabies",
            "britches",
            "headquarters",
            "salmon",
            "carp",
            "herpes",
            "scissors",
            "chassis",
            "high-jinks",
            "sea-bass",
            "clippers",
            "homework",
            "series",
            "cod",
            "innings",
            "shears",
            "contretemps",
            "jackanapes",
            "species",
            "corps",
            "mackerel",
            "swine",
            "debris",
            "measles",
            "trout",
            "diabetes",
            "mews",
            "tuna",
            "djinn",
            "mumps",
            "whiting",
            "eland",
            "news",
            "wildebeest",
            "elk",
            "pincers",
            "sugar"
        });

        addIrregular(builder, "child", "children");
        addIrregular(builder, "ephemeris", "ephemerides");
        addIrregular(builder, "mongoose", "mongoose");
        addIrregular(builder, "mythos", "mythoi");
        addIrregular(builder, "soliloquy", "soliloquies");
        addIrregular(builder, "trilby", "trilbys");
        addIrregular(builder, "genus", "genera");
        addIrregular(builder, "quiz", "quizzes");

        if (mode == MODE.ENGLISH_ANGLICIZED) {
            addIrregular(builder, "beef", "beefs");
            addIrregular(builder, "brother", "brothers");
            addIrregular(builder, "cow", "cows");
            addIrregular(builder, "genie", "genies");
            addIrregular(builder, "money", "moneys");
            addIrregular(builder, "octopus", "octopuses");
            addIrregular(builder, "opus", "opuses");
        } else {
            addIrregular(builder, "beef", "beeves");
            addIrregular(builder, "brother", "brethren");
            addIrregular(builder, "cow", "kine");
            addIrregular(builder, "genie", "genii");
            addIrregular(builder, "money", "monies");
            addIrregular(builder, "octopus", "octopodes");
            addIrregular(builder, "opus", "opera");
        }

        builder.addCategoryRule(CATEGORY_MAN_MANS, 0, "s");

        builder.addSuffixRule("man", 2, "en");
        builder.addSuffixRule("mouse", 4, "ice");
        builder.addSuffixRule("louse", 4, "ice");
        builder.addSuffixRule("tooth", 4, "eeth");
        builder.addSuffixRule("goose", 4, "eese");
        builder.addSuffixRule("foot", 3, "eet");
        builder.addSuffixRule("zoon", 3, "oa");
        builder.addSuffixRule("is", CompiledInflector.previousCharIn("csx"), 2, "es");

        builder.addCategoryRule(CATEGORY_EX_ICES, 2, "ices");
        builder.addCategoryRule(CATEGORY_IX_ICES, 2, "ices");
        builder.addCategoryRule(CATEGORY_UM_A, 2, "a");
        builder.addCategoryRule(CATEGORY_ON_A, 2, "a");
        builder.addCategoryRule(CATEGORY_A_AE, 1, "ae");

        if (mode == MODE.ENGLISH_CLASSICAL) {
            builder.addSuffixRule("trix", 4, "trices");
            builder.addSuffixRule("eau", 0, "x");
            builder.addSuffixRule("ieu", 0, "x");
            builder.addSuffixRule(
                    "nx",
                    CompiledInflector.and(
                            CompiledInflector.suffixStartAtLeast(3), CompiledInflector.previousCharIn("iay")),
                    2,
                    "nges");
            builder.addCategoryRule(CATEGORY_EN_INA, 2, "ina");
            builder.addCategoryRule(CATEGORY_A_ATA, 1, "ata");
            builder.addCategoryRule(CATEGORY_IS_IDES, 2, "ides");
            builder.addIdentityCategory(CATEGORY_US_US);
            builder.addCategoryRule(CATEGORY_O_I, 1, "i");
            builder.addCategoryRule(CATEGORY_NONE_I, 0, "i");
            builder.addCategoryRule(CATEGORY_NONE_IM, 0, "im");
            builder.addCategoryRule(CATEGORY_EX_EXES, 2, "ices");
            builder.addCategoryRule(CATEGORY_IX_IXES, 2, "ices");
        }

        builder.addCategoryRule(CATEGORY_US_I, 2, "i");

        builder.addSuffixRule("ch", 0, "es");
        builder.addSuffixRule("sh", 0, "es");
        builder.addSuffixRule("z", 0, "es");
        builder.addSuffixRule("x", 0, "es");
        builder.addCategoryRule(CATEGORY_S_ES, 0, "es");
        builder.addCategoryRule(CATEGORY_IS_IDES, 0, "es");
        builder.addCategoryRule(CATEGORY_US_US, 0, "es");
        builder.addSuffixRule("us", 0, "es");
        builder.addCategoryRule(CATEGORY_A_ATA, 0, "s");
        builder.addSuffixRule("ss", 0, "es");

        builder.addSuffixRule("lf", CompiledInflector.previousCharIn("aeo"), 1, "ves");
        builder.addSuffixRule("eaf", CompiledInflector.previousCharNot('d'), 1, "ves");
        builder.addSuffixRule("arf", 1, "ves");
        builder.addSuffixRule("ife", CompiledInflector.previousCharIn("nlw"), 2, "ves");

        builder.addSuffixRule("y", CompiledInflector.previousCharIn("aeiou"), 0, "s");
        builder.addSuffixRule("y", 1, "ies");

        builder.addCategoryRule(CATEGORY_O_I, 1, "os");
        builder.addCategoryRule(CATEGORY_O_OS, 1, "os");
        builder.addSuffixRule("o", CompiledInflector.previousCharIn("aeiou"), 0, "s");
        builder.addSuffixRule("o", 0, "es");

        builder.addSuffixRule("ulum", 2, "a");
        builder.addCategoryRule(CATEGORY_A_ATA, 0, "es");
        builder.addSuffixRule("s", 0, "es");
        builder.addSuffixRule("", (lowerWord, suffixStart) -> lowerWord.isEmpty(), 0, "");
        builder.addSuffixRule("", 0, "s");

        return builder.build();
    }

    private static void addIrregular(CompiledInflector.Builder builder, String singular, String plural) {
        if (singular.charAt(0) == plural.charAt(0)) {
            builder.addPreservedInitialRule(singular, plural);
        } else {
            builder.addWholeWordRule(singular, plural);
        }
    }
}