English.java

/*
 * Copyright 2011 Atteo.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package org.atteo.evo.inflector;

/**
 * Transforms English words from singular to plural form.
 * <p>
 * Examples:
 * </p>
 * <pre>
 *    English.plural("word") = "words";
 *
 *    English.plural("cat", 1) = "cat";
 *    English.plural("cat", 2) = "cats";
 * </pre>
 * <p>
 * Based on <a href="http://www.csse.monash.edu.au/~damian/papers/HTML/Plurals.html">
 * An Algorithmic Approach to English Pluralization</a> by Damian Conway.
 * </p>
 * <p>
 * The static {@code plural} methods delegate to a shared instance which is created with
 * {@link MODE#ENGLISH_ANGLICIZED} and can be replaced through {@link #setMode(MODE)}.
 * </p>
 */
public class English extends TwoFormInflector {
	/**
	 * Selects which plural is produced for words that have both a modern (anglicized)
	 * and a classical plural form.
	 */
	public enum MODE {
		ENGLISH_ANGLICIZED, ENGLISH_CLASSICAL
	}

	// -ex -> -ices (both modes)
	private static final String[] CATEGORY_EX_ICES = { "codex", "murex",
			"silex", };

	// -ix -> -ices (both modes)
	private static final String[] CATEGORY_IX_ICES = { "radix", "helix", };

	// -um -> -a (both modes)
	private static final String[] CATEGORY_UM_A = { "bacterium",
			"agendum", "desideratum", "erratum", "stratum", "datum", "ovum",
			"extremum", "candelabrum", };

	// Always us -> i
	private static final String[] CATEGORY_US_I = { "alumnus", "alveolus",
			"bacillus", "bronchus", "locus", "nucleus", "stimulus", "meniscus",
			"thesaurus", };

	// -on -> -a (both modes)
	private static final String[] CATEGORY_ON_A = { "criterion",
			"perihelion", "aphelion", "phenomenon", "prolegomenon", "noumenon",
			"organon", "asyndeton", "hyperbaton", };

	// -a -> -ae (both modes)
	private static final String[] CATEGORY_A_AE = { "alumna", "alga",
			"vertebra", "persona" };

	// Always o -> os
	private static final String[] CATEGORY_O_OS = { "albino",
			"archipelago", "armadillo", "commando", "crescendo", "fiasco",
			"ditto", "dynamo", "embryo", "ghetto", "guano", "inferno", "jumbo",
			"lumbago", "magneto", "manifesto", "medico", "octavo", "photo",
			"pro", "quarto", "canto", "lingo", "generalissimo", "stylo",
			"rhino", "casino", "auto", "macro", "zero", "todo"
	};

	// Classical o -> i  (normally -> os)
	private static final String[] CATEGORY_O_I = {
			"solo", "soprano", "basso", "alto", "contralto", "tempo", "piano",
			"virtuoso", };

	// -en -> -ina (classical mode only)
	private static final String[] CATEGORY_EN_INA = {
			"stamen", "foramen", "lumen"
	};

	// -a to -as (anglicized) or -ata (classical)
	private static final String[] CATEGORY_A_ATA = {
			"anathema", "enema", "oedema", "bema", "enigma", "sarcoma",
			"carcinoma", "gumma", "schema", "charisma", "lemma", "soma",
			"diploma", "lymphoma", "stigma", "dogma", "magma", "stoma",
			"drama", "melisma", "trauma", "edema", "miasma"
	};

	// -is to -ises (anglicized) or -ides (classical)
	private static final String[] CATEGORY_IS_IDES = {
			"iris", "clitoris"
	};

	// -us to -uses (anglicized) or -us (classical)
	private static final String[] CATEGORY_US_US = {
			"apparatus", "impetus", "prospectus", "cantus", "nexus", "sinus", "coitus",
			"plexus", "status", "hiatus"
	};

	// Classical plural appends -i; no dedicated anglicized rule is registered
	private static final String[] CATEGORY_NONE_I = {
		"afreet", "afrit", "efreet"
	};

	// Classical plural appends -im; no dedicated anglicized rule is registered
	private static final String[] CATEGORY_NONE_IM = {
		"cherub", "goy", "seraph"
	};

	// Classical -ex -> -ices; no dedicated anglicized rule is registered
	private static final String[] CATEGORY_EX_EXES = {
		"apex", "latex", "vertex", "cortex", "pontifex", "vortex", "index", "simplex"
	};

	// Classical -ix -> -ices; no dedicated anglicized rule is registered
	private static final String[] CATEGORY_IX_IXES = {
		"appendix"
	};

	// Words ending in -s whose plural appends -es
	private static final String[] CATEGORY_S_ES = {
		"acropolis", "chaos", "lens", "aegis",
		"cosmos", "mantis", "alias", "dais", "marquis", "asbestos",
		"digitalis", "metropolis", "atlas", "epidermis", "pathos",
		"bathos", "ethos", "pelvis", "bias", "gas", "polis", "caddis",
		"glottis", "rhinoceros", "cannabis", "sassafras",
		"canvas", "ibis", "trellis"
	};

	// Words ending in -man whose plural appends -s instead of becoming -men
	private static final String[] CATEGORY_MAN_MANS = {
		"human", "Alabaman", "Bahaman", "Burman", "German", "Hiroshiman", "Liman", "Nakayaman", "Oklahoman",
		"Panaman", "Selman", "Sonaman", "Tacoman", "Yakiman", "Yokohaman", "Yuman"
	};

	// Shared instance behind the static plural(...) helpers; replaced by setMode(MODE).
	private static English inflector = new English();


	/**
	 * Creates an inflector producing anglicized plurals
	 * ({@link MODE#ENGLISH_ANGLICIZED}).
	 */
	public English() {
		this(MODE.ENGLISH_ANGLICIZED);
	}

	/**
	 * Creates an inflector for the given mode and registers all pluralization rules.
	 * <p>
	 * The step numbers in the comments below refer to the algorithm in Conway's paper.
	 * NOTE(review): the registration order looks significant (specific rules first,
	 * catch-all rules last); the matching strategy itself lives in
	 * {@code TwoFormInflector} and should be confirmed there before reordering.
	 * </p>
	 *
	 * @param mode whether anglicized or classical plurals are preferred
	 */
	public English(MODE mode) {

		uncountable(new String[] {
			// 2. Handle words that do not inflect in the plural (such as fish, travois, chassis).
			// Conway's paper also lists nationalities ending in -ese here; no such ending
			// is registered below.
			// endings
			"fish", "ois", "sheep", "deer", "pox", "itis",

			// words
			"bison", "flounder", "pliers", "bream",
			"gallows", "proceedings", "breeches", "graffiti", "rabies",
			"britches", "headquarters", "salmon", "carp", "herpes",
			"scissors", "chassis", "high-jinks", "sea-bass", "clippers",
			"homework", "series", "cod", "innings", "shears",
			"contretemps", "jackanapes", "species", "corps", "mackerel",
			"swine", "debris", "measles", "trout", "diabetes", "mews",
			"tuna", "djinn", "mumps", "whiting", "eland", "news",
			"wildebeest", "elk", "pincers", "sugar" });

		// 4. Handle standard irregular plurals (mongooses, oxen, etc.)

		irregular(new String[][] {
				{ "child", "children" }, // classical
				{ "ephemeris", "ephemerides" }, // classical
				// Listed explicitly so that the "(g)oose$" rule below does not turn it
				// into "mongeese".
				{ "mongoose", "mongooses" }, // anglicized
				{ "mythos", "mythoi" }, // classical
				// TODO: handle entire word correctly
				//{ "ox", "oxen" }, // classical
				{ "soliloquy", "soliloquies" }, // anglicized
				{ "trilby", "trilbys" }, // anglicized
				{ "genus", "genera" }, // classical
				{ "quiz", "quizzes" },
		});

		if (mode == MODE.ENGLISH_ANGLICIZED) {
			// Anglicized plural
			irregular(new String[][] {
					{ "beef", "beefs" },
					{ "brother", "brothers" },
					{ "cow", "cows" },
					{ "genie", "genies" },
					{ "money", "moneys" },
					{ "octopus", "octopuses" },
					{ "opus", "opuses" },
				});
		} else if (mode == MODE.ENGLISH_CLASSICAL) {
			// Classical plural
			irregular(new String[][] { { "beef", "beeves"},
					{ "brother", "brethren" },
					{ "cow", "kine" }, { "genie", "genii"},
					{ "money", "monies" },
					{ "octopus", "octopodes" },
					{ "opus", "opera" },
			});
		}

		// Registered before the "(m)an$" rule so that e.g. "human" does not become "humen".
		categoryRule(CATEGORY_MAN_MANS, "", "s");

		// questionable
		/*
		 rule(new String[][] {
				{ "(ness)$", "$1" },
				{ "(ality)$", "$1" },
				{ "(icity)$", "$1" },
				{ "(ivity)$", "$1" },
		});
		 */
		// 5. Handle irregular inflections for common suffixes
		rule(new String[][] {
				{ "(m)an$", "$1en" },
				{ "([lm])ouse$", "$1ice" },
				{ "(t)ooth$", "$1eeth" },
				{ "(g)oose$", "$1eese" },
				{ "(f)oot$", "$1eet" },
				{ "(z)oon$", "$1oa" },
				{ "([csx])is$", "$1es" },
		});

		// 6. Handle fully assimilated classical inflections
		categoryRule(CATEGORY_EX_ICES, "ex", "ices");
		categoryRule(CATEGORY_IX_ICES, "ix", "ices");
		categoryRule(CATEGORY_UM_A, "um", "a");
		categoryRule(CATEGORY_ON_A, "on", "a");
		categoryRule(CATEGORY_A_AE, "a", "ae");

		// 7. Handle classical variants of modern inflections
		if (mode == MODE.ENGLISH_CLASSICAL) {
			rule(new String[][]{
					{ "trix$", "trices" },
					{ "eau$", "eaux" },
					{ "ieu$", "ieux" },
					{ "(..[iay])nx$", "$1nges" },
			});
			categoryRule(CATEGORY_EN_INA, "en", "ina");
			categoryRule(CATEGORY_A_ATA, "a", "ata");
			categoryRule(CATEGORY_IS_IDES, "is", "ides");
			categoryRule(CATEGORY_US_US, "", "");
			categoryRule(CATEGORY_O_I, "o", "i");
			categoryRule(CATEGORY_NONE_I, "", "i");
			categoryRule(CATEGORY_NONE_IM, "", "im");
			categoryRule(CATEGORY_EX_EXES, "ex", "ices");
			categoryRule(CATEGORY_IX_IXES, "ix", "ices");
		}

		categoryRule(CATEGORY_US_I, "us", "i");

		rule("([cs]h|[zx])$", "$1es");
		categoryRule(CATEGORY_S_ES, "", "es");
		categoryRule(CATEGORY_IS_IDES, "", "es");
		categoryRule(CATEGORY_US_US, "", "es");
		rule("(us)$", "$1es");
		categoryRule(CATEGORY_A_ATA, "", "s");

		// The suffixes -ch, -sh, and -ss all take -es in the plural (churches,
		// classes, etc)...
		rule(new String[][] { { "([cs])h$", "$1hes" }, { "ss$", "sses" } });

		// Certain words ending in -f or -fe take -ves in the plural (lives,
		// wolves, etc)...
		rule(new String[][] {
				{ "([aeo]l)f$", "$1ves" },
				{ "([^d]ea)f$", "$1ves" },
				{ "(ar)f$", "$1ves" },
				{ "([nlw]i)fe$", "$1ves" }
		});

		// Words ending in a vowel followed by -y take -ys, other words ending in -y take -ies
		rule(new String[][] { { "([aeiou]y)$", "$1s" }, { "y$", "ies" }, });

		// Some words ending in -o take -os (including those preceded by a vowel)
		categoryRule(CATEGORY_O_I, "o", "os");
		categoryRule(CATEGORY_O_OS, "o", "os");
		rule("([aeiou]o)$", "$1s");
		// The rest take -oes
		rule("(o)$", "$1es");

		rule("(ul)um$", "$1a");

		categoryRule(CATEGORY_A_ATA, "", "es");

		rule("(s)$", "$1es");

		// Return empty string for empty string input
		rule("^$", "");
		// Otherwise, assume that the plural just adds -s
		rule("$", "s");
	}

	/**
	 * Returns plural form of the given word.
	 *
	 * @param word word in singular form
	 * @return plural form of the word
	 */
	@Override
	public String getPlural(String word) {
		return super.getPlural(word);
	}

	/**
	 * Returns singular or plural form of the word based on count.
	 * <p>
	 * The word is returned unchanged only when {@code count} is exactly 1; every other
	 * count (including 0 and negative values) yields the plural form.
	 * </p>
	 *
	 * @param word word in singular form
	 * @param count word count
	 * @return form of the word correct for given count
	 */
	public String getPlural(String word, int count) {
		if (count == 1) {
			return word;
		}
		return getPlural(word);
	}

	/**
	 * Returns plural form of the given word, using the shared inflector.
	 * <p>
	 * For instance:
	 * </p>
	 * <pre>
	 * {@code
	 * English.plural("cat"); // "cats"
	 * }
	 * </pre>
	 * @param word word in singular form
	 * @return plural form of given word
	 */
	public static String plural(String word) {
		return inflector.getPlural(word);
	}

	/**
	 * Returns singular or plural form of the word based on count, using the shared
	 * inflector.
	 * <p>
	 * For instance:
	 * </p>
	 * <pre>
	 * {@code
	 * English.plural("cat", 1); // "cat"
	 * English.plural("cat", 2); // "cats"
	 * }
	 * </pre>
	 * @param word word in singular form
	 * @param count word count
	 * @return form of the word correct for given count
	 */
	public static String plural(String word, int count) {
		return inflector.getPlural(word, count);
	}

	/**
	 * Replaces the shared inflector used by the static {@code plural} methods with a
	 * new instance built for the given mode.
	 * <p>
	 * The new inflector is fully constructed before it is assigned, and it is used by
	 * all subsequent calls to the static methods. The assignment is a plain write to a
	 * static field with no synchronization.
	 * </p>
	 *
	 * @param mode mode to use for subsequent static {@code plural} calls
	 */
	public static void setMode(MODE mode) {
		English newInflector = new English(mode);
		inflector = newInflector;
	}
}