// WikiNoun.java
package org.atteo.evo.inflector;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
 * An English noun headword described by an {@code en-noun} template.
 *
 * <p>An instance is built from the singular form and the raw parameter string of the
 * template (everything between the template name and the closing braces, including the
 * leading {@code |} of each parameter). The parameter string is interpreted once, in the
 * constructor, to produce the list of plural forms and the noun type.
 */
public class WikiNoun {
    /** Matches an {@code en-noun} template; group 1 is the raw parameter string. */
    private static final Pattern EN_NOUN_PATTERN =
            Pattern.compile("\\{\\{en-noun((\\|[\\-\\+~!?a-z\\[\\] =]+)*)\\}\\}");

    private final String singular;
    private final List<String> plurals = new ArrayList<>();
    private final String ennoun;
    private NounType nounType;

    /**
     * Creates a noun and derives its plurals from the template parameters.
     *
     * @param word   the singular form
     * @param ennoun the raw template parameters, e.g. {@code "|s"} or {@code "|-|es"};
     *               an empty string means the template had no parameters
     */
    public WikiNoun(String word, String ennoun) {
        this.singular = word;
        this.ennoun = ennoun;
        calculatePlurals(singular, ennoun);
    }

    /**
     * Fills {@link #plurals} and {@link #nounType} from the template parameters.
     *
     * <p>Named parameters ({@code name=value}) never denote a plural form and are skipped.
     * When no positional parameter is present the noun is treated as countable with the
     * default plural.
     */
    private void calculatePlurals(String singular, String ennoun) {
        var split = ennoun.split("\\|");
        // Index 0 holds whatever precedes the first '|' and is never a parameter.
        for (var i = 1; i < split.length; i++) {
            var parameter = split[i];
            if (parameter.indexOf('=') >= 0) {
                // Named parameter such as "head=..."; it is not an inflection.
                continue;
            }
            var noun = interpretInflection(singular, parameter);
            if (nounType == null) {
                // first entry is the most common
                nounType = noun.type;
            }
            plurals.add(noun.plural);
        }
        if (plurals.isEmpty()) {
            plurals.add(defaultPlural(singular));
        }
        if (nounType == null) {
            // No positional parameter at all: the default plural added above is a
            // regular countable plural, so the type must not be left unset.
            nounType = NounType.COUNTABLE;
        }
    }

    /** Result of interpreting one positional template parameter: a noun type and a plural form. */
    static final class Noun {
        final NounType type;
        final String plural;

        private Noun(NounType type, String plural) {
            this.type = type;
            this.plural = plural;
        }

        /** Returns a countable noun with the given plural. */
        public static Noun countable(String plural) {
            return new Noun(NounType.COUNTABLE, plural);
        }

        /** Returns an uncountable noun with the given plural. */
        public static Noun uncountable(String plural) {
            return new Noun(NounType.UNCOUNTABLE, plural);
        }

        /** Returns a noun whose plural is not attested; the plural is the empty string. */
        public static Noun pluralNotAttested() {
            return new Noun(NounType.PLURAL_NOT_ATTESTED, "");
        }

        /** Returns a noun whose plural is unknown; the plural is the empty string. */
        public static Noun unknownPlural() {
            return new Noun(NounType.UNKNOWN_PLURAL, "");
        }
    }

    /**
     * Interprets a single positional template parameter.
     *
     * @param singular   the singular form, used to build suffix-based and default plurals
     * @param inflection the parameter text: one of the markers {@code - ~ + ! ?}, one of
     *                   the suffixes {@code s} / {@code es}, or a literal plural form
     * @return the noun type and plural the parameter stands for
     */
    private Noun interpretInflection(String singular, String inflection) {
        switch (inflection) {
            case "-":
                // Uncountable: the singular doubles as the plural form.
                return Noun.uncountable(singular);
            case "~":
            case "+":
                return Noun.countable(defaultPlural(singular));
            case "!":
                return Noun.pluralNotAttested();
            case "?":
                // unknown or uncertain plural
                return Noun.unknownPlural();
            case "s":
                return Noun.countable(singular + "s");
            case "es":
                return Noun.countable(singular + "es");
            default:
                // Anything else is the plural spelled out in full.
                return Noun.countable(inflection);
        }
    }

    /** Returns the singular form passed to the constructor. */
    public String singular() {
        return singular;
    }

    /**
     * Returns the plural forms in template order; never empty.
     *
     * <p>The returned list is the internal list, not a copy. It contains an empty string
     * for each parameter that marked the plural as unknown or not attested.
     */
    public List<String> plurals() {
        return plurals;
    }

    /** Returns whether the first (most common) sense of the noun is countable. */
    public boolean isCountable() {
        return nounType == NounType.COUNTABLE;
    }

    /** Returns whether the first (most common) sense of the noun is uncountable. */
    public boolean isUncountable() {
        return nounType == NounType.UNCOUNTABLE;
    }

    /** Returns whether the first parameter marked the plural as unknown. */
    public boolean isPluralUnknown() {
        return nounType == NounType.UNKNOWN_PLURAL;
    }

    /** Returns whether the first parameter marked the plural as not attested. */
    public boolean isPluralNotAttested() {
        return nounType == NounType.PLURAL_NOT_ATTESTED;
    }

    /** Returns the raw template parameter string passed to the constructor. */
    public String ennoun() {
        return ennoun;
    }

    /**
     * Builds the regular English plural of a word.
     *
     * <ul>
     *   <li>ending in {@code s}, {@code x}, {@code z}, {@code sh} or {@code ch}: append {@code es};</li>
     *   <li>ending in {@code y} preceded by a character other than a lowercase vowel:
     *       replace {@code y} with {@code ies};</li>
     *   <li>otherwise (including the one-letter word {@code "y"}): append {@code s}.</li>
     * </ul>
     */
    private static String defaultPlural(String singular) {
        if (singular.endsWith("s")
                || singular.endsWith("x")
                || singular.endsWith("z")
                || singular.endsWith("sh")
                || singular.endsWith("ch")) {
            return singular + "es";
        }
        if (singular.endsWith("y")) {
            var length = singular.length();
            // A lone "y" has no preceding consonant, so the y -> ies rule does not apply.
            if (length < 2 || isVowel(singular.charAt(length - 2))) {
                return singular + "s";
            }
            return singular.substring(0, length - 1) + "ies";
        }
        return singular + "s";
    }

    /** Returns whether {@code c} is one of the lowercase vowels {@code a e i o u}. */
    private static boolean isVowel(char c) {
        return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
    }

    /**
     * Finds every {@code en-noun} template in the text of a page's revision.
     *
     * @param page the page to scan; its title is used as the singular form
     * @return one {@code WikiNoun} per matched template, in order of appearance;
     *         empty when the text contains none
     */
    public static List<WikiNoun> find(Page page) {
        var matcher = EN_NOUN_PATTERN.matcher(page.getRevision().getText());
        var nouns = new ArrayList<WikiNoun>();
        while (matcher.find()) {
            nouns.add(new WikiNoun(page.getTitle(), matcher.group(1)));
        }
        return nouns;
    }
}