EnglishInflectorTest.java

/*
 * Copyright 2011 Atteo.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package org.atteo.evo.inflector;

import java.io.FileWriter;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.io.Writer;
import java.net.URLEncoder;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

import org.junit.jupiter.api.Test;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.assertj.core.api.Assertions.assertThat;

/**
 * Tests for the {@link English} inflector.
 *
 * <p>Besides the fixed word lists, {@link #wiktionaryTest()} compares the inflector's output
 * against the nouns produced by {@link WikiParser}, prints summary statistics to standard output
 * and writes a Markdown table of the countable nouns it got wrong.
 */
public class EnglishInflectorTest {
	/** Location of the Markdown report listing countable nouns that were pluralized incorrectly. */
	private static final String INCORRECT_COUNTABLE_REPORT = "reports/incorrect-countable.md";

	/** A progress summary is printed every time this many nouns have been processed. */
	private static final int SUMMARY_INTERVAL = 10000;

	private final English inflector = new English();

	/**
	 * Runs the inflector over every single-word noun page delivered by {@link WikiParser} and
	 * tallies how often the calculated plural is among the plurals listed for that noun.
	 *
	 * <p>Incorrectly pluralized countable nouns are appended to
	 * {@value #INCORRECT_COUNTABLE_REPORT} (UTF-8) and echoed to standard output.
	 *
	 * @throws IOException if the report file cannot be opened, written or closed
	 */
	@Test
	public void wiktionaryTest() throws IOException {
		// AtomicInteger is used so the counters can be updated from inside the lambda.
		AtomicInteger all = new AtomicInteger();
		AtomicInteger countable = new AtomicInteger();
		AtomicInteger correctCountable = new AtomicInteger();
		AtomicInteger uncountable = new AtomicInteger();
		AtomicInteger correctUncountable = new AtomicInteger();
		AtomicInteger pluralNotAttested = new AtomicInteger();
		AtomicInteger pluralUnknown = new AtomicInteger();

		// try-with-resources guarantees the report is flushed and closed even when parsing fails;
		// the charset is explicit because the report contains arbitrary Wiktionary words.
		try (Writer incorrectCountable = Files.newBufferedWriter(Paths.get(INCORRECT_COUNTABLE_REPORT), UTF_8)) {
			incorrectCountable.append("|Singular|Evo-Inflector plural|Wiktionary plurals|\n");
			incorrectCountable.append("|--------|--------------------|------------------|\n");

			new WikiParser().parse(page -> {
				// Only titles without a space or a colon are considered.
				if (page.getTitle().contains(" ") || page.getTitle().contains(":")) {
					return;
				}

				List<WikiNoun> wikiNouns = WikiNoun.find(page);
				if (wikiNouns.isEmpty()) {
					return;
				}

				if (all.incrementAndGet() % SUMMARY_INTERVAL == 0) {
					printSummary(countable, correctCountable, uncountable, correctUncountable,
						pluralNotAttested, pluralUnknown);
				}

				String calculatedPlural = inflector.getPlural(wikiNouns.get(0).singular());

				// The answer counts as correct if any noun entry on the page lists it as a plural.
				Optional<WikiNoun> correctNoun = wikiNouns.stream()
					.filter(noun -> noun.plurals().contains(calculatedPlural))
					.findFirst();
				boolean correct = correctNoun.isPresent();

				// Classify by the matching entry, falling back to the first entry on the page.
				WikiNoun wikiNoun = correctNoun.orElse(wikiNouns.get(0));

				if (wikiNoun.isUncountable()) {
					uncountable.getAndIncrement();
					if (correct) {
						correctUncountable.getAndIncrement();
					}
					return;
				}

				if (wikiNoun.isPluralNotAttested()) {
					pluralNotAttested.getAndIncrement();
					return;
				}

				if (wikiNoun.isPluralUnknown()) {
					pluralUnknown.getAndIncrement();
					return;
				}

				countable.getAndIncrement();

				if (correct) {
					correctCountable.getAndIncrement();
					return;
				}

				reportIncorrectCountable(incorrectCountable, wikiNoun, calculatedPlural, wikiNouns);
			});

			printSummary(countable, correctCountable, uncountable, correctUncountable,
				pluralNotAttested, pluralUnknown);
		}
	}

	/**
	 * Appends one table row for an incorrectly pluralized countable noun to the report and prints
	 * the same information to standard output.
	 *
	 * @param report           writer of the Markdown report
	 * @param wikiNoun         the noun entry used for classification
	 * @param calculatedPlural the plural returned by the inflector
	 * @param wikiNouns        all noun entries found on the page
	 * @throws UncheckedIOException if writing to the report fails
	 */
	private static void reportIncorrectCountable(Writer report, WikiNoun wikiNoun, String calculatedPlural,
			List<WikiNoun> wikiNouns) {
		String wiktionaryPlurals = wikiNouns.stream()
			.flatMap(noun -> noun.plurals().stream())
			.collect(Collectors.joining(","));
		String ennouns = wikiNouns.stream()
			.map(WikiNoun::ennoun)
			.collect(Collectors.joining(","));

		try {
			String uriEncodedSingular = URLEncoder.encode(wikiNoun.singular(), UTF_8.toString());

			report.append("|" + wikiNoun.singular() + " | " + calculatedPlural + " | ["
				+ wiktionaryPlurals + "](https://en.wiktionary.org/wiki/" + uriEncodedSingular + ") |\n");
		} catch (IOException e) {
			// The caller is a lambda, so the checked exception is rethrown unchecked.
			throw new UncheckedIOException(e);
		}

		System.out.println(wikiNoun.singular() + " -> " + calculatedPlural
			+ " Wiktionary says: " + wiktionaryPlurals + " {{en-noun" + ennouns + "}}");
	}

	/**
	 * Prints the current statistics to standard output.
	 *
	 * <p>The total is the sum of the four category counters passed in.
	 */
	private void printSummary(AtomicInteger countable, AtomicInteger correctCountable,
							  AtomicInteger uncountable, AtomicInteger correctUncountable,
							  AtomicInteger pluralNotAttested, AtomicInteger pluralUnknown) {
		int all = countable.get() + uncountable.get() + pluralNotAttested.get() + pluralUnknown.get();

		System.out.println();
		System.out.println("There are (" + LocalDate.now() + ") " + all
			+ " single word english nouns in the English Wiktionary of which:");
		System.out.println("- " + percent(countable.get(), all) + " are countable nouns,");
		System.out.println("- " + percent(uncountable.get(), all) + " are uncountable nouns,");
		System.out.println("- for " + percent(pluralUnknown.get(), all) + " nouns plural is unknown,");
		System.out.println("- for " + percent(pluralNotAttested.get(), all) + " nouns plural is not attested.");
		System.out.println();
		System.out.println("Evo Inflector returns correct answer for: ");
		System.out.println("- " + percent(correctCountable.get(), countable.get())
			+ " of all countable nouns, see [this report](" + INCORRECT_COUNTABLE_REPORT + "),");
		System.out.println("- but only for " + percent(correctUncountable.get(), uncountable.get())
			+ " of uncountable nouns.");
		System.out.println("In overall it returns correct answer for "
			+ percent(correctCountable.get() + correctUncountable.get(), all) + " of all nouns");
		System.out.println();
	}

	/**
	 * Formats {@code count} as a percentage of {@code all}, followed by the raw count in
	 * parentheses, e.g. {@code "25.0% (1)"}.
	 *
	 * @param count the part
	 * @param all   the whole; when zero the percentage is reported as {@code 0.0} instead of NaN
	 * @return the formatted percentage
	 */
	private String percent(int count, int all) {
		// The product is computed as long so it cannot overflow int.
		float percent = all == 0 ? 0f : count * 100L / (float) all;
		return percent + "% (" + count + ")";
	}

	/** Checks a list of representative singular/plural pairs. */
	@Test
	public void exampleWordList() {
		check(new String[][] {
			{ "alga", "algae" },
			{ "nova", "novas" },
			{ "dogma", "dogmas" },
			{ "Woman", "Women" },
			{ "church", "churches" },
			{ "quick_chateau", "quick_chateaus" },
			{ "codex", "codices" },
			{ "index", "indexes" },
			{ "basis", "bases" },
			{ "iris", "irises" },
			{ "phalanx", "phalanxes" },
			{ "tempo", "tempos" },
			{ "foot", "feet" },
			{ "series", "series" },
			{ "wish", "wishes" },
			{ "Bacterium", "Bacteria" },
			{ "medium", "mediums" },
			{ "Genus", "Genera" },
			{ "stimulus", "stimuli" },
			{ "opus", "opuses" },
			{ "status", "statuses" },
			{ "Box", "Boxes" },
			{ "ferry", "ferries" },
			{ "protozoon", "protozoa" },
			{ "cherub", "cherubs" },
			{ "human", "humans" },
			{ "sugar", "sugar" },
			{ "virus", "viruses" },
			{ "gastrostomy", "gastrostomies" },
			{ "baculum", "bacula" },
			{ "pancreas", "pancreases" },
			{ "todo", "todos"},
			{ "person", "persons"},
			{ "baculumulum", "baculumula" }, // https://github.com/atteo/evo-inflector/pull/18
			{ "", "" },
		});
	}

	/** Checks that the capitalization of the singular is carried over to the plural. */
	@Test
	public void shouldPreserveCapitalLetters() {
		check(new String[][] {
			{ "Milieu", "Milieus" },
			{ "NightWolf", "NightWolves" },
			{ "WorldAtlas", "WorldAtlases" },
			{ "SMS", "SMSes" },
			{ "bacuLum", "bacuLa" },
			{ "alO", "alOes" },
			{ "luO", "luOs" },
			{ "boY", "boYs" },
			{ "Foot", "Feet" },
			{ "Goose", "Geese" },
			{ "Man", "Men" },
			{ "Tooth", "Teeth" },
		});
	}

	/** Checks the count-aware overload: a count of 1 keeps the singular, 2 yields the plural. */
	@Test
	public void withCount() {
		assertThat(inflector.getPlural("cat", 1)).isEqualTo("cat");
		assertThat(inflector.getPlural("cat", 2)).isEqualTo("cats");

		assertThat(inflector.getPlural("demoness", 1)).isEqualTo("demoness");
		assertThat(inflector.getPlural("demoness", 2)).isEqualTo("demonesses");
	}

	/** Checks the static convenience methods of {@link English}. */
	@Test
	public void staticMethods() {
		assertThat(English.plural("sulfimide")).isEqualTo("sulfimides");
		assertThat(English.plural("semifluid", 2)).isEqualTo("semifluids");
	}

	/**
	 * Asserts every pair in {@code list}.
	 *
	 * @param list rows of {@code { singular, expectedPlural }}
	 */
	private void check(String[][] list) {
		for (String[] pair : list) {
			check(pair[0], pair[1]);
		}
	}

	/**
	 * Asserts that the inflector turns {@code singular} into {@code plural}.
	 *
	 * @param singular the word to pluralize
	 * @param plural   the expected plural
	 */
	private void check(String singular, String plural) {
		// The description names the offending word when a row of a data table fails.
		assertThat(inflector.getPlural(singular))
			.as("plural of '%s'", singular)
			.isEqualTo(plural);
	}
}