// WiktionaryCorpus.java

package org.atteo.evo.inflector;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.opentest4j.TestAbortedException;

/**
 * Test-support reader that feeds Wiktionary noun entries to a consumer.
 *
 * <p>Entries are read from a gzip-compressed, tab-separated "compact cache". The cache is
 * (re)built from the raw Wiktionary dump when the dump is present and the cache is missing,
 * older than the dump, written in another format version, or unreadable.
 *
 * <p>Cache layout: the first line is {@link #CACHE_HEADER}; every following non-empty line is
 * a page title followed by one tab-separated value per noun found on that page.
 */
final class WiktionaryCorpus {
    /** First line of every compact cache; a cache starting with anything else is rejected. */
    private static final String CACHE_HEADER = "# evo-inflector wiktionary noun cache v1";

    private static final Path RAW_DUMP = Path.of("src/test/resources/enwiktionary-latest-pages-articles.xml.bz2");
    private static final Path COMPACT_CACHE = Path.of("src/test/resources/enwiktionary-nouns.tsv.gz");
    private static final Path TEMP_CACHE = Path.of("src/test/resources/enwiktionary-nouns.tsv.gz.tmp");

    /**
     * Passes the nouns of each cached page to {@code consumer}, one list per cache line.
     *
     * <p>The cache is validated with a full read before the first entry is delivered. If that
     * read fails and the raw dump is available, the cache is rebuilt first. Validating up front
     * guarantees the consumer never sees a partial prefix of a corrupt cache followed by the
     * complete regenerated corpus.
     *
     * @param consumer receives the nouns of one page per invocation
     * @throws IOException if the cache cannot be read and cannot be rebuilt from the raw dump
     * @throws TestAbortedException if neither the raw dump nor the compact cache exists
     */
    void forEach(Consumer<List<WikiNoun>> consumer) throws IOException {
        var cachePath = ensureCompactCache();

        try {
            // Dry run: decode every line but discard the result, so corruption anywhere in
            // the file is detected before the real consumer has observed anything.
            readCache(cachePath, ignored -> { });
        } catch (IOException e) {
            if (!Files.exists(RAW_DUMP)) {
                throw e;
            }

            Files.deleteIfExists(COMPACT_CACHE);
            Files.deleteIfExists(TEMP_CACHE);
            createCompactCache();
            cachePath = COMPACT_CACHE;
        }

        readCache(cachePath, consumer);
    }

    /**
     * Makes sure a compact cache is available, rebuilding it from the raw dump when needed.
     *
     * @return the path of the compact cache to read
     * @throws IOException if inspecting or rebuilding the cache fails
     * @throws TestAbortedException if no usable source of noun data exists
     */
    private Path ensureCompactCache() throws IOException {
        var rawExists = Files.exists(RAW_DUMP);
        var cacheExists = Files.exists(COMPACT_CACHE);

        if (!rawExists && !cacheExists) {
            throw new TestAbortedException("Wiktionary dump and compact cache are missing");
        }

        if (rawExists && shouldRegenerateCache()) {
            createCompactCache();
            return COMPACT_CACHE;
        }

        if (cacheExists) {
            return COMPACT_CACHE;
        }

        throw new TestAbortedException("Wiktionary compact cache is missing");
    }

    /**
     * Decides whether the compact cache has to be rebuilt from the raw dump.
     *
     * <p>Only called when the raw dump exists. As a side effect, any leftover temporary cache
     * file is deleted.
     *
     * @return {@code true} if the cache is missing, older than the raw dump, does not start
     *         with {@link #CACHE_HEADER}, or cannot be opened and read
     * @throws IOException if file metadata cannot be read or the temporary file cannot be deleted
     */
    private boolean shouldRegenerateCache() throws IOException {
        Files.deleteIfExists(TEMP_CACHE);

        if (!Files.exists(COMPACT_CACHE)) {
            return true;
        }

        if (Files.getLastModifiedTime(RAW_DUMP).compareTo(Files.getLastModifiedTime(COMPACT_CACHE)) > 0) {
            return true;
        }

        try (var reader = newReader(COMPACT_CACHE)) {
            return !CACHE_HEADER.equals(reader.readLine());
        } catch (IOException e) {
            // An unreadable cache is treated as stale rather than as a hard failure.
            return true;
        }
    }

    /**
     * Parses the raw dump and writes a fresh compact cache.
     *
     * <p>The cache is written to {@link #TEMP_CACHE} and moved over {@link #COMPACT_CACHE} only
     * after it has been written and closed successfully. On any failure the temporary file is
     * removed, so a half-written file is never left behind.
     *
     * @throws IOException if parsing the dump, writing the cache, or publishing it fails
     */
    private void createCompactCache() throws IOException {
        Files.deleteIfExists(TEMP_CACHE);

        try {
            // Each stream is its own resource so that a failure while constructing an outer
            // wrapper still closes the already opened file stream.
            try (var fileOut = Files.newOutputStream(TEMP_CACHE);
                    var gzipOut = new GZIPOutputStream(fileOut);
                    var writer = new BufferedWriter(new OutputStreamWriter(gzipOut, UTF_8))) {
                writer.write(CACHE_HEADER);
                writer.newLine();

                try {
                    new WikiParser().parse(RAW_DUMP, page -> writePage(writer, page));
                } catch (UncheckedIOException e) {
                    // writePage tunnels write failures through the parser callback; unwrap them.
                    throw e.getCause();
                }
            }

            // Per the Files.move contract, ATOMIC_MOVE causes other options to be ignored;
            // REPLACE_EXISTING is kept to document the intent.
            Files.move(TEMP_CACHE, COMPACT_CACHE, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
        } catch (IOException | RuntimeException e) {
            deleteTempCacheAfter(e);
            throw e;
        }
    }

    /**
     * Removes the temporary cache file after a failed build without hiding the original failure.
     *
     * @param failure the exception being propagated; a cleanup error is attached as suppressed
     */
    private static void deleteTempCacheAfter(Throwable failure) {
        try {
            Files.deleteIfExists(TEMP_CACHE);
        } catch (IOException cleanupFailure) {
            failure.addSuppressed(cleanupFailure);
        }
    }

    /**
     * Writes one cache line for {@code page}: the title followed by one tab-separated
     * {@code ennoun()} value per noun found on the page.
     *
     * <p>Pages whose title contains a space or a colon, and pages without nouns, are skipped.
     *
     * @param writer destination of the cache line
     * @param page page handed over by the dump parser
     * @throws UncheckedIOException if writing fails; unwrapped again by {@link #createCompactCache()}
     */
    private void writePage(BufferedWriter writer, Page page) {
        var title = page.getTitle();
        if (title.contains(" ") || title.contains(":")) {
            return;
        }

        List<WikiNoun> nouns = WikiNoun.find(page);
        if (nouns.isEmpty()) {
            return;
        }

        try {
            writer.write(title);
            for (WikiNoun noun : nouns) {
                writer.write('\t');
                writer.write(noun.ennoun());
            }
            writer.newLine();
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    /**
     * Converts one cache line back into nouns.
     *
     * @param line a non-empty cache line: title, then zero or more tab-separated values
     * @return one {@link WikiNoun} per value after the title; empty if the line has no tab
     */
    private List<WikiNoun> parseLine(String line) {
        // Limit -1 keeps trailing empty fields instead of silently dropping them.
        var parts = line.split("\\t", -1);
        var nouns = new ArrayList<WikiNoun>(Math.max(1, parts.length - 1));
        for (var i = 1; i < parts.length; i++) {
            nouns.add(new WikiNoun(parts[0], parts[i]));
        }
        return nouns;
    }

    /**
     * Opens a gzip-compressed UTF-8 text file for reading.
     *
     * @param path file to open
     * @return a reader over the decompressed content; the caller must close it
     * @throws IOException if the file cannot be opened or its gzip header cannot be read; a
     *         premature end of file is reported as a corrupted cache
     */
    private BufferedReader newReader(Path path) throws IOException {
        var fileIn = Files.newInputStream(path);
        try {
            return new BufferedReader(new InputStreamReader(new GZIPInputStream(fileIn), UTF_8));
        } catch (IOException | RuntimeException e) {
            // The GZIPInputStream constructor reads the header and may throw; without this
            // the file handle would stay open and block deleting or replacing the cache.
            try {
                fileIn.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            if (e instanceof EOFException) {
                throw new IOException("Corrupted Wiktionary compact cache: " + path, e);
            }
            throw e;
        }
    }

    /**
     * Reads a compact cache and hands the nouns of every non-empty line to {@code consumer}.
     *
     * @param path cache file to read
     * @param consumer receives the parsed nouns of one line per invocation
     * @throws IOException if the file cannot be read or does not start with {@link #CACHE_HEADER}
     */
    private void readCache(Path path, Consumer<List<WikiNoun>> consumer) throws IOException {
        try (var reader = newReader(path)) {
            var header = reader.readLine();
            if (!CACHE_HEADER.equals(header)) {
                throw new IOException("Unsupported Wiktionary compact cache format: " + path);
            }

            String line;
            while ((line = reader.readLine()) != null) {
                if (line.isEmpty()) {
                    continue;
                }
                consumer.accept(parseLine(line));
            }
        }
    }
}