WikiParser.java

package org.atteo.evo.inflector;

import java.io.IOException;
import java.io.InputStream;
import java.util.function.Consumer;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.opentest4j.TestAbortedException;

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.dataformat.xml.XmlFactory;
import com.fasterxml.jackson.dataformat.xml.XmlMapper;

public class WikiParser {
    public void parse(Consumer<Page> consumer) throws IOException {
        InputStream compressedStream = EnglishInflectorTest.class.getResourceAsStream(
            "/enwiktionary-latest-pages-articles.xml.bz2");
        if (compressedStream == null) {
            System.err.println("\nFull test requires wiktionary dump which was not found\n" +
                "To run rull test do the following:\n" +
                "cd src/test/resources\n" +
                "wget http://download.wikimedia.org/enwiktionary/latest/" +
                "enwiktionary-latest-pages-articles.xml.bz2\n");
            throw new TestAbortedException("Wiktionary data is missing");
        }
        BZip2CompressorInputStream stream = new BZip2CompressorInputStream(compressedStream);

        XmlFactory xmlFactory = new XmlFactory();
        JsonParser parser = xmlFactory.createParser(stream);
        XmlMapper xmlMapper = new XmlMapper(xmlFactory);
        xmlMapper.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES);
        parser.setCodec(xmlMapper);

        if (parser.nextToken() != JsonToken.START_OBJECT) {
            throw new RuntimeException("START_OBJECT is required at the beginning");
        }

        while (true) {
            JsonToken token = parser.nextToken();
            if (token == null) {
                break;
            }

            if (token == JsonToken.START_OBJECT) {
                if ("siteinfo".equals(parser.currentName())) {
                    parser.skipChildren();
                } else if ("page".equals(parser.currentName())) {

                    Page page = parser.readValueAs(Page.class);

                    consumer.accept(page);
                }
            }
        }
        parser.close();
    }
}