WikiParser.java
package org.atteo.evo.inflector;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.function.Consumer;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.opentest4j.TestAbortedException;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.dataformat.xml.XmlFactory;
import com.fasterxml.jackson.dataformat.xml.XmlMapper;
/**
 * Streams {@code <page>} elements out of a bzip2-compressed Wiktionary XML dump and hands each
 * one, deserialized as a {@link Page}, to a caller-supplied consumer.
 *
 * <p>Pages are read one at a time from a streaming parser rather than being collected into a
 * list, so the whole dump is never materialized by this class.
 */
public class WikiParser {

    /**
     * Parses the dump at the default location
     * {@code src/test/resources/enwiktionary-latest-pages-articles.xml.bz2}.
     *
     * @param consumer callback invoked once for every page read from the dump
     * @throws IOException if the dump cannot be read or decoded
     * @throws TestAbortedException if the dump file does not exist
     */
    public void parse(Consumer<Page> consumer) throws IOException {
        parse(Path.of("src/test/resources/enwiktionary-latest-pages-articles.xml.bz2"), consumer);
    }

    /**
     * Parses the bzip2-compressed dump stored at {@code dumpPath}.
     *
     * <p>When the file is missing, download instructions are printed to standard error and a
     * {@link TestAbortedException} is thrown.
     *
     * @param dumpPath location of the {@code .xml.bz2} dump
     * @param consumer callback invoked once for every page read from the dump
     * @throws IOException if the dump cannot be read or decoded
     * @throws TestAbortedException if {@code dumpPath} does not exist
     */
    public void parse(Path dumpPath, Consumer<Page> consumer) throws IOException {
        if (!Files.exists(dumpPath)) {
            System.err.println("""
                    Full test requires wiktionary dump which was not found
                    To run full test do the following:
                    cd src/test/resources
                    wget http://download.wikimedia.org/enwiktionary/latest/\
                    enwiktionary-latest-pages-articles.xml.bz2
                    """);
            throw new TestAbortedException("Wiktionary data is missing");
        }

        // The bzip2 decoder does no buffering of its own, so the raw file stream is buffered
        // before being handed to it.
        try (InputStream fileStream = Files.newInputStream(dumpPath);
                InputStream bufferedStream = new BufferedInputStream(fileStream);
                var stream = new BZip2CompressorInputStream(bufferedStream)) {
            parse(stream, consumer);
        }
    }

    /**
     * Reads the XML document from {@code stream}, skipping the {@code siteinfo} element and
     * passing every {@code page} element to {@code consumer}.
     *
     * @param stream uncompressed XML input
     * @param consumer callback invoked once for every page
     * @throws IOException if the XML cannot be read or a page cannot be deserialized
     * @throws RuntimeException if the first token of the document is not a START_OBJECT
     */
    private void parse(InputStream stream, Consumer<Page> consumer) throws IOException {
        var xmlFactory = new XmlFactory();
        var xmlMapper = new XmlMapper(xmlFactory);
        // Page only has to declare the fields it cares about; other elements are ignored.
        xmlMapper.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES);

        // try-with-resources guarantees the parser is closed even when the consumer or the
        // deserialization throws part-way through the dump.
        try (var parser = xmlFactory.createParser(stream)) {
            // The codec is what lets readValueAs() bind the current element to Page.
            parser.setCodec(xmlMapper);

            if (parser.nextToken() != JsonToken.START_OBJECT) {
                throw new RuntimeException("START_OBJECT is required at the beginning");
            }

            // nextToken() returns null once the end of input is reached.
            for (var token = parser.nextToken(); token != null; token = parser.nextToken()) {
                if (token != JsonToken.START_OBJECT) {
                    continue;
                }
                var elementName = parser.currentName();
                if ("siteinfo".equals(elementName)) {
                    parser.skipChildren();
                } else if ("page".equals(elementName)) {
                    consumer.accept(parser.readValueAs(Page.class));
                }
            }
        }
    }
}