OpenNLPDetector.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.langdetect.opennlp;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import opennlp.tools.langdetect.Language;
import opennlp.tools.langdetect.LanguageDetectorModel;
import opennlp.tools.util.normalizer.CharSequenceNormalizer;
import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.language.detect.LanguageConfidence;
import org.apache.tika.language.detect.LanguageDetector;
import org.apache.tika.language.detect.LanguageResult;
/**
* <p>
* This is based on OpenNLP's language detector. However,
* we've built our own ProbingLanguageDetector and our own language
* models.
* </p>
* To build our model, we followed OpenNLP's lead by using the
* (<a href="https://wortschatz.uni-leipzig.de/en/download">Leipzig corpus</a>)
* as gathered and preprocessed
* (
* <a href="https://svn.apache.org/repos/bigdata/opennlp/trunk/leipzig/">big-data corpus</a>
* ). We removed azj, plt, sun
* and zsm because our models couldn't sufficiently well distinguish
* them from related languages. We removed cmn in favor of the
* finer-grained zho-trad and zho-simp.
* <p>
* We then added the following languages from <a href="http://data.statmt.org/cc-100/">cc-100</a>:
* ben-rom (Bengali Romanized), ful, gla, gug, hau, hin-rom, ibo, lin,
* mya-zaw, nso, orm, quz, roh, srd, ssw, tam-rom, tel-rom, tsn, urd-rom,
* wol, yor.
* <p>
* We ran our own train/devtest/test code because OpenNLP's required
* more sentences/data than were available for some languages.
* <p>
* Please open an issue on our JIRA if we made mistakes and/or had
* misunderstandings in our design choices or if you need to have other
* languages added.
* <p>
* Citations for the cc-100 corpus:
* <p>
* Unsupervised Cross-lingual Representation Learning at Scale, Alexis Conneau,
* Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek,
* Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer, Veselin Stoyanov,
* Proceedings of the 58th Annual Meeting of the Association for Computational
* Linguistics (ACL), p. 8440-8451, July 2020, pdf, bib.
* <p>
* CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data,
* Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary,
* Francisco Guzmán, Armand Joulin, Edouard Grave, Proceedings of the 12th
* Language Resources and Evaluation Conference (LREC), p. 4003-4012,
* May 2020, pdf, bib.
*/
@TikaComponent
public class OpenNLPDetector extends LanguageDetector {

    /** Classpath location of the bundled language-detection model. */
    private static final String MODEL_RESOURCE = "/opennlp-langdetect-20210413.bin";

    /** Model shared by every detector instance; assigned by {@link #loadBuiltInModels()}. */
    static LanguageDetectorModel LANG_MODEL;

    static {
        try {
            loadBuiltInModels();
        } catch (IOException e) {
            // Keep the original exception as the cause so the real failure stays diagnosable.
            throw new RuntimeException("Can't find built-in language models", e);
        }
    }

    private final ProbingLanguageDetector detector =
            new ProbingLanguageDetector(LANG_MODEL, getNormalizers());

    /** Text accumulated through {@link #addText(char[], int, int)} since the last reset. */
    private final StringBuilder buffer = new StringBuilder();

    public OpenNLPDetector() {
    }

    /**
     * Reads the bundled model from the classpath and stores it in {@link #LANG_MODEL}.
     *
     * @throws IOException if the model resource is missing or cannot be read
     */
    static void loadBuiltInModels() throws IOException {
        try (InputStream is = OpenNLPDetector.class.getResourceAsStream(MODEL_RESOURCE)) {
            // getResourceAsStream returns null, rather than throwing, for a missing resource.
            if (is == null) {
                throw new IOException(
                        "Built-in language model not found on classpath: " + MODEL_RESOURCE);
            }
            LANG_MODEL = new LanguageDetectorModel(is);
        }
    }

    /**
     * Builds the normalizers handed to the underlying detector, in the order listed here.
     *
     * @return a new array of normalizer singletons
     */
    private static CharSequenceNormalizer[] getNormalizers() {
        return new CharSequenceNormalizer[]{
                TikaUrlCharSequenceNormalizer.getInstance(),
                AlphaIdeographSequenceNormalizer.getInstance(),
                EmojiCharSequenceNormalizer.getInstance(),
                TwitterCharSequenceNormalizer.getInstance(),
                NumberCharSequenceNormalizer.getInstance(),
                ShrinkCharSequenceNormalizer.getInstance()};
    }

    /**
     * Maps a numeric confidence onto a {@link LanguageConfidence} bucket.
     * The thresholds are purely heuristic: above 0.9 is HIGH, above 0.85 is MEDIUM,
     * above 0.20 is LOW, anything else is NONE.
     *
     * @param confidence confidence reported by the model
     * @return the matching confidence bucket
     */
    private static LanguageConfidence getConfidence(double confidence) {
        //COMPLETELY heuristic
        if (confidence > 0.9) {
            return LanguageConfidence.HIGH;
        }
        if (confidence > 0.85) {
            return LanguageConfidence.MEDIUM;
        }
        if (confidence > 0.20) {
            return LanguageConfidence.LOW;
        }
        return LanguageConfidence.NONE;
    }

    /**
     * Performs no model loading, because the model is loaded statically with the class.
     * Note that this returns a <em>new</em> detector rather than this instance, so text
     * already buffered in this instance is not carried over.
     *
     * @return a fresh {@link OpenNLPDetector}
     * @throws IOException never thrown by this implementation
     */
    @Override
    public LanguageDetector loadModels() throws IOException {
        return new OpenNLPDetector();
    }

    /**
     * NOT SUPPORTED. Throws {@link UnsupportedOperationException}
     *
     * @param languages list of target languages.
     * @return never returns normally
     * @throws IOException never thrown by this implementation
     */
    @Override
    public LanguageDetector loadModels(Set<String> languages) throws IOException {
        throw new UnsupportedOperationException(
                "This lang detector doesn't allow subsetting models");
    }

    /**
     * Reports whether the given language code is among the detector's supported languages.
     *
     * @param language language code to look up; {@code null} yields {@code false}
     * @return {@code true} if the code exactly matches a supported language
     */
    @Override
    public boolean hasModel(String language) {
        if (language == null) {
            return false;
        }
        for (String lang : detector.getSupportedLanguages()) {
            if (language.equals(lang)) {
                return true;
            }
        }
        return false;
    }

    /**
     * NOT YET SUPPORTED. Throws {@link UnsupportedOperationException}
     *
     * @param languageProbabilities Map from language to probability
     * @return never returns normally
     * @throws IOException never thrown by this implementation
     */
    @Override
    public LanguageDetector setPriors(Map<String, Float> languageProbabilities)
            throws IOException {
        throw new UnsupportedOperationException();
    }

    /**
     * Discards all text buffered so far.
     */
    @Override
    public void reset() {
        buffer.setLength(0);
    }

    /**
     * This will buffer up to {@link #setMaxLength(int)} and then
     * ignore the rest of the text.
     *
     * @param cbuf Character buffer
     * @param off  Offset into cbuf to first character in the run of text
     * @param len  Number of characters in the run of text.
     */
    @Override
    public void addText(char[] cbuf, int off, int len) {
        int remaining = detector.getMaxLength() - buffer.length();
        int toAppend = Math.min(len, remaining);
        // Guard on the computed count, not on len: when the buffer is already at or past
        // the maximum (e.g. the limit was lowered after text was added) the count is zero
        // or negative, and a negative count would make StringBuilder.append throw.
        if (toAppend <= 0) {
            return;
        }
        buffer.append(cbuf, off, toAppend);
    }

    /**
     * Runs the detector over the buffered text and converts every prediction into a
     * {@link LanguageResult}, keeping the order returned by the underlying detector.
     *
     * @return one result per predicted language
     */
    @Override
    public List<LanguageResult> detectAll() {
        Language[] predictions = detector.predictLanguages(buffer.toString());
        List<LanguageResult> results = new ArrayList<>(predictions.length);
        for (Language prediction : predictions) {
            double rawConfidence = prediction.getConfidence();
            results.add(new LanguageResult(prediction.getLang(), getConfidence(rawConfidence),
                    (float) rawConfidence));
        }
        return results;
    }

    /**
     * Sets the maximum length on the underlying detector; {@link #addText(char[], int, int)}
     * stops buffering once that many characters are held.
     *
     * @param maxLength maximum number of characters to buffer
     */
    public void setMaxLength(int maxLength) {
        detector.setMaxLength(maxLength);
    }

    /**
     * @return the language codes reported as supported by the underlying detector
     */
    public String[] getSupportedLanguages() {
        return detector.getSupportedLanguages();
    }

    /**
     * Replaces URLs and e-mail addresses with a single space.
     */
    private static class TikaUrlCharSequenceNormalizer implements CharSequenceNormalizer {
        //use this custom copy/paste of opennlp to avoid long, long hang with mail_regex
        //TIKA-2777
        private static final Pattern URL_REGEX =
                Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]{10,10000}");
        private static final Pattern MAIL_REGEX = Pattern.compile(
                "[-_.0-9A-Za-z]{1,100}@[-_0-9A-Za-z]{1,100}[-_.0-9A-Za-z]{1,100}");
        private static final TikaUrlCharSequenceNormalizer INSTANCE =
                new TikaUrlCharSequenceNormalizer();

        private TikaUrlCharSequenceNormalizer() {
        }

        public static TikaUrlCharSequenceNormalizer getInstance() {
            return INSTANCE;
        }

        /**
         * Blanks out URL matches first, then e-mail matches in the result.
         *
         * @param charSequence text to normalize
         * @return the text with each URL and e-mail address replaced by one space
         */
        @Override
        public CharSequence normalize(CharSequence charSequence) {
            String withoutUrls = URL_REGEX.matcher(charSequence).replaceAll(" ");
            return MAIL_REGEX.matcher(withoutUrls).replaceAll(" ");
        }
    }

    /**
     * Collapses every run of characters that are neither alphabetic nor ideographic
     * into a single space.
     */
    private static class AlphaIdeographSequenceNormalizer implements CharSequenceNormalizer {
        private static final Pattern REGEX =
                Pattern.compile("[^\\p{IsAlphabetic}\\p{IsIdeographic}]+");
        private static final AlphaIdeographSequenceNormalizer INSTANCE =
                new AlphaIdeographSequenceNormalizer();

        private AlphaIdeographSequenceNormalizer() {
        }

        public static AlphaIdeographSequenceNormalizer getInstance() {
            return INSTANCE;
        }

        /**
         * @param charSequence text to normalize
         * @return the text with each non-alphabetic, non-ideographic run replaced by one space
         */
        @Override
        public CharSequence normalize(CharSequence charSequence) {
            return REGEX.matcher(charSequence).replaceAll(" ");
        }
    }
}