// TikaEncodingDetectorTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.config;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Test;
import org.apache.tika.TikaLoaderHelper;
import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.detect.BOMDetector;
import org.apache.tika.detect.CompositeEncodingDetector;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.detect.MetaEncodingDetector;
import org.apache.tika.detect.MetadataCharsetDetector;
import org.apache.tika.detect.OverrideEncodingDetector;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.langdetect.charsoup.CharSoupEncodingDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.ml.chardetect.MojibusterEncodingDetector;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.html.HtmlEncodingDetector;
import org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector;
import org.apache.tika.parser.txt.TXTParser;
public class TikaEncodingDetectorTest extends TikaTest {

    /**
     * The default chain must contain the four base detectors plus the
     * CharSoup meta detector, and the meta detector must be partitioned last.
     */
    @Test
    public void testDefault() throws TikaConfigException {
        EncodingDetector detector = TikaLoader.loadDefault().loadEncodingDetectors();
        assertTrue(detector instanceof CompositeEncodingDetector);
        List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
        // 4 base detectors (BOM, Metadata, ML, StandardHtml) + CharSoupEncodingDetector (MetaEncodingDetector)
        assertEquals(5, detectors.size());
        // meta detector is always last (partitioned by CompositeEncodingDetector)
        assertTrue(detectors.get(4) instanceof MetaEncodingDetector);
        // base detectors — sorted by full class name; check by type
        Set<Class<?>> baseClasses = detectors.subList(0, 4).stream()
                .map(Object::getClass).collect(Collectors.toSet());
        assertTrue(baseClasses.contains(BOMDetector.class));
        assertTrue(baseClasses.contains(MetadataCharsetDetector.class));
        assertTrue(baseClasses.contains(MojibusterEncodingDetector.class));
        assertTrue(baseClasses.contains(StandardHtmlEncodingDetector.class));
    }

    /**
     * Excluding a detector via config must drop it from the inner composite
     * while leaving the rest of the chain (and the override detector) intact.
     */
    @Test
    public void testExcludeList() throws Exception {
        TikaLoader tikaLoader = TikaLoaderHelper.getLoader("TIKA-2273-exclude-encoding-detector-default.json");
        EncodingDetector detector = tikaLoader.loadEncodingDetectors();
        assertTrue(detector instanceof CompositeEncodingDetector);
        List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
        // default-encoding-detector (inner composite) + override-encoding-detector
        // The inner composite now includes CharSoupEncodingDetector from SPI
        assertEquals(2, detectors.size());
        EncodingDetector detector1 = detectors.get(0);
        assertTrue(detector1 instanceof CompositeEncodingDetector);
        List<EncodingDetector> detectors1Children =
                ((CompositeEncodingDetector) detector1).getDetectors();
        // BOM + Metadata + ML base detectors + CharSoup meta (html excluded)
        assertEquals(4, detectors1Children.size());
        Set<Class<?>> innerClasses = detectors1Children.subList(0, 3).stream()
                .map(Object::getClass).collect(Collectors.toSet());
        assertTrue(innerClasses.contains(BOMDetector.class));
        assertTrue(innerClasses.contains(MetadataCharsetDetector.class));
        assertTrue(innerClasses.contains(MojibusterEncodingDetector.class));
        assertTrue(detectors1Children.get(3) instanceof MetaEncodingDetector);
        assertTrue(detectors.get(1) instanceof OverrideEncodingDetector);
    }

    /**
     * Detector parameters set in config (here, the HTML detector's mark limit)
     * must be applied to the loaded instances.
     */
    @Test
    public void testParameterization() throws Exception {
        TikaLoader tikaLoader = TikaLoaderHelper.getLoader("TIKA-2273-parameterize-encoding-detector.json");
        EncodingDetector detector = tikaLoader.loadEncodingDetectors();
        assertTrue(detector instanceof CompositeEncodingDetector);
        List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
        assertEquals(2, detectors.size());
        assertTrue(detectors.get(0) instanceof HtmlEncodingDetector);
        assertEquals(65000, ((HtmlEncodingDetector) detectors.get(0)).getDefaultConfig().getMarkLimit());
        assertTrue(detectors.get(1) instanceof OverrideEncodingDetector);
    }

    /** A plain TXTParser must pick up the composite encoding detector chain. */
    @Test
    public void testEncodingDetectorsAreLoaded() {
        EncodingDetector encodingDetector =
                ((AbstractEncodingDetectorParser) new TXTParser()).getEncodingDetector();
        assertTrue(encodingDetector instanceof CompositeEncodingDetector);
    }

    /**
     * CP500 (EBCDIC) detection must work both with the default chain and with
     * a config that excludes ICU4J.
     */
    @Test
    public void testEncodingDetectorConfigurability() throws Exception {
        // CP500 (EBCDIC) is now detected by MojibusterEncodingDetector's structural IBM500 rule.
        // We must hint Content-Type=text/plain so that TXTParser is selected; without the filename
        // extension the byte-level MIME sniffer classifies the EBCDIC data as octet-stream.
        Metadata md = new Metadata();
        md.set("Content-Type", "text/plain");
        Metadata metadata = getXML("english.cp500.txt", md).metadata;
        assertNotNull(metadata.get(TikaCoreProperties.DETECTED_ENCODING));
        // Excluding ICU4J from the config (which is already not in the default chain)
        // should still work — ML handles EBCDIC detection.
        TikaLoader tikaLoader = TikaLoaderHelper.getLoader("TIKA-2273-no-icu4j-encoding-detector.json");
        Parser p = tikaLoader.loadAutoDetectParser();
        md = new Metadata();
        md.set("Content-Type", "text/plain");
        metadata = getXML("english.cp500.txt", p, md).metadata;
        assertNotNull(metadata.get(TikaCoreProperties.DETECTED_ENCODING));
    }

    /**
     * An override detector configured with a charset parameter must expose
     * that charset on every encoding-detecting parser in the tree.
     */
    @Test
    public void testNonDetectingDetectorParams() throws Exception {
        TikaLoader tikaLoader = TikaLoaderHelper.getLoader("TIKA-2273-override-detector-params.json");
        Parser p = tikaLoader.loadAutoDetectParser();
        List<Parser> parsers = new ArrayList<>();
        findEncodingDetectionParsers(p, parsers);
        assertEquals(7, parsers.size());
        EncodingDetector encodingDetector =
                ((AbstractEncodingDetectorParser) parsers.get(0)).getEncodingDetector();
        assertTrue(encodingDetector instanceof CompositeEncodingDetector);
        assertEquals(1, ((CompositeEncodingDetector) encodingDetector).getDetectors().size());
        EncodingDetector child =
                ((CompositeEncodingDetector) encodingDetector).getDetectors().get(0);
        assertTrue(child instanceof OverrideEncodingDetector);
        assertEquals(StandardCharsets.UTF_16LE,
                ((OverrideEncodingDetector) child).getCharset());
    }

    /** An unknown charset name in the override config must fail loading. */
    @Test
    public void testNonDetectingDetectorParamsBadCharset() throws Exception {
        TikaLoader tikaLoader = TikaLoaderHelper.getLoader("TIKA-2273-non-detecting-params-bad-charset.json");
        assertThrows(TikaConfigException.class, tikaLoader::loadEncodingDetectors);
    }

    /**
     * Every parser in the tree — statically initialized or not — must share the
     * same user-configured encoding detector chain.
     */
    @Test
    public void testConfigurabilityOfUserSpecified() throws Exception {
        TikaLoader tikaLoader = TikaLoaderHelper.getLoader("TIKA-2273-encoding-detector-outside-static-init.json");
        Parser p = tikaLoader.loadAutoDetectParser();
        //make sure that all static and non-static parsers are using the same encoding detector!
        List<Parser> parsers = new ArrayList<>();
        findEncodingDetectionParsers(p, parsers);
        assertEquals(8, parsers.size());
        for (Parser encodingDetectingParser : parsers) {
            EncodingDetector encodingDetector =
                    ((AbstractEncodingDetectorParser) encodingDetectingParser)
                            .getEncodingDetector();
            assertTrue(encodingDetector instanceof CompositeEncodingDetector);
            // BOM, Metadata, ML, Html base detectors + CharSoup MetaEncodingDetector
            // (ICU4J is excluded but was already not in the default chain)
            assertEquals(5, ((CompositeEncodingDetector) encodingDetector).getDetectors().size());
            for (EncodingDetector child : ((CompositeEncodingDetector) encodingDetector)
                    .getDetectors()) {
                assertNotContained("cu4j", child.getClass().getCanonicalName());
            }
        }
        // ML handles EBCDIC (IBM500) via structural rules, so CP500 is detectable
        Metadata metadata = getXML("english.cp500.txt", p).metadata;
        assertNotNull(metadata.get(TikaCoreProperties.DETECTED_ENCODING));
    }

    /**
     * Mark-limit overrides from config must be applied uniformly to the
     * encoding detectors of every encoding-detecting parser.
     */
    @Test
    public void testMarkLimitUnit() throws Exception {
        TikaLoader tikaLoader = TikaLoaderHelper.getLoader("TIKA-2485-encoding-detector-mark-limits.json");
        Parser p = tikaLoader.loadAutoDetectParser();
        List<Parser> parsers = new ArrayList<>();
        findEncodingDetectionParsers(p, parsers);
        assertEquals(7, parsers.size());
        for (Parser childParser : parsers) {
            EncodingDetector encodingDetector =
                    ((AbstractEncodingDetectorParser) childParser).getEncodingDetector();
            assertTrue(encodingDetector instanceof CompositeEncodingDetector);
            List<EncodingDetector> children =
                    ((CompositeEncodingDetector) encodingDetector).getDetectors();
            // 3 base detectors + 1 MetaEncodingDetector (CharSoup) = 4 total
            assertEquals(4, children.size(), childParser.getClass().toString());
            assertTrue(children.get(0) instanceof MojibusterEncodingDetector,
                    childParser.getClass().toString());
            HtmlEncodingDetector htmlDet = (HtmlEncodingDetector) children.get(1);
            assertEquals(100000, htmlDet.getDefaultConfig().getMarkLimit(),
                    childParser.getClass().toString());
            assertTrue(children.get(2) instanceof StandardHtmlEncodingDetector,
                    childParser.getClass().toString());
            assertTrue(children.get(3) instanceof CharSoupEncodingDetector,
                    childParser.getClass().toString());
        }
    }

    /**
     * End-to-end check that raising the mark limit lets the HTML detector
     * reach a meta charset declaration buried past the default limit.
     */
    @Test
    public void testMarkLimitIntegration() throws Exception {
        StringBuilder sb = new StringBuilder();
        sb.append("<html><head><script>");
        // script length = ~80000 bytes, beyond the default mark limit of 65536
        for (int i = 0; i < 16000; i++) {
            sb.append("blah ");
        }
        sb.append("</script>");
        sb.append("<meta charset=\"utf-8\">").append("</head><body>");
        sb.append("<p>Frugt, grønd og mere</p>");
        sb.append("<p>5 kg \u00f8kologisk frugt og gr\u00F8nt, lige til at g\u00E5 til</p>");
        sb.append("</body></html>");
        byte[] bytes = sb.toString().getBytes(StandardCharsets.UTF_8);
        // Default: the meta charset is buried at ~byte 80,000, past the default
        // mark limit of 65536. The detector falls back to windows-1252 for the
        // pure-ASCII probe. HTML entities (&oslash;) render correctly regardless;
        // raw UTF-8 multibyte sequences (e.g. ø in "økologisk") are garbled.
        // Raise the mark limit via config to fix this (see below).
        Parser p = AUTO_DETECT_PARSER;
        Metadata metadata = new Metadata();
        String xml = getXML(TikaInputStream.get(bytes), p, metadata).xml;
        assertContains("gr\u00F8nd", xml); // ø entity — correct regardless
        assertNotContained("\u00f8kologisk", xml); // raw UTF-8 bytes — garbled by default
        assertNotContained("gr\u00F8nt", xml);
        assertNotContained("g\u00E5 til", xml);
        // With a raised mark limit the detector reaches the meta charset and
        // correctly decodes UTF-8 content.
        p = TikaLoaderHelper.getLoader("TIKA-2485-encoding-detector-mark-limits.json").loadAutoDetectParser();
        metadata = new Metadata();
        xml = getXML(TikaInputStream.get(bytes), p, metadata).xml;
        assertContains("gr\u00F8nd", xml);
        assertContains("\u00f8kologisk", xml);
        assertContains("gr\u00F8nt", xml);
        assertContains("g\u00E5 til", xml);
    }

    /**
     * Excluding the CharSoup meta detector must leave only the four base
     * detectors and no MetaEncodingDetector in the chain.
     */
    @Test
    public void testExcludeCharSoupEncodingDetector() throws Exception {
        TikaLoader tikaLoader = TikaLoaderHelper.getLoader(
                "TIKA-4671-exclude-charsoup-encoding-detector.json");
        EncodingDetector detector = tikaLoader.loadEncodingDetectors();
        assertTrue(detector instanceof CompositeEncodingDetector);
        List<EncodingDetector> detectors =
                ((CompositeEncodingDetector) detector).getDetectors();
        // 4 base detectors (BOM + Metadata + ML + StandardHtml), no MetaEncodingDetector
        assertEquals(4, detectors.size());
        Set<Class<?>> excludedCharSoupClasses = detectors.stream()
                .map(Object::getClass).collect(Collectors.toSet());
        assertTrue(excludedCharSoupClasses.contains(BOMDetector.class));
        assertTrue(excludedCharSoupClasses.contains(MetadataCharsetDetector.class));
        assertTrue(excludedCharSoupClasses.contains(MojibusterEncodingDetector.class));
        assertTrue(excludedCharSoupClasses.contains(StandardHtmlEncodingDetector.class));
        for (EncodingDetector d : detectors) {
            assertNotContained("CharSoup", d.getClass().getSimpleName());
        }
    }

    // -----------------------------------------------------------------------
    // Solr integration-test regression (TIKA-4662)
    // -----------------------------------------------------------------------

    /**
     * ASCII HTML with an explicit {@code <meta charset="UTF-8">} must be
     * detected as UTF-8. The full detection chain is required: the HTML
     * detector produces a DECLARATIVE UTF-8 result; CharSoupEncodingDetector
     * sees that both UTF-8 and the statistical winner (windows-1252) decode
     * the pure-ASCII bytes identically and therefore defers to the declaration.
     */
    @Test
    public void testAsciiHtmlWithMetaIsDetectedAsUtf8() throws Exception {
        byte[] bytes =
                "<html><head><meta charset=\"UTF-8\"></head><body>initial</body></html>"
                        .getBytes(StandardCharsets.UTF_8);
        EncodingDetector detector = TikaLoader.loadDefault().loadEncodingDetectors();
        try (TikaInputStream tis = TikaInputStream.get(bytes)) {
            List<EncodingResult> results =
                    detector.detect(tis, new Metadata(), new ParseContext());
            assertFalse(results.isEmpty(), "detector returned no result for ASCII HTML with meta");
            assertEquals(StandardCharsets.UTF_8, results.get(0).getCharset(),
                    "ASCII HTML with <meta charset=UTF-8> should be detected as UTF-8, got: "
                            + results.get(0).getCharset().name());
        }
    }

    /**
     * A misleading declared charset must be overridden when the decoded
     * content scores better under the true encoding.
     */
    @Test
    public void testArabicMisleadingCharsetHtml() throws Exception {
        // This HTML file is encoded in windows-1256 but declares charset=UTF-8
        // in the meta tag. The CharSoupEncodingDetector should override the
        // misleading HTML meta and detect that the actual content is Arabic
        // (windows-1256) because windows-1256 decoded text produces a higher
        // language detection score.
        Metadata metadata = new Metadata();
        XMLResult result = getXML("testArabicMisleadingCharset.html", metadata);
        // Verify encoding was detected as windows-1256, not the misleading UTF-8
        assertEquals("windows-1256",
                metadata.get(TikaCoreProperties.DETECTED_ENCODING));
        // Verify extracted text contains readable Arabic, not mojibake
        // \u0627\u0644\u0639\u0631\u0628\u064a\u0629 = "العربية" (Arabic)
        assertContains("\u0627\u0644\u0639\u0631\u0628\u064a\u0629", result.xml);
    }

    /**
     * Recursively collects every {@link AbstractEncodingDetectorParser} reachable
     * from {@code p}, descending through composites and decorators.
     *
     * <p>BUG FIX: the decorator branch previously recursed with {@code p} itself
     * (merely cast to {@link ParserDecorator}), which re-entered this method with
     * the identical object and recursed forever (StackOverflowError). It must
     * descend into the decorator's wrapped parser instead.
     *
     * @param p the root parser to walk
     * @param encodingDetectionParsers accumulator for matching parsers (mutated)
     */
    private void findEncodingDetectionParsers(Parser p, List<Parser> encodingDetectionParsers) {
        if (p instanceof CompositeParser) {
            for (Parser child : ((CompositeParser) p).getAllComponentParsers()) {
                findEncodingDetectionParsers(child, encodingDetectionParsers);
            }
        } else if (p instanceof ParserDecorator) {
            // Descend into the wrapped parser, not the decorator itself.
            findEncodingDetectionParsers(((ParserDecorator) p).getWrappedParser(),
                    encodingDetectionParsers);
        }
        if (p instanceof AbstractEncodingDetectorParser) {
            encodingDetectionParsers.add(p);
        }
    }
}