CharsetDetectionRegressionTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.ml.chardetect;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import java.nio.charset.Charset;
import java.util.List;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.apache.tika.detect.DefaultEncodingDetector;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
/**
* Regression tests for charset detection edge-cases that surfaced during
* integration testing with the CharSoup language-aware detector.
*
* <ul>
* <li><b>ASCII-only HTML</b> (Solr integration test regression): simple
* {@code <html><body>…</body></html>} content written as UTF-8 was
* returned as {@code ISO-8859-1} by the old detector chain.
* The correct answer is {@code UTF-8}.</li>
* <li><b>Short plain-text English</b> (TXTParserTest regression): a short
* English paragraph whose bytes are all in the ASCII range was returned
* as {@code ISO-8859-1} and in some cases as {@code UTF-16}.
* The ML-based chain must not return UTF-16 for ASCII-range input.</li>
* <li><b>Shift-JIS ZIP entry name</b>: 9 raw bytes encoding {@code 文章1.txt}
* in Shift-JIS must be detected as {@code Shift_JIS}, not Big5-HKSCS.
* The raw ML logits favour Big5-HKSCS; the CharSoup language signal must
* override the model ranking.</li>
* </ul>
*/
public class CharsetDetectionRegressionTest {

    // 文章1.txt in Shift-JIS (9 raw bytes from a real zip entry)
    private static final byte[] SJIS_RAW = hexToBytes("95b68fcd312e747874");

    // Pure-ASCII HTML without a meta charset declaration — mirrors what the
    // Solr integration test wrote before the meta-tag workaround was added.
    // The old detector returned ISO-8859-1 for this without any meta tag.
    // The new detector required adding <meta charset="UTF-8"> to avoid
    // returning an unexpected charset.
    private static final byte[] ASCII_HTML_NO_META =
            "<html><body>initial</body></html>".getBytes(UTF_8);

    // English plain text from TXTParserTest — all bytes in the ASCII range
    private static final byte[] ENGLISH_TEXT =
            ("Hello, World! This is simple UTF-8 text content written"
                    + " in English to test autodetection of both the character"
                    + " encoding and the language of the input stream.").getBytes(UTF_8);

    // -----------------------------------------------------------------------
    // Solr integration-test regression
    // -----------------------------------------------------------------------

    /**
     * ASCII HTML <em>without</em> a meta charset declaration must not be
     * returned as UTF-16.
     *
     * <p>The old detector returned {@code ISO-8859-1} here without requiring
     * any meta tag. The new detector regressed: without a meta tag it started
     * returning an unexpected charset, which caused the Solr integration test
     * to fail. The workaround was to add {@code <meta charset="UTF-8">} to
     * the generated HTML — but we should not need to do that. UTF-8,
     * US-ASCII, and ISO-8859-1 are all acceptable; UTF-16 is not.</p>
     *
     * <p>Note that the assertion below is purely negative: it only rejects
     * charset names starting with {@code UTF-16} and does not pin a specific
     * expected charset.</p>
     */
    @Test
    public void asciiHtmlWithoutMetaIsNotDetectedAsUtf16() throws Exception {
        Charset top = detectTopCharset(ASCII_HTML_NO_META,
                "detector returned no result for ASCII HTML");
        assertFalse(top.name().startsWith("UTF-16"),
                "ASCII HTML without meta tag must not be detected as UTF-16, got: "
                        + top.name());
    }

    // -----------------------------------------------------------------------
    // TXTParser regression
    // -----------------------------------------------------------------------

    /**
     * A plain-English paragraph whose bytes are all in the ASCII range must
     * be returned as {@code windows-1252} — the HTML5/WHATWG default for
     * unlabeled 8-bit Western content and the statistical fallback for
     * pure-ASCII bytes in the ML-based detector chain.
     */
    @Test
    public void englishPlainTextIsDetectedAsWindows1252() throws Exception {
        Charset top = detectTopCharset(ENGLISH_TEXT,
                "detector returned no result for English text");
        assertEquals("windows-1252", top.name(),
                "Pure-ASCII English text should be detected as windows-1252, got: "
                        + top.name());
    }

    // -----------------------------------------------------------------------
    // Shift-JIS ZIP entry name
    // -----------------------------------------------------------------------

    /**
     * 9 raw bytes encoding {@code 文章1.txt} in Shift-JIS must be identified
     * as {@code Shift_JIS}.
     *
     * <p>The same bytes are structurally valid Big5-HKSCS and ranked higher by
     * the raw ML logits. CharSoup must override the model ranking using the
     * Japanese language signal. ZipParser feeds entry names as raw byte arrays
     * to the encoding detector, so a wrong answer here means garbled filenames
     * in Japanese zip archives.</p>
     */
    @Disabled("Requires retrained model from TIKA-4691")
    @Test
    public void sjisZipEntryNameIsDetectedAsShiftJis() throws Exception {
        Charset top = detectTopCharset(SJIS_RAW,
                "detector returned no result for SJIS filename bytes");
        assertEquals("Shift_JIS", top.name(),
                "SJIS zip entry bytes should be detected as Shift_JIS, got: " + top.name());
    }

    // -----------------------------------------------------------------------

    /**
     * Runs a fresh {@link DefaultEncodingDetector} over {@code bytes} with empty
     * {@link Metadata} and a default {@link ParseContext}, and returns the charset
     * of the first entry in the result list (the entry every test in this class
     * treats as the detector's answer).
     *
     * <p>Fails the calling test with {@code emptyMessage} when the detector
     * returns an empty list.</p>
     *
     * @param bytes        raw input handed to the detector
     * @param emptyMessage assertion message used when no result is returned
     * @return the charset of the first {@link EncodingResult}
     * @throws Exception if opening, reading, or closing the stream fails
     */
    private static Charset detectTopCharset(byte[] bytes, String emptyMessage)
            throws Exception {
        DefaultEncodingDetector detector = new DefaultEncodingDetector();
        try (TikaInputStream tis = TikaInputStream.get(bytes)) {
            List<EncodingResult> results =
                    detector.detect(tis, new Metadata(), new ParseContext());
            assertFalse(results.isEmpty(), emptyMessage);
            return results.get(0).getCharset();
        }
    }

    /**
     * Decodes a string of hexadecimal digit pairs into the bytes they denote.
     *
     * <p>Each pair is validated digit by digit, so sign characters (which
     * {@code Integer.parseInt} would accept) and odd-length input (which would
     * otherwise lose its final nibble) are rejected instead of being silently
     * mis-decoded.</p>
     *
     * @param hex an even-length string of hex digits, e.g. {@code "95b6"}
     * @return one byte per digit pair, in input order
     * @throws IllegalArgumentException if {@code hex} has odd length or contains
     *                                  a character that is not a hex digit
     */
    private static byte[] hexToBytes(String hex) {
        if (hex.length() % 2 != 0) {
            throw new IllegalArgumentException(
                    "hex string must contain an even number of digits, got length "
                            + hex.length());
        }
        byte[] decoded = new byte[hex.length() / 2];
        for (int i = 0; i < decoded.length; i++) {
            int high = Character.digit(hex.charAt(2 * i), 16);
            int low = Character.digit(hex.charAt(2 * i + 1), 16);
            // Character.digit returns -1 for anything that is not a radix-16 digit.
            if (high < 0 || low < 0) {
                throw new IllegalArgumentException(
                        "invalid hex digit pair at offset " + (2 * i) + " in \"" + hex + "\"");
            }
            decoded[i] = (byte) ((high << 4) | low);
        }
        return decoded;
    }
}