ZipFilenameDetectionTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.ml.chardetect;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.nio.charset.Charset;
import java.util.List;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.apache.tika.detect.DefaultEncodingDetector;
import org.apache.tika.detect.EncodingDetectorContext;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.langdetect.charsoup.CharSoupEncodingDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
/**
* Integration tests for charset detection of short byte sequences typical of
* ZIP entry names ��� a particularly hard case because the probes are tiny (6-23
* bytes) and structurally valid in several encodings simultaneously.
*
* Detection strategy: Mojibuster ranks candidates by raw logit; CharSoup
* arbitrates using language signal (positive max-logit wins).
*/
public class ZipFilenameDetectionTest {
// ������1.txt in Shift-JIS (9 raw bytes from a real zip entry)
private static final byte[] SJIS_RAW = hexToBytes("95b68fcd312e747874");
// ������2.txt in Shift-JIS (same but '2' instead of '1')
private static final byte[] SJIS_RAW2 = hexToBytes("95b68fcd322e747874");
// ���������������������������������/ in GBK (23 bytes from gbk.zip)
private static final byte[] GBK_RAW = hexToBytes("c9f3bcc6d1b9cbf5b0fccec4bcfebceccbf7b2e2cad42f");
private static byte[] hexToBytes(String hex) {
byte[] b = new byte[hex.length() / 2];
for (int i = 0; i < b.length; i++) {
b[i] = (byte) Integer.parseInt(hex.substring(i * 2, i * 2 + 2), 16);
}
return b;
}
/**
* CharSoup should confirm Shift-JIS even when Mojibuster ranks Big5-HKSCS first,
* because the language model gives a higher logit to the Japanese text decoded
* from the same bytes.
*/
@Disabled("Requires generative language model for reliable Shift-JIS vs Big5-HKSCS arbitration")
@Test
public void charSoupOverridesModelRankingForShiftJis() throws Exception {
Charset big5 = Charset.forName("Big5-HKSCS");
Charset shiftJis = Charset.forName("Shift_JIS");
EncodingDetectorContext ctx = new EncodingDetectorContext();
ctx.addResult(List.of(
new EncodingResult(big5, 0.9f, "Big5-HKSCS", EncodingResult.ResultType.STATISTICAL),
new EncodingResult(shiftJis, 0.3f, "Shift_JIS", EncodingResult.ResultType.STATISTICAL)
), "MojibusterEncodingDetector");
ParseContext parseContext = new ParseContext();
parseContext.set(EncodingDetectorContext.class, ctx);
CharSoupEncodingDetector charSoup = new CharSoupEncodingDetector();
try (TikaInputStream tis = TikaInputStream.get(SJIS_RAW)) {
List<EncodingResult> result = charSoup.detect(tis, new Metadata(), parseContext);
assertTrue(!result.isEmpty(), "CharSoup should return a result");
assertEquals(shiftJis, result.get(0).getCharset(),
"CharSoup should pick Shift-JIS (������) over Big5-HKSCS via language signal");
}
}
/**
* Full pipeline (BOM ��� Metadata ��� Mojibuster ��� StandardHtml ��� CharSoup) run
* sequentially on two entries differing only in byte 5 (0x31 vs 0x32), simulating
* what ZipParser does when iterating entries with the same ParseContext.
*/
@Disabled("Requires generative language model for reliable Shift-JIS detection on short probes")
@Test
public void fullPipelineDetectsBothSjisEntries() throws Exception {
DefaultEncodingDetector detector = new DefaultEncodingDetector();
Metadata parentMeta = new Metadata();
ParseContext outerContext = new ParseContext();
for (byte[] raw : new byte[][]{SJIS_RAW, SJIS_RAW2}) {
String label = (raw == SJIS_RAW) ? "������1.txt" : "������2.txt";
try (TikaInputStream tis = TikaInputStream.get(raw)) {
List<EncodingResult> results = detector.detect(tis, parentMeta, outerContext);
String charset = results.isEmpty() ? "(empty)" : results.get(0).getCharset().name();
assertTrue(!results.isEmpty() && "Shift_JIS".equals(results.get(0).getCharset().name()),
label + " should be detected as Shift_JIS, got: " + charset);
}
}
}
/**
* Full pipeline should detect GBK-encoded entry names as GB18030.
* Disabled: CharSoup's discriminative language model picks KOI8-U over GB18030
* on short probes because the GBK bytes happen to score as Cyrillic.
* Re-enable once generative language models are in place (better calibrated
* confidence will let CharSoup correctly abstain on cross-script ambiguity).
*/
@Disabled("Requires generative language model for reliable cross-script arbitration")
@Test
public void fullPipelineDetectsGbkEntry() throws Exception {
DefaultEncodingDetector detector = new DefaultEncodingDetector();
Metadata meta = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(GBK_RAW)) {
List<EncodingResult> results = detector.detect(tis, meta, new ParseContext());
String charset = results.isEmpty() ? "(empty)" : results.get(0).getCharset().name();
assertTrue(!results.isEmpty() && results.get(0).getCharset().name().startsWith("GB"),
"GBK entry should be detected as GB18030/GBK, got: " + charset);
}
}
}