EbcdicRoutingTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.ml.chardetect;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.ml.LinearModel;
import org.apache.tika.parser.ParseContext;
/**
* Verifies EBCDIC detection using the single statistical model.
*
* <p>EBCDIC variants (IBM420-ltr/rtl, IBM424-ltr/rtl, IBM500, IBM1047) are
* direct labels in the general model alongside all other charsets. IBM855 and
* IBM866 are DOS/OEM Cyrillic code pages ��� not true EBCDIC ��� and are also
* direct model labels.
*/
public class EbcdicRoutingTest {
private static MojibusterEncodingDetector detector;
// Representative English prose encoded in IBM500 (International EBCDIC).
private static final byte[] IBM500_BYTES = encode("IBM500",
"The quick brown fox jumps over the lazy dog. " +
"This sentence contains every letter of the English alphabet. " +
"EBCDIC encoding is used on IBM mainframe systems. " +
"Fields are often fixed-width and space-padded in EBCDIC files.");
private static byte[] encode(String charsetName, String text) {
try {
return text.getBytes(Charset.forName(charsetName));
} catch (Exception e) {
throw new RuntimeException("Cannot encode test data as " + charsetName, e);
}
}
@BeforeAll
static void setUp() throws Exception {
detector = new MojibusterEncodingDetector();
}
/**
* The general model must have direct labels for all EBCDIC variants.
* There must be no bare "EBCDIC" routing label ��� that was the old two-model
* architecture which has been replaced by a single model.
*/
@Test
public void generalModelHasDirectEbcdicLabels() {
LinearModel general = detector.getModel();
List<String> labels = Arrays.asList(general.getLabels());
assertFalse(labels.contains("EBCDIC"),
"Model must not have a bare 'EBCDIC' routing label (single-model architecture)");
// True EBCDIC variants must be direct labels
for (String ebcdic : new String[]{"IBM420-ltr", "IBM420-rtl", "IBM424-ltr", "IBM424-rtl", "IBM500", "IBM1047"}) {
assertTrue(labels.contains(ebcdic),
"EBCDIC variant must be a direct model label: " + ebcdic);
}
// DOS Cyrillic variants must also be direct labels
assertTrue(labels.contains("IBM855"), "IBM855 (DOS Cyrillic) must be a direct model label");
assertTrue(labels.contains("IBM866"), "IBM866 (DOS Cyrillic) must be a direct model label");
}
/**
* IBM500 bytes must be detected as an IBM variant directly by the model.
*/
@Test
public void ibm500IsDetectedDirectly() throws Exception {
try (TikaInputStream tis = TikaInputStream.get(IBM500_BYTES)) {
List<EncodingResult> results = detector.detect(tis, new Metadata(), new ParseContext());
assertFalse(results.isEmpty(), "Should detect something for IBM500 bytes");
String topLabel = results.get(0).getLabel();
assertTrue(topLabel.startsWith("IBM"),
"Result should be an IBM variant, got: " + topLabel);
}
}
}