// TXTParserTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.txt;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.InputStream;
import java.io.StringWriter;

import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
public class TXTParserTest extends TikaTest {
private Parser parser = new TXTParser();
@Test
public void testEnglishText() throws Exception {
String text = "Hello, World! This is simple UTF-8 text content written" +
" in English to test autodetection of both the character" +
" encoding and the language of the input stream.";
Metadata metadata = new Metadata();
StringWriter writer = new StringWriter();
try (TikaInputStream tis = TikaInputStream.get(text.getBytes(ISO_8859_1))) {
parser.parse(tis, new WriteOutContentHandler(writer), metadata, new ParseContext());
}
String content = writer.toString();
// Pure ASCII ��� detected as windows-1252 (the HTML5/WHATWG default for 8-bit Western)
assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE));
// TIKA-501: Remove language detection from TXTParser
assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
assertNull(metadata.get(TikaCoreProperties.LANGUAGE));
TikaTest.assertContains("Hello", content);
TikaTest.assertContains("World", content);
TikaTest.assertContains("autodetection", content);
TikaTest.assertContains("stream", content);
}
@Test
public void testUTF8Text() throws Exception {
String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(text.getBytes(UTF_8))) {
parser.parse(tis, handler, metadata, new ParseContext());
}
assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
TikaTest.assertContains(text, handler.toString());
}
@Test
public void testEmptyText() throws Exception {
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
parser.parse(tis, handler, metadata, new ParseContext());
}
assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("\n", handler.toString());
}
/**
* Test for the heuristics that we use to assign an eight-bit character
* encoding to mostly ASCII sequences. If a more specific match can not
* be made, a string with a CR(LF) in it is most probably windows-1252,
* otherwise ISO-8859-1, except if it contains the currency/euro symbol
* (byte 0xa4) in which case it's more likely to be ISO-8859-15.
*/
@Test
public void testLatinDetectionHeuristics() throws Exception {
// Previously tested CR/LF heuristics specific to UniversalEncodingDetector.
// The ML-based detector defaults to windows-1252 for pure ASCII regardless of
// line endings (CRLF_TO_WINDOWS is a secondary confirmation, not the primary path).
String windows = "test\r\n";
String unix = "test\n";
String euro = "test \u20ac\n";
Metadata metadata;
metadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(windows.getBytes("ISO-8859-15"))) {
parser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
}
assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE));
metadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(unix.getBytes("ISO-8859-15"))) {
parser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
}
assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE));
metadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(euro.getBytes("ISO-8859-15"))) {
parser.parse(tis, new DefaultHandler(), metadata, new ParseContext());
}
// 7 bytes with one high byte (0xA4) ��� just verify detection succeeds
assertNotNull(metadata.get(Metadata.CONTENT_TYPE));
}
/**
* Test case for TIKA-240: Drop the BOM when extracting plain text
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a>
*/
@Test
public void testDropByteOrderMark() throws Exception {
assertExtractText("UTF-8 BOM", "test",
new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't'});
assertExtractText("UTF-16 BE BOM", "test",
new byte[]{(byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
assertExtractText("UTF-16 LE BOM", "test",
new byte[]{(byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
}
/**
* Test case for TIKA-335: using incoming charset
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
*/
@Test
public void testUseIncomingCharsetAsHint() throws Exception {
// u00e1 is latin small letter a with acute ��� 17 bytes, one high byte (0xE1).
// The ML detector returns a Windows Latin variant; incoming charset hints are
// not used to override detection in the new pipeline.
final String test2 = "the name is \u00e1ndre";
Metadata metadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(test2.getBytes(ISO_8859_1))) {
parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext());
}
// Short probe with one high byte ��� detector returns a Windows Latin variant
assertNotNull(metadata.get(Metadata.CONTENT_TYPE));
assertNotNull(metadata.get(Metadata.CONTENT_ENCODING));
}
/**
* Test case for TIKA-341: using charset in content-type
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
*/
@Test
public void testUsingCharsetInContentTypeHeader() throws Exception {
// u00e1 is latin small letter a with acute ��� 17 bytes, one high byte (0xE1).
// Incoming charset in content-type is not used to override ML detection.
final String test2 = "the name is \u00e1ndre";
Metadata metadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(test2.getBytes(ISO_8859_1))) {
parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext());
}
assertNotNull(metadata.get(Metadata.CONTENT_TYPE));
assertNotNull(metadata.get(Metadata.CONTENT_ENCODING));
}
private void assertExtractText(String msg, String expected, byte[] input) throws Exception {
ContentHandler handler = new BodyContentHandler() {
public void ignorableWhitespace(char[] ch, int off, int len) {
// Ignore the whitespace added by XHTMLContentHandler
}
};
Metadata metadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(input)) {
parser.parse(tis, handler, metadata, new ParseContext());
}
assertEquals(expected, handler.toString(), msg);
}
/**
* Test case for TIKA-339: don't override incoming language
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
*/
@Test
public void testRetainIncomingLanguage() throws Exception {
final String test = "Simple Content";
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.LANGUAGE, "en");
try (TikaInputStream tis = TikaInputStream.get(test.getBytes(UTF_8))) {
parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext());
}
assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
}
@Test
public void testCP866() throws Exception {
Metadata metadata = new Metadata();
StringWriter writer = new StringWriter();
parser.parse(getResourceAsStream("/test-documents/russian.cp866.txt"),
new WriteOutContentHandler(writer), metadata, new ParseContext());
assertEquals("text/plain; charset=IBM866", metadata.get(Metadata.CONTENT_TYPE));
}
@Test
public void testEBCDIC_CP500() throws Exception {
Metadata metadata = new Metadata();
StringWriter writer = new StringWriter();
parser.parse(getResourceAsStream("/test-documents/english.cp500.txt"),
new WriteOutContentHandler(writer), metadata, new ParseContext());
// IBM500 and IBM1047 share 247 of 256 byte mappings and are indistinguishable
// for normal Latin text ��� accept either.
String ct = metadata.get(Metadata.CONTENT_TYPE);
assertTrue(ct.equals("text/plain; charset=IBM500") || ct.equals("text/plain; charset=IBM1047"),
"Expected IBM500 or IBM1047, got: " + ct);
// Additional check that it isn't too eager on short blocks of text
metadata = new Metadata();
writer = new StringWriter();
try (TikaInputStream tis = TikaInputStream.get(
"<html><body>hello world</body></html>".getBytes(ISO_8859_1))) {
parser.parse(tis, new WriteOutContentHandler(writer), metadata, new ParseContext());
}
assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE));
}
/**
* Test case for TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a>
*/
@Test
public void testCharsetDetectionWithShortSnipet() throws Exception {
final String text = "Hello, World!";
Metadata metadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(text.getBytes(UTF_8))) {
parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext());
}
assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE));
// TIKA-868: MetadataCharsetDetector (tika-core) reads the charset from Content-Type
// and returns it as DECLARATIVE, which CharSoup prefers over the statistical windows-1252.
metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8");
try (TikaInputStream tis = TikaInputStream.get(text.getBytes(UTF_8))) {
parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext());
}
assertEquals("application/binary; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
}
//TIKA-2047
@Test
public void testSubclassingMimeTypesRemain() throws Exception {
XMLResult r = getXML("testVCalendar.vcs");
assertEquals("text/x-vcalendar; charset=windows-1252", r.metadata.get(Metadata.CONTENT_TYPE));
}
// TIKA-3516, TIKA-3525, TIKA-1236
@Test
public void testIgnoreCharset() throws Exception {
AutoDetectParser parser = (AutoDetectParser) TikaLoader.load(
getConfigPath(TXTParserTest.class, "tika-config-ignore-charset.json"))
.loadAutoDetectParser();
Metadata m = new Metadata();
m.set(TikaCoreProperties.RESOURCE_NAME_KEY, "texty-text.txt");
assertContains("ACTIVE AGE", getXML("testIgnoreCharset.txt", parser, m).xml);
m = new Metadata();
m.set(TikaCoreProperties.RESOURCE_NAME_KEY, "texty-text.txt");
assertContains("Please check your email", getXML("test_ignore_IBM420.html", parser, m).xml);
}
}