// RTFHtmlDecapsulatorTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.rtf.jflex;
import static java.nio.charset.StandardCharsets.US_ASCII;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.ParseContext;
/**
 * Tests for {@link RTFHtmlDecapsulator}, mirroring the original
 * RTFEncapsulatedHTMLExtractorTest to verify parity.
 *
 * <p>Every test feeds a small hand-written RTF document (encoded as US-ASCII
 * bytes) to the decapsulator and checks the HTML string that comes back.
 */
public class RTFHtmlDecapsulatorTest {

    /**
     * RTF prolog shared by most fixtures: ANSI character set, code page 1252,
     * and the {@code fromhtml1} control word.
     */
    private static final String FROMHTML_CP1252_HEADER =
            "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n";

    /**
     * Runs a fresh {@link RTFHtmlDecapsulator} over the given RTF bytes.
     *
     * @param rtfBytes raw RTF document, may be {@code null} or empty
     * @return whatever {@code RTFHtmlDecapsulator.extract} returns for the input
     */
    private static String extract(byte[] rtfBytes)
            throws IOException, SAXException, TikaException {
        RTFHtmlDecapsulator decapsulator =
                new RTFHtmlDecapsulator(new DefaultHandler(), new ParseContext());
        return decapsulator.extract(rtfBytes);
    }

    /**
     * Convenience overload of {@link #extract(byte[])} for fixtures written as
     * Java strings; the fixture is encoded as US-ASCII before extraction.
     */
    private static String extractAscii(String rtf)
            throws IOException, SAXException, TikaException {
        return extract(rtf.getBytes(US_ASCII));
    }

    /**
     * Asserts that {@code html} contains {@code fragment}, reporting both the
     * missing fragment and the actual HTML when it does not.
     */
    private static void assertHtmlContains(String html, String fragment) {
        assertTrue(html.contains(fragment),
                "expected fragment [" + fragment + "] in extracted HTML but was: " + html);
    }

    /** Null and zero-length input must both yield {@code null}. */
    @Test
    public void testNullAndEmpty() throws Exception {
        assertNull(extract(null));
        assertNull(extract(new byte[0]));
    }

    /** Plain RTF without the {@code fromhtml1} control word yields {@code null}. */
    @Test
    public void testNonEncapsulatedRtf() throws Exception {
        assertNull(extractAscii("{\\rtf1\\ansi\\deff0 Hello world}"));
    }

    /** A minimal encapsulated document yields its tags and text. */
    @Test
    public void testSimpleEncapsulatedHtml() throws Exception {
        String rtf = FROMHTML_CP1252_HEADER +
                "{\\*\\htmltag19 <html>}\n" +
                "{\\*\\htmltag34 <head>}\n" +
                "{\\*\\htmltag41 </head>}\n" +
                "{\\*\\htmltag50 <body>}\n" +
                "\\htmlrtf {\\htmlrtf0\n" +
                "{\\*\\htmltag64 <p>}\n" +
                "{\\*\\htmltag84 Hello world}\n" +
                "{\\*\\htmltag72 </p>}\n" +
                "\\htmlrtf }\\htmlrtf0\n" +
                "{\\*\\htmltag58 </body>}\n" +
                "{\\*\\htmltag27 </html>}\n" +
                "}";
        String html = extractAscii(rtf);
        assertNotNull(html);
        assertHtmlContains(html, "<html>");
        assertHtmlContains(html, "<p>");
        assertHtmlContains(html, "Hello world");
        assertHtmlContains(html, "</html>");
    }

    /** A {@code cid:} image reference inside an htmltag group survives extraction. */
    @Test
    public void testImgCidExtraction() throws Exception {
        String rtf = FROMHTML_CP1252_HEADER +
                "{\\*\\htmltag19 <html>}\n" +
                "{\\*\\htmltag50 <body>}\n" +
                "{\\*\\htmltag84 <img src=\"cid:image001.png@01DC5A2C.E674FE00\">}\n" +
                "{\\*\\htmltag58 </body>}\n" +
                "{\\*\\htmltag27 </html>}\n" +
                "}";
        String html = extractAscii(rtf);
        assertNotNull(html);
        assertTrue(html.contains("cid:image001.png@01DC5A2C.E674FE00"),
                "CID reference should be preserved in extracted HTML");
    }

    /**
     * Control words and escaped braces inside an htmltag group are decoded.
     * The fixture contains {@code \par} as well, but only the tab produced by
     * {@code \tab} and the unescaped brace are asserted explicitly.
     */
    @Test
    public void testParAndTabDecoding() throws Exception {
        String rtf = FROMHTML_CP1252_HEADER +
                "{\\*\\htmltag241 <style>}\n" +
                "{\\*\\htmltag241 body \\{\\par \\tab color: red;\\par \\}}\n" +
                "{\\*\\htmltag249 </style>}\n" +
                "}";
        String html = extractAscii(rtf);
        assertNotNull(html);
        assertHtmlContains(html, "<style>");
        assertHtmlContains(html, "body {");
        assertHtmlContains(html, "\tcolor: red;");
        assertHtmlContains(html, "</style>");
    }

    /** A single hex escape is decoded through the document code page. */
    @Test
    public void testHexEscapeDecoding() throws Exception {
        // \'e9 = byte 0xE9 = e with acute (U+00E9) in windows-1252
        String rtf = FROMHTML_CP1252_HEADER +
                "{\\*\\htmltag84 caf\\'e9}\n" +
                "}";
        String html = extractAscii(rtf);
        assertNotNull(html);
        assertEquals("caf\u00e9", html);
    }

    /** Consecutive hex escapes are each decoded. */
    @Test
    public void testMultiByteHexEscape() throws Exception {
        // \'fc = u with diaeresis (U+00FC) and \'df = sharp s (U+00DF) in windows-1252
        String rtf = FROMHTML_CP1252_HEADER +
                "{\\*\\htmltag84 gr\\'fc\\'dfe}\n" +
                "}";
        String html = extractAscii(rtf);
        assertNotNull(html);
        assertEquals("gr\u00fc\u00dfe", html);
    }

    /** Hex escapes honour a non-default {@code ansicpg} value (1254, Turkish). */
    @Test
    public void testCodePage1254Turkish() throws Exception {
        // Byte 0xFD is U+0131 (dotless i) in windows-1254 but U+00FD (y with acute)
        // in windows-1252, so the pinned literal fails if the code page is ignored.
        String rtf = "{\\rtf1\\ansi\\ansicpg1254\\fromhtml1 \\deff0\n" +
                "{\\*\\htmltag84 Say\\'fdn}\n" +
                "}";
        String html = extractAscii(rtf);
        assertNotNull(html);
        assertEquals("Say\u0131n", html);
    }

    /** Content between {@code \htmlrtf} and {@code \htmlrtf0} is left out of the output. */
    @Test
    public void testHtmlrtfSkipping() throws Exception {
        String rtf = FROMHTML_CP1252_HEADER +
                "{\\*\\htmltag84 Hello}\n" +
                "\\htmlrtf {\\b bold rtf only}\\htmlrtf0\n" +
                "{\\*\\htmltag84 World}\n" +
                "}";
        String html = extractAscii(rtf);
        assertNotNull(html);
        assertEquals("Hello World", html);
    }

    /** Escaped braces and an escaped backslash are emitted as literal characters. */
    @Test
    public void testEscapedBracesAndBackslash() throws Exception {
        String rtf = FROMHTML_CP1252_HEADER +
                "{\\*\\htmltag241 a \\{ b \\} c \\\\d}\n" +
                "}";
        String html = extractAscii(rtf);
        assertNotNull(html);
        assertEquals("a { b } c \\d", html);
    }

    /** An htmltag group without any content contributes nothing to the output. */
    @Test
    public void testEmptyHtmltag() throws Exception {
        String rtf = FROMHTML_CP1252_HEADER +
                "{\\*\\htmltag72}\n" +
                "{\\*\\htmltag84 text}\n" +
                "}";
        String html = extractAscii(rtf);
        assertNotNull(html);
        assertEquals("text", html);
    }

    /** Plain text sitting between htmltag groups is part of the extracted HTML. */
    @Test
    public void testInterTagTextContent() throws Exception {
        String rtf = FROMHTML_CP1252_HEADER +
                "{\\*\\htmltag19 <html>}\n" +
                "{\\*\\htmltag50 <body>}\n" +
                "{\\*\\htmltag64 <p>}\n" +
                "\\htmlrtf {\\htmlrtf0\n" +
                "Hello from the message body\n" +
                "\\htmlrtf\\par}\\htmlrtf0\n" +
                "{\\*\\htmltag72 </p>}\n" +
                "{\\*\\htmltag64 <p>}\n" +
                "\\htmlrtf {\\htmlrtf0\n" +
                "Second paragraph\n" +
                "\\htmlrtf\\par}\\htmlrtf0\n" +
                "{\\*\\htmltag72 </p>}\n" +
                "{\\*\\htmltag58 </body>}\n" +
                "{\\*\\htmltag27 </html>}\n" +
                "}";
        String html = extractAscii(rtf);
        assertNotNull(html);
        assertTrue(html.contains("<p>"), "should contain HTML tags");
        assertTrue(html.contains("Hello from the message body"),
                "should contain inter-tag text content");
        assertTrue(html.contains("Second paragraph"),
                "should contain second paragraph text");
        assertTrue(html.contains("</html>"), "should contain closing tag");
    }

    /** Hex escapes are decoded in inter-tag text as well as inside htmltag groups. */
    @Test
    public void testInterTagHexEscapes() throws Exception {
        String rtf = FROMHTML_CP1252_HEADER +
                "{\\*\\htmltag64 <p>}\n" +
                "\\htmlrtf {\\htmlrtf0\n" +
                "caf\\'e9\n" +
                "\\htmlrtf }\\htmlrtf0\n" +
                "{\\*\\htmltag72 </p>}\n" +
                "}";
        String html = extractAscii(rtf);
        assertNotNull(html);
        assertTrue(html.contains("caf\u00e9"), "hex escapes in inter-tag text should be decoded");
    }

    /** The {@code \line} control word is rendered as {@code <br>}. */
    @Test
    public void testLineControlWord() throws Exception {
        String rtf = FROMHTML_CP1252_HEADER +
                "{\\*\\htmltag84 line1\\line line2}\n" +
                "}";
        String html = extractAscii(rtf);
        assertNotNull(html);
        assertEquals("line1<br>line2", html);
    }

    /** The code page used for hex escapes follows the charset of the selected font. */
    @Test
    public void testFontAwareCodePageDecoding() throws Exception {
        // f0 = ANSI (fcharset 0 = windows-1252), f1 = Greek (fcharset 161 = cp1253)
        // \'e1 in windows-1252 = U+00E1 (a with acute)
        // \'e1 in cp1253 = U+03B1 (GREEK SMALL LETTER ALPHA)
        String rtf = FROMHTML_CP1252_HEADER +
                "{\\fonttbl{\\f0\\fcharset0 Times;}{\\f1\\fcharset161 Greek;}}\n" +
                "{\\*\\htmltag84 \\f0 caf\\'e9}\n" +
                "{\\*\\htmltag84 \\f1 \\'e1}\n" +
                "}";
        String html = extractAscii(rtf);
        assertNotNull(html);
        // f0: \'e9 in windows-1252 = e with acute
        assertTrue(html.contains("caf\u00e9"), "f0 should decode as windows-1252");
        // f1: \'e1 in cp1253 = Greek alpha
        assertTrue(html.contains("\u03b1"), "f1 should decode as cp1253 (Greek)");
    }

    /** A Unicode control word is decoded and its ANSI fallback byte is dropped. */
    @Test
    public void testUnicodeEscapeWithAnsiShadow() throws Exception {
        // The control word u8212 (decimal 8212) is the em dash, U+2014.
        // The \'97 that follows is the ANSI shadow and should be skipped.
        String rtf = FROMHTML_CP1252_HEADER +
                "{\\fonttbl{\\f0\\fcharset0 Times;}}\n" +
                "{\\*\\htmltag84 A\\u8212\\'97B}\n" +
                "}";
        String html = extractAscii(rtf);
        assertNotNull(html);
        assertEquals("A\u2014B", html);
    }
}