// RTFEncapsulatedHTMLExtractorTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.msg;
import static java.nio.charset.StandardCharsets.US_ASCII;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.nio.charset.Charset;
import java.util.Map;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@link RTFEncapsulatedHTMLExtractor}.
 *
 * <p>Each test builds a small RTF document as a Java string, encodes it as
 * US-ASCII bytes and checks the HTML returned by
 * {@code RTFEncapsulatedHTMLExtractor.extract}, or checks the map returned by
 * {@code RTFEncapsulatedHTMLExtractor.parseFontTable}.
 *
 * <p>Every expected non-ASCII character is written as a Unicode escape so the
 * assertions do not depend on the encoding used to read this source file.
 */
public class RTFEncapsulatedHTMLExtractorTest {

    /**
     * Encodes {@code rtf} as US-ASCII and passes the bytes to
     * {@code RTFEncapsulatedHTMLExtractor.extract}.
     *
     * @param rtf RTF source; all fixtures in this class are pure ASCII
     * @return whatever the extractor returns for those bytes
     */
    private static String extractHtml(String rtf) {
        return RTFEncapsulatedHTMLExtractor.extract(rtf.getBytes(US_ASCII));
    }

    /** A {@code null} or zero-length input is expected to yield {@code null}. */
    @Test
    public void testNullAndEmpty() {
        assertNull(RTFEncapsulatedHTMLExtractor.extract(null));
        assertNull(RTFEncapsulatedHTMLExtractor.extract(new byte[0]));
    }

    /**
     * Plain RTF with no {@code \fromhtml1} marker and no htmltag groups is
     * expected to yield {@code null}.
     */
    @Test
    public void testNonEncapsulatedRtf() {
        String rtf = "{\\rtf1\\ansi\\deff0 Hello world}";
        assertNull(extractHtml(rtf));
    }

    /** A minimal encapsulated document: tags and text inside htmltag groups are returned. */
    @Test
    public void testSimpleEncapsulatedHtml() {
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
                "{\\*\\htmltag19 <html>}\n" +
                "{\\*\\htmltag34 <head>}\n" +
                "{\\*\\htmltag41 </head>}\n" +
                "{\\*\\htmltag50 <body>}\n" +
                "\\htmlrtf {\\htmlrtf0\n" +
                "{\\*\\htmltag64 <p>}\n" +
                "{\\*\\htmltag84 Hello world}\n" +
                "{\\*\\htmltag72 </p>}\n" +
                "\\htmlrtf }\\htmlrtf0\n" +
                "{\\*\\htmltag58 </body>}\n" +
                "{\\*\\htmltag27 </html>}\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        assertTrue(html.contains("<html>"));
        assertTrue(html.contains("<p>"));
        assertTrue(html.contains("Hello world"));
        assertTrue(html.contains("</html>"));
    }

    /** An {@code img} tag with a {@code cid:} source must survive extraction unchanged. */
    @Test
    public void testImgCidExtraction() {
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
                "{\\*\\htmltag19 <html>}\n" +
                "{\\*\\htmltag50 <body>}\n" +
                "{\\*\\htmltag84 <img src=\"cid:image001.png@01DC5A2C.E674FE00\">}\n" +
                "{\\*\\htmltag58 </body>}\n" +
                "{\\*\\htmltag27 </html>}\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        assertTrue(html.contains("cid:image001.png@01DC5A2C.E674FE00"),
                "CID reference should be preserved in extracted HTML");
    }

    /**
     * {@code \par} and {@code \tab} inside an htmltag group: the output must
     * contain the brace-escaped selector and a literal tab before the
     * declaration.
     */
    @Test
    public void testParAndTabDecoding() {
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
                "{\\*\\htmltag241 <style>}\n" +
                "{\\*\\htmltag241 body \\{\\par \\tab color: red;\\par \\}}\n" +
                "{\\*\\htmltag249 </style>}\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        assertTrue(html.contains("<style>"));
        assertTrue(html.contains("body {"));
        assertTrue(html.contains("\tcolor: red;"));
        assertTrue(html.contains("</style>"));
    }

    /** A single {@code \'xx} escape is decoded with the {@code \ansicpg} code page. */
    @Test
    public void testHexEscapeDecoding() {
        // \'e9 is byte 0xE9, which windows-1252 maps to U+00E9
        // (LATIN SMALL LETTER E WITH ACUTE).
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
                "{\\*\\htmltag84 caf\\'e9}\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        assertEquals("caf\u00e9", html);
    }

    /** Two consecutive {@code \'xx} escapes followed directly by a literal character. */
    @Test
    public void testMultiByteHexEscape() {
        // In windows-1252, \'fc is U+00FC (u with diaeresis) and \'df is
        // U+00DF (sharp s); the trailing 'e' is plain text, not a hex digit.
        // (Under code page 65001 / UTF-8, U+00FC would instead be the two
        // bytes 0xC3 0xBC; this test covers the single-byte form.)
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
                "{\\*\\htmltag84 gr\\'fc\\'dfe}\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        assertEquals("gr\u00fc\u00dfe", html);
    }

    /** {@code \ansicpg1254} must select the Turkish code page for {@code \'xx} escapes. */
    @Test
    public void testCodePage1254Turkish() {
        // In windows-1254, byte 0xFD is U+0131 (LATIN SMALL LETTER DOTLESS I);
        // windows-1252 would map the same byte to U+00FD (y with acute).
        // The expected value therefore shows that \ansicpg1254 was honoured.
        String rtf = "{\\rtf1\\ansi\\ansicpg1254\\fromhtml1 \\deff0\n" +
                "{\\*\\htmltag84 Say\\'fdn}\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        assertEquals("Say\u0131n", html);
    }

    /** Content between {@code \htmlrtf} and {@code \htmlrtf0} must not reach the output. */
    @Test
    public void testHtmlrtfSkipping() {
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
                "{\\*\\htmltag84 Hello}\n" +
                "\\htmlrtf {\\b bold rtf only}\\htmlrtf0\n" +
                "{\\*\\htmltag84 World}\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        assertEquals("Hello World", html);
    }

    /**
     * RTF escapes for an opening brace, a closing brace and a backslash must
     * decode to the literal characters.
     */
    @Test
    public void testEscapedBracesAndBackslash() {
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
                "{\\*\\htmltag241 a \\{ b \\} c \\\\d}\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        assertEquals("a { b } c \\d", html);
    }

    /** An htmltag group without content contributes nothing to the output. */
    @Test
    public void testEmptyHtmltag() {
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
                "{\\*\\htmltag72}\n" +
                "{\\*\\htmltag84 text}\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        assertEquals("text", html);
    }

    /**
     * Realistic layout: the visible text sits BETWEEN htmltag groups, wrapped
     * by {@code \htmlrtf} blocks that must be skipped.
     */
    @Test
    public void testInterTagTextContent() {
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
                "{\\*\\htmltag19 <html>}\n" +
                "{\\*\\htmltag50 <body>}\n" +
                "{\\*\\htmltag64 <p>}\n" +
                "\\htmlrtf {\\htmlrtf0\n" +
                "Hello from the message body\n" +
                "\\htmlrtf\\par}\\htmlrtf0\n" +
                "{\\*\\htmltag72 </p>}\n" +
                "{\\*\\htmltag64 <p>}\n" +
                "\\htmlrtf {\\htmlrtf0\n" +
                "Second paragraph\n" +
                "\\htmlrtf\\par}\\htmlrtf0\n" +
                "{\\*\\htmltag72 </p>}\n" +
                "{\\*\\htmltag58 </body>}\n" +
                "{\\*\\htmltag27 </html>}\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        assertTrue(html.contains("<p>"), "should contain HTML tags");
        assertTrue(html.contains("Hello from the message body"),
                "should contain inter-tag text content");
        assertTrue(html.contains("Second paragraph"),
                "should contain second paragraph text");
        assertTrue(html.contains("</html>"), "should contain closing tag");
    }

    /** {@code \'xx} escapes in text between htmltag groups are decoded too. */
    @Test
    public void testInterTagHexEscapes() {
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
                "{\\*\\htmltag64 <p>}\n" +
                "\\htmlrtf {\\htmlrtf0\n" +
                "caf\\'e9\n" +
                "\\htmlrtf }\\htmlrtf0\n" +
                "{\\*\\htmltag72 </p>}\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        // 0xE9 in windows-1252 is U+00E9 (e with acute).
        assertTrue(html.contains("caf\u00e9"), "hex escapes in inter-tag text should be decoded");
    }

    /** {@code \line} inside an htmltag group is expected to come out as {@code <br>}. */
    @Test
    public void testLineControlWord() {
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
                "{\\*\\htmltag84 line1\\line line2}\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        assertEquals("line1<br>line2", html);
    }

    /**
     * {@code parseFontTable} must map each font number to the charset implied
     * by its {@code \fcharset} value (0, 134, 128 and 162 are covered here).
     */
    @Test
    public void testParseFontTable() {
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0{\\fonttbl\n" +
                "{\\f0\\fswiss\\fcharset0 Arial;}\n" +
                "{\\f1\\fmodern\\fcharset0 Courier New;}\n" +
                "{\\f4\\fswiss\\fcharset134 Simsun;}\n" +
                "{\\f5\\fswiss\\fcharset128 MS PGothic;}\n" +
                "{\\f6\\fswiss\\fcharset162 Arial Tur;}\n" +
                "}\n}";
        Map<Integer, Charset> fonts = RTFEncapsulatedHTMLExtractor.parseFontTable(rtf);
        assertEquals(Charset.forName("windows-1252"), fonts.get(0));
        assertEquals(Charset.forName("GBK"), fonts.get(4));
        assertEquals(Charset.forName("MS932"), fonts.get(5));
        assertEquals(Charset.forName("windows-1254"), fonts.get(6));
    }

    /** Without a font table in the input, {@code parseFontTable} returns an empty map. */
    @Test
    public void testParseFontTableEmpty() {
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0 no font table}";
        Map<Integer, Charset> fonts = RTFEncapsulatedHTMLExtractor.parseFontTable(rtf);
        assertTrue(fonts.isEmpty());
    }

    /**
     * Simulates the real-world case of {@code \ansicpg1252} combined with a
     * {@code \fcharset134} (GBK) font used for inter-tag CJK text.
     */
    @Test
    public void testCjkFontCharsetTracking() {
        // The \htmlrtf block switches to \f1 (the GBK font); the \'xx bytes
        // that follow \htmlrtf0 must then be decoded as GBK.
        // GBK bytes BF C9 D2 D4 are the two characters U+53EF U+4EE5.
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0{\\fonttbl\n" +
                "{\\f0\\fswiss\\fcharset0 Arial;}\n" +
                "{\\f1\\fswiss\\fcharset134 Simsun;}\n" +
                "}\n" +
                "{\\*\\htmltag64 <p>}\n" +
                "\\htmlrtf {\\f1 \\htmlrtf0\n" +
                "\\'bf\\'c9\\'d2\\'d4\n" +
                "\\htmlrtf }\\htmlrtf0\n" +
                "{\\*\\htmltag72 </p>}\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        assertTrue(html.contains("\u53ef\u4ee5"),
                "GBK bytes should be decoded as Chinese characters, got: " + html);
    }

    /** After CJK text the font switches back to a Latin font for ASCII content. */
    @Test
    public void testCjkFontSwitchBackToLatin() {
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0{\\fonttbl\n" +
                "{\\f0\\fswiss\\fcharset0 Arial;}\n" +
                "{\\f1\\fswiss\\fcharset134 Simsun;}\n" +
                "}\n" +
                "{\\*\\htmltag64 <p>}\n" +
                "\\htmlrtf {\\f1 \\htmlrtf0\n" +
                "\\'bf\\'c9\\'d2\\'d4\n" +
                "\\htmlrtf\\f0 \\htmlrtf0\n" +
                "Hello\n" +
                "\\htmlrtf }\\htmlrtf0\n" +
                "{\\*\\htmltag72 </p>}\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        assertTrue(html.contains("\u53ef\u4ee5"),
                "CJK should be decoded correctly, got: " + html);
        assertTrue(html.contains("Hello"),
                "Latin text after font switch should be preserved");
    }

    /**
     * Per the MS-OXRTFEX spec, {@code \'xx} inside htmltag groups should use
     * the default code page ({@code \ansicpg}), not the current font's charset.
     */
    @Test
    public void testHtmltagUsesDefaultCodePage() {
        // The surrounding \htmlrtf block selects the GBK font \f1, yet \'e9
        // inside the htmltag group must still be read as windows-1252, where
        // 0xE9 is U+00E9 (e with acute).
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0{\\fonttbl\n" +
                "{\\f0\\fswiss\\fcharset0 Arial;}\n" +
                "{\\f1\\fswiss\\fcharset134 Simsun;}\n" +
                "}\n" +
                "\\htmlrtf {\\f1 \\htmlrtf0\n" +
                "{\\*\\htmltag84 caf\\'e9}\n" +
                "\\htmlrtf }\\htmlrtf0\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        assertEquals("caf\u00e9", html,
                "htmltag content should use default code page, not font charset");
    }

    /**
     * Reproduces the Hebrew/Chinese bug: a skip block contains a braced group
     * that selects {@code \f3}, a Latin font (charset 0). The braces should
     * scope the font switch so it does not affect subsequent inter-tag text.
     */
    @Test
    public void testFontSwitchInBracedSkipBlockDoesNotPersist() {
        // windows-1255 bytes E0 E2 are the Hebrew letters U+05D0 U+05D2.
        String rtf = "{\\rtf1\\ansi\\ansicpg1255\\fromhtml1 \\deff0{\\fonttbl\n" +
                "{\\f0\\fswiss\\fcharset177 David;}\n" +
                "{\\f3\\fmodern\\fcharset0 Courier New;}\n" +
                "}\n" +
                "{\\*\\htmltag64 <p>}\n" +
                "\\htmlrtf {\\htmlrtf0\n" +
                "\\'e0\\'e2\n" +                               // Hebrew: U+05D0 U+05D2
                "{\\*\\htmltag84 }" +
                "\\htmlrtf {\\f3\\'a0}\\htmlrtf0\n" +           // skip block with braced \f3
                "\\'e8\\'e5\\'e1\n" +                           // Hebrew: U+05D8 U+05D5 U+05D1
                "\\htmlrtf }\\htmlrtf0\n" +
                "{\\*\\htmltag72 </p>}\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        assertTrue(html.contains("\u05d0\u05d2"),
                "First Hebrew text should decode correctly, got: " + html);
        // Bytes E8 E5 E1 are U+05D8 U+05D5 U+05D1 in windows-1255 but
        // U+00E8 U+00E5 U+00E1 (accented Latin letters) in windows-1252.
        assertTrue(html.contains("\u05d8\u05d5\u05d1"),
                "Hebrew text after braced skip block should still use windows-1255, got: " + html);
        assertFalse(html.contains("\u00e8\u00e5\u00e1"),
                "Should NOT decode as windows-1252 (mojibake), got: " + html);
    }

    /**
     * An {@code \f} control word directly in inter-tag text (outside
     * {@code \htmlrtf} blocks) should also update the current charset.
     */
    @Test
    public void testFontSwitchInInterTagText() {
        // GBK bytes BF C9 are the character U+53EF.
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0{\\fonttbl\n" +
                "{\\f0\\fswiss\\fcharset0 Arial;}\n" +
                "{\\f1\\fswiss\\fcharset134 Simsun;}\n" +
                "}\n" +
                "{\\*\\htmltag64 <p>}\n" +
                "\\f1 \\'bf\\'c9\n" +
                "{\\*\\htmltag72 </p>}\n" +
                "}";
        String html = extractHtml(rtf);
        assertNotNull(html);
        assertTrue(html.contains("\u53ef"),
                "Font switch in inter-tag text should affect charset, got: " + html);
    }
}