RTFStateTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.rtf.jflex;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.io.StringReader;
import java.nio.charset.Charset;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@code RTFState}.
 *
 * <p>Every test tokenizes a small hand-written RTF snippet with {@code RTFTokenizer},
 * feeds the tokens one by one into a fresh {@code RTFState}, and then checks the
 * charset / font-table / group information the state exposes. Tests that need to look
 * at the state <em>while</em> the document is being processed pass a
 * {@link TokenObserver} to {@link #processRtf(String, TokenObserver)}.
 */
public class RTFStateTest {

    /**
     * Hook called by {@link #processRtf(String, TokenObserver)} right after each token
     * has been handed to {@code RTFState.processToken}.
     */
    @FunctionalInterface
    private interface TokenObserver {
        /**
         * @param state    the state machine, already updated with {@code token}
         * @param token    the token that was just processed
         * @param consumed the value returned by {@code processToken} for this token
         * @throws Exception any failure; it is propagated out of the test
         */
        void afterToken(RTFState state, RTFToken token, boolean consumed) throws Exception;
    }

    /**
     * Runs {@code rtf} through the tokenizer and a new {@code RTFState} without
     * observing intermediate states.
     *
     * @param rtf the RTF document text
     * @return the state as it is after the last non-EOF token
     * @throws Exception if tokenizing or token processing fails
     */
    private RTFState processRtf(String rtf) throws Exception {
        return processRtf(rtf, (state, token, consumed) -> { });
    }

    /**
     * Runs {@code rtf} through the tokenizer and a new {@code RTFState}, notifying
     * {@code observer} after every processed token.
     *
     * @param rtf      the RTF document text
     * @param observer callback invoked after each token has been processed
     * @return the state as it is after the last non-EOF token
     * @throws Exception if tokenizing, token processing or the observer fails
     */
    private RTFState processRtf(String rtf, TokenObserver observer) throws Exception {
        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf));
        RTFState state = new RTFState();
        RTFToken tok;
        // Stop on whichever end marker comes first: a null token or an explicit EOF token.
        while ((tok = tokenizer.yylex()) != null) {
            if (tok.getType() == RTFTokenType.EOF) {
                break;
            }
            boolean consumed = state.processToken(tok);
            observer.afterToken(state, tok, consumed);
        }
        return state;
    }

    /**
     * @return {@code true} if {@code token} is a TEXT token carrying exactly {@code expected}
     */
    private static boolean isText(RTFToken token, char expected) {
        return token.getType() == RTFTokenType.TEXT && token.getChar() == expected;
    }

    /** An explicit ansicpg1251 control word selects CP1251 as the global charset. */
    @Test
    public void testGlobalCharsetFromAnsicpg() throws Exception {
        RTFState state = processRtf("{\\rtf1\\ansi\\ansicpg1251}");
        assertEquals(Charset.forName("CP1251"), state.getGlobalCharset());
    }

    /** With only the ansi control word present, the global charset is WINDOWS-1252. */
    @Test
    public void testGlobalCharsetDefaultWindows1252() throws Exception {
        RTFState state = processRtf("{\\rtf1\\ansi}");
        assertEquals(RTFCharsetMaps.WINDOWS_1252, state.getGlobalCharset());
    }

    /** The pca control word selects cp850 as the global charset. */
    @Test
    public void testGlobalCharsetPca() throws Exception {
        RTFState state = processRtf("{\\rtf1\\pca}");
        assertEquals(Charset.forName("cp850"), state.getGlobalCharset());
    }

    /** The pc control word selects cp437 as the global charset. */
    @Test
    public void testGlobalCharsetPc() throws Exception {
        RTFState state = processRtf("{\\rtf1\\pc}");
        assertEquals(Charset.forName("cp437"), state.getGlobalCharset());
    }

    /** The mac control word selects MacRoman as the global charset. */
    @Test
    public void testGlobalCharsetMac() throws Exception {
        RTFState state = processRtf("{\\rtf1\\mac}");
        assertEquals(Charset.forName("MacRoman"), state.getGlobalCharset());
    }

    /** Font table entries are recorded as font number -> charset derived from fcharset. */
    @Test
    public void testFontTableParsing() throws Exception {
        // Realistic font table: f0=Times New Roman (ANSI), f1=MS Mincho (Shift_JIS)
        String rtf = "{\\rtf1\\ansi\\deff0" +
                "{\\fonttbl" +
                "{\\f0\\froman\\fcharset0 Times New Roman;}" +
                "{\\f1\\fnil\\fcharset128 MS Mincho;}" +
                "}" +
                "\\f0 Hello}";
        RTFState state = processRtf(rtf);
        // fcharset 0 = ANSI = WINDOWS-1252
        assertEquals(RTFCharsetMaps.WINDOWS_1252, state.getFontToCharset().get(0));
        // fcharset 128 = Shift JIS = MS932
        assertEquals(Charset.forName("MS932"), state.getFontToCharset().get(1));
    }

    /** After selecting font f1 in the body, the current charset is the one of f1. */
    @Test
    public void testCurrentCharsetFollowsFont() throws Exception {
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\deff0" +
                "{\\fonttbl" +
                "{\\f0\\froman\\fcharset0 Times;}" +
                "{\\f1\\fnil\\fcharset161 Greek;}" +
                "}" +
                "\\f1 text}";
        // One-element array: a lambda cannot assign to a captured local directly.
        Charset[] charsetAtText = new Charset[1];
        RTFState state = processRtf(rtf, (current, tok, consumed) -> {
            // Capture charset when we see the first body text char
            if (isText(tok, 't') && charsetAtText[0] == null) {
                charsetAtText[0] = current.getCurrentCharset();
            }
        });
        // Verify font table was populated
        assertEquals(2, state.getFontToCharset().size());
        assertEquals(Charset.forName("cp1253"), state.getFontToCharset().get(1));
        // After \f1, charset should be cp1253 (Greek)
        assertNotNull(charsetAtText[0]);
        assertEquals(Charset.forName("cp1253"), charsetAtText[0]);
    }

    /**
     * Checks the current charset after {@code \f0} has been selected in the body.
     *
     * <p>NOTE(review): despite the method name, the document declares
     * {@code \ansicpg1254} while the expected value is WINDOWS-1252, so what is pinned
     * here appears to be that the selected font's fcharset 0 takes precedence over the
     * document-wide code page rather than a fallback to it. The name is kept unchanged.
     */
    @Test
    public void testCurrentCharsetFallsBackToGlobal() throws Exception {
        String rtf = "{\\rtf1\\ansi\\ansicpg1254\\deff0" +
                "{\\fonttbl" +
                "{\\f0\\froman\\fcharset0 Times;}" +
                "}" +
                "\\f0 text}";
        RTFState state = processRtf(rtf);
        // fcharset 0 = WINDOWS-1252 (ANSI)
        assertEquals(RTFCharsetMaps.WINDOWS_1252, state.getCurrentCharset());
    }

    /** Without an explicit font selection in the body, the default font's charset applies. */
    @Test
    public void testDefaultFontCharset() throws Exception {
        // \deff1 sets default font to f1, which maps to fcharset 162 (Turkish = cp1254)
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\deff1" +
                "{\\fonttbl" +
                "{\\f0\\froman\\fcharset0 Times;}" +
                "{\\f1\\fnil\\fcharset162 Arial;}" +
                "}" +
                "\\pard text}";
        RTFState state = processRtf(rtf);
        // No explicit \fN in body, so should fall back to deff1 -> fcharset 162 -> cp1254
        assertEquals(Charset.forName("cp1254"), state.getCurrentCharset());
    }

    /** A uc value set in the parent group is visible inside a nested group. */
    @Test
    public void testUcSkipInherited() throws Exception {
        // RTF uc control word sets skip count to 2, inherited by child groups
        // We process token-by-token and check inside the inner group
        String rtf = "{\\rtf1\\ansi\\uc2{inner}}";
        // -1 doubles as the "not captured yet" marker.
        int[] ucSkipInInnerGroup = {-1};
        processRtf(rtf, (current, tok, consumed) -> {
            // Check ucSkip when we see the first char of "inner"
            if (isText(tok, 'i') && ucSkipInInnerGroup[0] == -1) {
                ucSkipInInnerGroup[0] = current.getCurrentGroup().ucSkip;
            }
        });
        // Inside {inner}, ucSkip should be inherited as 2 from parent
        assertEquals(2, ucSkipInInnerGroup[0]);
    }

    /**
     * The ANSI fallback character that follows a unicode escape must be reported as
     * consumed by the state, so that it does not show up in the extracted text.
     */
    @Test
    public void testAnsiSkipAfterUnicode() throws Exception {
        // After \u8212, the next ucSkip (default 1) ANSI chars should be skipped
        String rtf = "{\\rtf1\\ansi\\ansicpg1252" +
                "{\\fonttbl{\\f0\\fcharset0 Times;}}" +
                "\\f0 A\\u8212\\'97B}";
        StringBuilder textOutput = new StringBuilder();
        processRtf(rtf, (current, tok, consumed) -> {
            // Only tokens the state left unconsumed, in a non-ignored group, are output.
            if (consumed || current.getCurrentGroup().ignore) {
                return;
            }
            if (tok.getType() == RTFTokenType.TEXT) {
                textOutput.append(tok.getChar());
            } else if (tok.getType() == RTFTokenType.UNICODE_ESCAPE) {
                int cp = tok.getParameter();
                if (Character.isValidCodePoint(cp)) {
                    textOutput.appendCodePoint(cp);
                }
            }
        });
        // A + \u8212 (em dash) + B. The \'97 should be skipped as unicode shadow.
        assertEquals("A\u2014B", textOutput.toString());
    }

    /** A font change made inside a group no longer applies once that group is closed. */
    @Test
    public void testGroupStateRestored() throws Exception {
        // Font change inside a group should be reverted when group closes
        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\deff0" +
                "{\\fonttbl" +
                "{\\f0\\fcharset0 Times;}" +
                "{\\f1\\fcharset161 Greek;}" +
                "}" +
                "\\f0 {\\f1 greek}{back to times}}";
        // One-element arrays: a lambda cannot assign to captured locals directly.
        Charset[] charsetInsideGroup = new Charset[1];
        Charset[] charsetAfterGroup = new Charset[1];
        processRtf(rtf, (current, tok, consumed) -> {
            if (tok.getType() != RTFTokenType.TEXT) {
                return;
            }
            char ch = tok.getChar();
            // 'g' first occurs in "greek", 'b' first occurs in "back to times".
            if (ch == 'g' && charsetInsideGroup[0] == null) {
                charsetInsideGroup[0] = current.getCurrentCharset();
            } else if (ch == 'b' && charsetAfterGroup[0] == null) {
                charsetAfterGroup[0] = current.getCurrentCharset();
            }
        });
        assertNotNull(charsetInsideGroup[0]);
        assertNotNull(charsetAfterGroup[0]);
        // Inside the {\f1 ...} group, charset should be Greek (cp1253)
        assertEquals(Charset.forName("cp1253"), charsetInsideGroup[0]);
        // After the group closes, should be back to f0 (WINDOWS-1252)
        assertEquals(RTFCharsetMaps.WINDOWS_1252, charsetAfterGroup[0]);
    }
}