// RTFState.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.rtf.jflex;
import java.nio.charset.Charset;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashMap;
import java.util.Map;
/**
* Shared RTF parsing state: group stack, font table, codepage tracking,
* and unicode skip handling.
*
* <p>Both the HTML decapsulator and the full RTF parser use this class
* to manage the stateful parts of RTF processing.</p>
*
* <p>Typical usage: feed every token to {@link #processToken(RTFToken)}
* and query the current charset via {@link #getCurrentCharset()}.</p>
*/
public class RTFState {

    /**
     * Maximum number of group states kept on the stack. Group opens beyond this
     * limit are not pushed; they are counted so that their matching closes can
     * be consumed without popping a real ancestor.
     */
    private static final int MAX_GROUP_DEPTH = 10_000;

    /** Font table phase: no {@code \fonttbl} seen yet. */
    private static final int FONT_TABLE_NOT_SEEN = 0;
    /** Font table phase: currently inside the {@code \fonttbl} group. */
    private static final int FONT_TABLE_INSIDE = 1;
    /** Font table phase: the {@code \fonttbl} group has been closed. */
    private static final int FONT_TABLE_DONE = 2;

    /** Global charset from {@code \ansicpgN} or charset family selectors. */
    private Charset globalCharset = RTFCharsetMaps.WINDOWS_1252;

    /** Default font ID from {@code \deffN}; -1 when none was declared. */
    private int globalDefaultFont = -1;

    /** Font table: maps font number ({@code \fN}) to charset ({@code \fcharsetN}). */
    private final Map<Integer, Charset> fontToCharset = new HashMap<>();

    /** Group state stack (parents of {@link #current}). */
    private final Deque<RTFGroupState> stack = new ArrayDeque<>();

    /** Current (active) group state. */
    private RTFGroupState current = new RTFGroupState();

    /**
     * Number of group opens that were dropped because the stack was already at
     * {@link #MAX_GROUP_DEPTH}. Each matching close decrements this instead of
     * popping, which keeps opens and closes balanced.
     */
    private long suppressedOpens = 0;

    /** Number of ANSI chars remaining to skip after a unicode escape. */
    private int ansiSkip = 0;

    /** The group state that was current when the most recent GROUP_CLOSE arrived. */
    private RTFGroupState lastClosedGroup;

    /** One of the {@code FONT_TABLE_*} phases. */
    private int fontTableState = FONT_TABLE_NOT_SEEN;

    /** Depth of the group in which {@code \fonttbl} was seen; -1 before that. */
    private int fontTableDepth = -1;

    /** Font number most recently declared inside the font table; -1 before any. */
    private int currentFontId = -1;

    /** True until a body-level control word is seen. */
    private boolean inHeader = true;

    /**
     * Process a single token to update internal state.
     * <p>
     * This handles: group open/close, charset selectors (ansi, ansicpg,
     * deff), font table parsing (fonttbl, f, fcharset),
     * unicode skip tracking (uc), and font changes (f in body).
     * <p>
     * A pending unicode fallback skip is cancelled at every group open or close:
     * the RTF specification treats skippable data as ended before a brace.
     *
     * @param tok the token to apply
     * @return true if the token was consumed by state management (caller should skip it),
     *         false if the caller should also process it
     */
    public boolean processToken(RTFToken tok) {
        switch (tok.getType()) {
            case GROUP_OPEN:
                // A brace terminates any pending fallback skip.
                ansiSkip = 0;
                pushGroup();
                return false;
            case GROUP_CLOSE:
                ansiSkip = 0;
                // Captured before popping so callers can inspect the closing group's flags.
                // For a close that matches a suppressed (over-limit) open this is the
                // still-open group at the depth limit, since no separate state exists.
                lastClosedGroup = current;
                popGroup();
                // Check if we've exited the font table
                if (fontTableState == FONT_TABLE_INSIDE && current.depth < fontTableDepth) {
                    fontTableState = FONT_TABLE_DONE;
                }
                return false;
            case CONTROL_SYMBOL:
                // "\*" marks the enclosing group as an ignorable destination.
                if (tok.getChar() == '*') {
                    current.ignore = true;
                }
                return false;
            case CONTROL_WORD:
                return processControlWord(tok);
            case UNICODE_ESCAPE:
                // After a unicode escape, skip the next ucSkip ANSI chars.
                // Clamp so a negative uc parameter never yields a negative count.
                ansiSkip = Math.max(0, current.ucSkip);
                return false;
            case HEX_ESCAPE:
                // If we're in the ANSI shadow of a unicode escape, skip this byte
                if (ansiSkip > 0) {
                    ansiSkip--;
                    return true; // consumed -- caller should ignore
                }
                return false;
            case TEXT:
                // If we're in the ANSI shadow, skip text chars.
                // NOTE(review): assumes each TEXT token carries exactly one char -- confirm
                // against the tokenizer.
                if (ansiSkip > 0) {
                    ansiSkip--;
                    return true;
                }
                return false;
            default:
                return false;
        }
    }

    /**
     * Applies a control word to the parsing state. Checks run in a fixed order:
     * global charset selectors, font table handling, unicode skip count, body
     * font change, header-ending detection, object/picture flags, and finally
     * ignorable header destinations.
     *
     * @param tok a CONTROL_WORD token
     * @return true if the word was fully handled here, false if the caller
     *         should also look at it
     */
    private boolean processControlWord(RTFToken tok) {
        String name = tok.getName();
        boolean hasParam = tok.hasParameter();
        int param = tok.getParameter();

        // Global charset selectors (header)
        switch (name) {
            case "ansi":
                globalCharset = RTFCharsetMaps.WINDOWS_1252;
                return true;
            case "pca":
                globalCharset = RTFCharsetMaps.getCharset("cp850");
                return true;
            case "pc":
                globalCharset = RTFCharsetMaps.getCharset("cp437");
                return true;
            case "mac":
                globalCharset = RTFCharsetMaps.getCharset("MacRoman");
                return true;
            case "ansicpg":
                if (hasParam) {
                    // Prefer the explicit code page table, then fall back to generic resolution.
                    Charset mapped = RTFCharsetMaps.ANSICPG_MAP.get(param);
                    globalCharset = (mapped != null)
                            ? mapped
                            : RTFCharsetMaps.resolveCodePage(param);
                }
                return true;
            case "deff":
                if (hasParam) {
                    globalDefaultFont = param;
                }
                return true;
            default:
                break;
        }

        // Font table management
        if ("fonttbl".equals(name)) {
            fontTableState = FONT_TABLE_INSIDE;
            fontTableDepth = current.depth;
            current.ignore = true;
            return true;
        }

        if (fontTableState == FONT_TABLE_INSIDE) {
            if (current.depth < fontTableDepth) {
                // We are already above the font table's group: it has ended.
                fontTableState = FONT_TABLE_DONE;
            } else if ("f".equals(name) && hasParam) {
                // Start of a font definition; remember which font the next fcharset belongs to.
                currentFontId = param;
                return true;
            } else if ("fcharset".equals(name) && hasParam) {
                Charset cs = RTFCharsetMaps.FCHARSET_MAP.get(param);
                if (cs != null) {
                    fontToCharset.put(currentFontId, cs);
                }
                return true;
            }
        }

        // Unicode skip count
        if ("uc".equals(name) && hasParam) {
            current.ucSkip = param;
            return true;
        }

        // Font change in body
        if ("f".equals(name) && hasParam) {
            current.fontId = param;
            current.fontCharset = fontToCharset.get(param); // may be null
            // If we've seen the font table and this is a body font change,
            // we're out of the header
            if (fontTableState == FONT_TABLE_DONE && !current.ignore) {
                inHeader = false;
            }
            return false; // caller may also want to know about font changes
        }

        // Header-ending control words
        if (inHeader && !current.ignore && isHeaderEndingWord(name)) {
            inHeader = false;
        }

        // Embedded object / picture control words
        switch (name) {
            case "object":
                current.object = true;
                return false; // caller may want to know
            case "objdata":
                current.objdata = true;
                return false;
            case "pict":
                current.pictDepth = 1;
                return false;
            case "sp":
                current.sp = true;
                return false;
            case "sn":
                current.sn = true;
                return false;
            case "sv":
                current.sv = true;
                return false;
            case "wbitmap":
                return false; // caller handles
            default:
                break;
        }

        // Ignorable destinations
        if (inHeader && ("colortbl".equals(name) || "stylesheet".equals(name))) {
            current.ignore = true;
            return true;
        }
        return false;
    }

    /** Returns true for control words whose appearance outside an ignored group ends the header. */
    private static boolean isHeaderEndingWord(String name) {
        switch (name) {
            case "par":
            case "pard":
            case "sect":
            case "sectd":
            case "plain":
            case "ltrch":
            case "rtlch":
            case "htmlrtf":
            case "line":
                return true;
            default:
                return false;
        }
    }

    /**
     * Open a new group: push current state and create a child.
     * <p>
     * Once {@code MAX_GROUP_DEPTH} states are stacked, further opens are not
     * pushed (their content is treated as part of the current group). They are
     * counted so that {@link #popGroup()} can discard the matching closes.
     */
    public void pushGroup() {
        if (stack.size() >= MAX_GROUP_DEPTH) {
            // Silently ignore -- treat further { as flat content, but remember
            // it so the matching } does not pop a real ancestor.
            suppressedOpens++;
            return;
        }
        stack.push(current);
        current = new RTFGroupState(current);
    }

    /**
     * Close the current group: pop and restore the parent state.
     * <p>
     * A close that matches an open suppressed by the depth limit only
     * decrements the suppressed-open count. A close with nothing on the stack
     * is ignored.
     */
    public void popGroup() {
        if (suppressedOpens > 0) {
            suppressedOpens--;
            return;
        }
        if (!stack.isEmpty()) {
            current = stack.pop();
        }
    }

    /**
     * Returns the charset that should be used to decode the current hex escape
     * or text byte. Priority:
     * <ol>
     * <li>Font-specific charset (from {@code \fN} with {@code \fcharsetN})</li>
     * <li>Global default font's charset (from {@code \deffN}), consulted only
     * once the header has ended</li>
     * <li>Global charset (from {@code \ansicpgN} or family selector)</li>
     * </ol>
     */
    public Charset getCurrentCharset() {
        if (current.fontCharset != null) {
            return current.fontCharset;
        }
        if (globalDefaultFont != -1 && !inHeader) {
            Charset defaultFontCharset = fontToCharset.get(globalDefaultFont);
            if (defaultFontCharset != null) {
                return defaultFontCharset;
            }
        }
        return globalCharset;
    }

    /** Returns the global charset ({@code \ansicpgN}). */
    public Charset getGlobalCharset() {
        return globalCharset;
    }

    /** Returns the current group state. */
    public RTFGroupState getCurrentGroup() {
        return current;
    }

    /** Returns true if we're still in the RTF header (before body content). */
    public boolean isInHeader() {
        return inHeader;
    }

    /** Returns the current group nesting depth. */
    public int getDepth() {
        return current.depth;
    }

    /** Returns the font-to-charset mapping table (the live internal map, not a copy). */
    public Map<Integer, Charset> getFontToCharset() {
        return fontToCharset;
    }

    /** Returns the number of ANSI chars remaining to skip; never negative. */
    public int getAnsiSkip() {
        return ansiSkip;
    }

    /**
     * Returns the group state that was just closed on the most recent GROUP_CLOSE.
     * This is the child group's state before it was popped.
     * Useful for checking flags like objdata, pictDepth, sn, sv, sp, object
     * to trigger completion handlers. Returns null until the first GROUP_CLOSE
     * has been processed.
     */
    public RTFGroupState getLastClosedGroup() {
        return lastClosedGroup;
    }
}