RTFEncapsulatedHTMLExtractor.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.msg;

import java.io.ByteArrayOutputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashMap;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Extracts the original HTML from an RTF document that contains encapsulated HTML
 * (as indicated by the {@code \fromhtml1} control word).
 *
 * <p>The encapsulated HTML format stores HTML in two places:</p>
 * <ol>
 *   <li>{@code {\*\htmltag<N> ...}} groups - contain the HTML markup (tags, style blocks, etc.)</li>
 *   <li>Text between htmltag groups - contains the actual text content, provided it is NOT
 *       wrapped in {@code \htmlrtf ... \htmlrtf0} (which marks RTF-only rendering hints)</li>
 * </ol>
 *
 * <p>Per the MS-OXRTFEX specification, {@code \'xx} hex escapes in inter-tag text are decoded
 * using the code page of the currently selected font ({@code \fN}).  The font-to-charset mapping
 * is built from the RTF font table's {@code \fcharsetN} declarations.  Inside
 * {@code {\*\htmltag}} groups, the document's default code page ({@code \ansicpgN}) is used.</p>
 *
 * <p>The input is treated as untrusted: numeric parameters that cannot be parsed as an
 * {@code int} are ignored rather than raising an exception, and charsets that the running
 * JVM does not provide are simply left out of the {@code \fcharset} mapping.</p>
 */
public class RTFEncapsulatedHTMLExtractor {

    private static final Logger LOGGER = LoggerFactory.getLogger(RTFEncapsulatedHTMLExtractor.class);

    private static final String HTMLTAG_PREFIX = "{\\*\\htmltag";
    private static final String FROM_HTML_MARKER = "\\fromhtml";
    private static final String ANSICPG_PREFIX = "\\ansicpg";
    private static final String HTMLRTF_WORD = "\\htmlrtf";
    private static final String HTMLRTF_END = "\\htmlrtf0";

    // Fallback code page used whenever the document does not name a usable one.
    private static final Charset WINDOWS_1252 = Charset.forName("windows-1252");

    // Maps RTF \fcharset values to Java Charset objects.
    // Based on the Windows CharacterSet enumeration and Tika's TextExtractor.FCHARSET_MAP.
    // Only charsets available in the running JVM are registered; lookups for the
    // others return null, which callers treat as "keep the current charset".
    private static final Map<Integer, Charset> FCHARSET_MAP = new HashMap<>();

    static {
        registerFcharset(0, "windows-1252");    // ANSI
        registerFcharset(77, "MacRoman");       // Mac Roman
        registerFcharset(128, "MS932");         // Shift_JIS (Japanese)
        registerFcharset(129, "ms949");         // Hangul (Korean)
        registerFcharset(130, "x-Johab");       // Johab (Korean)
        registerFcharset(134, "GBK");           // GB2312 (Simplified Chinese)
        registerFcharset(136, "Big5");          // Big5 (Traditional Chinese)
        registerFcharset(161, "windows-1253");  // Greek
        registerFcharset(162, "windows-1254");  // Turkish
        registerFcharset(163, "windows-1258");  // Vietnamese
        registerFcharset(177, "windows-1255");  // Hebrew
        registerFcharset(178, "windows-1256");  // Arabic
        registerFcharset(186, "windows-1257");  // Baltic
        registerFcharset(204, "windows-1251");  // Russian
        registerFcharset(222, "ms874");         // Thai
        registerFcharset(238, "windows-1250");  // Eastern Europe
    }

    /**
     * Extracts the HTML content from an encapsulated-HTML RTF document.
     *
     * <p>Scanning starts at the first {@code {\*\htmltag} group.  Content of htmltag groups
     * is decoded with the document's default code page; text between the groups is decoded
     * with the charset of the currently selected font.  Everything between {@code \htmlrtf}
     * and {@code \htmlrtf0} is suppressed, although brace nesting and {@code \fN} switches
     * inside such regions are still tracked so the current font charset stays in sync.</p>
     *
     * @param rtfBytes the decompressed RTF bytes
     * @return the extracted HTML string, or {@code null} if the RTF does not contain
     *         encapsulated HTML
     */
    public static String extract(byte[] rtfBytes) {
        if (rtfBytes == null || rtfBytes.length == 0) {
            return null;
        }
        // Work with US-ASCII - RTF is 7-bit and non-ASCII bytes are escaped as \'xx
        String rtf = new String(rtfBytes, StandardCharsets.US_ASCII);

        if (!rtf.contains(FROM_HTML_MARKER)) {
            return null;
        }

        Charset defaultCodePage = detectCodePage(rtf);
        Map<Integer, Charset> fontCharsets = parseFontTable(rtf);
        // Track the current font's charset for inter-tag text decoding.
        // The stack mirrors RTF brace nesting so that font switches inside
        // groups (e.g. {\f3 ...}) are automatically unwound on '}'.
        Charset currentFontCharset = defaultCodePage;
        Deque<Charset> charsetStack = new ArrayDeque<>();

        // Find the start of the document body (after the RTF header).
        int bodyStart = rtf.indexOf(HTMLTAG_PREFIX);
        if (bodyStart < 0) {
            return null;
        }

        StringBuilder html = new StringBuilder(rtf.length() / 2);
        // Consecutive \'xx bytes are buffered so multi-byte sequences decode as a unit.
        ByteArrayOutputStream pendingBytes = new ByteArrayOutputStream();
        int pos = bodyStart;
        int len = rtf.length();
        boolean inHtmlRtfSkip = false;

        while (pos < len) {
            // htmltag group: emit its decoded content (even inside an \htmlrtf region)
            if (rtf.startsWith(HTMLTAG_PREFIX, pos)) {
                flushPendingBytes(pendingBytes, html, currentFontCharset);

                int groupEnd = findMatchingBrace(rtf, pos);
                if (groupEnd < 0) {
                    // Unterminated group: stop and return what has been collected so far
                    break;
                }

                // Skip the {\*\htmltag prefix, the tag number and one optional space
                int contentStart = pos + HTMLTAG_PREFIX.length();
                while (contentStart < groupEnd && Character.isDigit(rtf.charAt(contentStart))) {
                    contentStart++;
                }
                if (contentStart < groupEnd && rtf.charAt(contentStart) == ' ') {
                    contentStart++;
                }

                // Decode the htmltag content using default code page per MS-OXRTFEX spec
                decodeRtfEscapes(rtf.substring(contentStart, groupEnd), html, defaultCodePage);

                pos = groupEnd + 1;
                continue;
            }

            // \htmlrtf starts an RTF-only region, \htmlrtf0 ends it
            if (rtf.startsWith(HTMLRTF_WORD, pos)) {
                flushPendingBytes(pendingBytes, html, currentFontCharset);
                int afterWord = pos + HTMLRTF_WORD.length();
                boolean endsRegion = afterWord < len && rtf.charAt(afterWord) == '0';
                if (endsRegion) {
                    afterWord++;
                }
                inHtmlRtfSkip = !endsRegion;
                pos = skipOptionalSpace(rtf, afterWord);
                continue;
            }

            char c = rtf.charAt(pos);

            // Inside \htmlrtf skip blocks: don't emit text, but track brace
            // nesting so that font switches inside groups are properly scoped
            // (pushed on '{', popped on '}').
            if (inHtmlRtfSkip) {
                if (c == '{') {
                    charsetStack.push(currentFontCharset);
                } else if (c == '}') {
                    if (!charsetStack.isEmpty()) {
                        currentFontCharset = charsetStack.pop();
                    }
                } else if (c == '\\' && pos + 1 < len) {
                    char next = rtf.charAt(pos + 1);
                    if (isEscapedLiteral(next)) {
                        // \{ \} and \\ are literal characters: they must neither change
                        // the group nesting nor be mistaken for the start of \fN.
                        pos += 2;
                        continue;
                    }
                    if (next == 'f' && pos + 2 < len && Character.isDigit(rtf.charAt(pos + 2))) {
                        // Track \fN font switches within the current group
                        int numStart = pos + 2;
                        int numEnd = scanDigits(rtf, numStart);
                        // Only honoured when the number is not directly followed by a letter
                        if (numEnd >= len || !Character.isLetter(rtf.charAt(numEnd))) {
                            Integer fontId = parseIntOrNull(rtf, numStart, numEnd);
                            Charset fontCs = fontId == null ? null : fontCharsets.get(fontId);
                            if (fontCs != null) {
                                currentFontCharset = fontCs;
                            }
                        }
                    }
                }
                pos++;
                continue;
            }

            // Other { groups (nested RTF groups that aren't htmltag) are skipped whole
            if (c == '{') {
                flushPendingBytes(pendingBytes, html, currentFontCharset);
                int end = findMatchingBrace(rtf, pos);
                pos = end > 0 ? end + 1 : pos + 1;
                continue;
            }

            // Closing braces carry no content
            if (c == '}') {
                flushPendingBytes(pendingBytes, html, currentFontCharset);
                pos++;
                continue;
            }

            // RTF escapes in inter-tag text
            if (c == '\\' && pos + 1 < len) {
                char next = rtf.charAt(pos + 1);

                // \'xx hex escape - decoded later using the current font's charset
                if (next == '\'' && pos + 3 < len) {
                    bufferHexByte(rtf, pos + 2, pendingBytes);
                    pos += 4;
                    continue;
                }

                // Anything else ends a run of hex bytes; decode it with the charset
                // that was active while the bytes were read (i.e. before any \fN below).
                flushPendingBytes(pendingBytes, html, currentFontCharset);

                if (isEscapedLiteral(next)) {
                    html.append(next);
                    pos += 2;
                    continue;
                }

                if (Character.isLetter(next)) {
                    int wordStart = pos + 1;
                    int wordEnd = scanLetters(rtf, wordStart);
                    String word = rtf.substring(wordStart, wordEnd);
                    int paramEnd = scanNumericParam(rtf, wordEnd);

                    if ("f".equals(word)) {
                        // Font switch in inter-tag text - update current charset
                        Integer fontId = parseIntOrNull(rtf, wordEnd, paramEnd);
                        Charset fontCs = fontId == null ? null : fontCharsets.get(fontId);
                        if (fontCs != null) {
                            currentFontCharset = fontCs;
                        }
                    } else {
                        // Unknown control words produce no output
                        appendControlWordOutput(word, html);
                    }
                    pos = skipOptionalSpace(rtf, paramEnd);
                    continue;
                }

                // Unknown escape - skip the backslash only
                pos++;
                continue;
            }

            // Newlines/carriage returns in RTF are whitespace, not content
            if (c == '\r' || c == '\n') {
                pos++;
                continue;
            }

            // Regular text character between htmltag groups - this is HTML content
            flushPendingBytes(pendingBytes, html, currentFontCharset);
            html.append(c);
            pos++;
        }

        flushPendingBytes(pendingBytes, html, currentFontCharset);

        if (html.length() == 0) {
            return null;
        }
        return html.toString();
    }

    /**
     * Parse the RTF font table to build a mapping from font ID to charset.
     *
     * <p>Each {@code \fN} inside the {@code {\fonttbl ...}} group selects the font being
     * described; a following {@code \fcharsetN} with a known value assigns that font's
     * charset.  Parameters that do not fit an {@code int} are ignored.</p>
     *
     * @param rtf the RTF document text
     * @return font ID to charset; empty if there is no (complete) font table
     */
    static Map<Integer, Charset> parseFontTable(String rtf) {
        Map<Integer, Charset> result = new HashMap<>();
        int fontTblStart = rtf.indexOf("{\\fonttbl");
        if (fontTblStart < 0) {
            return result;
        }
        int fontTblEnd = findMatchingBrace(rtf, fontTblStart);
        if (fontTblEnd < 0) {
            return result;
        }
        String fontTable = rtf.substring(fontTblStart, fontTblEnd + 1);

        int currentFontId = -1;
        int pos = 0;
        int ftLen = fontTable.length();

        while (pos < ftLen) {
            boolean atControlWord = fontTable.charAt(pos) == '\\' && pos + 1 < ftLen
                    && Character.isLetter(fontTable.charAt(pos + 1));
            if (!atControlWord) {
                pos++;
                continue;
            }

            int wordStart = pos + 1;
            int wordEnd = scanLetters(fontTable, wordStart);
            String word = fontTable.substring(wordStart, wordEnd);
            int paramEnd = scanNumericParam(fontTable, wordEnd);

            if (paramEnd > wordEnd) {
                Integer value = parseIntOrNull(fontTable, wordEnd, paramEnd);
                if ("f".equals(word)) {
                    if (value != null) {
                        currentFontId = value;
                    } else {
                        // Unusable font number: make sure a following \fcharset is not
                        // attributed to the previously declared font.
                        currentFontId = -1;
                    }
                } else if ("fcharset".equals(word) && value != null && currentFontId >= 0) {
                    Charset cs = FCHARSET_MAP.get(value);
                    if (cs != null) {
                        result.put(currentFontId, cs);
                    }
                }
            }
            pos = paramEnd;
        }
        return result;
    }

    /**
     * Find the position of the closing brace that matches the opening brace at
     * {@code openPos}.  Handles nested groups and escaped braces.
     *
     * @param rtf     the text to search
     * @param openPos index of the opening '{'
     * @return index of the closing '}', or -1 if not found
     */
    static int findMatchingBrace(String rtf, int openPos) {
        int depth = 0;
        int len = rtf.length();
        for (int i = openPos; i < len; i++) {
            char c = rtf.charAt(i);
            if (c == '\\' && i + 1 < len && isEscapedLiteral(rtf.charAt(i + 1))) {
                // \{ \} and \\ are literals and do not affect nesting
                i++;
                continue;
            }
            if (c == '{') {
                depth++;
            } else if (c == '}') {
                depth--;
                if (depth == 0) {
                    return i;
                }
            }
        }
        return -1;
    }

    /**
     * Decode RTF escapes within an htmltag group's content.
     *
     * <p>Hex escapes are decoded with {@code codePage}; escaped literals are emitted as-is;
     * {@code \par}, {@code \pard}, {@code \tab} and {@code \line} are translated; any
     * {@code \htmlrtf ... \htmlrtf0} region is dropped; other control words and unescaped
     * braces produce no output.  A backslash that is the last character ends decoding.</p>
     *
     * @param content  the group content, without the {@code {\*\htmltagN} prefix
     * @param out      receives the decoded text
     * @param codePage charset used for {@code \'xx} escapes
     */
    static void decodeRtfEscapes(String content, StringBuilder out, Charset codePage) {
        int len = content.length();
        int i = 0;
        ByteArrayOutputStream pendingBytes = new ByteArrayOutputStream();

        while (i < len) {
            char c = content.charAt(i);

            if (c != '\\') {
                flushPendingBytes(pendingBytes, out, codePage);
                // Unescaped braces are RTF structure, not HTML content
                if (c != '{' && c != '}') {
                    out.append(c);
                }
                i++;
                continue;
            }

            if (i + 1 >= len) {
                break;
            }
            char next = content.charAt(i + 1);

            // \'xx hex escape
            if (next == '\'' && i + 3 < len) {
                bufferHexByte(content, i + 2, pendingBytes);
                i += 4;
                continue;
            }

            flushPendingBytes(pendingBytes, out, codePage);

            if (isEscapedLiteral(next)) {
                out.append(next);
                i += 2;
                continue;
            }

            if (Character.isLetter(next)) {
                int wordStart = i + 1;
                int wordEnd = scanLetters(content, wordStart);
                String word = content.substring(wordStart, wordEnd);

                if ("htmlrtf".equals(word)) {
                    // Skip \htmlrtf...\htmlrtf0 inside htmltag groups
                    i = skipHtmlRtfBlock(content, i);
                    continue;
                }

                appendControlWordOutput(word, out);
                i = skipOptionalSpace(content, scanNumericParam(content, wordEnd));
                continue;
            }

            // Unknown escape - skip the backslash only
            i++;
        }

        flushPendingBytes(pendingBytes, out, codePage);
    }

    /**
     * Skip a {@code \htmlrtf ... \htmlrtf0} block within an htmltag group.
     *
     * @param content the string being parsed
     * @param pos     position of the backslash starting {@code \htmlrtf}
     * @return position after the matching {@code \htmlrtf0} (and one optional space), or
     *         {@code content.length()} if the block is never closed
     */
    static int skipHtmlRtfBlock(String content, int pos) {
        int afterWord = pos + HTMLRTF_WORD.length();
        if (afterWord < content.length() && content.charAt(afterWord) == '0') {
            // This is \htmlrtf0 (end marker) - just skip past it
            return skipOptionalSpace(content, afterWord + 1);
        }

        // Skip everything until \htmlrtf0
        int endPos = content.indexOf(HTMLRTF_END, afterWord);
        if (endPos < 0) {
            return content.length();
        }
        return skipOptionalSpace(content, endPos + HTMLRTF_END.length());
    }

    /**
     * Detect the ANSI code page from the RTF header ({@code \ansicpgNNNN}).
     * Code page 65001 is mapped to UTF-8; other values are looked up as
     * {@code windows-NNNN} and then {@code cpNNNN}.
     * Falls back to windows-1252 if not found.
     *
     * @param rtf the RTF document text
     * @return the detected charset, never {@code null}
     */
    static Charset detectCodePage(String rtf) {
        int idx = rtf.indexOf(ANSICPG_PREFIX);
        if (idx < 0) {
            return WINDOWS_1252;
        }
        int numStart = idx + ANSICPG_PREFIX.length();
        int numEnd = scanDigits(rtf, numStart);
        if (numEnd == numStart) {
            return WINDOWS_1252;
        }
        String cpNum = rtf.substring(numStart, numEnd);
        if ("65001".equals(cpNum)) {
            // Windows code page 65001 is UTF-8; Java knows it under neither prefix below
            return StandardCharsets.UTF_8;
        }
        Charset cs = charsetOrNull("windows-" + cpNum);
        if (cs == null) {
            cs = charsetOrNull("cp" + cpNum);
        }
        if (cs == null) {
            LOGGER.debug("Unknown code page {}, falling back to windows-1252", cpNum);
            return WINDOWS_1252;
        }
        return cs;
    }

    /** Adds an {@code \fcharset} mapping if the named charset exists in this JVM. */
    private static void registerFcharset(int fcharset, String charsetName) {
        Charset cs = charsetOrNull(charsetName);
        if (cs != null) {
            FCHARSET_MAP.put(fcharset, cs);
        }
    }

    /** Returns the named charset, or {@code null} if the name is illegal or unsupported. */
    private static Charset charsetOrNull(String name) {
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException e) {
            // Covers both IllegalCharsetNameException and UnsupportedCharsetException
            return null;
        }
    }

    /** Returns true for the characters that a backslash turns into a literal. */
    private static boolean isEscapedLiteral(char c) {
        return c == '\\' || c == '{' || c == '}';
    }

    /** Returns the index of the first non-letter at or after {@code from}. */
    private static int scanLetters(String s, int from) {
        int end = from;
        while (end < s.length() && Character.isLetter(s.charAt(end))) {
            end++;
        }
        return end;
    }

    /** Returns the index of the first non-digit at or after {@code from}. */
    private static int scanDigits(String s, int from) {
        int end = from;
        while (end < s.length() && Character.isDigit(s.charAt(end))) {
            end++;
        }
        return end;
    }

    /**
     * Returns the end of an optional numeric control-word parameter starting at {@code from},
     * or {@code from} itself when there is none.  A '-' is part of the parameter only when
     * a digit follows it; otherwise it is left in place as ordinary text.
     */
    private static int scanNumericParam(String s, int from) {
        int digitsStart = from;
        if (digitsStart < s.length() && s.charAt(digitsStart) == '-') {
            digitsStart++;
        }
        int end = scanDigits(s, digitsStart);
        return end > digitsStart ? end : from;
    }

    /** Skips the single space that may delimit a control word. */
    private static int skipOptionalSpace(String s, int from) {
        return from < s.length() && s.charAt(from) == ' ' ? from + 1 : from;
    }

    /**
     * Parses {@code s[start, end)} as a decimal int; returns {@code null} when the range is
     * empty or the value is not a valid int (e.g. too many digits).
     */
    private static Integer parseIntOrNull(String s, int start, int end) {
        if (end <= start) {
            return null;
        }
        try {
            return Integer.valueOf(s.substring(start, end));
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /**
     * Buffers the byte written as two hex digits at {@code s[at]} and {@code s[at + 1]}.
     * Nothing is buffered if either character is not a hex digit.
     */
    private static void bufferHexByte(String s, int at, ByteArrayOutputStream pending) {
        int hi = Character.digit(s.charAt(at), 16);
        int lo = Character.digit(s.charAt(at + 1), 16);
        if (hi >= 0 && lo >= 0) {
            pending.write((hi << 4) | lo);
        }
    }

    /** Appends the output for the control words that are translated; others add nothing. */
    private static void appendControlWordOutput(String word, StringBuilder out) {
        switch (word) {
            case "par":
            case "pard":
                out.append('\n');
                break;
            case "tab":
                out.append('\t');
                break;
            case "line":
                out.append("<br>");
                break;
            default:
                break;
        }
    }

    /** Decodes any buffered bytes with {@code codePage}, appends them and clears the buffer. */
    private static void flushPendingBytes(ByteArrayOutputStream pending, StringBuilder out,
                                          Charset codePage) {
        if (pending.size() > 0) {
            out.append(new String(pending.toByteArray(), codePage));
            pending.reset();
        }
    }
}