OOXMLWordAndPowerPointTextHandler.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.ooxml;


import java.util.Date;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.utils.DateUtils;

/**
 * This class is intended to handle anything that might contain IBodyElements:
 * main document, headers, footers, notes, slides, etc.
 * <p>
 * This class does not generally check for namespaces, and it can be applied
 * to PPTX and DOCX for text extraction.
 * <p>
 * This can also be used to scrape content from charts.  It currently ignores
 * formula (&lt;c:f/&gt;) elements.
 * <p>
 * This does not work with .xlsx or .vsdx.
 * <p>
 * TODO: move this into POI?
 */

public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {


    // WordprocessingML main namespace; used for namespace-qualified attribute lookups
    // (val, id, name, anchor, author, date, fldCharType).
    public final static String W_NS =
            "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    // ---- Element local names matched by the SAX callbacks ----
    // Most of these are compared against localName only, without a namespace check.
    private final static String R = "r";
    private final static String FLD = "fld";
    private final static String RPR = "rPr";
    private final static String P = "p";
    private final static String P_STYLE = "pStyle";
    private final static String PPR = "pPr";
    private final static String T = "t";
    private final static String TAB = "tab";
    private final static String B = "b";
    private final static String ILVL = "ilvl";
    private final static String NUM_ID = "numId";
    private final static String TC = "tc";
    private final static String TR = "tr";
    private final static String I = "i";
    private final static String U = "u";
    private final static String STRIKE = "strike";
    private final static String NUM_PR = "numPr";
    private final static String BR = "br";
    private final static String HYPERLINK = "hyperlink";
    private final static String HLINK_CLICK = "hlinkClick"; //pptx hlink
    private final static String TBL = "tbl";
    private final static String PIC = "pic";
    private final static String PICT = "pict";
    private final static String IMAGEDATA = "imagedata";
    private final static String BLIP = "blip";
    // markup-compatibility AlternateContent branches
    private final static String CHOICE = "Choice";
    private final static String FALLBACK = "Fallback";
    private final static String OLE_OBJECT = "OLEObject";
    private final static String CR = "cr";
    private final static String V = "v";
    private final static String RUBY = "ruby"; //phonetic section
    private final static String RT = "rt"; //phonetic run
    // attribute local name carrying the value of simple property elements
    private static final String VAL = "val";
    // pptx slide root element and its visibility attribute
    private static final String SLIDE = "sld";
    private static final String SHOW = "show";

    // ---- Namespace URIs ----
    private final static String MC_NS =
            "http://schemas.openxmlformats.org/markup-compatibility/2006";
    private final static String O_NS = "urn:schemas-microsoft-com:office:office";
    private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
    private final static String DRAWING_MAIN_NS =
            "http://schemas.openxmlformats.org/drawingml/2006/main";
    private final static String V_NS = "urn:schemas-microsoft-com:vml";
    private final static String C_NS = "http://schemas.openxmlformats.org/drawingml/2006/chart";
    private final static String OFFICE_DOC_RELATIONSHIP_NS =
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships";

    // ---- Characters appended to the run buffer for tab / break elements ----
    private final static char[] TAB_CHAR = new char[]{'\t'};
    private final static char NEWLINE = '\n';

    // ---- More element local names: bookmarks, notes, tracked changes, fields ----
    private final static String BOOKMARK_START = "bookmarkStart";
    private final static String BOOKMARK_END = "bookmarkEnd";
    private final static String FOOTNOTE_REFERENCE = "footnoteReference";
    private final static String INS = "ins";
    private final static String DEL = "del";
    private final static String DEL_TEXT = "delText";
    private final static String MOVE_FROM = "moveFrom";
    private final static String MOVE_TO = "moveTo";
    private final static String ENDNOTE_REFERENCE = "endnoteReference";
    private static final String TEXTBOX = "textbox";
    // complex-field markers: fldChar (begin/separate/end) and the instruction text between them
    private final static String FLD_CHAR = "fldChar";
    private final static String INSTR_TEXT = "instrText";
    private final static String FLD_CHAR_TYPE = "fldCharType";
    // DrawingML hyperlinks on shapes/pictures
    private final static String HLINK_HOVER = "hlinkHover";
    private final static String C_NV_PR = "cNvPr";
    // VML shape hyperlinks
    private final static String SHAPE = "shape";
    private final static String HREF = "href";

    // Patterns for extracting URLs from field codes.
    // Each captures the first double-quoted argument as group(1). All quantifiers are
    // bounded ({1,100}, {1,10000}) rather than open-ended -- presumably to cap the work
    // done on hostile input; confirm before loosening them.
    private static final Pattern HYPERLINK_PATTERN =
            Pattern.compile("HYPERLINK\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE);
    private static final Pattern INCLUDEPICTURE_PATTERN =
            Pattern.compile("INCLUDEPICTURE\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE);
    private static final Pattern INCLUDETEXT_PATTERN =
            Pattern.compile("INCLUDETEXT\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE);
    private static final Pattern IMPORT_PATTERN =
            Pattern.compile("IMPORT\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE);
    // LINK takes an extra unquoted token (word characters and dots) before the quoted target
    private static final Pattern LINK_PATTERN =
            Pattern.compile("LINK\\s{1,100}[\\w.]{1,50}\\s{1,100}\"([^\"]{1,10000})\"", Pattern.CASE_INSENSITIVE);
    // receiver of all content events produced by this handler
    private final XWPFBodyContentsHandler bodyContentsHandler;
    // lookup from relationship id (r:id / r:embed) to its target; used for hyperlinks and pictures
    private final Map<String, String> linkedRelationships;
    // properties of the run / paragraph currently being read; reused across runs and paragraphs
    private final RunProperties currRunProperties = new RunProperties();
    private final ParagraphProperties currPProperties = new ParagraphProperties();
    // when false, everything inside a textbox element is skipped
    private final boolean includeTextBox;
    // when true, buffered phonetic (rt) text is emitted in parentheses at the end of a ruby
    private final boolean concatenatePhoneticRuns;
    // optional sink for "has linked/hover/field/vml hyperlink" flags; may be null
    private final Metadata metadata;
    // text accumulated for the current run
    private final StringBuilder runBuffer = new StringBuilder();
    //buffers rt in ruby sections (see 17.3.3.25)
    private final StringBuilder rubyBuffer = new StringBuilder();
    //in run or in field. TODO: convert this to an integer because you can have a run within a run
    private boolean inR = false;
    // inside a text (t) element
    private boolean inT = false;
    // inside run properties (rPr)
    private boolean inRPr = false;
    // inside numbering properties (numPr)
    private boolean inNumPr = false;
    // inside a phonetic run (rt); redirects buffered text to rubyBuffer
    private boolean inRt = false;
    private boolean inPic = false;
    private boolean inPict = false;
    // picture state collected between the start of a picture and handlePict()
    private String picDescription = null;
    private String picRId = null;
    private String picFilename = null;
    //mechanism used to determine when to
    //signal the start of the p, and still
    //handle p with pPr and those without
    private boolean lastStartElementWasP = false;
    //have we signaled the start of a p?
    //pPr can happen multiple times within a p
    //<p><pPr/><r><t>text</t></r><pPr></p>
    private boolean pStarted = false;
    //alternate content can be embedded in itself.
    //need to track depth.
    //if in alternate, choose fallback, maybe make this configurable?
    private int inACChoiceDepth = 0;
    // tracked for symmetry with the Choice depth; only incremented/decremented in this class
    private int inACFallbackDepth = 0;
    // inside a delText element; consulted when handling ignorable whitespace
    private boolean inDelText = false;
    // a pptx hlinkClick hyperlink has been started and is closed at the end of the run
    private boolean inHlinkClick = false;
    // inside a textbox that is being skipped because includeTextBox is false
    private boolean inTextBox = false;
    private boolean inV = false; //in c:v in chart file
    // Field code tracking for instrText-based hyperlinks
    // inField: between fldChar "begin" and "end"
    private boolean inField = false;
    // inInstrText: inside an instrText element
    private boolean inInstrText = false;
    // inFieldHyperlink: a HYPERLINK field started a hyperlink that must be closed at fldChar "end"
    private boolean inFieldHyperlink = false;
    // instruction text accumulated for the current field
    private final StringBuilder instrTextBuffer = new StringBuilder();
    // kind of tracked-change section currently open; NONE when outside of one
    private OOXMLWordAndPowerPointTextHandler.EditType editType =
            OOXMLWordAndPowerPointTextHandler.EditType.NONE;
    // parses the date attribute of tracked-change elements
    private DateUtils dateUtils = new DateUtils();

    // set when the slide root element carries show="0" or show="false"
    private boolean hiddenSlide = false;

    /**
     * Creates a handler that includes text box content, concatenates phonetic
     * runs, and has no metadata sink.
     *
     * @param bodyContentsHandler receiver of the content events
     * @param hyperlinks          relationship id to target lookup
     */
    public OOXMLWordAndPowerPointTextHandler(
            XWPFBodyContentsHandler bodyContentsHandler,
            Map<String, String> hyperlinks) {
        this(bodyContentsHandler, hyperlinks, true, true, null);
    }

    /**
     * Creates a handler with explicit text box and phonetic-run settings and
     * no metadata sink.
     *
     * @param bodyContentsHandler     receiver of the content events
     * @param hyperlinks              relationship id to target lookup
     * @param includeTextBox          whether content inside textbox elements is processed
     * @param concatenatePhoneticRuns whether phonetic (rt) text is emitted after a ruby
     */
    public OOXMLWordAndPowerPointTextHandler(
            XWPFBodyContentsHandler bodyContentsHandler,
            Map<String, String> hyperlinks,
            boolean includeTextBox,
            boolean concatenatePhoneticRuns) {
        this(bodyContentsHandler, hyperlinks, includeTextBox, concatenatePhoneticRuns, null);
    }

    /**
     * Creates a fully configured handler.
     *
     * @param bodyContentsHandler     receiver of the content events
     * @param hyperlinks              relationship id to target lookup
     * @param includeTextBox          whether content inside textbox elements is processed
     * @param concatenatePhoneticRuns whether phonetic (rt) text is emitted after a ruby
     * @param metadata                sink for link-related flags; may be {@code null},
     *                                in which case no flags are recorded
     */
    public OOXMLWordAndPowerPointTextHandler(
            XWPFBodyContentsHandler bodyContentsHandler,
            Map<String, String> hyperlinks,
            boolean includeTextBox,
            boolean concatenatePhoneticRuns,
            Metadata metadata) {
        // collaborators
        this.bodyContentsHandler = bodyContentsHandler;
        this.linkedRelationships = hyperlinks;
        this.metadata = metadata;
        // extraction options
        this.includeTextBox = includeTextBox;
        this.concatenatePhoneticRuns = concatenatePhoneticRuns;
    }

    /** No-op: this handler keeps no per-document setup. */
    @Override
    public void startDocument() throws SAXException {
        // intentionally empty
    }

    /** No-op: nothing is flushed or finalized at the end of the document. */
    @Override
    public void endDocument() throws SAXException {
        // intentionally empty
    }

    /** No-op: prefix mappings are not tracked by this handler. */
    @Override
    public void startPrefixMapping(String prefix, String uri) throws SAXException {
        // intentionally empty
    }

    /** No-op: prefix mappings are not tracked by this handler. */
    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
        // intentionally empty
    }

    /**
     * Handles the opening tag of an element by dispatching on its local name.
     * <p>
     * Depending on the element this updates the parsing state flags, appends
     * tab/newline characters to the run buffer, fills in the current run or
     * paragraph properties, or forwards an event to the body contents handler.
     * Elements inside a markup-compatibility {@code Choice} are ignored, as are
     * elements inside a textbox when text boxes are excluded.
     *
     * @param uri       namespace URI of the element
     * @param localName local name of the element
     * @param qName     qualified name of the element (unused)
     * @param atts      attributes of the element
     * @throws SAXException if the body contents handler throws
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException {
        //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd

        // A <p> whose first child is not <pPr> has no paragraph properties to wait for,
        // so the start of the paragraph is signalled now.
        if (lastStartElementWasP && !PPR.equals(localName)) {
            bodyContentsHandler.startParagraph(currPProperties);
        }

        lastStartElementWasP = false;

        if (MC_NS.equals(uri)) {
            if (CHOICE.equals(localName)) {
                inACChoiceDepth++;
            } else if (FALLBACK.equals(localName)) {
                inACFallbackDepth++;
            }
        }

        // everything inside a Choice branch is skipped
        if (inACChoiceDepth > 0) {
            return;
        }

        if (!includeTextBox && TEXTBOX.equals(localName)) {
            inTextBox = true;
            return;
        }
        //these are sorted descending by frequency within docx files
        //in our regression corpus.
        //yes, I know, likely premature optimization...
        if (RPR.equals(localName)) {
            inRPr = true;
        } else if (R.equals(localName)) {
            inR = true;
        } else if (T.equals(localName)) {
            inT = true;
        } else if (TAB.equals(localName)) {
            runBuffer.append(TAB_CHAR);
        } else if (P.equals(localName)) {
            lastStartElementWasP = true;
        } else if (B.equals(localName)) { //TODO: add bCs
            if (inR && inRPr) {
                currRunProperties.setBold(true);
            }
        } else if (TC.equals(localName)) {
            bodyContentsHandler.startTableCell();
        } else if (P_STYLE.equals(localName)) {
            currPProperties.setStyleID(atts.getValue(W_NS, VAL));
        } else if (I.equals(localName)) { //TODO: add iCs
            //rprs don't have to be inR; ignore those that aren't
            if (inR && inRPr) {
                currRunProperties.setItalics(true);
            }
        } else if (STRIKE.equals(localName)) {
            if (inR && inRPr) {
                currRunProperties.setStrike(true);
            }
        } else if (U.equals(localName)) {
            if (inR && inRPr) {
                currRunProperties.setUnderline(getStringVal(atts));
            }
        } else if (TR.equals(localName)) {
            bodyContentsHandler.startTableRow();
        } else if (NUM_PR.equals(localName)) {
            inNumPr = true;
        } else if (ILVL.equals(localName)) {
            if (inNumPr) {
                currPProperties.setIlvl(getIntVal(atts));
            }
        } else if (NUM_ID.equals(localName)) {
            if (inNumPr) {
                currPProperties.setNumId(getIntVal(atts));
            }
        } else if (BR.equals(localName)) {
            runBuffer.append(NEWLINE);
        } else if (BOOKMARK_START.equals(localName)) {
            String name = atts.getValue(W_NS, "name");
            String id = atts.getValue(W_NS, "id");
            bodyContentsHandler.startBookmark(id, name);
        } else if (BOOKMARK_END.equals(localName)) {
            bodyContentsHandler.endBookmark(atts.getValue(W_NS, "id"));
        } else if (HYPERLINK.equals(localName)) { //docx hyperlink
            startDocxHyperlink(atts);
        } else if (HLINK_CLICK.equals(localName)) { //pptx hyperlink
            startPptxHyperlink(atts);
        } else if (TBL.equals(localName)) {
            bodyContentsHandler.startTable();
        } else if (BLIP.equals(localName)) { //check for DRAWING_NS
            picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
        } else if (C_NV_PR.equals(localName)) { //check for PIC_NS?
            picDescription = atts.getValue("", "descr");
        } else if (PIC.equals(localName)) {
            inPic = true; //check for PIC_NS?
        } //TODO: add sdt, sdtPr, sdtContent goes here statistically
        else if (FOOTNOTE_REFERENCE.equals(localName)) {
            bodyContentsHandler.footnoteReference(atts.getValue(W_NS, "id"));
        } else if (IMAGEDATA.equals(localName)) {
            picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
            picDescription = atts.getValue(O_NS, "title");
        } else if (INS.equals(localName)) {
            startEditedSection(EditType.INSERT, atts);
        } else if (DEL_TEXT.equals(localName)) {
            inDelText = true;
        } else if (DEL.equals(localName)) {
            startEditedSection(EditType.DELETE, atts);
        } else if (MOVE_TO.equals(localName)) {
            startEditedSection(EditType.MOVE_TO, atts);
        } else if (MOVE_FROM.equals(localName)) {
            startEditedSection(EditType.MOVE_FROM, atts);
        } else if (OLE_OBJECT.equals(localName)) { //check for O_NS?
            handleOleObject(atts);
        } else if (CR.equals(localName)) {
            runBuffer.append(NEWLINE);
        } else if (ENDNOTE_REFERENCE.equals(localName)) {
            bodyContentsHandler.endnoteReference(atts.getValue(W_NS, "id"));
        } else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a chart
            inV = true;
        } else if (RT.equals(localName)) {
            inRt = true;
        } else if (SLIDE.equals(localName)) {
            String show = atts.getValue(SHOW);
            if ("0".equals(show) || "false".equals(show)) {
                hiddenSlide = true;
            }
        } else if (FLD_CHAR.equals(localName)) {
            handleFldChar(atts);
        } else if (INSTR_TEXT.equals(localName)) {
            inInstrText = true;
        } else if (HLINK_HOVER.equals(localName)) {
            handleHlinkHover(atts);
        } else if (SHAPE.equals(localName) && V_NS.equals(uri)) {
            handleVmlShape(atts);
        }

    }

    /**
     * Starts a docx hyperlink: the relationship target when an r:id is present,
     * otherwise the w:anchor prefixed with '#' (or null when neither is present).
     */
    private void startDocxHyperlink(Attributes atts) throws SAXException {
        String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
        if (hyperlinkId != null) {
            bodyContentsHandler.hyperlinkStart(linkedRelationships.get(hyperlinkId));
            return;
        }
        String anchor = atts.getValue(W_NS, "anchor");
        if (anchor != null) {
            anchor = "#" + anchor;
        }
        bodyContentsHandler.hyperlinkStart(anchor);
    }

    /**
     * Starts a pptx hlinkClick hyperlink when it carries an r:id; the link is
     * closed again at the end of the enclosing run.
     */
    private void startPptxHyperlink(Attributes atts) throws SAXException {
        String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
        if (hyperlinkId != null) {
            bodyContentsHandler.hyperlinkStart(linkedRelationships.get(hyperlinkId));
            inHlinkClick = true;
        }
    }

    /**
     * Reports an OLEObject as embedded or linked according to its Type attribute;
     * linked objects are additionally flagged in the metadata.
     */
    private void handleOleObject(Attributes atts) throws SAXException {
        String type = null;
        String refId = null;
        //TODO: clean this up and ...want to get ProgID?
        for (int i = 0; i < atts.getLength(); i++) {
            String attLocalName = atts.getLocalName(i);
            if ("Type".equals(attLocalName)) {
                type = atts.getValue(i);
            } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) &&
                    "id".equals(attLocalName)) {
                refId = atts.getValue(i);
            }
        }
        if ("Embed".equals(type)) {
            bodyContentsHandler.embeddedOLERef(refId);
        } else if ("Link".equals(type)) {
            // Linked OLE object - references external file
            bodyContentsHandler.linkedOLERef(refId);
            if (metadata != null) {
                metadata.set(Office.HAS_LINKED_OLE_OBJECTS, true);
            }
        }
    }

    /**
     * Tracks complex-field state: "begin" starts collecting instruction text,
     * "separate" interprets what was collected, "end" closes any field hyperlink.
     */
    private void handleFldChar(Attributes atts) throws SAXException {
        String fldCharType = atts.getValue(W_NS, FLD_CHAR_TYPE);
        if ("begin".equals(fldCharType)) {
            inField = true;
            instrTextBuffer.setLength(0);
        } else if ("separate".equals(fldCharType)) {
            reportFieldInstruction(instrTextBuffer.toString());
        } else if ("end".equals(fldCharType)) {
            if (inFieldHyperlink) {
                bodyContentsHandler.hyperlinkEnd();
                inFieldHyperlink = false;
            }
            inField = false;
            instrTextBuffer.setLength(0);
        }
    }

    /**
     * Interprets collected field instruction text: a HYPERLINK starts a hyperlink,
     * any other recognized external-reference field is reported via externalRef.
     */
    private void reportFieldInstruction(String instruction) throws SAXException {
        String url = parseHyperlinkFromInstrText(instruction);
        if (url != null) {
            bodyContentsHandler.hyperlinkStart(url);
            inFieldHyperlink = true;
            if (metadata != null) {
                metadata.set(Office.HAS_FIELD_HYPERLINKS, true);
            }
            return;
        }
        // Check for external reference fields (INCLUDEPICTURE, INCLUDETEXT, etc.)
        StringBuilder fieldType = new StringBuilder();
        String extUrl = parseExternalRefFromInstrText(instruction, fieldType);
        if (extUrl != null) {
            bodyContentsHandler.externalRef(fieldType.toString(), extUrl);
            if (metadata != null) {
                metadata.set(Office.HAS_FIELD_HYPERLINKS, true);
            }
        }
    }

    /** Reports a DrawingML hover hyperlink whose r:id resolves to a known target. */
    private void handleHlinkHover(Attributes atts) throws SAXException {
        String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
        if (hyperlinkId == null) {
            return;
        }
        String hyperlink = linkedRelationships.get(hyperlinkId);
        if (hyperlink != null) {
            bodyContentsHandler.externalRef("hlinkHover", hyperlink);
            if (metadata != null) {
                metadata.set(Office.HAS_HOVER_HYPERLINKS, true);
            }
        }
    }

    /** Reports a non-empty href (plain or o:href) found on a VML shape. */
    private void handleVmlShape(Attributes atts) throws SAXException {
        String href = atts.getValue(HREF);
        if (href == null) {
            href = atts.getValue(O_NS, HREF);
        }
        if (href != null && !href.isEmpty()) {
            bodyContentsHandler.externalRef("vml-shape-href", href);
            if (metadata != null) {
                metadata.set(Office.HAS_VML_HYPERLINKS, true);
            }
        }
    }

    /**
     * Announces the start of a tracked-change section to the body contents
     * handler and remembers its type as the current edit type.
     *
     * @param editType kind of tracked change being opened
     * @param atts     attributes of the element; w:author and w:date are read
     * @throws SAXException if the body contents handler throws
     */
    private void startEditedSection(EditType editType, Attributes atts) throws SAXException {
        final String author = atts.getValue(W_NS, "author");
        final String rawDate = atts.getValue(W_NS, "date");
        // a missing date attribute is passed on as null
        final Date parsedDate = (rawDate == null) ? null : dateUtils.tryToParse(rawDate);
        bodyContentsHandler.startEditedSection(author, parsedDate, editType);
        this.editType = editType;
    }

    /**
     * Reads the w:val attribute as a string.
     *
     * @param atts attributes to read from
     * @return the attribute value, or the empty string when it is absent
     */
    private String getStringVal(Attributes atts) {
        final String value = atts.getValue(W_NS, VAL);
        return (value == null) ? "" : value;
    }

    /**
     * Reads the w:val attribute as an integer.
     *
     * @param atts attributes to read from
     * @return the parsed value, or -1 when the attribute is absent or not an integer
     */
    private int getIntVal(Attributes atts) {
        final String raw = atts.getValue(W_NS, VAL);
        if (raw == null) {
            return -1;
        }
        try {
            return Integer.parseInt(raw);
        } catch (NumberFormatException ignored) {
            // malformed numbers are treated the same as a missing attribute
            return -1;
        }
    }

    /**
     * Extracts the target of a HYPERLINK field from accumulated instrText,
     * e.g. {@code HYPERLINK "https://example.com"}.
     *
     * @param instrText the accumulated instrText content; may be null
     * @return the quoted target following HYPERLINK, or null when the text is
     *         null, empty, or does not match
     */
    private String parseHyperlinkFromInstrText(String instrText) {
        if (instrText == null || instrText.isEmpty()) {
            return null;
        }
        final Matcher matcher = HYPERLINK_PATTERN.matcher(instrText.trim());
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Extracts the target of an external-reference field from accumulated
     * instrText. INCLUDEPICTURE, INCLUDETEXT, IMPORT and LINK are tried in
     * that order and the first match wins.
     *
     * @param instrText the accumulated instrText content; may be null
     * @param fieldType output parameter - the name of the matching field type
     *                  is appended to it; left untouched when nothing matches
     * @return the quoted target of the first matching field, or null
     */
    private String parseExternalRefFromInstrText(String instrText, StringBuilder fieldType) {
        if (instrText == null || instrText.isEmpty()) {
            return null;
        }
        final String candidate = instrText.trim();

        // parallel arrays: patterns[k] is reported under names[k]
        final Pattern[] patterns = {
                INCLUDEPICTURE_PATTERN, INCLUDETEXT_PATTERN, IMPORT_PATTERN, LINK_PATTERN
        };
        final String[] names = {"INCLUDEPICTURE", "INCLUDETEXT", "IMPORT", "LINK"};

        for (int k = 0; k < patterns.length; k++) {
            final Matcher matcher = patterns[k].matcher(candidate);
            if (matcher.find()) {
                fieldType.append(names[k]);
                return matcher.group(1);
            }
        }
        return null;
    }

    /**
     * Handles the closing tag of an element: clears the matching state flag,
     * flushes buffered run/ruby/picture content, or forwards the matching
     * end event to the body contents handler.
     * <p>
     * Closing tags inside a markup-compatibility {@code Choice} are ignored,
     * as is the closing textbox tag when text boxes are excluded.
     *
     * @param uri       namespace URI of the element
     * @param localName local name of the element
     * @param qName     qualified name of the element (unused)
     * @throws SAXException if the body contents handler throws
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {

        // startElement only counts Choice/Fallback in the markup-compatibility
        // namespace; apply the same test here so that a same-named element from
        // another namespace cannot drive the depth counters negative.
        if (MC_NS.equals(uri)) {
            if (CHOICE.equals(localName)) {
                inACChoiceDepth--;
            } else if (FALLBACK.equals(localName)) {
                inACFallbackDepth--;
            }
        }
        if (inACChoiceDepth > 0) {
            return;
        }

        if (!includeTextBox && TEXTBOX.equals(localName)) {
            inTextBox = false;
            return;
        }
        if (PIC.equals(localName)) { //PIC_NS
            handlePict();
            inPic = false;
            return;
        } else if (RPR.equals(localName)) {
            inRPr = false;
        } else if (R.equals(localName)) {
            handleEndOfRun();
        } else if (T.equals(localName)) {
            inT = false;
        } else if (PPR.equals(localName)) {
            // the paragraph properties are complete; signal the paragraph once
            if (!pStarted) {
                bodyContentsHandler.startParagraph(currPProperties);
                pStarted = true;
            }
            currPProperties.reset();
        } else if (P.equals(localName)) {
            if (runBuffer.length() > 0) {
                //<p><tab></p>...this will treat that as if it were
                //a run...TODO: should we swallow whitespace that doesn't occur in a run?
                bodyContentsHandler.run(currRunProperties, runBuffer.toString());
                runBuffer.setLength(0);
            }
            pStarted = false;
            bodyContentsHandler.endParagraph();
        } else if (TC.equals(localName)) {
            bodyContentsHandler.endTableCell();
        } else if (TR.equals(localName)) {
            bodyContentsHandler.endTableRow();
        } else if (TBL.equals(localName)) {
            bodyContentsHandler.endTable();
        } else if (FLD.equals(localName)) {
            handleEndOfRun();
        } else if (DEL_TEXT.equals(localName)) {
            inDelText = false;
        } else if (INS.equals(localName) || DEL.equals(localName) || MOVE_TO.equals(localName) ||
                MOVE_FROM.equals(localName)) {
            // NOTE(review): the handler is told when an edited section starts but
            // endEditedSection() is not invoked here -- confirm whether that is intended.
            editType = EditType.NONE;
        } else if (HYPERLINK.equals(localName)) {
            bodyContentsHandler.hyperlinkEnd();
        } else if (PICT.equals(localName)) {
            handlePict();
        } else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a chart
            inV = false;
            handleEndOfRun();
        } else if (RT.equals(localName)) {
            inRt = false;
        } else if (RUBY.equals(localName)) {
            handleEndOfRuby();
        } else if (INSTR_TEXT.equals(localName)) {
            inInstrText = false;
        }
    }

    /**
     * Finishes a ruby section: when phonetic runs are concatenated, the
     * buffered phonetic text is emitted as a run wrapped in " (" and ")".
     * The buffer is cleared in either case.
     *
     * @throws SAXException if the body contents handler throws
     */
    private void handleEndOfRuby() throws SAXException {
        if (rubyBuffer.length() == 0) {
            return;
        }
        if (concatenatePhoneticRuns) {
            final String phonetic = rubyBuffer.toString();
            bodyContentsHandler.run(currRunProperties, " (" + phonetic + ")");
        }
        rubyBuffer.setLength(0);
    }

    /**
     * Emits the buffered run text (possibly empty) with the current run
     * properties, closes a pending pptx hlinkClick hyperlink, and resets the
     * run state and formatting for the next run.
     *
     * @throws SAXException if the body contents handler throws
     */
    private void handleEndOfRun() throws SAXException {
        final String text = runBuffer.toString();
        bodyContentsHandler.run(currRunProperties, text);

        // a pptx click-hyperlink spans exactly one run
        if (inHlinkClick) {
            bodyContentsHandler.hyperlinkEnd();
            inHlinkClick = false;
        }

        // reset per-run state
        runBuffer.setLength(0);
        inR = false;

        // reset formatting so it does not leak into the next run
        currRunProperties.setBold(false);
        currRunProperties.setItalics(false);
        currRunProperties.setStrike(false);
        currRunProperties.setUnderline(UnderlinePatterns.NONE.name());
    }

    /**
     * Reports the picture collected so far (relationship target, if an id was
     * seen, plus description) and clears the picture state.
     *
     * @throws SAXException if the body contents handler throws
     */
    private void handlePict() throws SAXException {
        // no relationship id means no resolvable target
        final String target = (picRId == null) ? null : linkedRelationships.get(picRId);
        bodyContentsHandler.embeddedPicRef(target, picDescription);

        picRId = null;
        picDescription = null;
        inPic = false;
    }

    /**
     * Routes character data to the appropriate buffer according to the
     * current parsing state. Data inside a Choice branch or an excluded
     * textbox is dropped.
     *
     * @param ch     character array supplied by the parser
     * @param start  offset of the first character
     * @param length number of characters
     * @throws SAXException if the body contents handler throws
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        final boolean skipped = inACChoiceDepth > 0 || (!includeTextBox && inTextBox);
        if (skipped) {
            return;
        }

        if (inT) {
            // text in a moveFrom section is kept only if the handler asks for it
            if (editType != EditType.MOVE_FROM || bodyContentsHandler.isIncludeMoveFromText()) {
                appendToBuffer(ch, start, length);
            }
            return;
        }

        if (bodyContentsHandler.isIncludeDeletedText() && editType == EditType.DELETE) {
            appendToBuffer(ch, start, length);
            return;
        }

        if (inV) {
            // chart values are followed by a tab
            appendToBuffer(ch, start, length);
            appendToBuffer(TAB_CHAR, 0, 1);
            return;
        }

        if (inInstrText && inField) {
            // Accumulate instrText content for field code parsing (e.g., HYPERLINK)
            instrTextBuffer.append(ch, start, length);
        }
    }

    /**
     * Buffers ignorable whitespace when inside a text element, or inside a
     * delText element while deleted text is being included. Whitespace inside
     * a Choice branch or an excluded textbox is dropped.
     *
     * @param ch     character array supplied by the parser
     * @param start  offset of the first character
     * @param length number of characters
     * @throws SAXException if the body contents handler throws
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
        final boolean skipped = inACChoiceDepth > 0 || (!includeTextBox && inTextBox);
        if (skipped) {
            return;
        }

        // the handler is only consulted when not inside a text element
        if (inT || (bodyContentsHandler.isIncludeDeletedText() && inDelText)) {
            appendToBuffer(ch, start, length);
        }
    }

    /**
     * Appends the given characters to the ruby buffer when inside a phonetic
     * run (rt), otherwise to the run buffer.
     *
     * @param ch     source characters
     * @param start  offset of the first character
     * @param length number of characters
     */
    private void appendToBuffer(char[] ch, int start, int length) throws SAXException {
        final StringBuilder target = inRt ? rubyBuffer : runBuffer;
        target.append(ch, start, length);
    }

    /**
     * Kind of tracked-change section the handler is currently inside.
     */
    public enum EditType {
        /** Not inside any tracked-change section. */
        NONE,
        /** Inside an {@code ins} element. */
        INSERT,
        /** Inside a {@code del} element. */
        DELETE,
        /** Inside a {@code moveTo} element. */
        MOVE_TO,
        /** Inside a {@code moveFrom} element. */
        MOVE_FROM
    }

    /**
     * Callback interface that receives the content events produced while the
     * enclosing handler walks the XML.
     */
    public interface XWPFBodyContentsHandler {

        // ---- paragraphs and runs ----

        /** Signals the start of a paragraph with the properties read so far. */
        void startParagraph(ParagraphProperties paragraphProperties) throws SAXException;

        /** Signals the end of a paragraph. */
        void endParagraph() throws SAXException;

        /**
         * Delivers the text of a run.
         *
         * @param runProperties formatting in effect for the run
         * @param contents      the text; may be empty
         */
        void run(RunProperties runProperties, String contents) throws SAXException;

        // ---- hyperlinks ----

        /**
         * @param link the link; can be null
         */
        void hyperlinkStart(String link) throws SAXException;

        /** Signals the end of the most recently started hyperlink. */
        void hyperlinkEnd() throws SAXException;

        // ---- tables ----

        void startTable() throws SAXException;

        void endTable() throws SAXException;

        void startTableRow() throws SAXException;

        void endTableRow() throws SAXException;

        void startTableCell() throws SAXException;

        void endTableCell() throws SAXException;

        // ---- structured document tags ----

        void startSDT() throws SAXException;

        void endSDT() throws SAXException;

        // ---- tracked changes ----

        /**
         * Signals the start of a tracked-change section.
         *
         * @param editor   value of the author attribute; may be null
         * @param date     parsed date attribute; null when the attribute is absent
         * @param editType kind of tracked change
         */
        void startEditedSection(String editor, Date date, EditType editType) throws SAXException;

        void endEditedSection() throws SAXException;

        /** @return whether text found in deleted sections should be extracted */
        boolean isIncludeDeletedText() throws SAXException;

        /** @return whether text found in moveFrom sections should be extracted */
        boolean isIncludeMoveFromText() throws SAXException;

        // ---- notes ----

        void footnoteReference(String id) throws SAXException;

        void endnoteReference(String id) throws SAXException;

        // ---- embedded and linked objects ----

        void embeddedOLERef(String refId) throws SAXException;

        /**
         * Called when a linked (vs embedded) OLE object is found.
         * These reference external files and are a security concern.
         */
        void linkedOLERef(String refId) throws SAXException;

        /**
         * Called at the end of a picture.
         *
         * @param picFileName    relationship target of the picture; may be null
         * @param picDescription description of the picture; may be null
         */
        void embeddedPicRef(String picFileName, String picDescription) throws SAXException;

        // ---- bookmarks ----

        void startBookmark(String id, String name) throws SAXException;

        void endBookmark(String id) throws SAXException;

        // ---- external references ----

        /**
         * Called when an external reference URL is found in a field code.
         * This includes INCLUDEPICTURE, INCLUDETEXT, IMPORT, LINK fields,
         * and DrawingML/VML hyperlinks on shapes.
         *
         * @param fieldType the type of field (e.g., "INCLUDEPICTURE", "hlinkHover",
         *                  "vml-shape-href")
         * @param url the external URL
         */
        default void externalRef(String fieldType, String url) throws SAXException {
            // Default no-op implementation for backward compatibility
        }
    }

    /**
     * @return {@code true} if a slide root element with {@code show="0"} or
     *         {@code show="false"} has been seen by this handler
     */
    public boolean isHiddenSlide() {
        return this.hiddenSlide;
    }
}