// FieldCodeParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.ooxml;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Parses OOXML field codes (instrText) to extract URLs from HYPERLINK,
 * INCLUDEPICTURE, INCLUDETEXT, IMPORT, and LINK fields.
 * <p>
 * Only the first double-quoted argument following the field keyword is
 * extracted. Keywords are matched case-insensitively and must start on a
 * word boundary, so that, for example, the {@code LINK} pattern does not
 * match the tail of {@code HYPERLINK}.
 * <p>
 * All methods are static and the class keeps no mutable state.
 * <p>
 * This class has no Tika dependencies and could be contributed to POI.
 */
public final class FieldCodeParser {

    /** Whitespace run separating field-code tokens (bounded length). */
    private static final String GAP = "\\s{1,100}";

    /**
     * A double-quoted argument; capture group 1 is the text between the
     * quotes. The length is bounded, presumably to cap matching work on
     * malformed input.
     */
    private static final String QUOTED_TARGET = "\"([^\"]{1,10000})\"";

    private static final Pattern HYPERLINK_PATTERN = fieldPattern("HYPERLINK", "");
    private static final Pattern INCLUDEPICTURE_PATTERN = fieldPattern("INCLUDEPICTURE", "");
    private static final Pattern INCLUDETEXT_PATTERN = fieldPattern("INCLUDETEXT", "");
    private static final Pattern IMPORT_PATTERN = fieldPattern("IMPORT", "");
    // LINK carries one extra unquoted token (letters, digits, '_' and '.')
    // between the keyword and the quoted target, e.g. "Excel.Sheet.12".
    private static final Pattern LINK_PATTERN =
            fieldPattern("LINK", "[\\w.]{1,50}" + GAP);

    /**
     * Field names reported through the {@code fieldType} output parameter,
     * in the order in which they are tried. Index-aligned with
     * {@link #EXTERNAL_REF_PATTERNS}.
     */
    private static final String[] EXTERNAL_REF_NAMES = {
        "INCLUDEPICTURE", "INCLUDETEXT", "IMPORT", "LINK"
    };

    /** Patterns for the external-reference fields, index-aligned with the names. */
    private static final Pattern[] EXTERNAL_REF_PATTERNS = {
        INCLUDEPICTURE_PATTERN, INCLUDETEXT_PATTERN, IMPORT_PATTERN, LINK_PATTERN
    };

    private FieldCodeParser() {
    }

    /**
     * Parses a HYPERLINK URL from instrText field code content.
     * Field codes like: {@code HYPERLINK "https://example.com"}
     * <p>
     * The quoted argument must directly follow the keyword (separated only
     * by whitespace); a field whose first argument is a switch, such as
     * {@code HYPERLINK \l "bookmark"}, yields {@code null}.
     *
     * @param instrText the accumulated instrText content; may be null
     * @return the URL if found, or null
     */
    public static String parseHyperlinkFromInstrText(String instrText) {
        if (instrText == null || instrText.isEmpty()) {
            return null;
        }
        return firstQuotedTarget(HYPERLINK_PATTERN, instrText.trim());
    }

    /**
     * Parses URLs from instrText field codes that reference external resources.
     * This includes INCLUDEPICTURE, INCLUDETEXT, IMPORT, and LINK fields,
     * tried in that order; the first field type that matches wins.
     *
     * @param instrText the accumulated instrText content; may be null
     * @param fieldType output parameter - on a match, the upper-case field
     *                  name is appended to it (existing content is not
     *                  cleared, so pass an empty builder); may be null if
     *                  the caller does not need the field type
     * @return the URL if found, or null
     */
    public static String parseExternalRefFromInstrText(String instrText,
            StringBuilder fieldType) {
        if (instrText == null || instrText.isEmpty()) {
            return null;
        }
        String trimmed = instrText.trim();

        for (int i = 0; i < EXTERNAL_REF_PATTERNS.length; i++) {
            String target = firstQuotedTarget(EXTERNAL_REF_PATTERNS[i], trimmed);
            if (target != null) {
                // The output parameter is optional: do not fail a successful
                // parse just because the caller passed no builder.
                if (fieldType != null) {
                    fieldType.append(EXTERNAL_REF_NAMES[i]);
                }
                return target;
            }
        }
        return null;
    }

    /**
     * Builds the case-insensitive pattern for one field keyword.
     *
     * @param keyword the field keyword, e.g. {@code HYPERLINK}
     * @param beforeTarget regex fragment expected between the whitespace
     *                     after the keyword and the quoted target; may be empty
     * @return the compiled pattern, whose group 1 is the quoted target
     */
    private static Pattern fieldPattern(String keyword, String beforeTarget) {
        // The leading \b stops a keyword from matching as the suffix of a
        // longer word (LINK inside HYPERLINK, IMPORT inside REIMPORT, ...).
        return Pattern.compile("\\b" + keyword + GAP + beforeTarget + QUOTED_TARGET,
                Pattern.CASE_INSENSITIVE);
    }

    /**
     * Returns capture group 1 of the first occurrence of {@code pattern} in
     * {@code text}, or null when the pattern does not occur.
     */
    private static String firstQuotedTarget(Pattern pattern, String text) {
        Matcher m = pattern.matcher(text);
        return m.find() ? m.group(1) : null;
    }
}