FieldHyperlinkTracker.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.ooxml;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Tracks field hyperlink state across multiple runs within a paragraph.
* Field codes span multiple runs: begin -> instrText -> separate -> text runs -> end
* <p>
* This class handles HYPERLINK field codes as well as other external references
* like INCLUDEPICTURE, INCLUDETEXT, IMPORT, and LINK.
*/
class FieldHyperlinkTracker {

    /*
     * Notes that apply to every field-code pattern in this class:
     *
     * - Each keyword is preceded by \b so that it only matches as a whole token.
     *   Without the boundary, "LINK" also matches the tail of "HYPERLINK", and an
     *   instruction such as: HYPERLINK www.example.com "tip" would be reported as
     *   an external LINK reference whose URL is "tip".
     * - All quantifiers are bounded, which keeps the amount of text a single match
     *   attempt can consume bounded as well.
     * - Matching is case-insensitive and capture group 1 is the text between the
     *   first pair of double quotes that follows the keyword.
     */

    /** Matches {@code HYPERLINK "target"}. */
    private static final Pattern HYPERLINK_PATTERN =
            Pattern.compile("\\bHYPERLINK\\s{1,100}\"([^\"]{1,10000})\"",
                    Pattern.CASE_INSENSITIVE);

    /**
     * The non-HYPERLINK field types that are recognised as external references.
     * The constant name is the value reported by {@link #getLastExternalRefType()},
     * and the declaration order is the order in which the patterns are tried.
     */
    private enum ExternalRefField {
        INCLUDEPICTURE("\\bINCLUDEPICTURE\\s{1,100}\"([^\"]{1,10000})\""),
        INCLUDETEXT("\\bINCLUDETEXT\\s{1,100}\"([^\"]{1,10000})\""),
        IMPORT("\\bIMPORT\\s{1,100}\"([^\"]{1,10000})\""),
        // LINK carries one extra token made of word characters and dots between the
        // keyword and the quoted target (presumably the application/class name,
        // e.g. Excel.Sheet.12 -- confirm against the field-code specification).
        LINK("\\bLINK\\s{1,100}[\\w.]{1,50}\\s{1,100}\"([^\"]{1,10000})\"");

        private final Pattern pattern;

        ExternalRefField(String regex) {
            this.pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
        }
    }

    /** True between {@link #startField()} and {@link #endField()}. */
    private boolean inField = false;

    /** True once {@link #separate()} has recognised the current field as a HYPERLINK. */
    private boolean inFieldHyperlink = false;

    /** Accumulates the instrText fragments of the current field. */
    private final StringBuilder instrTextBuffer = new StringBuilder();

    /** Type name of the external reference found by the last {@link #separate()}, or null. */
    private String lastExternalRefType = null;

    /** Target of the external reference found by the last {@link #separate()}, or null. */
    private String lastExternalRefUrl = null;

    /**
     * Marks the beginning of a field: starts accumulating instruction text from
     * scratch and forgets any previously recorded external reference.
     */
    void startField() {
        inField = true;
        instrTextBuffer.setLength(0);
        clearExternalRef();
    }

    /**
     * Appends a fragment of field instruction text. Fragments received while no
     * field is open, and null fragments, are ignored.
     *
     * @param text the instrText fragment, may be null
     */
    void addInstrText(String text) {
        if (inField && text != null) {
            instrTextBuffer.append(text);
        }
    }

    /**
     * Called when fldChar separate is encountered. Interprets the instruction text
     * accumulated so far.
     * <p>
     * If it is a HYPERLINK field, the tracker switches to the "in field hyperlink"
     * state and the URL is returned. Otherwise the instruction is checked for
     * INCLUDEPICTURE, INCLUDETEXT, IMPORT and LINK (in that order); the first match
     * is made available through {@link #getLastExternalRefType()} and
     * {@link #getLastExternalRefUrl()}.
     *
     * @return the hyperlink URL if this is a HYPERLINK field, null otherwise
     */
    String separate() {
        if (!inField) {
            return null;
        }
        String instrText = instrTextBuffer.toString();

        String hyperlinkUrl = firstQuotedTarget(HYPERLINK_PATTERN, instrText);
        if (hyperlinkUrl != null) {
            inFieldHyperlink = true;
            return hyperlinkUrl;
        }

        // Check for other external refs (INCLUDEPICTURE, INCLUDETEXT, IMPORT, LINK)
        for (ExternalRefField field : ExternalRefField.values()) {
            String target = firstQuotedTarget(field.pattern, instrText);
            if (target != null) {
                lastExternalRefType = field.name();
                lastExternalRefUrl = target;
                break;
            }
        }
        return null;
    }

    /**
     * Marks the end of a field and resets all per-field state, including the
     * hyperlink flag and any recorded external reference.
     */
    void endField() {
        inField = false;
        inFieldHyperlink = false;
        instrTextBuffer.setLength(0);
        clearExternalRef();
    }

    /**
     * @return true if the current field was recognised as a HYPERLINK by
     * {@link #separate()} and has not ended yet
     */
    boolean isInFieldHyperlink() {
        return inFieldHyperlink;
    }

    /**
     * @return "INCLUDEPICTURE", "INCLUDETEXT", "IMPORT" or "LINK" if the last
     * {@link #separate()} found such a reference, null otherwise
     */
    String getLastExternalRefType() {
        return lastExternalRefType;
    }

    /**
     * @return the quoted target of the external reference found by the last
     * {@link #separate()}, or null if there is none
     */
    String getLastExternalRefUrl() {
        return lastExternalRefUrl;
    }

    /** Forgets the recorded external reference without touching any other state. */
    void clearExternalRef() {
        lastExternalRefType = null;
        lastExternalRefUrl = null;
    }

    /**
     * Searches the instruction text for the given field pattern.
     *
     * @param pattern   one of the field-code patterns of this class
     * @param instrText the accumulated instrText content, never null
     * @return capture group 1 of the first match, or null if there is no match
     */
    private static String firstQuotedTarget(Pattern pattern, String instrText) {
        // find() scans the whole string, so surrounding whitespace needs no trimming.
        Matcher matcher = pattern.matcher(instrText);
        return matcher.find() ? matcher.group(1) : null;
    }
}