FilenameUtils.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.io;
import java.util.HashSet;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.utils.StringUtils;
/**
 * Static helpers for turning untrusted file names and paths (typically taken
 * from the metadata of embedded documents) into names that are safer to use
 * when writing files.
 */
public class FilenameUtils {

    private static final MimeTypes MIME_TYPES = MimeTypes.getDefaultMimeTypes();

    /**
     * Matches a URL-style scheme prefix: 1-10 ASCII alphanumerics, a colon and
     * two or more slashes, e.g. <code>http://</code>.
     */
    private static final Pattern PROTOCOL_PATTERN = Pattern.compile("[A-Za-z0-9]{1,10}://+");

    /**
     * Characters that {@link #normalize(String)} percent-encodes: the control
     * characters 0x00-0x1F plus <code>? : * &lt; &gt; | " '</code>.
     */
    public final static char[] RESERVED_FILENAME_CHARACTERS =
            {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
                    0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A,
                    0x1B, 0x1C, 0x1D, 0x1E, 0x1F, '?', ':', '*', '<', '>', '|', '"', '\''};

    //lookup set built from RESERVED_FILENAME_CHARACTERS
    private final static HashSet<Character> RESERVED = new HashSet<>(38);

    static {
        for (char reservedFilenameCharacter : RESERVED_FILENAME_CHARACTERS) {
            RESERVED.add(reservedFilenameCharacter);
        }
    }

    //a period followed by 1 to 5 ascii alphanumerics (case insensitive)
    private final static Pattern ASCII_NUMERIC = Pattern.compile("\\A\\.(?i)[a-z0-9]{1,5}\\Z");

    /**
     * Scans the given file name for reserved characters on different OSs and
     * file systems and returns a sanitized version of the name with the
     * reserved chars replaced by their hexadecimal value.
     * <p>
     * For example <code>why?.zip</code> will be converted into <code>why%3F.zip</code>
     *
     * @param name the file name to be normalized - NOT NULL
     * @return the normalized file name
     * @throws IllegalArgumentException if name is null
     */
    public static String normalize(final String name) {
        if (name == null) {
            throw new IllegalArgumentException("name cannot be null");
        }

        StringBuilder sb = new StringBuilder(name.length());
        for (int i = 0; i < name.length(); i++) {
            char c = name.charAt(i);
            if (RESERVED.contains(c)) {
                sb.append('%');
                //always emit two hex digits
                if (c < 16) {
                    sb.append('0');
                }
                sb.append(Integer.toHexString(c).toUpperCase(Locale.ROOT));
            } else {
                sb.append(c);
            }
        }
        return sb.toString();
    }

    /**
     * This is a duplication of the algorithm and functionality
     * available in commons io FilenameUtils.  If Java's File were
     * able to handle Windows file paths correctly in linux,
     * we wouldn't need this.
     * <p>
     * The goal of this is to get a filename from a path.
     * The package parsers and some other embedded doc
     * extractors could put anything into TikaCoreProperties.RESOURCE_NAME_KEY.
     * <p>
     * If a careless client used that filename as if it were a
     * filename and not a path when writing embedded files,
     * bad things could happen.  Consider: "../../../my_ppt.ppt".
     * <p>
     * Consider using this in combination with {@link #normalize(String)}.
     *
     * @param path path to strip
     * @return empty string or a filename, never null
     */
    public static String getName(final String path) {
        if (path == null || path.isEmpty()) {
            return StringUtils.EMPTY;
        }
        int unix = path.lastIndexOf("/");
        int windows = path.lastIndexOf("\\");
        //some macintosh file names are stored with : as the delimiter
        //also necessary to properly handle C:somefilename
        int colon = path.lastIndexOf(":");
        int lastSeparator = Math.max(colon, Math.max(unix, windows));
        String cand = path.substring(lastSeparator + 1);
        //never hand back a relative directory reference as a file name
        if (cand.equals("..") || cand.equals(".")) {
            return StringUtils.EMPTY;
        }
        return cand;
    }

    /**
     * Returns the extension of the file name in the given path.
     * This includes the period, e.g. ".pdf".
     * This requires that an extension contain only ascii alphanumerics
     * and that the suffix, including the period, be 5 characters or fewer.
     *
     * @param path a file name or a path; the file name is extracted with {@link #getName(String)}
     * @return the suffix or an empty string if one could not be found
     */
    public static String getSuffixFromPath(String path) {
        String n = getName(path);
        int i = n.lastIndexOf(".");
        //arbitrarily sets max extension length
        if (i > -1 && n.length() - i < 6) {
            String suffix = n.substring(i);
            if (ASCII_NUMERIC.matcher(suffix).matches()) {
                return suffix;
            }
        }
        return StringUtils.EMPTY;
    }

    /**
     * Builds a sanitized file name (no directory components) for an embedded
     * document from its metadata.
     * <p>
     * The raw name is taken from the first non-blank of the resource name,
     * internal path, embedded relationship id, embedded resource path and
     * original resource name. Null characters, surrounding double quotes, any
     * file system prefix and all directory components are removed, and reserved
     * characters are percent-encoded via {@link #normalize(String)}. If the name
     * has no recognizable extension, one is chosen with
     * {@link #calculateExtension(Metadata, String)}.
     *
     * @param metadata         metadata of the embedded document
     * @param defaultExtension extension (including the period) to use when the name has none
     *                         and the metadata carries no content type; null is treated as empty
     * @param maxLength        if the name part (without extension) is longer than this, it is
     *                         truncated and "..." is inserted before the extension
     * @return the sanitized file name, or null if no usable name could be derived
     */
    public static String getSanitizedEmbeddedFileName(Metadata metadata,
                                                      String defaultExtension, int maxLength) {
        String path = getEmbeddedName(metadata);
        //path could be a full path or null
        if (StringUtils.isBlank(path)) {
            return null;
        }
        path = path.replaceAll("\u0000", " ");
        //the length check matters: a lone '"' both starts and ends with a quote
        if (path.length() > 1 && path.startsWith("\"") && path.endsWith("\"")) {
            path = path.substring(1, path.length() - 1);
        }
        int prefixLength = getPrefixLength(path);
        if (prefixLength > 0) {
            path = path.substring(prefixLength);
        }
        path = path.replaceAll("[:\\\\]+", "/");
        String fName = normalize(getName(path));
        String extension = getSuffixFromPath(fName);
        //the whole name is an extension (or both are empty): nothing usable
        if (extension.equals(fName)) {
            return null;
        }
        String namePart;
        if (StringUtils.isBlank(extension)) {
            namePart = fName;
            extension = nullToEmpty(calculateExtension(metadata, defaultExtension));
        } else {
            namePart = fName.substring(0, fName.length() - extension.length());
        }
        if (StringUtils.isBlank(namePart)) {
            return null;
        }
        //replace all initial . with a single _
        namePart = namePart.replaceAll("\\A\\.+", "_");
        //defense in depth. We shouldn't need this
        namePart = namePart.replaceAll("(\\.\\.)+", "_");
        namePart = namePart.replaceAll("[/\\\\]+", "_");
        namePart = namePart.replaceAll(":+", "_");
        namePart = namePart.trim();
        if (StringUtils.isBlank(namePart)) {
            return null;
        }
        //if the name part is > max length, truncate it
        if (namePart.length() > maxLength) {
            return truncate(namePart, extension, maxLength);
        }
        return namePart + extension;
    }

    /**
     * This tries to sanitize dangerous user generated embedded file paths.
     * If trusting these paths for writing files, users should run checks to make
     * sure that the generated file path does not zipslip out of the target directory.
     * <p>
     * Unlike {@link #getSanitizedEmbeddedFileName(Metadata, String, int)}, this keeps
     * the (sanitized) relative directory components in front of the file name.
     *
     * @param metadata         metadata of the embedded document
     * @param defaultExtension extension (including the period) to use when the file name has
     *                         none and the metadata carries no content type; null is treated
     *                         as empty
     * @param maxLength        if the full relative path is longer than this, only the file name
     *                         is returned; if the name part alone is still longer, it is
     *                         truncated and "..." is inserted before the extension
     * @return a relative path using "/" as the separator, or null if no usable path
     *         could be derived
     */
    public static String getSanitizedEmbeddedFilePath(Metadata metadata,
                                                      String defaultExtension, int maxLength) {
        String path = getEmbeddedPath(metadata);
        //path could be a full path or null
        if (StringUtils.isBlank(path)) {
            return null;
        }
        path = path.replaceAll("\u0000", " ");
        int prefixLength = getPrefixLength(path);
        if (prefixLength > 0) {
            path = path.substring(prefixLength);
        }
        path = path.replaceAll("\\\\", "/");
        path = removeProtocol(path);
        path = path.replaceAll(":+", "/");
        path = path.replaceAll("/+", "/");
        path = normalize(path);
        //collapse runs of periods, then drop a period directly before a separator
        path = path.replaceAll("\\.{2,}", ".");
        path = path.replaceAll("\\./", "/");
        if (path.isBlank()) {
            return null;
        }
        //strip leading and trailing separators so that the result is relative
        path = path.replaceAll("\\A/+", "");
        path = path.replaceAll("/+\\Z", "");
        String fName = getName(path);
        if (StringUtils.isBlank(fName)) {
            return null;
        }
        String relPath = "";
        if (path.length() > fName.length()) {
            //everything before the separator that precedes the file name
            relPath = path.substring(0, path.length() - fName.length() - 1);
        }
        String extension = getSuffixFromPath(fName);
        if (extension.equals(path)) {
            return extension;
        }
        String namePart;
        if (StringUtils.isBlank(extension)) {
            //only the file name; the directories are carried separately in relPath
            namePart = fName;
            extension = nullToEmpty(calculateExtension(metadata, defaultExtension));
        } else {
            namePart = fName.substring(0, fName.length() - extension.length());
        }
        if (StringUtils.isBlank(namePart)) {
            return null;
        }
        //replace all initial . with a single _
        namePart = namePart.replaceAll("\\A\\.+", "_");
        //defense in depth. We shouldn't need this
        namePart = namePart.replaceAll("\\.{2,}", ".");
        namePart = namePart.replaceAll("[/\\\\]+", "_");
        if (StringUtils.isBlank(namePart)) {
            return null;
        }
        String fileName = namePart + extension;
        String retPath = StringUtils.isBlank(relPath) ? fileName : relPath + "/" + fileName;
        //if path is > max length, return only the name part
        if (retPath.length() > maxLength) {
            if (namePart.length() > maxLength) {
                return truncate(namePart, extension, maxLength);
            }
            return fileName;
        }
        return retPath;
    }

    /**
     * Shortens the name part so that name part + "..." + extension is
     * maxLength characters where possible. If maxLength is too small to hold
     * even "..." and the extension, the name part is dropped entirely rather
     * than failing on a negative index.
     */
    private static String truncate(String namePart, String extension, int maxLength) {
        int keep = Math.max(0, maxLength - extension.length() - 3);
        return namePart.substring(0, keep) + "..." + extension;
    }

    //avoids a literal "null" suffix when the caller supplied a null default extension
    private static String nullToEmpty(String s) {
        return s == null ? "" : s;
    }

    /**
     * Returns the number of leading characters that form a file system prefix
     * (e.g. a drive letter or a leading separator) as reported by commons-io,
     * with two adjustments.
     * <p>
     * commons-io documents that the returned length may be greater than the
     * length of the input (for a tilde name without a separator); using that
     * value with substring would fail, so such input is treated
     * as having no prefix. A bare two character drive letter such as "C:" is
     * always reported as a prefix of length 2, in either case.
     *
     * @param path path to inspect, not null
     * @return the prefix length, never negative and never greater than the path length
     */
    private static int getPrefixLength(String path) {
        int prefixLength = org.apache.commons.io.FilenameUtils.getPrefixLength(path);
        if (prefixLength > path.length()) {
            //"~" or "~name" with no separator: keep it as a plain name
            return 0;
        }
        if (prefixLength > 0) {
            return prefixLength;
        }
        if (path.length() == 2 && path.charAt(1) == ':') {
            char drive = path.charAt(0);
            if ((drive >= 'A' && drive <= 'Z') || (drive >= 'a' && drive <= 'z')) {
                return 2;
            }
        }
        return 0;
    }

    /**
     * Removes everything up to and including the last URL-style scheme
     * (see PROTOCOL_PATTERN) found in the path.
     *
     * @param path path to strip
     * @return the remainder after the last scheme, or the path unchanged if there is none
     */
    private static String removeProtocol(String path) {
        Matcher m = PROTOCOL_PATTERN.matcher(path);
        int last = -1;
        while (m.find()) {
            last = m.end();
        }
        if (last > -1) {
            return path.substring(last);
        }
        return path;
    }

    //this tries for the embedded resource path first, and then backs off to the other keys
    //may return null
    private static String getEmbeddedPath(Metadata metadata) {
        String path = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
        if (! StringUtils.isBlank(path)) {
            return path;
        }
        path = metadata.get(TikaCoreProperties.INTERNAL_PATH);
        if (! StringUtils.isBlank(path)) {
            return path;
        }
        path = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
        if (! StringUtils.isBlank(path)) {
            return path;
        }
        path = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
        if (! StringUtils.isBlank(path)) {
            return path;
        }
        return metadata.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME);
    }

    //this tries for resource name first, and then backs off to path
    //may return null
    private static String getEmbeddedName(Metadata metadata) {
        String path = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
        if (! StringUtils.isBlank(path)) {
            return path;
        }
        path = metadata.get(TikaCoreProperties.INTERNAL_PATH);
        if (! StringUtils.isBlank(path)) {
            return path;
        }
        path = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
        if (! StringUtils.isBlank(path)) {
            return path;
        }
        path = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
        if (! StringUtils.isBlank(path)) {
            return path;
        }
        return metadata.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME);
    }

    /**
     * Calculate the extension based on the {@link Metadata#CONTENT_TYPE} value.
     * If the metadata has no content type, the default value is returned.
     * If there is a content type but no extension can be found for it
     * (including when the type cannot be parsed), ".bin" is returned.
     *
     * @param metadata     metadata to read the content type from
     * @param defaultValue value to return when there is no content type; may be null
     * @return the extension based on the mime type, including the initial "."
     */
    public static String calculateExtension(Metadata metadata, String defaultValue) {
        String mime = metadata.get(Metadata.CONTENT_TYPE);
        if (mime == null) {
            return defaultValue;
        }
        // Normalize OCR routing types (e.g., image/ocr-png -> image/png)
        mime = EmbeddedDocumentUtil.normalizeMediaType(mime);
        String ext = lookupExtension(mime);
        return ext != null ? ext : ".bin";
    }

    /**
     * Looks up the extension registered for the given mime type.
     *
     * @param mime mime type string
     * @return the extension, or null if the type cannot be parsed or has no (non-blank) extension
     */
    private static String lookupExtension(String mime) {
        try {
            String ext = MIME_TYPES
                    .forName(mime)
                    .getExtension();
            if (!StringUtils.isBlank(ext)) {
                return ext;
            }
        } catch (MimeTypeException ignored) {
            //deliberately swallowed: an unparseable type simply has no known extension
        }
        return null;
    }
}