FileSystemFetcher.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.pipes.fetcher.fs;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileTime;
import java.util.Date;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.FileSystem;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.fetcher.Fetcher;
import org.apache.tika.plugins.AbstractTikaExtension;
import org.apache.tika.plugins.ExtensionConfig;
import org.apache.tika.utils.StringUtils;
/**
 * Fetches files from a local/mounted file system.
 * <p>
 * Config:
 * <pre>{@code
 * "file-system-fetcher": {
 *   "basePath": "BASE_PATH",
 *   "extractFileSystemMetadata": false
 * }
 * }
 * </pre>
 * <p>
 * When {@code basePath} is set, every fetch key is resolved against it and must stay
 * inside it. When {@code basePath} is blank, the configuration is only accepted if
 * {@code allowAbsolutePaths} is true; fetch keys are then used as paths as-is.
 */
public class FileSystemFetcher extends AbstractTikaExtension implements Fetcher {

    private static final Logger LOG = LoggerFactory.getLogger(FileSystemFetcher.class);

    /** Configuration loaded from the plugin JSON; assigned once in the constructor. */
    private final FileSystemFetcherConfig defaultFileSystemFetcherConfig;

    /**
     * Loads the fetcher configuration from the plugin's JSON and validates it.
     *
     * @param pluginConfig the extension configuration carrying the JSON for this fetcher
     * @throws TikaConfigException if the configuration is rejected by {@code checkConfig}
     */
    public FileSystemFetcher(ExtensionConfig pluginConfig) throws TikaConfigException {
        super(pluginConfig);
        defaultFileSystemFetcherConfig = FileSystemFetcherConfig.load(pluginConfig.json());
        checkConfig(defaultFileSystemFetcherConfig);
    }

    /**
     * Opens the file identified by {@code fetchKey}.
     * <p>
     * The fetch key is recorded in {@code metadata} as
     * {@link TikaCoreProperties#SOURCE_PATH}. If file system metadata extraction is
     * enabled, created/modified/accessed times are added as well.
     *
     * @param fetchKey path of the file; relative to the configured base path if one is set
     * @param metadata metadata object to populate
     * @param parseContext not used by this fetcher
     * @return a stream over the requested file
     * @throws IllegalArgumentException if {@code fetchKey} contains a NUL character
     * @throws SecurityException if a base path is configured and {@code fetchKey}
     *         resolves to a location outside of it
     * @throws FileNotFoundException if the resolved path is not a regular file
     * @throws IOException if the base path is not a directory or the file system
     *         cannot be read
     */
    @Override
    public TikaInputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext)
            throws IOException, TikaException {
        if (fetchKey.contains("\u0000")) {
            throw new IllegalArgumentException("Path must not contain 'u0000'. "
                    + "Please review the life decisions that led you to requesting "
                    + "a file name with this character in it.");
        }
        FileSystemFetcherConfig config = defaultFileSystemFetcherConfig;
        Path p = resolvePath(fetchKey, config.getBasePath());

        metadata.set(TikaCoreProperties.SOURCE_PATH, fetchKey);
        // Guarded so the absolute path is only computed when trace logging is on.
        if (LOG.isTraceEnabled()) {
            LOG.trace("about to read from {} with base={}", p.toAbsolutePath(),
                    config.getBasePath());
        }
        if (!Files.isRegularFile(p)) {
            throw new FileNotFoundException(p.toAbsolutePath().toString());
        }
        updateFileSystemMetadata(p, metadata, config);
        return TikaInputStream.get(p, metadata);
    }

    /**
     * Turns a fetch key into a path, enforcing containment in the base path if one is set.
     *
     * @param fetchKey the requested key
     * @param configuredBasePath the configured base path; may be null or blank
     * @return the path to read; with a base path this is {@code basePath.resolve(fetchKey)}
     * @throws IOException if the base path is not a directory or a real path cannot
     *         be determined
     * @throws SecurityException if the key escapes the base path
     */
    private static Path resolvePath(String fetchKey, String configuredBasePath)
            throws IOException {
        if (StringUtils.isBlank(configuredBasePath)) {
            // No basePath - the fetch key is used as the path itself.
            return Paths.get(fetchKey);
        }
        Path basePath = Paths.get(configuredBasePath);
        if (!Files.isDirectory(basePath)) {
            throw new IOException("BasePath is not a directory: " + basePath);
        }
        Path p = basePath.resolve(fetchKey);

        // Lexical check first: it needs no file system access beyond the working
        // directory, so it also works for files that do not exist. Both sides are
        // made absolute before normalizing; a relative base such as "." normalizes
        // to the empty path, and Path.startsWith never matches an empty path, which
        // would otherwise reject every key.
        Path normalizedBase = basePath.toAbsolutePath().normalize();
        if (!p.toAbsolutePath().normalize().startsWith(normalizedBase)) {
            throw new SecurityException(
                    "fetchKey must resolve to be a descendant of the 'basePath'");
        }
        // Symlink check: only possible when the target exists, because
        // toRealPath() fails for missing files.
        if (Files.exists(p) && !p.toRealPath().startsWith(basePath.toRealPath())) {
            throw new SecurityException(
                    "fetchKey must resolve to be a descendant of the 'basePath'");
        }
        return p;
    }

    /**
     * Adds created/modified/accessed times of {@code p} to {@code metadata}, if the
     * configuration asks for file system metadata; otherwise does nothing.
     *
     * @throws IOException if the file attributes cannot be read
     */
    private void updateFileSystemMetadata(Path p, Metadata metadata,
                                          FileSystemFetcherConfig config) throws IOException {
        if (!config.isExtractFileSystemMetadata()) {
            return;
        }
        BasicFileAttributes attrs = Files.readAttributes(p, BasicFileAttributes.class);
        updateFileTime(FileSystem.CREATED, attrs.creationTime(), metadata);
        updateFileTime(FileSystem.MODIFIED, attrs.lastModifiedTime(), metadata);
        updateFileTime(FileSystem.ACCESSED, attrs.lastAccessTime(), metadata);
        //TODO extract owner or group?
    }

    /** Stores {@code fileTime} under {@code property}; a null time is skipped. */
    private void updateFileTime(Property property, FileTime fileTime, Metadata metadata) {
        if (fileTime == null) {
            return;
        }
        metadata.set(property, new Date(fileTime.toMillis()));
    }

    /**
     * Validates the fetcher configuration.
     * <p>
     * A blank base path is only accepted when absolute paths are explicitly allowed.
     * A non-blank base path must not start with a remote scheme prefix and must not
     * contain a NUL character.
     *
     * @throws TikaConfigException if any of the above rules is violated
     */
    private void checkConfig(FileSystemFetcherConfig fetcherConfig)
            throws TikaConfigException {
        String basePath = fetcherConfig.getBasePath();
        if (basePath == null || basePath.isBlank()) {
            if (!fetcherConfig.isAllowAbsolutePaths()) {
                throw new TikaConfigException(
                        "'basePath' must be set, or 'allowAbsolutePaths' must be true. "
                                + "Without basePath, clients can read any file this process "
                                + "has access to. Set 'allowAbsolutePaths: true' to explicitly "
                                + "allow this behavior and accept the security risks.");
            }
            return;
        }
        // https is as non-local as http, so both are rejected with the same hint.
        if (basePath.startsWith("http://") || basePath.startsWith("https://")) {
            throw new TikaConfigException(
                    "FileSystemFetcher only works with local file systems. "
                            + "Please use the tika-fetcher-http module for http calls");
        } else if (basePath.startsWith("ftp://")) {
            throw new TikaConfigException(
                    "FileSystemFetcher only works with local file systems. "
                            + "Please consider contributing an ftp fetcher module");
        } else if (basePath.startsWith("s3://")) {
            throw new TikaConfigException(
                    "FileSystemFetcher only works with local file systems. "
                            + "Please use the tika-fetcher-s3 module");
        }
        if (basePath.contains("\u0000")) {
            // The backslash in the message is escaped so the text shows the escape
            // sequence instead of embedding a raw NUL character in the exception.
            throw new TikaConfigException(
                    "base path must not contain \\u0000. Seriously, what were you thinking?");
        }
    }

    @Override
    public String toString() {
        return "FileSystemFetcher{" + "defaultFileSystemFetcherConfig="
                + defaultFileSystemFetcherConfig + ", pluginConfig=" + pluginConfig + '}';
    }
}