// RDFLoader.java

/*******************************************************************************
 * Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Distribution License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *******************************************************************************/
package org.eclipse.rdf4j.repository.util;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.ProtocolException;
import java.net.URL;
import java.net.URLConnection;
import java.security.PrivilegedAction;
import java.util.List;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.commons.lang3.StringUtils;
import org.eclipse.rdf4j.common.io.GZipUtil;
import org.eclipse.rdf4j.common.io.UncloseableInputStream;
import org.eclipse.rdf4j.common.io.ZipUtil;
import org.eclipse.rdf4j.model.ValueFactory;
import org.eclipse.rdf4j.rio.ParserConfig;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandler;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.rio.RDFParser;
import org.eclipse.rdf4j.rio.RDFParserRegistry;
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.UnsupportedRDFormatException;
import org.eclipse.rdf4j.rio.helpers.ParseErrorLogger;

/**
 * Handles common I/O to retrieve and parse RDF.
 * <p>
 * RDF data can be read from a {@link File}, a {@link URL}, an {@link InputStream} or a {@link Reader}. Byte streams
 * are checked for ZIP and GZIP content and are unpacked before being handed to an {@link RDFParser}. All parsed
 * statements are reported to the {@link RDFHandler} supplied by the caller.
 *
 * @author James Leigh
 */
public class RDFLoader {

	/**
	 * Number of redirects that are followed when the system property <var>http.maxRedirects</var> is not set or does
	 * not hold a valid integer.
	 */
	private static final int DEFAULT_MAX_REDIRECTS = 20;

	private final ParserConfig config;

	private final ValueFactory vf;

	/**
	 * Creates a new loader.
	 *
	 * @param config The parser configuration that is applied to every {@link RDFParser} created by this loader.
	 * @param vf     The value factory that is passed to every {@link RDFParser} created by this loader.
	 */
	public RDFLoader(ParserConfig config, ValueFactory vf) {
		this.config = config;
		this.vf = vf;
	}

	/**
	 * Parses RDF data from the specified file to the given RDFHandler.
	 *
	 * @param file       A file containing RDF data.
	 * @param baseURI    The base URI to resolve any relative URIs that are in the data against. This defaults to the
	 *                   value of {@link java.io.File#toURI() file.toURI()} if the value is set to <var>null</var>.
	 * @param dataFormat The serialization format of the data. If set to <var>null</var>, the format is determined
	 *                   from the name of the file.
	 * @param rdfHandler Receives RDF parser events.
	 * @throws IOException                  If an I/O error occurred while reading from the file.
	 * @throws UnsupportedRDFormatException If no parser is available for the specified RDF format, or the RDF format
	 *                                      could not be determined from the file name.
	 * @throws RDFParseException            If an error was found while parsing the RDF data.
	 * @throws RDFHandlerException          If thrown by the RDFHandler
	 */
	public void load(File file, String baseURI, RDFFormat dataFormat, RDFHandler rdfHandler)
			throws IOException, RDFParseException, RDFHandlerException {
		if (baseURI == null) {
			// default baseURI to file
			baseURI = file.toURI().toString();
		}
		if (dataFormat == null) {
			dataFormat = Rio.getParserFormatForFileName(file.getName())
					.orElseThrow(() -> new UnsupportedRDFormatException(
							"Could not find RDF format for file: " + file.getName()));
		}

		try (InputStream in = new FileInputStream(file)) {
			load(in, baseURI, dataFormat, rdfHandler);
		}
	}

	/**
	 * Parses the RDF data that can be found at the specified URL to the RDFHandler. This method uses the class
	 * {@link URL} to resolve the provided <var>url</var>. This method honors
	 * {@link HttpURLConnection#getFollowRedirects()} to determine if redirects are followed and if set to
	 * <var>true</var> will also follow redirects from HTTP to HTTPS. The maximum number of redirects can be controlled
	 * using system property <var>http.maxRedirects</var>; if that property is not set or is not a valid integer, a
	 * default of 20 is used.
	 *
	 * @param url        The URL of the RDF data.
	 * @param baseURI    The base URI to resolve any relative URIs that are in the data against. This defaults to the
	 *                   value of {@link java.net.URL#toExternalForm() url.toExternalForm()} if the value is set to
	 *                   <var>null</var>.
	 * @param dataFormat The serialization format of the data. If set to <var>null</var>, the format will be
	 *                   automatically determined by examining the content type in the HTTP response header, and failing
	 *                   that, the file name extension of the supplied URL.
	 * @param rdfHandler Receives RDF parser events.
	 * @throws IOException                  If an I/O error occurred while reading from the URL.
	 * @throws UnsupportedRDFormatException If no parser is available for the specified RDF format, or the RDF format
	 *                                      could not be automatically determined.
	 * @throws RDFParseException            If an error was found while parsing the RDF data.
	 * @throws RDFHandlerException          If thrown by the RDFHandler
	 */
	public void load(URL url, String baseURI, RDFFormat dataFormat, RDFHandler rdfHandler)
			throws IOException, RDFParseException, RDFHandlerException {
		if (baseURI == null) {
			baseURI = url.toExternalForm();
		}

		boolean followRedirects = HttpURLConnection.getFollowRedirects();
		// Integer.getInteger falls back to the default when the property is missing or malformed,
		// instead of failing with a NumberFormatException.
		int maxRedirects = java.security.AccessController.doPrivileged(
				(PrivilegedAction<Integer>) () -> Integer.getInteger("http.maxRedirects", DEFAULT_MAX_REDIRECTS));

		int redirects = 0;
		boolean redirected;

		URL requestURL = url;
		do {
			redirected = false;

			URLConnection con = requestURL.openConnection();
			addAcceptHeaders(con, dataFormat);

			// Non-null only if this is an HTTP connection and redirects are to be followed
			HttpURLConnection httpCon = null;
			if (followRedirects && con instanceof HttpURLConnection) {
				httpCon = (HttpURLConnection) con;
				// Because of #2828, follow redirects manually
				httpCon.setInstanceFollowRedirects(false);
			}

			try (InputStream in = con.getInputStream()) {
				if (httpCon != null && isRedirection(httpCon.getResponseCode())) {
					/* Nullable */
					String redirectionLocation = httpCon.getHeaderField("Location");
					if (StringUtils.isBlank(redirectionLocation)) {
						throw new IOException("Could not find redirection location for URL: " + url);
					}

					// The location may be relative, so resolve it against the URL that was just requested
					requestURL = new URL(requestURL, redirectionLocation);

					redirected = true;
					if (++redirects >= maxRedirects) {
						throw new ProtocolException("Server redirected too many times (" + redirects + ")");
					}
					// fall through to the loop condition: request the URL associated with the redirection
				} else {
					if (dataFormat == null) {
						dataFormat = determineFormat(con, url);
					}

					load(in, baseURI, dataFormat, rdfHandler);
				}
			}
		} while (redirected);
	}

	/**
	 * Adds the <var>Accept</var> request headers to a connection: the MIME types of the given format, or, if no format
	 * is given, the accept parameters for all formats known to the {@link RDFParserRegistry}.
	 *
	 * @param con        The connection to add the headers to; must not have been connected yet.
	 * @param dataFormat The requested format, or <var>null</var> to accept any format a parser is registered for.
	 */
	private static void addAcceptHeaders(URLConnection con, RDFFormat dataFormat) {
		if (dataFormat != null) {
			for (String mimeType : dataFormat.getMIMETypes()) {
				con.addRequestProperty("Accept", mimeType);
			}
		} else {
			Set<RDFFormat> rdfFormats = RDFParserRegistry.getInstance().getKeys();
			List<String> acceptParams = RDFFormat.getAcceptParams(rdfFormats, true, null);
			for (String acceptParam : acceptParams) {
				con.addRequestProperty("Accept", acceptParam);
			}
		}
	}

	/**
	 * Determines the RDF format of the data behind a connection, first from the content type reported by the
	 * connection and, failing that, from the path of the originally supplied URL.
	 *
	 * @param con The connection the data is read from.
	 * @param url The URL originally supplied by the caller; its path is used for the file name based lookup.
	 * @return the detected format, never <var>null</var>.
	 * @throws UnsupportedRDFormatException If neither the content type nor the URL path identifies a format.
	 */
	private static RDFFormat determineFormat(URLConnection con, URL url) {
		// getContentType() returns null when the content type is not known (e.g. no Content-Type header)
		String mimeType = con.getContentType();
		if (mimeType != null) {
			// strip parameters such as "; charset=UTF-8"
			int semiColonIdx = mimeType.indexOf(';');
			if (semiColonIdx >= 0) {
				mimeType = mimeType.substring(0, semiColonIdx);
			}

			RDFFormat byMimeType = Rio.getParserFormatForMIMEType(mimeType.trim()).orElse(null);
			if (byMimeType != null) {
				return byMimeType;
			}
		}

		return Rio.getParserFormatForFileName(url.getPath())
				.orElseThrow(() -> new UnsupportedRDFormatException(
						"Could not find RDF format for URL: " + url.getPath()));
	}

	/**
	 * Returns whether a given HTTP status code represents a redirection (i.e. 3xx)
	 *
	 * @param statusCode The HTTP status code to check.
	 * @return <var>true</var> if the status code is in the range 300-399, <var>false</var> otherwise.
	 */
	private boolean isRedirection(int statusCode) {
		return statusCode / 100 == 3;
	}

	/**
	 * Parses RDF data from an InputStream to the RDFHandler. If the stream is detected to contain ZIP or GZIP data, it
	 * is unpacked first; for ZIP data every non-directory entry is parsed.
	 *
	 * @param in         An InputStream from which RDF data can be read.
	 * @param baseURI    The base URI to resolve any relative URIs that are in the data against.
	 * @param dataFormat The serialization format of the data.
	 * @param rdfHandler Receives RDF parser events.
	 * @throws IOException                  If an I/O error occurred while reading from the input stream.
	 * @throws UnsupportedRDFormatException If no parser is available for the specified RDF format.
	 * @throws RDFParseException            If an error was found while parsing the RDF data.
	 * @throws RDFHandlerException          If thrown by the RDFHandler
	 */
	public void load(InputStream in, String baseURI, RDFFormat dataFormat, RDFHandler rdfHandler)
			throws IOException, RDFParseException, RDFHandlerException {
		// The ZIP/GZIP checks below are given a stream that supports mark/reset
		if (!in.markSupported()) {
			in = new BufferedInputStream(in, 1024);
		}

		if (ZipUtil.isZipStream(in)) {
			loadZip(in, baseURI, dataFormat, rdfHandler);
		} else if (GZipUtil.isGZipStream(in)) {
			// Recurse so that the decompressed content is inspected again
			load(new GZIPInputStream(in), baseURI, dataFormat, rdfHandler);
		} else {
			loadInputStreamOrReader(in, baseURI, dataFormat, rdfHandler);
		}
	}

	/**
	 * Parses RDF data from a Reader to the RDFHandler. <b>Note: using a Reader to upload byte-based data means that you
	 * have to be careful not to destroy the data's character encoding by enforcing a default character encoding upon
	 * the bytes. If possible, adding such data using an InputStream is to be preferred.</b>
	 *
	 * @param reader     A Reader from which RDF data can be read.
	 * @param baseURI    The base URI to resolve any relative URIs that are in the data against.
	 * @param dataFormat The serialization format of the data.
	 * @param rdfHandler Receives RDF parser events.
	 * @throws IOException                  If an I/O error occurred while reading from the reader.
	 * @throws UnsupportedRDFormatException If no parser is available for the specified RDF format.
	 * @throws RDFParseException            If an error was found while parsing the RDF data.
	 * @throws RDFHandlerException          If thrown by the RDFHandler
	 */
	public void load(Reader reader, String baseURI, RDFFormat dataFormat, RDFHandler rdfHandler)
			throws IOException, RDFParseException, RDFHandlerException {
		loadInputStreamOrReader(reader, baseURI, dataFormat, rdfHandler);
	}

	/**
	 * Parses every non-directory entry of a ZIP stream to the RDFHandler. The format of each entry is derived from the
	 * entry name; if that fails, the supplied <var>dataFormat</var> is used. The supplied stream is closed when this
	 * method returns.
	 *
	 * @param in         An InputStream containing ZIP data.
	 * @param baseURI    The base URI to resolve any relative URIs that are in the data against.
	 * @param dataFormat The fallback serialization format for entries whose name does not identify a format.
	 * @param rdfHandler Receives RDF parser events for all entries.
	 * @throws IOException         If an I/O error occurred while reading from the stream.
	 * @throws RDFParseException   If an error was found while parsing an entry; the message names the entry.
	 * @throws RDFHandlerException If thrown by the RDFHandler
	 */
	private void loadZip(InputStream in, String baseURI, RDFFormat dataFormat, RDFHandler rdfHandler)
			throws IOException, RDFParseException, RDFHandlerException {

		try (ZipInputStream zipIn = new ZipInputStream(in)) {
			for (ZipEntry entry = zipIn.getNextEntry(); entry != null; entry = zipIn.getNextEntry()) {
				if (entry.isDirectory()) {
					continue;
				}

				try {
					RDFFormat format = Rio.getParserFormatForFileName(entry.getName()).orElse(dataFormat);

					// Prevent parser (Xerces) from closing the input stream
					UncloseableInputStream wrapper = new UncloseableInputStream(zipIn);
					load(wrapper, baseURI, format, rdfHandler);

				} catch (RDFParseException e) {
					// Re-throw with the entry name added, keeping the position and the original cause
					String msg = e.getMessage() + " in " + entry.getName();
					RDFParseException pe = new RDFParseException(msg, e.getLineNumber(), e.getColumnNumber());
					pe.initCause(e);
					throw pe;
				} finally {
					zipIn.closeEntry();
				}
			} // end for
		}
	}

	/**
	 * Parses the data that can be read from the supplied InputStream or Reader and reports it to the RDFHandler. A new
	 * parser is created for every call and is configured with this loader's {@link ParserConfig} and a
	 * {@link ParseErrorLogger}.
	 *
	 * @param inputStreamOrReader An {@link InputStream} or {@link Reader} containing RDF data.
	 * @param baseURI             The base URI for the data.
	 * @param dataFormat          The file format of the data.
	 * @param rdfHandler          handles all data from all documents
	 * @throws IOException                  If an I/O error occurred while reading the data.
	 * @throws UnsupportedRDFormatException If no parser is available for the specified RDF format.
	 * @throws RDFParseException            If an error was found while parsing the RDF data.
	 * @throws RDFHandlerException          If thrown by the RDFHandler
	 * @throws IllegalArgumentException     If <var>inputStreamOrReader</var> is neither an InputStream nor a Reader
	 *                                      (including <var>null</var>).
	 */
	private void loadInputStreamOrReader(Object inputStreamOrReader, String baseURI, RDFFormat dataFormat,
			RDFHandler rdfHandler) throws IOException, RDFParseException, RDFHandlerException {
		RDFParser rdfParser = Rio.createParser(dataFormat, vf);
		rdfParser.setParserConfig(config);
		rdfParser.setParseErrorListener(new ParseErrorLogger());

		rdfParser.setRDFHandler(rdfHandler);

		if (inputStreamOrReader instanceof InputStream) {
			rdfParser.parse((InputStream) inputStreamOrReader, baseURI);
		} else if (inputStreamOrReader instanceof Reader) {
			rdfParser.parse((Reader) inputStreamOrReader, baseURI);
		} else {
			// Guard against null so that the intended exception is not masked by a NullPointerException
			throw new IllegalArgumentException("Must be an InputStream or a Reader, is a: "
					+ (inputStreamOrReader == null ? "null" : inputStreamOrReader.getClass()));
		}
	}
}