ParsedURI.java

/*******************************************************************************
 * Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Distribution License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *******************************************************************************/

package org.eclipse.rdf4j.common.net;

import java.util.LinkedList;
import java.util.StringTokenizer;

/**
 * A replacement for Java's own URI: java.net.URI. Java's implementation is quite buggy in that it doesn't resolve
 * relative URIs correctly.
 * <p>
 * Note: this implementation is not guaranteed to handle ipv6 addresses correctly (yet).
 *
 * @deprecated use {@link ParsedIRI} instead
 */
@Deprecated(since = "2.3")
public class ParsedURI implements java.lang.Cloneable {

	/*
	 * // Tesing method public static void main(String[] args) throws Exception { URI baseURI = new URI(args[0]);
	 * baseURI.normalize(); URI uri = null; for (int i = 0; i < 100; i++) { uri = baseURI.resolve(args[1]); } try {
	 * Thread.sleep(1000); } catch (Exception e) {} long startTime = System.currentTimeMillis(); for (int i = 0; i <
	 * 100; i++) { uri = baseURI.resolve(args[1]); } long endTime = System.currentTimeMillis();
	 * System.out.println(args[0] + " was parsed as:"); System.out.println("scheme = " + uri.getScheme());
	 * System.out.println("schemeSpecificPart = " + uri.getSchemeSpecificPart()); System.out.println("authority = " +
	 * uri.getAuthority()); System.out.println("path = " + uri.getPath()); System.out.println("query = " +
	 * uri.getQuery()); System.out.println("fragment = " + uri.getFragment()); System.out.println("full URI = " +
	 * uri.toString()); System.out.println(" parsed 100 times in " + (endTime-startTime) + "ms"); }
	 */

	/*-----------*
	 * Variables *
	 *-----------*/

	// For all URIs:
	private String _scheme;

	private String _schemeSpecificPart;

	private String _fragment;

	// For hierarchical URIs:
	private String _authority;

	private String _path;

	private String _query;

	/*--------------*
	 * Constructors *
	 *--------------*/

	public ParsedURI(String uriSpec) {
		_parse(uriSpec);
	}

	public ParsedURI(String scheme, String schemeSpecificPart, String fragment) {
		_scheme = scheme;
		_schemeSpecificPart = schemeSpecificPart;
		_fragment = fragment;
	}

	public ParsedURI(String scheme, String authority, String path, String query, String fragment) {
		_scheme = scheme;
		_authority = authority;
		_path = path;
		_query = query;
		_fragment = fragment;
	}

	/*-----------------------*
	 * Public access methods *
	 *-----------------------*/

	public boolean isHierarchical() {
		return _path != null;
	}

	public boolean isOpaque() {
		return _path == null;
	}

	public boolean isAbsolute() {
		return _scheme != null;
	}

	public boolean isRelative() {
		return _scheme == null;
	}

	/**
	 * Checks whether this URI is a relative URI that references itself (i.e. it only contains an anchor).
	 */
	public boolean isSelfReference() {
		return _scheme == null && _authority == null && _query == null && _path.length() == 0;
	}

	public String getScheme() {
		return _scheme;
	}

	public String getSchemeSpecificPart() {
		return _schemeSpecificPart;
	}

	public String getAuthority() {
		return _authority;
	}

	public String getPath() {
		return _path;
	}

	public String getQuery() {
		return _query;
	}

	public String getFragment() {
		return _fragment;
	}

	/*------------------------------*
	 * Methods for normalizing URIs *
	 *------------------------------*/

	/**
	 * Normalizes the path of this URI if it has one. Normalizing a path means that any unnecessary '.' and '..'
	 * segments are removed. For example, the URI <var>http://server.com/a/b/../c/./d</var> would be normalized to
	 * <var>http://server.com/a/c/d</var>. A URI doens't have a path if it is opaque.
	 */
	public void normalize() {
		if (_path == null) {
			return;
		}

		// Remove any '.' segments:

		_path = _path.replace("/./", "/");

		if (_path.startsWith("./")) {
			// Remove both characters
			_path = _path.substring(2);
		}

		if (_path.endsWith("/.")) {
			// Remove only the last dot, not the slash!
			_path = _path.substring(0, _path.length() - 1);
		}

		if (_path.indexOf("/../") == -1 && !_path.endsWith("/..")) {
			// There are no '..' segments that can be removed. We're done and
			// don't have to execute the time-consuming code following this
			// if-statement
			return;
		}

		// Split the path into its segments

		LinkedList<String> segments = new LinkedList<>();

		StringTokenizer st = new StringTokenizer(_path, "/");

		while (st.hasMoreTokens()) {
			segments.add(st.nextToken());
		}

		boolean lastSegmentRemoved = false;

		// Remove all unnecessary '..' segments

		int i = 1;
		while (i < segments.size()) {
			String segment = segments.get(i);

			if (segment.equals("..")) {
				String prevSegment = segments.get(i - 1);

				if (prevSegment.equals("..")) {
					// two consecutive '..' segments at position i-1 and i,
					// continue at i + 2
					i += 2;
				} else {
					// Bingo! Remove these two segments...
					if (i == segments.size() - 1) {
						lastSegmentRemoved = true;
					}

					segments.remove(i);
					segments.remove(i - 1);

					// ...and continue at position (i + 1 - 2) == (i - 1)...

					// ...but only if i > 1, position 0 does not need to be
					// checked.

					if (i > 1) {
						i--;
					}
				}
			} else {
				// Not a '..' segment, check next
				i++;
			}
		}

		// Construct the normalized path

		StringBuilder newPath = new StringBuilder(_path.length());

		if (_path.startsWith("/")) {
			newPath.append('/');
		}

		int segmentCount = segments.size();
		for (i = 0; i < segmentCount - 1; i++) {
			newPath.append(segments.get(i));
			newPath.append('/');
		}

		if (segmentCount > 0) {
			String lastSegment = segments.get(segmentCount - 1);
			newPath.append(lastSegment);

			if (_path.endsWith("/") || lastSegmentRemoved) {
				newPath.append('/');
			}
		}

		_path = newPath.toString();
	}

	/**
	 * Resolves a relative URI using this URI as the base URI.
	 */
	public ParsedURI resolve(String relURISpec) {
		// This algorithm is based on the algorithm specified in chapter 5 of
		// RFC 2396: URI Generic Syntax. See http://www.ietf.org/rfc/rfc2396.txt

		// RFC, step 1:
		ParsedURI relURI = new ParsedURI(relURISpec);

		return this.resolve(relURI);
	}

	/**
	 * Resolves a relative URI using this URI as the base URI.
	 */
	public ParsedURI resolve(ParsedURI relURI) {
		// This algorithm is based on the algorithm specified in chapter 5 of
		// RFC 2396: URI Generic Syntax. See http://www.ietf.org/rfc/rfc2396.txt

		// RFC, step 3:
		if (relURI.isAbsolute()) {
			return relURI;
		}

		// relURI._scheme == null

		// RFC, step 2:
		if (relURI._authority == null && relURI._query == null && relURI._path.length() == 0) {
			// Reference to this URI
			ParsedURI result = (ParsedURI) this.clone();

			// Inherit any fragment identifier from relURI
			result._fragment = relURI._fragment;

			return result;
		}

		// We can start combining the URIs
		String scheme, authority, path, query, fragment;
		boolean normalizeURI = false;

		scheme = this._scheme;
		query = relURI._query;
		fragment = relURI._fragment;

		// RFC, step 4:
		if (relURI._authority != null) {
			authority = relURI._authority;
			path = relURI._path;
		} else {
			authority = this._authority;

			// RFC, step 5:
			if (relURI._path.startsWith("/")) {
				path = relURI._path;
			} else if (relURI._path.length() == 0) {
				path = this._path;
			} else {
				// RFC, step 6:
				path = this._path;

				if (path == null) {
					path = "/";
				} else {
					if (!path.endsWith("/")) {
						// Remove the last segment of the path. Note: if
						// lastSlashIdx is -1, the path will become empty,
						// which is fixed later.
						int lastSlashIdx = path.lastIndexOf('/');
						path = path.substring(0, lastSlashIdx + 1);
					}

					if (path.length() == 0) {
						// No path means: start at root.
						path = "/";
					}
				}

				// Append the path of the relative URI
				path += relURI._path;

				// Path needs to be normalized.
				normalizeURI = true;
			}
		}

		ParsedURI result = new ParsedURI(scheme, authority, path, query, fragment);

		if (normalizeURI) {
			result.normalize();
		}

		return result;
	}

	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder(64);

		if (_scheme != null) {
			sb.append(_scheme);
			if (!isJarScheme(_scheme)) {
				sb.append(':');
			}
		}

		if (isOpaque()) {
			// Opaque URI
			if (_schemeSpecificPart != null) {
				sb.append(_schemeSpecificPart);
			}
		} else {
			// Hierachical URI
			if (_authority != null) {
				sb.append("//");
				sb.append(_authority);
			}

			sb.append(_path);

			if (_query != null) {
				sb.append('?');
				sb.append(_query);
			}
		}

		if (_fragment != null) {
			sb.append('#');
			sb.append(_fragment);
		}

		return sb.toString();
	}

	// Overrides Object.clone()
	@Override
	public Object clone() {
		try {
			return super.clone();
		} catch (CloneNotSupportedException e) {
			throw new RuntimeException(e);
		}
	}

	/*--------------------------*
	 * Methods for parsing URIs *
	 *--------------------------*/

	private static boolean isJarScheme(String s) {
		return (s.length() > 4 && s.substring(0, 4).equalsIgnoreCase("jar:"));
	}

	private void _parse(String uri) {
		if (isJarScheme(uri)) {
			// uriString is e.g.
			// jar:http://www.foo.com/bar/baz.jar!/COM/foo/Quux.class
			// Treat the part up to and including the exclamation mark as the
			// scheme and
			// the rest as the path to enable 'correct' resolving of relative URIs
			int idx = uri.indexOf('!');
			if (idx != -1) {
				String scheme = uri.substring(0, idx + 1);
				String path = uri.substring(idx + 1);

				_scheme = scheme;
				_authority = null;
				_path = path;
				_query = null;
				_fragment = null;

				return;
			}
		}

		if (_parseScheme(uri)) {
			// A scheme was found; _scheme and _schemeSpecificPart are now set
			if (_schemeSpecificPart.startsWith("/")) {
				// Hierachical URI
				String rest = _schemeSpecificPart;
				rest = _parseAuthority(rest);
				rest = _parsePath(rest);
				rest = _parseQuery(rest);
				_parseFragment(rest);
			} else {
				// Opaque URI
				String rest = _schemeSpecificPart;
				rest = _parseOpaquePart(rest);
				_parseFragment(rest);
			}
		} else {
			// No scheme was found
			String rest = uri;
			rest = _parseAuthority(rest);
			rest = _parsePath(rest);
			rest = _parseQuery(rest);
			_parseFragment(rest);
		}
	}

	private boolean _parseScheme(String uri) {
		// Query cannot contain a ':', '/', '?' or '#' character

		// Try to find the scheme in the URI
		char c = 0;
		int i = 0;

		for (; i < uri.length(); i++) {
			c = uri.charAt(i);
			if (c == ':' || c == '/' || c == '?' || c == '#') {
				// c is equal to one of the illegal chars
				break;
			}
		}

		if (c == ':' && i > 0) {
			// We've found a scheme
			_scheme = uri.substring(0, i);
			_schemeSpecificPart = uri.substring(i + 1);
			return true;
		}

		// No scheme found, uri is relative
		return false;
	}

	private String _parseAuthority(String s) {
		// Query cannot contain a '/', '?' or '#' character

		if (s.startsWith("//")) {
			// Authority present, could be empty though.
			int i = 2;
			for (; i < s.length(); i++) {
				char c = s.charAt(i);
				if (c == '/' || c == '?' || c == '#') {
					// c is equal to one of the illegal chars
					break;
				}
			}

			_authority = s.substring(2, i);
			return s.substring(i);
		}

		return s;
	}

	private String _parsePath(String s) {
		// Query cannot contain a '?' or '#' character

		int i = 0;
		for (; i < s.length(); i++) {
			char c = s.charAt(i);
			if (c == '?' || c == '#') {
				// c is equal to one of the illegal chars
				break;
			}
		}

		_path = s.substring(0, i);

		return s.substring(i);
	}

	private String _parseQuery(String s) {
		// Query must start with a '?' and cannot contain a '#' character

		if (s.startsWith("?")) {
			int i = 1;
			for (; i < s.length(); i++) {
				char c = s.charAt(i);
				if (c == '#') {
					// c is equal to one of the illegal chars
					break;
				}
			}

			_query = s.substring(1, i);
			return s.substring(i);
		} else {
			return s;
		}
	}

	private String _parseOpaquePart(String s) {
		// Opaque part cannot contain a '#' character

		int i = 0;
		for (; i < s.length(); i++) {
			char c = s.charAt(i);
			if (c == '#') {
				// c is equal to one of the illegal chars
				break;
			}
		}

		_schemeSpecificPart = s.substring(0, i);

		return s.substring(i);
	}

	private void _parseFragment(String s) {
		// Fragment must start with a '#'
		if (s.startsWith("#")) {
			_fragment = s.substring(1);
		}
	}
}