ParsedIRI.java

/*******************************************************************************
 * Copyright (c) 2017 Eclipse RDF4J contributors, Aduna, and others.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Distribution License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *******************************************************************************/

package org.eclipse.rdf4j.common.net;

import java.io.Serializable;
import java.net.IDN;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;

/**
 * Represents an Internationalized Resource Identifier (IRI) reference.
 * <p>
 * Aside from some minor deviations noted below, an instance of this class represents a IRI reference as defined by
 * <a href="http://www.ietf.org/rfc/rfc3987.txt"><i>RFC&nbsp;3987: Internationalized Resource Identifiers (IRI): IRI
 * Syntax</i></a>. This class provides constructors for creating IRI instances from their components or by parsing their
 * string forms, methods for accessing the various components of an instance, and methods for normalizing, resolving,
 * and relativizing IRI instances. Instances of this class are immutable.
 * <p>
 * An IRI instance has the following seven components in string form has the syntax <blockquote>
 * [<i>scheme</i><b>{@code :}</b>][<b>{@code //}</b>[<i>user-info</i><b>{@code @}</b>]<i>host</i>[<b>{@code :}</b><i>port</i>]][<i>path</i>][<b>{@code ?}</b><i>query</i>][<b>{@code #}</b><i>fragment</i>]
 * </blockquote>
 * <p>
 * In a given instance any particular component is either <i>undefined</i> or <i>defined</i> with a distinct value.
 * Undefined string components are represented by {@code null}, while undefined integer components are represented by
 * {@code -1}. A string component may be defined to have the empty string as its value; this is not equivalent to that
 * component being undefined.
 * <p>
 * Whether a particular component is or is not defined in an instance depends upon the type of the IRI being
 * represented. An absolute IRI has a scheme component. An opaque IRI has a scheme, a scheme-specific part, and possibly
 * a fragment, but has no other components. A hierarchical IRI always has a path (though it may be empty) and a
 * scheme-specific-part (which at least contains the path), and may have any of the other components.
 * <h4>IRIs, URIs, URLs, and URNs</h4>
 * <p>
 * IRIs are meant to replace URIs in identifying resources for protocols, formats, and software components that use a
 * UCS-based character repertoire.
 * <p>
 * Internationalized Resource Identifier (IRI) is a complement to the Uniform Resource Identifier (URI). An IRI is a
 * sequence of characters from the Universal Character Set (Unicode/ISO 10646). A mapping from IRIs to URIs is defined
 * using {@link #toASCIIString()}, which means that IRIs can be used instead of URIs, where appropriate, to identify
 * resources. While all URIs are also IRIs, the {@link #normalize()} method can be used to convert a URI back into a
 * normalized IRI.
 * <p>
 * A URI is a uniform resource <i>identifier</i> while a URL is a uniform resource <i>locator</i>. Hence every URL is a
 * URI, abstractly speaking, but not every URI is a URL. This is because there is another subcategory of URIs, uniform
 * resource <i>names</i> (URNs), which name resources but do not specify how to locate them. The {@code mailto},
 * {@code news}, and {@code isbn} URIs shown above are examples of URNs.
 * <h4>Deviations</h4>
 * <p>
 * <b>jar:</b> This implementation treats the first colon as part of the scheme if the scheme starts with "jar:". For
 * example the IRI <i>jar:http://www.foo.com/bar/jar.jar!/baz/entry.txt</i> is parsed with the scheme <i>jar:http</i>
 * and the path <i>/bar/jar.jar!/baz/entry.txt</i>.
 *
 * @author James Leigh
 * @see <a href="http://www.ietf.org/rfc/rfc3987.txt"><i>RFC&nbsp;3987: Internationalized Resource Identifiers
 *      (IRIs)</i></a>
 * @see <a href="http://www.ietf.org/rfc/rfc3986.txt"><i>RFC&nbsp;3986: Uniform Resource Identifiers (URI): Generic
 *      Syntax</i></a>
 * @since 2.3
 */
public class ParsedIRI implements Cloneable, Serializable {

	private static final long serialVersionUID = -5681843777254402303L;

	private static final Comparator<int[]> CMP = (int[] o1, int[] o2) -> o1[0] - o2[0];

	private static final int EOF = 0;

	private static final int[][] iprivate = { new int[] { 0xE000, 0xF8FF }, new int[] { 0xF0000, 0xFFFFD },
			new int[] { 0x100000, 0x10FFFD } };

	private static final int[][] ucschar = { new int[] { 0x00A0, 0xD7FF }, new int[] { 0xF900, 0xFDCF },
			new int[] { 0xFDF0, 0xFFEF }, new int[] { 0x10000, 0x1FFFD }, new int[] { 0x20000, 0x2FFFD },
			new int[] { 0x30000, 0x3FFFD }, new int[] { 0x40000, 0x4FFFD }, new int[] { 0x50000, 0x5FFFD },
			new int[] { 0x60000, 0x6FFFD }, new int[] { 0x70000, 0x7FFFD }, new int[] { 0x80000, 0x8FFFD },
			new int[] { 0x90000, 0x9FFFD }, new int[] { 0xA0000, 0xAFFFD }, new int[] { 0xB0000, 0xBFFFD },
			new int[] { 0xC0000, 0xCFFFD }, new int[] { 0xD0000, 0xDFFFD }, new int[] { 0xE1000, 0xEFFFD } };

	private static final int[][] ALPHA = { new int[] { 'A', 'Z' }, new int[] { 'a', 'z' } };

	private static final int[][] DIGIT = { new int[] { '0', '9' } };

	private static final int[][] sub_delims = union('!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=');

	private static final int[][] gen_delims = union(':', '/', '?', '#', '[', ']', '@');

	private static final int[][] reserved = union(gen_delims, sub_delims);

	private static final int[][] unreserved_rfc3986 = union(ALPHA, DIGIT, '-', '.', '_', '~');

	private static final int[][] unreserved = union(unreserved_rfc3986, ucschar);

	private static final int[][] schar = union(ALPHA, DIGIT, '+', '-', '.');

	private static final int[][] uchar = union(unreserved, sub_delims, ':');

	private static final int[][] hchar = union(unreserved, sub_delims);

	private static final int[][] pchar = union(unreserved, sub_delims, ':', '@');

	private static final int[][] qchar = union(pchar, iprivate, '/', '?');

	private static final int[][] fchar = union(pchar, '/', '?');

	private static final int[] HEXDIG = flatten(
			union(DIGIT, new int[][] { new int[] { 'A', 'F' }, new int[] { 'a', 'f' } }));

	private static final int[] ascii = flatten(union(unreserved_rfc3986, reserved, '%'));

	private static final int[] common = flatten(
			union(unreserved_rfc3986, reserved, '%', '<', '>', '"', ' ', '{', '}', '|', '\\', '^', '`'));

	private static final String[] common_pct = pctEncode(common);

	private static int[][] union(Object... sets) {
		List<int[]> list = new ArrayList<>();
		for (Object set : sets) {
			if (set instanceof int[][]) {
				int[][] ar = (int[][]) set;
				list.addAll(Arrays.asList(ar));
			} else if (set instanceof Character) {
				char chr = (Character) set;
				list.add(new int[] { chr, chr });
			} else {
				throw new IllegalStateException();
			}
		}
		int[][] dest = list.toArray(new int[][] {});
		Arrays.sort(dest, CMP);
		return dest;
	}

	private static int[] flatten(int[]... arrays) {
		List<Integer> list = new ArrayList<>();
		for (int[] str : arrays) {
			if (str.length == 1) {
				list.add(str[0]); // character
			} else if (str.length == 2) {
				for (int chr = str[0], end = str[1]; chr <= end; chr++) {
					list.add(chr); // range
				}
			} else {
				throw new IllegalStateException();
			}
		}
		int[] chars = new int[list.size()];
		for (int i = 0; i < chars.length; i++) {
			chars[i] = list.get(i);
		}
		Arrays.sort(chars);
		return chars;
	}

	private static String[] pctEncode(int[] unencoded) {
		CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder();
		String[] result = new String[unencoded.length];
		for (int i = 0; i < unencoded.length; i++) {
			String ns = new String(Character.toChars(unencoded[i]));
			ByteBuffer bb;
			try {
				bb = encoder.encode(CharBuffer.wrap(ns));
			} catch (CharacterCodingException x) {
				throw new IllegalStateException();
			}
			StringBuilder sb = new StringBuilder();
			while (bb.hasRemaining()) {
				byte b = (byte) (bb.get() & 0xff);
				sb.append('%');
				sb.appendCodePoint(HEXDIG[(b >> 4) & 0x0f]);
				sb.appendCodePoint(HEXDIG[(b) & 0x0f]);
			}
			result[i] = sb.toString();
		}
		return result;
	}

	/**
	 * Creates a ParsedIRI by parsing the given string.
	 * <p>
	 * This convenience factory method works as if by invoking the {@link #ParsedIRI(String)} constructor; any
	 * {@link URISyntaxException} thrown by the constructor is caught and the error code point is percent encoded. This
	 * process is repeated until a syntactically valid IRI is formed or a {@link IllegalArgumentException} is thrown.
	 * <p>
	 * This method is provided for use in situations where it is known that the given string is an IRI, even if it is
	 * not completely syntactically valid, for example a IRI constants declared within in a program. The constructors,
	 * which throw {@link URISyntaxException} directly, should be used situations where a IRI is being constructed from
	 * user input or from some other source that may be prone to errors.
	 * </p>
	 *
	 * @param str The string to be parsed into an IRI
	 * @return The new ParsedIRI
	 * @throws NullPointerException     If {@code str} is {@code null}
	 * @throws IllegalArgumentException If the given string could not be converted into an IRI
	 */
	public static ParsedIRI create(String str) {
		try {
			return new ParsedIRI(str);
		} catch (URISyntaxException x) {
			int problem = x.getIndex();
			StringBuilder sb = new StringBuilder(str);
			while (true) {
				int end = sb.offsetByCodePoints(problem, 1);
				String decoded = sb.substring(problem, end);
				String encoded = " ".equals(decoded) ? "%20" : URLEncoder.encode(decoded, StandardCharsets.UTF_8);
				sb.replace(problem, end, encoded);
				try {
					return new ParsedIRI(sb.toString());
				} catch (URISyntaxException ex) {
					if (ex.getIndex() <= problem) {
						throw new IllegalArgumentException(x.getMessage(), x);
					} else {
						problem = ex.getIndex();
					}
				}
			}

		}
	}

	private final String iri;

	private int pos;

	private String scheme;

	private String userInfo;

	private String host;

	private int port = -1;

	private String path;

	private String query;

	private String fragment;

	/**
	 * Constructs a ParsedIRI by parsing the given string.
	 *
	 * @param iri The string to be parsed into a IRI
	 * @throws NullPointerException If {@code iri} is {@code null}
	 * @throws URISyntaxException   If the given string violates RFC&nbsp;3987, as augmented by the above deviations
	 */
	public ParsedIRI(String iri) throws URISyntaxException {
		assert iri != null;
		this.iri = iri;
		parse();
	}

	/**
	 * Constructs a hierarchical IRI from the given components.
	 * <p>
	 * This constructor first builds a IRI string from the given components according to the rules specified in
	 * <a href="http://www.ietf.org/rfc/rfc3987.txt">RFC&nbsp;3987</a>
	 * </p>
	 *
	 * @param scheme   Scheme name
	 * @param userInfo User name and authorization information
	 * @param host     Host name
	 * @param port     Port number
	 * @param path     Path
	 * @param query    Query
	 * @param fragment Fragment
	 */
	public ParsedIRI(String scheme, String userInfo, String host, int port, String path, String query,
			String fragment) {
		this.iri = buildIRI(scheme, userInfo, host, port, path, query, fragment);
		this.scheme = scheme;
		this.userInfo = userInfo;
		this.host = host;
		this.port = port;
		this.path = path == null ? "" : path;
		this.query = query;
		this.fragment = fragment;
	}

	@Override
	public int hashCode() {
		return iri.hashCode();
	}

	/**
	 * Tests this IRI for simple string comparison with another object.
	 * <p>
	 * If two IRI strings are identical, then it is safe to conclude that they are equivalent. However, even if the IRI
	 * strings are not identical the IRIs might still be equivalent. Further comparison can be made using the
	 * {@link #normalize()} forms.
	 *
	 * @param obj The object to which this object is to be compared
	 * @return {@code true} if the given object is a ParsedIRI that represents the same IRI
	 */
	@Override
	public boolean equals(Object obj) {
		return obj instanceof ParsedIRI && this.iri.equals(((ParsedIRI) obj).iri);
	}

	/**
	 * Returns the content of this IRI as a string.
	 * <p>
	 * If this URI was created by invoking one of the constructors in this class then a string equivalent to the
	 * original input string, or to the string computed from the originally-given components, as appropriate, is
	 * returned. Otherwise this IRI was created by normalization, resolution, or relativization, and so a string is
	 * constructed from this IRI's components according to the rules specified in
	 * <a href="http://www.ietf.org/rfc/rfc3987.txt">RFC&nbsp;3987</a>
	 * </p>
	 *
	 * @return The string form of this IRI
	 */
	@Override
	public String toString() {
		return iri;
	}

	/**
	 * Returns the content of this IRI as a US-ASCII string.
	 * <p>
	 * If this IRI only contains 8bit characters then an invocation of this method will return the same value as an
	 * invocation of the {@link #toString() toString} method. Otherwise this method works as if by encoding the host via
	 * <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a> and all other components by percent encoding their
	 * UTF-8 values.
	 * </p>
	 *
	 * @return The string form of this IRI, encoded as needed so that it only contains characters in the US-ASCII
	 *         charset
	 */
	public String toASCIIString() {
		StringBuilder sb = new StringBuilder(iri.length());
		if (scheme != null) {
			sb.append(scheme).append(':');
		}
		if (host != null) {
			sb.append("//");
			if (userInfo != null) {
				appendAscii(sb, userInfo);
				sb.append('@');
			}
			if (!host.isEmpty()) {
				sb.append(IDN.toASCII(host, IDN.ALLOW_UNASSIGNED));
			}
			if (port >= 0) {
				sb.append(':').append(port);
			}
		}
		if (path != null) {
			appendAscii(sb, path);
		}
		if (query != null) {
			sb.append('?');
			appendAscii(sb, query);
		}
		if (fragment != null) {
			sb.append('#');
			appendAscii(sb, fragment);
		}
		return sb.toString();
	}

	/**
	 * Tells whether or not this IRI is absolute.
	 *
	 * @return {@code true} if, and only if, this IRI has a scheme component
	 */
	public boolean isAbsolute() {
		return scheme != null;
	}

	/**
	 * Tells whether or not this IRI is opaque.
	 * <p>
	 * A IRI is opaque if, and only if, it is absolute and its path part does not begin with a slash character ('/'). An
	 * opaque IRI has a scheme, a path, and possibly a query or fragment; all other components (userInfo, host, and
	 * port) are undefined.
	 * </p>
	 *
	 * @return {@code true} if, and only if, this IRI is absolute and its path does not start with a slash
	 */
	public boolean isOpaque() {
		return scheme != null && !path.isEmpty() && !path.startsWith("/");
	}

	/**
	 * Returns the scheme component of this IRI.
	 * <p>
	 * The scheme component of a IRI, if defined, only contains characters in the <i>alphanum</i> category and in the
	 * string {@code "-.+"}, unless the scheme starts with {@code "jar:"}, in which case it may also contain one colon.
	 * A scheme always starts with an <i>alpha</i> character.
	 * <p>
	 * The scheme component of a IRI cannot contain escaped octets.
	 *
	 * @return The scheme component of this IRI, or {@code null} if the scheme is undefined
	 */
	public String getScheme() {
		return scheme;
	}

	/**
	 * Returns the raw user-information component of this IRI.
	 *
	 * @return The raw user-information component of this IRI, or {@code null} if the user information is undefined
	 */
	public String getUserInfo() {
		return userInfo;
	}

	/**
	 * Returns the host component of this IRI.
	 *
	 * @return The host component of this IRI, or {@code null} if the host is undefined
	 */
	public String getHost() {
		return host;
	}

	/**
	 * Returns the port number of this IRI.
	 * <p>
	 * The port component of a IRI, if defined, is a non-negative integer.
	 * </p>
	 *
	 * @return The port component of this IRI, or {@code -1} if the port is undefined
	 */
	public int getPort() {
		return port;
	}

	/**
	 * Returns the raw path component of this IRI.
	 *
	 * @return The path component of this IRI (never {@code null})
	 */
	public String getPath() {
		return path;
	}

	/**
	 * Returns the raw query component of this IRI after the first question mark.
	 * <p>
	 * The query component of a IRI, if defined, only contains legal IRI characters.
	 * </p>
	 *
	 * @return The raw query component of this IRI, or {@code null} if the IRI does not contain a question mark
	 */
	public String getQuery() {
		return query;
	}

	/**
	 * Returns the raw fragment component of this IRI after the hash.
	 * <p>
	 * The fragment component of a IRI, if defined, only contains legal IRI characters and does not contain a hash.
	 * </p>
	 *
	 * @return The raw fragment component of this IRI, or {@code null} if the IRI does not contain a hash
	 */
	public String getFragment() {
		return fragment;
	}

	/**
	 * Normalizes this IRI's components.
	 * <p>
	 * Because IRIs exist to identify resources, presumably they should be considered equivalent when they identify the
	 * same resource. However, this definition of equivalence is not of much practical use, as there is no way for an
	 * implementation to compare two resources unless it has full knowledge or control of them. Therefore, IRI
	 * normalization is designed to minimize false negatives while strictly avoiding false positives.
	 * <p>
	 * <b>Case Normalization</b> the hexadecimal digits within a percent-encoding triplet (e.g., "%3a" versus "%3A") are
	 * case-insensitive and are normalized to use uppercase letters for the digits A - F. The scheme and host are case
	 * insensitive and are normalized to lowercase.
	 * <p>
	 * <b>Character Normalization</b> The Unicode Standard defines various equivalences between sequences of characters
	 * for various purposes. Unicode Standard Annex defines various Normalization Forms for these equivalences and is
	 * applied to the IRI components.
	 * <p>
	 * <b>Percent-Encoding Normalization</b> decodes any percent-encoded octet sequence that corresponds to an
	 * unreserved character anywhere in the IRI.
	 * <p>
	 * <b>Path Segment Normalization</b> is the process of removing unnecessary {@code "."} and {@code ".."} segments
	 * from the path component of a hierarchical IRI. Each {@code "."} segment is simply removed. A {@code ".."} segment
	 * is removed only if it is preceded by a non-{@code ".."} segment or the start of the path.
	 * <p>
	 * <b>HTTP(S) Scheme Normalization</b> if the port uses the default port number or not given it is set to undefined.
	 * An empty path is replaced with "/".
	 * <p>
	 * <b>File Scheme Normalization</b> if the host is "localhost" or empty it is set to undefined.
	 * <p>
	 * <b>Internationalized Domain Name Normalization</b> of the host component to Unicode.
	 *
	 * @return normalized IRI
	 */
	public ParsedIRI normalize() {
		String _scheme = toLowerCase(scheme);
		boolean optionalPort = isScheme("http") && 80 == port || isScheme("https") && 443 == port;
		int _port = optionalPort ? -1 : port;
		boolean localhost = isScheme("file") && userInfo == null && -1 == port
				&& ("".equals(host) || "localhost".equals(host));
		String _host = localhost ? null
				: host == null || host.isEmpty() ? host
						: IDN.toUnicode(pctEncodingNormalization(toLowerCase(host)),
								IDN.USE_STD3_ASCII_RULES | IDN.ALLOW_UNASSIGNED);
		String _path = _scheme != null && path == null ? "" : normalizePath(path);
		String _userInfo = pctEncodingNormalization(userInfo);
		String _query = pctEncodingNormalization(query);
		String _fragment = pctEncodingNormalization(fragment);
		ParsedIRI normalized = new ParsedIRI(_scheme, _userInfo, _host, _port, _path, _query, _fragment);
		if (this.iri.equals(normalized.iri)) {
			return this;
		} else {
			return normalized;
		}
	}

	/**
	 * Resolves the given IRI against this ParsedIRI.
	 *
	 * @param iri The IRI to be resolved against this ParsedIRI
	 * @return The resulting IRI
	 * @throws NullPointerException If {@code relative} is {@code null}
	 * @see #resolve(ParsedIRI)
	 */
	public String resolve(String iri) {
		return resolve(ParsedIRI.create(iri)).toString();
	}

	/**
	 * Resolves the given IRI against this ParsedIRI.
	 * <p>
	 * <i>Resolution</i> is the process of resolving one IRI against another, <i>base</i> IRI. The resulting IRI is
	 * constructed from components of both IRIs in the manner specified by RFC&nbsp;3986, taking components from the
	 * base IRI for those not specified in the original. For hierarchical IRIs, the path of the original is resolved
	 * against the path of the base and then normalized.
	 * <p>
	 * If the given IRI is already absolute, or if this IRI is opaque, then the given IRI is returned.
	 * <p>
	 * <a name="resolve-frag"></a> If the given URI's fragment component is defined, its path component is empty, and
	 * its scheme, authority, and query components are undefined, then a URI with the given fragment but with all other
	 * components equal to those of this URI is returned. This allows an IRI representing a standalone fragment
	 * reference, such as {@code "#foo"}, to be usefully resolved against a base IRI.
	 * <p>
	 * Otherwise this method constructs a new hierarchical IRI in a manner consistent with
	 * <a href="http://www.ietf.org/rfc/rfc3987.txt">RFC&nbsp;3987</a>
	 * </p>
	 * <p>
	 * The result of this method is absolute if, and only if, either this IRI is absolute or the given IRI is absolute.
	 * </p>
	 *
	 * @param relative The IRI to be resolved against this ParsedIRI
	 * @return The resulting IRI
	 * @throws NullPointerException If {@code relative} is {@code null}
	 */
	public ParsedIRI resolve(ParsedIRI relative) {
		// This algorithm is based on the algorithm specified in chapter 5 of
		// RFC 2396: URI Generic Syntax. See http://www.ietf.org/rfc/rfc2396.txt

		// RFC, step 3:
		if (relative.isAbsolute()) {
			return relative;
		}

		// relURI._scheme == null

		// RFC, step 2:
		if (relative.getHost() == null && relative.getQuery() == null && relative.getPath().isEmpty()) {

			// Inherit any fragment identifier from relURI
			String fragment = relative.getFragment();

			return new ParsedIRI(this.getScheme(), this.getUserInfo(), this.getHost(), this.getPort(), this.getPath(),
					this.getQuery(), fragment);
		} else if (relative.getHost() == null && relative.getPath().isEmpty()) {

			// Inherit any query or fragment from relURI
			String query = relative.getQuery();
			String fragment = relative.getFragment();

			return new ParsedIRI(this.getScheme(), this.getUserInfo(), this.getHost(), this.getPort(), this.getPath(),
					query, fragment);
		}

		// We can start combining the URIs
		String scheme, userInfo, host, path, query, fragment;
		int port;
		boolean normalize = false;

		scheme = this.getScheme();
		query = relative.getQuery();
		fragment = relative.getFragment();

		// RFC, step 4:
		if (relative.getHost() != null) {
			userInfo = relative.getUserInfo();
			host = relative.getHost();
			port = relative.getPort();
			path = relative.getPath();
		} else {
			userInfo = this.getUserInfo();
			host = this.getHost();
			port = this.getPort();

			// RFC, step 5:
			if (relative.getPath().startsWith("/")) {
				path = relative.getPath();
			} else {
				// RFC, step 6:
				path = this.getPath();

				if (path == null) {
					path = "/";
				} else {
					if (!path.endsWith("/")) {
						// Remove the last segment of the path. Note: if
						// lastSlashIdx is -1, the path will become empty,
						// which is fixed later.
						int lastSlashIdx = path.lastIndexOf('/');
						path = path.substring(0, lastSlashIdx + 1);
					}

					if (path.isEmpty()) {
						// No path means: start at root.
						path = "/";
					}
				}

				// Append the path of the relative URI
				path += relative.getPath();

				// Path needs to be normalized.
				normalize = true;
			}
		}

		if (normalize || path.contains("/./") || path.contains("/../")) {
			path = pathSegmentNormalization(path);
		}

		return new ParsedIRI(scheme, userInfo, host, port, path, query, fragment);
	}

	/**
	 * Relativizes the given IRI against this ParsedIRI.
	 *
	 * @param iri The IRI to be relativized against this ParsedIRI
	 * @return The resulting IRI
	 * @throws NullPointerException If {@code absolute} is {@code null}
	 * @see #relativize(ParsedIRI)
	 */
	public String relativize(String iri) {
		return relativize(ParsedIRI.create(iri)).toString();
	}

	/**
	 * Relativizes the given IRI against this ParsedIRI.
	 * <p>
	 * <i>Relativization</i> is the inverse of resolution. This operation is often useful when constructing a document
	 * containing IRIs that must be made relative to the base IRI of the document wherever possible.
	 * <p>
	 * The relativization of the given URI against this URI is computed as follows:
	 * </p>
	 * <ol>
	 * <li>
	 * <p>
	 * If either this IRI or the given IRI are opaque, or if the scheme and authority components of the two IRIs are not
	 * identical, or if the path of this IRI is not a prefix of the path of the given URI, then the given IRI is
	 * returned.
	 * </p>
	 * </li>
	 * <li>
	 * <p>
	 * Otherwise a new relative hierarchical IRI is constructed with query and fragment components taken from the given
	 * IRI and with a path component computed by removing this IRI's path from the beginning of the given IRI's path.
	 * </p>
	 * </li>
	 * </ol>
	 *
	 * @param absolute The IRI to be relativized against this ParsedIRI
	 * @return The resulting IRI
	 * @throws NullPointerException If {@code absolute} is {@code null}
	 */
	public ParsedIRI relativize(ParsedIRI absolute) {
		// identity URI reference
		String _frag = absolute.getFragment();
		if (iri.equals(absolute.iri) && _frag == null) {
			return new ParsedIRI(null, null, null, -1, "", null, null);
		}
		// different scheme or authority
		if (absolute.getScheme() != null && !absolute.getScheme().equalsIgnoreCase(this.getScheme())) {
			return absolute;
		}
		if (absolute.getUserInfo() != null && !absolute.getUserInfo().equals(this.getUserInfo())) {
			return absolute;
		}
		if (absolute.getHost() != null && !absolute.getHost().equalsIgnoreCase(this.getHost())) {
			return absolute;
		}
		if (absolute.getPort() != this.getPort()) {
			return absolute;
		}
		// fragment URI reference
		if (_frag != null) {
			if (this.getFragment() == null) {
				if (absolute.iri.startsWith(this.iri) && absolute.iri.charAt(iri.length()) == '#') {
					return new ParsedIRI(null, null, null, -1, "", null, _frag);
				}
			} else {
				int this_idx = iri.length() - this.getFragment().length();
				int abs_idx = absolute.iri.length() - _frag.length();
				if (iri.substring(0, this_idx).equals(absolute.iri.substring(0, abs_idx))) {
					return new ParsedIRI(null, null, null, -1, "", null, _frag);
				}
			}
		}
		// opaque IRI
		if (this.isOpaque() || absolute.isOpaque()) {
			return absolute;
		}
		// query string URI reference
		String _query = absolute.getQuery();
		if (_query != null) {
			if (this.getQuery() == null && this.getFragment() == null) {
				if (absolute.iri.startsWith(this.iri) && absolute.iri.charAt(iri.length()) == '?') {
					return new ParsedIRI(null, null, null, -1, "", _query, _frag);
				}
			} else {
				int this_idx = this.getQuery() == null ? iri.indexOf('#') : iri.indexOf('?');
				int abs_idx = absolute.iri.indexOf('?');
				if (iri.substring(0, this_idx).equals(absolute.iri.substring(0, abs_idx))) {
					return new ParsedIRI(null, null, null, -1, "", _query, _frag);
				}
			}
		}
		// last path segment
		String _path = absolute.getPath();
		int this_idx = this.getPath().lastIndexOf('/');
		int abs_idx = _path.lastIndexOf('/');
		if (this_idx < 0 || abs_idx < 0) {
			return absolute;
		}
		if (_path.equals(this.getPath().substring(0, this_idx + 1))) {
			return new ParsedIRI(null, null, null, -1, ".", _query, _frag);
		}
		// within last path segment
		if (_path.startsWith(this.getPath().substring(0, this_idx + 1))) {
			return new ParsedIRI(null, null, null, -1, _path.substring(this_idx + 1), _query, _frag);
		}
		return new ParsedIRI(null, null, null, -1, relativizePath(_path), _query, _frag);
	}

	private void parse() throws URISyntaxException {
		pos = 0;
		scheme = parseScheme();
		if ("jar".equalsIgnoreCase(scheme)) {
			scheme = scheme + ':' + parseScheme();
		}
		int peek = peek();
		if ('/' == peek && '/' == peek(1)) {
			advance(2);
			if (iri.indexOf('@') >= 0) {
				userInfo = parseUserInfo();
			}
			host = parseHost();
			if (':' == peek()) {
				advance(1);
				String p = parseMember(DIGIT, '/');
				if (!p.isEmpty()) {
					port = Integer.parseInt(p);
				} else {
					port = -1;
				}
			}
			int next = peek();
			if ('/' == next || '?' == next || '#' == next || EOF == next) {
				path = parsePath();
			} else {
				throw error("absolute or empty path expected");
			}
		} else if ('/' == peek || '?' == peek || '#' == peek || EOF == peek) {
			path = parsePath();
		} else if ('%' == peek || ':' != peek && isMember(pchar, peek)) {
			path = parsePath();
		} else if (scheme != null && ':' == peek) {
			path = parsePath();
		}
		if ('?' == peek()) {
			advance(1);
			query = parsePctEncoded(qchar, '#', EOF);
		}
		if ('#' == peek()) {
			advance(1);
			fragment = parsePctEncoded(fchar, '#', EOF);
		}
		if (pos != iri.length()) {
			throw error("Unexpected character");
		}
	}

	private String buildIRI(String scheme, String userInfo, String host, int port, String path, String query,
			String fragment) {
		StringBuilder sb = new StringBuilder();
		if (scheme != null) {
			sb.append(scheme).append(':');
		}
		if (host != null) {
			sb.append("//");
			if (userInfo != null) {
				sb.append(userInfo).append('@');
			}
			sb.append(host);
			if (port >= 0) {
				sb.append(':').append(port);
			}
		}
		if (path != null) {
			sb.append(path);
		}
		if (query != null) {
			sb.append('?').append(query);
		}
		if (fragment != null) {
			sb.append('#').append(fragment);
		}
		return sb.toString();
	}

	private String parseScheme() {
		if (isMember(ALPHA, peek())) {
			int start = pos;
			String scheme = parseMember(schar, ':');
			if (':' == peek()) {
				advance(1);
				return scheme;
			} else {
				pos = start; // reset
			}
		}
		return null;
	}

	private String parseUserInfo() throws URISyntaxException {
		int start = pos;
		String userinfo = parsePctEncoded(uchar, '@', '/');
		if ('@' == peek()) {
			advance(1);
			return userinfo;
		} else {
			pos = start; // reset
			return null;
		}
	}

	private String parseHost() throws URISyntaxException {
		int start = pos;
		if ('[' == peek()) {
			advance(1); // IP-Literal
			parseMember(uchar, ']');
			if (']' == peek()) {
				advance(1);
				return iri.substring(start, pos);
			} else {
				throw error("Invalid host IP address");
			}
		} else if (isMember(DIGIT, peek())) {
			URISyntaxException parsingException = null;
			int startPos = pos;
			for (int i = 0; i < 4; i++) {
				int octet;
				try {
					octet = Integer.parseInt(parseMember(DIGIT, '.'));
				} catch (NumberFormatException e) {
					parsingException = error("Invalid IPv4 address");
					break;
				}
				if (octet < 0 || octet > 255) {
					parsingException = error("Invalid IPv4 address");
					break;
				}
				if ('.' == peek()) {
					advance(1);
				} else {
					if (i == 3 && (EOF == peek() || ':' == peek() || '/' == peek())) {
						// next is end of IRI, a port, or a path
					} else {
						parsingException = error("Invalid IPv4 address");
						break;
					}
				}
			}
			if (parsingException == null) {
				// IPv4 parsed successfully
				return iri.substring(start, pos);
			} else {
				// Reset position and parse host
				pos = startPos;
				String host = parsePctEncoded(hchar, ':', '/');

				// http(s) scheme requires a valid top-level domain
				if (isScheme("http") || isScheme("https")) {
					if (!isTLDValid(start)) {
						throw parsingException;
					}
				}
				return host;
			}
		} else {
			return parsePctEncoded(hchar, ':', '/');
		}
	}

	private boolean isTLDValid(int hostStartPos) {
		int step = 0;
		boolean illegalCharFound = false;

		while (pos + step > hostStartPos) {
			int currChar = peek(--step);
			if ('.' == currChar) {
				return !illegalCharFound;
			}

			// TLDs should start with a letter
			if (!isMember(ALPHA, currChar)) {
				illegalCharFound = true;
			}
		}

		return true;
	}

	private String parsePath() throws URISyntaxException {
		return parsePctEncoded(fchar, '?', '#');
	}

	private String parsePctEncoded(int[][] set, int end1, int end2) throws URISyntaxException {
		int start = pos;
		while (true) {
			int chr = peek();
			if (chr == EOF || chr == end1 || chr == end2) {
				break; // optimize end character
			} else if (('a' <= chr && chr <= 'z') || ('A' <= chr && chr <= 'Z') || ('0' <= chr && chr <= '9')) {
				advance(1);
			} else if ('%' == chr) {
				if (Arrays.binarySearch(HEXDIG, peek(1)) >= 0 && Arrays.binarySearch(HEXDIG, peek(2)) >= 0) {
					advance(3);
				} else {
					throw error("Illegal percent encoding");
				}
			} else if (isMember(set, chr)) {
				advance(1);
			} else {
				break;
			}
		}
		return iri.substring(start, pos);
	}

	private String parseMember(int[][] set, int end) {
		int start = pos;
		while (true) {
			int chr = peek();
			if (chr == EOF || chr == end) {
				break;
			} else if (isMember(set, chr)) {
				advance(1);
			} else {
				break;
			}
		}
		return iri.substring(start, pos);
	}

	private boolean isMember(int[][] set, int chr) {
		int idx = Arrays.binarySearch(set, new int[] { chr }, CMP);
		if (idx >= 0) {
			return true; // lower range matched exactly
		} else if (idx == -1) {
			return false; // insertion point is 0, below lowest range
		} else {
			int i = -idx - 2; // range just before insertion point
			assert set[i][0] <= chr && set[i].length == 2;
			return chr <= set[i][1];
		}
	}

	private int peek() {
		if (pos < iri.length()) {
			return iri.codePointAt(pos);
		} else {
			return EOF;
		}
	}

	private int peek(int ahead) {
		if (pos + ahead < iri.length()) {
			return iri.codePointAt(iri.offsetByCodePoints(pos, ahead));
		} else {
			return EOF;
		}
	}

	private void advance(int ahead) {
		if (pos + ahead < iri.length()) {
			pos = iri.offsetByCodePoints(pos, ahead);
		} else {
			pos += ahead;
		}
	}

	private URISyntaxException error(String reason) {
		if (pos > -1 && pos < iri.length()) {
			int cp = iri.codePointAt(pos);
			reason = reason + " U+" + Integer.toHexString(cp).toUpperCase();
		}
		return new URISyntaxException(iri, reason, pos);
	}

	private void appendAscii(StringBuilder sb, String input) {
		for (int c = 0, n = input.codePointCount(0, input.length()); c < n; c++) {
			int chr = input.codePointAt(input.offsetByCodePoints(0, c));
			if (Arrays.binarySearch(ascii, chr) >= 0) {
				sb.appendCodePoint(chr);
			} else {
				sb.append(pctEncode(chr));
			}
		}
	}

	private String toLowerCase(String string) {
		if (string == null) {
			return null;
		}
		boolean changed = false;
		StringBuilder sb = new StringBuilder(string);
		for (int i = 0; i < sb.length(); i++) {
			char c = sb.charAt(i);
			if ((c >= 'A') && (c <= 'Z')) {
				changed = true;
				sb.setCharAt(i, (char) (c + ('a' - 'A')));
			}
		}
		return changed ? sb.toString() : string;
	}

	private String toUpperCase(String string) {
		if (string == null) {
			return null;
		}
		boolean changed = false;
		StringBuilder sb = new StringBuilder(string);
		for (int i = 0; i < sb.length(); i++) {
			char c = sb.charAt(i);
			if ((c >= 'a') && (c <= 'z')) {
				changed = true;
				sb.setCharAt(i, (char) (c - ('a' - 'A')));
			}
		}
		return changed ? sb.toString() : string;
	}

	private boolean isScheme(String scheme) {
		return scheme.equalsIgnoreCase(this.scheme) || this.scheme != null && this.scheme.indexOf(':') == 3
				&& this.scheme.equalsIgnoreCase("jar:" + scheme);
	}

	private String normalizePath(String path) {
		if ("".equals(path)) {
			return isScheme("http") || isScheme("https") ? "/" : "";
		} else if (isScheme("file")) {
			if (path.contains("%5C")) {
				// replace "/c:\path\to\file" with "/c:/path/to/file"
				return normalizePath(path.replace("%5C", "/"));
			} else if (!path.startsWith("/") && isMember(ALPHA, path.codePointAt(0))
					&& (':' == path.charAt(1) || path.length() >= 4 && "%7C".equals(path.substring(1, 4)))) {
				// replace "c:/path/to/file" with "/c:/path/to/file"
				return normalizePath("/" + path);
			} else if (path.length() >= 5 && "%7C".equals(path.substring(2, 5))
					&& isMember(ALPHA, path.codePointAt(1))) {
				// replace "/c|/path/to/file" with "/c:/path/to/file"
				return normalizePath(path.substring(0, 2) + ':' + path.substring(5));
			}
		}
		return pctEncodingNormalization(pathSegmentNormalization(path));
	}

	private String pctEncodingNormalization(String path) {
		if (path == null || path.isEmpty() || path.indexOf('%') < 0) {
			return path; // no pct encodings
		}
		String[] encodings = listPctEncodings(path);
		StringBuilder sb = new StringBuilder(path);
		int pos = 0;
		for (String encoding : encodings) {
			int idx = sb.indexOf(encoding, pos);
			String decoded = normalizePctEncoding(encoding);
			sb.replace(idx, idx + encoding.length(), decoded);
			pos += decoded.length();
		}
		return Normalizer.normalize(sb, Normalizer.Form.NFC);

	}

	private String[] listPctEncodings(String path) {
		if (path == null || path.indexOf('%') < 0) {
			return new String[0];
		}
		List<String> list = new ArrayList<>();
		int pct = -1;
		while ((pct = path.indexOf('%', pct + 1)) > 0) {
			int start = pct;
			// optimize common encoded members by grouping separately
			if (Arrays.binarySearch(common_pct, path.substring(pct, pct + 3)) < 0) {
				while (pct + 3 < path.length() && path.charAt(pct + 3) == '%'
						&& Arrays.binarySearch(common_pct, path.substring(pct + 3, pct + 6)) < 0) {
					pct += 3;
				}
			}
			list.add(path.substring(start, pct + 3));
		}
		return list.toArray(new String[0]);
	}

	private String normalizePctEncoding(String encoded) {
		int cidx = Arrays.binarySearch(common_pct, encoded);
		if (cidx >= 0 && isMember(unreserved, common[cidx])) {
			return new String(Character.toChars(common[cidx])); // quickly decode unreserved encodings
		} else if (cidx >= 0) {
			return encoded; // pass through reserved encodings
		}
		String decoded = pctDecode(encoded);
		String ns = Normalizer.normalize(decoded, Normalizer.Form.NFC);
		StringBuilder sb = new StringBuilder(ns.length());
		for (int c = 0, n = ns.codePointCount(0, ns.length()); c < n; c++) {
			int chr = ns.codePointAt(ns.offsetByCodePoints(0, c));
			if (isMember(unreserved, chr)) {
				sb.appendCodePoint(chr);
			} else if (n == 1) {
				return toUpperCase(encoded);
			} else {
				sb.append(pctEncode(chr));
			}
		}
		return sb.toString();
	}

	private String pctDecode(String encoded) {
		return URLDecoder.decode(encoded, StandardCharsets.UTF_8);

	}

	private String pctEncode(int chr) {
		return URLEncoder.encode(new String(Character.toChars(chr)), StandardCharsets.UTF_8);

	}

	/**
	 * Normalizes the path of this URI if it has one. Normalizing a path means that any unnecessary '.' and '..'
	 * segments are removed. For example, the URI <var>http://server.com/a/b/../c/./d</var> would be normalized to
	 * <var>http://server.com/a/c/d</var>. A URI doens't have a path if it is opaque.
	 */
	private String pathSegmentNormalization(String _path) {
		if (_path == null) {
			return null;
		}

		// Remove any '.' segments:
		_path = _path.replace("/./", "/");
		if (_path == null) {
			return null;
		}
		if (_path.startsWith("./")) {
			// Remove both characters
			_path = _path.substring(2);
		}

		if (_path.endsWith("/.")) {
			// Remove only the last dot, not the slash!
			_path = _path.substring(0, _path.length() - 1);
		}

		if (!_path.contains("/../") && !_path.endsWith("/..")) {
			// There are no '..' segments that can be removed. We're done and
			// don't have to execute the time-consuming code following this
			// if-statement
			return _path;
		}

		// Split the path into its segments

		LinkedList<String> segments = new LinkedList<>(Arrays.asList(_path.split("/")));
		if (_path.startsWith("/")) {
			segments.remove(0);
		}

		boolean lastSegmentRemoved = false;

		// Remove all unnecessary '..' segments

		int i = 1;
		while (i < segments.size()) {
			String segment = segments.get(i);

			if (segment.equals("..")) {
				String prevSegment = segments.get(i - 1);

				if (prevSegment.equals("..")) {
					// two consecutive '..' segments at position i-1 and i,
					// continue at i + 2
					i += 2;
				} else {
					// Bingo! Remove these two segments...
					if (i == segments.size() - 1) {
						lastSegmentRemoved = true;
					}

					segments.remove(i);
					segments.remove(i - 1);

					// ...and continue at position (i + 1 - 2) == (i - 1)...

					// ...but only if i > 1, position 0 does not need to be
					// checked.

					if (i > 1) {
						i--;
					}
				}
			} else {
				// Not a '..' segment, check next
				i++;
			}
		}

		while (!segments.isEmpty() && (segments.get(0).equals("..") || segments.get(0).equals("."))) {
			segments.remove(0);
		}

		// Construct the normalized path

		StringBuilder newPath = new StringBuilder(_path.length());

		if (_path.startsWith("/")) {
			newPath.append('/');
		}

		int segmentCount = segments.size();
		for (i = 0; i < segmentCount - 1; i++) {
			newPath.append(segments.get(i));
			newPath.append('/');
		}

		if (segmentCount > 0) {
			String lastSegment = segments.get(segmentCount - 1);
			newPath.append(lastSegment);

			if (_path.endsWith("/") || lastSegmentRemoved) {
				newPath.append('/');
			}
		}

		return newPath.toString();
	}

	private String relativizePath(String absolute) {
		assert absolute.charAt(0) == '/';
		String[] paths = path.split("/", Integer.MAX_VALUE);
		String[] seg = absolute.split("/", Integer.MAX_VALUE);
		// first segment is empty string
		int same = 1;
		while (same < paths.length && same < seg.length - 1 && paths[same].equals(seg[same])) {
			same++;
		}
		if (same < 2) {
			return absolute;
		}
		StringBuilder sb = new StringBuilder();
		// last segment is empty or file name
		for (int i = same; i < paths.length - 1; i++) {
			sb.append("../");
		}
		for (int i = same; i < seg.length - 1; i++) {
			sb.append(seg[i]).append('/');
		}
		return sb.append(seg[seg.length - 1]).toString();
	}
}