URIResultParser.java

/*
 * Copyright 2007 ZXing authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.zxing.client.result;

import com.google.zxing.Result;

import java.util.regex.Pattern;

/**
 * Tries to parse results that are a URI of some kind.
 *
 * <p>Text prefixed with {@code URL:} or {@code URI:} is accepted as a URI without further checks
 * (the prefix is stripped). Any other text must both look like a URI and not look deliberately
 * misleading; otherwise it is rejected.
 *
 * @author Sean Owen
 */
public final class URIResultParser extends ResultParser {

  // Every character RFC 3986 permits in a URI: unreserved, gen-delims, sub-delims, and '%'.
  private static final Pattern ALLOWED_URI_CHARS_PATTERN =
      Pattern.compile("[-._~:/?#\\[\\]@!$&'()*+,;=%A-Za-z0-9]+");
  // See http://www.ietf.org/rfc/rfc2396.txt
  // A scheme is a letter followed by letters, digits, '+', '.' or '-', then ':'. The '-' is kept
  // last in the character class so that it is a literal hyphen; placed between '+' and '.' it
  // would denote the range '+'..'.', which also admits ','.
  private static final Pattern URL_WITH_PROTOCOL_PATTERN = Pattern.compile("[a-zA-Z][a-zA-Z0-9+.-]+:");
  private static final Pattern URL_WITHOUT_PROTOCOL_PATTERN = Pattern.compile(
      "([a-zA-Z0-9\\-]+\\.){1,6}[a-zA-Z]{2,}" + // host name elements; allow up to say 6 domain elements
      "(:\\d{1,5})?" + // maybe port
      "(/|\\?|$)"); // query, path or nothing

  /**
   * Interprets the text of {@code result} as a URI.
   *
   * @param result the decoded result whose text is examined
   * @return a {@link URIParsedResult} holding the URI, or {@code null} if the text neither carries
   *  a {@code URL:}/{@code URI:} prefix nor passes the validity and safety checks
   */
  @Override
  public URIParsedResult parse(Result result) {
    String rawText = getMassagedText(result);
    // We specifically handle the odd "URL" scheme here for simplicity and add "URI" for fun
    // Assume anything starting this way really means to be a URI
    if (rawText.startsWith("URL:") || rawText.startsWith("URI:")) {
      return new URIParsedResult(rawText.substring(4).trim(), null);
    }
    rawText = rawText.trim();
    if (!isBasicallyValidURI(rawText) || isPossiblyMaliciousURI(rawText)) {
      return null;
    }
    return new URIParsedResult(rawText, null);
  }

  /**
   * @param uri the candidate URI text
   * @return true if the URI contains suspicious patterns that may suggest it intends to
   *  mislead the user about its true nature. This is the case when the text contains any
   *  character outside the set allowed in a URI, or when user/password syntax is present
   *  in the host/authority portion of the URI, which may be used
   *  in attempts to make the URI's host appear to be other than it is. Example:
   *  http://yourbank.com@phisher.com  This URI connects to phisher.com but may appear
   *  to connect to yourbank.com at first glance.
   */
  static boolean isPossiblyMaliciousURI(String uri) {
    return !ALLOWED_URI_CHARS_PATTERN.matcher(uri).matches() || containsUserInHost(uri);
  }

  /**
   * Linear equivalent of finding {@code :/*([^/@]+)@[^/]+} anywhere in the URI, i.e. user/password
   * syntax in the authority. A regex with {@link java.util.regex.Matcher#find()} backtracks
   * quadratically here because ':' is itself a member of the userinfo class {@code [^/@]}, so a
   * scheme followed by a long run of ':' restarts the greedy scan at every colon. This scans from
   * each '@' instead, examining each character a constant number of times.
   *
   * @param uri the text to scan
   * @return true if some '@' is preceded by a scheme colon plus a non-empty userinfo run and is
   *  followed by at least one host character
   */
  private static boolean containsUserInHost(String uri) {
    int length = uri.length();
    for (int at = uri.indexOf('@'); at >= 0; at = uri.indexOf('@', at + 1)) {
      // Host part "[^/]+": at least one non-'/' character must follow '@'.
      if (at + 1 >= length || uri.charAt(at + 1) == '/') {
        continue;
      }
      // Userinfo "[^/@]+" ends just before '@'. A ':' inside the run (with a character after it)
      // can serve as the scheme colon.
      boolean schemeColon = false;
      int i = at - 1;
      while (i >= 0 && uri.charAt(i) != '/' && uri.charAt(i) != '@') {
        if (uri.charAt(i) == ':' && i <= at - 2) {
          schemeColon = true;
        }
        i--;
      }
      if (i + 1 == at) {
        // No userinfo character immediately before '@'.
        continue;
      }
      if (schemeColon) {
        return true;
      }
      // Otherwise the scheme colon may sit before the run, separated by "/*".
      // If the run was stopped by '@' rather than '/', the loop below does not advance and the
      // ':' test fails, as required.
      while (i >= 0 && uri.charAt(i) == '/') {
        i--;
      }
      if (i >= 0 && uri.charAt(i) == ':') {
        return true;
      }
    }
    return false;
  }

  /**
   * Cheap plausibility check: the text must contain no space and must start either with a
   * scheme followed by ':' or with something shaped like a host name (optionally with a port)
   * followed by '/', '?' or the end of the text.
   *
   * @param uri the candidate URI text
   * @return true if {@code uri} starts like a URI with or without an explicit scheme
   */
  static boolean isBasicallyValidURI(String uri) {
    if (uri.contains(" ")) {
      // Quick hack check for a common case
      return false;
    }
    // Anchor at the start. find() rescans from every position, which is quadratic on long
    // input that has no match at the start; lookingAt() matches only a prefix.
    if (URL_WITH_PROTOCOL_PATTERN.matcher(uri).lookingAt()) {
      return true;
    }
    return URL_WITHOUT_PROTOCOL_PATTERN.matcher(uri).lookingAt();
  }

}