PreScanner.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.html.charsetdetector;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.AbstractMap;
import java.util.BitSet;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A scanner meant to detect charset meta tags in a byte stream
 * See: https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
 */
class PreScanner {

    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*([\"']?)([^\"'\\s;]+)\\1");
    private static final byte[] COMMENT_START = {(byte) '<', (byte) '!', (byte) '-', (byte) '-'};
    private static final byte[] COMMENT_END = {(byte) '-', (byte) '-', (byte) '>'};
    private static final byte[] META_TAG_START =
            {(byte) '<', (byte) 'm', (byte) 'e', (byte) 't', (byte) 'a'};
    private static final byte SLASH = (byte) '/';
    private static final byte EQUAL = (byte) '=';
    private static final byte TAG_START = (byte) '<';
    private static final byte TAG_END = (byte) '>';
    private static final BitSet QUOTE = bitSet('"', '\'');

    private static final BitSet WHITESPACE = bitSet(0x09, 0x0A, 0x0C, 0x0D, 0x0D, 0x20);
    private static final BitSet SPACE_OR_TAG_END = bitSet(WHITESPACE, TAG_END);
    private static final BitSet SPACE_OR_SLASH = bitSet(WHITESPACE, SLASH);
    private static final BitSet SPECIAL_TAGS = bitSet('!', '/', '?');

    private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
    private static final byte[] UTF16_BE_BOM = {(byte) 0xFE, (byte) 0xFF};
    private static final byte[] UTF16_LE_BOM = {(byte) 0xFF, (byte) 0xFE};
    private static final byte LOWER_A = (byte) 'a';
    private static final byte LOWER_Z = (byte) 'z';
    private static final byte UPPER_A = (byte) 'A';
    private static final byte UPPER_Z = (byte) 'Z';
    private BufferedInputStream stream;
    private CharsetDetectionResult detectedCharset = CharsetDetectionResult.notFound();

    PreScanner(InputStream inputStream) {
        this.stream = new BufferedInputStream(inputStream);
    }

    private static BitSet bitSet(int... bs) {
        BitSet bitSet = new BitSet(0xFF);
        for (int b : bs) bitSet.set(b);
        return bitSet;
    }

    private static BitSet bitSet(BitSet base, int... bs) {
        BitSet bitSet = (BitSet) base.clone();
        for (int b : bs) bitSet.set(b);
        return bitSet;
    }

    static String getEncodingFromMeta(String attributeValue) {
        Matcher matcher = CHARSET_PATTERN.matcher(attributeValue);
        if (!matcher.find()) {
            return null;
        }
        return matcher.group(2);
    }

    private static boolean contains(BitSet bitSet, byte b) {
        return bitSet.get(b & 0xFF);
    }

    Charset scan() {
        while (processAtLeastOneByte()) {
            if (detectedCharset.isFound()) {
                return detectedCharset.getCharset();
            }
        }
        return null;
    }

    Charset detectBOM() {
        try {
            if (expect(UTF8_BOM)) {
                return StandardCharsets.UTF_8;
            } else if (expect(UTF16_BE_BOM)) {
                return StandardCharsets.UTF_16BE;
            } else if (expect(UTF16_LE_BOM)) {
                return StandardCharsets.UTF_16LE;
            }
        } catch (IOException e) { /* stream could not be read, also return null */ }
        return null;
    }

    private boolean processAtLeastOneByte() {
        try {
            return processComment() || processMeta() || processTag() || processSpecialTag() ||
                    processAny();
        } catch (IOException e) {
            return false;
        }
    }

    private boolean processAny() throws IOException {
        int read = stream.read();
        return read != -1;
    }

    private boolean processTag() throws IOException {
        stream.mark(3);
        if (read() == TAG_START) {
            int read = stream.read();
            if (read == SLASH) {
                read = stream.read();
            }
            if ((LOWER_A <= read && read <= LOWER_Z) || (UPPER_A <= read && read <= UPPER_Z)) {
                do {
                    stream.mark(1);
                } while (!contains(SPACE_OR_TAG_END, read()));
                stream.reset();
                while (getAttribute() != null) {/* ignore the attribute*/}
                return true;
            }
        }
        stream.reset();
        return false;
    }

    private boolean processSpecialTag() throws IOException {
        stream.mark(2);
        if (read() == TAG_START && contains(SPECIAL_TAGS, read())) {
            skipUntil(TAG_END);
            return true;
        }
        stream.reset();
        return false;
    }

    private boolean processMeta() throws IOException {
        stream.mark(6); // len("<meta ") == 6
        if (readCaseInsensitive(META_TAG_START) && contains(SPACE_OR_SLASH, read())) {
            MetaProcessor metaProcessor = new MetaProcessor();
            for (Map.Entry<String, String> attribute = getAttribute(); attribute != null;
                    attribute = getAttribute()) {
                metaProcessor.processAttribute(attribute);
            }
            metaProcessor.updateDetectedCharset(detectedCharset);
            return true;
        }
        stream.reset();
        return false;
    }

    /**
     * Read an attribute from the stream
     *
     * @return the attribute as a Map.Entry, where the key is the attribute's name and
     * the value is the attribute's value. If there is no attribute, return null
     */
    private Map.Entry<String, String> getAttribute() throws IOException {
        String name = getAttributeName();
        if (name == null) {
            return null;
        }

        if (!expect(EQUAL)) {
            return new AbstractMap.SimpleEntry<>(name, "");
        }
        skipAll(WHITESPACE);

        String value = getAttributeValue();
        return new AbstractMap.SimpleEntry<>(name, value);
    }

    private String getAttributeName() throws IOException {
        skipAll(SPACE_OR_SLASH);
        if (expect(TAG_END)) {
            return null;
        }
        StringBuilder name = new StringBuilder();
        while (!(peek() == EQUAL && name.length() > 0) && !(peek() == TAG_END || peek() == SLASH) &&
                !skipAll(WHITESPACE)) {
            name.append((char) getLowerCaseChar());
        }
        return name.toString();
    }

    private String getAttributeValue() throws IOException {
        StringBuilder value = new StringBuilder();
        stream.mark(1);
        byte quote = read();
        if (contains(QUOTE, quote)) {
            for (byte b = getLowerCaseChar(); b != quote; b = getLowerCaseChar()) {
                value.append((char) b);
            }
        } else {
            stream.reset();
            for (byte b = getLowerCaseChar(); !contains(SPACE_OR_TAG_END, b);
                    b = getLowerCaseChar()) {
                value.append((char) b);
                stream.mark(1);
            }
            stream.reset(); // unread the space or tag end
        }
        return value.toString();
    }

    private boolean skipAll(BitSet bitSet) throws IOException {
        boolean skipped = false;
        stream.mark(1);
        for (byte read = read(); contains(bitSet, read); read = read()) {
            skipped = true;
            stream.mark(1);
        }
        stream.reset();
        return skipped;
    }

    private byte getLowerCaseChar() throws IOException {
        byte nextPoint = read();
        if (nextPoint >= 'A' && nextPoint <= 'Z') {
            nextPoint += 0x20; // lowercase
        }
        return nextPoint;
    }

    private boolean processComment() throws IOException {
        if (!expect(COMMENT_START)) {
            return false;
        }
        if (!expect(TAG_END)) {
            skipUntil(COMMENT_END);
        }
        return true;
    }

    private boolean expect(byte... expected) throws IOException {
        stream.mark(expected.length);
        for (byte b : expected) {
            byte read = read();
            if (read != b) {
                stream.reset();
                return false;
            }
        }
        return true;
    }

    private void skipUntil(byte... expected) throws IOException {
        while (!expect(expected)) {
            if (stream.read() == -1) {
                return;
            }
        }
    }

    private boolean readCaseInsensitive(byte... bs) throws IOException {
        for (byte b : bs)
            if (getLowerCaseChar() != b) {
                return false;
            }
        return true;
    }

    private byte read() throws IOException {
        int r = stream.read();
        if (r == -1) {
            throw new IOException();
        }
        return (byte) r;
    }

    private byte peek() throws IOException {
        stream.mark(1);
        byte b = read();
        stream.reset();
        return b;
    }
}