// ByteSourceJsonBootstrapper.java

package tools.jackson.core.json;

import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;

import tools.jackson.core.*;
import tools.jackson.core.exc.JacksonIOException;
import tools.jackson.core.io.*;
import tools.jackson.core.sym.ByteQuadsCanonicalizer;
import tools.jackson.core.sym.CharsToNameCanonicalizer;
import tools.jackson.core.util.VersionUtil;

/**
 * This class is used to determine the encoding of byte stream
 * that is to contain JSON content. Rules are fairly simple, and
 * defined in JSON specification (RFC-4627 or newer), except
 * for BOM handling, which is a property of underlying
 * streams.
 */
public final class ByteSourceJsonBootstrapper
{
    // The three bytes (0xEF 0xBB 0xBF) that make up a UTF-8 encoded
    // Byte Order Mark, in the order they appear in input
    public final static byte UTF8_BOM_1 = (byte) 0xEF;
    public final static byte UTF8_BOM_2 = (byte) 0xBB;
    public final static byte UTF8_BOM_3 = (byte) 0xBF;

    // [jackson-core#1081] Limit in bytes for input byte array length to use StringReader instead of InputStreamReader:
    // byte-array input whose remaining length is at or below this limit is decoded
    // into a String up front when a Reader needs to be constructed
    private static final int STRING_READER_BYTE_ARRAY_LENGTH_LIMIT = 8192;

    /*
    /**********************************************************************
    /* Configuration
    /**********************************************************************
     */

    // I/O context: used here for allocating the read buffer (stream input)
    // and for storing / retrieving the encoding that was detected
    private final IOContext _context;

    // Stream to read content from; null when input was given as a byte array
    private final InputStream _in;

    /*
    /**********************************************************************
    /* Input buffering
    /**********************************************************************
     */

    // Buffer that holds input bytes: either allocated from the context (stream
    // input) or the array passed in by the caller (byte-array input)
    private final byte[] _inputBuffer;

    // Index of the first byte in the buffer that has not been consumed yet;
    // moved past a BOM when one is recognized
    private int _inputPtr;

    // Index one past the last valid byte in the buffer
    private int _inputEnd;

    /**
     * Flag that indicates whether buffer above is to be recycled
     * after being used or not: {@code true} for the buffer allocated
     * from the context, {@code false} for a caller-provided array.
     */
    private final boolean _bufferRecyclable;

    /*
    /**********************************************************************
    /* Input location
    /**********************************************************************
     */

    /**
     * Current number of input units (bytes or chars) that were processed in
     * previous blocks,
     * before contents of current input buffer.
     *<p>
     * Note: includes possible BOMs, if those were part of the input.
     */
//    private int _inputProcessed;

    /*
    /**********************************************************************
    /* Data gathered
    /**********************************************************************
     */

    /**
     * Whether input has been detected to be in Big-Endian encoding or not.
     * Starts out as {@code true}; updated by BOM handling and by the
     * UTF-16 / UTF-32 content checks.
     */
    private boolean _bigEndian = true;

    // Width in bytes (1, 2 or 4) of one encoded character, once detection
    // has succeeded; left at 0 until then
    private int _bytesPerChar; // 0 means "dunno yet"

    /*
    /**********************************************************************
    /* Life-cycle
    /**********************************************************************
     */

    /**
     * Constructs a bootstrapper that reads content from given stream,
     * buffering it in a read buffer allocated from given context.
     *
     * @param ctxt I/O context used for allocating the read buffer and for
     *    storing the detected encoding
     * @param in Stream to read content from
     */
    public ByteSourceJsonBootstrapper(IOContext ctxt, InputStream in) {
        _context = ctxt;
        _in = in;
        // Buffer comes from the context, so it is flagged as recyclable
        _inputBuffer = ctxt.allocReadIOBuffer();
        _bufferRecyclable = true;
        // Nothing has been read yet: buffer starts out empty
        _inputPtr = 0;
        _inputEnd = 0;
    }

    /**
     * Constructs a bootstrapper over a section of a caller-provided byte array;
     * no stream is involved, so no further input can be loaded.
     *
     * @param ctxt I/O context used for storing the detected encoding
     * @param inputBuffer Array that contains the content
     * @param inputStart Index of the first content byte within {@code inputBuffer}
     * @param inputLen Number of content bytes, starting from {@code inputStart}
     */
    public ByteSourceJsonBootstrapper(IOContext ctxt,
            byte[] inputBuffer, int inputStart, int inputLen) {
        _context = ctxt;
        // Block source: there is no stream to read more content from
        _in = null;
        // Array belongs to the caller, so it is not flagged as recyclable
        _inputBuffer = inputBuffer;
        _bufferRecyclable = false;
        // Content window is [inputStart, inputStart + inputLen)
        _inputPtr = inputStart;
        _inputEnd = inputStart + inputLen;
    }

    /*
    /**********************************************************************
    /*  Encoding detection during bootstrapping
    /**********************************************************************
     */

    /**
     * Method that should be called after constructing an instance.
     * It will figure out encoding that content uses, to allow
     * for instantiating a proper scanner object. The detected encoding
     * is also stored in the I/O context before being returned.
     *
     * @return {@link JsonEncoding} detected, if any; {@code JsonEncoding.UTF8} otherwise
     *
     * @throws JacksonException If read from underlying input source fails
     */
    public JsonEncoding detectEncoding() throws JacksonException
    {
        boolean detected = false;

        // BOM handling comes first. We can ask for 4 bytes since no combination
        // of BOM + valid JSON content is shorter than that (shortest valid JSON
        // content is a single digit, and BOMs are chosen so that the combination
        // is always at least 4 bytes long)
        if (ensureLoaded(4)) {
            final int base = _inputPtr;
            final int quad = (_inputBuffer[base] << 24)
                | ((_inputBuffer[base + 1] & 0xFF) << 16)
                | ((_inputBuffer[base + 2] & 0xFF) << 8)
                | (_inputBuffer[base + 3] & 0xFF);

            // Explicit BOM wins; without one, auto-detect from the first character,
            // which works because it must be 7-bit ASCII in every Unicode-compatible
            // encoding JSON can be transferred over. UTF-32 is tried before UTF-16.
            detected = handleBOM(quad)
                    || checkUTF32(quad)
                    || checkUTF16(quad >>> 16);
        } else if (ensureLoaded(2)) {
            // Too short for UTF-32, but two bytes could still be one UTF-16 character
            final int pair = ((_inputBuffer[_inputPtr] & 0xFF) << 8)
                | (_inputBuffer[_inputPtr + 1] & 0xFF);
            detected = checkUTF16(pair);
        }

        final JsonEncoding result;

        // Nothing recognized? As per specs, this means it must be UTF-8
        if (!detected || _bytesPerChar == 1) {
            result = JsonEncoding.UTF8;
        } else if (_bytesPerChar == 2) {
            result = _bigEndian ? JsonEncoding.UTF16_BE : JsonEncoding.UTF16_LE;
        } else if (_bytesPerChar == 4) {
            result = _bigEndian ? JsonEncoding.UTF32_BE : JsonEncoding.UTF32_LE;
        } else {
            // Detection claimed success but left an unexpected character width
            return VersionUtil.throwInternalReturnAny();
        }
        _context.setEncoding(result);
        return result;
    }

    /**
     * Helper method that may be called to see if given {@link DataInput}
     * starts with a UTF-8 BOM (bytes 0xEF 0xBB 0xBF), and if so, to skip it.
     * If the first byte is 0xEF but the following bytes do not complete
     * the BOM, a failure is reported.
     *
     * @param input DataInput to read content from
     *
     * @return Byte (as unsigned {@code int}) read after possible UTF-8 BOM
     *
     * @throws JacksonException If read from underlying input source fails,
     *   or if a partial (invalid) BOM is encountered
     */
    public static int skipUTF8BOM(DataInput input) throws JacksonException
    {
        try {
            final int first = input.readUnsignedByte();
            if (first != 0xEF) {
                // No BOM: the byte just read is the first content byte
                return first;
            }
            // 0xEF is not a legal first byte in JSON unless it starts a BOM,
            // so the remaining two BOM bytes are required from here on
            final int second = input.readUnsignedByte();
            if (second != 0xBB) {
                throw new IOException("Unexpected byte 0x"+Integer.toHexString(second)
                    +" following 0xEF; should get 0xBB as part of UTF-8 BOM");
            }
            final int third = input.readUnsignedByte();
            if (third != 0xBF) {
                throw new IOException("Unexpected byte 0x"+Integer.toHexString(third)
                    +" following 0xEF 0xBB; should get 0xBF as part of UTF-8 BOM");
            }
            // Complete BOM skipped: hand back the first byte that follows it
            return input.readUnsignedByte();
        } catch (IOException ioe) {
            throw _wrapIOFailure(ioe);
        }
    }

    /*
    /**********************************************************************
    /* Constructing a Reader
    /**********************************************************************
     */

    /**
     * Constructs a {@link Reader} that decodes the remaining input using the
     * encoding currently stored in the I/O context (see {@link #detectEncoding}).
     *<p>
     * For 8- and 16-bit encodings a JDK decoder is used: small byte-array input is
     * decoded eagerly into a {@link StringReader}; otherwise an
     * {@link InputStreamReader} is layered over the input. For 32-bit encodings
     * a {@code UTF32Reader} is used.
     *
     * @return Reader over the not-yet-consumed input
     *
     * @throws JacksonException If the reader cannot be constructed (for example
     *   if the encoding name is not supported by the JDK)
     */
    @SuppressWarnings("resource")
    public Reader constructReader() throws JacksonException
    {
        final JsonEncoding enc = _context.getEncoding();
        switch (enc.bits()) {
        case 8: // only in non-common case where we don't want to do direct mapping
        case 16:
            {
                // First: do we have a Stream? If not, need to create one:
                InputStream in = _in;

                if (in == null) {
                    final int length = _inputEnd - _inputPtr;
                    if (length <= STRING_READER_BYTE_ARRAY_LENGTH_LIMIT) {
                        // [jackson-core#1081] Avoid overhead of heap ByteBuffer allocated by InputStreamReader
                        // when processing small inputs up to 8KiB.
                        try {
                            return new StringReader(new String(_inputBuffer, _inputPtr, length, enc.getJavaName()));
                        } catch (IOException e) {
                            throw _wrapIOFailure(e);
                        }
                    }
                    // NOTE: third argument of ByteArrayInputStream is the LENGTH to expose,
                    // not the end offset; passing the end offset would let the reader run
                    // past the intended window whenever the start offset is non-zero and
                    // the backing array extends beyond the end of content
                    in = new ByteArrayInputStream(_inputBuffer, _inputPtr, length);
                } else if (_inputPtr < _inputEnd) {
                    // Also, if we have any read but unused input (usually true),
                    // need to merge that input in (MergedStream takes start and end offsets):
                    in = new MergedStream(_context, in, _inputBuffer, _inputPtr, _inputEnd);
                }
                try {
                    return new InputStreamReader(in, enc.getJavaName());
                } catch (IOException e) {
                    throw _wrapIOFailure(e);
                }
            }
        case 32:
            {
                // 01-Jun-2019. tatu: Should determine like so in future:
// final boolean autoClose = _context.isResourceManaged() || isEnabled(StreamReadFeature.AUTO_CLOSE_SOURCE);
                // ... but for now, do what 2.x did:
                final boolean autoClose = true;
                return new UTF32Reader(_context, _in, autoClose,
                        _inputBuffer, _inputPtr, _inputEnd,
                        enc.isBigEndian());
            }
        }
        // Encoding with an unexpected bit width: internal error
        return VersionUtil.throwInternalReturnAny();
    }

    /**
     * Constructs the parser to use for the input: determines the encoding
     * (by detection, or by assuming UTF-8 when charset detection is disabled),
     * then creates either a byte-based UTF-8 parser or a {@link Reader}-based one.
     *
     * @param readCtxt Read context to pass to the constructed parser
     * @param streamReadFeatures Bit set of stream read features to pass to the parser
     * @param formatReadFeatures Bit set of format-specific read features to pass to the parser
     * @param rootByteSymbols Root symbol table for byte-based parsing; a child
     *    table is created from it when the UTF-8 parser is used
     * @param rootCharSymbols Root symbol table for char-based parsing; a child
     *    table is created from it when the Reader-based parser is used
     * @param factoryFeatures Bit set of factory features; consulted for
     *    {@code CHARSET_DETECTION} and {@code CANONICALIZE_PROPERTY_NAMES}
     *
     * @return Parser constructed
     *
     * @throws JacksonException If read from underlying input source fails
     */
    public JsonParser constructParser(ObjectReadContext readCtxt,
            int streamReadFeatures, int formatReadFeatures,
            ByteQuadsCanonicalizer rootByteSymbols, CharsToNameCanonicalizer rootCharSymbols,
            int factoryFeatures)
        throws JacksonException
    {
        final int prevInputPtr = _inputPtr;
        final JsonEncoding enc;
        if (JsonFactory.Feature.CHARSET_DETECTION.enabledIn(factoryFeatures)) {
            // stores the detected encoding in the context as a side effect
            enc = detectEncoding();
        } else {
            enc = JsonEncoding.UTF8;
            // Detection is skipped, so the context encoding has to be recorded here:
            // constructReader() below picks the decoder from the context, and on this
            // path nothing else in this class would have set it
            _context.setEncoding(enc);
        }
        // Number of bytes (BOM, if any) skipped during detection
        final int bytesProcessed = _inputPtr - prevInputPtr;

        if (enc == JsonEncoding.UTF8) {
            // and without canonicalization, byte-based approach is not performant; just use std UTF-8 reader
            // (which is ok for larger input; not so hot for smaller; but this is not a common case)
            if (JsonFactory.Feature.CANONICALIZE_PROPERTY_NAMES.enabledIn(factoryFeatures)) {
                ByteQuadsCanonicalizer can = rootByteSymbols.makeChild(factoryFeatures);
                return new UTF8StreamJsonParser(readCtxt, _context,
                        streamReadFeatures, formatReadFeatures, _in, can,
                        _inputBuffer, _inputPtr, _inputEnd, bytesProcessed, _bufferRecyclable);
            }
        }
        return new ReaderBasedJsonParser(readCtxt, _context, streamReadFeatures, formatReadFeatures,
                constructReader(),
                rootCharSymbols.makeChild());
    }

    /*
    /**********************************************************************
    /* Internal methods, parsing
    /**********************************************************************
     */

    /**
     * Checks whether given first four bytes of input start with a Byte Order
     * Mark; if so, skips the BOM and records character width and endianness.
     *
     * @param quad First four input bytes, combined in input order with the
     *   first byte as the most significant one
     *
     * @return True if a BOM was successfully found, and encoding
     *   thereby recognized.
     *
     * @throws JacksonException If BOM indicates an unsupported UCS-4 byte order
     */
    private boolean handleBOM(int quad) throws JacksonException
    {
        // BOM is (usually) optional, but required for multi-byte formats.
        // First: 32-bit encodings, which need all four bytes to match
        if (quad == 0x0000FEFF) { // UCS-4, BE
            return _acceptBOM(4, 4, true);
        }
        if (quad == 0xFFFE0000) { // UCS-4, LE?
            return _acceptBOM(4, 4, false);
        }
        if (quad == 0x0000FFFE) { // UCS-4, in-order...
            _reportWeirdUCS4("2143"); // throws exception
        } else if (quad == 0xFEFF0000) { // UCS-4, in-order...
            _reportWeirdUCS4("3412"); // throws exception
        }

        // Ok, if not, how about 16-bit encoding BOMs? Only first two bytes matter
        final int firstTwo = quad >>> 16;
        if (firstTwo == 0xFEFF) { // UTF-16, BE
            return _acceptBOM(2, 2, true);
        }
        if (firstTwo == 0xFFFE) { // UTF-16, LE
            return _acceptBOM(2, 2, false);
        }

        // And if not, then UTF-8 BOM? First three bytes matter
        if ((quad >>> 8) == 0xEFBBBF) { // UTF-8
            // endianness doesn't really matter for single-byte units
            return _acceptBOM(3, 1, true);
        }
        return false;
    }

    // Records a recognized BOM: skips its bytes and stores character width + endianness
    private boolean _acceptBOM(int bomLength, int bytesPerChar, boolean bigEndian)
    {
        _inputPtr += bomLength;
        _bytesPerChar = bytesPerChar;
        _bigEndian = bigEndian;
        return true;
    }

    /**
     * Checks whether given first four bytes of BOM-less input look like a single
     * UTF-32 encoded character: exactly one byte may be non-zero. Nothing is
     * skipped since the bytes are regular content.
     *
     * @param quad First four input bytes, first byte as the most significant one
     *
     * @return True if content looks like UTF-32 (character width and endianness
     *   have then been recorded); false if it cannot be UTF-32
     *
     * @throws JacksonException If the byte pattern indicates an unsupported
     *   UCS-4 byte order
     */
    private boolean checkUTF32(int quad) throws JacksonException
    {
        final boolean onlyLastByteSet = (quad >> 8) == 0; // 0x000000?? -> UTF32-BE
        final boolean onlyFirstByteSet = (quad & 0x00FFFFFF) == 0; // 0x??000000 -> UTF32-LE

        if (onlyLastByteSet || onlyFirstByteSet) {
            // Big-endian check has precedence, as it is the first one to match
            _bigEndian = onlyLastByteSet;
        } else if ((quad & ~0x00FF0000) == 0) { // 0x00??0000 -> UTF32-in-order
            _reportWeirdUCS4("3412");
        } else if ((quad & ~0x0000FF00) == 0) { // 0x0000??00 -> UTF32-in-order
            _reportWeirdUCS4("2143");
        } else {
            // Cannot be valid UTF-32 encoded JSON...
            return false;
        }
        // Not BOM (just regular content), nothing to skip past
        _bytesPerChar = 4;
        return true;
    }

    /**
     * Checks whether given first two bytes of BOM-less input look like a single
     * UTF-16 encoded character: one of the two bytes must be zero. Nothing is
     * skipped since the bytes are regular content.
     *
     * @param i16 First two input bytes, first byte as the more significant one
     *
     * @return True if content looks like UTF-16 (character width and endianness
     *   have then been recorded); false otherwise
     */
    private boolean checkUTF16(int i16)
    {
        final boolean firstByteZero = (i16 & 0xFF00) == 0; // UTF-16BE
        final boolean secondByteZero = (i16 & 0x00FF) == 0; // UTF-16LE

        if (!firstByteZero && !secondByteZero) { // nope, not  UTF-16
            return false;
        }
        // Big-endian check has precedence, as it is the first one to match
        _bigEndian = firstByteZero;
        // Not BOM (just regular content), nothing to skip past
        _bytesPerChar = 2;
        return true;
    }

    /*
    /**********************************************************************
    /* Internal methods, problem reporting
    /**********************************************************************
     */

    /**
     * Reports an unsupported UCS-4 byte order; always throws.
     *
     * @param type Byte order detected, as a permutation of byte positions
     *   (such as {@code "2143"}), included in the failure message
     *
     * @throws JacksonException Always
     */
    private void _reportWeirdUCS4(String type) throws JacksonException {
        final String msg = "Unsupported UCS-4 endianness ("+type+") detected";
        throw _createIOFailure(msg);
    }

    /*
    /**********************************************************************
    /* Internal methods, raw input access
    /**********************************************************************
     */

    /**
     * Makes sure at least {@code minimum} unconsumed bytes are available in the
     * input buffer, reading more from the input stream if necessary (and possible).
     *
     * @param minimum Number of unconsumed bytes required
     *
     * @return True if at least {@code minimum} bytes are available; false if
     *   input ended (or there is no stream to read from) before that
     *
     * @throws JacksonException If read from underlying input source fails
     */
    protected boolean ensureLoaded(int minimum) throws JacksonException {
        // Let's assume here buffer has enough room -- this will always
        // be true for the limited use this method gets
        int available = _inputEnd - _inputPtr;
        if (available >= minimum) {
            return true;
        }
        if (_in == null) { // block source: nothing more can be loaded
            return false;
        }
        do {
            final int count;
            try {
                count = _in.read(_inputBuffer, _inputEnd, _inputBuffer.length - _inputEnd);
            } catch (IOException ioe) {
                throw _wrapIOFailure(ioe);
            }
            // End of input (or no progress made): cannot satisfy the request
            if (count < 1) {
                return false;
            }
            _inputEnd += count;
            available += count;
        } while (available < minimum);
        return true;
    }

    /*
    /**********************************************************************
    /* Internal methods, exception handling
    /**********************************************************************
     */

    /**
     * Creates (but does not throw) the exception used for reporting a problem
     * found during bootstrapping.
     *
     * @param msg Failure message
     *
     * @return Exception for caller to throw
     */
    private JacksonException _createIOFailure(String msg) throws JacksonException {
        // 12-Jan-2021, tatu: Couple of alternatives, but since this is before
        //    actual parser created, seems best to simply fake this was "true"
        //    IOException
        final IOException cause = new IOException(msg);
        return _wrapIOFailure(cause);
    }

    /**
     * Wraps given {@link IOException} as a {@link JacksonException}; the
     * result is returned for caller to throw.
     *
     * @param e Low-level I/O failure to wrap
     *
     * @return Wrapped exception
     */
    private static JacksonException _wrapIOFailure(IOException e) throws JacksonException {
        // No processor is passed (null): no parser exists yet at bootstrapping time
        final JacksonException wrapped = JacksonIOException.construct(e, null);
        return wrapped;
    }
}