JsonStringEncoder.java

package com.fasterxml.jackson.core.io;

import java.io.IOException;
import java.util.Arrays;

import com.fasterxml.jackson.core.util.ByteArrayBuilder;
import com.fasterxml.jackson.core.util.TextBuffer;

/**
 * Helper class used for efficient encoding of JSON String values (including
 * JSON field names) into Strings or UTF-8 byte arrays.
 *<p>
 * Note that methods in here are somewhat optimized, but not ridiculously so.
 * Reason is that conversion method results are expected to be cached so that
 * these methods will not be hot spots during normal operation.
 */
public final class JsonStringEncoder
{
    /*
    /**********************************************************************
    /* Constants
    /**********************************************************************
     */

    private final static char[] HC = CharTypes.copyHexChars(true);

    private final static byte[] HB = CharTypes.copyHexBytes(true);

    private final static int SURR1_FIRST = 0xD800;
    private final static int SURR1_LAST = 0xDBFF;
    private final static int SURR2_FIRST = 0xDC00;
    private final static int SURR2_LAST = 0xDFFF;

    // 18-Aug-2021, tatu: [core#712] Change to more dynamic allocation; try
    //    to estimate ok initial encoding buffer, switch to segmented for
    //    possible (but rare) big content

    final static int MIN_CHAR_BUFFER_SIZE = 16;
    final static int MAX_CHAR_BUFFER_SIZE = 32000; // use segments beyond
    final static int MIN_BYTE_BUFFER_SIZE = 24;
    final static int MAX_BYTE_BUFFER_SIZE = 32000; // use segments beyond

    /*
    /**********************************************************************
    /* Construction, instance access
    /**********************************************************************
     */

    // Since 2.10 we have stateless singleton and NO fancy ThreadLocal/SofRef caching!!!
    private final static JsonStringEncoder instance = new JsonStringEncoder();

    public JsonStringEncoder() { }

    /**
     * Factory method for getting an instance; this is either recycled per-thread instance,
     * or a newly constructed one.
     *
     * @return Static stateless encoder instance
     */
    public static JsonStringEncoder getInstance() {
        return instance;
    }

    /*
    /**********************************************************************
    /* Public API
    /**********************************************************************
     */

    /**
     * Method that will escape text contents using JSON standard escaping,
     * and return results as a character array.
     *
     * @param input Value String to process
     *
     * @return JSON-escaped String matching {@code input}
     */
    public char[] quoteAsString(String input)
    {
        final int inputLen = input.length();
        char[] outputBuffer = new char[_initialCharBufSize(inputLen)];
        final int[] escCodes = CharTypes.get7BitOutputEscapes();
        final int escCodeCount = escCodes.length;
        int inPtr = 0;
        TextBuffer textBuffer = null;
        int outPtr = 0;
        char[] qbuf = null;

        outer:
        while (inPtr < inputLen) {
            tight_loop:
            while (true) {
                char c = input.charAt(inPtr);
                if (c < escCodeCount && escCodes[c] != 0) {
                    break tight_loop;
                }
                if (outPtr >= outputBuffer.length) {
                    if (textBuffer == null) {
                        textBuffer = TextBuffer.fromInitial(outputBuffer);
                    }
                    try {
                        outputBuffer = textBuffer.finishCurrentSegment();
                    } catch (IOException e) {
                        // IOException won't happen here, can only occur when ReadConstrainedTextBuffer is used
                        throw new IllegalStateException(e);
                    }
                    outPtr = 0;
                }
                outputBuffer[outPtr++] = c;
                if (++inPtr >= inputLen) {
                    break outer;
                }
            }
            // something to escape; 2 or 6-char variant?
            if (qbuf == null) {
                qbuf = _qbuf();
            }
            char d = input.charAt(inPtr++);
            int escCode = escCodes[d];
            int length = (escCode < 0)
                    ? _appendNumeric(d, qbuf)
                    : _appendNamed(escCode, qbuf);
                    
            if ((outPtr + length) > outputBuffer.length) {
                int first = outputBuffer.length - outPtr;
                if (first > 0) {
                    System.arraycopy(qbuf, 0, outputBuffer, outPtr, first);
                }
                if (textBuffer == null) {
                    textBuffer = TextBuffer.fromInitial(outputBuffer);
                }
                try {
                    outputBuffer = textBuffer.finishCurrentSegment();
                } catch (IOException e) {
                    // IOException won't happen here, can only occur when ReadConstrainedTextBuffer is used
                    throw new IllegalStateException(e);
                }
                int second = length - first;
                System.arraycopy(qbuf, first, outputBuffer, 0, second);
                outPtr = second;
            } else {
                System.arraycopy(qbuf, 0, outputBuffer, outPtr, length);
                outPtr += length;
            }
        }

        if (textBuffer == null) {
            return Arrays.copyOfRange(outputBuffer, 0, outPtr);
        }
        textBuffer.setCurrentLength(outPtr);
        try {
            return textBuffer.contentsAsArray();
        } catch (IOException e) {
            // IOException won't happen here, can only occur when ReadConstrainedTextBuffer is used
            throw new IllegalStateException(e);
        }
    }

    /**
     * Overloaded variant of {@link #quoteAsString(String)}.
     *
     * @param input Value {@link CharSequence} to process
     *
     * @return JSON-escaped String matching {@code input}
     *
     * @since 2.10
     */
    public char[] quoteAsString(CharSequence input)
    {
        // 15-Aug-2019, tatu: Optimize common case as JIT can't get rid of overhead otherwise
        if (input instanceof String) {
            return quoteAsString((String) input);
        }

        TextBuffer textBuffer = null;

        final int inputLen = input.length();
        char[] outputBuffer = new char[_initialCharBufSize(inputLen)];
        final int[] escCodes = CharTypes.get7BitOutputEscapes();
        final int escCodeCount = escCodes.length;
        int inPtr = 0;
        int outPtr = 0;
        char[] qbuf = null;

        outer:
        while (inPtr < inputLen) {
            tight_loop:
            while (true) {
                char c = input.charAt(inPtr);
                if (c < escCodeCount && escCodes[c] != 0) {
                    break tight_loop;
                }
                if (outPtr >= outputBuffer.length) {
                    if (textBuffer == null) {
                        textBuffer = TextBuffer.fromInitial(outputBuffer);
                    }
                    try {
                        outputBuffer = textBuffer.finishCurrentSegment();
                    } catch (IOException e) {
                        // IOException won't happen here, can only occur when ReadConstrainedTextBuffer is used
                        throw new IllegalStateException(e);
                    }
                    outPtr = 0;
                }
                outputBuffer[outPtr++] = c;
                if (++inPtr >= inputLen) {
                    break outer;
                }
            }
            // something to escape; 2 or 6-char variant?
            if (qbuf == null) {
                qbuf = _qbuf();
            }
            char d = input.charAt(inPtr++);
            int escCode = escCodes[d];
            int length = (escCode < 0)
                    ? _appendNumeric(d, qbuf)
                    : _appendNamed(escCode, qbuf);
                    
            if ((outPtr + length) > outputBuffer.length) {
                int first = outputBuffer.length - outPtr;
                if (first > 0) {
                    System.arraycopy(qbuf, 0, outputBuffer, outPtr, first);
                }
                if (textBuffer == null) {
                    textBuffer = TextBuffer.fromInitial(outputBuffer);
                }
                try {
                    outputBuffer = textBuffer.finishCurrentSegment();
                } catch (IOException e) {
                    // IOException won't happen here, can only occur when ReadConstrainedTextBuffer is used
                    throw new IllegalStateException(e);
                }
                int second = length - first;
                System.arraycopy(qbuf, first, outputBuffer, 0, second);
                outPtr = second;
            } else {
                System.arraycopy(qbuf, 0, outputBuffer, outPtr, length);
                outPtr += length;
            }
        }

        if (textBuffer == null) {
            return Arrays.copyOfRange(outputBuffer, 0, outPtr);
        }
        textBuffer.setCurrentLength(outPtr);
        try {
            return textBuffer.contentsAsArray();
        } catch (IOException e) {
            // IOException won't happen here, can only occur when ReadConstrainedTextBuffer is used
            throw new IllegalStateException(e);
        }
    }

    /**
     * Method that will quote text contents using JSON standard quoting,
     * and append results to a supplied {@link StringBuilder}.
     * Use this variant if you have e.g. a {@link StringBuilder} and want to avoid superfluous copying of it.
     *
     * @param input Value {@link CharSequence} to process
     * @param output {@link StringBuilder} to append escaped contents to
     *
     * @since 2.8
     */
    public void quoteAsString(CharSequence input, StringBuilder output)
    {
        final int[] escCodes = CharTypes.get7BitOutputEscapes();
        final int escCodeCount = escCodes.length;
        int inPtr = 0;
        final int inputLen = input.length();
        char[] qbuf = null;

        outer:
        while (inPtr < inputLen) {
            tight_loop:
            while (true) {
                char c = input.charAt(inPtr);
                if (c < escCodeCount && escCodes[c] != 0) {
                    break tight_loop;
                }
                output.append(c);
                if (++inPtr >= inputLen) {
                    break outer;
                }
            }
            // something to escape; 2 or 6-char variant?
            if (qbuf == null) {
                qbuf = _qbuf();
            }
            char d = input.charAt(inPtr++);
            int escCode = escCodes[d];
            int length = (escCode < 0)
                    ? _appendNumeric(d, qbuf)
                    : _appendNamed(escCode, qbuf);
            output.append(qbuf, 0, length);
        }
    }

    /**
     * Method that will escape text contents using JSON standard escaping,
     * encode resulting String as UTF-8 bytes
     * and return results as a  byte array.
     *
     * @param text Value {@link String} to process
     *
     * @return UTF-8 encoded bytes of JSON-escaped {@code text}
     */
    @SuppressWarnings("resource")
    public byte[] quoteAsUTF8(String text)
    {
        int inputPtr = 0;
        int inputEnd = text.length();
        int outputPtr = 0;
        byte[] outputBuffer = new byte[_initialByteBufSize(inputEnd)];
        ByteArrayBuilder bb = null;

        main:
        while (inputPtr < inputEnd) {
            final int[] escCodes = CharTypes.get7BitOutputEscapes();

            inner_loop: // ASCII and escapes
            while (true) {
                int ch = text.charAt(inputPtr);
                if (ch > 0x7F || escCodes[ch] != 0) {
                    break inner_loop;
                }
                if (outputPtr >= outputBuffer.length) {
                    if (bb == null) {
                        bb = ByteArrayBuilder.fromInitial(outputBuffer, outputPtr);
                    }
                    outputBuffer = bb.finishCurrentSegment();
                    outputPtr = 0;
                }
                outputBuffer[outputPtr++] = (byte) ch;
                if (++inputPtr >= inputEnd) {
                    break main;
                }
            }
            if (bb == null) {
                bb = ByteArrayBuilder.fromInitial(outputBuffer, outputPtr);
            }
            if (outputPtr >= outputBuffer.length) {
                outputBuffer = bb.finishCurrentSegment();
                outputPtr = 0;
            }
            // Ok, so what did we hit?
            int ch = text.charAt(inputPtr++);
            if (ch <= 0x7F) { // needs quoting
                int escape = escCodes[ch];
                // ctrl-char, 6-byte escape...
                outputPtr = _appendByte(ch, escape, bb, outputPtr);
                outputBuffer = bb.getCurrentSegment();
                continue main;
            }
            if (ch <= 0x7FF) { // fine, just needs 2 byte output
                outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
                ch = (0x80 | (ch & 0x3f));
            } else { // 3 or 4 bytes
                // Surrogates?
                if (ch < SURR1_FIRST || ch > SURR2_LAST) { // nope
                    outputBuffer[outputPtr++] = (byte) (0xe0 | (ch >> 12));
                    if (outputPtr >= outputBuffer.length) {
                        outputBuffer = bb.finishCurrentSegment();
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((ch >> 6) & 0x3f));
                    ch = (0x80 | (ch & 0x3f));
                } else { // yes, surrogate pair
                    if (ch > SURR1_LAST) { // must be from first range
                        _illegal(ch);
                    }
                    // and if so, followed by another from next range
                    if (inputPtr >= inputEnd) {
                        _illegal(ch);
                    }
                    ch = _convert(ch, text.charAt(inputPtr++));
                    if (ch > 0x10FFFF) { // illegal, as per RFC 4627
                        _illegal(ch);
                    }
                    outputBuffer[outputPtr++] = (byte) (0xf0 | (ch >> 18));
                    if (outputPtr >= outputBuffer.length) {
                        outputBuffer = bb.finishCurrentSegment();
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((ch >> 12) & 0x3f));
                    if (outputPtr >= outputBuffer.length) {
                        outputBuffer = bb.finishCurrentSegment();
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((ch >> 6) & 0x3f));
                    ch = (0x80 | (ch & 0x3f));
                }
            }
            if (outputPtr >= outputBuffer.length) {
                outputBuffer = bb.finishCurrentSegment();
                outputPtr = 0;
            }
            outputBuffer[outputPtr++] = (byte) ch;
        }
        if (bb == null) {
            return Arrays.copyOfRange(outputBuffer, 0, outputPtr);
        }
        return bb.completeAndCoalesce(outputPtr);
    }

    /**
     * Will encode given String as UTF-8 (without any escaping) and return
     * the resulting byte array.
     *
     * @param text Value {@link String} to process
     *
     * @return UTF-8 encoded bytes of {@code text} (without any escaping)
     */
    @SuppressWarnings("resource")
    public byte[] encodeAsUTF8(String text)
    {
        int inputPtr = 0;
        int inputEnd = text.length();
        int outputPtr = 0;
        byte[] outputBuffer = new byte[_initialByteBufSize(inputEnd)];
        int outputEnd = outputBuffer.length;
        ByteArrayBuilder bb = null;

        main_loop:
        while (inputPtr < inputEnd) {
            int c = text.charAt(inputPtr++);

            // first tight loop for ascii
            while (c <= 0x7F) {
                if (outputPtr >= outputEnd) {
                    if (bb == null) {
                        bb = ByteArrayBuilder.fromInitial(outputBuffer, outputPtr);
                    }
                    outputBuffer = bb.finishCurrentSegment();
                    outputEnd = outputBuffer.length;
                    outputPtr = 0;
                }
                outputBuffer[outputPtr++] = (byte) c;
                if (inputPtr >= inputEnd) {
                    break main_loop;
                }
                c = text.charAt(inputPtr++);
            }

            // then multi-byte...
            if (bb == null) {
                bb = ByteArrayBuilder.fromInitial(outputBuffer, outputPtr);
            }
            if (outputPtr >= outputEnd) {
                outputBuffer = bb.finishCurrentSegment();
                outputEnd = outputBuffer.length;
                outputPtr = 0;
            }
            if (c < 0x800) { // 2-byte
                outputBuffer[outputPtr++] = (byte) (0xc0 | (c >> 6));
            } else { // 3 or 4 bytes
                // Surrogates?
                if (c < SURR1_FIRST || c > SURR2_LAST) { // nope
                    outputBuffer[outputPtr++] = (byte) (0xe0 | (c >> 12));
                    if (outputPtr >= outputEnd) {
                        outputBuffer = bb.finishCurrentSegment();
                        outputEnd = outputBuffer.length;
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                } else { // yes, surrogate pair
                    if (c > SURR1_LAST) { // must be from first range
                        _illegal(c);
                    }
                    // and if so, followed by another from next range
                    if (inputPtr >= inputEnd) {
                        _illegal(c);
                    }
                    c = _convert(c, text.charAt(inputPtr++));
                    if (c > 0x10FFFF) { // illegal, as per RFC 4627
                        _illegal(c);
                    }
                    outputBuffer[outputPtr++] = (byte) (0xf0 | (c >> 18));
                    if (outputPtr >= outputEnd) {
                        outputBuffer = bb.finishCurrentSegment();
                        outputEnd = outputBuffer.length;
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
                    if (outputPtr >= outputEnd) {
                        outputBuffer = bb.finishCurrentSegment();
                        outputEnd = outputBuffer.length;
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                }
            }
            if (outputPtr >= outputEnd) {
                outputBuffer = bb.finishCurrentSegment();
                outputEnd = outputBuffer.length;
                outputPtr = 0;
            }
            outputBuffer[outputPtr++] = (byte) (0x80 | (c & 0x3f));
        }
        if (bb == null) {
            return Arrays.copyOfRange(outputBuffer, 0, outputPtr);
        }
        return bb.completeAndCoalesce(outputPtr);
    }

    /**
     * Overloaded variant of {@link #encodeAsUTF8(String)}.
     *
     * @param text Value {@link CharSequence} to process
     *
     * @return UTF-8 encoded bytes of {@code text} (without any escaping)
     *
     * @since 2.11
     */
    @SuppressWarnings("resource")
    public byte[] encodeAsUTF8(CharSequence text)
    {
        int inputPtr = 0;
        int inputEnd = text.length();
        int outputPtr = 0;
        byte[] outputBuffer = new byte[_initialByteBufSize(inputEnd)];
        int outputEnd = outputBuffer.length;
        ByteArrayBuilder bb = null;

        main_loop:
        while (inputPtr < inputEnd) {
            int c = text.charAt(inputPtr++);

            // first tight loop for ascii
            while (c <= 0x7F) {
                if (outputPtr >= outputEnd) {
                    if (bb == null) {
                        bb = ByteArrayBuilder.fromInitial(outputBuffer, outputPtr);
                    }
                    outputBuffer = bb.finishCurrentSegment();
                    outputEnd = outputBuffer.length;
                    outputPtr = 0;
                }
                outputBuffer[outputPtr++] = (byte) c;
                if (inputPtr >= inputEnd) {
                    break main_loop;
                }
                c = text.charAt(inputPtr++);
            }

            // then multi-byte...
            if (bb == null) {
                bb = ByteArrayBuilder.fromInitial(outputBuffer, outputPtr);
            }
            if (outputPtr >= outputEnd) {
                outputBuffer = bb.finishCurrentSegment();
                outputEnd = outputBuffer.length;
                outputPtr = 0;
            }
            if (c < 0x800) { // 2-byte
                outputBuffer[outputPtr++] = (byte) (0xc0 | (c >> 6));
            } else { // 3 or 4 bytes
                // Surrogates?
                if (c < SURR1_FIRST || c > SURR2_LAST) { // nope
                    outputBuffer[outputPtr++] = (byte) (0xe0 | (c >> 12));
                    if (outputPtr >= outputEnd) {
                        outputBuffer = bb.finishCurrentSegment();
                        outputEnd = outputBuffer.length;
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                } else { // yes, surrogate pair
                    if (c > SURR1_LAST) { // must be from first range
                        _illegal(c);
                    }
                    // and if so, followed by another from next range
                    if (inputPtr >= inputEnd) {
                        _illegal(c);
                    }
                    c = _convert(c, text.charAt(inputPtr++));
                    if (c > 0x10FFFF) { // illegal, as per RFC 4627
                        _illegal(c);
                    }
                    outputBuffer[outputPtr++] = (byte) (0xf0 | (c >> 18));
                    if (outputPtr >= outputEnd) {
                        outputBuffer = bb.finishCurrentSegment();
                        outputEnd = outputBuffer.length;
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
                    if (outputPtr >= outputEnd) {
                        outputBuffer = bb.finishCurrentSegment();
                        outputEnd = outputBuffer.length;
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                }
            }
            if (outputPtr >= outputEnd) {
                outputBuffer = bb.finishCurrentSegment();
                outputEnd = outputBuffer.length;
                outputPtr = 0;
            }
            outputBuffer[outputPtr++] = (byte) (0x80 | (c & 0x3f));
        }
        if (bb == null) {
            return Arrays.copyOfRange(outputBuffer, 0, outputPtr);
        }
        return bb.completeAndCoalesce(outputPtr);
    }

    /*
    /**********************************************************************
    /* Internal methods
    /**********************************************************************
     */

    private char[] _qbuf() {
        char[] qbuf = new char[6];
        qbuf[0] = '\\';
        qbuf[2] = '0';
        qbuf[3] = '0';
        return qbuf;
    }

    private int _appendNumeric(int value, char[] qbuf) {
        qbuf[1] = 'u';
        // We know it's a control char, so only the last 2 chars are non-0
        qbuf[4] = HC[value >> 4];
        qbuf[5] = HC[value & 0xF];
        return 6;
    }

    private int _appendNamed(int esc, char[] qbuf) {
        qbuf[1] = (char) esc;
        return 2;
    }

    private int _appendByte(int ch, int esc, ByteArrayBuilder bb, int ptr)
    {
        bb.setCurrentSegmentLength(ptr);
        bb.append('\\');
        if (esc < 0) { // standard escape
            bb.append('u');
            if (ch > 0xFF) {
                int hi = (ch >> 8);
                bb.append(HB[hi >> 4]);
                bb.append(HB[hi & 0xF]);
                ch &= 0xFF;
            } else {
                bb.append('0');
                bb.append('0');
            }
            bb.append(HB[ch >> 4]);
            bb.append(HB[ch & 0xF]);
        } else { // 2-char simple escape
            bb.append((byte) esc);
        }
        return bb.getCurrentSegmentLength();
    }

    private static int _convert(int p1, int p2) {
        // Ok, then, is the second part valid?
        if (p2 < SURR2_FIRST || p2 > SURR2_LAST) {
            throw new IllegalArgumentException("Broken surrogate pair: first char 0x"+Integer.toHexString(p1)+", second 0x"+Integer.toHexString(p2)+"; illegal combination");
        }
        return (p1 << 10) + p2 + UTF8Writer.SURROGATE_BASE;
    }

    private static void _illegal(int c) {
        throw new IllegalArgumentException(UTF8Writer.illegalSurrogateDesc(c));
    }

    // non-private for unit test access
    static int _initialCharBufSize(int strLen) {
        // char->char won't expand but we need to give some room for escaping
        // like 1/8 (12.5% expansion) but cap addition to something modest
        final int estimated = Math.max(MIN_CHAR_BUFFER_SIZE,
                strLen + Math.min(6 + (strLen >> 3), 1000));
        return Math.min(estimated, MAX_CHAR_BUFFER_SIZE);
    }

    // non-private for unit test access
    static int _initialByteBufSize(int strLen) {
        // char->byte for UTF-8 can expand size by x3 itself, and escaping
        // more... but let's use lower factor of 1.5
        final int doubled = Math.max(MIN_BYTE_BUFFER_SIZE, strLen + 6 + (strLen>>1));
        // but use upper bound for humongous cases (segmented)
        return Math.min(doubled, MAX_BYTE_BUFFER_SIZE);
    }
}