// StringUtils.java

package com.alibaba.fastjson2.util;

import java.nio.charset.StandardCharsets;

import static com.alibaba.fastjson2.JSONWriter.Feature.BrowserSecure;
import static com.alibaba.fastjson2.JSONWriter.Feature.EscapeNoneAscii;
import static com.alibaba.fastjson2.util.IOUtils.*;
import static com.alibaba.fastjson2.util.IOUtils.hex4U;
import static com.alibaba.fastjson2.util.JDKUtils.ARRAY_BYTE_BASE_OFFSET;
import static com.alibaba.fastjson2.util.JDKUtils.UNSAFE;

/**
 * Low-level helpers that emit quoted, escaped string literals into caller-supplied
 * {@code byte[]} or {@code char[]} buffers.
 *
 * <p>None of the writers grow the destination buffer. A single input unit can expand to at
 * most six output units (a {@code \}{@code uXXXX} escape), plus the surrounding quotes, so the
 * caller is expected to have reserved that much room before calling.
 */
public class StringUtils {
    protected static final long MASK_ESCAPE_NONE_ASCII = EscapeNoneAscii.mask;
    protected static final long MASK_BROWSER_SECURE = BrowserSecure.mask;

    /**
     * Copies {@code value} verbatim between two {@code quote} bytes. No escaping or
     * re-encoding is applied, so this is only appropriate for content already known to be
     * safe to emit as-is.
     *
     * @param bytes destination buffer; needs {@code value.length + 2} free bytes at {@code off}
     * @param off   index in {@code bytes} at which the opening quote is written
     * @param value raw bytes to copy
     * @param quote quote byte written before and after the content
     * @return the index just past the closing quote
     */
    public static int writeLatin1(byte[] bytes, int off, byte[] value, byte quote) {
        int strlen = value.length;
        bytes[off] = quote;
        System.arraycopy(value, 0, bytes, off + 1, strlen);
        bytes[off + strlen + 1] = quote;
        return off + strlen + 2;
    }

    /**
     * Writes {@code values} (one Latin-1 code point per byte) as a quoted, escaped string
     * into a byte buffer, encoding bytes above 0x7F as two-byte UTF-8 sequences.
     *
     * <ul>
     *   <li>{@code \\ \n \r \f \b \t} become their two-character escapes.</li>
     *   <li>Other control bytes below 0x20 become a six-byte {@code \}{@code u00XX} escape.</li>
     *   <li>{@code < > ( )} become a six-byte unicode escape when {@code BrowserSecure} is set.</li>
     *   <li>The active {@code quote} byte is prefixed with a backslash.</li>
     *   <li>Bytes 0x80..0xFF become a six-byte unicode escape when {@code EscapeNoneAscii}
     *       is set (consistent with {@link #writeLatin1EscapedRest}); otherwise they are
     *       written as two-byte UTF-8.</li>
     * </ul>
     *
     * @param bytes    destination buffer; worst case needs {@code values.length * 6 + 2} bytes
     * @param off      index at which the opening quote is written
     * @param values   Latin-1 content
     * @param quote    quote byte
     * @param features feature bit mask; {@code BrowserSecure} and {@code EscapeNoneAscii} are read
     * @return the index just past the closing quote
     */
    public static int writeLatin1Escaped(byte[] bytes, int off, byte[] values, byte quote, long features) {
        final boolean escapeNoneAscii = (features & MASK_ESCAPE_NONE_ASCII) != 0;
        final boolean browserSecure = (features & MASK_BROWSER_SECURE) != 0;
        bytes[off++] = quote;
        for (int i = 0; i < values.length; i++) {
            byte ch = values[i];
            switch (ch) {
                case '\\':
                case '\n':
                case '\r':
                case '\f':
                case '\b':
                case '\t':
                    writeEscapedChar(bytes, off, ch);
                    off += 2;
                    break;
                case 0:
                case 1:
                case 2:
                case 3:
                case 4:
                case 5:
                case 6:
                case 7:
                case 11:
                case 14:
                case 15:
                case 16:
                case 17:
                case 18:
                case 19:
                case 20:
                case 21:
                case 22:
                case 23:
                case 24:
                case 25:
                case 26:
                case 27:
                case 28:
                case 29:
                case 30:
                case 31:
                    writeU4Hex2(bytes, off, ch);
                    off += 6;
                    break;
                case '<':
                case '>':
                case '(':
                case ')':
                    if (browserSecure) {
                        writeU4HexU(bytes, off, ch);
                        off += 6;
                    } else {
                        bytes[off++] = ch;
                    }
                    break;
                default:
                    if (ch == quote) {
                        bytes[off] = '\\';
                        bytes[off + 1] = quote;
                        off += 2;
                    } else if (ch < 0) {
                        // A negative byte is a Latin-1 code point in 0x80..0xFF.
                        int c = ch & 0xFF;
                        if (escapeNoneAscii) {
                            // Keep the output pure ASCII, as the char[] writer does.
                            writeU4HexU(bytes, off, c);
                            off += 6;
                        } else {
                            bytes[off] = (byte) (0xc0 | (c >> 6));
                            bytes[off + 1] = (byte) (0x80 | (c & 0x3f));
                            off += 2;
                        }
                    } else {
                        bytes[off++] = ch;
                    }
                    break;
            }
        }
        bytes[off] = quote;
        return off + 1;
    }

    /**
     * Writes the tail of a Latin-1 string, starting at {@code str[coff]}, into a char buffer
     * with escaping applied, then appends the closing quote. The opening quote is NOT written
     * here; the method only emits the remaining content and the terminator.
     *
     * @param chars    destination buffer
     * @param off      index in {@code chars} at which output starts
     * @param str      Latin-1 content (one code point per byte)
     * @param coff     index of the first byte of {@code str} to process
     * @param quote    active quote character; only this one of {@code "} / {@code '} is escaped
     * @param features feature bit mask; {@code BrowserSecure} and {@code EscapeNoneAscii} are read
     * @return the index just past the closing quote
     */
    public static int writeLatin1EscapedRest(char[] chars, int off, byte[] str, int coff, char quote, long features) {
        final boolean escapeNoneAscii = (features & MASK_ESCAPE_NONE_ASCII) != 0;
        final boolean browserSecure = (features & MASK_BROWSER_SECURE) != 0;

        for (int i = coff; i < str.length; i++) {
            byte b = str[i];
            // Widen without sign extension so 0x80..0xFF map to U+0080..U+00FF.
            char ch = (char) (b & 0xff);
            switch (ch) {
                case '"':
                case '\'':
                    if (ch == quote) {
                        chars[off++] = '\\';
                    }
                    chars[off++] = ch;
                    break;
                case '\\':
                case '\r':
                case '\n':
                case '\b':
                case '\f':
                case '\t':
                    writeEscapedChar(chars, off, ch);
                    off += 2;
                    break;
                case 0:
                case 1:
                case 2:
                case 3:
                case 4:
                case 5:
                case 6:
                case 7:
                case 11:
                case 14:
                case 15:
                case 16:
                case 17:
                case 18:
                case 19:
                case 20:
                case 21:
                case 22:
                case 23:
                case 24:
                case 25:
                case 26:
                case 27:
                case 28:
                case 29:
                case 30:
                case 31:
                    writeU4Hex2(chars, off, ch);
                    off += 6;
                    break;
                case '<':
                case '>':
                case '(':
                case ')':
                    if (browserSecure) {
                        writeU4HexU(chars, off, ch);
                        off += 6;
                    } else {
                        chars[off++] = ch;
                    }
                    break;
                default:
                    if (escapeNoneAscii && ch > 0x007F) {
                        writeU4HexU(chars, off, ch);
                        off += 6;
                    } else {
                        chars[off++] = ch;
                    }
                    break;
            }
        }
        chars[off] = quote;
        return off + 1;
    }

    /**
     * Writes UTF-16 content as a quoted, escaped, UTF-8 encoded string.
     *
     * <p>{@code value} is read two bytes per code unit through {@code IOUtils.getChar}; a
     * trailing odd byte is ignored. ASCII is escaped exactly as in
     * {@link #writeLatin1Escaped}. For code units at or above 0x80:
     * <ul>
     *   <li>with {@code EscapeNoneAscii} set, every code unit (including each half of a
     *       surrogate pair) is written as a six-byte unicode escape, so the output is pure
     *       ASCII;</li>
     *   <li>otherwise the text is UTF-8 encoded (2, 3 or 4 bytes), and an unpaired surrogate
     *       is replaced by {@code '?'}.</li>
     * </ul>
     *
     * @param bytes    destination buffer
     * @param off      index at which the opening quote is written
     * @param value    UTF-16 code units packed into bytes
     * @param quote    quote byte
     * @param features feature bit mask; {@code BrowserSecure} and {@code EscapeNoneAscii} are read
     * @return the index just past the closing quote
     */
    public static int writeUTF16(byte[] bytes, int off, byte[] value, byte quote, long features) {
        final boolean escapeNoneAscii = (features & MASK_ESCAPE_NONE_ASCII) != 0;
        final boolean browserSecure = (features & MASK_BROWSER_SECURE) != 0;

        bytes[off++] = quote;

        int coff = 0;
        final int char_len = value.length >> 1;
        while (coff < char_len) {
            char c = getChar(value, coff++);
            if (c < 0x80) {
                switch (c) {
                    case '\\':
                    case '\n':
                    case '\r':
                    case '\f':
                    case '\b':
                    case '\t':
                        writeEscapedChar(bytes, off, c);
                        off += 2;
                        break;
                    case 0:
                    case 1:
                    case 2:
                    case 3:
                    case 4:
                    case 5:
                    case 6:
                    case 7:
                    case 11:
                    case 14:
                    case 15:
                    case 16:
                    case 17:
                    case 18:
                    case 19:
                    case 20:
                    case 21:
                    case 22:
                    case 23:
                    case 24:
                    case 25:
                    case 26:
                    case 27:
                    case 28:
                    case 29:
                    case 30:
                    case 31:
                        writeU4Hex2(bytes, off, c);
                        off += 6;
                        break;
                    case '<':
                    case '>':
                    case '(':
                    case ')':
                        if (browserSecure) {
                            writeU4HexU(bytes, off, c);
                            off += 6;
                        } else {
                            bytes[off++] = (byte) c;
                        }
                        break;
                    default:
                        if (c == quote) {
                            bytes[off] = '\\';
                            bytes[off + 1] = quote;
                            off += 2;
                        } else {
                            bytes[off++] = (byte) c;
                        }
                        break;
                }
            } else if (escapeNoneAscii) {
                // Checked before any UTF-8 branch so U+0080..U+07FF are escaped as well.
                writeU4HexU(bytes, off, c);
                off += 6;
            } else if (c < 0x800) {
                // 2 bytes, 11 bits
                bytes[off] = (byte) (0xc0 | (c >> 6));
                bytes[off + 1] = (byte) (0x80 | (c & 0x3f));
                off += 2;
            } else if (c >= '\uD800' && c <= '\uDFFF') {
                // Surrogate range: only a high surrogate directly followed by a low
                // surrogate forms a code point; anything else is written as '?'.
                int uc = -1;
                if (c <= '\uDBFF' && coff < char_len) {
                    char d = getChar(value, coff);
                    if (d >= '\uDC00' && d <= '\uDFFF') {
                        coff++; // consume the low surrogate only when it completes the pair
                        uc = Character.toCodePoint(c, d);
                    }
                }

                if (uc < 0) {
                    bytes[off++] = (byte) '?';
                } else {
                    // 4 bytes, 21 bits
                    bytes[off] = (byte) (0xf0 | ((uc >> 18)));
                    bytes[off + 1] = (byte) (0x80 | ((uc >> 12) & 0x3f));
                    bytes[off + 2] = (byte) (0x80 | ((uc >> 6) & 0x3f));
                    bytes[off + 3] = (byte) (0x80 | (uc & 0x3f));
                    off += 4;
                }
            } else {
                // 3 bytes, 16 bits
                bytes[off] = (byte) (0xe0 | ((c >> 12)));
                bytes[off + 1] = (byte) (0x80 | ((c >> 6) & 0x3f));
                bytes[off + 2] = (byte) (0x80 | (c & 0x3f));
                off += 3;
            }
        }

        bytes[off] = quote;
        return off + 1;
    }

    /**
     * Writes the two-byte backslash escape for {@code c0} at {@code bytes[off]}.
     * Only {@code \\ \n \r \f \b \t} have entries in the lookup table; any other value
     * writes the table's default of two zero bytes, so callers must pre-filter.
     */
    public static void writeEscapedChar(byte[] bytes, int off, int c0) {
        putShortLE(bytes, off, LATIN1.ESCAPED_CHARS[c0 & 0x7f]);
    }

    /**
     * Writes a six-byte escape at {@code bytes[off]}: the four bytes of {@code \}{@code u00}
     * followed by the two bytes produced by {@code hex2(c)} (presumably the two hex digits
     * of {@code c}; see {@code IOUtils.hex2}).
     */
    public static void writeU4Hex2(byte[] bytes, int off, int c) {
        putIntUnaligned(bytes, off, LATIN1.U4);
        putShortLE(bytes, off + 4, hex2(c));
    }

    /**
     * Writes a six-byte escape at {@code bytes[off]}: the two bytes of {@code \}{@code u}
     * followed by the four bytes produced by {@code hex4U(c)} (presumably four upper-case
     * hex digits of {@code c}; see {@code IOUtils.hex4U}).
     */
    public static void writeU4HexU(byte[] bytes, int off, int c) {
        putShortUnaligned(bytes, off, LATIN1.U2);
        putIntLE(bytes, off + 2, hex4U(c));
    }

    /**
     * Writes the two-char backslash escape for {@code c0} at {@code chars[off]}.
     * Only {@code \\ \n \r \f \b \t} have entries in the lookup table; any other value
     * writes the table's default of two {@code '\0'} chars, so callers must pre-filter.
     */
    public static void writeEscapedChar(char[] chars, int off, int c0) {
        IOUtils.putIntUnaligned(chars, off, UTF16.ESCAPED_CHARS[c0 & 0x7f]);
    }

    /**
     * Writes a six-char escape at {@code chars[off]}: the four chars of {@code \}{@code u00}
     * followed by the two chars produced by {@code utf16Hex2(c)}.
     */
    public static void writeU4Hex2(char[] chars, int off, int c) {
        IOUtils.putLongUnaligned(chars, off, UTF16.U4);
        IOUtils.putIntLE(chars, off + 4, utf16Hex2(c));
    }

    /**
     * Writes a six-char escape at {@code chars[off]}: the two chars of {@code \}{@code u}
     * followed by the four chars produced by {@code utf16Hex4U(c)}.
     */
    public static void writeU4HexU(char[] chars, int off, int c) {
        IOUtils.putIntUnaligned(chars, off, UTF16.U2);
        IOUtils.putLongLE(chars, off + 2, utf16Hex4U(c));
    }

    /**
     * Reports whether {@code value} contains any byte that cannot be copied verbatim:
     * the quote byte, a backslash, or a byte that compares below {@code ' '} as a signed
     * {@code byte} (control characters and, because of the sign, 0x80..0xFF).
     *
     * <p>Full 8-byte groups are tested with {@link #noneEscaped(long, long)}; the remaining
     * 0..7 bytes are tested one at a time.
     *
     * @param value    bytes to inspect
     * @param quote    quote byte, used by the scalar tail
     * @param vecQuote quote pattern in the form {@link #noneEscaped(long, long)} expects
     * @return {@code true} if at least one byte needs escaping
     */
    public static boolean escaped(byte[] value, byte quote, long vecQuote) {
        int i = 0;
        // Largest multiple of 8 not exceeding the length.
        final int upperBound = value.length & ~7;
        for (; i < upperBound; i += 8) {
            if (!noneEscaped(getLongUnaligned(value, i), vecQuote)) {
                return true;
            }
        }
        for (; i < value.length; i++) {
            byte c = value[i];
            if (c == quote || c == '\\' || c < ' ') {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests eight bytes at once (packed in {@code v}) for characters that need escaping.
     *
     * @param v     eight content bytes
     * @param quote quote pattern, one copy per byte lane. NOTE(review): the backslash test
     *              below uses 0xA3, the bitwise complement of {@code '\\'} (0x5C), so this
     *              argument is presumably the complement of the quote byte replicated
     *              across all eight lanes; confirm against the caller.
     * @return {@code true} if none of the eight bytes is a quote, a backslash, or below 0x20
     */
    public static boolean noneEscaped(long v, long quote) {
        /*
          Scalar equivalent, with q being the plain quote byte:

          for (int i = 0; i < 8; ++i) {
            byte c = (byte) v;
            if (c == q || c == '\\' || c < ' ') {
                return false;
            }
            v >>>= 8;
          }
          return true;
         */
        return ((v + 0x6060606060606060L) & 0x8080808080808080L) == 0x8080808080808080L // all >= 32
                && ((v ^ quote) + 0x0101010101010101L & 0x8080808080808080L) == 0x8080808080808080L // != quote
                && ((v ^ 0xA3A3A3A3A3A3A3A3L) + 0x0101010101010101L & 0x8080808080808080L) == 0x8080808080808080L; // != '\\'
    }

    /**
     * Precomputed byte-buffer constants: the {@code \}{@code u} / {@code \}{@code u00}
     * prefixes and the two-byte escape table.
     */
    public static final class LATIN1 {
        /** The two bytes of {@code \}{@code u}, in native byte order. */
        private static final short U2;
        /** The four bytes of {@code \}{@code u00}, in native byte order. */
        private static final int U4;
        /** Two-byte escapes indexed by character, stored little-endian; unset entries are 0. */
        private static final short[] ESCAPED_CHARS;

        static {
            {
                byte[] bytes = "\\u00".getBytes(StandardCharsets.UTF_8);
                U2 = UNSAFE.getShort(bytes, ARRAY_BYTE_BASE_OFFSET);
                U4 = UNSAFE.getInt(bytes, ARRAY_BYTE_BASE_OFFSET);
            }
            {
                // Low byte is the backslash, high byte the escape letter; written with putShortLE.
                char slash = '\\';
                short[] shorts = new short[128];
                shorts['\\'] = (short) (slash | ('\\' << 8));
                shorts['\n'] = (short) (slash | ('n' << 8));
                shorts['\r'] = (short) (slash | ('r' << 8));
                shorts['\f'] = (short) (slash | ('f' << 8));
                shorts['\b'] = (short) (slash | ('b' << 8));
                shorts['\t'] = (short) (slash | ('t' << 8));
                ESCAPED_CHARS = shorts;
            }
        }
    }

    /**
     * Precomputed char-buffer constants: the {@code \}{@code u} / {@code \}{@code u00}
     * prefixes and the two-char escape table.
     */
    public static final class UTF16 {
        /** The two chars of {@code \}{@code u}, in native byte order. */
        private static final int U2;
        /** The four chars of {@code \}{@code u00}, in native byte order. */
        private static final long U4;
        /** Two-char escapes indexed by character, in native byte order; unset entries are 0. */
        private static final int[] ESCAPED_CHARS;

        static {
            {
                char[] chars = "\\u00".toCharArray();
                // Unsafe base offsets are specific to the array class, so a char[] must be
                // read from the char[] base offset rather than the byte[] one.
                final long charBase = UNSAFE.arrayBaseOffset(char[].class);
                U2 = UNSAFE.getInt(chars, charBase);
                U4 = UNSAFE.getLong(chars, charBase);
            }
            {
                // Pairs of (character, escape letter).
                char[] mapping = new char[]{
                        '\\', '\\',
                        '\n', 'n',
                        '\r', 'r',
                        '\f', 'f',
                        '\b', 'b',
                        '\t', 't'
                };
                char[] buf = {'\\', '\0'};
                int[] shorts = new int[128];
                for (int i = 0; i < mapping.length; i += 2) {
                    buf[1] = mapping[i + 1];
                    shorts[mapping[i]] = IOUtils.getIntUnaligned(buf, 0);
                }
                ESCAPED_CHARS = shorts;
            }
        }
    }
}