// SurrogateWrite223Test.java

package com.fasterxml.jackson.core.write;

import java.io.ByteArrayOutputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;

import org.junit.jupiter.api.Test;

import com.fasterxml.jackson.core.*;
import com.fasterxml.jackson.core.json.JsonWriteFeature;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 * Tests for writing characters outside the Basic Multilingual Plane
 * (represented in Java as surrogate pairs) with and without
 * {@link JsonWriteFeature#COMBINE_UNICODE_SURROGATES_IN_UTF8}.
 *<p>
 * The factory fields are treated as read-only fixtures: a test that needs
 * a differently configured factory builds its own instead of reconfiguring
 * one of these.
 */
class SurrogateWrite223Test extends JUnit5TestBase
{
    // Factory created via newStreamFactory(); used as the "default settings" fixture
    private final JsonFactory DEFAULT_JSON_F = newStreamFactory();

    // Factory with surrogate-combining explicitly enabled
    private final JsonFactory SURROGATE_COMBINING_JSON_F = JsonFactory.builder()
            .enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
            .build();

    // for [core#223]
    @Test
    void surrogatesDefaultSetting() throws Exception {
        // default in 2.x should be disabled:
        assertFalse(DEFAULT_JSON_F.isEnabled(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8.mappedFeature()));
    }

    // for [core#223]
    @Test
    void surrogatesByteBacked() throws Exception
    {
        final String toQuote = new String(Character.toChars(0x1F602));
        assertEquals(2, toQuote.length()); // just sanity check

        JsonFactory f = SURROGATE_COMBINING_JSON_F;
        byte[] encoded = writeSingleStringArrayAsBytes(f, toQuote);
        assertEquals(2 + 2 + 4, encoded.length); // brackets, quotes, 4-byte encoding

        // Also parse back to ensure correctness
        try (JsonParser p = f.createParser(encoded)) {
            assertToken(JsonToken.START_ARRAY, p.nextToken());
            assertToken(JsonToken.VALUE_STRING, p.nextToken());
            assertEquals(toQuote, p.getText());
            assertToken(JsonToken.END_ARRAY, p.nextToken());
        }

        // but may revert back to original behavior
        f = JsonFactory.builder()
                .disable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
                .build();
        encoded = writeSingleStringArrayAsBytes(f, toQuote);
        assertEquals(2 + 2 + 12, encoded.length); // brackets, quotes, 2 x 6 byte JSON escape
    }

    // for [core#223]: no change for character-backed (cannot do anything)
    @Test
    void surrogatesCharBacked() throws Exception
    {
        final String toQuote = new String(Character.toChars(0x1F602));
        assertEquals(2, toQuote.length()); // just sanity check

        Writer out = new StringWriter();
        try (JsonGenerator g = DEFAULT_JSON_F.createGenerator(out)) {
            g.writeStartArray();
            g.writeString(toQuote);
            g.writeEndArray();
        }
        final String json = out.toString();
        assertEquals(2 + 2 + 2, json.length()); // brackets, quotes, 2 chars as is

        // Also parse back to ensure correctness
        try (JsonParser p = DEFAULT_JSON_F.createParser(json)) {
            assertToken(JsonToken.START_ARRAY, p.nextToken());
            assertToken(JsonToken.VALUE_STRING, p.nextToken());
            assertEquals(toQuote, p.getText());
            assertToken(JsonToken.END_ARRAY, p.nextToken());
        }
    }

    //https://github.com/FasterXML/jackson-core/issues/1359
    @Test
    void checkNonSurrogates() throws Exception {
        JsonFactory f = SURROGATE_COMBINING_JSON_F;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try (JsonGenerator gen = f.createGenerator(out)) {
            gen.writeStartObject();

            // Inside the BMP, beyond surrogate block; 0xFF0C - full-width comma
            gen.writeStringField("test_full_width", "foo" + new String(Character.toChars(0xFF0C)) + "bar");

            // Inside the BMP, beyond surrogate block; 0xFE6A - small form percent
            gen.writeStringField("test_small_form", "foo" + new String(Character.toChars(0xFE6A)) + "bar");

            // Inside the BMP, before the surrogate block; 0x3042 - Hiragana A
            gen.writeStringField("test_hiragana", "foo" + new String(Character.toChars(0x3042)) + "bar");

            // Outside the BMP; 0x1F60A - emoji
            gen.writeStringField("test_emoji", new String(Character.toChars(0x1F60A)));

            gen.writeEndObject();
        }
        String json = new String(out.toByteArray(), StandardCharsets.UTF_8);
        assertTrue(json.contains("foo\uFF0Cbar"));
        assertTrue(json.contains("foo\uFE6Abar"));
        assertTrue(json.contains("foo\u3042bar"));
        assertTrue(json.contains("\"test_emoji\":\"\uD83D\uDE0A\""));
    }

    @Test
    void checkSurrogateWithCharacterEscapes() throws Exception {
        // Build a dedicated factory: installing custom escapes on the shared
        // SURROGATE_COMBINING_JSON_F would reconfigure a fixture other tests read.
        JsonFactory f = JsonFactory.builder()
                .enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
                .build();
        f.setCharacterEscapes(JsonpCharacterEscapes.instance());
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try (JsonGenerator gen = f.createGenerator(out)) {
            gen.writeStartObject();
            // Outside the BMP; 0x1F60A - emoji
            gen.writeStringField("test_emoji", new String(Character.toChars(0x1F60A)));
            gen.writeEndObject();
        }
        String json = new String(out.toByteArray(), StandardCharsets.UTF_8);
        assertEquals("{\"test_emoji\":\"\uD83D\uDE0A\"}", json);
    }

    //https://github.com/FasterXML/jackson-core/issues/1473
    @Test
    void surrogateCharSplitInTwoSegments() throws Exception
    {
        // UTF8JsonGenerator must avoid splitting surrogate chars
        // into separate segments. We want to test the third segment
        // split to make sure indexes, offsets, etc are all correct.
        // By default, segments split in every 1000 chars.
        // Thus, we need a string with length 2001 where the surrogate is
        // at 2000 and 2001 positions.
        int count = 1999;
        char[] chars = new char[count];
        java.util.Arrays.fill(chars, 'x');
        String base = new String(chars);

        // Surrogate pair for code point U+1FAE1
        final String VALUE = base + "\uD83E\uDEE1";

        // Factory already has surrogate-combining enabled, so no per-generator
        // feature change is needed here
        byte[] encoded = writeSingleStringArrayAsBytes(SURROGATE_COMBINING_JSON_F, VALUE);
        String result = new String(encoded, StandardCharsets.UTF_8);

        // Output is: open bracket, quote, 1999 'x' chars, the emoji, quote, close bracket.
        // Skip the 2 leading chars plus the 'x' run, and drop the 2 trailing chars,
        // to leave just the emoji (2 Java chars once decoded).
        assertEquals("\uD83E\uDEE1", result.substring(count+2, result.length()-2));
    }

    /**
     * Helper that writes a JSON array containing only the given String
     * using a byte-backed generator from the given factory; generator is
     * closed before the bytes are read so that all output has been flushed.
     *
     * @param f Factory used to create the generator
     * @param value String to write as the sole array element
     *
     * @return Bytes written by the generator
     */
    private byte[] writeSingleStringArrayAsBytes(JsonFactory f, String value) throws Exception
    {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try (JsonGenerator g = f.createGenerator(out)) {
            g.writeStartArray();
            g.writeString(value);
            g.writeEndArray();
        }
        return out.toByteArray();
    }
}