// JsonExtract.java

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.operator.scalar;

import com.facebook.airlift.json.JsonObjectMapperProvider;
import com.facebook.presto.common.function.SqlFunctionProperties;
import com.facebook.presto.spi.PrestoException;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.core.io.SerializedString;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;
import io.airlift.slice.DynamicSliceOutput;
import io.airlift.slice.Slice;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UncheckedIOException;

import static com.facebook.presto.spi.StandardErrorCode.INVALID_FUNCTION_ARGUMENT;
import static com.facebook.presto.util.JsonUtil.createJsonGenerator;
import static com.facebook.presto.util.JsonUtil.createJsonParser;
import static com.fasterxml.jackson.core.JsonFactory.Feature.CANONICALIZE_FIELD_NAMES;
import static com.fasterxml.jackson.core.JsonToken.END_ARRAY;
import static com.fasterxml.jackson.core.JsonToken.END_OBJECT;
import static com.fasterxml.jackson.core.JsonToken.FIELD_NAME;
import static com.fasterxml.jackson.core.JsonToken.START_ARRAY;
import static com.fasterxml.jackson.core.JsonToken.START_OBJECT;
import static com.fasterxml.jackson.core.JsonToken.VALUE_NULL;
import static com.fasterxml.jackson.databind.SerializationFeature.ORDER_MAP_ENTRIES_BY_KEYS;
import static io.airlift.slice.Slices.utf8Slice;
import static java.util.Objects.requireNonNull;

/**
 * Extracts values from JSON
 * <p/>
 * Supports the following JSON path primitives:
 * <pre>
 *    $ : Root object
 *    . or [] : Child operator
 *    [] : Subscript operator for arrays
 * </pre>
 * <p/>
 * Supported JSON Path Examples:
 * <pre>
 *    { "store": {
 *        "book": [
 *          { "category": "reference",
 *            "author": "Nigel Rees",
 *            "title": "Sayings of the Century",
 *            "price": 8.95,
 *            "contributors": [["Adam", "Levine"], ["Bob", "Strong"]]
 *          },
 *          { "category": "fiction",
 *            "author": "Evelyn Waugh",
 *            "title": "Sword of Honour",
 *            "price": 12.99,
 *            "isbn": "0-553-21311-3",
 *            "last_owner": null
 *          }
 *        ],
 *        "bicycle": {
 *          "color": "red",
 *          "price": 19.95
 *        }
 *      }
 *    }
 * </pre>
 * <p/>
 * With only scalar values using dot-notation of path:
 * <pre>
 *    $.store.book[0].author => Nigel Rees
 *    $.store.bicycle.price => 19.95
 *    $.store.book[0].isbn => NULL (Doesn't exist becomes java null)
 *    $.store.book[1].last_owner => NULL (json null becomes java null)
 *    $.store.book[0].contributors[0][1] => Levine
 * </pre>
 * <p/>
 * With json values using dot-notation of path:
 * <pre>
 *    $.store.book[0].author => "Nigel Rees"
 *    $.store.bicycle.price => 19.95
 *    $.store.book[0].isbn => NULL (Doesn't exist becomes java null)
 *    $.store.book[1].last_owner => null (json null becomes the string "null")
 *    $.store.book[0].contributors[0] => ["Adam", "Levine"]
 *    $.store.bicycle => {"color": "red", "price": 19.95}
 * </pre>
 * <p/>
 * With only scalar values using bracket-notation of path:
 * <pre>
 *    $["store"]["book"][0]["author"] => Nigel Rees
 *    $["store"]["bicycle"]["price"] => 19.95
 *    $["store"]["book"][0]["isbn"] => NULL (Doesn't exist becomes java null)
 *    $["store"]["book"][1]["last_owner"] => NULL (json null becomes java null)
 *    $["store"]["book"][0]["contributors"][0][1] => Levine
 * </pre>
 * <p/>
 * With json values using bracket-notation of path:
 * <pre>
 *    $["store"]["book"][0]["author"] => "Nigel Rees"
 *    $["store"]["bicycle"]["price"] => 19.95
 *    $["store"]["book"][0]["isbn"] => NULL (Doesn't exist becomes java null)
 *    $["store"]["book"][1]["last_owner"] => null (json null becomes the string "null")
 *    $["store"]["book"][0]["contributors"][0] => ["Adam", "Levine"]
 *    $["store"]["bicycle"] => {"color": "red", "price": 19.95}
 * </pre>
 */
public final class JsonExtract
{
    // Initial size estimate handed to the DynamicSliceOutput buffers that collect extracted JSON.
    private static final int ESTIMATED_JSON_OUTPUT_SIZE = 512;

    // Factory used for the parsers and generators created in this class;
    // field-name canonicalization is switched off on it.
    private static final JsonFactory JSON_FACTORY = new JsonFactory()
            .disable(CANONICALIZE_FIELD_NAMES);

    // Mapper configured to order map entries by key; used when writing canonicalized JSON output.
    private static final ObjectMapper SORTED_MAPPER = new JsonObjectMapperProvider().get().configure(ORDER_MAP_ENTRIES_BY_KEYS, true);

    // Holder of static helpers and nested types only; never instantiated.
    private JsonExtract() {}

    /**
     * Runs {@code jsonExtractor} over the JSON held in {@code jsonInput}.
     *
     * @param jsonInput slice holding the JSON to read; must not be null
     * @param jsonExtractor extractor applied to the input
     * @param properties function properties forwarded to the extractor
     * @return the extractor's result, or null when a {@link JsonParseException} is raised while extracting
     * @throws UncheckedIOException wrapping any other {@link IOException} raised by the extractor
     */
    public static <T> T extract(Slice jsonInput, JsonExtractor<T> jsonExtractor, SqlFunctionProperties properties)
    {
        requireNonNull(jsonInput, "jsonInput is null");

        T result;
        try {
            result = jsonExtractor.extract(jsonInput.getInput(), properties);
        }
        catch (JsonParseException parseFailure) {
            // Input that cannot be parsed yields "no value" instead of an error
            result = null;
        }
        catch (IOException ioFailure) {
            throw new UncheckedIOException(ioFailure);
        }
        return result;
    }

    /**
     * Builds an extractor for {@code path} that returns null, rather than failing,
     * when an array index is out of bounds.
     *
     * @param path JSON path to tokenize
     * @param rootExtractor extractor applied to the value found at the end of the path
     * @return the extractor chain for the whole path
     */
    public static <T> PrestoJsonExtractor<T> generateExtractor(String path, PrestoJsonExtractor<T> rootExtractor)
    {
        return generateExtractor(path, rootExtractor, false);
    }

    /**
     * Builds an extractor for {@code path} by wrapping {@code rootExtractor} in one
     * {@link ObjectFieldJsonExtractor} per path token.
     *
     * @param path JSON path to tokenize
     * @param rootExtractor extractor applied to the value found at the end of the path
     * @param exceptionOnOutOfBounds passed to every created {@link ObjectFieldJsonExtractor}
     * @return the extractor chain for the whole path; {@code rootExtractor} itself when the path yields no tokens
     */
    public static <T> PrestoJsonExtractor<T> generateExtractor(String path, PrestoJsonExtractor<T> rootExtractor, boolean exceptionOnOutOfBounds)
    {
        ImmutableList<String> pathTokens = ImmutableList.copyOf(new JsonPathTokenizer(path));

        // Wrap from the last token backwards so that the first token ends up as the outermost extractor
        PrestoJsonExtractor<T> chain = rootExtractor;
        for (int position = pathTokens.size() - 1; position >= 0; position--) {
            chain = new ObjectFieldJsonExtractor<>(pathTokens.get(position), chain, exceptionOnOutOfBounds);
        }
        return chain;
    }

    /**
     * Extracts a value of type {@code T} from JSON supplied as an input stream.
     */
    public interface JsonExtractor<T>
    {
        /**
         * Reads JSON from {@code inputStream} and produces the extracted value.
         *
         * @param inputStream stream to read the JSON from
         * @param properties function properties made available to the extraction
         * @return the extracted value
         * @throws IOException if reading or parsing the input fails
         */
        T extract(InputStream inputStream, SqlFunctionProperties properties)
                throws IOException;
    }

    /**
     * Base class for extractors that work on a {@link JsonParser}; it supplies the
     * stream-based entry point on top of the parser-based one.
     */
    public abstract static class PrestoJsonExtractor<T>
            implements JsonExtractor<T>
    {
        /**
         * Performs the extraction against the content already available in the JsonParser and returns the match.
         * <p/>
         * Notes:
         * <ul>
         * <li>The JsonParser must be positioned on the FIRST token of the value to process when extract is called</li>
         * <li>INVARIANT: when extract() returns, the current token of the parser will be the LAST token of the value</li>
         * </ul>
         *
         * @return the value, or null if not applicable
         */
        abstract T extract(JsonParser jsonParser, SqlFunctionProperties properties)
                throws IOException;

        /**
         * Opens a parser over {@code inputStream}, moves it to the first token and delegates
         * to the parser-based extraction.
         *
         * @return the extracted value, or null when the input contains no token at all
         */
        @Override
        public T extract(InputStream inputStream, SqlFunctionProperties properties)
                throws IOException
        {
            try (JsonParser parser = createJsonParser(JSON_FACTORY, inputStream)) {
                // Advance to the first token; only continue when there is one
                if (parser.nextToken() != null) {
                    return extract(parser, properties);
                }
                return null;
            }
        }
    }

    /**
     * Extractor for one path element: selects a field of a JSON object by name, or an
     * element of a JSON array by index, and hands the selected value to a delegate extractor.
     */
    public static class ObjectFieldJsonExtractor<T>
            extends PrestoJsonExtractor<T>
    {
        private final SerializedString fieldName;
        private final PrestoJsonExtractor<? extends T> delegate;
        private final int index;
        private final boolean exceptionOnOutOfBounds;

        /**
         * Creates an extractor that returns null for out-of-bounds array indexes.
         */
        public ObjectFieldJsonExtractor(String fieldName, PrestoJsonExtractor<? extends T> delegate)
        {
            this(fieldName, delegate, false);
        }

        /**
         * @param fieldName name of the object field; also parsed as the array index when it is an integer
         * @param delegate extractor applied to the selected value
         * @param exceptionOnOutOfBounds whether reaching the end of an array without a match raises an error
         */
        public ObjectFieldJsonExtractor(String fieldName, PrestoJsonExtractor<? extends T> delegate, boolean exceptionOnOutOfBounds)
        {
            requireNonNull(fieldName, "fieldName is null");
            this.fieldName = new SerializedString(fieldName);
            this.delegate = requireNonNull(delegate, "delegate is null");
            this.exceptionOnOutOfBounds = exceptionOnOutOfBounds;
            // -1 is stored when the name cannot be parsed as an integer
            this.index = tryParseInt(fieldName, -1);
        }

        /**
         * Dispatches on the current token: objects are searched by field name, arrays by index.
         *
         * @throws JsonParseException if the current token starts neither an object nor an array
         */
        @Override
        public T extract(JsonParser jsonParser, SqlFunctionProperties properties)
                throws IOException
        {
            JsonToken current = jsonParser.getCurrentToken();
            if (current == START_OBJECT) {
                return processJsonObject(jsonParser, properties);
            }
            if (current == START_ARRAY) {
                return processJsonArray(jsonParser, properties);
            }

            throw new JsonParseException(jsonParser, "Expected a JSON object or array");
        }

        /**
         * Scans the fields of the current object for the configured name and applies the
         * delegate to the value of the first match.
         *
         * @return the delegate's result, or null when the object ends without a matching field
         */
        public T processJsonObject(JsonParser jsonParser, SqlFunctionProperties properties)
                throws IOException
        {
            while (true) {
                if (jsonParser.nextFieldName(fieldName)) {
                    // Move from the field name onto the first token of its value
                    jsonParser.nextToken();
                    return delegate.extract(jsonParser, properties);
                }
                if (!jsonParser.hasCurrentToken()) {
                    throw new JsonParseException(jsonParser, "Unexpected end of object");
                }
                if (jsonParser.getCurrentToken() == END_OBJECT) {
                    // No field with the requested name
                    return null;
                }
                // Step over a nested structure when positioned at the start of one
                jsonParser.skipChildren();
            }
        }

        /**
         * Walks the elements of the current array up to the configured index and applies the
         * delegate to that element.
         *
         * @return the delegate's result, or null when the array ends first and
         * {@code exceptionOnOutOfBounds} is false
         * @throws PrestoException if the array ends first and {@code exceptionOnOutOfBounds} is true
         */
        public T processJsonArray(JsonParser jsonParser, SqlFunctionProperties properties)
                throws IOException
        {
            for (int position = 0; ; position++) {
                JsonToken current = jsonParser.nextToken();
                if (current == null) {
                    throw new JsonParseException(jsonParser, "Unexpected end of array");
                }
                if (current == END_ARRAY) {
                    // Ran off the end of the array before reaching the index
                    if (exceptionOnOutOfBounds) {
                        throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Index out of bounds");
                    }
                    return null;
                }
                if (position == index) {
                    return delegate.extract(jsonParser, properties);
                }
                // Step over a nested structure when positioned at the start of one
                jsonParser.skipChildren();
            }
        }
    }

    /**
     * Terminal extractor that returns the text of a scalar JSON value.
     */
    public static class ScalarValueJsonExtractor
            extends PrestoJsonExtractor<Slice>
    {
        /**
         * @return the text of the current token as a UTF-8 slice, or null when the token is
         * a JSON null or is not a scalar value
         * @throws JsonParseException if the parser has no current token
         */
        @Override
        public Slice extract(JsonParser jsonParser, SqlFunctionProperties properties)
                throws IOException
        {
            JsonToken current = jsonParser.getCurrentToken();
            if (current == null) {
                throw new JsonParseException(jsonParser, "Unexpected end of value");
            }
            if (current.isScalarValue() && current != VALUE_NULL) {
                return utf8Slice(jsonParser.getText());
            }
            return null;
        }
    }

    /**
     * Terminal extractor that returns the selected value serialized as JSON.
     */
    public static class JsonValueJsonExtractor
            extends PrestoJsonExtractor<Slice>
    {
        /**
         * Serializes the value at the parser's current position.
         * <p/>
         * When {@code properties.isCanonicalizedJsonExtract()} is false the value is copied
         * as-is via {@link #legacyExtract(JsonParser)}; otherwise it is re-written through a
         * mapper that orders map entries by key.
         *
         * @return the serialized JSON value
         * @throws JsonParseException if the parser has no current token
         * @throws PrestoException with {@code INVALID_FUNCTION_ARGUMENT} if, in canonicalized mode,
         * advancing past the value fails to parse; the parser failure is attached as the cause
         */
        @Override
        public Slice extract(JsonParser jsonParser, SqlFunctionProperties properties)
                throws IOException
        {
            if (!jsonParser.hasCurrentToken()) {
                throw new JsonParseException(jsonParser, "Unexpected end of value");
            }
            if (!properties.isCanonicalizedJsonExtract()) {
                return legacyExtract(jsonParser);
            }
            DynamicSliceOutput dynamicSliceOutput = new DynamicSliceOutput(ESTIMATED_JSON_OUTPUT_SIZE);
            // Write the JSON to output stream with sorted keys
            SORTED_MAPPER.writeValue((OutputStream) dynamicSliceOutput, SORTED_MAPPER.readValue(jsonParser, Object.class));
            // nextToken will throw an exception if there are trailing characters.
            try {
                jsonParser.nextToken();
            }
            catch (JsonParseException e) {
                // Keep the parser failure as the cause so its stack trace and location survive
                throw new PrestoException(INVALID_FUNCTION_ARGUMENT, e.getMessage(), e);
            }
            return dynamicSliceOutput.slice();
        }

        /**
         * Copies the structure at the parser's current position to the output unchanged,
         * without reordering keys.
         *
         * @return the copied JSON value
         */
        public Slice legacyExtract(JsonParser jsonParser)
                throws IOException
        {
            DynamicSliceOutput dynamicSliceOutput = new DynamicSliceOutput(ESTIMATED_JSON_OUTPUT_SIZE);
            try (JsonGenerator jsonGenerator = createJsonGenerator(JSON_FACTORY, dynamicSliceOutput)) {
                jsonGenerator.copyCurrentStructure(jsonParser);
            }
            return dynamicSliceOutput.slice();
        }
    }

    /**
     * Terminal extractor that reports the size of the selected value: the number of
     * elements of an array, the number of fields of an object, and 0 for anything else.
     */
    public static class JsonSizeExtractor
            extends PrestoJsonExtractor<Long>
    {
        /**
         * @return the element or field count, 0 for a value that is neither an array nor an
         * object, or null when the input ends before the array or object is closed
         * @throws JsonParseException if the parser has no current token
         */
        @Override
        public Long extract(JsonParser jsonParser, SqlFunctionProperties properties)
                throws IOException
        {
            if (!jsonParser.hasCurrentToken()) {
                throw new JsonParseException(jsonParser, "Unexpected end of value");
            }

            JsonToken start = jsonParser.getCurrentToken();
            if (start == START_ARRAY) {
                return countArrayElements(jsonParser);
            }
            if (start == START_OBJECT) {
                return countObjectFields(jsonParser);
            }
            return 0L;
        }

        // Counts the direct elements of the array the parser is positioned on; null if the input ends early.
        private static Long countArrayElements(JsonParser parser)
                throws IOException
        {
            for (long count = 0; ; count++) {
                JsonToken token = parser.nextToken();
                if (token == null) {
                    return null;
                }
                if (token == END_ARRAY) {
                    return count;
                }
                // A nested array or object counts as a single element
                parser.skipChildren();
            }
        }

        // Counts the direct fields of the object the parser is positioned on; null if the input ends early.
        private static Long countObjectFields(JsonParser parser)
                throws IOException
        {
            long count = 0;
            for (JsonToken token = parser.nextToken(); token != END_OBJECT; token = parser.nextToken()) {
                if (token == null) {
                    return null;
                }
                if (token == FIELD_NAME) {
                    count++;
                }
                else {
                    // Field value: step over it so that nested field names are not counted
                    parser.skipChildren();
                }
            }
            return count;
        }
    }

    /**
     * Parses {@code fieldName} as a decimal integer.
     *
     * @return the parsed value, or {@code defaultValue} when {@code fieldName} is not a valid integer
     */
    private static int tryParseInt(String fieldName, int defaultValue)
    {
        try {
            return Integer.parseInt(fieldName);
        }
        catch (NumberFormatException ignored) {
            // Not an integer: fall back to the caller-supplied default
            return defaultValue;
        }
    }
}