/src/tarantool/src/lib/json/json.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright 2010-2018 Tarantool AUTHORS: please see AUTHORS file. |
3 | | * |
4 | | * Redistribution and use in source and binary forms, with or |
5 | | * without modification, are permitted provided that the following |
6 | | * conditions are met: |
7 | | * |
8 | | * 1. Redistributions of source code must retain the above |
9 | | * copyright notice, this list of conditions and the |
10 | | * following disclaimer. |
11 | | * |
12 | | * 2. Redistributions in binary form must reproduce the above |
13 | | * copyright notice, this list of conditions and the following |
14 | | * disclaimer in the documentation and/or other materials |
15 | | * provided with the distribution. |
16 | | * |
17 | | * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND |
18 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
19 | | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
20 | | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL |
21 | | * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, |
22 | | * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
23 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
24 | | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
25 | | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
26 | | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
27 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF |
28 | | * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
29 | | * SUCH DAMAGE. |
30 | | */ |
31 | | |
32 | | #include "json.h" |
33 | | |
34 | | #include <assert.h> |
35 | | #include <ctype.h> |
36 | | #include <stdbool.h> |
37 | | #include <stddef.h> |
38 | | #include <stdint.h> |
39 | | #include <stdlib.h> |
40 | | #include <string.h> |
41 | | #include <unicode/uchar.h> |
42 | | #include <unicode/utf8.h> |
43 | | |
44 | | #include "trivia/util.h" |
45 | | #include <PMurHash.h> |
46 | | |
47 | | /** |
48 | | * Read a single symbol from a string starting from an offset. |
49 | | * @param lexer JSON path lexer. |
50 | | * @param[out] out Read symbol. |
51 | | * |
52 | | * @retval 0 Success. |
53 | | * @retval > 0 1-based position of a syntax error. |
54 | | */ |
55 | | static inline int |
56 | | json_read_symbol(struct json_lexer *lexer, UChar32 *out) |
57 | 0 | { |
58 | 0 | if (json_lexer_is_eof(lexer)) { |
59 | 0 | *out = U_SENTINEL; |
60 | 0 | return lexer->symbol_count + 1; |
61 | 0 | } |
62 | 0 | U8_NEXT(lexer->src, lexer->offset, lexer->src_len, *out); |
63 | 0 | if (*out == U_SENTINEL) |
64 | 0 | return lexer->symbol_count + 1; |
65 | 0 | ++lexer->symbol_count; |
66 | 0 | return 0; |
67 | 0 | } |
68 | | |
69 | | /** |
70 | | * Roll back the lexer by one symbol. |
71 | | * @param lexer JSON path lexer. |
72 | | * @param offset Offset to the previous symbol. |
73 | | */ |
74 | | static inline void |
75 | | json_revert_symbol(struct json_lexer *lexer, int offset) |
76 | 0 | { |
77 | 0 | lexer->offset = offset; |
78 | 0 | --lexer->symbol_count; |
79 | 0 | } |
80 | | |
81 | | /** Fast forward when the current symbol is known to be a 1-byte char. */ |
82 | | static inline void |
83 | | json_skip_char(struct json_lexer *lexer) |
84 | 0 | { |
85 | 0 | ++lexer->offset; |
86 | 0 | ++lexer->symbol_count; |
87 | 0 | } |
88 | | |
89 | | /** Get the current symbol as a 1-byte char. */ |
90 | | static inline char |
91 | | json_current_char(const struct json_lexer *lexer) |
92 | 0 | { |
93 | 0 | return *(lexer->src + lexer->offset); |
94 | 0 | } |
95 | | |
96 | | /** |
97 | | * Parse a string identifier in quotes. The lexer either stops |
98 | | * right after the closing quote or returns an error position. |
99 | | * @param lexer JSON path lexer. |
100 | | * @param[out] token JSON token to store result. |
101 | | * @param quote_type Quote character that must terminate the string. |
102 | | * |
103 | | * @retval 0 Success. |
104 | | * @retval > 0 1-based position of a syntax error. |
105 | | */ |
106 | | static inline int |
107 | | json_parse_string(struct json_lexer *lexer, struct json_token *token, |
108 | | UChar32 quote_type) |
109 | 0 | { |
110 | 0 | assert(lexer->offset < lexer->src_len); |
111 | 0 | assert(quote_type == json_current_char(lexer)); |
112 | | /* The first symbol is always a 1-byte char: ' or ". */ |
113 | 0 | json_skip_char(lexer); |
114 | 0 | int str_offset = lexer->offset; |
115 | 0 | UChar32 c; |
116 | 0 | int rc; |
117 | 0 | while ((rc = json_read_symbol(lexer, &c)) == 0) { |
118 | 0 | if (c == quote_type) { |
119 | 0 | int len = lexer->offset - str_offset - 1; |
120 | 0 | if (len == 0) |
121 | 0 | return lexer->symbol_count; |
122 | 0 | token->type = JSON_TOKEN_STR; |
123 | 0 | token->str = lexer->src + str_offset; |
124 | 0 | token->len = len; |
125 | 0 | return 0; |
126 | 0 | } |
127 | 0 | } |
128 | 0 | return rc; |
129 | 0 | } |
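 | |
 | | /* |
 | | * Example (illustrative sketch of the behaviour above, using |
 | | * json_lexer_create() as it is used elsewhere in this file): for |
 | | * the path "[\"foo\"]" the lexer yields a JSON_TOKEN_STR token |
 | | * pointing straight into the source buffer. The string is not |
 | | * copied, escape sequences are not processed, an empty string is |
 | | * a syntax error, and single quotes work the same as double ones. |
 | | * |
 | | *     struct json_lexer lexer; |
 | | *     struct json_token token; |
 | | *     json_lexer_create(&lexer, "[\"foo\"]", 7, 0); |
 | | *     json_lexer_next_token(&lexer, &token); |
 | | *     assert(token.type == JSON_TOKEN_STR); |
 | | *     assert(token.len == 3 && memcmp(token.str, "foo", 3) == 0); |
 | | */ |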
130 | | |
131 | | /** |
132 | | * Parse a digit sequence into an integer until a non-digit is met. |
133 | | * The lexer stops right after the last digit. |
134 | | * @param lexer JSON lexer. |
135 | | * @param[out] token JSON token to store result. |
136 | | * |
137 | | * @retval 0 Success. |
138 | | * @retval > 0 1-based position of a syntax error. |
139 | | */ |
140 | | static inline int |
141 | | json_parse_integer(struct json_lexer *lexer, struct json_token *token) |
142 | 0 | { |
143 | 0 | const char *end = lexer->src + lexer->src_len; |
144 | 0 | const char *pos = lexer->src + lexer->offset; |
145 | 0 | assert(pos < end); |
146 | 0 | int len = 0; |
147 | 0 | int value = 0; |
148 | 0 | char c = *pos; |
149 | 0 | if (! isdigit(c)) |
150 | 0 | return lexer->symbol_count + 1; |
151 | 0 | do { |
152 | 0 | value = value * 10 + c - (int)'0'; |
153 | 0 | ++len; |
154 | 0 | } while (++pos < end && isdigit((c = *pos))); |
155 | 0 | if (value < lexer->index_base) |
156 | 0 | return lexer->symbol_count + 1; |
157 | 0 | lexer->offset += len; |
158 | 0 | lexer->symbol_count += len; |
159 | 0 | token->type = JSON_TOKEN_NUM; |
160 | 0 | token->num = value - lexer->index_base; |
161 | 0 | return 0; |
162 | 0 | } |
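 | |
 | | /* |
 | | * Example (illustrative sketch): index_base shifts the accepted |
 | | * range, so a caller using 1-based indexing passes index_base = 1; |
 | | * "[1]" then maps to token.num == 0 and "[0]" is rejected as a |
 | | * syntax error. |
 | | * |
 | | *     struct json_lexer lexer; |
 | | *     struct json_token token; |
 | | *     json_lexer_create(&lexer, "[7]", 3, 1); |
 | | *     json_lexer_next_token(&lexer, &token); |
 | | *     assert(token.type == JSON_TOKEN_NUM && token.num == 6); |
 | | */ |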
163 | | |
164 | | /** |
165 | | * Check whether a symbol can be part of a JSON path outside of |
166 | | * ["..."]. |
167 | | */ |
168 | | static inline bool |
169 | | json_is_valid_identifier_symbol(UChar32 c) |
170 | 0 | { |
171 | 0 | return u_isUAlphabetic(c) || c == (UChar32)'_' || u_isdigit(c); |
172 | 0 | } |
173 | | |
174 | | /** |
175 | | * Parse an unquoted identifier. It can contain only alphas, |
176 | | * digits and underscores, and can not start with a digit. |
177 | | * The lexer stops right after the last alpha, digit or |
178 | | * underscore symbol. |
179 | | * @param lexer JSON lexer. |
180 | | * @param[out] token JSON token to store result. |
181 | | * |
182 | | * @retval 0 Success. |
183 | | * @retval > 0 1-based position of a syntax error. |
184 | | */ |
185 | | static inline int |
186 | | json_parse_identifier(struct json_lexer *lexer, struct json_token *token) |
187 | 0 | { |
188 | 0 | assert(lexer->offset < lexer->src_len); |
189 | 0 | int str_offset = lexer->offset; |
190 | 0 | UChar32 c; |
191 | 0 | int rc = json_read_symbol(lexer, &c); |
192 | 0 | if (rc != 0) |
193 | 0 | return rc; |
194 | | /* The first symbol can not be a digit. */ |
195 | 0 | if (!u_isalpha(c) && c != (UChar32)'_') |
196 | 0 | return lexer->symbol_count; |
197 | 0 | int last_offset = lexer->offset; |
198 | 0 | while ((rc = json_read_symbol(lexer, &c)) == 0) { |
199 | 0 | if (! json_is_valid_identifier_symbol(c)) { |
200 | 0 | json_revert_symbol(lexer, last_offset); |
201 | 0 | break; |
202 | 0 | } |
203 | 0 | last_offset = lexer->offset; |
204 | 0 | } |
205 | 0 | token->type = JSON_TOKEN_STR; |
206 | 0 | token->str = lexer->src + str_offset; |
207 | 0 | token->len = lexer->offset - str_offset; |
208 | 0 | return 0; |
209 | 0 | } |
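 | |
 | | /* |
 | | * Example (illustrative sketch): unquoted identifiers may contain |
 | | * Unicode alphabetic symbols, digits and '_'; an identifier at the |
 | | * very start of a path needs no leading dot. |
 | | * |
 | | *     struct json_lexer lexer; |
 | | *     struct json_token token; |
 | | *     json_lexer_create(&lexer, ".foo_1.bar", 10, 0); |
 | | *     json_lexer_next_token(&lexer, &token); |
 | | *     assert(token.len == 5 && memcmp(token.str, "foo_1", 5) == 0); |
 | | *     json_lexer_next_token(&lexer, &token); |
 | | *     assert(token.len == 3 && memcmp(token.str, "bar", 3) == 0); |
 | | */ |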
210 | | |
211 | | int |
212 | | json_lexer_next_token(struct json_lexer *lexer, struct json_token *token) |
213 | 0 | { |
214 | 0 | if (json_lexer_is_eof(lexer)) { |
215 | 0 | token->type = JSON_TOKEN_END; |
216 | 0 | return 0; |
217 | 0 | } |
218 | 0 | UChar32 c; |
219 | 0 | int last_offset = lexer->offset; |
220 | 0 | int rc = json_read_symbol(lexer, &c); |
221 | 0 | if (rc != 0) |
222 | 0 | return rc; |
223 | 0 | switch(c) { |
224 | 0 | case (UChar32)'[': |
225 | | /* Error for '[\0'. */ |
226 | 0 | if (json_lexer_is_eof(lexer)) |
227 | 0 | return lexer->symbol_count; |
228 | 0 | c = json_current_char(lexer); |
229 | 0 | if (c == '"' || c == '\'') { |
230 | 0 | rc = json_parse_string(lexer, token, c); |
231 | 0 | } else if (c == '*') { |
232 | 0 | json_skip_char(lexer); |
233 | 0 | token->type = JSON_TOKEN_ANY; |
234 | 0 | } else { |
235 | 0 | rc = json_parse_integer(lexer, token); |
236 | 0 | } |
237 | 0 | if (rc != 0) |
238 | 0 | return rc; |
239 | | /* |
240 | | * An expression that starts with [ must be finished |
241 | | * with ] regardless of its type. |
242 | | */ |
243 | 0 | if (json_lexer_is_eof(lexer) || |
244 | 0 | json_current_char(lexer) != ']') |
245 | 0 | return lexer->symbol_count + 1; |
246 | | /* Skip ] - one byte char. */ |
247 | 0 | json_skip_char(lexer); |
248 | 0 | return 0; |
249 | 0 | case (UChar32)'.': |
250 | 0 | if (json_lexer_is_eof(lexer)) |
251 | 0 | return lexer->symbol_count + 1; |
252 | 0 | return json_parse_identifier(lexer, token); |
253 | 0 | default: |
254 | 0 | if (last_offset != 0) |
255 | 0 | return lexer->symbol_count; |
256 | 0 | json_revert_symbol(lexer, last_offset); |
257 | 0 | return json_parse_identifier(lexer, token); |
258 | 0 | } |
259 | 0 | } |
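 | |
 | | /* |
 | | * Example (illustrative sketch): a typical tokenization loop. For |
 | | * "foo[0].bar[*]" with index_base = 0 the lexer yields STR "foo", |
 | | * NUM 0, STR "bar", ANY and finally JSON_TOKEN_END; a non-zero |
 | | * return value is the 1-based position of the first syntax error. |
 | | * |
 | | *     struct json_lexer lexer; |
 | | *     struct json_token token; |
 | | *     int rc; |
 | | *     json_lexer_create(&lexer, "foo[0].bar[*]", 13, 0); |
 | | *     while ((rc = json_lexer_next_token(&lexer, &token)) == 0 && |
 | | *            token.type != JSON_TOKEN_END) |
 | | *             ; |
 | | *     assert(rc == 0); |
 | | */ |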
260 | | |
261 | | /** |
262 | | * Compare JSON tokens as nodes of a JSON tree. That is, including |
263 | | * parent references. |
264 | | */ |
265 | | static int |
266 | | json_token_cmp_in_tree(const struct json_token *a, const struct json_token *b) |
267 | 0 | { |
268 | 0 | if (a->parent != b->parent) |
269 | 0 | return a->parent - b->parent; |
270 | 0 | return json_token_cmp(a, b); |
271 | 0 | } |
272 | | |
273 | | int |
274 | | json_path_cmp(const char *a, int a_len, const char *b, int b_len, |
275 | | int index_base) |
276 | 0 | { |
277 | 0 | struct json_lexer lexer_a, lexer_b; |
278 | 0 | json_lexer_create(&lexer_a, a, a_len, index_base); |
279 | 0 | json_lexer_create(&lexer_b, b, b_len, index_base); |
280 | 0 | struct json_token token_a, token_b; |
281 | | /* For the sake of json_token_cmp_in_tree(). */ |
282 | 0 | token_a.parent = NULL; |
283 | 0 | token_b.parent = NULL; |
284 | 0 | int rc_a, rc_b; |
285 | 0 | while ((rc_a = json_lexer_next_token(&lexer_a, &token_a)) == 0 && |
286 | 0 | (rc_b = json_lexer_next_token(&lexer_b, &token_b)) == 0 && |
287 | 0 | token_a.type != JSON_TOKEN_END && |
288 | 0 | token_b.type != JSON_TOKEN_END) { |
289 | 0 | int rc = json_token_cmp_in_tree(&token_a, &token_b); |
290 | 0 | if (rc != 0) |
291 | 0 | return rc; |
292 | 0 | } |
293 | | /* Paths a and b must be valid. */ |
294 | 0 | assert(rc_a == 0 && rc_b == 0); |
295 | | /* |
296 | | * The parser stopped because the end of one of the paths |
297 | | * was reached. As JSON_TOKEN_END > JSON_TOKEN_{NUM, STR}, |
298 | | * the longer path has the lower token type and compares greater. |
299 | | */ |
300 | 0 | return token_b.type - token_a.type; |
301 | 0 | } |
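 | |
 | | /* |
 | | * Example (illustrative sketch, assuming json_token_cmp() from |
 | | * json.h compares tokens by type and content): paths producing |
 | | * identical token streams are equal regardless of spelling, and a |
 | | * path that is a proper prefix of another compares less. |
 | | * |
 | | *     assert(json_path_cmp("foo[0]", 6, "[\"foo\"][0]", 10, 0) == 0); |
 | | *     assert(json_path_cmp("foo", 3, "foo[0]", 6, 0) < 0); |
 | | */ |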
302 | | |
303 | | int |
304 | | json_path_validate(const char *path, int path_len, int index_base) |
305 | 0 | { |
306 | 0 | struct json_lexer lexer; |
307 | 0 | json_lexer_create(&lexer, path, path_len, index_base); |
308 | 0 | struct json_token token; |
309 | 0 | int rc; |
310 | 0 | while ((rc = json_lexer_next_token(&lexer, &token)) == 0 && |
311 | 0 | token.type != JSON_TOKEN_END) {} |
312 | 0 | return rc; |
313 | 0 | } |
314 | | |
315 | | int |
316 | | json_path_multikey_offset(const char *path, int path_len, int index_base) |
317 | 0 | { |
318 | 0 | struct json_lexer lexer; |
319 | 0 | json_lexer_create(&lexer, path, path_len, index_base); |
320 | 0 | struct json_token token; |
321 | 0 | int rc, last_lexer_offset = 0; |
322 | 0 | while ((rc = json_lexer_next_token(&lexer, &token)) == 0) { |
323 | 0 | if (token.type == JSON_TOKEN_ANY) |
324 | 0 | return last_lexer_offset; |
325 | 0 | else if (token.type == JSON_TOKEN_END) |
326 | 0 | break; |
327 | 0 | last_lexer_offset = lexer.offset; |
328 | 0 | } |
329 | 0 | assert(rc == 0); |
330 | 0 | return path_len; |
331 | 0 | } |
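 | |
 | | /* |
 | | * Example (illustrative sketch): the returned offset points at the |
 | | * "[*]" part of a multikey path and equals path_len when the path |
 | | * contains no [*] token at all. |
 | | * |
 | | *     assert(json_path_multikey_offset("foo.bar[*].baz", 14, 0) == 7); |
 | | *     assert(json_path_multikey_offset("foo.bar", 7, 0) == 7); |
 | | */ |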
332 | | |
333 | | /** |
334 | | * A snprintf-style helper to print an individual token key. |
335 | | */ |
336 | | static int |
337 | | json_token_snprint(char *buf, int size, const struct json_token *token, |
338 | | int index_base) |
339 | 0 | { |
340 | 0 | int len = 0; |
341 | 0 | switch (token->type) { |
342 | 0 | case JSON_TOKEN_NUM: |
343 | 0 | len = snprintf(buf, size, "[%d]", token->num + index_base); |
344 | 0 | break; |
345 | 0 | case JSON_TOKEN_STR: |
346 | 0 | len = snprintf(buf, size, "[\"%.*s\"]", token->len, token->str); |
347 | 0 | break; |
348 | 0 | case JSON_TOKEN_ANY: |
349 | 0 | len = snprintf(buf, size, "[*]"); |
350 | 0 | break; |
351 | 0 | default: |
352 | 0 | unreachable(); |
353 | 0 | } |
354 | 0 | return len; |
355 | 0 | } |
356 | | |
357 | | int |
358 | | json_tree_snprint_path(char *buf, int size, const struct json_token *token, |
359 | | int index_base) |
360 | 0 | { |
361 | | /* The token must be linked in a tree. */ |
362 | 0 | assert(token->parent != NULL); |
363 | | |
364 | | /* |
365 | | * Calculate the length of the final path string by passing |
366 | | * a 0-length buffer to the json_token_snprint() routine. |
367 | | */ |
368 | 0 | int len = 0; |
369 | 0 | for (const struct json_token *iter = token; |
370 | 0 | iter->type != JSON_TOKEN_END; iter = iter->parent) |
371 | 0 | len += json_token_snprint(NULL, 0, iter, index_base); |
372 | | |
373 | | /* |
374 | | * Nothing else to do if the caller is only interested in |
375 | | * the string length. |
376 | | */ |
377 | 0 | if (size == 0) |
378 | 0 | return len; |
379 | | |
380 | | /* |
381 | | * Write the path to the supplied buffer. |
382 | | */ |
383 | 0 | int pos = len; |
384 | 0 | char last = '\0'; |
385 | 0 | for (const struct json_token *iter = token; |
386 | 0 | iter->type != JSON_TOKEN_END; iter = iter->parent) { |
387 | 0 | pos -= json_token_snprint(NULL, 0, iter, index_base); |
388 | 0 | assert(pos >= 0); |
389 | 0 | if (pos >= size) { |
390 | | /* The token doesn't fit in the buffer. */ |
391 | 0 | continue; |
392 | 0 | } |
393 | 0 | int rc = json_token_snprint(buf + pos, size - pos, |
394 | 0 | iter, index_base); |
395 | | /* |
396 | | * Restore the character overwritten with |
397 | | * a terminating nul by json_token_snprint(). |
398 | | */ |
399 | 0 | if (last != '\0') { |
400 | 0 | assert(pos + rc < size); |
401 | 0 | buf[pos + rc] = last; |
402 | 0 | } |
403 | 0 | last = buf[pos]; |
404 | 0 | } |
405 | 0 | return len; |
406 | 0 | } |
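 | |
 | | /* |
 | | * Example (illustrative sketch, index_base 0 for brevity): like |
 | | * snprintf(), the full length is returned regardless of the buffer |
 | | * size, so the usual two-pass pattern works. A chain of tokens |
 | | * root -> "foo" -> 0 is rendered as ["foo"][0]; string parts always |
 | | * use the bracket form even if the path was written with a dot. |
 | | * |
 | | *     int len = json_tree_snprint_path(NULL, 0, token, 0); |
 | | *     char *path = malloc(len + 1); |
 | | *     if (path != NULL) |
 | | *             json_tree_snprint_path(path, len + 1, token, 0); |
 | | */ |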
407 | | |
408 | | #define MH_SOURCE 1 |
409 | | #define mh_name _json |
410 | | #define mh_key_t struct json_token * |
411 | 0 | #define mh_node_t struct json_token * |
412 | | #define mh_arg_t void * |
413 | 0 | #define mh_hash(a, arg) ((*(a))->hash) |
414 | 0 | #define mh_hash_key(a, arg) ((a)->hash) |
415 | 0 | #define mh_cmp(a, b, arg) (json_token_cmp_in_tree(*(a), *(b))) |
416 | 0 | #define mh_cmp_key(a, b, arg) (json_token_cmp_in_tree((a), *(b))) |
417 | | #include "salad/mhash.h" |
418 | | |
419 | | static const uint32_t hash_seed = 13U; |
420 | | |
421 | | /** |
422 | | * Compute the hash value of a JSON token. |
423 | | * |
424 | | * See the comment to json_tree::hash for implementation details. |
425 | | */ |
426 | | static uint32_t |
427 | | json_token_hash(struct json_token *token) |
428 | 0 | { |
429 | 0 | uint32_t h = token->parent->hash; |
430 | 0 | uint32_t carry = 0; |
431 | 0 | const void *data; |
432 | 0 | int data_size; |
433 | 0 | if (token->type == JSON_TOKEN_STR) { |
434 | 0 | data = token->str; |
435 | 0 | data_size = token->len; |
436 | 0 | } else if (token->type == JSON_TOKEN_NUM) { |
437 | 0 | data = &token->num; |
438 | 0 | data_size = sizeof(token->num); |
439 | 0 | } else if (token->type == JSON_TOKEN_ANY) { |
440 | 0 | data = "*"; |
441 | 0 | data_size = 1; |
442 | 0 | } else { |
443 | 0 | unreachable(); |
444 | 0 | } |
445 | 0 | PMurHash32_Process(&h, &carry, data, data_size); |
446 | 0 | return PMurHash32_Result(h, carry, data_size); |
447 | 0 | } |
448 | | |
449 | | void |
450 | | json_tree_create(struct json_tree *tree) |
451 | 2 | { |
452 | 2 | memset(tree, 0, sizeof(struct json_tree)); |
453 | 2 | tree->root.hash = hash_seed; |
454 | 2 | tree->root.type = JSON_TOKEN_END; |
455 | 2 | tree->root.max_child_idx = -1; |
456 | 2 | tree->root.sibling_idx = -1; |
457 | 2 | tree->hash = mh_json_new(); |
458 | 2 | } |
459 | | |
460 | | static void |
461 | | json_token_destroy(struct json_token *token) |
462 | 0 | { |
463 | 0 | assert(token->max_child_idx == -1); |
464 | 0 | assert(token->sibling_idx == -1); |
465 | 0 | free(token->children); |
466 | 0 | token->children = NULL; |
467 | 0 | } |
468 | | |
469 | | void |
470 | | json_tree_destroy(struct json_tree *tree) |
471 | 0 | { |
472 | 0 | json_token_destroy(&tree->root); |
473 | 0 | mh_json_delete(tree->hash); |
474 | 0 | } |
475 | | |
476 | | struct json_token * |
477 | | json_tree_lookup_slowpath(struct json_tree *tree, struct json_token *parent, |
478 | | const struct json_token *token) |
479 | 0 | { |
480 | 0 | assert(token->type == JSON_TOKEN_STR); |
481 | 0 | struct json_token key; |
482 | 0 | key.type = token->type; |
483 | 0 | key.str = token->str; |
484 | 0 | key.len = token->len; |
485 | 0 | key.parent = parent; |
486 | 0 | key.hash = json_token_hash(&key); |
487 | 0 | mh_int_t id = mh_json_find(tree->hash, &key, NULL); |
488 | 0 | if (id == mh_end(tree->hash)) |
489 | 0 | return NULL; |
490 | 0 | struct json_token **entry = mh_json_node(tree->hash, id); |
491 | 0 | assert(entry != NULL && *entry != NULL); |
492 | 0 | return *entry; |
493 | 0 | } |
494 | | |
495 | | int |
496 | | json_tree_add(struct json_tree *tree, struct json_token *parent, |
497 | | struct json_token *token) |
498 | 0 | { |
499 | 0 | assert(json_tree_lookup(tree, parent, token) == NULL); |
500 | 0 | token->parent = parent; |
501 | 0 | token->children = NULL; |
502 | 0 | token->children_capacity = 0; |
503 | 0 | token->max_child_idx = -1; |
504 | 0 | token->hash = json_token_hash(token); |
505 | 0 | int insert_idx = token->type == JSON_TOKEN_NUM ? |
506 | 0 | (int)token->num : parent->max_child_idx + 1; |
507 | | /* |
508 | | * Dynamically grow the children array if necessary. |
509 | | */ |
510 | 0 | if (insert_idx >= parent->children_capacity) { |
511 | 0 | int new_size = parent->children_capacity == 0 ? |
512 | 0 | 8 : 2 * parent->children_capacity; |
513 | 0 | while (insert_idx >= new_size) |
514 | 0 | new_size *= 2; |
515 | 0 | struct json_token **children = realloc(parent->children, |
516 | 0 | new_size * sizeof(void *)); |
517 | 0 | if (children == NULL) |
518 | 0 | return -1; /* out of memory */ |
519 | 0 | memset(children + parent->children_capacity, 0, |
520 | 0 | (new_size - parent->children_capacity) * sizeof(void *)); |
521 | 0 | parent->children = children; |
522 | 0 | parent->children_capacity = new_size; |
523 | 0 | } |
524 | | /* |
525 | | * Insert the token into the hash (only for tokens representing |
526 | | * JSON map entries, see the comment to json_tree::hash). |
527 | | */ |
528 | 0 | if (token->type == JSON_TOKEN_STR) { |
529 | 0 | mh_json_put(tree->hash, (const struct json_token **)&token, |
530 | 0 | NULL, NULL); |
531 | 0 | } |
532 | | /* |
533 | | * Success, now we can insert the new token into its parent's |
534 | | * children array. |
535 | | */ |
536 | 0 | assert(parent->children[insert_idx] == NULL); |
537 | 0 | parent->children[insert_idx] = token; |
538 | 0 | parent->max_child_idx = MAX(parent->max_child_idx, insert_idx); |
539 | 0 | token->sibling_idx = insert_idx; |
540 | 0 | assert(json_tree_lookup(tree, parent, token) == token); |
541 | 0 | return 0; |
542 | 0 | } |
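 | |
 | | /* |
 | | * Example (illustrative sketch): the caller fills in only the key |
 | | * part of a token (type plus str/len or num); json_tree_add() sets |
 | | * up the parent link, the hash and the children slot. The token and |
 | | * the memory behind token->str must stay alive for as long as the |
 | | * token is linked in the tree. |
 | | * |
 | | *     struct json_tree tree; |
 | | *     json_tree_create(&tree); |
 | | *     static struct json_token foo, first; |
 | | *     foo.type = JSON_TOKEN_STR; |
 | | *     foo.str = "foo"; |
 | | *     foo.len = 3; |
 | | *     if (json_tree_add(&tree, &tree.root, &foo) != 0) |
 | | *             return -1; |
 | | *     first.type = JSON_TOKEN_NUM; |
 | | *     first.num = 0; |
 | | *     if (json_tree_add(&tree, &foo, &first) != 0) |
 | | *             return -1; |
 | | */ |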
543 | | |
544 | | void |
545 | | json_tree_del(struct json_tree *tree, struct json_token *token) |
546 | 0 | { |
547 | 0 | struct json_token *parent = token->parent; |
548 | 0 | assert(token->sibling_idx >= 0); |
549 | 0 | assert(parent->children[token->sibling_idx] == token); |
550 | 0 | assert(json_tree_lookup(tree, parent, token) == token); |
551 | | /* |
552 | | * Clear the entry corresponding to this token in parent's |
553 | | * children array and update max_child_idx if necessary. |
554 | | */ |
555 | 0 | parent->children[token->sibling_idx] = NULL; |
556 | 0 | token->sibling_idx = -1; |
557 | 0 | while (parent->max_child_idx >= 0 && |
558 | 0 | parent->children[parent->max_child_idx] == NULL) |
559 | 0 | parent->max_child_idx--; |
560 | | /* |
561 | | * Remove the token from the hash (only for tokens representing |
562 | | * JSON map entries, see the comment to json_tree::hash). |
563 | | */ |
564 | 0 | if (token->type == JSON_TOKEN_STR) { |
565 | 0 | mh_int_t id = mh_json_find(tree->hash, token, NULL); |
566 | 0 | assert(id != mh_end(tree->hash)); |
567 | 0 | mh_json_del(tree->hash, id, NULL); |
568 | 0 | } |
569 | 0 | json_token_destroy(token); |
570 | 0 | assert(json_tree_lookup(tree, parent, token) == NULL); |
571 | 0 | } |
572 | | |
573 | | struct json_token * |
574 | | json_tree_lookup_path(struct json_tree *tree, struct json_token *root, |
575 | | const char *path, int path_len, int index_base) |
576 | 0 | { |
577 | 0 | int rc; |
578 | 0 | struct json_lexer lexer; |
579 | 0 | struct json_token token; |
580 | 0 | struct json_token *ret = root; |
581 | 0 | json_lexer_create(&lexer, path, path_len, index_base); |
582 | 0 | while ((rc = json_lexer_next_token(&lexer, &token)) == 0 && |
583 | 0 | token.type != JSON_TOKEN_END && ret != NULL) { |
584 | | /* |
585 | | * We could skip intermediate lookups by computing |
586 | | * a rolling hash of all tokens produced by the |
587 | | * lexer. But then we would still have to traverse |
588 | | * the path back to ensure it is equal to |
589 | | * the found one. For that we would have |
590 | | * to keep the stack of lexer tokens somewhere. |
591 | | * Given the complications of the alternative, |
592 | | * intermediate lookups don't seem to be so big of |
593 | | * a problem. |
594 | | */ |
595 | 0 | ret = json_tree_lookup(tree, ret, &token); |
596 | 0 | } |
597 | 0 | if (rc != 0 || token.type != JSON_TOKEN_END) |
598 | 0 | return NULL; |
599 | 0 | return ret; |
600 | 0 | } |
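 | |
 | | /* |
 | | * Example (illustrative sketch, continuing the json_tree_add() |
 | | * sketch above): the same nodes can be found again by path string, |
 | | * provided the lookup uses the same index_base. |
 | | * |
 | | *     struct json_token *node = |
 | | *             json_tree_lookup_path(&tree, &tree.root, "foo[0]", 6, 0); |
 | | *     assert(node == &first); |
 | | *     assert(json_tree_lookup_path(&tree, &tree.root, "x", 1, 0) == NULL); |
 | | */ |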
601 | | |
602 | | /** |
603 | | * Return the child of @parent following @pos or NULL if @pos |
604 | | * points to the last child in the children array. If @pos is |
605 | | * NULL, this function returns the first child. |
606 | | */ |
607 | | static struct json_token * |
608 | | json_tree_child_next(struct json_token *parent, struct json_token *pos) |
609 | 2 | { |
610 | 2 | assert(pos == NULL || pos->parent == parent); |
611 | 2 | struct json_token **arr = parent->children; |
612 | 2 | if (arr == NULL) |
613 | 2 | return NULL; |
614 | 0 | int idx = pos != NULL ? pos->sibling_idx + 1 : 0; |
615 | 0 | while (idx <= parent->max_child_idx && arr[idx] == NULL) |
616 | 0 | idx++; |
617 | 0 | return idx <= parent->max_child_idx ? arr[idx] : NULL; |
618 | 2 | } |
619 | | |
620 | | /** |
621 | | * Return the leftmost descendant of the tree rooted at @pos |
622 | | * or @pos itself if it has no children. |
623 | | */ |
624 | | static struct json_token * |
625 | | json_tree_leftmost(struct json_token *pos) |
626 | 0 | { |
627 | 0 | struct json_token *last; |
628 | 0 | do { |
629 | 0 | last = pos; |
630 | 0 | pos = json_tree_child_next(pos, NULL); |
631 | 0 | } while (pos != NULL); |
632 | 0 | return last; |
633 | 0 | } |
634 | | |
635 | | struct json_token * |
636 | | json_tree_preorder_next(struct json_token *root, struct json_token *pos) |
637 | 2 | { |
638 | 2 | struct json_token *next = json_tree_child_next(pos, NULL); |
639 | 2 | if (next != NULL) |
640 | 0 | return next; |
641 | 2 | while (pos != root) { |
642 | 0 | next = json_tree_child_next(pos->parent, pos); |
643 | 0 | if (next != NULL) |
644 | 0 | return next; |
645 | 0 | pos = pos->parent; |
646 | 0 | } |
647 | 2 | return NULL; |
648 | 2 | } |
649 | | |
650 | | struct json_token * |
651 | | json_tree_postorder_next(struct json_token *root, struct json_token *pos) |
652 | 0 | { |
653 | 0 | struct json_token *next; |
654 | 0 | if (pos == NULL) |
655 | 0 | return json_tree_leftmost(root); |
656 | 0 | if (pos == root) |
657 | 0 | return NULL; |
658 | 0 | next = json_tree_child_next(pos->parent, pos); |
659 | 0 | if (next != NULL) |
660 | 0 | return json_tree_leftmost(next); |
661 | 0 | return pos->parent; |
662 | 0 | } |
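 | |
 | | /* |
 | | * Example (illustrative sketch): walking a tree with the raw |
 | | * iterators. The pre-order walk starts from the root token itself |
 | | * and does not report it, visiting parents before their children; |
 | | * the post-order walk starts from a NULL position, visits children |
 | | * before their parents and reports the root last. |
 | | * |
 | | *     struct json_token *pos = &tree.root; |
 | | *     while ((pos = json_tree_preorder_next(&tree.root, pos)) != NULL) |
 | | *             ; |
 | | *     pos = NULL; |
 | | *     while ((pos = json_tree_postorder_next(&tree.root, pos)) != NULL) |
 | | *             ; |
 | | */ |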