/src/hermes/lib/VM/JSLib/JSONLexer.cpp

Source (jump to first uncovered line)
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "JSONLexer.h"

#include "hermes/VM/StringPrimitive.h"
#include "llvh/ADT/ScopeExit.h"

#include "dtoa/dtoa.h"

namespace hermes {
namespace vm {

// Spellings of the three JSON literal words that the lexer hands to
// scanWord(). Declared as constexpr arrays (rather than reassignable
// `const char *` globals) so that neither the characters nor the binding can
// change at runtime; they still decay to `const char *` at the call sites.
static constexpr char TrueString[] = "true";
static constexpr char FalseString[] = "false";
static constexpr char NullString[] = "null";

/// \return true if \p ch is one of the four JSONWhiteSpace characters:
/// <TAB>, <LF>, <CR> or <SP>.
static bool isJSONWhiteSpace(char16_t ch) {
  switch (ch) {
    case u'\t':
    case u'\n':
    case u'\r':
    case u' ':
      return true;
    default:
      return false;
  }
}

/// Scan the next token. A string literal is scanned through
/// scanString<StrAsValue>() (see advanceHelper()).
ExecutionStatus JSONLexer::advance() {
  constexpr bool stringIsKey = false;
  return advanceHelper(stringIsKey);
}

/// Scan the next token. A string literal is scanned through
/// scanString<StrAsSymbol>() (see advanceHelper()), which stores it in the
/// token as a symbol instead of a string.
ExecutionStatus JSONLexer::advanceStrAsSymbol() {
  constexpr bool stringIsKey = true;
  return advanceHelper(stringIsKey);
}

/// Skip leading JSON whitespace, then scan exactly one token into token_.
///
/// \param forKey when true, a string literal is scanned with
///   scanString<StrAsSymbol>(); otherwise with scanString<StrAsValue>().
/// \return RETURNED once a token (or end of input) has been recorded in
///   token_; otherwise whatever the delegated scanner or errorWithChar()
///   returns.
ExecutionStatus JSONLexer::advanceHelper(bool forKey) {
  // Skip whitespaces.
  while (curCharPtr_.hasChar() && isJSONWhiteSpace(*curCharPtr_)) {
    ++curCharPtr_;
  }

  // End of buffer.
  if (!curCharPtr_.hasChar()) {
    token_.setEof();
    return ExecutionStatus::RETURNED;
  }

  const auto firstChar = *curCharPtr_;
  token_.setFirstChar(firstChar);

  // Single-character punctuator: record its kind and step past it. A local
  // lambda is used instead of function-like macros so that no helper names
  // leak into the rest of the translation unit.
  const auto punctuator = [this](JSONTokenKind kind) {
    token_.setPunctuator(kind);
    ++curCharPtr_;
    return ExecutionStatus::RETURNED;
  };

  switch (firstChar) {
    case u'{':
      return punctuator(JSONTokenKind::LBrace);
    case u'}':
      return punctuator(JSONTokenKind::RBrace);
    case u'[':
      return punctuator(JSONTokenKind::LSquare);
    case u']':
      return punctuator(JSONTokenKind::RSquare);
    case u',':
      return punctuator(JSONTokenKind::Comma);
    case u':':
      return punctuator(JSONTokenKind::Colon);

    // Literal words: scanWord() verifies every character, including the first.
    case u't':
      return scanWord(TrueString, JSONTokenKind::True);
    case u'f':
      return scanWord(FalseString, JSONTokenKind::False);
    case u'n':
      return scanWord(NullString, JSONTokenKind::Null);

      // clang-format off
    case u'-':
    case u'0': case u'1': case u'2': case u'3': case u'4':
    case u'5': case u'6': case u'7': case u'8': case u'9':
      // clang-format on
      return scanNumber();

    case u'"':
      if (forKey) {
        return scanString<StrAsSymbol>();
      }
      return scanString<StrAsValue>();

    default:
      return errorWithChar(u"Unexpected character: ", firstChar);
  }
}

/// Consume the four hexadecimal digits of a `\uXXXX` escape. The stream must
/// be positioned on the first digit (the caller has already stepped past the
/// 'u').
///
/// \return the decoded UTF-16 code unit, or the result of error() /
///   errorWithChar() if the input ends early or a character is not a hex
///   digit.
CallResult<char16_t> JSONLexer::consumeUnicode() {
  uint16_t val = 0;
  for (unsigned i = 0; i < 4; ++i) {
    if (!curCharPtr_.hasChar()) {
      return error("Unexpected end of input");
    }
    const auto ch = *curCharPtr_;
    unsigned digit;
    if (ch >= u'0' && ch <= u'9') {
      // Decimal digits must be tested on the raw character: OR-ing with 32
      // first would map the control characters U+0010..U+0019 onto '0'..'9'
      // and wrongly accept them as hex digits.
      digit = ch - u'0';
    } else {
      // Setting bit 5 folds 'A'..'F' onto 'a'..'f'; no other character maps
      // into that range.
      const int lowered = ch | 32;
      if (lowered < 'a' || lowered > 'f') {
        return errorWithChar(u"Invalid unicode point character: ", ch);
      }
      digit = static_cast<unsigned>(lowered - 'a') + 10;
    }
    val = static_cast<uint16_t>((val << 4) | digit);
    ++curCharPtr_;
  }

  return static_cast<char16_t>(val);
}

/// Scan a JSON number starting at the current character ('-' or a digit) and
/// store its value in token_.
///
/// All consecutive number-like characters are collected first, then checked
/// against the JSON number grammar
///   -? (0 | [1-9][0-9]*) (\. [0-9]+)? ([eE] [+-]? [0-9]+)?
/// before conversion. The conversion routine alone is more lenient than JSON
/// (it accepts e.g. "-01", "1." and "-.5"), so it cannot serve as the
/// validator.
///
/// \return RETURNED on success, otherwise the result of errorWithChar()
///   naming the first offending character (or the last collected character
///   if the number is truncated, as in "1e" or "-").
ExecutionStatus JSONLexer::scanNumber() {
  llvh::SmallVector<char, 32> str8;
  while (curCharPtr_.hasChar()) {
    auto ch = *curCharPtr_;
    if (!(ch == u'-' || ch == u'+' || ch == u'.' || (ch | 32) == u'e' ||
          (ch >= u'0' && ch <= u'9'))) {
      break;
    }
    // Only ASCII characters reach this point, so narrowing is lossless.
    str8.push_back(static_cast<char>(ch));
    ++curCharPtr_;
  }

  const size_t len = str8.size();
  assert(len > 0 && "scanNumber must be called on a number-looking char");

  const auto isDecDigit = [](char c) { return c >= '0' && c <= '9'; };
  // Report the character at \p pos, or the last one if the number ended early.
  const auto badChar = [&](size_t pos) -> ExecutionStatus {
    return errorWithChar(
        u"Unexpected character in number: ", str8[pos < len ? pos : len - 1]);
  };
  const auto skipDigits = [&](size_t pos) {
    while (pos < len && isDecDigit(str8[pos])) {
      ++pos;
    }
    return pos;
  };

  size_t pos = 0;
  if (str8[pos] == '-') {
    ++pos;
  }

  // Integer part: at least one digit is required.
  if (pos >= len || !isDecDigit(str8[pos])) {
    return badChar(pos);
  }
  if (str8[pos] == '0') {
    ++pos;
    if (pos < len && isDecDigit(str8[pos])) {
      // The integer part cannot start with 0, unless it's 0.
      return badChar(pos);
    }
  } else {
    pos = skipDigits(pos);
  }

  // Optional fraction: '.' followed by at least one digit.
  if (pos < len && str8[pos] == '.') {
    ++pos;
    if (pos >= len || !isDecDigit(str8[pos])) {
      return badChar(pos);
    }
    pos = skipDigits(pos);
  }

  // Optional exponent: 'e' or 'E', optional sign, at least one digit.
  if (pos < len && (str8[pos] | 32) == 'e') {
    ++pos;
    if (pos < len && (str8[pos] == '+' || str8[pos] == '-')) {
      ++pos;
    }
    if (pos >= len || !isDecDigit(str8[pos])) {
      return badChar(pos);
    }
    pos = skipDigits(pos);
  }

  // Anything left over (e.g. the "-2" in "1-2") is not part of a number.
  if (pos != len) {
    return badChar(pos);
  }

  str8.push_back('\0');

  char *endPtr;
  double value = ::hermes_g_strtod(str8.data(), &endPtr);
  // Defensive: after validation the whole buffer is expected to be consumed.
  if (endPtr != str8.data() + len) {
    return errorWithChar(u"Unexpected character in number: ", *endPtr);
  }
  token_.setNumber(value);
  return ExecutionStatus::RETURNED;
}

/// Scan a JSON string literal whose opening quote is the current character
/// and store the result in token_.
///
/// \tparam ForKey when ForKey::value is true, the string is looked up through
///   the runtime's identifier table and stored in the token as a symbol;
///   otherwise a StringPrimitive is created and stored as a string.
/// \return RETURNED on success; EXCEPTION if symbol or string creation fails;
///   otherwise the result of error()/errorWithChar() for a control character,
///   an invalid escape, or input that ends before the closing quote.
template <typename ForKey>
ExecutionStatus JSONLexer::scanString() {
  assert(*curCharPtr_ == '"');
  // Step past the opening quote.
  ++curCharPtr_;
  // Becomes true at the first backslash; from then on the decoded text is
  // accumulated in tmpStorage.
  bool hasEscape = false;
  // Ideally we don't have to use tmpStorage. In the case of a plain string with
  // no escapes, we construct an ArrayRef at the end of scanning that points to
  // the beginning and end of the string.
  SmallU16String<32> tmpStorage;
  curCharPtr_.beginCapture();
  // Make sure we don't somehow leave a dangling open capture.
  // The scope-exit runs on every return path, including those where
  // endCapture() was already called (presumably cancelCapture() is a no-op
  // then; confirm against the stream class).
  auto ensureCaptureClosed =
      llvh::make_scope_exit([this] { curCharPtr_.cancelCapture(); });
  // Updated only in the non-key instantiation; passed to
  // createWithKnownEncoding() below.
  bool allAscii = true;
  // Updated only in the key instantiation; passed to getSymbolHandle() below.
  hermes::JenkinsHash hash = hermes::JenkinsHashInit;

  while (curCharPtr_.hasChar()) {
    if (*curCharPtr_ == '"') {
      // End of string.
      // With no escapes the captured input range is used directly; otherwise
      // the decoded copy in tmpStorage is used.
      llvh::ArrayRef<char16_t> strRef =
          hasEscape ? tmpStorage.arrayRef() : curCharPtr_.endCapture();
      // Step past the closing quote.
      ++curCharPtr_;
      if constexpr (ForKey::value) {
        auto symRes = runtime_.getIdentifierTable().getSymbolHandle(
            runtime_, strRef, hash);
        if (symRes == ExecutionStatus::EXCEPTION)
          return ExecutionStatus::EXCEPTION;
        token_.setSymbol(*symRes);
        return ExecutionStatus::RETURNED;
      }
      // Only reached in the non-key instantiation: the key instantiation
      // always returns inside the block above.
      auto strRes =
          StringPrimitive::createWithKnownEncoding(runtime_, strRef, allAscii);
      if (LLVM_UNLIKELY(strRes == ExecutionStatus::EXCEPTION)) {
        return ExecutionStatus::EXCEPTION;
      }
      token_.setString(runtime_.makeHandle<StringPrimitive>(*strRes));
      return ExecutionStatus::RETURNED;
    } else if (*curCharPtr_ <= '\u001F') {
      return error(u"U+0000 thru U+001F is not allowed in string");
    }
    // Placeholder value; every path that reaches the use below assigns it.
    char16_t scannedChar = -1;
    if (*curCharPtr_ == u'\\') {
      if (!hasEscape) {
        // This is the first escape character encountered, so append everything
        // we've seen so far to tmpStorage.
        tmpStorage.append(curCharPtr_.endCapture());
      }
      hasEscape = true;
      // Step past the backslash to the escape selector character.
      ++curCharPtr_;
      if (!curCharPtr_.hasChar()) {
        return error("Unexpected end of input");
      }
      switch (*curCharPtr_) {
        // CONSUME_VAL appends the decoded code unit and steps past the escape
        // selector. NOTE(review): the macro is never #undef'd in this file.
#define CONSUME_VAL(v)     \
  tmpStorage.push_back(v); \
  ++curCharPtr_;

        // These three escapes stand for the selector character itself.
        case u'"':
        case u'/':
        case u'\\':
          CONSUME_VAL(*curCharPtr_)
          break;
        // The remaining single-character escapes map to fixed code units,
        // given here in decimal.
        case 'b':
          CONSUME_VAL(8)
          break;
        case 'f':
          CONSUME_VAL(12)
          break;
        case 'n':
          CONSUME_VAL(10)
          break;
        case 'r':
          CONSUME_VAL(13)
          break;
        case 't':
          CONSUME_VAL(9)
          break;
        case 'u': {
          // Step past the 'u'; consumeUnicode() then reads four hex digits and
          // yields a single UTF-16 code unit.
          ++curCharPtr_;
          CallResult<char16_t> cr = consumeUnicode();
          if (LLVM_UNLIKELY(cr == ExecutionStatus::EXCEPTION)) {
            return ExecutionStatus::EXCEPTION;
          }
          tmpStorage.push_back(*cr);
          break;
        }

        default:
          return errorWithChar(u"Invalid escape sequence: ", *curCharPtr_);
      }
      // Every non-returning case above appended exactly one code unit.
      scannedChar = tmpStorage.back();
    } else {
      // Ordinary character: it only needs copying once tmpStorage is in use;
      // before that it is covered by the open capture.
      scannedChar = *curCharPtr_;
      if (hasEscape)
        tmpStorage.push_back(scannedChar);
      ++curCharPtr_;
    }
    // Fold the decoded code unit into whichever summary this instantiation
    // needs at the end of the string.
    if constexpr (ForKey::value) {
      hash = hermes::updateJenkinsHash(hash, scannedChar);
    } else {
      allAscii &= isASCII(scannedChar);
    }
  }
  // Input ended before the closing quote was found.
  return error("Unexpected end of input");
}

/// Match the literal \p word against the input, starting at the current
/// character, and on success record \p kind in token_.
///
/// \param word NUL-terminated expected spelling.
/// \param kind token kind to store when every character matches.
/// \return RETURNED on a full match; otherwise the result of errorWithChar()
///   for the first mismatching character, or of error() if the input ends
///   before the word is complete.
ExecutionStatus JSONLexer::scanWord(const char *word, JSONTokenKind kind) {
  for (const char *expected = word; *expected != '\0'; ++expected) {
    if (!curCharPtr_.hasChar()) {
      return error(u"Unexpected end of input");
    }
    if (*curCharPtr_ != *expected) {
      return errorWithChar(u"Unexpected character: ", *curCharPtr_);
    }
    ++curCharPtr_;
  }
  token_.setPunctuator(kind);
  return ExecutionStatus::RETURNED;
}

} // namespace vm
} // namespace hermes

Coverage Report

Created: 2025-06-24 06:43

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright (c) Meta Platforms, Inc. and affiliates.
3		*
4		* This source code is licensed under the MIT license found in the
5		* LICENSE file in the root directory of this source tree.
6		*/
7
8		#include "JSONLexer.h"
9
10		#include "hermes/VM/StringPrimitive.h"
11		#include "llvh/ADT/ScopeExit.h"
12
13		#include "dtoa/dtoa.h"
14
15		namespace hermes {
16		namespace vm {
17
18		static const char *TrueString = "true";
19		static const char *FalseString = "false";
20		static const char *NullString = "null";
21
22	0	static bool isJSONWhiteSpace(char16_t ch) {
23		// JSONWhiteSpace includes <TAB>, <CR>, <LF>, <SP>.
24	0	return (ch == u'\t' || ch == u'\r' || ch == u'\n' || ch == u' ');
25	0	}
26
27	0	ExecutionStatus JSONLexer::advance() {
28	0	return advanceHelper(false);
29	0	}
30
31	0	ExecutionStatus JSONLexer::advanceStrAsSymbol() {
32	0	return advanceHelper(true);
33	0	}
34
35	0	ExecutionStatus JSONLexer::advanceHelper(bool forKey) {
36		// Skip whitespaces.
37	0	while (curCharPtr_.hasChar() && isJSONWhiteSpace(*curCharPtr_)) {
38	0	++curCharPtr_;
39	0	}
40
41		// End of buffer.
42	0	if (!curCharPtr_.hasChar()) {
43	0	token_.setEof();
44	0	return ExecutionStatus::RETURNED;
45	0	}
46
47	0	token_.setFirstChar(*curCharPtr_);
48
49	0	#define PUNC(ch, tok) \
50	0	case ch: \
51	0	token_.setPunctuator(tok); \
52	0	++curCharPtr_; \
53	0	return ExecutionStatus::RETURNED
54
55	0	#define WORD(ch, word, tok) \
56	0	case ch: \
57	0	return scanWord(word, tok)
58
59	0	switch (*curCharPtr_) {
60	0	PUNC(u'{', JSONTokenKind::LBrace);
61	0	PUNC(u'}', JSONTokenKind::RBrace);
62	0	PUNC(u'[', JSONTokenKind::LSquare);
63	0	PUNC(u']', JSONTokenKind::RSquare);
64	0	PUNC(u',', JSONTokenKind::Comma);
65	0	PUNC(u':', JSONTokenKind::Colon);
66	0	WORD(u't', TrueString, JSONTokenKind::True);
67	0	WORD(u'f', FalseString, JSONTokenKind::False);
68	0	WORD(u'n', NullString, JSONTokenKind::Null);
69
70		// clang-format off
71	0	case u'-':
72	0	case u'0': case u'1': case u'2': case u'3': case u'4':
73	0	case u'5': case u'6': case u'7': case u'8': case u'9':
74		// clang-format on
75	0	return scanNumber();
76
77	0	case u'"':
78	0	if (forKey) {
79	0	return scanString<StrAsSymbol>();
80	0	} else {
81	0	return scanString<StrAsValue>();
82	0	}
83
84	0	default:
85	0	return errorWithChar(u"Unexpected character: ", *curCharPtr_);
86	0	}
87	0	}
88
89	0	CallResult<char16_t> JSONLexer::consumeUnicode() {
90	0	uint16_t val = 0;
91	0	for (unsigned i = 0; i < 4; ++i) {
92	0	if (!curCharPtr_.hasChar()) {
93	0	return error("Unexpected end of input");
94	0	}
95	0	int ch = *curCharPtr_ | 32;
96	0	if (ch >= '0' && ch <= '9') {
97	0	ch -= '0';
98	0	} else if (ch >= 'a' && ch <= 'f') {
99	0	ch -= 'a' - 10;
100	0	} else {
101	0	return errorWithChar(u"Invalid unicode point character: ", *curCharPtr_);
102	0	}
103	0	val = (val << 4) + ch;
104	0	++curCharPtr_;
105	0	}
106
107	0	return static_cast<char16_t>(val);
108	0	}
109
110	0	ExecutionStatus JSONLexer::scanNumber() {
111	0	llvh::SmallVector<char, 32> str8;
112	0	while (curCharPtr_.hasChar()) {
113	0	auto ch = *curCharPtr_;
114	0	if (!(ch == u'-' || ch == u'+' || ch == u'.' || (ch | 32) == u'e' ||
115	0	(ch >= u'0' && ch <= u'9'))) {
116	0	break;
117	0	}
118	0	str8.push_back(ch);
119	0	++curCharPtr_;
120	0	}
121
122	0	size_t len = str8.size();
123	0	assert(len > 0 && "scanNumber must be called on a number-looking char");
124	0	if (str8[0] == '0' && len > 1 && str8[1] >= '0' && str8[1] <= '9') {
125		// The integer part cannot start with 0, unless it's 0.
126	0	return errorWithChar(u"Unexpected character in number: ", str8[1]);
127	0	}
128
129	0	str8.push_back('\0');
130
131	0	char *endPtr;
132	0	double value = ::hermes_g_strtod(str8.data(), &endPtr);
133	0	if (endPtr != str8.data() + len) {
134	0	return errorWithChar(u"Unexpected character in number: ", *endPtr);
135	0	}
136	0	token_.setNumber(value);
137	0	return ExecutionStatus::RETURNED;
138	0	}
139
140		template <typename ForKey>
141	0	ExecutionStatus JSONLexer::scanString() {
142	0	assert(*curCharPtr_ == '"');
143	0	++curCharPtr_;
144	0	bool hasEscape = false;
145		// Ideally we don't have to use tmpStorage. In the case of a plain string with
146		// no escapes, we construct an ArrayRef at the end of scanning that points to
147		// the beginning and end of the string.
148	0	SmallU16String<32> tmpStorage;
149	0	curCharPtr_.beginCapture();
150		// Make sure we don't somehow leave a dangling open capture.
151	0	auto ensureCaptureClosed =
152	0	llvh::make_scope_exit([this] { curCharPtr_.cancelCapture(); }); Unexecuted instantiation: hermes::vm::JSONLexer::scanString<std::__1::integral_constant<bool, true> >()::{lambda()#1}::operator()() const Unexecuted instantiation: hermes::vm::JSONLexer::scanString<std::__1::integral_constant<bool, false> >()::{lambda()#1}::operator()() const
153	0	bool allAscii = true;
154	0	hermes::JenkinsHash hash = hermes::JenkinsHashInit;
155
156	0	while (curCharPtr_.hasChar()) {
157	0	if (*curCharPtr_ == '"') {
158		// End of string.
159	0	llvh::ArrayRef<char16_t> strRef =
160	0	hasEscape ? tmpStorage.arrayRef() : curCharPtr_.endCapture();
161	0	++curCharPtr_;
162	0	if constexpr (ForKey::value) {
163	0	auto symRes = runtime_.getIdentifierTable().getSymbolHandle(
164	0	runtime_, strRef, hash);
165	0	if (symRes == ExecutionStatus::EXCEPTION)
166	0	return ExecutionStatus::EXCEPTION;
167	0	token_.setSymbol(*symRes);
168	0	return ExecutionStatus::RETURNED;
169	0	}
170	0	auto strRes =
171	0	StringPrimitive::createWithKnownEncoding(runtime_, strRef, allAscii);
172	0	if (LLVM_UNLIKELY(strRes == ExecutionStatus::EXCEPTION)) {
173	0	return ExecutionStatus::EXCEPTION;
174	0	}
175	0	token_.setString(runtime_.makeHandle<StringPrimitive>(*strRes));
176	0	return ExecutionStatus::RETURNED;
177	0	} else if (*curCharPtr_ <= '\u001F') {
178	0	return error(u"U+0000 thru U+001F is not allowed in string");
179	0	}
180	0	char16_t scannedChar = -1;
181	0	if (*curCharPtr_ == u'\\') {
182	0	if (!hasEscape) {
183		// This is the first escape character encountered, so append everything
184		// we've seen so far to tmpStorage.
185	0	tmpStorage.append(curCharPtr_.endCapture());
186	0	}
187	0	hasEscape = true;
188	0	++curCharPtr_;
189	0	if (!curCharPtr_.hasChar()) {
190	0	return error("Unexpected end of input");
191	0	}
192	0	switch (*curCharPtr_) {
193	0	#define CONSUME_VAL(v) \
194	0	tmpStorage.push_back(v); \
195	0	++curCharPtr_;
196
197	0	case u'"':
198	0	case u'/':
199	0	case u'\\':
200	0	CONSUME_VAL(*curCharPtr_)
201	0	break;
202	0	case 'b':
203	0	CONSUME_VAL(8)
204	0	break;
205	0	case 'f':
206	0	CONSUME_VAL(12)
207	0	break;
208	0	case 'n':
209	0	CONSUME_VAL(10)
210	0	break;
211	0	case 'r':
212	0	CONSUME_VAL(13)
213	0	break;
214	0	case 't':
215	0	CONSUME_VAL(9)
216	0	break;
217	0	case 'u': {
218	0	++curCharPtr_;
219	0	CallResult<char16_t> cr = consumeUnicode();
220	0	if (LLVM_UNLIKELY(cr == ExecutionStatus::EXCEPTION)) {
221	0	return ExecutionStatus::EXCEPTION;
222	0	}
223	0	tmpStorage.push_back(*cr);
224	0	break;
225	0	}
226
227	0	default:
228	0	return errorWithChar(u"Invalid escape sequence: ", *curCharPtr_);
229	0	}
230	0	scannedChar = tmpStorage.back();
231	0	} else {
232	0	scannedChar = *curCharPtr_;
233	0	if (hasEscape)
234	0	tmpStorage.push_back(scannedChar);
235	0	++curCharPtr_;
236	0	}
237	0	if constexpr (ForKey::value) {
238	0	hash = hermes::updateJenkinsHash(hash, scannedChar);
239	0	} else {
240	0	allAscii &= isASCII(scannedChar);
241	0	}
242	0	}
243	0	return error("Unexpected end of input");
244	0	} Unexecuted instantiation: hermes::vm::ExecutionStatus hermes::vm::JSONLexer::scanString<std::__1::integral_constant<bool, true> >() Unexecuted instantiation: hermes::vm::ExecutionStatus hermes::vm::JSONLexer::scanString<std::__1::integral_constant<bool, false> >()
245
246	0	ExecutionStatus JSONLexer::scanWord(const char *word, JSONTokenKind kind) {
247	0	while (*word && curCharPtr_.hasChar()) {
248	0	if (*curCharPtr_ != *word) {
249	0	return errorWithChar(u"Unexpected character: ", *curCharPtr_);
250	0	}
251	0	++curCharPtr_;
252	0	++word;
253	0	}
254	0	if (*word) {
255	0	return error(u"Unexpected end of input");
256	0	}
257	0	token_.setPunctuator(kind);
258	0	return ExecutionStatus::RETURNED;
259	0	}
260
261		} // namespace vm
262		} // namespace hermes