/src/hermes/include/hermes/Platform/Unicode/CharacterProperties.h

Source (jump to first uncovered line)
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#ifndef HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H
#define HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H

#include <cassert>
#include <cstdint>
#include <string>
#include <string_view>

#include "llvh/ADT/ArrayRef.h"

namespace hermes {

/// The largest code point; anything above it is rejected by isValidCodePoint.
constexpr uint32_t UNICODE_MAX_VALUE = 0x10FFFF;
/// The first code unit of the surrogate range.
constexpr uint32_t UNICODE_SURROGATE_FIRST = 0xD800;
/// The last code unit of the surrogate range (inclusive).
constexpr uint32_t UNICODE_SURROGATE_LAST = 0xDFFF;
/// The first high (lead) surrogate; used as the base when decoding a pair.
constexpr uint32_t UTF16_HIGH_SURROGATE = 0xD800;
/// The first low (trail) surrogate; high surrogates lie strictly below it.
constexpr uint32_t UTF16_LOW_SURROGATE = 0xDC00;
/// U+FFFD REPLACEMENT CHARACTER.
constexpr uint32_t UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;
/// The last code point of the Basic Multilingual Plane (inclusive).
constexpr uint32_t UNICODE_LAST_BMP = 0xFFFF;

/// U+2028 LINE SEPARATOR.
constexpr uint32_t UNICODE_LINE_SEPARATOR = 0x2028;
/// U+2029 PARAGRAPH SEPARATOR.
constexpr uint32_t UNICODE_PARAGRAPH_SEPARATOR = 0x2029;

/// U+200C ZERO WIDTH NON-JOINER.
constexpr uint32_t UNICODE_ZWNJ = 0x200C;
/// U+200D ZERO WIDTH JOINER.
constexpr uint32_t UNICODE_ZWJ = 0x200D;

/// The maximum number of precanonicalizations of any character.
/// "Precanonicalization" is not a term from the Unicode spec; it refers to the
/// RegExp Canonicalize function given in ES5.1 15.10.2.8. Most characters are
/// canonicalized to only by themselves or by their lowercase variant; the
/// handful of exceptions is what this bound accounts for.
constexpr uint32_t UNICODE_MAX_PRECANONICALIZATIONS = 3;

/// \return whether \p cp is at most UNICODE_MAX_VALUE and lies outside the
/// surrogate range [UNICODE_SURROGATE_FIRST, UNICODE_SURROGATE_LAST].
inline bool isValidCodePoint(uint32_t cp) {
  if (cp > UNICODE_MAX_VALUE)
    return false;
  const bool inSurrogateRange =
      cp >= UNICODE_SURROGATE_FIRST && cp <= UNICODE_SURROGATE_LAST;
  return !inSurrogateRange;
}

/// \return whether \p cp lies in the Basic Multilingual Plane, i.e. does not
/// exceed UNICODE_LAST_BMP. Surrogate code units count as BMP members.
inline bool isMemberOfBMP(uint32_t cp) {
  const bool beyondBMP = cp > UNICODE_LAST_BMP;
  return !beyondBMP;
}

/// \return whether \p cp is a high (lead) surrogate, i.e. lies in the
/// half-open range [UNICODE_SURROGATE_FIRST, UTF16_LOW_SURROGATE).
inline bool isHighSurrogate(uint32_t cp) {
  if (cp < UNICODE_SURROGATE_FIRST)
    return false;
  return cp < UTF16_LOW_SURROGATE;
}

/// \return whether \p cp is a low (trail) surrogate, i.e. lies in the closed
/// range [UTF16_LOW_SURROGATE, UNICODE_SURROGATE_LAST].
inline bool isLowSurrogate(uint32_t cp) {
  if (cp < UTF16_LOW_SURROGATE)
    return false;
  return cp <= UNICODE_SURROGATE_LAST;
}

//===----------------------------------------------------------------------===//
// ES14 11.1.3

/// Combine the UTF-16 surrogate pair [\p lead, \p trail] into the code point
/// it encodes.
/// \pre \p lead is a high surrogate and \p trail is a low surrogate; this is
/// only checked by an assertion.
/// \return the decoded code point.
inline uint32_t utf16SurrogatePairToCodePoint(uint32_t lead, uint32_t trail) {
  assert(
      isHighSurrogate(lead) && isLowSurrogate(trail) && "Not a surrogate pair");
  // The lead surrogate supplies the bits above the low ten; the trail
  // surrogate supplies the low ten bits.
  const uint32_t highBits = (lead - UTF16_HIGH_SURROGATE) << 10;
  const uint32_t lowBits = trail - UTF16_LOW_SURROGATE;
  // The 0x10000 offset is added on top of the combined surrogate payload.
  return 0x10000 + highBits + lowBits;
}

/// \return true if \p cp is not ASCII and is a Unicode letter.
bool isUnicodeOnlyLetter(uint32_t cp);
/// \return true if \p cp is not ASCII and is a Unicode space.
bool isUnicodeOnlySpace(uint32_t cp);
/// \return true if \p cp is in the Non-Spacing Mark or Combining-Spacing Mark
/// categories.
bool isUnicodeCombiningMark(uint32_t cp);
/// \return true if \p cp is in the Decimal Number category.
bool isUnicodeDigit(uint32_t cp);
/// \return true if \p cp is in the Connector Punctuation category.
bool isUnicodeConnectorPunctuation(uint32_t cp);

/// \return true if \p ch is an ASCII character accepted at the start of an
/// identifier: a letter in [A-Za-z], '_' or '$'.
inline bool isASCIIIdentifierStart(uint32_t ch) {
  // Setting bit 5 maps 'A'..'Z' onto 'a'..'z', so one range check covers both
  // cases.
  const uint32_t lowered = ch | 32;
  const bool isLetter = lowered >= 'a' && lowered <= 'z';
  return ch == '_' || ch == '$' || isLetter;
}

/// \return true if \p cp may start an identifier: either it is an ASCII
/// identifier start, or it is a non-ASCII Unicode letter.
inline bool isUnicodeIDStart(uint32_t cp) {
  // Try the inline ASCII check first; only fall back to the out-of-line
  // Unicode lookup when it fails.
  if (isASCIIIdentifierStart(cp))
    return true;
  return isUnicodeOnlyLetter(cp);
}

/// \return true if \p cp may continue an identifier: it is an identifier
/// start, a combining mark, a digit, a connector punctuation character, or one
/// of the zero-width joiner characters (ZWNJ / ZWJ).
inline bool isUnicodeIDContinue(uint32_t cp) {
  // TODO: clearly this has to be optimized somehow
  if (isUnicodeIDStart(cp) || isUnicodeCombiningMark(cp))
    return true;
  if (isUnicodeDigit(cp) || isUnicodeConnectorPunctuation(cp))
    return true;
  return cp == UNICODE_ZWNJ || cp == UNICODE_ZWJ;
}

/// \return true if \p ch may appear in a Unicode property name, i.e. it is
/// '_' or an ASCII letter in [A-Za-z].
inline bool isUnicodePropertyName(uint32_t ch) {
  if (ch == '_')
    return true;
  // Setting bit 5 folds 'A'..'Z' onto 'a'..'z'.
  const uint32_t folded = ch | 32;
  return folded >= 'a' && folded <= 'z';
}

/// \return true if \p ch may appear in a Unicode property value: any
/// character accepted in a property name, or a Unicode digit.
inline bool isUnicodePropertyValue(uint32_t ch) {
  if (isUnicodePropertyName(ch))
    return true;
  return isUnicodeDigit(ch);
}

/// \return the canonicalized value of \p cp, following ES9 21.2.2.8.2.
/// NOTE(review): \p unicode presumably corresponds to the RegExp Unicode flag
/// consulted by the spec's Canonicalize operation; confirm against the
/// implementation.
uint32_t canonicalize(uint32_t cp, bool unicode);

// Forward declaration; the class is not defined in this header.
class CodePointSet;
/// \return a set containing all characters which are canonically equivalent to
/// any character in \p set, following ES9 21.2.2.8.2.
/// NOTE(review): \p unicode is presumably the same flag taken by
/// canonicalize(); confirm against the implementation.
CodePointSet makeCanonicallyEquivalent(const CodePointSet &set, bool unicode);

// Forward declaration; in this header it is only used as the element type of
// llvh::ArrayRef.
struct UnicodeRangePoolRef;

/// Create a codepoint range array from a Unicode \p propertyName and \p
/// propertyValue.
/// \return a non-owning view (llvh::ArrayRef) of the matching range entries.
llvh::ArrayRef<UnicodeRangePoolRef> unicodePropertyRanges(
    std::string_view propertyName,
    std::string_view propertyValue);

/// Add a codepoint range array of codepoints to \p receiver, typically used in
/// conjunction with unicodePropertyRanges.
/// NOTE(review): \p inverted presumably requests that the complement of the
/// ranges be added instead; confirm against the implementation.
void addRangeArrayPoolToBracket(
    CodePointSet *receiver,
    const llvh::ArrayRef<UnicodeRangePoolRef> rangeArrayPool,
    bool inverted);

} // namespace hermes

#endif // HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright (c) Meta Platforms, Inc. and affiliates.
3		*
4		* This source code is licensed under the MIT license found in the
5		* LICENSE file in the root directory of this source tree.
6		*/
7
8		#ifndef HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H
9		#define HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H
10
11		#include <cassert>
12		#include <cstdint>
13		#include <string>
14
15		#include "llvh/ADT/ArrayRef.h"
16
17		namespace hermes {
18
19		const uint32_t UNICODE_MAX_VALUE = 0x10FFFF;
20		/// The start of the surrogate range.
21		const uint32_t UNICODE_SURROGATE_FIRST = 0xD800;
22		/// The last character of the surrogate range (inclusive).
23		const uint32_t UNICODE_SURROGATE_LAST = 0xDFFF;
24		const uint32_t UTF16_HIGH_SURROGATE = 0xD800;
25		const uint32_t UTF16_LOW_SURROGATE = 0xDC00;
26		const uint32_t UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;
27		/// The last member of the BMP.
28		const uint32_t UNICODE_LAST_BMP = 0xFFFF;
29
30		const uint32_t UNICODE_LINE_SEPARATOR = 0x2028;
31		const uint32_t UNICODE_PARAGRAPH_SEPARATOR = 0x2029;
32
33		const uint32_t UNICODE_ZWNJ = 0x200C;
34		const uint32_t UNICODE_ZWJ = 0x200D;
35
36		/// The maximum number of precanonicalizations of any character.
37		/// Precanonicalization is not a term from the Unicode spec; rather it refers to
38		/// the RegExp Canonicalize function given in ES5.1 15.10.2.8. Most characters
39		/// are either canonicalized to by themselves or their lowercase variant;
40		/// there's a handful of exceptions which are tracked here.
41		const uint32_t UNICODE_MAX_PRECANONICALIZATIONS = 3;
42
43	0	inline bool isValidCodePoint(uint32_t cp) {
44	0	return !(
45	0	(cp >= UNICODE_SURROGATE_FIRST && cp <= UNICODE_SURROGATE_LAST) \|\|
46	0	cp > UNICODE_MAX_VALUE);
47	0	}
48
49		/// \return whether \p cp is part of the Basic Multilingual Plane.
50		/// Surrogate characters are considered part of the BMP.
51	156	inline bool isMemberOfBMP(uint32_t cp) {
52	156	return cp <= UNICODE_LAST_BMP;
53	156	}
54
55		/// \return whether cp is a high surrogate.
56	181	inline bool isHighSurrogate(uint32_t cp) {
57	181	return UNICODE_SURROGATE_FIRST <= cp && cp < UTF16_LOW_SURROGATE;
58	181	}
59
60		/// \return whether cp is a low surrogate.
61	195	inline bool isLowSurrogate(uint32_t cp) {
62	195	return UTF16_LOW_SURROGATE <= cp && cp <= UNICODE_SURROGATE_LAST;
63	195	}
64
65		//===----------------------------------------------------------------------===//
66		// ES14 11.1.3
67
68		/// Decode a surrogate pair [\p lead, \p trail] into a code point.
69	14	inline uint32_t utf16SurrogatePairToCodePoint(uint32_t lead, uint32_t trail) {
70	14	assert(
71	14	isHighSurrogate(lead) && isLowSurrogate(trail) && "Not a surrogate pair");
72	14	return ((lead - UTF16_HIGH_SURROGATE) << 10) + (trail - UTF16_LOW_SURROGATE) +
73	14	0x10000;
74	14	}
75
76		/// \return true if the codepoint is not ASCII and is a Unicode letter.
77		bool isUnicodeOnlyLetter(uint32_t cp);
78		/// \return true if the codepoint is not ASCII and is a Unicode space.
79		bool isUnicodeOnlySpace(uint32_t cp);
80		/// \return true if the codepoint is in the Non-Spacing Mark or
81		/// Combining-Spacing Mark categories.
82		bool isUnicodeCombiningMark(uint32_t cp);
83		/// \return true if the codepoint is in the Decimal Number category.
84		bool isUnicodeDigit(uint32_t cp);
85		/// \return true if the codepoint is in the Connector Punctuation category.
86		bool isUnicodeConnectorPunctuation(uint32_t cp);
87
88		/// \return true if the codepoint has the ID_Start property and is ASCII.
89	13.4k	inline bool isASCIIIdentifierStart(uint32_t ch) {
90	13.4k	return ch == '_' \|\| ch == '$' \|\| ((ch \| 32) >= 'a' && (ch \| 32) <= 'z');
91	13.4k	}
92
93		/// \return true if the codepoint has the ID_Start property.
94	13.3k	inline bool isUnicodeIDStart(uint32_t cp) {
95	13.3k	return isASCIIIdentifierStart(cp) \|\| isUnicodeOnlyLetter(cp);
96	13.3k	}
97
98		/// \return true if the codepoint has the ID_Continue property.
99	13.3k	inline bool isUnicodeIDContinue(uint32_t cp) {
100		// TODO: clearly this has to be optimized somehow
101	13.3k	return isUnicodeIDStart(cp) \|\| isUnicodeCombiningMark(cp) \|\|
102	13.3k	isUnicodeDigit(cp) \|\| isUnicodeConnectorPunctuation(cp) \|\|
103	13.3k	cp == UNICODE_ZWNJ \|\| cp == UNICODE_ZWJ;
104	13.3k	}
105
106		/// \return true if the codepoint is valid in a unicode property name
107	0	inline bool isUnicodePropertyName(uint32_t ch) {
108	0	return ch == '_' \|\| ((ch \| 32) >= 'a' && (ch \| 32) <= 'z');
109	0	}
110
111		/// \return true if the codepoint is valid in a unicode property value
112	0	inline bool isUnicodePropertyValue(uint32_t ch) {
113	0	return isUnicodePropertyName(ch) \|\| isUnicodeDigit(ch);
114	0	}
115
116		/// \return the canonicalized value of \p cp, following ES9 21.2.2.8.2.
117		uint32_t canonicalize(uint32_t cp, bool unicode);
118
119		class CodePointSet;
120		/// \return a set containing all characters which are canonically equivalent to
121		/// any character in \p set, following ES9 21.2.2.8.2.
122		CodePointSet makeCanonicallyEquivalent(const CodePointSet &set, bool unicode);
123
124		struct UnicodeRangePoolRef;
125
126		// Create a codepoint range array from a Unicode \p propertyName and \p
127		// propertyValue.
128		llvh::ArrayRef<UnicodeRangePoolRef> unicodePropertyRanges(
129		std::string_view propertyName,
130		std::string_view propertyValue);
131
132		/// Add a codepoint range array of codepoints to \p receiver, typically used in
133		/// conjuction with unicodePropertyRanges.
134		void addRangeArrayPoolToBracket(
135		CodePointSet *receiver,
136		const llvh::ArrayRef<UnicodeRangePoolRef> rangeArrayPool,
137		bool inverted);
138
139		} // namespace hermes
140
141		#endif // HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H

Coverage Report

Created: 2025-08-28 06:48