Coverage Report

Created: 2025-08-28 06:48

/src/hermes/include/hermes/Platform/Unicode/CharacterProperties.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) Meta Platforms, Inc. and affiliates.
3
 *
4
 * This source code is licensed under the MIT license found in the
5
 * LICENSE file in the root directory of this source tree.
6
 */
7
8
#ifndef HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H
9
#define HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H
10
11
#include <cassert>
12
#include <cstdint>
13
#include <string>
14
15
#include "llvh/ADT/ArrayRef.h"
16
17
namespace hermes {
18
19
/// The largest valid Unicode code point.
constexpr uint32_t UNICODE_MAX_VALUE = 0x10FFFF;
/// First code point of the surrogate range.
constexpr uint32_t UNICODE_SURROGATE_FIRST = 0xD800;
/// Final code point of the surrogate range (inclusive).
constexpr uint32_t UNICODE_SURROGATE_LAST = 0xDFFF;
/// First UTF-16 high (lead) surrogate.
constexpr uint32_t UTF16_HIGH_SURROGATE = 0xD800;
/// First UTF-16 low (trail) surrogate.
constexpr uint32_t UTF16_LOW_SURROGATE = 0xDC00;
/// U+FFFD REPLACEMENT CHARACTER.
constexpr uint32_t UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;
/// Final code point of the Basic Multilingual Plane.
constexpr uint32_t UNICODE_LAST_BMP = 0xFFFF;

/// U+2028 LINE SEPARATOR.
constexpr uint32_t UNICODE_LINE_SEPARATOR = 0x2028;
/// U+2029 PARAGRAPH SEPARATOR.
constexpr uint32_t UNICODE_PARAGRAPH_SEPARATOR = 0x2029;

/// U+200C ZERO WIDTH NON-JOINER.
constexpr uint32_t UNICODE_ZWNJ = 0x200C;
/// U+200D ZERO WIDTH JOINER.
constexpr uint32_t UNICODE_ZWJ = 0x200D;

/// Upper bound on the number of precanonicalizations any character can have.
/// "Precanonicalization" is not Unicode terminology; it refers to the RegExp
/// Canonicalize function given in ES5.1 15.10.2.8. For most characters the
/// only precanonicalizations are the character itself and its lowercase
/// variant; the handful of exceptions are what this bound accounts for.
constexpr uint32_t UNICODE_MAX_PRECANONICALIZATIONS = 3;
42
43
0
inline bool isValidCodePoint(uint32_t cp) {
44
0
  return !(
45
0
      (cp >= UNICODE_SURROGATE_FIRST && cp <= UNICODE_SURROGATE_LAST) ||
46
0
      cp > UNICODE_MAX_VALUE);
47
0
}
48
49
/// \return whether \p cp is part of the Basic Multilingual Plane.
50
/// Surrogate characters are considered part of the BMP.
51
156
inline bool isMemberOfBMP(uint32_t cp) {
52
156
  return cp <= UNICODE_LAST_BMP;
53
156
}
54
55
/// \return whether cp is a high surrogate.
56
181
inline bool isHighSurrogate(uint32_t cp) {
57
181
  return UNICODE_SURROGATE_FIRST <= cp && cp < UTF16_LOW_SURROGATE;
58
181
}
59
60
/// \return whether cp is a low surrogate.
61
195
inline bool isLowSurrogate(uint32_t cp) {
62
195
  return UTF16_LOW_SURROGATE <= cp && cp <= UNICODE_SURROGATE_LAST;
63
195
}
64
65
//===----------------------------------------------------------------------===//
66
// ES14 11.1.3
67
68
/// Decode a surrogate pair [\p lead, \p trail] into a code point.
69
14
inline uint32_t utf16SurrogatePairToCodePoint(uint32_t lead, uint32_t trail) {
70
14
  assert(
71
14
      isHighSurrogate(lead) && isLowSurrogate(trail) && "Not a surrogate pair");
72
14
  return ((lead - UTF16_HIGH_SURROGATE) << 10) + (trail - UTF16_LOW_SURROGATE) +
73
14
      0x10000;
74
14
}
75
76
/// \return true if \p cp is a Unicode letter outside the ASCII range.
bool isUnicodeOnlyLetter(uint32_t cp);

/// \return true if \p cp is a Unicode space outside the ASCII range.
bool isUnicodeOnlySpace(uint32_t cp);

/// \return true if \p cp belongs to the Non-Spacing Mark (Mn) or
/// Combining-Spacing Mark (Mc) general category.
bool isUnicodeCombiningMark(uint32_t cp);

/// \return true if \p cp belongs to the Decimal Number (Nd) general category.
bool isUnicodeDigit(uint32_t cp);

/// \return true if \p cp belongs to the Connector Punctuation (Pc) general
/// category.
bool isUnicodeConnectorPunctuation(uint32_t cp);
87
88
/// \return true if \p ch is an ASCII character that may begin an identifier:
/// a letter 'A'..'Z' / 'a'..'z', an underscore, or a dollar sign.
inline bool isASCIIIdentifierStart(uint32_t ch) {
  if (ch == '_' || ch == '$')
    return true;
  // Setting bit 5 folds 'A'..'Z' onto 'a'..'z', so one range check covers
  // both cases.
  const uint32_t folded = ch | 32;
  return folded >= 'a' && folded <= 'z';
}
92
93
/// \return true if the codepoint has the ID_Start property.
94
13.3k
inline bool isUnicodeIDStart(uint32_t cp) {
95
13.3k
  return isASCIIIdentifierStart(cp) || isUnicodeOnlyLetter(cp);
96
13.3k
}
97
98
/// \return true if the codepoint has the ID_Continue property.
99
13.3k
inline bool isUnicodeIDContinue(uint32_t cp) {
100
  // TODO: clearly this has to be optimized somehow
101
13.3k
  return isUnicodeIDStart(cp) || isUnicodeCombiningMark(cp) ||
102
13.3k
      isUnicodeDigit(cp) || isUnicodeConnectorPunctuation(cp) ||
103
13.3k
      cp == UNICODE_ZWNJ || cp == UNICODE_ZWJ;
104
13.3k
}
105
106
/// \return true if \p ch may appear in a Unicode property name: an ASCII
/// letter ('A'..'Z', 'a'..'z') or an underscore.
inline bool isUnicodePropertyName(uint32_t ch) {
  if (ch == '_')
    return true;
  // Setting bit 5 folds 'A'..'Z' onto 'a'..'z', so one range check covers
  // both cases.
  const uint32_t folded = ch | 32;
  return folded >= 'a' && folded <= 'z';
}

/// \return true if \p ch may appear in a Unicode property value: any
/// property name character or an ASCII decimal digit '0'..'9'.
///
/// Only ASCII digits are accepted. The ECMAScript grammar defines
/// UnicodePropertyValueCharacter as a property name character or a
/// DecimalDigit, so digits from other scripts (which belong to the Unicode
/// Decimal Number category) must not be treated as part of a property value.
inline bool isUnicodePropertyValue(uint32_t ch) {
  return isUnicodePropertyName(ch) || (ch >= '0' && ch <= '9');
}
115
116
/// \return the canonical form of \p cp as defined by ES9 21.2.2.8.2.
/// NOTE(review): \p unicode is presumably the spec algorithm's unicode-mode
/// flag; the definition is not visible from this header, so confirm.
uint32_t canonicalize(uint32_t cp, bool unicode);

class CodePointSet;

/// \return a set holding every character that is canonically equivalent to
/// some character in \p set, following ES9 21.2.2.8.2.
CodePointSet makeCanonicallyEquivalent(const CodePointSet &set, bool unicode);
123
124
struct UnicodeRangePoolRef;
125
126
// Create a codepoint range array from a Unicode \p propertyName and \p
127
// propertyValue.
128
llvh::ArrayRef<UnicodeRangePoolRef> unicodePropertyRanges(
129
    std::string_view propertyName,
130
    std::string_view propertyValue);
131
132
/// Add a codepoint range array of codepoints to \p receiver, typically used in
133
/// conjuction with unicodePropertyRanges.
134
void addRangeArrayPoolToBracket(
135
    CodePointSet *receiver,
136
    const llvh::ArrayRef<UnicodeRangePoolRef> rangeArrayPool,
137
    bool inverted);
138
139
} // namespace hermes
140
141
#endif // HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H