/src/hermes/include/hermes/Regex/RegexTypes.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | // -*- C++ -*- |
9 | | //===--------------------------- regex ------------------------------------===// |
10 | | // |
11 | | // The LLVM Compiler Infrastructure |
12 | | // |
13 | | // This file is dual licensed under the MIT and the University of Illinois Open |
14 | | // Source Licenses. See LICENSE.TXT for details. |
15 | | // |
16 | | //===----------------------------------------------------------------------===// |
17 | | |
18 | | #ifndef HERMES_REGEX_TYPES_H |
19 | | #define HERMES_REGEX_TYPES_H |
20 | | |
21 | | #include "llvh/ADT/ArrayRef.h" |
22 | | #include "llvh/ADT/SmallString.h" |
23 | | |
24 | | namespace hermes { |
25 | | |
26 | | struct UnicodeRangePoolRef; |
27 | | |
28 | | namespace regex { |
29 | | namespace constants { |
30 | | |
31 | | // MatchFlagType |
32 | | |
33 | | enum MatchFlagType { |
34 | | /// Default match options. |
35 | | matchDefault = 0, |
36 | | |
37 | | /// ^ anchors should not treat the input start as a line start. |
38 | | matchNotBeginningOfLine = 1 << 0, |
39 | | |
40 | | /// $ anchors should not treat the input end as a line end. |
41 | | matchNotEndOfLine = 1 << 1, |
42 | | |
43 | | /// Hint that the input is composed entirely of ASCII characters. |
44 | | matchInputAllAscii = 1 << 2, |
45 | | |
46 | | /// Do not search for a match past the search start location. |
47 | | matchOnlyAtStart = 1 << 3, |
48 | | }; |
49 | | |
50 | 0 | inline constexpr MatchFlagType operator~(MatchFlagType x) { |
51 | 0 | return MatchFlagType(~int(x) & 0x0FFF); |
52 | 0 | } |
53 | | |
54 | 0 | inline constexpr MatchFlagType operator&(MatchFlagType x, MatchFlagType y) { |
55 | 0 | return MatchFlagType(int(x) & int(y)); |
56 | 0 | } |
57 | | |
58 | 0 | inline constexpr MatchFlagType operator|(MatchFlagType x, MatchFlagType y) { |
59 | 0 | return MatchFlagType(int(x) | int(y)); |
60 | 0 | } |
61 | | |
62 | 0 | inline constexpr MatchFlagType operator^(MatchFlagType x, MatchFlagType y) { |
63 | 0 | return MatchFlagType(int(x) ^ int(y)); |
64 | 0 | } |
65 | | |
66 | 0 | inline MatchFlagType &operator&=(MatchFlagType &x, MatchFlagType y) { |
67 | 0 | x = x & y; |
68 | 0 | return x; |
69 | 0 | } |
70 | | |
71 | 0 | inline MatchFlagType &operator|=(MatchFlagType &x, MatchFlagType y) { |
72 | 0 | x = x | y; |
73 | 0 | return x; |
74 | 0 | } |
75 | | |
76 | 0 | inline MatchFlagType &operator^=(MatchFlagType &x, MatchFlagType y) { |
77 | 0 | x = x ^ y; |
78 | 0 | return x; |
79 | 0 | } |
80 | | |
81 | | enum class ErrorType { |
82 | | /// No error occurred. |
83 | | None, |
84 | | |
85 | | /// An escaped value would overflow: /\xFFFFFFFFFFF/ |
86 | | EscapeOverflow, |
87 | | |
88 | | /// incomplete escape: new RegExp("\\") |
89 | | EscapeIncomplete, |
90 | | |
91 | | /// Invalid escape: new RegExp("\\123", "u") |
92 | | EscapeInvalid, |
93 | | |
94 | | /// Mismatched [ and ]. |
95 | | UnbalancedBracket, |
96 | | |
97 | | /// Mismatched ( and ). |
98 | | UnbalancedParenthesis, |
99 | | |
100 | | /// Braces have valid syntax, but the range is invalid, such as {5,3}. |
101 | | BraceRange, |
102 | | |
103 | | /// Invalid character range, such as [b-a]. |
104 | | CharacterRange, |
105 | | |
106 | | /// A lone { or } was found. |
107 | | InvalidQuantifierBracket, |
108 | | |
109 | | /// One of *?+{ was not preceded by a valid regular expression. |
110 | | InvalidRepeat, |
111 | | |
112 | | /// The pattern exceeded internal limits, such as capture group or loop count. |
113 | | PatternExceedsParseLimits, |
114 | | |
115 | | /// The flags supplied were either invalid or contained repetition. |
116 | | InvalidFlags, |
117 | | |
118 | | /// The name for a capture group is invalid e.g. 1id |
119 | | InvalidCaptureGroupName, |
120 | | |
121 | | /// Duplicate capture group name. |
122 | | DuplicateCaptureGroupName, |
123 | | |
124 | | /// Syntactically invalid named backreference. |
125 | | InvalidNamedReference, |
126 | | |
127 | | /// Reference to nonexistent capture group. |
128 | | NonexistentNamedCaptureReference, |
129 | | |
130 | | /// Invalid Unicode property name or value |
131 | | InvalidPropertyName, |
132 | | }; |
133 | | |
134 | | /// \return an error message for the given \p error. |
135 | 2 | inline const char *messageForError(ErrorType error) { |
136 | 2 | switch (error) { |
137 | 0 | case ErrorType::EscapeOverflow: |
138 | 0 | return "Escaped value too large"; |
139 | 0 | case ErrorType::EscapeIncomplete: |
140 | 0 | return "Incomplete escape"; |
141 | 0 | case ErrorType::EscapeInvalid: |
142 | 0 | return "Invalid escape"; |
143 | 0 | case ErrorType::UnbalancedBracket: |
144 | 0 | return "Character class not closed"; |
145 | 0 | case ErrorType::UnbalancedParenthesis: |
146 | 0 | return "Parenthesized expression not closed"; |
147 | 0 | case ErrorType::BraceRange: |
148 | 0 | return "Quantifier range out of order"; |
149 | 0 | case ErrorType::CharacterRange: |
150 | 0 | return "Character class range out of order"; |
151 | 0 | case ErrorType::InvalidQuantifierBracket: |
152 | 0 | return "Invalid quantifier bracket"; |
153 | 1 | case ErrorType::InvalidRepeat: |
154 | 1 | return "Quantifier has nothing to repeat"; |
155 | 0 | case ErrorType::PatternExceedsParseLimits: |
156 | 0 | return "Pattern exceeds parse limits"; |
157 | 1 | case ErrorType::InvalidFlags: |
158 | 1 | return "Invalid flags"; |
159 | 0 | case ErrorType::InvalidCaptureGroupName: |
160 | 0 | return "Invalid capture group name"; |
161 | 0 | case ErrorType::DuplicateCaptureGroupName: |
162 | 0 | return "Duplicate capture group name"; |
163 | 0 | case ErrorType::InvalidNamedReference: |
164 | 0 | return "Invalid named reference"; |
165 | 0 | case ErrorType::NonexistentNamedCaptureReference: |
166 | 0 | return "Nonexistent named capture reference"; |
167 | 0 | case ErrorType::InvalidPropertyName: |
168 | 0 | return "Invalid property name"; |
169 | 0 | case ErrorType::None: |
170 | 0 | return "No error"; |
171 | 2 | } |
172 | 2 | llvm_unreachable("Unknown error"); |
173 | 0 | return nullptr; |
174 | 2 | } |
175 | | |
176 | | /// Maximum number of supported capture groups. |
177 | | /// This is therefore also the maximum valid backreference. |
178 | | constexpr uint16_t kMaxCaptureGroupCount = 65535; |
179 | | |
180 | | /// Maximum number of supported loops. |
181 | | constexpr uint16_t kMaxLoopCount = 65535; |
182 | | |
183 | | } // namespace constants |
184 | | |
185 | | /// After compiling a regex, there are certain properties we can test for that |
186 | | /// enable us to quickly rule out matches. We refer to these as |
187 | | /// MatchConstraints: they constrain the strings that may match the regex. |
188 | | enum MatchConstraintFlags : uint8_t { |
189 | | /// If set, ASCII strings can never match because we require at least one |
190 | | /// non-ASCII character. |
191 | | MatchConstraintNonASCII = 1 << 0, |
192 | | |
193 | | /// If set, the regex can only match at the beginning of the input string, due |
194 | | /// to ^ anchors. |
195 | | MatchConstraintAnchoredAtStart = 1 << 1, |
196 | | |
197 | | /// If set, the regex cannot possibly match an empty string, e.g. /a/ |
198 | | MatchConstraintNonEmpty = 1 << 2, |
199 | | }; |
200 | | |
201 | | /// \return whether a code point \p cp is ASCII. |
202 | 3.74M | inline bool isASCII(uint32_t c) { |
203 | 3.74M | return c <= 127; |
204 | 3.74M | } |
205 | | |
206 | | // Type wrapping up a character class, like \d or \S. |
207 | | struct CharacterClass { |
208 | | enum Type : uint8_t { |
209 | | Digits = 1 << 0, // \d \D |
210 | | Spaces = 1 << 1, // \s \S |
211 | | Words = 1 << 2, // \w \W |
212 | | } type_; |
213 | | |
214 | | // Whether the class is inverted (\D instead of \d). |
215 | | bool inverted_; |
216 | | |
217 | 45.4k | CharacterClass(Type type, bool invert) : type_(type), inverted_(invert) {} |
218 | | }; |
219 | | |
220 | | // Type wrapping up a Unicode codepoint range array. |
221 | | struct CharacterClassCodepointRanges { |
222 | | llvh::ArrayRef<UnicodeRangePoolRef> rangeArray; |
223 | | |
224 | | // Whether the class is inverted (\P instead of \p). |
225 | | bool inverted_; |
226 | | |
227 | | CharacterClassCodepointRanges( |
228 | | llvh::ArrayRef<UnicodeRangePoolRef> rangeArray, |
229 | | bool inverted) |
230 | 0 | : rangeArray(rangeArray), inverted_(inverted) {} |
231 | | }; |
232 | | |
233 | | // Struct representing flags which may be used when constructing the RegExp |
234 | | class SyntaxFlags { |
235 | | private: |
236 | | // Note these are encoded into bytecode files, so changing their values is a |
237 | | // breaking change. |
238 | | enum : uint8_t { |
239 | | ICASE = 1 << 0, |
240 | | GLOBAL = 1 << 1, |
241 | | MULTILINE = 1 << 2, |
242 | | UCODE = 1 << 3, |
243 | | DOTALL = 1 << 4, |
244 | | STICKY = 1 << 5, |
245 | | INDICES = 1 << 6, |
246 | | }; |
247 | | |
248 | | public: |
249 | | // Note: Preserving the order here and ensuring it lines up with the order of |
250 | | // the offsets above makes the generated assembly more efficient. |
251 | | // Specifically, it makes the conversion to/from a byte almost free. |
252 | | uint8_t ignoreCase : 1; |
253 | | uint8_t global : 1; |
254 | | uint8_t multiline : 1; |
255 | | uint8_t unicode : 1; |
256 | | uint8_t dotAll : 1; |
257 | | uint8_t sticky : 1; |
258 | | uint8_t hasIndices : 1; |
259 | | |
260 | | /// \return a byte representing the flags. Bits are set based on the offsets |
261 | | /// specified above. This is used for serialising the flags to bytecode. |
262 | 1.82k | uint8_t toByte() const { |
263 | 1.82k | uint8_t ret = 0; |
264 | 1.82k | if (global) |
265 | 2 | ret |= GLOBAL; |
266 | 1.82k | if (ignoreCase) |
267 | 1 | ret |= ICASE; |
268 | 1.82k | if (multiline) |
269 | 2 | ret |= MULTILINE; |
270 | 1.82k | if (unicode) |
271 | 1 | ret |= UCODE; |
272 | 1.82k | if (sticky) |
273 | 323 | ret |= STICKY; |
274 | 1.82k | if (dotAll) |
275 | 0 | ret |= DOTALL; |
276 | 1.82k | if (hasIndices) |
277 | 0 | ret |= INDICES; |
278 | 1.82k | return ret; |
279 | 1.82k | } |
280 | | |
281 | | /// \return a SyntaxFlags struct generated from the given byte. Bit offsets |
282 | | /// are specified above. This is used for deserialising the flags from |
283 | | /// bytecode. |
284 | 1.04k | static SyntaxFlags fromByte(uint8_t byte) { |
285 | 1.04k | SyntaxFlags ret = {}; |
286 | 1.04k | if (byte & GLOBAL) |
287 | 1 | ret.global = 1; |
288 | 1.04k | if (byte & ICASE) |
289 | 1 | ret.ignoreCase = 1; |
290 | 1.04k | if (byte & MULTILINE) |
291 | 0 | ret.multiline = 1; |
292 | 1.04k | if (byte & UCODE) |
293 | 0 | ret.unicode = 1; |
294 | 1.04k | if (byte & STICKY) |
295 | 0 | ret.sticky = 1; |
296 | 1.04k | if (byte & DOTALL) |
297 | 0 | ret.dotAll = 1; |
298 | 1.04k | if (byte & INDICES) |
299 | 0 | ret.hasIndices = 1; |
300 | 1.04k | return ret; |
301 | 1.04k | } |
302 | | |
303 | | /// \return a string representing the flags |
304 | | /// The characters are returned in the order given in ES2022 22.2.5.4 |
305 | | /// (specifically hasIndices, global, ignoreCase, multiline, dotAll, unicode, |
306 | | /// sticky) Note this may differ in order from the string passed in |
307 | | /// construction |
308 | 0 | llvh::SmallString<7> toString() const { |
309 | 0 | llvh::SmallString<7> result; |
310 | 0 | if (hasIndices) |
311 | 0 | result.push_back('d'); |
312 | 0 | if (global) |
313 | 0 | result.push_back('g'); |
314 | 0 | if (ignoreCase) |
315 | 0 | result.push_back('i'); |
316 | 0 | if (multiline) |
317 | 0 | result.push_back('m'); |
318 | 0 | if (dotAll) |
319 | 0 | result.push_back('s'); |
320 | 0 | if (unicode) |
321 | 0 | result.push_back('u'); |
322 | 0 | if (sticky) |
323 | 0 | result.push_back('y'); |
324 | 0 | return result; |
325 | 0 | } |
326 | | |
327 | | /// Given a flags string \p str, generate the corresponding SyntaxFlags |
328 | | /// \return the flags if the string is valid, an empty optional otherwise |
329 | | /// See ES 5.1 15.10.4.1 for description of the validation |
330 | | static llvh::Optional<SyntaxFlags> fromString( |
331 | 1.83k | const llvh::ArrayRef<char16_t> flags) { |
332 | | // A flags string may contain i,m,g, in any order, but at most once each |
333 | 1.83k | auto error = llvh::NoneType::None; |
334 | 1.83k | SyntaxFlags ret = {}; |
335 | 1.83k | for (auto c : flags) { |
336 | 331 | switch (c) { |
337 | 1 | case u'i': |
338 | 1 | if (ret.ignoreCase) |
339 | 0 | return error; |
340 | 1 | ret.ignoreCase = 1; |
341 | 1 | break; |
342 | 3 | case u'm': |
343 | 3 | if (ret.multiline) |
344 | 0 | return error; |
345 | 3 | ret.multiline = 1; |
346 | 3 | break; |
347 | 2 | case u'g': |
348 | 2 | if (ret.global) |
349 | 0 | return error; |
350 | 2 | ret.global = 1; |
351 | 2 | break; |
352 | 1 | case u'u': |
353 | 1 | if (ret.unicode) |
354 | 0 | return error; |
355 | 1 | ret.unicode = 1; |
356 | 1 | break; |
357 | 323 | case u'y': |
358 | 323 | if (ret.sticky) |
359 | 0 | return error; |
360 | 323 | ret.sticky = 1; |
361 | 323 | break; |
362 | 0 | case u's': |
363 | 0 | if (ret.dotAll) |
364 | 0 | return error; |
365 | 0 | ret.dotAll = 1; |
366 | 0 | break; |
367 | 0 | case u'd': |
368 | 0 | if (ret.hasIndices) |
369 | 0 | return error; |
370 | 0 | ret.hasIndices = 1; |
371 | 0 | break; |
372 | 1 | default: |
373 | 1 | return error; |
374 | 331 | } |
375 | 331 | } |
376 | 1.82k | return ret; |
377 | 1.83k | } |
378 | | }; |
379 | | } // namespace regex |
380 | | } // namespace hermes |
381 | | #endif // HERMES_REGEX_TYPES_H |