/src/hermes/lib/Regex/Executor.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | #include "hermes/Regex/Executor.h" |
9 | | #include "hermes/Regex/RegexTraits.h" |
10 | | #include "hermes/Support/OptValue.h" |
11 | | |
12 | | #include "llvh/ADT/ScopeExit.h" |
13 | | #include "llvh/ADT/SmallVector.h" |
14 | | #include "llvh/Support/TrailingObjects.h" |
15 | | #include "llvh/Support/raw_ostream.h" |
16 | | |
17 | | // This file contains the machinery for executing a regexp compiled to bytecode. |
18 | | |
19 | | namespace hermes { |
20 | | namespace regex { |
21 | | |
22 | | template <class Traits> |
23 | | struct State; |
24 | | |
25 | | /// Describes the exit status of a RegEx execution: it either returned |
26 | | /// normally or stack overflowed |
27 | | enum class ExecutionStatus : uint8_t { RETURNED, STACK_OVERFLOW }; |
28 | | |
29 | | /// A tuple combining the result of a function which may have returned |
30 | | /// successfully (ExecutionStatus::RETURNED) with a value, or thrown an |
31 | | /// exception (ExecutionStatus::STACK_OVERFLOW). |
32 | | /// This is used by some internal functions for convenience. |
33 | | template <typename T> |
34 | | class ExecutorResult { |
35 | | static_assert(std::is_trivial<T>::value, "T must be trivial."); |
36 | | |
37 | | private: |
38 | | ExecutionStatus status_; |
39 | | T value_; |
40 | | |
41 | | public: |
42 | | /* implicit */ ExecutorResult(const T &v) |
43 | 294 | : status_(ExecutionStatus::RETURNED), value_(v) {} hermes::regex::ExecutorResult<char16_t const*>::ExecutorResult(char16_t const* const&) Line | Count | Source | 43 | 86 | : status_(ExecutionStatus::RETURNED), value_(v) {} |
hermes::regex::ExecutorResult<bool>::ExecutorResult(bool const&) Line | Count | Source | 43 | 208 | : status_(ExecutionStatus::RETURNED), value_(v) {} |
Unexecuted instantiation: hermes::regex::ExecutorResult<char const*>::ExecutorResult(char const* const&) |
44 | | |
45 | 0 | /* implicit */ ExecutorResult(ExecutionStatus status) : status_(status) { |
46 | 0 | assert(status != ExecutionStatus::RETURNED); |
47 | 0 | } Unexecuted instantiation: hermes::regex::ExecutorResult<char16_t const*>::ExecutorResult(hermes::regex::ExecutionStatus) Unexecuted instantiation: hermes::regex::ExecutorResult<bool>::ExecutorResult(hermes::regex::ExecutionStatus) Unexecuted instantiation: hermes::regex::ExecutorResult<char const*>::ExecutorResult(hermes::regex::ExecutionStatus) |
48 | | |
49 | 208 | const T &operator*() const { |
50 | 208 | return getValue(); |
51 | 208 | } |
52 | | |
53 | 294 | bool hasValue() const { |
54 | 294 | return status_ == ExecutionStatus::RETURNED; |
55 | 294 | } hermes::regex::ExecutorResult<bool>::hasValue() const Line | Count | Source | 53 | 208 | bool hasValue() const { | 54 | 208 | return status_ == ExecutionStatus::RETURNED; | 55 | 208 | } |
hermes::regex::ExecutorResult<char16_t const*>::hasValue() const Line | Count | Source | 53 | 86 | bool hasValue() const { | 54 | 86 | return status_ == ExecutionStatus::RETURNED; | 55 | 86 | } |
Unexecuted instantiation: hermes::regex::ExecutorResult<char const*>::hasValue() const |
56 | | |
57 | 294 | explicit operator bool() const { |
58 | 294 | return hasValue(); |
59 | 294 | } hermes::regex::ExecutorResult<bool>::operator bool() const Line | Count | Source | 57 | 208 | explicit operator bool() const { | 58 | 208 | return hasValue(); | 59 | 208 | } |
hermes::regex::ExecutorResult<char16_t const*>::operator bool() const Line | Count | Source | 57 | 86 | explicit operator bool() const { | 58 | 86 | return hasValue(); | 59 | 86 | } |
Unexecuted instantiation: hermes::regex::ExecutorResult<char const*>::operator bool() const |
60 | | |
61 | 294 | const T &getValue() const { |
62 | 294 | assert(getStatus() == ExecutionStatus::RETURNED); |
63 | 294 | return *reinterpret_cast<const T *>(&value_); |
64 | 294 | } hermes::regex::ExecutorResult<bool>::getValue() const Line | Count | Source | 61 | 208 | const T &getValue() const { | 62 | 208 | assert(getStatus() == ExecutionStatus::RETURNED); | 63 | 208 | return *reinterpret_cast<const T *>(&value_); | 64 | 208 | } |
hermes::regex::ExecutorResult<char16_t const*>::getValue() const Line | Count | Source | 61 | 86 | const T &getValue() const { | 62 | 86 | assert(getStatus() == ExecutionStatus::RETURNED); | 63 | 86 | return *reinterpret_cast<const T *>(&value_); | 64 | 86 | } |
Unexecuted instantiation: hermes::regex::ExecutorResult<char const*>::getValue() const |
65 | | |
66 | 294 | ExecutionStatus getStatus() const { |
67 | 294 | return status_; |
68 | 294 | } hermes::regex::ExecutorResult<bool>::getStatus() const Line | Count | Source | 66 | 208 | ExecutionStatus getStatus() const { | 67 | 208 | return status_; | 68 | 208 | } |
hermes::regex::ExecutorResult<char16_t const*>::getStatus() const Line | Count | Source | 66 | 86 | ExecutionStatus getStatus() const { | 67 | 86 | return status_; | 68 | 86 | } |
Unexecuted instantiation: hermes::regex::ExecutorResult<char const*>::getStatus() const |
69 | | }; |
70 | | /// An enum describing Width1 opcodes. This is the set of regex opcodes which |
71 | | /// always match exactly one character (or fail). This is broken out from Opcode |
72 | | /// to get exhaustiveness checking in switch statements. Note that conversions |
73 | | /// can be performed via static_cast. |
74 | | enum class Width1Opcode : uint8_t { |
75 | | MatchChar8 = (uint8_t)Opcode::MatchChar8, |
76 | | MatchChar16 = (uint8_t)Opcode::MatchChar16, |
77 | | MatchCharICase8 = (uint8_t)Opcode::MatchCharICase8, |
78 | | MatchCharICase16 = (uint8_t)Opcode::MatchCharICase16, |
79 | | MatchAny = (uint8_t)Opcode::MatchAny, |
80 | | MatchAnyButNewline = (uint8_t)Opcode::MatchAnyButNewline, |
81 | | Bracket = (uint8_t)Opcode::Bracket, |
82 | | }; |
83 | | |
84 | | /// LoopData tracks information about a loop during a match attempt. Each State |
85 | | /// has one LoopData per loop. |
86 | | struct LoopData { |
87 | | /// The number of times that this loop has executed in this state. |
88 | | uint32_t iterations; |
89 | | |
90 | | /// The input position where we entered the loop. |
91 | | uint32_t entryPosition; |
92 | | }; |
93 | | |
94 | | /// Cursor is a lightweight value type which allows tracking a character pointer |
95 | | /// 'current' within a range 'first' to 'last'. |
96 | | /// A cursor may either be forwards, in which case it proceeds from 'first' to |
97 | | /// 'last'. It may also (in the case of lookbehind assertions) be backwards, in |
98 | | /// which case the cursor proceeds from 'last' to 'first'. The terms "begin" and |
99 | | /// "end" denote tracking in the direction of the cursor, while "left" and |
100 | | /// "right" are direction independent. |
101 | | template <class Traits> |
102 | | class Cursor { |
103 | | using CodeUnit = typename Traits::CodeUnit; |
104 | | using CodePoint = typename Traits::CodePoint; |
105 | | |
106 | | public: |
107 | | /// Construct with the range \p first and \p last, setting the current |
108 | | /// position to \p first. Note that the \p last is one past the last valid |
109 | | /// character. \p forwards decides whether the current pointer advances |
110 | | /// towards last_ (true) or first_ (false). |
111 | | Cursor( |
112 | | const CodeUnit *first, |
113 | | const CodeUnit *current, |
114 | | const CodeUnit *last, |
115 | | bool forwards) |
116 | 86 | : first_(first), |
117 | 86 | last_(last), |
118 | 86 | current_(current), |
119 | 86 | end_(forwards ? last : first), |
120 | 86 | forwards_(forwards) { |
121 | 86 | assert(first_ <= last_ && "first and last out of order"); |
122 | 86 | assert( |
123 | 86 | first_ <= current_ && current <= last_ && |
124 | 86 | "current pointer not in range"); |
125 | 86 | } hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::Cursor(char16_t const*, char16_t const*, char16_t const*, bool) Line | Count | Source | 116 | 86 | : first_(first), | 117 | 86 | last_(last), | 118 | 86 | current_(current), | 119 | 86 | end_(forwards ? last : first), | 120 | 86 | forwards_(forwards) { | 121 | 86 | assert(first_ <= last_ && "first and last out of order"); | 122 | 86 | assert( | 123 | 86 | first_ <= current_ && current <= last_ && | 124 | 86 | "current pointer not in range"); | 125 | 86 | } |
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::Cursor(char const*, char const*, char const*, bool) |
126 | | |
127 | | /// \return whether this cursor advances forwards. |
128 | 244 | bool forwards() const { |
129 | 244 | return forwards_; |
130 | 244 | } hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::forwards() const Line | Count | Source | 128 | 244 | bool forwards() const { | 129 | 244 | return forwards_; | 130 | 244 | } |
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::forwards() const |
131 | | |
132 | | /// Set whether this cursor advances forwards to \p flag. |
133 | 0 | void setForwards(bool flag) { |
134 | 0 | forwards_ = flag; |
135 | 0 | end_ = forwards_ ? last_ : first_; |
136 | 0 | } Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::setForwards(bool) Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::setForwards(bool) |
137 | | |
138 | | /// \return the number of code units remaining. |
139 | 208 | uint32_t remaining() const { |
140 | 208 | return forwards_ ? last_ - current_ : current_ - first_; |
141 | 208 | } hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::remaining() const Line | Count | Source | 139 | 208 | uint32_t remaining() const { | 140 | 208 | return forwards_ ? last_ - current_ : current_ - first_; | 141 | 208 | } |
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::remaining() const |
142 | | |
143 | | /// \return whether we are at the end of the range. |
144 | 243 | bool atEnd() const { |
145 | 243 | return current_ == end_; |
146 | 243 | } hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::atEnd() const Line | Count | Source | 144 | 243 | bool atEnd() const { | 145 | 243 | return current_ == end_; | 146 | 243 | } |
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::atEnd() const |
147 | | |
148 | | /// \return the number of code units consumed from the leftmost character. |
149 | | /// This is called "offsetFromLeft" and not "offsetFromStart" to indicate that |
150 | | /// it does not change under backwards tracking. |
151 | 0 | uint32_t offsetFromLeft() const { |
152 | 0 | return current_ - first_; |
153 | 0 | } Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::offsetFromLeft() const Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::offsetFromLeft() const |
154 | | |
155 | | /// \return the number of code units between the current position and the end |
156 | | /// of the string. |
157 | | /// This is called "offsetFromRight" and not "offsetFromEnd" to indicate that |
158 | | /// it does not change under backwards tracking. |
159 | 86 | uint32_t offsetFromRight() const { |
160 | 86 | return last_ - current_; |
161 | 86 | } hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::offsetFromRight() const Line | Count | Source | 159 | 86 | uint32_t offsetFromRight() const { | 160 | 86 | return last_ - current_; | 161 | 86 | } |
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::offsetFromRight() const |
162 | | |
163 | | /// \return whether we are at the leftmost position. |
164 | | /// This does not change under backwards tracking. |
165 | 86 | bool atLeft() const { |
166 | 86 | return current_ == first_; |
167 | 86 | } hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::atLeft() const Line | Count | Source | 165 | 86 | bool atLeft() const { | 166 | 86 | return current_ == first_; | 167 | 86 | } |
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::atLeft() const |
168 | | |
169 | | /// \return whether we are at the rightmost position. |
170 | | /// This does not change under backwards tracking. |
171 | 0 | bool atRight() const { |
172 | 0 | return current_ == last_; |
173 | 0 | } Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::atRight() const Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::atRight() const |
174 | | |
175 | | /// \return the current code unit. |
176 | 243 | CodeUnit current() const { |
177 | | // Access the character at index 0 if forwards, -1 if backwards. |
178 | 243 | assert(!atEnd() && "Cursor is at end"); |
179 | 243 | return current_[(int)forwards_ - 1]; |
180 | 243 | } hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::current() const Line | Count | Source | 176 | 243 | CodeUnit current() const { | 177 | | // Access the character at index 0 if forwards, -1 if backwards. | 178 | 243 | assert(!atEnd() && "Cursor is at end"); | 179 | 243 | return current_[(int)forwards_ - 1]; | 180 | 243 | } |
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::current() const |
181 | | |
182 | | /// \return the current cursor position. |
183 | 172 | const CodeUnit *currentPointer() const { |
184 | 172 | return current_; |
185 | 172 | } hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::currentPointer() const Line | Count | Source | 183 | 172 | const CodeUnit *currentPointer() const { | 184 | 172 | return current_; | 185 | 172 | } |
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::currentPointer() const |
186 | | |
187 | | /// Set the current cursor position to \p current. |
188 | 208 | void setCurrentPointer(const CodeUnit *current) { |
189 | 208 | assert(first_ <= current && current <= last_ && "Current not in range"); |
190 | 208 | current_ = current; |
191 | 208 | } hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::setCurrentPointer(char16_t const*) Line | Count | Source | 188 | 208 | void setCurrentPointer(const CodeUnit *current) { | 189 | 208 | assert(first_ <= current && current <= last_ && "Current not in range"); | 190 | 208 | current_ = current; | 191 | 208 | } |
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::setCurrentPointer(char const*) |
192 | | |
193 | | /// \return the current code unit, advancing the cursor by 1. |
194 | 243 | CodeUnit consume() { |
195 | 243 | CodeUnit result = current(); |
196 | 243 | current_ += forwards_ ? 1 : -1; |
197 | 243 | return result; |
198 | 243 | } hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::consume() Line | Count | Source | 194 | 243 | CodeUnit consume() { | 195 | 243 | CodeUnit result = current(); | 196 | 243 | current_ += forwards_ ? 1 : -1; | 197 | 243 | return result; | 198 | 243 | } |
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::consume() |
199 | | |
200 | | /// \return a code point decoded from the code units under the cursor, |
201 | | /// possibly by decoding surrogates. Advances the cursor by the number of code |
202 | | /// units consumed. |
203 | 0 | CodePoint consumeUTF16() { |
204 | 0 | assert(!atEnd() && "At end"); |
205 | | |
206 | | // In ASCII we have no surrogates. |
207 | 0 | if (sizeof(CodeUnit) >= 2 && remaining() >= 2) { |
208 | 0 | CodeUnit hi = forwards_ ? current_[0] : current_[-2]; |
209 | 0 | CodeUnit lo = forwards_ ? current_[1] : current_[-1]; |
210 | 0 | if (isHighSurrogate(hi) && isLowSurrogate(lo)) { |
211 | 0 | current_ += forwards_ ? 2 : -2; |
212 | 0 | return utf16SurrogatePairToCodePoint(hi, lo); |
213 | 0 | } |
214 | 0 | } |
215 | 0 | return consume(); |
216 | 0 | } Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::consumeUTF16() Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::consumeUTF16() |
217 | | |
218 | | /// \return whether a regex match performed using the given \p flags can |
219 | | /// possibly match the given \p constraints. |
220 | | bool satisfiesConstraints( |
221 | | constants::MatchFlagType flags, |
222 | 86 | MatchConstraintSet constraints) const { |
223 | 86 | if ((constraints & MatchConstraintNonASCII) && |
224 | 86 | (flags & constants::matchInputAllAscii)) |
225 | 0 | return false; |
226 | 86 | if ((constraints & MatchConstraintAnchoredAtStart) && current_ != first_) |
227 | 0 | return false; |
228 | 86 | return true; |
229 | 86 | } hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::satisfiesConstraints(hermes::regex::constants::MatchFlagType, unsigned char) const Line | Count | Source | 222 | 86 | MatchConstraintSet constraints) const { | 223 | 86 | if ((constraints & MatchConstraintNonASCII) && | 224 | 86 | (flags & constants::matchInputAllAscii)) | 225 | 0 | return false; | 226 | 86 | if ((constraints & MatchConstraintAnchoredAtStart) && current_ != first_) | 227 | 0 | return false; | 228 | 86 | return true; | 229 | 86 | } |
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::satisfiesConstraints(hermes::regex::constants::MatchFlagType, unsigned char) const |
230 | | |
231 | | private: |
232 | | // The first code unit in the string. |
233 | | const CodeUnit *first_; |
234 | | |
235 | | // One past the last code unit in the string. |
236 | | const CodeUnit *last_; |
237 | | |
238 | | // Our position between first_ and last_. |
239 | | // If we are forwards, then the current character is current_[0]. |
240 | | // If we are backwards, then the current character is current_[-1]. |
241 | | const CodeUnit *current_; |
242 | | |
243 | | // A pointer to the end. This is either last (if forwards) or first (if not |
244 | | // forwards). If our current cursor reaches this value, we are done. |
245 | | const CodeUnit *end_; |
246 | | |
247 | | // Whether we are tracking forwards or backwards. |
248 | | bool forwards_; |
249 | | }; |
250 | | |
251 | | /// A Context records global information about a match attempt. |
252 | | template <class Traits> |
253 | | struct Context { |
254 | | using CodeUnit = typename Traits::CodeUnit; |
255 | | using CodePoint = typename Traits::CodePoint; |
256 | | |
257 | | /// The set of backtracking opcodes. These are interpreted by the backtrack() |
258 | | /// function. |
259 | | enum class BacktrackOp : uint8_t { |
260 | | /// Set the value of a capture group to a stored value. |
261 | | SetCaptureGroup, |
262 | | |
263 | | /// Set the value of a loop data to a stored value. |
264 | | SetLoopData, |
265 | | |
266 | | /// Set the IP and position in the input string to a stored value. |
267 | | SetPosition, |
268 | | |
269 | | /// Backtrack by entering the body of a non-greedy loop. |
270 | | EnterNonGreedyLoop, |
271 | | |
272 | | /// Backtrack a greedy loop whose body matches exactly one character, such |
273 | | /// as /.*/. |
274 | | GreedyWidth1Loop, |
275 | | |
276 | | /// Backtrack a nongreedy loop whose body matches exactly one character, |
277 | | /// such as /.*?/. |
278 | | NongreedyWidth1Loop, |
279 | | }; |
280 | | |
281 | | /// An instruction describing how to backtrack. |
282 | | union BacktrackInsn { |
283 | | /// The operation to perform. |
284 | | BacktrackOp op; |
285 | | |
286 | | /// List of instruction-specific fields. Note that the opcode is reproduced |
287 | | /// in every struct; this avoids padding between the opcode and the |
288 | | /// following field. |
289 | | |
290 | | /// Fields used by setCaptureGroup instruction. |
291 | | struct { |
292 | | BacktrackOp op; |
293 | | uint16_t mexp; /// Which capture group to set. |
294 | | CapturedRange range; /// Value to set. |
295 | | } setCaptureGroup; |
296 | | |
297 | | /// Fields used by SetLoopData instruction. |
298 | | struct { |
299 | | BacktrackOp op; |
300 | | uint16_t loopId; /// Which loop to set. |
301 | | LoopData loopData; /// Value to set. |
302 | | } setLoopData; |
303 | | |
304 | | /// Fields used by SetPosition instruction. |
305 | | struct { |
306 | | BacktrackOp op; |
307 | | uint32_t ip; /// Instruction pointer to set. |
308 | | const CodeUnit *value; /// Input string position to set. |
309 | | } setPosition; |
310 | | |
311 | | /// Fields used by EnterNonGreedyLoop instruction. |
312 | | struct { |
313 | | BacktrackOp op; |
314 | | uint32_t bodyIp; /// The IP of the loop body. |
315 | | LoopData loopData; /// Data for the loop to set. |
316 | | const BeginLoopInsn *loopInsn; /// The loop instruction. |
317 | | } enterNonGreedyLoop; |
318 | | |
319 | | /// Fields used by GreedyWidth1Loop and NongreedyWidth1Loop. |
320 | | struct { |
321 | | BacktrackOp op; /// The opcode. |
322 | | uint32_t continuation; /// The ip for the not-taken branch of the loop. |
323 | | const CodeUnit *min; /// The minimum possible match position. |
324 | | const CodeUnit *max; /// The maximum possible match position. |
325 | | } width1Loop; |
326 | | |
327 | 36 | /* implicit */ BacktrackInsn(BacktrackOp op) : op(op) {} hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn::BacktrackInsn(hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackOp) Line | Count | Source | 327 | 36 | /* implicit */ BacktrackInsn(BacktrackOp op) : op(op) {} |
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn::BacktrackInsn(hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackOp) |
328 | | |
329 | | /// \return a SetCaptureGroup instruction. |
330 | | static BacktrackInsn makeSetCaptureGroup( |
331 | | uint16_t mexp, |
332 | 0 | CapturedRange range) { |
333 | 0 | BacktrackInsn result{BacktrackOp::SetCaptureGroup}; |
334 | 0 | result.setCaptureGroup.mexp = mexp; |
335 | 0 | result.setCaptureGroup.range = range; |
336 | 0 | return result; |
337 | 0 | } Unexecuted instantiation: hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn::makeSetCaptureGroup(unsigned short, hermes::regex::CapturedRange) Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn::makeSetCaptureGroup(unsigned short, hermes::regex::CapturedRange) |
338 | | |
339 | | /// \return a SetLoopData instruction. |
340 | 0 | static BacktrackInsn makeSetLoopData(uint16_t loopId, LoopData loopData) { |
341 | 0 | BacktrackInsn result{BacktrackOp::SetLoopData}; |
342 | 0 | result.setLoopData.loopId = loopId; |
343 | 0 | result.setLoopData.loopData = loopData; |
344 | 0 | return result; |
345 | 0 | } Unexecuted instantiation: hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn::makeSetLoopData(unsigned short, hermes::regex::LoopData) Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn::makeSetLoopData(unsigned short, hermes::regex::LoopData) |
346 | | |
347 | | /// \return a SetPosition instruction. |
348 | | static BacktrackInsn makeSetPosition( |
349 | | uint32_t ip, |
350 | 0 | const CodeUnit *inputPos) { |
351 | 0 | BacktrackInsn result = BacktrackOp::SetPosition; |
352 | 0 | result.setPosition.ip = ip; |
353 | 0 | result.setPosition.value = inputPos; |
354 | 0 | return result; |
355 | 0 | } Unexecuted instantiation: hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn::makeSetPosition(unsigned int, char16_t const*) Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn::makeSetPosition(unsigned int, char const*) |
356 | | |
357 | | /// \return an EnterNonGreedyLoop instruction. |
358 | | static BacktrackInsn makeEnterNonGreedyLoop( |
359 | | const BeginLoopInsn *loopInsn, |
360 | | uint32_t bodyIp, |
361 | 0 | LoopData loopData) { |
362 | 0 | BacktrackInsn result = BacktrackOp::EnterNonGreedyLoop; |
363 | 0 | result.enterNonGreedyLoop.bodyIp = bodyIp; |
364 | 0 | result.enterNonGreedyLoop.loopInsn = loopInsn; |
365 | 0 | result.enterNonGreedyLoop.loopData = loopData; |
366 | 0 | return result; |
367 | 0 | } Unexecuted instantiation: hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn::makeEnterNonGreedyLoop(hermes::regex::BeginLoopInsn const*, unsigned int, hermes::regex::LoopData) Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn::makeEnterNonGreedyLoop(hermes::regex::BeginLoopInsn const*, unsigned int, hermes::regex::LoopData) |
368 | | }; |
369 | | |
370 | | /// Our stack of backtrack instructions. |
371 | | using BacktrackStack = llvh::SmallVector<BacktrackInsn, 64>; |
372 | | |
373 | | /// The maximum depth of our backtracking stack. Beyond this we return a stack |
374 | | /// overflow error. |
375 | | static constexpr size_t kMaxBacktrackDepth = 1u << 24; |
376 | | |
377 | | /// The stream of bytecode instructions, including the header. |
378 | | llvh::ArrayRef<uint8_t> bytecodeStream_; |
379 | | |
380 | | /// The flags associated with the match attempt. |
381 | | constants::MatchFlagType flags_; |
382 | | |
383 | | /// Syntax flags associated with the regex. |
384 | | SyntaxFlags syntaxFlags_; |
385 | | |
386 | | /// The first character in the input string. |
387 | | const CodeUnit *first_; |
388 | | |
389 | | /// The end of the input string (one-past the last). |
390 | | const CodeUnit *last_; |
391 | | |
392 | | /// Count of submatches. |
393 | | uint32_t markedCount_; |
394 | | |
395 | | /// Count of loops. |
396 | | uint32_t loopCount_; |
397 | | |
398 | | /// Traits used for canonicalization. |
399 | | Traits traits_; |
400 | | |
401 | | /// The remaining number of times we will attempt to backtrack. |
402 | | /// This is effectively a timeout on the regexp execution. |
403 | | uint32_t backtracksRemaining_ = kBacktrackLimit; |
404 | | |
405 | | /// Used to guard against stack overflow. Either uses real stack |
406 | | /// checking or call depth counter checking. |
407 | | StackOverflowGuard overflowGuard_; |
408 | | |
409 | | Context( |
410 | | llvh::ArrayRef<uint8_t> bytecodeStream, |
411 | | constants::MatchFlagType flags, |
412 | | SyntaxFlags syntaxFlags, |
413 | | const CodeUnit *first, |
414 | | const CodeUnit *last, |
415 | | uint32_t markedCount, |
416 | | uint32_t loopCount, |
417 | | StackOverflowGuard guard) |
418 | 86 | : bytecodeStream_(bytecodeStream), |
419 | 86 | flags_(flags), |
420 | 86 | syntaxFlags_(syntaxFlags), |
421 | 86 | first_(first), |
422 | 86 | last_(last), |
423 | 86 | markedCount_(markedCount), |
424 | 86 | loopCount_(loopCount), |
425 | 86 | overflowGuard_(guard) {} hermes::regex::Context<hermes::regex::UTF16RegexTraits>::Context(llvh::ArrayRef<unsigned char>, hermes::regex::constants::MatchFlagType, hermes::regex::SyntaxFlags, char16_t const*, char16_t const*, unsigned int, unsigned int, hermes::StackOverflowGuard) Line | Count | Source | 418 | 86 | : bytecodeStream_(bytecodeStream), | 419 | 86 | flags_(flags), | 420 | 86 | syntaxFlags_(syntaxFlags), | 421 | 86 | first_(first), | 422 | 86 | last_(last), | 423 | 86 | markedCount_(markedCount), | 424 | 86 | loopCount_(loopCount), | 425 | 86 | overflowGuard_(guard) {} |
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::Context(llvh::ArrayRef<unsigned char>, hermes::regex::constants::MatchFlagType, hermes::regex::SyntaxFlags, char const*, char const*, unsigned int, unsigned int, hermes::StackOverflowGuard) |
426 | | |
427 | | /// Run the given State \p state, by starting at its cursor and acting on its |
428 | | /// ip_ until the match succeeds or fails. If \p onlyAtStart is set, only |
429 | | /// test the match at \pos; otherwise test all successive input positions from |
430 | | /// pos_ through last_. |
431 | | /// \return a pointer to the start of the match if the match succeeds, nullptr |
432 | | /// if it fails. If the match succeeds, populates \p state with the state of |
433 | | /// the successful match; on failure the state's contents are undefined. |
434 | | /// Note the end of the match can be recovered as |
435 | | /// state->cursor_.currentPointer(). |
436 | | ExecutorResult<const CodeUnit *> match( |
437 | | State<Traits> *state, |
438 | | bool onlyAtStart); |
439 | | |
440 | | /// Backtrack the given state \p s with the backtrack stack \p bts. |
441 | | /// \return true if we backtracked, false if we exhausted the stack. |
442 | | LLVM_NODISCARD |
443 | | ExecutorResult<bool> backtrack(BacktrackStack &bts, State<Traits> *s); |
444 | | |
445 | | /// Set the state's position to the body of a non-greedy loop. |
446 | | /// \return RETURNED if backtracking was prepared, STACK_OVERFLOW otherwise. |
447 | | LLVM_NODISCARD |
448 | | ExecutionStatus performEnterNonGreedyLoop( |
449 | | State<Traits> *s, |
450 | | const BeginLoopInsn *loop, |
451 | | uint32_t bodyIp, |
452 | | LoopData loopData, |
453 | | BacktrackStack &backtrackStack); |
454 | | |
455 | | /// Add a backtrack instruction to the backtrack stack \p bts. |
456 | | /// \return RETURNED on success, STACK_OVERFLOW otherwise |
457 | | LLVM_NODISCARD |
458 | 36 | ExecutionStatus pushBacktrack(BacktrackStack &bts, BacktrackInsn insn) { |
459 | 36 | bts.push_back(insn); |
460 | 36 | if (LLVM_UNLIKELY(bts.size() > kMaxBacktrackDepth) || |
461 | 36 | LLVM_UNLIKELY(backtracksRemaining_ == 0)) { |
462 | 0 | return ExecutionStatus::STACK_OVERFLOW; |
463 | 0 | } |
464 | 36 | backtracksRemaining_--; |
465 | 36 | return ExecutionStatus::RETURNED; |
466 | 36 | } hermes::regex::Context<hermes::regex::UTF16RegexTraits>::pushBacktrack(llvh::SmallVector<hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn, 64u>&, hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn) Line | Count | Source | 458 | 36 | ExecutionStatus pushBacktrack(BacktrackStack &bts, BacktrackInsn insn) { | 459 | 36 | bts.push_back(insn); | 460 | 36 | if (LLVM_UNLIKELY(bts.size() > kMaxBacktrackDepth) || | 461 | 36 | LLVM_UNLIKELY(backtracksRemaining_ == 0)) { | 462 | 0 | return ExecutionStatus::STACK_OVERFLOW; | 463 | 0 | } | 464 | 36 | backtracksRemaining_--; | 465 | 36 | return ExecutionStatus::RETURNED; | 466 | 36 | } |
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::pushBacktrack(llvh::SmallVector<hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn, 64u>&, hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn) |
467 | | |
468 | | /// Run the given Width1Loop \p insn on the given state \p s with the |
469 | | /// backtrack stack \p bts. |
470 | | /// \return true on success, false if we should backtrack. |
471 | | LLVM_NODISCARD |
472 | | ExecutorResult<bool> matchWidth1Loop( |
473 | | const Width1LoopInsn *insn, |
474 | | State<Traits> *s, |
475 | | BacktrackStack &bts); |
476 | | |
477 | | private: |
478 | | /// Do initialization of the given state before it enters the loop body |
479 | | /// described by the LoopInsn \p loop, including setting up any backtracking |
480 | | /// state. |
481 | | /// \return RETURNED if backtracking was prepared, STACK_OVERFLOW else |
482 | | LLVM_NODISCARD |
483 | | ExecutionStatus prepareToEnterLoopBody( |
484 | | State<Traits> *state, |
485 | | const BeginLoopInsn *loop, |
486 | | BacktrackStack &bts); |
487 | | |
488 | | /// Given a Width1Opcode \p w1opcode, return true if the given char \p c |
489 | | /// matches the instruction \p insn (with that opcode). |
490 | | template <Width1Opcode w1opcode> |
491 | | inline bool matchWidth1(const Insn *insn, CodeUnit c) const; |
492 | | |
493 | | /// \return true if all chars, stored in contiguous memory after \p insn, |
494 | | /// match the chars in state \p s in the same order, case insensitive. Note |
495 | | /// the count of chars is given in \p insn. |
496 | | inline bool matchesNCharICase8( |
497 | | const MatchNCharICase8Insn *insn, |
498 | | State<Traits> &s); |
499 | | |
500 | | /// Execute the given Width1 instruction \p loopBody on cursor \p c up to \p |
501 | | /// max times. \return the number of matches made, not to exceed \p max. |
502 | | /// Note we deliberately accept \p c by value. |
503 | | template <Width1Opcode w1opcode> |
504 | | inline uint32_t |
505 | | matchWidth1LoopBody(const Insn *loopBody, Cursor<Traits> c, uint32_t max); |
506 | | |
507 | | /// ES6 21.2.5.2.3 AdvanceStringIndex. |
508 | | /// Return the index of the next character to check. |
509 | | /// This is typically just the index + 1, except if Unicode is enabled we need |
510 | | /// to skip surrogate pairs. |
511 | | inline size_t advanceStringIndex( |
512 | | const CodeUnit *start, |
513 | | size_t index, |
514 | | size_t lastIndex) const; |
515 | | }; |
516 | | |
517 | | /// We store loop and captured range data contiguously in a single allocation at |
518 | | /// the end of the State. Use this union to simplify the use of |
519 | | /// llvh::TrailingObjects. |
520 | | union LoopOrCapturedRange { |
521 | | struct LoopData loopData; |
522 | | struct CapturedRange capturedRange; |
523 | | }; |
524 | | |
525 | | /// State represents a set of in-flight capture groups and loop datas, along |
526 | | /// with the IP and input position. |
527 | | template <typename Traits> |
528 | | struct State { |
529 | | using CharT = typename Traits::CodeUnit; |
530 | | |
531 | | /// The cursor in the input string. |
532 | | Cursor<Traits> cursor_; |
533 | | |
534 | | /// The instruction pointer position in the bytecode stream. |
535 | | uint32_t ip_ = 0; |
536 | | |
537 | | /// List of captured ranges. This has size equal to the number of marked |
538 | | /// subexpressions for the regex. |
539 | | llvh::SmallVector<CapturedRange, 16> capturedRanges_; |
540 | | |
541 | | /// List of loop datas. This has size equal to the number of loops for the |
542 | | /// regex. |
543 | | llvh::SmallVector<LoopData, 16> loopDatas_; |
544 | | |
545 | | /// \return the loop data at index \p idx. |
546 | 0 | LoopData &getLoop(uint32_t idx) { |
547 | 0 | assert(idx < loopDatas_.size() && "Invalid loop index"); |
548 | 0 | return loopDatas_[idx]; |
549 | 0 | } Unexecuted instantiation: hermes::regex::State<hermes::regex::UTF16RegexTraits>::getLoop(unsigned int) Unexecuted instantiation: hermes::regex::State<hermes::regex::ASCIIRegexTraits>::getLoop(unsigned int) |
550 | | |
551 | | /// \return the captured range at index \p idx. |
552 | 0 | CapturedRange &getCapturedRange(uint32_t idx) { |
553 | | // Captured ranges are allocated after loops, so add the loop count. |
554 | 0 | assert(idx < capturedRanges_.size() && "Invalid captured range index"); |
555 | 0 | return capturedRanges_[idx]; |
556 | 0 | } Unexecuted instantiation: hermes::regex::State<hermes::regex::UTF16RegexTraits>::getCapturedRange(unsigned int) Unexecuted instantiation: hermes::regex::State<hermes::regex::ASCIIRegexTraits>::getCapturedRange(unsigned int) |
557 | | |
558 | | /// Construct a state which with the given \p cursor, which can hold \p |
559 | | /// markedCount submatches and \p loopCount loop datas. |
560 | | State(Cursor<Traits> cursor, uint32_t markedCount, uint32_t loopCount) |
561 | 86 | : cursor_(cursor), |
562 | 86 | capturedRanges_(markedCount, {kNotMatched, kNotMatched}), |
563 | 86 | loopDatas_(loopCount, {0, 0}) {} hermes::regex::State<hermes::regex::UTF16RegexTraits>::State(hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>, unsigned int, unsigned int) Line | Count | Source | 561 | 86 | : cursor_(cursor), | 562 | 86 | capturedRanges_(markedCount, {kNotMatched, kNotMatched}), | 563 | 86 | loopDatas_(loopCount, {0, 0}) {} |
Unexecuted instantiation: hermes::regex::State<hermes::regex::ASCIIRegexTraits>::State(hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>, unsigned int, unsigned int) |
564 | | |
565 | 0 | State(const State &) = default; Unexecuted instantiation: hermes::regex::State<hermes::regex::UTF16RegexTraits>::State(hermes::regex::State<hermes::regex::UTF16RegexTraits> const&) Unexecuted instantiation: hermes::regex::State<hermes::regex::ASCIIRegexTraits>::State(hermes::regex::State<hermes::regex::ASCIIRegexTraits> const&) |
566 | | State &operator=(const State &) = default; |
567 | | State(State &&) = default; |
568 | 0 | State &operator=(State &&) = default; Unexecuted instantiation: hermes::regex::State<hermes::regex::UTF16RegexTraits>::operator=(hermes::regex::State<hermes::regex::UTF16RegexTraits>&&) Unexecuted instantiation: hermes::regex::State<hermes::regex::ASCIIRegexTraits>::operator=(hermes::regex::State<hermes::regex::ASCIIRegexTraits>&&) |
569 | | }; |
570 | | |
571 | | /// ES5.1 7.3 |
572 | | template <class CharT> |
573 | 0 | bool isLineTerminator(CharT c) { |
574 | 0 | return c == u'\u000A' || c == u'\u000D' || c == u'\u2028' || c == u'\u2029'; |
575 | 0 | } Unexecuted instantiation: bool hermes::regex::isLineTerminator<char16_t>(char16_t) Unexecuted instantiation: bool hermes::regex::isLineTerminator<unsigned int>(unsigned int) Unexecuted instantiation: bool hermes::regex::isLineTerminator<char>(char) Unexecuted instantiation: bool hermes::regex::isLineTerminator<unsigned char>(unsigned char) |
576 | | |
577 | | template <class Traits> |
578 | 86 | bool matchesLeftAnchor(Context<Traits> &ctx, State<Traits> &s) { |
579 | 86 | bool matchesAnchor = false; |
580 | 86 | const Cursor<Traits> &c = s.cursor_; |
581 | 86 | if (c.atLeft()) { |
582 | | // Beginning of text. |
583 | 86 | matchesAnchor = true; |
584 | 86 | } else if ( |
585 | 0 | (ctx.syntaxFlags_.multiline) && !c.atLeft() && |
586 | 0 | isLineTerminator(c.currentPointer()[-1])) { |
587 | | // Multiline and after line terminator. |
588 | 0 | matchesAnchor = true; |
589 | 0 | } |
590 | 86 | return matchesAnchor; |
591 | 86 | } bool hermes::regex::matchesLeftAnchor<hermes::regex::UTF16RegexTraits>(hermes::regex::Context<hermes::regex::UTF16RegexTraits>&, hermes::regex::State<hermes::regex::UTF16RegexTraits>&) Line | Count | Source | 578 | 86 | bool matchesLeftAnchor(Context<Traits> &ctx, State<Traits> &s) { | 579 | 86 | bool matchesAnchor = false; | 580 | 86 | const Cursor<Traits> &c = s.cursor_; | 581 | 86 | if (c.atLeft()) { | 582 | | // Beginning of text. | 583 | 86 | matchesAnchor = true; | 584 | 86 | } else if ( | 585 | 0 | (ctx.syntaxFlags_.multiline) && !c.atLeft() && | 586 | 0 | isLineTerminator(c.currentPointer()[-1])) { | 587 | | // Multiline and after line terminator. | 588 | 0 | matchesAnchor = true; | 589 | 0 | } | 590 | 86 | return matchesAnchor; | 591 | 86 | } |
Unexecuted instantiation: bool hermes::regex::matchesLeftAnchor<hermes::regex::ASCIIRegexTraits>(hermes::regex::Context<hermes::regex::ASCIIRegexTraits>&, hermes::regex::State<hermes::regex::ASCIIRegexTraits>&) |
592 | | |
593 | | template <class Traits> |
594 | 0 | bool matchesRightAnchor(Context<Traits> &ctx, State<Traits> &s) { |
595 | 0 | bool matchesAnchor = false; |
596 | 0 | const Cursor<Traits> &c = s.cursor_; |
597 | 0 | if (c.atRight() && !(ctx.flags_ & constants::matchNotEndOfLine)) { |
598 | 0 | matchesAnchor = true; |
599 | 0 | } else if ( |
600 | 0 | (ctx.syntaxFlags_.multiline) && (!c.atRight()) && |
601 | 0 | isLineTerminator(c.currentPointer()[0])) { |
602 | 0 | matchesAnchor = true; |
603 | 0 | } |
604 | 0 | return matchesAnchor; |
605 | 0 | } Unexecuted instantiation: bool hermes::regex::matchesRightAnchor<hermes::regex::UTF16RegexTraits>(hermes::regex::Context<hermes::regex::UTF16RegexTraits>&, hermes::regex::State<hermes::regex::UTF16RegexTraits>&) Unexecuted instantiation: bool hermes::regex::matchesRightAnchor<hermes::regex::ASCIIRegexTraits>(hermes::regex::Context<hermes::regex::ASCIIRegexTraits>&, hermes::regex::State<hermes::regex::ASCIIRegexTraits>&) |
606 | | |
607 | | /// \return true if all chars, stored in contiguous memory after \p insn, |
608 | | /// match the chars in state \p s in the same order. Note the count of chars |
609 | | /// is given in \p insn. |
610 | | template <class Traits> |
611 | 121 | bool matchesNChar8(const MatchNChar8Insn *insn, State<Traits> &s) { |
612 | 121 | Cursor<Traits> &c = s.cursor_; |
613 | 121 | auto insnCharPtr = reinterpret_cast<const char *>(insn + 1); |
614 | 121 | auto charCount = insn->charCount; |
615 | 121 | for (int idx = 0; idx < charCount; idx++) { |
616 | 121 | if (c.consume() != insnCharPtr[idx]) { |
617 | 121 | return false; |
618 | 121 | } |
619 | 121 | } |
620 | 0 | return true; |
621 | 121 | } bool hermes::regex::matchesNChar8<hermes::regex::UTF16RegexTraits>(hermes::regex::MatchNChar8Insn const*, hermes::regex::State<hermes::regex::UTF16RegexTraits>&) Line | Count | Source | 611 | 121 | bool matchesNChar8(const MatchNChar8Insn *insn, State<Traits> &s) { | 612 | 121 | Cursor<Traits> &c = s.cursor_; | 613 | 121 | auto insnCharPtr = reinterpret_cast<const char *>(insn + 1); | 614 | 121 | auto charCount = insn->charCount; | 615 | 121 | for (int idx = 0; idx < charCount; idx++) { | 616 | 121 | if (c.consume() != insnCharPtr[idx]) { | 617 | 121 | return false; | 618 | 121 | } | 619 | 121 | } | 620 | 0 | return true; | 621 | 121 | } |
Unexecuted instantiation: bool hermes::regex::matchesNChar8<hermes::regex::ASCIIRegexTraits>(hermes::regex::MatchNChar8Insn const*, hermes::regex::State<hermes::regex::ASCIIRegexTraits>&) |
622 | | |
623 | | template <class Traits> |
624 | | bool Context<Traits>::matchesNCharICase8( |
625 | | const MatchNCharICase8Insn *insn, |
626 | 0 | State<Traits> &s) { |
627 | 0 | Cursor<Traits> &c = s.cursor_; |
628 | 0 | auto insnCharPtr = reinterpret_cast<const char *>(insn + 1); |
629 | 0 | auto charCount = insn->charCount; |
630 | 0 | bool unicode = syntaxFlags_.unicode; |
631 | 0 | for (int idx = 0; idx < charCount; idx++) { |
632 | 0 | auto c1 = c.consume(); |
633 | 0 | char instC = insnCharPtr[idx]; |
634 | 0 | if (c1 != instC && |
635 | 0 | (char32_t)traits_.canonicalize(c1, unicode) != (char32_t)instC) { |
636 | 0 | return false; |
637 | 0 | } |
638 | 0 | } |
639 | 0 | return true; |
640 | 0 | } Unexecuted instantiation: hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchesNCharICase8(hermes::regex::MatchNCharICase8Insn const*, hermes::regex::State<hermes::regex::UTF16RegexTraits>&) Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchesNCharICase8(hermes::regex::MatchNCharICase8Insn const*, hermes::regex::State<hermes::regex::ASCIIRegexTraits>&) |
641 | | |
642 | | /// \return true if the character \p ch matches a bracket instruction \p insn, |
643 | | /// containing the bracket ranges \p ranges. Note the count of ranges is given |
644 | | /// in \p insn. |
645 | | template <class Traits> |
646 | | bool bracketMatchesChar( |
647 | | const Context<Traits> &ctx, |
648 | | const BracketInsn *insn, |
649 | | const BracketRange32 *ranges, |
650 | 122 | typename Traits::CodePoint ch) { |
651 | 122 | const auto &traits = ctx.traits_; |
652 | | // Note that if the bracket is negated /[^abc]/, we want to return true if we |
653 | | // do not match, false if we do. Implement this by xor with the negate flag. |
654 | | |
655 | | // Check character classes. |
656 | | // Note we don't have to canonicalize here, because canonicalization does not |
657 | | // affect which character class a character is in (i.e. a character doesn't |
658 | | // become a digit after uppercasing). |
659 | 122 | if (insn->positiveCharClasses || insn->negativeCharClasses) { |
660 | 122 | for (auto charClass : |
661 | 122 | {CharacterClass::Digits, |
662 | 122 | CharacterClass::Spaces, |
663 | 330 | CharacterClass::Words}) { |
664 | 330 | if ((insn->positiveCharClasses & charClass) && |
665 | 330 | traits.characterHasType(ch, charClass)) |
666 | 36 | return true ^ insn->negate; |
667 | 294 | if ((insn->negativeCharClasses & charClass) && |
668 | 294 | !traits.characterHasType(ch, charClass)) |
669 | 0 | return true ^ insn->negate; |
670 | 294 | } |
671 | 122 | } |
672 | | |
673 | 86 | bool contained = |
674 | 86 | traits.rangesContain(llvh::makeArrayRef(ranges, insn->rangeCount), ch); |
675 | 86 | return contained ^ insn->negate; |
676 | 122 | } bool hermes::regex::bracketMatchesChar<hermes::regex::UTF16RegexTraits>(hermes::regex::Context<hermes::regex::UTF16RegexTraits> const&, hermes::regex::BracketInsn const*, hermes::regex::BracketRange32 const*, hermes::regex::UTF16RegexTraits::CodePoint) Line | Count | Source | 650 | 122 | typename Traits::CodePoint ch) { | 651 | 122 | const auto &traits = ctx.traits_; | 652 | | // Note that if the bracket is negated /[^abc]/, we want to return true if we | 653 | | // do not match, false if we do. Implement this by xor with the negate flag. | 654 | | | 655 | | // Check character classes. | 656 | | // Note we don't have to canonicalize here, because canonicalization does not | 657 | | // affect which character class a character is in (i.e. a character doesn't | 658 | | // become a digit after uppercasing). | 659 | 122 | if (insn->positiveCharClasses || insn->negativeCharClasses) { | 660 | 122 | for (auto charClass : | 661 | 122 | {CharacterClass::Digits, | 662 | 122 | CharacterClass::Spaces, | 663 | 330 | CharacterClass::Words}) { | 664 | 330 | if ((insn->positiveCharClasses & charClass) && | 665 | 330 | traits.characterHasType(ch, charClass)) | 666 | 36 | return true ^ insn->negate; | 667 | 294 | if ((insn->negativeCharClasses & charClass) && | 668 | 294 | !traits.characterHasType(ch, charClass)) | 669 | 0 | return true ^ insn->negate; | 670 | 294 | } | 671 | 122 | } | 672 | | | 673 | 86 | bool contained = | 674 | 86 | traits.rangesContain(llvh::makeArrayRef(ranges, insn->rangeCount), ch); | 675 | 86 | return contained ^ insn->negate; | 676 | 122 | } |
Unexecuted instantiation: bool hermes::regex::bracketMatchesChar<hermes::regex::ASCIIRegexTraits>(hermes::regex::Context<hermes::regex::ASCIIRegexTraits> const&, hermes::regex::BracketInsn const*, hermes::regex::BracketRange32 const*, hermes::regex::ASCIIRegexTraits::CodePoint) |
677 | | |
678 | | template <class Traits> |
679 | | ExecutionStatus Context<Traits>::prepareToEnterLoopBody( |
680 | | State<Traits> *s, |
681 | | const BeginLoopInsn *loop, |
682 | 0 | BacktrackStack &bts) { |
683 | 0 | LoopData &loopData = s->getLoop(loop->loopId); |
684 | 0 | auto res = pushBacktrack( |
685 | 0 | bts, BacktrackInsn::makeSetLoopData(loop->loopId, loopData)); |
686 | 0 | if (res != ExecutionStatus::RETURNED) { |
687 | 0 | return res; |
688 | 0 | } |
689 | 0 | loopData.iterations++; |
690 | 0 | loopData.entryPosition = s->cursor_.offsetFromLeft(); |
691 | | |
692 | | // Backtrack and reset contained capture groups. |
693 | 0 | for (uint32_t mexp = loop->mexpBegin; mexp != loop->mexpEnd; mexp++) { |
694 | 0 | auto &captureRange = s->getCapturedRange(mexp); |
695 | 0 | res = pushBacktrack( |
696 | 0 | bts, BacktrackInsn::makeSetCaptureGroup(mexp, captureRange)); |
697 | 0 | if (res != ExecutionStatus::RETURNED) { |
698 | 0 | return res; |
699 | 0 | } |
700 | 0 | captureRange = {kNotMatched, kNotMatched}; |
701 | 0 | } |
702 | 0 | return ExecutionStatus::RETURNED; |
703 | 0 | } Unexecuted instantiation: hermes::regex::Context<hermes::regex::UTF16RegexTraits>::prepareToEnterLoopBody(hermes::regex::State<hermes::regex::UTF16RegexTraits>*, hermes::regex::BeginLoopInsn const*, llvh::SmallVector<hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn, 64u>&) Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::prepareToEnterLoopBody(hermes::regex::State<hermes::regex::ASCIIRegexTraits>*, hermes::regex::BeginLoopInsn const*, llvh::SmallVector<hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn, 64u>&) |
704 | | |
705 | | template <class Traits> |
706 | | ExecutionStatus Context<Traits>::performEnterNonGreedyLoop( |
707 | | State<Traits> *s, |
708 | | const BeginLoopInsn *loop, |
709 | | uint32_t bodyIp, |
710 | | LoopData loopData, |
711 | 0 | BacktrackStack &backtrackStack) { |
712 | 0 | assert(loop->opcode == Opcode::BeginLoop && "Not a BeginLoopInsn"); |
713 | 0 | s->getLoop(loop->loopId) = loopData; |
714 | | |
715 | | // Set the IP and input position, and initialize the state for entering the |
716 | | // loop. |
717 | 0 | s->ip_ = bodyIp; |
718 | 0 | s->cursor_.setCurrentPointer(first_ + loopData.entryPosition); |
719 | 0 | return prepareToEnterLoopBody(s, loop, backtrackStack); |
720 | 0 | } Unexecuted instantiation: hermes::regex::Context<hermes::regex::UTF16RegexTraits>::performEnterNonGreedyLoop(hermes::regex::State<hermes::regex::UTF16RegexTraits>*, hermes::regex::BeginLoopInsn const*, unsigned int, hermes::regex::LoopData, llvh::SmallVector<hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn, 64u>&) Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::performEnterNonGreedyLoop(hermes::regex::State<hermes::regex::ASCIIRegexTraits>*, hermes::regex::BeginLoopInsn const*, unsigned int, hermes::regex::LoopData, llvh::SmallVector<hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn, 64u>&) |
721 | | |
722 | | template <class Traits> |
723 | | ExecutorResult<bool> Context<Traits>::backtrack( |
724 | | BacktrackStack &bts, |
725 | 122 | State<Traits> *s) { |
726 | 158 | while (!bts.empty()) { |
727 | 72 | BacktrackInsn &binsn = bts.back(); |
728 | 72 | switch (binsn.op) { |
729 | 0 | case BacktrackOp::SetCaptureGroup: |
730 | 0 | s->getCapturedRange(binsn.setCaptureGroup.mexp) = |
731 | 0 | binsn.setCaptureGroup.range; |
732 | 0 | bts.pop_back(); |
733 | 0 | break; |
734 | | |
735 | 0 | case BacktrackOp::SetLoopData: |
736 | 0 | s->getLoop(binsn.setLoopData.loopId) = binsn.setLoopData.loopData; |
737 | 0 | bts.pop_back(); |
738 | 0 | break; |
739 | | |
740 | 0 | case BacktrackOp::SetPosition: |
741 | 0 | s->cursor_.setCurrentPointer(binsn.setPosition.value); |
742 | 0 | s->ip_ = binsn.setPosition.ip; |
743 | 0 | bts.pop_back(); |
744 | 0 | return true; |
745 | | |
746 | 0 | case BacktrackOp::EnterNonGreedyLoop: { |
747 | 0 | auto fields = binsn.enterNonGreedyLoop; |
748 | 0 | bts.pop_back(); |
749 | 0 | auto res = performEnterNonGreedyLoop( |
750 | 0 | s, fields.loopInsn, fields.bodyIp, fields.loopData, bts); |
751 | 0 | if (res != ExecutionStatus::RETURNED) { |
752 | 0 | return res; |
753 | 0 | } |
754 | 0 | return true; |
755 | 0 | } |
756 | | |
757 | 72 | case BacktrackOp::GreedyWidth1Loop: |
758 | 72 | case BacktrackOp::NongreedyWidth1Loop: { |
759 | | // In both of these instructions, we have a range [min, max] containing |
760 | | // possible match locations, and the match failed at the max location |
761 | | // (if we are greedy) or the min location (nongreedy). Backtrack by |
762 | | // decrementing the max (incrementing the min) if we are greedy |
763 | | // (nongreedy), setting the IP to that location, and jumping to the loop |
764 | | // exit. Note that if we are tracking backwards (lookbehind assertion) |
765 | | // our maximum is before our minimum, so we have to reverse the |
766 | | // direction of increment/decrement. |
767 | 72 | bool forwards = s->cursor_.forwards(); |
768 | 72 | assert( |
769 | 72 | (forwards ? binsn.width1Loop.min <= binsn.width1Loop.max |
770 | 72 | : binsn.width1Loop.min >= binsn.width1Loop.max) && |
771 | 72 | "Loop min should be <= max (or >= max if backwards)"); |
772 | 72 | if (binsn.width1Loop.min == binsn.width1Loop.max) { |
773 | | // We have backtracked as far as possible. Give up. |
774 | 36 | bts.pop_back(); |
775 | 36 | break; |
776 | 36 | } |
777 | 36 | if (binsn.op == BacktrackOp::GreedyWidth1Loop) { |
778 | 36 | binsn.width1Loop.max += forwards ? -1 : 1; |
779 | 36 | s->cursor_.setCurrentPointer(binsn.width1Loop.max); |
780 | 36 | } else { |
781 | 0 | binsn.width1Loop.min += forwards ? 1 : -1; |
782 | 0 | s->cursor_.setCurrentPointer(binsn.width1Loop.min); |
783 | 0 | } |
784 | 36 | s->ip_ = binsn.width1Loop.continuation; |
785 | 36 | return true; |
786 | 72 | } |
787 | 72 | } |
788 | 72 | } |
789 | | // Exhausted the backtracking stack. |
790 | 86 | return false; |
791 | 122 | } hermes::regex::Context<hermes::regex::UTF16RegexTraits>::backtrack(llvh::SmallVector<hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn, 64u>&, hermes::regex::State<hermes::regex::UTF16RegexTraits>*) Line | Count | Source | 725 | 122 | State<Traits> *s) { | 726 | 158 | while (!bts.empty()) { | 727 | 72 | BacktrackInsn &binsn = bts.back(); | 728 | 72 | switch (binsn.op) { | 729 | 0 | case BacktrackOp::SetCaptureGroup: | 730 | 0 | s->getCapturedRange(binsn.setCaptureGroup.mexp) = | 731 | 0 | binsn.setCaptureGroup.range; | 732 | 0 | bts.pop_back(); | 733 | 0 | break; | 734 | | | 735 | 0 | case BacktrackOp::SetLoopData: | 736 | 0 | s->getLoop(binsn.setLoopData.loopId) = binsn.setLoopData.loopData; | 737 | 0 | bts.pop_back(); | 738 | 0 | break; | 739 | | | 740 | 0 | case BacktrackOp::SetPosition: | 741 | 0 | s->cursor_.setCurrentPointer(binsn.setPosition.value); | 742 | 0 | s->ip_ = binsn.setPosition.ip; | 743 | 0 | bts.pop_back(); | 744 | 0 | return true; | 745 | | | 746 | 0 | case BacktrackOp::EnterNonGreedyLoop: { | 747 | 0 | auto fields = binsn.enterNonGreedyLoop; | 748 | 0 | bts.pop_back(); | 749 | 0 | auto res = performEnterNonGreedyLoop( | 750 | 0 | s, fields.loopInsn, fields.bodyIp, fields.loopData, bts); | 751 | 0 | if (res != ExecutionStatus::RETURNED) { | 752 | 0 | return res; | 753 | 0 | } | 754 | 0 | return true; | 755 | 0 | } | 756 | | | 757 | 72 | case BacktrackOp::GreedyWidth1Loop: | 758 | 72 | case BacktrackOp::NongreedyWidth1Loop: { | 759 | | // In both of these instructions, we have a range [min, max] containing | 760 | | // possible match locations, and the match failed at the max location | 761 | | // (if we are greedy) or the min location (nongreedy). Backtrack by | 762 | | // decrementing the max (incrementing the min) if we are greedy | 763 | | // (nongreedy), setting the IP to that location, and jumping to the loop | 764 | | // exit. Note that if we are tracking backwards (lookbehind assertion) | 765 | | // our maximum is before our minimum, so we have to reverse the | 766 | | // direction of increment/decrement. | 767 | 72 | bool forwards = s->cursor_.forwards(); | 768 | 72 | assert( | 769 | 72 | (forwards ? binsn.width1Loop.min <= binsn.width1Loop.max | 770 | 72 | : binsn.width1Loop.min >= binsn.width1Loop.max) && | 771 | 72 | "Loop min should be <= max (or >= max if backwards)"); | 772 | 72 | if (binsn.width1Loop.min == binsn.width1Loop.max) { | 773 | | // We have backtracked as far as possible. Give up. | 774 | 36 | bts.pop_back(); | 775 | 36 | break; | 776 | 36 | } | 777 | 36 | if (binsn.op == BacktrackOp::GreedyWidth1Loop) { | 778 | 36 | binsn.width1Loop.max += forwards ? -1 : 1; | 779 | 36 | s->cursor_.setCurrentPointer(binsn.width1Loop.max); | 780 | 36 | } else { | 781 | 0 | binsn.width1Loop.min += forwards ? 1 : -1; | 782 | 0 | s->cursor_.setCurrentPointer(binsn.width1Loop.min); | 783 | 0 | } | 784 | 36 | s->ip_ = binsn.width1Loop.continuation; | 785 | 36 | return true; | 786 | 72 | } | 787 | 72 | } | 788 | 72 | } | 789 | | // Exhausted the backtracking stack. | 790 | 86 | return false; | 791 | 122 | } |
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::backtrack(llvh::SmallVector<hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn, 64u>&, hermes::regex::State<hermes::regex::ASCIIRegexTraits>*) |
792 | | |
793 | | template <class Traits> |
794 | | template <Width1Opcode w1opcode> |
795 | 122 | bool Context<Traits>::matchWidth1(const Insn *base, CodeUnit c) const { |
796 | | // Note this switch should resolve at compile time. |
797 | 122 | assert( |
798 | 122 | base->opcode == static_cast<Opcode>(w1opcode) && |
799 | 122 | "Instruction has wrong opcode"); |
800 | 122 | switch (w1opcode) { |
801 | 0 | case Width1Opcode::MatchChar8: { |
802 | 0 | const auto *insn = llvh::cast<MatchChar8Insn>(base); |
803 | 0 | return c == insn->c; |
804 | 0 | } |
805 | | |
806 | 0 | case Width1Opcode::MatchChar16: { |
807 | 0 | const auto *insn = llvh::cast<MatchChar16Insn>(base); |
808 | 0 | return c == insn->c; |
809 | 0 | } |
810 | | |
811 | 0 | case Width1Opcode::MatchCharICase8: { |
812 | 0 | const auto *insn = llvh::cast<MatchCharICase8Insn>(base); |
813 | 0 | return c == (CodePoint)insn->c || |
814 | 0 | (CodePoint)traits_.canonicalize(c, syntaxFlags_.unicode) == |
815 | 0 | (CodePoint)insn->c; |
816 | 0 | } |
817 | | |
818 | 0 | case Width1Opcode::MatchCharICase16: { |
819 | 0 | const auto *insn = llvh::cast<MatchCharICase16Insn>(base); |
820 | 0 | return c == insn->c || |
821 | 0 | (char32_t)traits_.canonicalize(c, syntaxFlags_.unicode) == |
822 | 0 | (char32_t)insn->c; |
823 | 0 | } |
824 | | |
825 | 0 | case Width1Opcode::MatchAny: |
826 | 0 | return true; |
827 | | |
828 | 0 | case Width1Opcode::MatchAnyButNewline: |
829 | 0 | return !isLineTerminator(c); |
830 | | |
831 | 122 | case Width1Opcode::Bracket: { |
832 | | // BracketInsn is followed by a list of BracketRange32s. |
833 | 122 | assert( |
834 | 122 | !(syntaxFlags_.unicode) && |
835 | 122 | "Unicode should not be set for Width 1 brackets"); |
836 | 122 | const BracketInsn *insn = llvh::cast<BracketInsn>(base); |
837 | 122 | const BracketRange32 *ranges = |
838 | 122 | reinterpret_cast<const BracketRange32 *>(insn + 1); |
839 | 122 | return bracketMatchesChar<Traits>(*this, insn, ranges, c); |
840 | 122 | } |
841 | 122 | } |
842 | 122 | llvm_unreachable("Invalid width 1 opcode"); |
843 | 122 | } Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)3>(hermes::regex::Insn const*, char16_t) const Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)5>(hermes::regex::Insn const*, char16_t) const Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)7>(hermes::regex::Insn const*, char16_t) const Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)8>(hermes::regex::Insn const*, char16_t) const Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)12>(hermes::regex::Insn const*, char16_t) const Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)13>(hermes::regex::Insn const*, char16_t) const bool hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)17>(hermes::regex::Insn const*, char16_t) const Line | Count | Source | 795 | 122 | bool Context<Traits>::matchWidth1(const Insn *base, CodeUnit c) const { | 796 | | // Note this switch should resolve at compile time. | 797 | 122 | assert( | 798 | 122 | base->opcode == static_cast<Opcode>(w1opcode) && | 799 | 122 | "Instruction has wrong opcode"); | 800 | 122 | switch (w1opcode) { | 801 | 0 | case Width1Opcode::MatchChar8: { | 802 | 0 | const auto *insn = llvh::cast<MatchChar8Insn>(base); | 803 | 0 | return c == insn->c; | 804 | 0 | } | 805 | | | 806 | 0 | case Width1Opcode::MatchChar16: { | 807 | 0 | const auto *insn = llvh::cast<MatchChar16Insn>(base); | 808 | 0 | return c == insn->c; | 809 | 0 | } | 810 | | | 811 | 0 | case Width1Opcode::MatchCharICase8: { | 812 | 0 | const auto *insn = llvh::cast<MatchCharICase8Insn>(base); | 813 | 0 | return c == (CodePoint)insn->c || | 814 | 0 | (CodePoint)traits_.canonicalize(c, syntaxFlags_.unicode) == | 815 | 0 | (CodePoint)insn->c; | 816 | 0 | } | 817 | | | 818 | 0 | case Width1Opcode::MatchCharICase16: { | 819 | 0 | const auto *insn = llvh::cast<MatchCharICase16Insn>(base); | 820 | 0 | return c == insn->c || | 821 | 0 | (char32_t)traits_.canonicalize(c, syntaxFlags_.unicode) == | 822 | 0 | (char32_t)insn->c; | 823 | 0 | } | 824 | | | 825 | 0 | case Width1Opcode::MatchAny: | 826 | 0 | return true; | 827 | | | 828 | 0 | case Width1Opcode::MatchAnyButNewline: | 829 | 0 | return !isLineTerminator(c); | 830 | | | 831 | 122 | case Width1Opcode::Bracket: { | 832 | | // BracketInsn is followed by a list of BracketRange32s. | 833 | 122 | assert( | 834 | 122 | !(syntaxFlags_.unicode) && | 835 | 122 | "Unicode should not be set for Width 1 brackets"); | 836 | 122 | const BracketInsn *insn = llvh::cast<BracketInsn>(base); | 837 | 122 | const BracketRange32 *ranges = | 838 | 122 | reinterpret_cast<const BracketRange32 *>(insn + 1); | 839 | 122 | return bracketMatchesChar<Traits>(*this, insn, ranges, c); | 840 | 122 | } | 841 | 122 | } | 842 | 0 | llvm_unreachable("Invalid width 1 opcode"); | 843 | 122 | } |
Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)3>(hermes::regex::Insn const*, char) const Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)5>(hermes::regex::Insn const*, char) const Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)7>(hermes::regex::Insn const*, char) const Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)8>(hermes::regex::Insn const*, char) const Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)12>(hermes::regex::Insn const*, char) const Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)13>(hermes::regex::Insn const*, char) const Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)17>(hermes::regex::Insn const*, char) const |
844 | | |
845 | | template <class Traits> |
846 | | template <Width1Opcode w1opcode> |
847 | | uint32_t Context<Traits>::matchWidth1LoopBody( |
848 | | const Insn *insn, |
849 | | Cursor<Traits> c, |
850 | 86 | uint32_t max) { |
851 | 86 | uint32_t iters = 0; |
852 | 122 | for (; iters < max; iters++) { |
853 | 122 | if (!matchWidth1<w1opcode>(insn, c.consume())) |
854 | 86 | break; |
855 | 122 | } |
856 | 86 | return iters; |
857 | 86 | } Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)7>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>, unsigned int) Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)8>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>, unsigned int) Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)12>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>, unsigned int) Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)13>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>, unsigned int) Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)3>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>, unsigned int) Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)5>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>, unsigned int) unsigned int hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)17>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>, unsigned int) Line | Count | Source | 850 | 86 | uint32_t max) { | 851 | 86 | uint32_t iters = 0; | 852 | 122 | for (; iters < max; iters++) { | 853 | 122 | if (!matchWidth1<w1opcode>(insn, c.consume())) | 854 | 86 | break; | 855 | 122 | } | 856 | 86 | return iters; | 857 | 86 | } |
Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)7>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>, unsigned int) Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)8>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>, unsigned int) Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)12>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>, unsigned int) Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)13>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>, unsigned int) Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)3>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>, unsigned int) Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)5>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>, unsigned int) Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)17>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>, unsigned int) |
858 | | |
859 | | template <class Traits> |
860 | | ExecutorResult<bool> Context<Traits>::matchWidth1Loop( |
861 | | const Width1LoopInsn *insn, |
862 | | State<Traits> *s, |
863 | 86 | BacktrackStack &bts) { |
864 | | // Note we copy the cursor here. |
865 | 86 | Cursor<Traits> c = s->cursor_; |
866 | 86 | uint32_t matched = 0, minMatch = insn->min, maxMatch = insn->max; |
867 | | |
868 | | // Limit our max to the smaller of the maximum in the loop and number of |
869 | | // number of characters remaining. This allows us to avoid having to test for |
870 | | // end of input in the loop body. |
871 | 86 | maxMatch = std::min(c.remaining(), maxMatch); |
872 | | |
873 | | // The loop body follows the loop instruction. |
874 | 86 | const Insn *body = static_cast<const Insn *>(&insn[1]); |
875 | | |
876 | | // Match as far as we can up to maxMatch. Note we do this even if the loop is |
877 | | // non-greedy: we compute how far we might conceivably have to backtrack |
878 | | // (except in non-greedy loops we're "backtracking" by moving forwards). |
879 | 86 | using W1 = Width1Opcode; |
880 | 86 | switch (static_cast<Width1Opcode>(body->opcode)) { |
881 | 0 | case W1::MatchChar8: |
882 | 0 | matched = matchWidth1LoopBody<W1::MatchChar8>(body, c, maxMatch); |
883 | 0 | break; |
884 | 0 | case W1::MatchChar16: |
885 | 0 | matched = matchWidth1LoopBody<W1::MatchChar16>(body, c, maxMatch); |
886 | 0 | break; |
887 | 0 | case W1::MatchCharICase8: |
888 | 0 | matched = matchWidth1LoopBody<W1::MatchCharICase8>(body, c, maxMatch); |
889 | 0 | break; |
890 | 0 | case W1::MatchCharICase16: |
891 | 0 | matched = matchWidth1LoopBody<W1::MatchCharICase16>(body, c, maxMatch); |
892 | 0 | break; |
893 | 0 | case W1::MatchAny: |
894 | 0 | matched = matchWidth1LoopBody<W1::MatchAny>(body, c, maxMatch); |
895 | 0 | break; |
896 | 0 | case W1::MatchAnyButNewline: |
897 | 0 | matched = matchWidth1LoopBody<W1::MatchAnyButNewline>(body, c, maxMatch); |
898 | 0 | break; |
899 | 86 | case W1::Bracket: |
900 | 86 | matched = matchWidth1LoopBody<W1::Bracket>(body, c, maxMatch); |
901 | 86 | break; |
902 | 86 | } |
903 | | |
904 | | // If we iterated less than the minimum, we failed to match. |
905 | 86 | if (matched < minMatch) { |
906 | 0 | return false; |
907 | 0 | } |
908 | 86 | assert( |
909 | 86 | minMatch <= matched && matched <= maxMatch && |
910 | 86 | "matched should be between min and max match count"); |
911 | | |
912 | | // Now we know the valid match range. |
913 | | // Compute the beginning and end pointers in this range. |
914 | 86 | bool forwards = s->cursor_.forwards(); |
915 | 86 | const CodeUnit *pos = s->cursor_.currentPointer(); |
916 | 86 | const CodeUnit *minPos = forwards ? pos + minMatch : pos - minMatch; |
917 | 86 | const CodeUnit *maxPos = forwards ? pos + matched : pos - matched; |
918 | | |
919 | | // If min == max (e.g. /a{3}/) then no backtracking is possible. If min < max, |
920 | | // backtracking is possible and we need to add a backtracking instruction. |
921 | 86 | if (minMatch < matched) { |
922 | 36 | BacktrackInsn backtrack{ |
923 | 36 | insn->greedy ? BacktrackOp::GreedyWidth1Loop |
924 | 36 | : BacktrackOp::NongreedyWidth1Loop}; |
925 | 36 | backtrack.width1Loop.continuation = insn->notTakenTarget; |
926 | 36 | backtrack.width1Loop.min = minPos; |
927 | 36 | backtrack.width1Loop.max = maxPos; |
928 | 36 | auto res = pushBacktrack(bts, backtrack); |
929 | 36 | if (res != ExecutionStatus::RETURNED) |
930 | 0 | return res; |
931 | 36 | } |
932 | | // Set the state's current position to either the minimum or maximum location, |
933 | | // and point it to the exit of the loop. |
934 | 86 | s->cursor_.setCurrentPointer(insn->greedy ? maxPos : minPos); |
935 | 86 | s->ip_ = insn->notTakenTarget; |
936 | 86 | return true; |
937 | 86 | } hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1Loop(hermes::regex::Width1LoopInsn const*, hermes::regex::State<hermes::regex::UTF16RegexTraits>*, llvh::SmallVector<hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn, 64u>&) Line | Count | Source | 863 | 86 | BacktrackStack &bts) { | 864 | | // Note we copy the cursor here. | 865 | 86 | Cursor<Traits> c = s->cursor_; | 866 | 86 | uint32_t matched = 0, minMatch = insn->min, maxMatch = insn->max; | 867 | | | 868 | | // Limit our max to the smaller of the maximum in the loop and number of | 869 | | // number of characters remaining. This allows us to avoid having to test for | 870 | | // end of input in the loop body. | 871 | 86 | maxMatch = std::min(c.remaining(), maxMatch); | 872 | | | 873 | | // The loop body follows the loop instruction. | 874 | 86 | const Insn *body = static_cast<const Insn *>(&insn[1]); | 875 | | | 876 | | // Match as far as we can up to maxMatch. Note we do this even if the loop is | 877 | | // non-greedy: we compute how far we might conceivably have to backtrack | 878 | | // (except in non-greedy loops we're "backtracking" by moving forwards). | 879 | 86 | using W1 = Width1Opcode; | 880 | 86 | switch (static_cast<Width1Opcode>(body->opcode)) { | 881 | 0 | case W1::MatchChar8: | 882 | 0 | matched = matchWidth1LoopBody<W1::MatchChar8>(body, c, maxMatch); | 883 | 0 | break; | 884 | 0 | case W1::MatchChar16: | 885 | 0 | matched = matchWidth1LoopBody<W1::MatchChar16>(body, c, maxMatch); | 886 | 0 | break; | 887 | 0 | case W1::MatchCharICase8: | 888 | 0 | matched = matchWidth1LoopBody<W1::MatchCharICase8>(body, c, maxMatch); | 889 | 0 | break; | 890 | 0 | case W1::MatchCharICase16: | 891 | 0 | matched = matchWidth1LoopBody<W1::MatchCharICase16>(body, c, maxMatch); | 892 | 0 | break; | 893 | 0 | case W1::MatchAny: | 894 | 0 | matched = matchWidth1LoopBody<W1::MatchAny>(body, c, maxMatch); | 895 | 0 | break; | 896 | 0 | case W1::MatchAnyButNewline: | 897 | 0 | matched = matchWidth1LoopBody<W1::MatchAnyButNewline>(body, c, maxMatch); | 898 | 0 | break; | 899 | 86 | case W1::Bracket: | 900 | 86 | matched = matchWidth1LoopBody<W1::Bracket>(body, c, maxMatch); | 901 | 86 | break; | 902 | 86 | } | 903 | | | 904 | | // If we iterated less than the minimum, we failed to match. | 905 | 86 | if (matched < minMatch) { | 906 | 0 | return false; | 907 | 0 | } | 908 | 86 | assert( | 909 | 86 | minMatch <= matched && matched <= maxMatch && | 910 | 86 | "matched should be between min and max match count"); | 911 | | | 912 | | // Now we know the valid match range. | 913 | | // Compute the beginning and end pointers in this range. | 914 | 86 | bool forwards = s->cursor_.forwards(); | 915 | 86 | const CodeUnit *pos = s->cursor_.currentPointer(); | 916 | 86 | const CodeUnit *minPos = forwards ? pos + minMatch : pos - minMatch; | 917 | 86 | const CodeUnit *maxPos = forwards ? pos + matched : pos - matched; | 918 | | | 919 | | // If min == max (e.g. /a{3}/) then no backtracking is possible. If min < max, | 920 | | // backtracking is possible and we need to add a backtracking instruction. | 921 | 86 | if (minMatch < matched) { | 922 | 36 | BacktrackInsn backtrack{ | 923 | 36 | insn->greedy ? BacktrackOp::GreedyWidth1Loop | 924 | 36 | : BacktrackOp::NongreedyWidth1Loop}; | 925 | 36 | backtrack.width1Loop.continuation = insn->notTakenTarget; | 926 | 36 | backtrack.width1Loop.min = minPos; | 927 | 36 | backtrack.width1Loop.max = maxPos; | 928 | 36 | auto res = pushBacktrack(bts, backtrack); | 929 | 36 | if (res != ExecutionStatus::RETURNED) | 930 | 0 | return res; | 931 | 36 | } | 932 | | // Set the state's current position to either the minimum or maximum location, | 933 | | // and point it to the exit of the loop. | 934 | 86 | s->cursor_.setCurrentPointer(insn->greedy ? maxPos : minPos); | 935 | 86 | s->ip_ = insn->notTakenTarget; | 936 | 86 | return true; | 937 | 86 | } |
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1Loop(hermes::regex::Width1LoopInsn const*, hermes::regex::State<hermes::regex::ASCIIRegexTraits>*, llvh::SmallVector<hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn, 64u>&) |
938 | | |
939 | | /// ES6 21.2.5.2.3. Effectively this skips surrogate pairs if the regexp has the |
940 | | /// Unicode flag set. |
941 | | template <class Traits> |
942 | | inline size_t Context<Traits>::advanceStringIndex( |
943 | | const CodeUnit *start, |
944 | | size_t index, |
945 | 86 | size_t length) const { |
946 | 86 | if (sizeof(CodeUnit) == 1) { |
947 | | // The input string is ASCII and therefore cannot have surrogate pairs. |
948 | 0 | return index + 1; |
949 | 0 | } |
950 | | // "If unicode is false, return index+1." |
951 | | // "If index+1 >= length, return index+1." |
952 | 86 | if (LLVM_LIKELY(!(syntaxFlags_.unicode)) || (index + 1 >= length)) |
953 | 86 | return index + 1; |
954 | | |
955 | | // Let first be the code unit value at index index in S |
956 | | // If first < 0xD800 or first > 0xDBFF, return index+1 |
957 | | // Let second be the code unit value at index index+1 in S. |
958 | | // If second < 0xDC00 or second > 0xDFFF, return index+1. |
959 | 0 | CodeUnit first = start[index]; |
960 | 0 | CodeUnit second = start[index + 1]; |
961 | 0 | if (LLVM_LIKELY(!isHighSurrogate(first)) || |
962 | 0 | LLVM_LIKELY(!isLowSurrogate(second))) { |
963 | 0 | return index + 1; |
964 | 0 | } |
965 | | // Return index+2. |
966 | 0 | return index + 2; |
967 | 0 | } hermes::regex::Context<hermes::regex::UTF16RegexTraits>::advanceStringIndex(char16_t const*, unsigned long, unsigned long) const Line | Count | Source | 945 | 86 | size_t length) const { | 946 | 86 | if (sizeof(CodeUnit) == 1) { | 947 | | // The input string is ASCII and therefore cannot have surrogate pairs. | 948 | 0 | return index + 1; | 949 | 0 | } | 950 | | // "If unicode is false, return index+1." | 951 | | // "If index+1 >= length, return index+1." | 952 | 86 | if (LLVM_LIKELY(!(syntaxFlags_.unicode)) || (index + 1 >= length)) | 953 | 86 | return index + 1; | 954 | | | 955 | | // Let first be the code unit value at index index in S | 956 | | // If first < 0xD800 or first > 0xDBFF, return index+1 | 957 | | // Let second be the code unit value at index index+1 in S. | 958 | | // If second < 0xDC00 or second > 0xDFFF, return index+1. | 959 | 0 | CodeUnit first = start[index]; | 960 | 0 | CodeUnit second = start[index + 1]; | 961 | 0 | if (LLVM_LIKELY(!isHighSurrogate(first)) || | 962 | 0 | LLVM_LIKELY(!isLowSurrogate(second))) { | 963 | 0 | return index + 1; | 964 | 0 | } | 965 | | // Return index+2. | 966 | 0 | return index + 2; | 967 | 0 | } |
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::advanceStringIndex(char const*, unsigned long, unsigned long) const |
968 | | |
969 | | template <class Traits> |
970 | | auto Context<Traits>::match(State<Traits> *s, bool onlyAtStart) |
971 | 86 | -> ExecutorResult<const CodeUnit *> { |
972 | 86 | using State = State<Traits>; |
973 | 86 | BacktrackStack backtrackStack; |
974 | | |
975 | | // We'll refer to the cursor often. |
976 | 86 | Cursor<Traits> &c = s->cursor_; |
977 | | |
978 | | // Pull out the instruction portion of the bytecode, following the header. |
979 | 86 | const uint8_t *const bytecode = &bytecodeStream_[sizeof(RegexBytecodeHeader)]; |
980 | | |
981 | | // Save the incoming IP in case we have to loop. |
982 | 86 | const auto startIp = s->ip_; |
983 | | |
984 | 86 | const CodeUnit *const startLoc = c.currentPointer(); |
985 | | |
986 | | // Use offsetFromRight() instead of remaining() here so that the length passed |
987 | | // to advanceStringIndex is accurate even when the cursor is going backwards. |
988 | 86 | const size_t charsToRight = c.offsetFromRight(); |
989 | | |
990 | | // Decide how many locations we'll need to check. |
991 | | // Note that we do want to check the empty range at the end, so add one to |
992 | | // charsToRight. |
993 | 86 | const size_t locsToCheckCount = onlyAtStart ? 1 : 1 + charsToRight; |
994 | | |
995 | | // If we are tracking backwards, we should only ever have one potential match |
996 | | // location. This is because advanceStringIndex only ever tracks forwards. |
997 | 86 | assert( |
998 | 86 | (c.forwards() || locsToCheckCount == 1) && |
999 | 86 | "Can only check one location when cursor is backwards"); |
1000 | | |
1001 | | #ifndef HERMES_CHECK_NATIVE_STACK |
1002 | | ++overflowGuard_.callDepth; |
1003 | | auto decrement = |
1004 | | llvh::make_scope_exit([this] { --overflowGuard_.callDepth; }); |
1005 | | #endif |
1006 | | |
1007 | | // Make sure we are not exceeding the set limit of the amount of times we can |
1008 | | // recurse. |
1009 | 86 | if (overflowGuard_.isOverflowing()) { |
1010 | 0 | return ExecutionStatus::STACK_OVERFLOW; |
1011 | 0 | } |
1012 | | |
1013 | | // Macro used when a state fails to match. |
1014 | 86 | #define BACKTRACK() \ |
1015 | 122 | do { \ |
1016 | 122 | auto btRes = backtrack(backtrackStack, s); \ |
1017 | 122 | if (LLVM_UNLIKELY(!btRes)) \ |
1018 | 122 | return btRes.getStatus(); \ |
1019 | 122 | if (*btRes) \ |
1020 | 122 | goto backtrackingSucceeded; \ |
1021 | 122 | goto backtrackingExhausted; \ |
1022 | 122 | } while (0) |
1023 | | |
1024 | 172 | for (size_t locIndex = 0; locIndex < locsToCheckCount; |
1025 | 86 | locIndex = advanceStringIndex(startLoc, locIndex, charsToRight)) { |
1026 | 86 | const CodeUnit *potentialMatchLocation = startLoc + locIndex; |
1027 | 86 | c.setCurrentPointer(potentialMatchLocation); |
1028 | 86 | s->ip_ = startIp; |
1029 | 122 | backtrackingSucceeded: |
1030 | 294 | for (;;) { |
1031 | 294 | const Insn *base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]); |
1032 | 294 | switch (base->opcode) { |
1033 | 0 | case Opcode::Goal: |
1034 | 0 | return potentialMatchLocation; |
1035 | | |
1036 | 86 | case Opcode::LeftAnchor: |
1037 | 86 | if (!matchesLeftAnchor(*this, *s)) |
1038 | 0 | BACKTRACK(); |
1039 | 86 | s->ip_ += sizeof(LeftAnchorInsn); |
1040 | 86 | break; |
1041 | | |
1042 | 0 | case Opcode::RightAnchor: |
1043 | 0 | if (!matchesRightAnchor(*this, *s)) |
1044 | 0 | BACKTRACK(); |
1045 | 0 | s->ip_ += sizeof(RightAnchorInsn); |
1046 | 0 | break; |
1047 | | |
1048 | 0 | case Opcode::MatchAny: |
1049 | 0 | if (c.atEnd() || |
1050 | 0 | !matchWidth1<Width1Opcode::MatchAny>(base, c.consume())) |
1051 | 0 | BACKTRACK(); |
1052 | 0 | s->ip_ += sizeof(MatchAnyInsn); |
1053 | 0 | break; |
1054 | | |
1055 | 0 | case Opcode::U16MatchAny: |
1056 | 0 | if (c.atEnd()) |
1057 | 0 | BACKTRACK(); |
1058 | 0 | c.consumeUTF16(); |
1059 | 0 | s->ip_ += sizeof(U16MatchAnyInsn); |
1060 | 0 | break; |
1061 | | |
1062 | 0 | case Opcode::MatchAnyButNewline: |
1063 | 0 | if (c.atEnd() || |
1064 | 0 | !matchWidth1<Width1Opcode::MatchAnyButNewline>(base, c.consume())) |
1065 | 0 | BACKTRACK(); |
1066 | 0 | s->ip_ += sizeof(MatchAnyButNewlineInsn); |
1067 | 0 | break; |
1068 | | |
1069 | 0 | case Opcode::U16MatchAnyButNewline: |
1070 | 0 | if (c.atEnd() || isLineTerminator(c.consumeUTF16())) |
1071 | 0 | BACKTRACK(); |
1072 | 0 | s->ip_ += sizeof(U16MatchAnyButNewlineInsn); |
1073 | 0 | break; |
1074 | | |
1075 | 0 | case Opcode::MatchChar8: { |
1076 | 0 | if (c.atEnd() || |
1077 | 0 | !matchWidth1<Width1Opcode::MatchChar8>(base, c.consume())) |
1078 | 0 | BACKTRACK(); |
1079 | 0 | s->ip_ += sizeof(MatchChar8Insn); |
1080 | 0 | break; |
1081 | 0 | } |
1082 | | |
1083 | 0 | case Opcode::MatchChar16: { |
1084 | 0 | if (c.atEnd() || |
1085 | 0 | !matchWidth1<Width1Opcode::MatchChar16>(base, c.consume())) |
1086 | 0 | BACKTRACK(); |
1087 | 0 | s->ip_ += sizeof(MatchChar16Insn); |
1088 | 0 | break; |
1089 | 0 | } |
1090 | | |
1091 | 0 | case Opcode::U16MatchChar32: { |
1092 | 0 | const auto *insn = llvh::cast<U16MatchChar32Insn>(base); |
1093 | 0 | if (c.atEnd() || c.consumeUTF16() != (CodePoint)insn->c) |
1094 | 0 | BACKTRACK(); |
1095 | 0 | s->ip_ += sizeof(U16MatchChar32Insn); |
1096 | 0 | break; |
1097 | 0 | } |
1098 | | |
1099 | 0 | case Opcode::MatchCharICase8: { |
1100 | 0 | if (c.atEnd() || |
1101 | 0 | !matchWidth1<Width1Opcode::MatchCharICase8>(base, c.consume())) |
1102 | 0 | BACKTRACK(); |
1103 | 0 | s->ip_ += sizeof(MatchCharICase8Insn); |
1104 | 0 | break; |
1105 | 0 | } |
1106 | | |
1107 | 0 | case Opcode::MatchCharICase16: { |
1108 | 0 | if (c.atEnd() || |
1109 | 0 | !matchWidth1<Width1Opcode::MatchCharICase16>(base, c.consume())) |
1110 | 0 | BACKTRACK(); |
1111 | 0 | s->ip_ += sizeof(MatchCharICase16Insn); |
1112 | 0 | break; |
1113 | 0 | } |
1114 | | |
1115 | 0 | case Opcode::U16MatchCharICase32: { |
1116 | 0 | const auto *insn = llvh::cast<U16MatchCharICase32Insn>(base); |
1117 | 0 | bool matched = false; |
1118 | 0 | if (!c.atEnd()) { |
1119 | 0 | CodePoint cp = c.consumeUTF16(); |
1120 | 0 | matched = |
1121 | 0 | (cp == (CodePoint)insn->c || |
1122 | 0 | traits_.canonicalize(cp, true) == (CodePoint)insn->c); |
1123 | 0 | } |
1124 | 0 | if (!matched) |
1125 | 0 | BACKTRACK(); |
1126 | 0 | s->ip_ += sizeof(U16MatchCharICase32Insn); |
1127 | 0 | break; |
1128 | 0 | } |
1129 | | |
1130 | 122 | case Opcode::MatchNChar8: { |
1131 | 122 | const auto *insn = llvh::cast<MatchNChar8Insn>(base); |
1132 | 122 | if (c.remaining() < insn->charCount || !matchesNChar8(insn, *s)) |
1133 | 122 | BACKTRACK(); |
1134 | 0 | s->ip_ += insn->totalWidth(); |
1135 | 0 | break; |
1136 | 122 | } |
1137 | | |
1138 | 0 | case Opcode::MatchNCharICase8: { |
1139 | 0 | const auto *insn = llvh::cast<MatchNCharICase8Insn>(base); |
1140 | 0 | if (c.remaining() < insn->charCount || !matchesNCharICase8(insn, *s)) |
1141 | 0 | BACKTRACK(); |
1142 | 0 | s->ip_ += insn->totalWidth(); |
1143 | 0 | break; |
1144 | 0 | } |
1145 | | |
1146 | 0 | case Opcode::Alternation: { |
1147 | | // We have an alternation. Determine which of our first and second |
1148 | | // branches are viable. If both are, we have to split our state. |
1149 | 0 | const AlternationInsn *alt = llvh::cast<AlternationInsn>(base); |
1150 | 0 | bool primaryViable = |
1151 | 0 | c.satisfiesConstraints(flags_, alt->primaryConstraints); |
1152 | 0 | bool secondaryViable = |
1153 | 0 | c.satisfiesConstraints(flags_, alt->secondaryConstraints); |
1154 | 0 | if (primaryViable && secondaryViable) { |
1155 | | // We need to explore both branches. Explore the primary branch |
1156 | | // first, backtrack to the secondary one. |
1157 | 0 | s->ip_ += sizeof(AlternationInsn); |
1158 | 0 | auto res = pushBacktrack( |
1159 | 0 | backtrackStack, |
1160 | 0 | BacktrackInsn::makeSetPosition( |
1161 | 0 | alt->secondaryBranch, c.currentPointer())); |
1162 | 0 | if (res != ExecutionStatus::RETURNED) { |
1163 | 0 | return res; |
1164 | 0 | } |
1165 | 0 | } else if (primaryViable) { |
1166 | 0 | s->ip_ += sizeof(AlternationInsn); |
1167 | 0 | } else if (secondaryViable) { |
1168 | 0 | s->ip_ = alt->secondaryBranch; |
1169 | 0 | } else { |
1170 | 0 | BACKTRACK(); |
1171 | 0 | } |
1172 | 0 | break; |
1173 | 0 | } |
1174 | | |
1175 | 0 | case Opcode::Jump32: |
1176 | 0 | s->ip_ = llvh::cast<Jump32Insn>(base)->target; |
1177 | 0 | break; |
1178 | | |
1179 | 0 | case Opcode::Bracket: { |
1180 | 0 | if (c.atEnd() || |
1181 | 0 | !matchWidth1<Width1Opcode::Bracket>(base, c.consume())) |
1182 | 0 | BACKTRACK(); |
1183 | 0 | s->ip_ += llvh::cast<BracketInsn>(base)->totalWidth(); |
1184 | 0 | break; |
1185 | 0 | } |
1186 | | |
1187 | 0 | case Opcode::U16Bracket: { |
1188 | 0 | const U16BracketInsn *insn = llvh::cast<U16BracketInsn>(base); |
1189 | | // U16BracketInsn is followed by a list of BracketRange32s. |
1190 | 0 | const BracketRange32 *ranges = |
1191 | 0 | reinterpret_cast<const BracketRange32 *>(insn + 1); |
1192 | 0 | if (c.atEnd() || |
1193 | 0 | !bracketMatchesChar<Traits>( |
1194 | 0 | *this, insn, ranges, c.consumeUTF16())) |
1195 | 0 | BACKTRACK(); |
1196 | 0 | s->ip_ += insn->totalWidth(); |
1197 | 0 | break; |
1198 | 0 | } |
1199 | | |
1200 | 0 | case Opcode::WordBoundary: { |
1201 | 0 | const WordBoundaryInsn *insn = llvh::cast<WordBoundaryInsn>(base); |
1202 | 0 | const auto *charPointer = c.currentPointer(); |
1203 | |
|
1204 | 0 | bool prevIsWordchar = false; |
1205 | 0 | if (!c.atLeft()) |
1206 | 0 | prevIsWordchar = traits_.characterHasType( |
1207 | 0 | charPointer[-1], CharacterClass::Words); |
1208 | |
|
1209 | 0 | bool currentIsWordchar = false; |
1210 | 0 | if (!c.atRight()) |
1211 | 0 | currentIsWordchar = |
1212 | 0 | traits_.characterHasType(charPointer[0], CharacterClass::Words); |
1213 | |
|
1214 | 0 | bool isWordBoundary = (prevIsWordchar != currentIsWordchar); |
1215 | 0 | if (isWordBoundary ^ insn->invert) |
1216 | 0 | s->ip_ += sizeof(WordBoundaryInsn); |
1217 | 0 | else |
1218 | 0 | BACKTRACK(); |
1219 | 0 | break; |
1220 | 0 | } |
1221 | | |
1222 | 0 | case Opcode::BeginMarkedSubexpression: { |
1223 | 0 | const auto *insn = llvh::cast<BeginMarkedSubexpressionInsn>(base); |
1224 | 0 | auto res = pushBacktrack( |
1225 | 0 | backtrackStack, |
1226 | 0 | BacktrackInsn::makeSetCaptureGroup( |
1227 | 0 | insn->mexp, {kNotMatched, kNotMatched})); |
1228 | 0 | if (res != ExecutionStatus::RETURNED) { |
1229 | 0 | return res; |
1230 | 0 | } |
1231 | | // When tracking backwards (in a lookbehind assertion) we traverse our |
1232 | | // input backwards, so set the end before the start. |
1233 | 0 | auto &range = s->getCapturedRange(insn->mexp); |
1234 | 0 | if (c.forwards()) { |
1235 | 0 | range.start = c.offsetFromLeft(); |
1236 | 0 | } else { |
1237 | 0 | range.end = c.offsetFromLeft(); |
1238 | 0 | } |
1239 | 0 | s->ip_ += sizeof(BeginMarkedSubexpressionInsn); |
1240 | 0 | break; |
1241 | 0 | } |
1242 | | |
1243 | 0 | case Opcode::EndMarkedSubexpression: { |
1244 | 0 | const auto *insn = llvh::cast<EndMarkedSubexpressionInsn>(base); |
1245 | 0 | auto &range = s->getCapturedRange(insn->mexp); |
1246 | 0 | if (c.forwards()) { |
1247 | 0 | assert( |
1248 | 0 | range.start != kNotMatched && "Capture group was not entered"); |
1249 | 0 | range.end = c.offsetFromLeft(); |
1250 | 0 | } else { |
1251 | 0 | assert(range.end != kNotMatched && "Capture group was not entered"); |
1252 | 0 | range.start = c.offsetFromLeft(); |
1253 | 0 | } |
1254 | 0 | assert(range.start <= range.end && "Captured range end before start"); |
1255 | 0 | s->ip_ += sizeof(EndMarkedSubexpressionInsn); |
1256 | 0 | break; |
1257 | 0 | } |
1258 | | |
1259 | | // ES10 21.2.2.9.1 |
1260 | 0 | case Opcode::BackRef: { |
1261 | 0 | const auto insn = llvh::cast<BackRefInsn>(base); |
1262 | | // a. Let cap be x's captures List. |
1263 | | // b. Let s be cap[n]. |
1264 | 0 | CapturedRange cr = s->getCapturedRange(insn->mexp); |
1265 | | |
1266 | | // c. If s is undefined, return c(x). |
1267 | | // Note we have to check both cr.start and cr.end here. If we are |
1268 | | // currently in the middle of matching a capture group (going either |
1269 | | // forwards or backwards) we should just return success. |
1270 | 0 | if (cr.start == kNotMatched || cr.end == kNotMatched) { |
1271 | | // Backreferences to a capture group that did not match always |
1272 | | // succeed (ES10 21.2.2.9) |
1273 | 0 | s->ip_ += sizeof(BackRefInsn); |
1274 | 0 | break; |
1275 | 0 | } |
1276 | | |
1277 | | // TODO: this can be optimized by hoisting the branches out of the |
1278 | | // loop. |
1279 | 0 | bool icase = syntaxFlags_.ignoreCase; |
1280 | 0 | bool unicode = syntaxFlags_.unicode; |
1281 | 0 | auto capturedStart = first_ + cr.start; |
1282 | 0 | auto capturedEnd = first_ + cr.end; |
1283 | 0 | Cursor<Traits> cursor2( |
1284 | 0 | capturedStart, |
1285 | 0 | c.forwards() ? capturedStart : capturedEnd, |
1286 | 0 | capturedEnd, |
1287 | 0 | c.forwards()); |
1288 | 0 | Cursor<Traits> cursor1 = c; |
1289 | 0 | bool matched = true; |
1290 | 0 | while (matched && !cursor2.atEnd()) { |
1291 | 0 | if (cursor1.atEnd()) { |
1292 | 0 | matched = false; |
1293 | 0 | } else if (!icase) { |
1294 | | // Direct comparison. Here we don't need to decode surrogate |
1295 | | // pairs. |
1296 | 0 | matched = (cursor1.consume() == cursor2.consume()); |
1297 | 0 | } else if (!unicode) { |
1298 | | // Case-insensitive non-Unicode comparison, no decoding of |
1299 | | // surrogate pairs. |
1300 | 0 | auto c1 = cursor1.consume(); |
1301 | 0 | auto c2 = cursor2.consume(); |
1302 | 0 | matched = |
1303 | 0 | (c1 == c2 || |
1304 | 0 | traits_.canonicalize(c1, unicode) == |
1305 | 0 | traits_.canonicalize(c2, unicode)); |
1306 | 0 | } else { |
1307 | | // Unicode: we do need to decode surrogate pairs. |
1308 | 0 | auto cp1 = cursor1.consumeUTF16(); |
1309 | 0 | auto cp2 = cursor2.consumeUTF16(); |
1310 | 0 | matched = |
1311 | 0 | (cp1 == cp2 || |
1312 | 0 | traits_.canonicalize(cp1, unicode) == |
1313 | 0 | traits_.canonicalize(cp2, unicode)); |
1314 | 0 | } |
1315 | 0 | } |
1316 | 0 | if (!matched) { |
1317 | 0 | BACKTRACK(); |
1318 | 0 | } |
1319 | 0 | s->ip_ += sizeof(BackRefInsn); |
1320 | 0 | c.setCurrentPointer(cursor1.currentPointer()); |
1321 | 0 | break; |
1322 | 0 | } |
1323 | | |
1324 | 0 | case Opcode::Lookaround: { |
1325 | 0 | const LookaroundInsn *insn = llvh::cast<LookaroundInsn>(base); |
1326 | 0 | bool matched = false; |
1327 | 0 | if (c.satisfiesConstraints(flags_, insn->constraints)) { |
1328 | | // Copy the state. This is because if the match fails (or if we are |
1329 | | // inverted) we need to restore its capture groups. |
1330 | 0 | State savedState{*s}; |
1331 | | |
1332 | | // Set the direction of the cursor. |
1333 | 0 | c.setForwards(insn->forwards); |
1334 | | |
1335 | | // Invoke match() recursively with our expression. |
1336 | | // Save and restore the position because lookaheads do not consume |
1337 | | // anything. |
1338 | 0 | s->ip_ += sizeof(LookaroundInsn); |
1339 | 0 | auto match = this->match(s, true /* onlyAtStart */); |
1340 | | // If the match errored out due to stack overflow, then we need to |
1341 | | // return an error here as well. |
1342 | 0 | if (LLVM_UNLIKELY(!match)) { |
1343 | 0 | return match.getStatus(); |
1344 | 0 | } |
1345 | | // We got a match if the value is non-null. |
1346 | 0 | matched = match.getValue() != nullptr; |
1347 | 0 | c.setCurrentPointer(savedState.cursor_.currentPointer()); |
1348 | 0 | c.setForwards(savedState.cursor_.forwards()); |
1349 | | |
1350 | | // Restore capture groups unless we are a positive lookaround that |
1351 | | // successfully matched. If we are a successfully matching positive |
1352 | | // lookaround, set up backtracking to reset the capture groups. Note |
1353 | | // we never backtrack INTO a successfully matched lookahead: |
1354 | | // once a lookahead finds a match it forgets all other ways it could |
1355 | | // have matched. (ES 5.1 15.10.2.8 Note 2). |
1356 | 0 | if (matched && !insn->invert) { |
1357 | | // Backtrack capture groups in the lookahead expression. |
1358 | 0 | for (uint32_t i = insn->mexpBegin, e = insn->mexpEnd; i < e; |
1359 | 0 | i++) { |
1360 | 0 | CapturedRange cr = savedState.getCapturedRange(i); |
1361 | 0 | auto res = pushBacktrack( |
1362 | 0 | backtrackStack, BacktrackInsn::makeSetCaptureGroup(i, cr)); |
1363 | 0 | if (res != ExecutionStatus::RETURNED) |
1364 | 0 | return res; |
1365 | 0 | } |
1366 | 0 | } else { |
1367 | | // Restore the saved state. |
1368 | 0 | *s = std::move(savedState); |
1369 | 0 | } |
1370 | 0 | } |
1371 | | |
1372 | | // 'matched' tells us whether the enclosed assertion expression |
1373 | | // matched the input. This instruction matched the input if it is a |
1374 | | // positive assertion (invert == false) and the expression matched, |
1375 | | // or a negative assertion (invert == true) and the expression did |
1376 | | // not match. Hence xor with invert. |
1377 | 0 | if (matched ^ insn->invert) |
1378 | 0 | s->ip_ = insn->continuation; |
1379 | 0 | else |
1380 | 0 | BACKTRACK(); |
1381 | 0 | break; |
1382 | 0 | } |
1383 | | |
1384 | 0 | case Opcode::BeginLoop: { |
1385 | | // Here we are entering a loop from outside, not jumping back into |
1386 | | // it. |
1387 | 0 | const BeginLoopInsn *loop = llvh::cast<BeginLoopInsn>(base); |
1388 | 0 | s->getLoop(loop->loopId).iterations = 0; |
1389 | | // Check to see if the loop body is viable. If not, and the loop has |
1390 | | // a nonzero minimum iteration, then we know we won't match and we |
1391 | | // can reject the state. If it does have a minimum iteration, we can |
1392 | | // just skip to the not-taken target. Note that this is a static |
1393 | | // property of the loop so we don't need to check it on every |
1394 | | // iteration, only the first one. |
1395 | 0 | if (!c.satisfiesConstraints(flags_, loop->loopeeConstraints)) { |
1396 | 0 | if (loop->min > 0) { |
1397 | 0 | BACKTRACK(); |
1398 | 0 | } else { |
1399 | 0 | s->ip_ = loop->notTakenTarget; |
1400 | 0 | break; |
1401 | 0 | } |
1402 | 0 | } |
1403 | 0 | goto runLoop; |
1404 | 0 | } |
1405 | | |
1406 | 0 | case Opcode::EndLoop: |
1407 | | // This is reached after the body of a loop finishes executing. |
1408 | | // Move the IP to the loop and run it again immediately. |
1409 | 0 | s->ip_ = llvh::cast<EndLoopInsn>(base)->target; |
1410 | 0 | base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]); |
1411 | | // Note fall through. |
1412 | |
|
1413 | 0 | runLoop: { |
1414 | 0 | const BeginLoopInsn *loop = llvh::cast<BeginLoopInsn>(base); |
1415 | 0 | auto &loopData = s->getLoop(loop->loopId); |
1416 | 0 | uint32_t iteration = loopData.iterations; |
1417 | |
|
1418 | 0 | const uint32_t loopTakenIp = s->ip_ + sizeof(BeginLoopInsn); |
1419 | |
|
1420 | 0 | assert(loop->min <= loop->max && "Inconsistent loop bounds"); |
1421 | | |
1422 | | // Check to see if we have looped more than the minimum number of |
1423 | | // iterations, and if so, whether the subexpression we looped over |
1424 | | // matched an empty string. ES6 21.2.2.5.1 Note 4: "once the |
1425 | | // minimum number of repetitions has been satisfied, any more |
1426 | | // expansions of Atom that match the empty character sequence are |
1427 | | // not considered for further repetitions." |
1428 | 0 | if (iteration > loop->min && |
1429 | 0 | loopData.entryPosition == c.offsetFromLeft()) |
1430 | 0 | BACKTRACK(); |
1431 | | |
1432 | 0 | if (iteration < loop->min) { |
1433 | 0 | auto res = prepareToEnterLoopBody(s, loop, backtrackStack); |
1434 | 0 | if (res != ExecutionStatus::RETURNED) |
1435 | 0 | return res; |
1436 | 0 | s->ip_ = loopTakenIp; |
1437 | 0 | } else if (iteration == loop->max) { |
1438 | 0 | s->ip_ = loop->notTakenTarget; |
1439 | 0 | } else { |
1440 | | // We are within the target iteration range, figure out whether we |
1441 | | // should continue or exit. |
1442 | 0 | assert(iteration >= loop->min && iteration < loop->max); |
1443 | 0 | if (!loop->greedy) { |
1444 | | // Backtrack by entering this non-greedy loop. |
1445 | 0 | loopData.entryPosition = c.offsetFromLeft(); |
1446 | 0 | auto res = pushBacktrack( |
1447 | 0 | backtrackStack, |
1448 | 0 | BacktrackInsn::makeEnterNonGreedyLoop( |
1449 | 0 | loop, loopTakenIp, loopData)); |
1450 | 0 | if (res != ExecutionStatus::RETURNED) { |
1451 | 0 | return res; |
1452 | 0 | } |
1453 | 0 | s->ip_ = loop->notTakenTarget; |
1454 | 0 | } else { |
1455 | | // Backtrack by exiting this greedy loop. |
1456 | 0 | auto pushRes = pushBacktrack( |
1457 | 0 | backtrackStack, |
1458 | 0 | BacktrackInsn::makeSetPosition( |
1459 | 0 | loop->notTakenTarget, c.currentPointer())); |
1460 | 0 | if (pushRes != ExecutionStatus::RETURNED) |
1461 | 0 | return pushRes; |
1462 | | |
1463 | 0 | auto prepRes = prepareToEnterLoopBody(s, loop, backtrackStack); |
1464 | 0 | if (prepRes != ExecutionStatus::RETURNED) |
1465 | 0 | return prepRes; |
1466 | 0 | s->ip_ = loopTakenIp; |
1467 | 0 | } |
1468 | 0 | } |
1469 | 0 | break; |
1470 | 0 | } |
1471 | | |
1472 | 0 | case Opcode::BeginSimpleLoop: { |
1473 | | // Here we are entering a simple loop from outside, |
1474 | | // not jumping back into it. |
1475 | 0 | const BeginSimpleLoopInsn *loop = |
1476 | 0 | llvh::cast<BeginSimpleLoopInsn>(base); |
1477 | |
|
1478 | 0 | if (!c.satisfiesConstraints(flags_, loop->loopeeConstraints)) { |
1479 | 0 | s->ip_ = loop->notTakenTarget; |
1480 | 0 | break; |
1481 | 0 | } |
1482 | | |
1483 | 0 | goto runSimpleLoop; |
1484 | 0 | } |
1485 | | |
1486 | 0 | case Opcode::EndSimpleLoop: |
1487 | 0 | s->ip_ = llvh::cast<EndSimpleLoopInsn>(base)->target; |
1488 | 0 | base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]); |
1489 | | // Note: fall-through. |
1490 | |
|
1491 | 0 | runSimpleLoop: { |
1492 | 0 | const BeginSimpleLoopInsn *loop = |
1493 | 0 | llvh::cast<BeginSimpleLoopInsn>(base); |
1494 | | // Since this is a simple loop, we'll always need to explore both |
1495 | | // exiting the loop at this point and continuing to loop. |
1496 | | // Note simple loops are always greedy. |
1497 | 0 | auto res = pushBacktrack( |
1498 | 0 | backtrackStack, |
1499 | 0 | BacktrackInsn::makeSetPosition( |
1500 | 0 | loop->notTakenTarget, c.currentPointer())); |
1501 | 0 | if (res != ExecutionStatus::RETURNED) { |
1502 | 0 | return res; |
1503 | 0 | } |
1504 | 0 | s->ip_ += sizeof(BeginSimpleLoopInsn); |
1505 | 0 | break; |
1506 | 0 | } |
1507 | | |
1508 | 86 | case Opcode::Width1Loop: { |
1509 | 86 | const Width1LoopInsn *loop = llvh::cast<Width1LoopInsn>(base); |
1510 | 86 | auto matchRes = matchWidth1Loop(loop, s, backtrackStack); |
1511 | 86 | if (LLVM_UNLIKELY(!matchRes)) |
1512 | 0 | return matchRes.getStatus(); |
1513 | 86 | if (!*matchRes) |
1514 | 0 | BACKTRACK(); |
1515 | 86 | break; |
1516 | 86 | } |
1517 | 294 | } |
1518 | 294 | } |
1519 | | // The search failed at this location. |
1520 | 86 | backtrackingExhausted: |
1521 | 86 | continue; |
1522 | 122 | } |
1523 | 86 | #undef BACKTRACK |
1524 | | // The match failed. |
1525 | 86 | return nullptr; |
1526 | 86 | } hermes::regex::Context<hermes::regex::UTF16RegexTraits>::match(hermes::regex::State<hermes::regex::UTF16RegexTraits>*, bool) Line | Count | Source | 971 | 86 | -> ExecutorResult<const CodeUnit *> { | 972 | 86 | using State = State<Traits>; | 973 | 86 | BacktrackStack backtrackStack; | 974 | | | 975 | | // We'll refer to the cursor often. | 976 | 86 | Cursor<Traits> &c = s->cursor_; | 977 | | | 978 | | // Pull out the instruction portion of the bytecode, following the header. | 979 | 86 | const uint8_t *const bytecode = &bytecodeStream_[sizeof(RegexBytecodeHeader)]; | 980 | | | 981 | | // Save the incoming IP in case we have to loop. | 982 | 86 | const auto startIp = s->ip_; | 983 | | | 984 | 86 | const CodeUnit *const startLoc = c.currentPointer(); | 985 | | | 986 | | // Use offsetFromRight() instead of remaining() here so that the length passed | 987 | | // to advanceStringIndex is accurate even when the cursor is going backwards. | 988 | 86 | const size_t charsToRight = c.offsetFromRight(); | 989 | | | 990 | | // Decide how many locations we'll need to check. | 991 | | // Note that we do want to check the empty range at the end, so add one to | 992 | | // charsToRight. | 993 | 86 | const size_t locsToCheckCount = onlyAtStart ? 1 : 1 + charsToRight; | 994 | | | 995 | | // If we are tracking backwards, we should only ever have one potential match | 996 | | // location. This is because advanceStringIndex only ever tracks forwards. | 997 | 86 | assert( | 998 | 86 | (c.forwards() || locsToCheckCount == 1) && | 999 | 86 | "Can only check one location when cursor is backwards"); | 1000 | | | 1001 | | #ifndef HERMES_CHECK_NATIVE_STACK | 1002 | | ++overflowGuard_.callDepth; | 1003 | | auto decrement = | 1004 | | llvh::make_scope_exit([this] { --overflowGuard_.callDepth; }); | 1005 | | #endif | 1006 | | | 1007 | | // Make sure we are not exceeding the set limit of the amount of times we can | 1008 | | // recurse. | 1009 | 86 | if (overflowGuard_.isOverflowing()) { | 1010 | 0 | return ExecutionStatus::STACK_OVERFLOW; | 1011 | 0 | } | 1012 | | | 1013 | | // Macro used when a state fails to match. | 1014 | 86 | #define BACKTRACK() \ | 1015 | 86 | do { \ | 1016 | 86 | auto btRes = backtrack(backtrackStack, s); \ | 1017 | 86 | if (LLVM_UNLIKELY(!btRes)) \ | 1018 | 86 | return btRes.getStatus(); \ | 1019 | 86 | if (*btRes) \ | 1020 | 86 | goto backtrackingSucceeded; \ | 1021 | 86 | goto backtrackingExhausted; \ | 1022 | 86 | } while (0) | 1023 | | | 1024 | 172 | for (size_t locIndex = 0; locIndex < locsToCheckCount; | 1025 | 86 | locIndex = advanceStringIndex(startLoc, locIndex, charsToRight)) { | 1026 | 86 | const CodeUnit *potentialMatchLocation = startLoc + locIndex; | 1027 | 86 | c.setCurrentPointer(potentialMatchLocation); | 1028 | 86 | s->ip_ = startIp; | 1029 | 122 | backtrackingSucceeded: | 1030 | 294 | for (;;) { | 1031 | 294 | const Insn *base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]); | 1032 | 294 | switch (base->opcode) { | 1033 | 0 | case Opcode::Goal: | 1034 | 0 | return potentialMatchLocation; | 1035 | | | 1036 | 86 | case Opcode::LeftAnchor: | 1037 | 86 | if (!matchesLeftAnchor(*this, *s)) | 1038 | 0 | BACKTRACK(); | 1039 | 86 | s->ip_ += sizeof(LeftAnchorInsn); | 1040 | 86 | break; | 1041 | | | 1042 | 0 | case Opcode::RightAnchor: | 1043 | 0 | if (!matchesRightAnchor(*this, *s)) | 1044 | 0 | BACKTRACK(); | 1045 | 0 | s->ip_ += sizeof(RightAnchorInsn); | 1046 | 0 | break; | 1047 | | | 1048 | 0 | case Opcode::MatchAny: | 1049 | 0 | if (c.atEnd() || | 1050 | 0 | !matchWidth1<Width1Opcode::MatchAny>(base, c.consume())) | 1051 | 0 | BACKTRACK(); | 1052 | 0 | s->ip_ += sizeof(MatchAnyInsn); | 1053 | 0 | break; | 1054 | | | 1055 | 0 | case Opcode::U16MatchAny: | 1056 | 0 | if (c.atEnd()) | 1057 | 0 | BACKTRACK(); | 1058 | 0 | c.consumeUTF16(); | 1059 | 0 | s->ip_ += sizeof(U16MatchAnyInsn); | 1060 | 0 | break; | 1061 | | | 1062 | 0 | case Opcode::MatchAnyButNewline: | 1063 | 0 | if (c.atEnd() || | 1064 | 0 | !matchWidth1<Width1Opcode::MatchAnyButNewline>(base, c.consume())) | 1065 | 0 | BACKTRACK(); | 1066 | 0 | s->ip_ += sizeof(MatchAnyButNewlineInsn); | 1067 | 0 | break; | 1068 | | | 1069 | 0 | case Opcode::U16MatchAnyButNewline: | 1070 | 0 | if (c.atEnd() || isLineTerminator(c.consumeUTF16())) | 1071 | 0 | BACKTRACK(); | 1072 | 0 | s->ip_ += sizeof(U16MatchAnyButNewlineInsn); | 1073 | 0 | break; | 1074 | | | 1075 | 0 | case Opcode::MatchChar8: { | 1076 | 0 | if (c.atEnd() || | 1077 | 0 | !matchWidth1<Width1Opcode::MatchChar8>(base, c.consume())) | 1078 | 0 | BACKTRACK(); | 1079 | 0 | s->ip_ += sizeof(MatchChar8Insn); | 1080 | 0 | break; | 1081 | 0 | } | 1082 | | | 1083 | 0 | case Opcode::MatchChar16: { | 1084 | 0 | if (c.atEnd() || | 1085 | 0 | !matchWidth1<Width1Opcode::MatchChar16>(base, c.consume())) | 1086 | 0 | BACKTRACK(); | 1087 | 0 | s->ip_ += sizeof(MatchChar16Insn); | 1088 | 0 | break; | 1089 | 0 | } | 1090 | | | 1091 | 0 | case Opcode::U16MatchChar32: { | 1092 | 0 | const auto *insn = llvh::cast<U16MatchChar32Insn>(base); | 1093 | 0 | if (c.atEnd() || c.consumeUTF16() != (CodePoint)insn->c) | 1094 | 0 | BACKTRACK(); | 1095 | 0 | s->ip_ += sizeof(U16MatchChar32Insn); | 1096 | 0 | break; | 1097 | 0 | } | 1098 | | | 1099 | 0 | case Opcode::MatchCharICase8: { | 1100 | 0 | if (c.atEnd() || | 1101 | 0 | !matchWidth1<Width1Opcode::MatchCharICase8>(base, c.consume())) | 1102 | 0 | BACKTRACK(); | 1103 | 0 | s->ip_ += sizeof(MatchCharICase8Insn); | 1104 | 0 | break; | 1105 | 0 | } | 1106 | | | 1107 | 0 | case Opcode::MatchCharICase16: { | 1108 | 0 | if (c.atEnd() || | 1109 | 0 | !matchWidth1<Width1Opcode::MatchCharICase16>(base, c.consume())) | 1110 | 0 | BACKTRACK(); | 1111 | 0 | s->ip_ += sizeof(MatchCharICase16Insn); | 1112 | 0 | break; | 1113 | 0 | } | 1114 | | | 1115 | 0 | case Opcode::U16MatchCharICase32: { | 1116 | 0 | const auto *insn = llvh::cast<U16MatchCharICase32Insn>(base); | 1117 | 0 | bool matched = false; | 1118 | 0 | if (!c.atEnd()) { | 1119 | 0 | CodePoint cp = c.consumeUTF16(); | 1120 | 0 | matched = | 1121 | 0 | (cp == (CodePoint)insn->c || | 1122 | 0 | traits_.canonicalize(cp, true) == (CodePoint)insn->c); | 1123 | 0 | } | 1124 | 0 | if (!matched) | 1125 | 0 | BACKTRACK(); | 1126 | 0 | s->ip_ += sizeof(U16MatchCharICase32Insn); | 1127 | 0 | break; | 1128 | 0 | } | 1129 | | | 1130 | 122 | case Opcode::MatchNChar8: { | 1131 | 122 | const auto *insn = llvh::cast<MatchNChar8Insn>(base); | 1132 | 122 | if (c.remaining() < insn->charCount || !matchesNChar8(insn, *s)) | 1133 | 122 | BACKTRACK(); | 1134 | 0 | s->ip_ += insn->totalWidth(); | 1135 | 0 | break; | 1136 | 122 | } | 1137 | | | 1138 | 0 | case Opcode::MatchNCharICase8: { | 1139 | 0 | const auto *insn = llvh::cast<MatchNCharICase8Insn>(base); | 1140 | 0 | if (c.remaining() < insn->charCount || !matchesNCharICase8(insn, *s)) | 1141 | 0 | BACKTRACK(); | 1142 | 0 | s->ip_ += insn->totalWidth(); | 1143 | 0 | break; | 1144 | 0 | } | 1145 | | | 1146 | 0 | case Opcode::Alternation: { | 1147 | | // We have an alternation. Determine which of our first and second | 1148 | | // branches are viable. If both are, we have to split our state. | 1149 | 0 | const AlternationInsn *alt = llvh::cast<AlternationInsn>(base); | 1150 | 0 | bool primaryViable = | 1151 | 0 | c.satisfiesConstraints(flags_, alt->primaryConstraints); | 1152 | 0 | bool secondaryViable = | 1153 | 0 | c.satisfiesConstraints(flags_, alt->secondaryConstraints); | 1154 | 0 | if (primaryViable && secondaryViable) { | 1155 | | // We need to explore both branches. Explore the primary branch | 1156 | | // first, backtrack to the secondary one. | 1157 | 0 | s->ip_ += sizeof(AlternationInsn); | 1158 | 0 | auto res = pushBacktrack( | 1159 | 0 | backtrackStack, | 1160 | 0 | BacktrackInsn::makeSetPosition( | 1161 | 0 | alt->secondaryBranch, c.currentPointer())); | 1162 | 0 | if (res != ExecutionStatus::RETURNED) { | 1163 | 0 | return res; | 1164 | 0 | } | 1165 | 0 | } else if (primaryViable) { | 1166 | 0 | s->ip_ += sizeof(AlternationInsn); | 1167 | 0 | } else if (secondaryViable) { | 1168 | 0 | s->ip_ = alt->secondaryBranch; | 1169 | 0 | } else { | 1170 | 0 | BACKTRACK(); | 1171 | 0 | } | 1172 | 0 | break; | 1173 | 0 | } | 1174 | | | 1175 | 0 | case Opcode::Jump32: | 1176 | 0 | s->ip_ = llvh::cast<Jump32Insn>(base)->target; | 1177 | 0 | break; | 1178 | | | 1179 | 0 | case Opcode::Bracket: { | 1180 | 0 | if (c.atEnd() || | 1181 | 0 | !matchWidth1<Width1Opcode::Bracket>(base, c.consume())) | 1182 | 0 | BACKTRACK(); | 1183 | 0 | s->ip_ += llvh::cast<BracketInsn>(base)->totalWidth(); | 1184 | 0 | break; | 1185 | 0 | } | 1186 | | | 1187 | 0 | case Opcode::U16Bracket: { | 1188 | 0 | const U16BracketInsn *insn = llvh::cast<U16BracketInsn>(base); | 1189 | | // U16BracketInsn is followed by a list of BracketRange32s. | 1190 | 0 | const BracketRange32 *ranges = | 1191 | 0 | reinterpret_cast<const BracketRange32 *>(insn + 1); | 1192 | 0 | if (c.atEnd() || | 1193 | 0 | !bracketMatchesChar<Traits>( | 1194 | 0 | *this, insn, ranges, c.consumeUTF16())) | 1195 | 0 | BACKTRACK(); | 1196 | 0 | s->ip_ += insn->totalWidth(); | 1197 | 0 | break; | 1198 | 0 | } | 1199 | | | 1200 | 0 | case Opcode::WordBoundary: { | 1201 | 0 | const WordBoundaryInsn *insn = llvh::cast<WordBoundaryInsn>(base); | 1202 | 0 | const auto *charPointer = c.currentPointer(); | 1203 | |
| 1204 | 0 | bool prevIsWordchar = false; | 1205 | 0 | if (!c.atLeft()) | 1206 | 0 | prevIsWordchar = traits_.characterHasType( | 1207 | 0 | charPointer[-1], CharacterClass::Words); | 1208 | |
| 1209 | 0 | bool currentIsWordchar = false; | 1210 | 0 | if (!c.atRight()) | 1211 | 0 | currentIsWordchar = | 1212 | 0 | traits_.characterHasType(charPointer[0], CharacterClass::Words); | 1213 | |
| 1214 | 0 | bool isWordBoundary = (prevIsWordchar != currentIsWordchar); | 1215 | 0 | if (isWordBoundary ^ insn->invert) | 1216 | 0 | s->ip_ += sizeof(WordBoundaryInsn); | 1217 | 0 | else | 1218 | 0 | BACKTRACK(); | 1219 | 0 | break; | 1220 | 0 | } | 1221 | | | 1222 | 0 | case Opcode::BeginMarkedSubexpression: { | 1223 | 0 | const auto *insn = llvh::cast<BeginMarkedSubexpressionInsn>(base); | 1224 | 0 | auto res = pushBacktrack( | 1225 | 0 | backtrackStack, | 1226 | 0 | BacktrackInsn::makeSetCaptureGroup( | 1227 | 0 | insn->mexp, {kNotMatched, kNotMatched})); | 1228 | 0 | if (res != ExecutionStatus::RETURNED) { | 1229 | 0 | return res; | 1230 | 0 | } | 1231 | | // When tracking backwards (in a lookbehind assertion) we traverse our | 1232 | | // input backwards, so set the end before the start. | 1233 | 0 | auto &range = s->getCapturedRange(insn->mexp); | 1234 | 0 | if (c.forwards()) { | 1235 | 0 | range.start = c.offsetFromLeft(); | 1236 | 0 | } else { | 1237 | 0 | range.end = c.offsetFromLeft(); | 1238 | 0 | } | 1239 | 0 | s->ip_ += sizeof(BeginMarkedSubexpressionInsn); | 1240 | 0 | break; | 1241 | 0 | } | 1242 | | | 1243 | 0 | case Opcode::EndMarkedSubexpression: { | 1244 | 0 | const auto *insn = llvh::cast<EndMarkedSubexpressionInsn>(base); | 1245 | 0 | auto &range = s->getCapturedRange(insn->mexp); | 1246 | 0 | if (c.forwards()) { | 1247 | 0 | assert( | 1248 | 0 | range.start != kNotMatched && "Capture group was not entered"); | 1249 | 0 | range.end = c.offsetFromLeft(); | 1250 | 0 | } else { | 1251 | 0 | assert(range.end != kNotMatched && "Capture group was not entered"); | 1252 | 0 | range.start = c.offsetFromLeft(); | 1253 | 0 | } | 1254 | 0 | assert(range.start <= range.end && "Captured range end before start"); | 1255 | 0 | s->ip_ += sizeof(EndMarkedSubexpressionInsn); | 1256 | 0 | break; | 1257 | 0 | } | 1258 | | | 1259 | | // ES10 21.2.2.9.1 | 1260 | 0 | case Opcode::BackRef: { | 1261 | 0 | const auto insn = llvh::cast<BackRefInsn>(base); | 1262 | | // a. Let cap be x's captures List. | 1263 | | // b. Let s be cap[n]. | 1264 | 0 | CapturedRange cr = s->getCapturedRange(insn->mexp); | 1265 | | | 1266 | | // c. If s is undefined, return c(x). | 1267 | | // Note we have to check both cr.start and cr.end here. If we are | 1268 | | // currently in the middle of matching a capture group (going either | 1269 | | // forwards or backwards) we should just return success. | 1270 | 0 | if (cr.start == kNotMatched || cr.end == kNotMatched) { | 1271 | | // Backreferences to a capture group that did not match always | 1272 | | // succeed (ES10 21.2.2.9) | 1273 | 0 | s->ip_ += sizeof(BackRefInsn); | 1274 | 0 | break; | 1275 | 0 | } | 1276 | | | 1277 | | // TODO: this can be optimized by hoisting the branches out of the | 1278 | | // loop. | 1279 | 0 | bool icase = syntaxFlags_.ignoreCase; | 1280 | 0 | bool unicode = syntaxFlags_.unicode; | 1281 | 0 | auto capturedStart = first_ + cr.start; | 1282 | 0 | auto capturedEnd = first_ + cr.end; | 1283 | 0 | Cursor<Traits> cursor2( | 1284 | 0 | capturedStart, | 1285 | 0 | c.forwards() ? capturedStart : capturedEnd, | 1286 | 0 | capturedEnd, | 1287 | 0 | c.forwards()); | 1288 | 0 | Cursor<Traits> cursor1 = c; | 1289 | 0 | bool matched = true; | 1290 | 0 | while (matched && !cursor2.atEnd()) { | 1291 | 0 | if (cursor1.atEnd()) { | 1292 | 0 | matched = false; | 1293 | 0 | } else if (!icase) { | 1294 | | // Direct comparison. Here we don't need to decode surrogate | 1295 | | // pairs. | 1296 | 0 | matched = (cursor1.consume() == cursor2.consume()); | 1297 | 0 | } else if (!unicode) { | 1298 | | // Case-insensitive non-Unicode comparison, no decoding of | 1299 | | // surrogate pairs. | 1300 | 0 | auto c1 = cursor1.consume(); | 1301 | 0 | auto c2 = cursor2.consume(); | 1302 | 0 | matched = | 1303 | 0 | (c1 == c2 || | 1304 | 0 | traits_.canonicalize(c1, unicode) == | 1305 | 0 | traits_.canonicalize(c2, unicode)); | 1306 | 0 | } else { | 1307 | | // Unicode: we do need to decode surrogate pairs. | 1308 | 0 | auto cp1 = cursor1.consumeUTF16(); | 1309 | 0 | auto cp2 = cursor2.consumeUTF16(); | 1310 | 0 | matched = | 1311 | 0 | (cp1 == cp2 || | 1312 | 0 | traits_.canonicalize(cp1, unicode) == | 1313 | 0 | traits_.canonicalize(cp2, unicode)); | 1314 | 0 | } | 1315 | 0 | } | 1316 | 0 | if (!matched) { | 1317 | 0 | BACKTRACK(); | 1318 | 0 | } | 1319 | 0 | s->ip_ += sizeof(BackRefInsn); | 1320 | 0 | c.setCurrentPointer(cursor1.currentPointer()); | 1321 | 0 | break; | 1322 | 0 | } | 1323 | | | 1324 | 0 | case Opcode::Lookaround: { | 1325 | 0 | const LookaroundInsn *insn = llvh::cast<LookaroundInsn>(base); | 1326 | 0 | bool matched = false; | 1327 | 0 | if (c.satisfiesConstraints(flags_, insn->constraints)) { | 1328 | | // Copy the state. This is because if the match fails (or if we are | 1329 | | // inverted) we need to restore its capture groups. | 1330 | 0 | State savedState{*s}; | 1331 | | | 1332 | | // Set the direction of the cursor. | 1333 | 0 | c.setForwards(insn->forwards); | 1334 | | | 1335 | | // Invoke match() recursively with our expression. | 1336 | | // Save and restore the position because lookaheads do not consume | 1337 | | // anything. | 1338 | 0 | s->ip_ += sizeof(LookaroundInsn); | 1339 | 0 | auto match = this->match(s, true /* onlyAtStart */); | 1340 | | // If the match errored out due to stack overflow, then we need to | 1341 | | // return an error here as well. | 1342 | 0 | if (LLVM_UNLIKELY(!match)) { | 1343 | 0 | return match.getStatus(); | 1344 | 0 | } | 1345 | | // We got a match if the value is non-null. | 1346 | 0 | matched = match.getValue() != nullptr; | 1347 | 0 | c.setCurrentPointer(savedState.cursor_.currentPointer()); | 1348 | 0 | c.setForwards(savedState.cursor_.forwards()); | 1349 | | | 1350 | | // Restore capture groups unless we are a positive lookaround that | 1351 | | // successfully matched. If we are a successfully matching positive | 1352 | | // lookaround, set up backtracking to reset the capture groups. Note | 1353 | | // we never backtrack INTO a successfully matched lookahead: | 1354 | | // once a lookahead finds a match it forgets all other ways it could | 1355 | | // have matched. (ES 5.1 15.10.2.8 Note 2). | 1356 | 0 | if (matched && !insn->invert) { | 1357 | | // Backtrack capture groups in the lookahead expression. | 1358 | 0 | for (uint32_t i = insn->mexpBegin, e = insn->mexpEnd; i < e; | 1359 | 0 | i++) { | 1360 | 0 | CapturedRange cr = savedState.getCapturedRange(i); | 1361 | 0 | auto res = pushBacktrack( | 1362 | 0 | backtrackStack, BacktrackInsn::makeSetCaptureGroup(i, cr)); | 1363 | 0 | if (res != ExecutionStatus::RETURNED) | 1364 | 0 | return res; | 1365 | 0 | } | 1366 | 0 | } else { | 1367 | | // Restore the saved state. | 1368 | 0 | *s = std::move(savedState); | 1369 | 0 | } | 1370 | 0 | } | 1371 | | | 1372 | | // 'matched' tells us whether the enclosed assertion expression | 1373 | | // matched the input. This instruction matched the input if it is a | 1374 | | // positive assertion (invert == false) and the expression matched, | 1375 | | // or a negative assertion (invert == true) and the expression did | 1376 | | // not match. Hence xor with invert. | 1377 | 0 | if (matched ^ insn->invert) | 1378 | 0 | s->ip_ = insn->continuation; | 1379 | 0 | else | 1380 | 0 | BACKTRACK(); | 1381 | 0 | break; | 1382 | 0 | } | 1383 | | | 1384 | 0 | case Opcode::BeginLoop: { | 1385 | | // Here we are entering a loop from outside, not jumping back into | 1386 | | // it. | 1387 | 0 | const BeginLoopInsn *loop = llvh::cast<BeginLoopInsn>(base); | 1388 | 0 | s->getLoop(loop->loopId).iterations = 0; | 1389 | | // Check to see if the loop body is viable. If not, and the loop has | 1390 | | // a nonzero minimum iteration, then we know we won't match and we | 1391 | | // can reject the state. If it does have a minimum iteration, we can | 1392 | | // just skip to the not-taken target. Note that this is a static | 1393 | | // property of the loop so we don't need to check it on every | 1394 | | // iteration, only the first one. | 1395 | 0 | if (!c.satisfiesConstraints(flags_, loop->loopeeConstraints)) { | 1396 | 0 | if (loop->min > 0) { | 1397 | 0 | BACKTRACK(); | 1398 | 0 | } else { | 1399 | 0 | s->ip_ = loop->notTakenTarget; | 1400 | 0 | break; | 1401 | 0 | } | 1402 | 0 | } | 1403 | 0 | goto runLoop; | 1404 | 0 | } | 1405 | | | 1406 | 0 | case Opcode::EndLoop: | 1407 | | // This is reached after the body of a loop finishes executing. | 1408 | | // Move the IP to the loop and run it again immediately. | 1409 | 0 | s->ip_ = llvh::cast<EndLoopInsn>(base)->target; | 1410 | 0 | base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]); | 1411 | | // Note fall through. | 1412 | |
| 1413 | 0 | runLoop: { | 1414 | 0 | const BeginLoopInsn *loop = llvh::cast<BeginLoopInsn>(base); | 1415 | 0 | auto &loopData = s->getLoop(loop->loopId); | 1416 | 0 | uint32_t iteration = loopData.iterations; | 1417 | |
| 1418 | 0 | const uint32_t loopTakenIp = s->ip_ + sizeof(BeginLoopInsn); | 1419 | |
| 1420 | 0 | assert(loop->min <= loop->max && "Inconsistent loop bounds"); | 1421 | | | 1422 | | // Check to see if we have looped more than the minimum number of | 1423 | | // iterations, and if so, whether the subexpression we looped over | 1424 | | // matched an empty string. ES6 21.2.2.5.1 Note 4: "once the | 1425 | | // minimum number of repetitions has been satisfied, any more | 1426 | | // expansions of Atom that match the empty character sequence are | 1427 | | // not considered for further repetitions." | 1428 | 0 | if (iteration > loop->min && | 1429 | 0 | loopData.entryPosition == c.offsetFromLeft()) | 1430 | 0 | BACKTRACK(); | 1431 | | | 1432 | 0 | if (iteration < loop->min) { | 1433 | 0 | auto res = prepareToEnterLoopBody(s, loop, backtrackStack); | 1434 | 0 | if (res != ExecutionStatus::RETURNED) | 1435 | 0 | return res; | 1436 | 0 | s->ip_ = loopTakenIp; | 1437 | 0 | } else if (iteration == loop->max) { | 1438 | 0 | s->ip_ = loop->notTakenTarget; | 1439 | 0 | } else { | 1440 | | // We are within the target iteration range, figure out whether we | 1441 | | // should continue or exit. | 1442 | 0 | assert(iteration >= loop->min && iteration < loop->max); | 1443 | 0 | if (!loop->greedy) { | 1444 | | // Backtrack by entering this non-greedy loop. | 1445 | 0 | loopData.entryPosition = c.offsetFromLeft(); | 1446 | 0 | auto res = pushBacktrack( | 1447 | 0 | backtrackStack, | 1448 | 0 | BacktrackInsn::makeEnterNonGreedyLoop( | 1449 | 0 | loop, loopTakenIp, loopData)); | 1450 | 0 | if (res != ExecutionStatus::RETURNED) { | 1451 | 0 | return res; | 1452 | 0 | } | 1453 | 0 | s->ip_ = loop->notTakenTarget; | 1454 | 0 | } else { | 1455 | | // Backtrack by exiting this greedy loop. | 1456 | 0 | auto pushRes = pushBacktrack( | 1457 | 0 | backtrackStack, | 1458 | 0 | BacktrackInsn::makeSetPosition( | 1459 | 0 | loop->notTakenTarget, c.currentPointer())); | 1460 | 0 | if (pushRes != ExecutionStatus::RETURNED) | 1461 | 0 | return pushRes; | 1462 | | | 1463 | 0 | auto prepRes = prepareToEnterLoopBody(s, loop, backtrackStack); | 1464 | 0 | if (prepRes != ExecutionStatus::RETURNED) | 1465 | 0 | return prepRes; | 1466 | 0 | s->ip_ = loopTakenIp; | 1467 | 0 | } | 1468 | 0 | } | 1469 | 0 | break; | 1470 | 0 | } | 1471 | | | 1472 | 0 | case Opcode::BeginSimpleLoop: { | 1473 | | // Here we are entering a simple loop from outside, | 1474 | | // not jumping back into it. | 1475 | 0 | const BeginSimpleLoopInsn *loop = | 1476 | 0 | llvh::cast<BeginSimpleLoopInsn>(base); | 1477 | |
| 1478 | 0 | if (!c.satisfiesConstraints(flags_, loop->loopeeConstraints)) { | 1479 | 0 | s->ip_ = loop->notTakenTarget; | 1480 | 0 | break; | 1481 | 0 | } | 1482 | | | 1483 | 0 | goto runSimpleLoop; | 1484 | 0 | } | 1485 | | | 1486 | 0 | case Opcode::EndSimpleLoop: | 1487 | 0 | s->ip_ = llvh::cast<EndSimpleLoopInsn>(base)->target; | 1488 | 0 | base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]); | 1489 | | // Note: fall-through. | 1490 | |
| 1491 | 0 | runSimpleLoop: { | 1492 | 0 | const BeginSimpleLoopInsn *loop = | 1493 | 0 | llvh::cast<BeginSimpleLoopInsn>(base); | 1494 | | // Since this is a simple loop, we'll always need to explore both | 1495 | | // exiting the loop at this point and continuing to loop. | 1496 | | // Note simple loops are always greedy. | 1497 | 0 | auto res = pushBacktrack( | 1498 | 0 | backtrackStack, | 1499 | 0 | BacktrackInsn::makeSetPosition( | 1500 | 0 | loop->notTakenTarget, c.currentPointer())); | 1501 | 0 | if (res != ExecutionStatus::RETURNED) { | 1502 | 0 | return res; | 1503 | 0 | } | 1504 | 0 | s->ip_ += sizeof(BeginSimpleLoopInsn); | 1505 | 0 | break; | 1506 | 0 | } | 1507 | | | 1508 | 86 | case Opcode::Width1Loop: { | 1509 | 86 | const Width1LoopInsn *loop = llvh::cast<Width1LoopInsn>(base); | 1510 | 86 | auto matchRes = matchWidth1Loop(loop, s, backtrackStack); | 1511 | 86 | if (LLVM_UNLIKELY(!matchRes)) | 1512 | 0 | return matchRes.getStatus(); | 1513 | 86 | if (!*matchRes) | 1514 | 0 | BACKTRACK(); | 1515 | 86 | break; | 1516 | 86 | } | 1517 | 294 | } | 1518 | 294 | } | 1519 | | // The search failed at this location. | 1520 | 86 | backtrackingExhausted: | 1521 | 86 | continue; | 1522 | 122 | } | 1523 | 86 | #undef BACKTRACK | 1524 | | // The match failed. | 1525 | 86 | return nullptr; | 1526 | 86 | } |
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::match(hermes::regex::State<hermes::regex::ASCIIRegexTraits>*, bool) |
1527 | | |
1528 | | /// Entry point for searching a string via regex compiled bytecode. |
1529 | | /// Given the bytecode \p bytecode, search the range starting at \p first up to |
1530 | | /// (not including) \p last with the flags \p matchFlags. If the search |
1531 | | /// succeeds, poopulate MatchResults with the capture groups. \return true if |
1532 | | /// some portion of the string matched the regex represented by the bytecode, |
1533 | | /// false otherwise. |
1534 | | template <typename CharT, class Traits> |
1535 | | MatchRuntimeResult searchWithBytecodeImpl( |
1536 | | llvh::ArrayRef<uint8_t> bytecode, |
1537 | | const CharT *first, |
1538 | | uint32_t start, |
1539 | | uint32_t length, |
1540 | | std::vector<CapturedRange> *m, |
1541 | | constants::MatchFlagType matchFlags, |
1542 | 86 | StackOverflowGuard guard) { |
1543 | 86 | assert( |
1544 | 86 | bytecode.size() >= sizeof(RegexBytecodeHeader) && "Bytecode too small"); |
1545 | 86 | auto header = reinterpret_cast<const RegexBytecodeHeader *>(bytecode.data()); |
1546 | | |
1547 | | // Check for match impossibility before doing anything else. |
1548 | 86 | Cursor<Traits> cursor{ |
1549 | 86 | first, first + start, first + length, true /* forwards */}; |
1550 | 86 | if (!cursor.satisfiesConstraints(matchFlags, header->constraints)) |
1551 | 0 | return MatchRuntimeResult::NoMatch; |
1552 | | |
1553 | 86 | auto markedCount = header->markedCount; |
1554 | 86 | auto loopCount = header->loopCount; |
1555 | | |
1556 | 86 | Context<Traits> ctx( |
1557 | 86 | bytecode, |
1558 | 86 | matchFlags, |
1559 | 86 | SyntaxFlags::fromByte(header->syntaxFlags), |
1560 | 86 | first, |
1561 | 86 | first + length, |
1562 | 86 | header->markedCount, |
1563 | 86 | header->loopCount, |
1564 | 86 | guard); |
1565 | 86 | State<Traits> state{cursor, markedCount, loopCount}; |
1566 | | |
1567 | | // We check only one location if either the regex pattern constrains us to, or |
1568 | | // the flags request it (via the sticky flag 'y'). |
1569 | 86 | bool onlyAtStart = (header->constraints & MatchConstraintAnchoredAtStart) || |
1570 | 86 | (matchFlags & constants::matchOnlyAtStart); |
1571 | | |
1572 | 86 | auto res = ctx.match(&state, onlyAtStart); |
1573 | 86 | if (!res) { |
1574 | 0 | assert(res.getStatus() == ExecutionStatus::STACK_OVERFLOW); |
1575 | 0 | return MatchRuntimeResult::StackOverflow; |
1576 | 0 | } |
1577 | 86 | if (const CharT *matchStartLoc = res.getValue()) { |
1578 | | // Match succeeded. Return captured ranges. The first range is the total |
1579 | | // match, followed by any capture groups. |
1580 | 0 | if (m != nullptr) { |
1581 | 0 | uint32_t totalStart = static_cast<uint32_t>(matchStartLoc - first); |
1582 | 0 | uint32_t totalEnd = |
1583 | 0 | static_cast<uint32_t>(state.cursor_.currentPointer() - first); |
1584 | 0 | m->clear(); |
1585 | 0 | m->push_back(CapturedRange{totalStart, totalEnd}); |
1586 | 0 | std::copy_n( |
1587 | 0 | state.capturedRanges_.begin(), markedCount, std::back_inserter(*m)); |
1588 | 0 | } |
1589 | 0 | return MatchRuntimeResult::Match; |
1590 | 0 | } |
1591 | 86 | return MatchRuntimeResult::NoMatch; |
1592 | 86 | } hermes::regex::MatchRuntimeResult hermes::regex::searchWithBytecodeImpl<char16_t, hermes::regex::UTF16RegexTraits>(llvh::ArrayRef<unsigned char>, char16_t const*, unsigned int, unsigned int, std::__1::vector<hermes::regex::CapturedRange, std::__1::allocator<hermes::regex::CapturedRange> >*, hermes::regex::constants::MatchFlagType, hermes::StackOverflowGuard) Line | Count | Source | 1542 | 86 | StackOverflowGuard guard) { | 1543 | 86 | assert( | 1544 | 86 | bytecode.size() >= sizeof(RegexBytecodeHeader) && "Bytecode too small"); | 1545 | 86 | auto header = reinterpret_cast<const RegexBytecodeHeader *>(bytecode.data()); | 1546 | | | 1547 | | // Check for match impossibility before doing anything else. | 1548 | 86 | Cursor<Traits> cursor{ | 1549 | 86 | first, first + start, first + length, true /* forwards */}; | 1550 | 86 | if (!cursor.satisfiesConstraints(matchFlags, header->constraints)) | 1551 | 0 | return MatchRuntimeResult::NoMatch; | 1552 | | | 1553 | 86 | auto markedCount = header->markedCount; | 1554 | 86 | auto loopCount = header->loopCount; | 1555 | | | 1556 | 86 | Context<Traits> ctx( | 1557 | 86 | bytecode, | 1558 | 86 | matchFlags, | 1559 | 86 | SyntaxFlags::fromByte(header->syntaxFlags), | 1560 | 86 | first, | 1561 | 86 | first + length, | 1562 | 86 | header->markedCount, | 1563 | 86 | header->loopCount, | 1564 | 86 | guard); | 1565 | 86 | State<Traits> state{cursor, markedCount, loopCount}; | 1566 | | | 1567 | | // We check only one location if either the regex pattern constrains us to, or | 1568 | | // the flags request it (via the sticky flag 'y'). | 1569 | 86 | bool onlyAtStart = (header->constraints & MatchConstraintAnchoredAtStart) || | 1570 | 86 | (matchFlags & constants::matchOnlyAtStart); | 1571 | | | 1572 | 86 | auto res = ctx.match(&state, onlyAtStart); | 1573 | 86 | if (!res) { | 1574 | 0 | assert(res.getStatus() == ExecutionStatus::STACK_OVERFLOW); | 1575 | 0 | return MatchRuntimeResult::StackOverflow; | 1576 | 0 | } | 1577 | 86 | if (const CharT *matchStartLoc = res.getValue()) { | 1578 | | // Match succeeded. Return captured ranges. The first range is the total | 1579 | | // match, followed by any capture groups. | 1580 | 0 | if (m != nullptr) { | 1581 | 0 | uint32_t totalStart = static_cast<uint32_t>(matchStartLoc - first); | 1582 | 0 | uint32_t totalEnd = | 1583 | 0 | static_cast<uint32_t>(state.cursor_.currentPointer() - first); | 1584 | 0 | m->clear(); | 1585 | 0 | m->push_back(CapturedRange{totalStart, totalEnd}); | 1586 | 0 | std::copy_n( | 1587 | 0 | state.capturedRanges_.begin(), markedCount, std::back_inserter(*m)); | 1588 | 0 | } | 1589 | 0 | return MatchRuntimeResult::Match; | 1590 | 0 | } | 1591 | 86 | return MatchRuntimeResult::NoMatch; | 1592 | 86 | } |
Unexecuted instantiation: hermes::regex::MatchRuntimeResult hermes::regex::searchWithBytecodeImpl<char, hermes::regex::ASCIIRegexTraits>(llvh::ArrayRef<unsigned char>, char const*, unsigned int, unsigned int, std::__1::vector<hermes::regex::CapturedRange, std::__1::allocator<hermes::regex::CapturedRange> >*, hermes::regex::constants::MatchFlagType, hermes::StackOverflowGuard) |
1593 | | |
1594 | | MatchRuntimeResult searchWithBytecode( |
1595 | | llvh::ArrayRef<uint8_t> bytecode, |
1596 | | const char16_t *first, |
1597 | | uint32_t start, |
1598 | | uint32_t length, |
1599 | | std::vector<CapturedRange> *m, |
1600 | | constants::MatchFlagType matchFlags, |
1601 | 86 | StackOverflowGuard guard) { |
1602 | 86 | return searchWithBytecodeImpl<char16_t, UTF16RegexTraits>( |
1603 | 86 | bytecode, first, start, length, m, matchFlags, guard); |
1604 | 86 | } |
1605 | | |
1606 | | MatchRuntimeResult searchWithBytecode( |
1607 | | llvh::ArrayRef<uint8_t> bytecode, |
1608 | | const char *first, |
1609 | | uint32_t start, |
1610 | | uint32_t length, |
1611 | | std::vector<CapturedRange> *m, |
1612 | | constants::MatchFlagType matchFlags, |
1613 | 0 | StackOverflowGuard guard) { |
1614 | 0 | return searchWithBytecodeImpl<char, ASCIIRegexTraits>( |
1615 | 0 | bytecode, first, start, length, m, matchFlags, guard); |
1616 | 0 | } |
1617 | | |
1618 | | } // namespace regex |
1619 | | } // namespace hermes |