Coverage Report

Created: 2025-01-28 06:38

/src/hermes/lib/Regex/Executor.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) Meta Platforms, Inc. and affiliates.
3
 *
4
 * This source code is licensed under the MIT license found in the
5
 * LICENSE file in the root directory of this source tree.
6
 */
7
8
#include "hermes/Regex/Executor.h"
9
#include "hermes/Regex/RegexTraits.h"
10
#include "hermes/Support/OptValue.h"
11
12
#include "llvh/ADT/ScopeExit.h"
13
#include "llvh/ADT/SmallVector.h"
14
#include "llvh/Support/TrailingObjects.h"
15
#include "llvh/Support/raw_ostream.h"
16
17
// This file contains the machinery for executing a regexp compiled to bytecode.
18
19
namespace hermes {
20
namespace regex {
21
22
template <class Traits>
23
struct State;
24
25
/// Describes the exit status of a RegEx execution: it either returned
26
/// normally or stack overflowed
27
enum class ExecutionStatus : uint8_t { RETURNED, STACK_OVERFLOW };
28
29
/// A tuple combining the result of a function which may have returned
30
/// successfully (ExecutionStatus::RETURNED) with a value, or thrown an
31
/// exception (ExecutionStatus::STACK_OVERFLOW).
32
/// This is used by some internal functions for convenience.
33
template <typename T>
34
class ExecutorResult {
35
  static_assert(std::is_trivial<T>::value, "T must be trivial.");
36
37
 private:
38
  ExecutionStatus status_;
39
  T value_;
40
41
 public:
42
  /* implicit */ ExecutorResult(const T &v)
43
294
      : status_(ExecutionStatus::RETURNED), value_(v) {}
hermes::regex::ExecutorResult<char16_t const*>::ExecutorResult(char16_t const* const&)
Line
Count
Source
43
86
      : status_(ExecutionStatus::RETURNED), value_(v) {}
hermes::regex::ExecutorResult<bool>::ExecutorResult(bool const&)
Line
Count
Source
43
208
      : status_(ExecutionStatus::RETURNED), value_(v) {}
Unexecuted instantiation: hermes::regex::ExecutorResult<char const*>::ExecutorResult(char const* const&)
44
45
0
  /* implicit */ ExecutorResult(ExecutionStatus status) : status_(status) {
46
0
    assert(status != ExecutionStatus::RETURNED);
47
0
  }
Unexecuted instantiation: hermes::regex::ExecutorResult<char16_t const*>::ExecutorResult(hermes::regex::ExecutionStatus)
Unexecuted instantiation: hermes::regex::ExecutorResult<bool>::ExecutorResult(hermes::regex::ExecutionStatus)
Unexecuted instantiation: hermes::regex::ExecutorResult<char const*>::ExecutorResult(hermes::regex::ExecutionStatus)
48
49
208
  const T &operator*() const {
50
208
    return getValue();
51
208
  }
52
53
294
  bool hasValue() const {
54
294
    return status_ == ExecutionStatus::RETURNED;
55
294
  }
hermes::regex::ExecutorResult<bool>::hasValue() const
Line
Count
Source
53
208
  bool hasValue() const {
54
208
    return status_ == ExecutionStatus::RETURNED;
55
208
  }
hermes::regex::ExecutorResult<char16_t const*>::hasValue() const
Line
Count
Source
53
86
  bool hasValue() const {
54
86
    return status_ == ExecutionStatus::RETURNED;
55
86
  }
Unexecuted instantiation: hermes::regex::ExecutorResult<char const*>::hasValue() const
56
57
294
  explicit operator bool() const {
58
294
    return hasValue();
59
294
  }
hermes::regex::ExecutorResult<bool>::operator bool() const
Line
Count
Source
57
208
  explicit operator bool() const {
58
208
    return hasValue();
59
208
  }
hermes::regex::ExecutorResult<char16_t const*>::operator bool() const
Line
Count
Source
57
86
  explicit operator bool() const {
58
86
    return hasValue();
59
86
  }
Unexecuted instantiation: hermes::regex::ExecutorResult<char const*>::operator bool() const
60
61
294
  const T &getValue() const {
62
294
    assert(getStatus() == ExecutionStatus::RETURNED);
63
294
    return *reinterpret_cast<const T *>(&value_);
64
294
  }
hermes::regex::ExecutorResult<bool>::getValue() const
Line
Count
Source
61
208
  const T &getValue() const {
62
208
    assert(getStatus() == ExecutionStatus::RETURNED);
63
208
    return *reinterpret_cast<const T *>(&value_);
64
208
  }
hermes::regex::ExecutorResult<char16_t const*>::getValue() const
Line
Count
Source
61
86
  const T &getValue() const {
62
86
    assert(getStatus() == ExecutionStatus::RETURNED);
63
86
    return *reinterpret_cast<const T *>(&value_);
64
86
  }
Unexecuted instantiation: hermes::regex::ExecutorResult<char const*>::getValue() const
65
66
294
  ExecutionStatus getStatus() const {
67
294
    return status_;
68
294
  }
hermes::regex::ExecutorResult<bool>::getStatus() const
Line
Count
Source
66
208
  ExecutionStatus getStatus() const {
67
208
    return status_;
68
208
  }
hermes::regex::ExecutorResult<char16_t const*>::getStatus() const
Line
Count
Source
66
86
  ExecutionStatus getStatus() const {
67
86
    return status_;
68
86
  }
Unexecuted instantiation: hermes::regex::ExecutorResult<char const*>::getStatus() const
69
};
70
/// An enum describing Width1 opcodes. This is the set of regex opcodes which
71
/// always match exactly one character (or fail). This is broken out from Opcode
72
/// to get exhaustiveness checking in switch statements. Note that conversions
73
/// can be performed via static_cast.
74
enum class Width1Opcode : uint8_t {
75
  MatchChar8 = (uint8_t)Opcode::MatchChar8,
76
  MatchChar16 = (uint8_t)Opcode::MatchChar16,
77
  MatchCharICase8 = (uint8_t)Opcode::MatchCharICase8,
78
  MatchCharICase16 = (uint8_t)Opcode::MatchCharICase16,
79
  MatchAny = (uint8_t)Opcode::MatchAny,
80
  MatchAnyButNewline = (uint8_t)Opcode::MatchAnyButNewline,
81
  Bracket = (uint8_t)Opcode::Bracket,
82
};
83
84
/// LoopData tracks information about a loop during a match attempt. Each State
85
/// has one LoopData per loop.
86
struct LoopData {
87
  /// The number of times that this loop has executed in this state.
88
  uint32_t iterations;
89
90
  /// The input position where we entered the loop.
91
  uint32_t entryPosition;
92
};
93
94
/// Cursor is a lightweight value type which allows tracking a character pointer
95
/// 'current' within a range 'first' to 'last'.
96
/// A cursor may either be forwards, in which case it proceeds from 'first' to
97
/// 'last'. It may also (in the case of lookbehind assertions) be backwards, in
98
/// which case the cursor proceeds from 'last' to 'first'. The terms "begin" and
99
/// "end" denote tracking in the direction of the cursor, while "left" and
100
/// "right" are direction independent.
101
template <class Traits>
102
class Cursor {
103
  using CodeUnit = typename Traits::CodeUnit;
104
  using CodePoint = typename Traits::CodePoint;
105
106
 public:
107
  /// Construct with the range \p first and \p last, setting the current
108
  /// position to \p first. Note that the \p last is one past the last valid
109
  /// character. \p forwards decides whether the current pointer advances
110
  /// towards last_ (true) or first_ (false).
111
  Cursor(
112
      const CodeUnit *first,
113
      const CodeUnit *current,
114
      const CodeUnit *last,
115
      bool forwards)
116
86
      : first_(first),
117
86
        last_(last),
118
86
        current_(current),
119
86
        end_(forwards ? last : first),
120
86
        forwards_(forwards) {
121
86
    assert(first_ <= last_ && "first and last out of order");
122
86
    assert(
123
86
        first_ <= current_ && current <= last_ &&
124
86
        "current pointer not in range");
125
86
  }
hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::Cursor(char16_t const*, char16_t const*, char16_t const*, bool)
Line
Count
Source
116
86
      : first_(first),
117
86
        last_(last),
118
86
        current_(current),
119
86
        end_(forwards ? last : first),
120
86
        forwards_(forwards) {
121
86
    assert(first_ <= last_ && "first and last out of order");
122
86
    assert(
123
86
        first_ <= current_ && current <= last_ &&
124
86
        "current pointer not in range");
125
86
  }
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::Cursor(char const*, char const*, char const*, bool)
126
127
  /// \return whether this cursor advances forwards.
128
244
  bool forwards() const {
129
244
    return forwards_;
130
244
  }
hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::forwards() const
Line
Count
Source
128
244
  bool forwards() const {
129
244
    return forwards_;
130
244
  }
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::forwards() const
131
132
  /// Set whether this cursor advances forwards to \p flag.
133
0
  void setForwards(bool flag) {
134
0
    forwards_ = flag;
135
0
    end_ = forwards_ ? last_ : first_;
136
0
  }
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::setForwards(bool)
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::setForwards(bool)
137
138
  /// \return the number of code units remaining.
139
208
  uint32_t remaining() const {
140
208
    return forwards_ ? last_ - current_ : current_ - first_;
141
208
  }
hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::remaining() const
Line
Count
Source
139
208
  uint32_t remaining() const {
140
208
    return forwards_ ? last_ - current_ : current_ - first_;
141
208
  }
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::remaining() const
142
143
  /// \return whether we are at the end of the range.
144
243
  bool atEnd() const {
145
243
    return current_ == end_;
146
243
  }
hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::atEnd() const
Line
Count
Source
144
243
  bool atEnd() const {
145
243
    return current_ == end_;
146
243
  }
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::atEnd() const
147
148
  /// \return the number of code units consumed from the leftmost character.
149
  /// This is called "offsetFromLeft" and not "offsetFromStart" to indicate that
150
  /// it does not change under backwards tracking.
151
0
  uint32_t offsetFromLeft() const {
152
0
    return current_ - first_;
153
0
  }
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::offsetFromLeft() const
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::offsetFromLeft() const
154
155
  /// \return the number of code units between the current position and the end
156
  /// of the string.
157
  /// This is called "offsetFromRight" and not "offsetFromEnd" to indicate that
158
  /// it does not change under backwards tracking.
159
86
  uint32_t offsetFromRight() const {
160
86
    return last_ - current_;
161
86
  }
hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::offsetFromRight() const
Line
Count
Source
159
86
  uint32_t offsetFromRight() const {
160
86
    return last_ - current_;
161
86
  }
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::offsetFromRight() const
162
163
  /// \return whether we are at the leftmost position.
164
  /// This does not change under backwards tracking.
165
86
  bool atLeft() const {
166
86
    return current_ == first_;
167
86
  }
hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::atLeft() const
Line
Count
Source
165
86
  bool atLeft() const {
166
86
    return current_ == first_;
167
86
  }
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::atLeft() const
168
169
  /// \return whether we are at the rightmost position.
170
  /// This does not change under backwards tracking.
171
0
  bool atRight() const {
172
0
    return current_ == last_;
173
0
  }
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::atRight() const
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::atRight() const
174
175
  /// \return the current code unit.
176
243
  CodeUnit current() const {
177
    // Access the character at index 0 if forwards, -1 if backwards.
178
243
    assert(!atEnd() && "Cursor is at end");
179
243
    return current_[(int)forwards_ - 1];
180
243
  }
hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::current() const
Line
Count
Source
176
243
  CodeUnit current() const {
177
    // Access the character at index 0 if forwards, -1 if backwards.
178
243
    assert(!atEnd() && "Cursor is at end");
179
243
    return current_[(int)forwards_ - 1];
180
243
  }
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::current() const
181
182
  /// \return the current cursor position.
183
172
  const CodeUnit *currentPointer() const {
184
172
    return current_;
185
172
  }
hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::currentPointer() const
Line
Count
Source
183
172
  const CodeUnit *currentPointer() const {
184
172
    return current_;
185
172
  }
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::currentPointer() const
186
187
  /// Set the current cursor position to \p current.
188
208
  void setCurrentPointer(const CodeUnit *current) {
189
208
    assert(first_ <= current && current <= last_ && "Current not in range");
190
208
    current_ = current;
191
208
  }
hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::setCurrentPointer(char16_t const*)
Line
Count
Source
188
208
  void setCurrentPointer(const CodeUnit *current) {
189
208
    assert(first_ <= current && current <= last_ && "Current not in range");
190
208
    current_ = current;
191
208
  }
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::setCurrentPointer(char const*)
192
193
  /// \return the current code unit, advancing the cursor by 1.
194
243
  CodeUnit consume() {
195
243
    CodeUnit result = current();
196
243
    current_ += forwards_ ? 1 : -1;
197
243
    return result;
198
243
  }
hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::consume()
Line
Count
Source
194
243
  CodeUnit consume() {
195
243
    CodeUnit result = current();
196
243
    current_ += forwards_ ? 1 : -1;
197
243
    return result;
198
243
  }
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::consume()
199
200
  /// \return a code point decoded from the code units under the cursor,
201
  /// possibly by decoding surrogates. Advances the cursor by the number of code
202
  /// units consumed.
203
0
  CodePoint consumeUTF16() {
204
0
    assert(!atEnd() && "At end");
205
206
    // In ASCII we have no surrogates.
207
0
    if (sizeof(CodeUnit) >= 2 && remaining() >= 2) {
208
0
      CodeUnit hi = forwards_ ? current_[0] : current_[-2];
209
0
      CodeUnit lo = forwards_ ? current_[1] : current_[-1];
210
0
      if (isHighSurrogate(hi) && isLowSurrogate(lo)) {
211
0
        current_ += forwards_ ? 2 : -2;
212
0
        return utf16SurrogatePairToCodePoint(hi, lo);
213
0
      }
214
0
    }
215
0
    return consume();
216
0
  }
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::consumeUTF16()
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::consumeUTF16()
217
218
  /// \return whether a regex match performed using the given \p flags can
219
  /// possibly match the given \p constraints.
220
  bool satisfiesConstraints(
221
      constants::MatchFlagType flags,
222
86
      MatchConstraintSet constraints) const {
223
86
    if ((constraints & MatchConstraintNonASCII) &&
224
86
        (flags & constants::matchInputAllAscii))
225
0
      return false;
226
86
    if ((constraints & MatchConstraintAnchoredAtStart) && current_ != first_)
227
0
      return false;
228
86
    return true;
229
86
  }
hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>::satisfiesConstraints(hermes::regex::constants::MatchFlagType, unsigned char) const
Line
Count
Source
222
86
      MatchConstraintSet constraints) const {
223
86
    if ((constraints & MatchConstraintNonASCII) &&
224
86
        (flags & constants::matchInputAllAscii))
225
0
      return false;
226
86
    if ((constraints & MatchConstraintAnchoredAtStart) && current_ != first_)
227
0
      return false;
228
86
    return true;
229
86
  }
Unexecuted instantiation: hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>::satisfiesConstraints(hermes::regex::constants::MatchFlagType, unsigned char) const
230
231
 private:
232
  // The first code unit in the string.
233
  const CodeUnit *first_;
234
235
  // One past the last code unit in the string.
236
  const CodeUnit *last_;
237
238
  // Our position between first_ and last_.
239
  // If we are forwards, then the current character is current_[0].
240
  // If we are backwards, then the current character is current_[-1].
241
  const CodeUnit *current_;
242
243
  // A pointer to the end. This is either last (if forwards) or first (if not
244
  // forwards). If our current cursor reaches this value, we are done.
245
  const CodeUnit *end_;
246
247
  // Whether we are tracking forwards or backwards.
248
  bool forwards_;
249
};
250
251
/// A Context records global information about a match attempt.
252
template <class Traits>
253
struct Context {
254
  using CodeUnit = typename Traits::CodeUnit;
255
  using CodePoint = typename Traits::CodePoint;
256
257
  /// The set of backtracking opcodes. These are interpreted by the backtrack()
258
  /// function.
259
  enum class BacktrackOp : uint8_t {
260
    /// Set the value of a capture group to a stored value.
261
    SetCaptureGroup,
262
263
    /// Set the value of a loop data to a stored value.
264
    SetLoopData,
265
266
    /// Set the IP and position in the input string to a stored value.
267
    SetPosition,
268
269
    /// Backtrack by entering the body of a non-greedy loop.
270
    EnterNonGreedyLoop,
271
272
    /// Backtrack a greedy loop whose body matches exactly one character, such
273
    /// as /.*/.
274
    GreedyWidth1Loop,
275
276
    /// Backtrack a nongreedy loop whose body matches exactly one character,
277
    /// such as /.*?/.
278
    NongreedyWidth1Loop,
279
  };
280
281
  /// An instruction describing how to backtrack.
282
  union BacktrackInsn {
283
    /// The operation to perform.
284
    BacktrackOp op;
285
286
    /// List of instruction-specific fields. Note that the opcode is reproduced
287
    /// in every struct; this avoids padding between the opcode and the
288
    /// following field.
289
290
    /// Fields used by setCaptureGroup instruction.
291
    struct {
292
      BacktrackOp op;
293
      uint16_t mexp; /// Which capture group to set.
294
      CapturedRange range; /// Value to set.
295
    } setCaptureGroup;
296
297
    /// Fields used by SetLoopData instruction.
298
    struct {
299
      BacktrackOp op;
300
      uint16_t loopId; /// Which loop to set.
301
      LoopData loopData; /// Value to set.
302
    } setLoopData;
303
304
    /// Fields used by SetPosition instruction.
305
    struct {
306
      BacktrackOp op;
307
      uint32_t ip; /// Instruction pointer to set.
308
      const CodeUnit *value; /// Input string position to set.
309
    } setPosition;
310
311
    /// Fields used by EnterNonGreedyLoop instruction.
312
    struct {
313
      BacktrackOp op;
314
      uint32_t bodyIp; /// The IP of the loop body.
315
      LoopData loopData; /// Data for the loop to set.
316
      const BeginLoopInsn *loopInsn; /// The loop instruction.
317
    } enterNonGreedyLoop;
318
319
    /// Fields used by GreedyWidth1Loop and NongreedyWidth1Loop.
320
    struct {
321
      BacktrackOp op; /// The opcode.
322
      uint32_t continuation; /// The ip for the not-taken branch of the loop.
323
      const CodeUnit *min; /// The minimum possible match position.
324
      const CodeUnit *max; /// The maximum possible match position.
325
    } width1Loop;
326
327
36
    /* implicit */ BacktrackInsn(BacktrackOp op) : op(op) {}
hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn::BacktrackInsn(hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackOp)
Line
Count
Source
327
36
    /* implicit */ BacktrackInsn(BacktrackOp op) : op(op) {}
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn::BacktrackInsn(hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackOp)
328
329
    /// \return a SetCaptureGroup instruction.
330
    static BacktrackInsn makeSetCaptureGroup(
331
        uint16_t mexp,
332
0
        CapturedRange range) {
333
0
      BacktrackInsn result{BacktrackOp::SetCaptureGroup};
334
0
      result.setCaptureGroup.mexp = mexp;
335
0
      result.setCaptureGroup.range = range;
336
0
      return result;
337
0
    }
Unexecuted instantiation: hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn::makeSetCaptureGroup(unsigned short, hermes::regex::CapturedRange)
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn::makeSetCaptureGroup(unsigned short, hermes::regex::CapturedRange)
338
339
    /// \return a SetLoopData instruction.
340
0
    static BacktrackInsn makeSetLoopData(uint16_t loopId, LoopData loopData) {
341
0
      BacktrackInsn result{BacktrackOp::SetLoopData};
342
0
      result.setLoopData.loopId = loopId;
343
0
      result.setLoopData.loopData = loopData;
344
0
      return result;
345
0
    }
Unexecuted instantiation: hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn::makeSetLoopData(unsigned short, hermes::regex::LoopData)
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn::makeSetLoopData(unsigned short, hermes::regex::LoopData)
346
347
    /// \return a SetPosition instruction.
348
    static BacktrackInsn makeSetPosition(
349
        uint32_t ip,
350
0
        const CodeUnit *inputPos) {
351
0
      BacktrackInsn result = BacktrackOp::SetPosition;
352
0
      result.setPosition.ip = ip;
353
0
      result.setPosition.value = inputPos;
354
0
      return result;
355
0
    }
Unexecuted instantiation: hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn::makeSetPosition(unsigned int, char16_t const*)
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn::makeSetPosition(unsigned int, char const*)
356
357
    /// \return an EnterNonGreedyLoop instruction.
358
    static BacktrackInsn makeEnterNonGreedyLoop(
359
        const BeginLoopInsn *loopInsn,
360
        uint32_t bodyIp,
361
0
        LoopData loopData) {
362
0
      BacktrackInsn result = BacktrackOp::EnterNonGreedyLoop;
363
0
      result.enterNonGreedyLoop.bodyIp = bodyIp;
364
0
      result.enterNonGreedyLoop.loopInsn = loopInsn;
365
0
      result.enterNonGreedyLoop.loopData = loopData;
366
0
      return result;
367
0
    }
Unexecuted instantiation: hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn::makeEnterNonGreedyLoop(hermes::regex::BeginLoopInsn const*, unsigned int, hermes::regex::LoopData)
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn::makeEnterNonGreedyLoop(hermes::regex::BeginLoopInsn const*, unsigned int, hermes::regex::LoopData)
368
  };
369
370
  /// Our stack of backtrack instructions.
371
  using BacktrackStack = llvh::SmallVector<BacktrackInsn, 64>;
372
373
  /// The maximum depth of our backtracking stack. Beyond this we return a stack
374
  /// overflow error.
375
  static constexpr size_t kMaxBacktrackDepth = 1u << 24;
376
377
  /// The stream of bytecode instructions, including the header.
378
  llvh::ArrayRef<uint8_t> bytecodeStream_;
379
380
  /// The flags associated with the match attempt.
381
  constants::MatchFlagType flags_;
382
383
  /// Syntax flags associated with the regex.
384
  SyntaxFlags syntaxFlags_;
385
386
  /// The first character in the input string.
387
  const CodeUnit *first_;
388
389
  /// The end of the input string (one-past the last).
390
  const CodeUnit *last_;
391
392
  /// Count of submatches.
393
  uint32_t markedCount_;
394
395
  /// Count of loops.
396
  uint32_t loopCount_;
397
398
  /// Traits used for canonicalization.
399
  Traits traits_;
400
401
  /// The remaining number of times we will attempt to backtrack.
402
  /// This is effectively a timeout on the regexp execution.
403
  uint32_t backtracksRemaining_ = kBacktrackLimit;
404
405
  /// Used to guard against stack overflow. Either uses real stack
406
  /// checking or call depth counter checking.
407
  StackOverflowGuard overflowGuard_;
408
409
  Context(
410
      llvh::ArrayRef<uint8_t> bytecodeStream,
411
      constants::MatchFlagType flags,
412
      SyntaxFlags syntaxFlags,
413
      const CodeUnit *first,
414
      const CodeUnit *last,
415
      uint32_t markedCount,
416
      uint32_t loopCount,
417
      StackOverflowGuard guard)
418
86
      : bytecodeStream_(bytecodeStream),
419
86
        flags_(flags),
420
86
        syntaxFlags_(syntaxFlags),
421
86
        first_(first),
422
86
        last_(last),
423
86
        markedCount_(markedCount),
424
86
        loopCount_(loopCount),
425
86
        overflowGuard_(guard) {}
hermes::regex::Context<hermes::regex::UTF16RegexTraits>::Context(llvh::ArrayRef<unsigned char>, hermes::regex::constants::MatchFlagType, hermes::regex::SyntaxFlags, char16_t const*, char16_t const*, unsigned int, unsigned int, hermes::StackOverflowGuard)
Line
Count
Source
418
86
      : bytecodeStream_(bytecodeStream),
419
86
        flags_(flags),
420
86
        syntaxFlags_(syntaxFlags),
421
86
        first_(first),
422
86
        last_(last),
423
86
        markedCount_(markedCount),
424
86
        loopCount_(loopCount),
425
86
        overflowGuard_(guard) {}
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::Context(llvh::ArrayRef<unsigned char>, hermes::regex::constants::MatchFlagType, hermes::regex::SyntaxFlags, char const*, char const*, unsigned int, unsigned int, hermes::StackOverflowGuard)
426
427
  /// Run the given State \p state, by starting at its cursor and acting on its
428
  /// ip_ until the match succeeds or fails. If \p onlyAtStart is set, only
429
  /// test the match at \pos; otherwise test all successive input positions from
430
  /// pos_ through last_.
431
  /// \return a pointer to the start of the match if the match succeeds, nullptr
432
  /// if it fails. If the match succeeds, populates \p state with the state of
433
  /// the successful match; on failure the state's contents are undefined.
434
  /// Note the end of the match can be recovered as
435
  /// state->cursor_.currentPointer().
436
  ExecutorResult<const CodeUnit *> match(
437
      State<Traits> *state,
438
      bool onlyAtStart);
439
440
  /// Backtrack the given state \p s with the backtrack stack \p bts.
441
  /// \return true if we backtracked, false if we exhausted the stack.
442
  LLVM_NODISCARD
443
  ExecutorResult<bool> backtrack(BacktrackStack &bts, State<Traits> *s);
444
445
  /// Set the state's position to the body of a non-greedy loop.
446
  /// \return RETURNED if backtracking was prepared, STACK_OVERFLOW otherwise.
447
  LLVM_NODISCARD
448
  ExecutionStatus performEnterNonGreedyLoop(
449
      State<Traits> *s,
450
      const BeginLoopInsn *loop,
451
      uint32_t bodyIp,
452
      LoopData loopData,
453
      BacktrackStack &backtrackStack);
454
455
  /// Add a backtrack instruction to the backtrack stack \p bts.
456
  /// \return RETURNED on success, STACK_OVERFLOW otherwise
457
  LLVM_NODISCARD
458
36
  ExecutionStatus pushBacktrack(BacktrackStack &bts, BacktrackInsn insn) {
459
36
    bts.push_back(insn);
460
36
    if (LLVM_UNLIKELY(bts.size() > kMaxBacktrackDepth) ||
461
36
        LLVM_UNLIKELY(backtracksRemaining_ == 0)) {
462
0
      return ExecutionStatus::STACK_OVERFLOW;
463
0
    }
464
36
    backtracksRemaining_--;
465
36
    return ExecutionStatus::RETURNED;
466
36
  }
hermes::regex::Context<hermes::regex::UTF16RegexTraits>::pushBacktrack(llvh::SmallVector<hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn, 64u>&, hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn)
Line
Count
Source
458
36
  ExecutionStatus pushBacktrack(BacktrackStack &bts, BacktrackInsn insn) {
459
36
    bts.push_back(insn);
460
36
    if (LLVM_UNLIKELY(bts.size() > kMaxBacktrackDepth) ||
461
36
        LLVM_UNLIKELY(backtracksRemaining_ == 0)) {
462
0
      return ExecutionStatus::STACK_OVERFLOW;
463
0
    }
464
36
    backtracksRemaining_--;
465
36
    return ExecutionStatus::RETURNED;
466
36
  }
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::pushBacktrack(llvh::SmallVector<hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn, 64u>&, hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn)
467
468
  /// Run the given Width1Loop \p insn on the given state \p s with the
469
  /// backtrack stack \p bts.
470
  /// \return true on success, false if we should backtrack.
471
  LLVM_NODISCARD
472
  ExecutorResult<bool> matchWidth1Loop(
473
      const Width1LoopInsn *insn,
474
      State<Traits> *s,
475
      BacktrackStack &bts);
476
477
 private:
478
  /// Do initialization of the given state before it enters the loop body
479
  /// described by the LoopInsn \p loop, including setting up any backtracking
480
  /// state.
481
  /// \return RETURNED if backtracking was prepared, STACK_OVERFLOW else
482
  LLVM_NODISCARD
483
  ExecutionStatus prepareToEnterLoopBody(
484
      State<Traits> *state,
485
      const BeginLoopInsn *loop,
486
      BacktrackStack &bts);
487
488
  /// Given a Width1Opcode \p w1opcode, return true if the given char \p c
489
  /// matches the instruction \p insn (with that opcode).
490
  template <Width1Opcode w1opcode>
491
  inline bool matchWidth1(const Insn *insn, CodeUnit c) const;
492
493
  /// \return true if all chars, stored in contiguous memory after \p insn,
494
  /// match the chars in state \p s in the same order, case insensitive. Note
495
  /// the count of chars is given in \p insn.
496
  inline bool matchesNCharICase8(
497
      const MatchNCharICase8Insn *insn,
498
      State<Traits> &s);
499
500
  /// Execute the given Width1 instruction \p loopBody on cursor \p c up to \p
501
  /// max times. \return the number of matches made, not to exceed \p max.
502
  /// Note we deliberately accept \p c by value.
503
  template <Width1Opcode w1opcode>
504
  inline uint32_t
505
  matchWidth1LoopBody(const Insn *loopBody, Cursor<Traits> c, uint32_t max);
506
507
  /// ES6 21.2.5.2.3 AdvanceStringIndex.
508
  /// Return the index of the next character to check.
509
  /// This is typically just the index + 1, except if Unicode is enabled we need
510
  /// to skip surrogate pairs.
511
  inline size_t advanceStringIndex(
512
      const CodeUnit *start,
513
      size_t index,
514
      size_t lastIndex) const;
515
};
516
517
/// We store loop and captured range data contiguously in a single allocation at
518
/// the end of the State. Use this union to simplify the use of
519
/// llvh::TrailingObjects.
520
union LoopOrCapturedRange {
521
  struct LoopData loopData;
522
  struct CapturedRange capturedRange;
523
};
524
525
/// State represents a set of in-flight capture groups and loop datas, along
526
/// with the IP and input position.
527
template <typename Traits>
528
struct State {
529
  using CharT = typename Traits::CodeUnit;
530
531
  /// The cursor in the input string.
532
  Cursor<Traits> cursor_;
533
534
  /// The instruction pointer position in the bytecode stream.
535
  uint32_t ip_ = 0;
536
537
  /// List of captured ranges. This has size equal to the number of marked
538
  /// subexpressions for the regex.
539
  llvh::SmallVector<CapturedRange, 16> capturedRanges_;
540
541
  /// List of loop datas. This has size equal to the number of loops for the
542
  /// regex.
543
  llvh::SmallVector<LoopData, 16> loopDatas_;
544
545
  /// \return the loop data at index \p idx.
546
0
  LoopData &getLoop(uint32_t idx) {
547
0
    assert(idx < loopDatas_.size() && "Invalid loop index");
548
0
    return loopDatas_[idx];
549
0
  }
Unexecuted instantiation: hermes::regex::State<hermes::regex::UTF16RegexTraits>::getLoop(unsigned int)
Unexecuted instantiation: hermes::regex::State<hermes::regex::ASCIIRegexTraits>::getLoop(unsigned int)
550
551
  /// \return the captured range at index \p idx.
552
0
  CapturedRange &getCapturedRange(uint32_t idx) {
553
    // Captured ranges are allocated after loops, so add the loop count.
554
0
    assert(idx < capturedRanges_.size() && "Invalid captured range index");
555
0
    return capturedRanges_[idx];
556
0
  }
Unexecuted instantiation: hermes::regex::State<hermes::regex::UTF16RegexTraits>::getCapturedRange(unsigned int)
Unexecuted instantiation: hermes::regex::State<hermes::regex::ASCIIRegexTraits>::getCapturedRange(unsigned int)
557
558
  /// Construct a state which with the given \p cursor, which can hold \p
559
  /// markedCount submatches and \p loopCount loop datas.
560
  State(Cursor<Traits> cursor, uint32_t markedCount, uint32_t loopCount)
561
86
      : cursor_(cursor),
562
86
        capturedRanges_(markedCount, {kNotMatched, kNotMatched}),
563
86
        loopDatas_(loopCount, {0, 0}) {}
hermes::regex::State<hermes::regex::UTF16RegexTraits>::State(hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>, unsigned int, unsigned int)
Line
Count
Source
561
86
      : cursor_(cursor),
562
86
        capturedRanges_(markedCount, {kNotMatched, kNotMatched}),
563
86
        loopDatas_(loopCount, {0, 0}) {}
Unexecuted instantiation: hermes::regex::State<hermes::regex::ASCIIRegexTraits>::State(hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>, unsigned int, unsigned int)
564
565
0
  State(const State &) = default;
Unexecuted instantiation: hermes::regex::State<hermes::regex::UTF16RegexTraits>::State(hermes::regex::State<hermes::regex::UTF16RegexTraits> const&)
Unexecuted instantiation: hermes::regex::State<hermes::regex::ASCIIRegexTraits>::State(hermes::regex::State<hermes::regex::ASCIIRegexTraits> const&)
566
  State &operator=(const State &) = default;
567
  State(State &&) = default;
568
0
  State &operator=(State &&) = default;
Unexecuted instantiation: hermes::regex::State<hermes::regex::UTF16RegexTraits>::operator=(hermes::regex::State<hermes::regex::UTF16RegexTraits>&&)
Unexecuted instantiation: hermes::regex::State<hermes::regex::ASCIIRegexTraits>::operator=(hermes::regex::State<hermes::regex::ASCIIRegexTraits>&&)
569
};
570
571
/// ES5.1 7.3
572
template <class CharT>
573
0
bool isLineTerminator(CharT c) {
574
0
  return c == u'\u000A' || c == u'\u000D' || c == u'\u2028' || c == u'\u2029';
575
0
}
Unexecuted instantiation: bool hermes::regex::isLineTerminator<char16_t>(char16_t)
Unexecuted instantiation: bool hermes::regex::isLineTerminator<unsigned int>(unsigned int)
Unexecuted instantiation: bool hermes::regex::isLineTerminator<char>(char)
Unexecuted instantiation: bool hermes::regex::isLineTerminator<unsigned char>(unsigned char)
576
577
template <class Traits>
578
86
bool matchesLeftAnchor(Context<Traits> &ctx, State<Traits> &s) {
579
86
  bool matchesAnchor = false;
580
86
  const Cursor<Traits> &c = s.cursor_;
581
86
  if (c.atLeft()) {
582
    // Beginning of text.
583
86
    matchesAnchor = true;
584
86
  } else if (
585
0
      (ctx.syntaxFlags_.multiline) && !c.atLeft() &&
586
0
      isLineTerminator(c.currentPointer()[-1])) {
587
    // Multiline and after line terminator.
588
0
    matchesAnchor = true;
589
0
  }
590
86
  return matchesAnchor;
591
86
}
bool hermes::regex::matchesLeftAnchor<hermes::regex::UTF16RegexTraits>(hermes::regex::Context<hermes::regex::UTF16RegexTraits>&, hermes::regex::State<hermes::regex::UTF16RegexTraits>&)
Line
Count
Source
578
86
bool matchesLeftAnchor(Context<Traits> &ctx, State<Traits> &s) {
579
86
  bool matchesAnchor = false;
580
86
  const Cursor<Traits> &c = s.cursor_;
581
86
  if (c.atLeft()) {
582
    // Beginning of text.
583
86
    matchesAnchor = true;
584
86
  } else if (
585
0
      (ctx.syntaxFlags_.multiline) && !c.atLeft() &&
586
0
      isLineTerminator(c.currentPointer()[-1])) {
587
    // Multiline and after line terminator.
588
0
    matchesAnchor = true;
589
0
  }
590
86
  return matchesAnchor;
591
86
}
Unexecuted instantiation: bool hermes::regex::matchesLeftAnchor<hermes::regex::ASCIIRegexTraits>(hermes::regex::Context<hermes::regex::ASCIIRegexTraits>&, hermes::regex::State<hermes::regex::ASCIIRegexTraits>&)
592
593
template <class Traits>
594
0
bool matchesRightAnchor(Context<Traits> &ctx, State<Traits> &s) {
595
0
  bool matchesAnchor = false;
596
0
  const Cursor<Traits> &c = s.cursor_;
597
0
  if (c.atRight() && !(ctx.flags_ & constants::matchNotEndOfLine)) {
598
0
    matchesAnchor = true;
599
0
  } else if (
600
0
      (ctx.syntaxFlags_.multiline) && (!c.atRight()) &&
601
0
      isLineTerminator(c.currentPointer()[0])) {
602
0
    matchesAnchor = true;
603
0
  }
604
0
  return matchesAnchor;
605
0
}
Unexecuted instantiation: bool hermes::regex::matchesRightAnchor<hermes::regex::UTF16RegexTraits>(hermes::regex::Context<hermes::regex::UTF16RegexTraits>&, hermes::regex::State<hermes::regex::UTF16RegexTraits>&)
Unexecuted instantiation: bool hermes::regex::matchesRightAnchor<hermes::regex::ASCIIRegexTraits>(hermes::regex::Context<hermes::regex::ASCIIRegexTraits>&, hermes::regex::State<hermes::regex::ASCIIRegexTraits>&)
606
607
/// \return true if all chars, stored in contiguous memory after \p insn,
608
/// match the chars in state \p s in the same order. Note the count of chars
609
/// is given in \p insn.
610
template <class Traits>
611
121
bool matchesNChar8(const MatchNChar8Insn *insn, State<Traits> &s) {
612
121
  Cursor<Traits> &c = s.cursor_;
613
121
  auto insnCharPtr = reinterpret_cast<const char *>(insn + 1);
614
121
  auto charCount = insn->charCount;
615
121
  for (int idx = 0; idx < charCount; idx++) {
616
121
    if (c.consume() != insnCharPtr[idx]) {
617
121
      return false;
618
121
    }
619
121
  }
620
0
  return true;
621
121
}
bool hermes::regex::matchesNChar8<hermes::regex::UTF16RegexTraits>(hermes::regex::MatchNChar8Insn const*, hermes::regex::State<hermes::regex::UTF16RegexTraits>&)
Line
Count
Source
611
121
bool matchesNChar8(const MatchNChar8Insn *insn, State<Traits> &s) {
612
121
  Cursor<Traits> &c = s.cursor_;
613
121
  auto insnCharPtr = reinterpret_cast<const char *>(insn + 1);
614
121
  auto charCount = insn->charCount;
615
121
  for (int idx = 0; idx < charCount; idx++) {
616
121
    if (c.consume() != insnCharPtr[idx]) {
617
121
      return false;
618
121
    }
619
121
  }
620
0
  return true;
621
121
}
Unexecuted instantiation: bool hermes::regex::matchesNChar8<hermes::regex::ASCIIRegexTraits>(hermes::regex::MatchNChar8Insn const*, hermes::regex::State<hermes::regex::ASCIIRegexTraits>&)
622
623
template <class Traits>
624
bool Context<Traits>::matchesNCharICase8(
625
    const MatchNCharICase8Insn *insn,
626
0
    State<Traits> &s) {
627
0
  Cursor<Traits> &c = s.cursor_;
628
0
  auto insnCharPtr = reinterpret_cast<const char *>(insn + 1);
629
0
  auto charCount = insn->charCount;
630
0
  bool unicode = syntaxFlags_.unicode;
631
0
  for (int idx = 0; idx < charCount; idx++) {
632
0
    auto c1 = c.consume();
633
0
    char instC = insnCharPtr[idx];
634
0
    if (c1 != instC &&
635
0
        (char32_t)traits_.canonicalize(c1, unicode) != (char32_t)instC) {
636
0
      return false;
637
0
    }
638
0
  }
639
0
  return true;
640
0
}
Unexecuted instantiation: hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchesNCharICase8(hermes::regex::MatchNCharICase8Insn const*, hermes::regex::State<hermes::regex::UTF16RegexTraits>&)
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchesNCharICase8(hermes::regex::MatchNCharICase8Insn const*, hermes::regex::State<hermes::regex::ASCIIRegexTraits>&)
641
642
/// \return true if the character \p ch matches a bracket instruction \p insn,
643
/// containing the bracket ranges \p ranges. Note the count of ranges is given
644
/// in \p insn.
645
template <class Traits>
646
bool bracketMatchesChar(
647
    const Context<Traits> &ctx,
648
    const BracketInsn *insn,
649
    const BracketRange32 *ranges,
650
122
    typename Traits::CodePoint ch) {
651
122
  const auto &traits = ctx.traits_;
652
  // Note that if the bracket is negated /[^abc]/, we want to return true if we
653
  // do not match, false if we do. Implement this by xor with the negate flag.
654
655
  // Check character classes.
656
  // Note we don't have to canonicalize here, because canonicalization does not
657
  // affect which character class a character is in (i.e. a character doesn't
658
  // become a digit after uppercasing).
659
122
  if (insn->positiveCharClasses || insn->negativeCharClasses) {
660
122
    for (auto charClass :
661
122
         {CharacterClass::Digits,
662
122
          CharacterClass::Spaces,
663
330
          CharacterClass::Words}) {
664
330
      if ((insn->positiveCharClasses & charClass) &&
665
330
          traits.characterHasType(ch, charClass))
666
36
        return true ^ insn->negate;
667
294
      if ((insn->negativeCharClasses & charClass) &&
668
294
          !traits.characterHasType(ch, charClass))
669
0
        return true ^ insn->negate;
670
294
    }
671
122
  }
672
673
86
  bool contained =
674
86
      traits.rangesContain(llvh::makeArrayRef(ranges, insn->rangeCount), ch);
675
86
  return contained ^ insn->negate;
676
122
}
bool hermes::regex::bracketMatchesChar<hermes::regex::UTF16RegexTraits>(hermes::regex::Context<hermes::regex::UTF16RegexTraits> const&, hermes::regex::BracketInsn const*, hermes::regex::BracketRange32 const*, hermes::regex::UTF16RegexTraits::CodePoint)
Line
Count
Source
650
122
    typename Traits::CodePoint ch) {
651
122
  const auto &traits = ctx.traits_;
652
  // Note that if the bracket is negated /[^abc]/, we want to return true if we
653
  // do not match, false if we do. Implement this by xor with the negate flag.
654
655
  // Check character classes.
656
  // Note we don't have to canonicalize here, because canonicalization does not
657
  // affect which character class a character is in (i.e. a character doesn't
658
  // become a digit after uppercasing).
659
122
  if (insn->positiveCharClasses || insn->negativeCharClasses) {
660
122
    for (auto charClass :
661
122
         {CharacterClass::Digits,
662
122
          CharacterClass::Spaces,
663
330
          CharacterClass::Words}) {
664
330
      if ((insn->positiveCharClasses & charClass) &&
665
330
          traits.characterHasType(ch, charClass))
666
36
        return true ^ insn->negate;
667
294
      if ((insn->negativeCharClasses & charClass) &&
668
294
          !traits.characterHasType(ch, charClass))
669
0
        return true ^ insn->negate;
670
294
    }
671
122
  }
672
673
86
  bool contained =
674
86
      traits.rangesContain(llvh::makeArrayRef(ranges, insn->rangeCount), ch);
675
86
  return contained ^ insn->negate;
676
122
}
Unexecuted instantiation: bool hermes::regex::bracketMatchesChar<hermes::regex::ASCIIRegexTraits>(hermes::regex::Context<hermes::regex::ASCIIRegexTraits> const&, hermes::regex::BracketInsn const*, hermes::regex::BracketRange32 const*, hermes::regex::ASCIIRegexTraits::CodePoint)
677
678
template <class Traits>
679
ExecutionStatus Context<Traits>::prepareToEnterLoopBody(
680
    State<Traits> *s,
681
    const BeginLoopInsn *loop,
682
0
    BacktrackStack &bts) {
683
0
  LoopData &loopData = s->getLoop(loop->loopId);
684
0
  auto res = pushBacktrack(
685
0
      bts, BacktrackInsn::makeSetLoopData(loop->loopId, loopData));
686
0
  if (res != ExecutionStatus::RETURNED) {
687
0
    return res;
688
0
  }
689
0
  loopData.iterations++;
690
0
  loopData.entryPosition = s->cursor_.offsetFromLeft();
691
692
  // Backtrack and reset contained capture groups.
693
0
  for (uint32_t mexp = loop->mexpBegin; mexp != loop->mexpEnd; mexp++) {
694
0
    auto &captureRange = s->getCapturedRange(mexp);
695
0
    res = pushBacktrack(
696
0
        bts, BacktrackInsn::makeSetCaptureGroup(mexp, captureRange));
697
0
    if (res != ExecutionStatus::RETURNED) {
698
0
      return res;
699
0
    }
700
0
    captureRange = {kNotMatched, kNotMatched};
701
0
  }
702
0
  return ExecutionStatus::RETURNED;
703
0
}
Unexecuted instantiation: hermes::regex::Context<hermes::regex::UTF16RegexTraits>::prepareToEnterLoopBody(hermes::regex::State<hermes::regex::UTF16RegexTraits>*, hermes::regex::BeginLoopInsn const*, llvh::SmallVector<hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn, 64u>&)
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::prepareToEnterLoopBody(hermes::regex::State<hermes::regex::ASCIIRegexTraits>*, hermes::regex::BeginLoopInsn const*, llvh::SmallVector<hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn, 64u>&)
704
705
template <class Traits>
706
ExecutionStatus Context<Traits>::performEnterNonGreedyLoop(
707
    State<Traits> *s,
708
    const BeginLoopInsn *loop,
709
    uint32_t bodyIp,
710
    LoopData loopData,
711
0
    BacktrackStack &backtrackStack) {
712
0
  assert(loop->opcode == Opcode::BeginLoop && "Not a BeginLoopInsn");
713
0
  s->getLoop(loop->loopId) = loopData;
714
715
  // Set the IP and input position, and initialize the state for entering the
716
  // loop.
717
0
  s->ip_ = bodyIp;
718
0
  s->cursor_.setCurrentPointer(first_ + loopData.entryPosition);
719
0
  return prepareToEnterLoopBody(s, loop, backtrackStack);
720
0
}
Unexecuted instantiation: hermes::regex::Context<hermes::regex::UTF16RegexTraits>::performEnterNonGreedyLoop(hermes::regex::State<hermes::regex::UTF16RegexTraits>*, hermes::regex::BeginLoopInsn const*, unsigned int, hermes::regex::LoopData, llvh::SmallVector<hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn, 64u>&)
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::performEnterNonGreedyLoop(hermes::regex::State<hermes::regex::ASCIIRegexTraits>*, hermes::regex::BeginLoopInsn const*, unsigned int, hermes::regex::LoopData, llvh::SmallVector<hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn, 64u>&)
721
722
template <class Traits>
723
ExecutorResult<bool> Context<Traits>::backtrack(
724
    BacktrackStack &bts,
725
122
    State<Traits> *s) {
726
158
  while (!bts.empty()) {
727
72
    BacktrackInsn &binsn = bts.back();
728
72
    switch (binsn.op) {
729
0
      case BacktrackOp::SetCaptureGroup:
730
0
        s->getCapturedRange(binsn.setCaptureGroup.mexp) =
731
0
            binsn.setCaptureGroup.range;
732
0
        bts.pop_back();
733
0
        break;
734
735
0
      case BacktrackOp::SetLoopData:
736
0
        s->getLoop(binsn.setLoopData.loopId) = binsn.setLoopData.loopData;
737
0
        bts.pop_back();
738
0
        break;
739
740
0
      case BacktrackOp::SetPosition:
741
0
        s->cursor_.setCurrentPointer(binsn.setPosition.value);
742
0
        s->ip_ = binsn.setPosition.ip;
743
0
        bts.pop_back();
744
0
        return true;
745
746
0
      case BacktrackOp::EnterNonGreedyLoop: {
747
0
        auto fields = binsn.enterNonGreedyLoop;
748
0
        bts.pop_back();
749
0
        auto res = performEnterNonGreedyLoop(
750
0
            s, fields.loopInsn, fields.bodyIp, fields.loopData, bts);
751
0
        if (res != ExecutionStatus::RETURNED) {
752
0
          return res;
753
0
        }
754
0
        return true;
755
0
      }
756
757
72
      case BacktrackOp::GreedyWidth1Loop:
758
72
      case BacktrackOp::NongreedyWidth1Loop: {
759
        // In both of these instructions, we have a range [min, max] containing
760
        // possible match locations, and the match failed at the max location
761
        // (if we are greedy) or the min location (nongreedy). Backtrack by
762
        // decrementing the max (incrementing the min) if we are greedy
763
        // (nongreedy), setting the IP to that location, and jumping to the loop
764
        // exit. Note that if we are tracking backwards (lookbehind assertion)
765
        // our maximum is before our minimum, so we have to reverse the
766
        // direction of increment/decrement.
767
72
        bool forwards = s->cursor_.forwards();
768
72
        assert(
769
72
            (forwards ? binsn.width1Loop.min <= binsn.width1Loop.max
770
72
                      : binsn.width1Loop.min >= binsn.width1Loop.max) &&
771
72
            "Loop min should be <= max (or >= max if backwards)");
772
72
        if (binsn.width1Loop.min == binsn.width1Loop.max) {
773
          // We have backtracked as far as possible. Give up.
774
36
          bts.pop_back();
775
36
          break;
776
36
        }
777
36
        if (binsn.op == BacktrackOp::GreedyWidth1Loop) {
778
36
          binsn.width1Loop.max += forwards ? -1 : 1;
779
36
          s->cursor_.setCurrentPointer(binsn.width1Loop.max);
780
36
        } else {
781
0
          binsn.width1Loop.min += forwards ? 1 : -1;
782
0
          s->cursor_.setCurrentPointer(binsn.width1Loop.min);
783
0
        }
784
36
        s->ip_ = binsn.width1Loop.continuation;
785
36
        return true;
786
72
      }
787
72
    }
788
72
  }
789
  // Exhausted the backtracking stack.
790
86
  return false;
791
122
}
hermes::regex::Context<hermes::regex::UTF16RegexTraits>::backtrack(llvh::SmallVector<hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn, 64u>&, hermes::regex::State<hermes::regex::UTF16RegexTraits>*)
Line
Count
Source
725
122
    State<Traits> *s) {
726
158
  while (!bts.empty()) {
727
72
    BacktrackInsn &binsn = bts.back();
728
72
    switch (binsn.op) {
729
0
      case BacktrackOp::SetCaptureGroup:
730
0
        s->getCapturedRange(binsn.setCaptureGroup.mexp) =
731
0
            binsn.setCaptureGroup.range;
732
0
        bts.pop_back();
733
0
        break;
734
735
0
      case BacktrackOp::SetLoopData:
736
0
        s->getLoop(binsn.setLoopData.loopId) = binsn.setLoopData.loopData;
737
0
        bts.pop_back();
738
0
        break;
739
740
0
      case BacktrackOp::SetPosition:
741
0
        s->cursor_.setCurrentPointer(binsn.setPosition.value);
742
0
        s->ip_ = binsn.setPosition.ip;
743
0
        bts.pop_back();
744
0
        return true;
745
746
0
      case BacktrackOp::EnterNonGreedyLoop: {
747
0
        auto fields = binsn.enterNonGreedyLoop;
748
0
        bts.pop_back();
749
0
        auto res = performEnterNonGreedyLoop(
750
0
            s, fields.loopInsn, fields.bodyIp, fields.loopData, bts);
751
0
        if (res != ExecutionStatus::RETURNED) {
752
0
          return res;
753
0
        }
754
0
        return true;
755
0
      }
756
757
72
      case BacktrackOp::GreedyWidth1Loop:
758
72
      case BacktrackOp::NongreedyWidth1Loop: {
759
        // In both of these instructions, we have a range [min, max] containing
760
        // possible match locations, and the match failed at the max location
761
        // (if we are greedy) or the min location (nongreedy). Backtrack by
762
        // decrementing the max (incrementing the min) if we are greedy
763
        // (nongreedy), setting the IP to that location, and jumping to the loop
764
        // exit. Note that if we are tracking backwards (lookbehind assertion)
765
        // our maximum is before our minimum, so we have to reverse the
766
        // direction of increment/decrement.
767
72
        bool forwards = s->cursor_.forwards();
768
72
        assert(
769
72
            (forwards ? binsn.width1Loop.min <= binsn.width1Loop.max
770
72
                      : binsn.width1Loop.min >= binsn.width1Loop.max) &&
771
72
            "Loop min should be <= max (or >= max if backwards)");
772
72
        if (binsn.width1Loop.min == binsn.width1Loop.max) {
773
          // We have backtracked as far as possible. Give up.
774
36
          bts.pop_back();
775
36
          break;
776
36
        }
777
36
        if (binsn.op == BacktrackOp::GreedyWidth1Loop) {
778
36
          binsn.width1Loop.max += forwards ? -1 : 1;
779
36
          s->cursor_.setCurrentPointer(binsn.width1Loop.max);
780
36
        } else {
781
0
          binsn.width1Loop.min += forwards ? 1 : -1;
782
0
          s->cursor_.setCurrentPointer(binsn.width1Loop.min);
783
0
        }
784
36
        s->ip_ = binsn.width1Loop.continuation;
785
36
        return true;
786
72
      }
787
72
    }
788
72
  }
789
  // Exhausted the backtracking stack.
790
86
  return false;
791
122
}
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::backtrack(llvh::SmallVector<hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn, 64u>&, hermes::regex::State<hermes::regex::ASCIIRegexTraits>*)
792
793
template <class Traits>
794
template <Width1Opcode w1opcode>
795
122
bool Context<Traits>::matchWidth1(const Insn *base, CodeUnit c) const {
796
  // Note this switch should resolve at compile time.
797
122
  assert(
798
122
      base->opcode == static_cast<Opcode>(w1opcode) &&
799
122
      "Instruction has wrong opcode");
800
122
  switch (w1opcode) {
801
0
    case Width1Opcode::MatchChar8: {
802
0
      const auto *insn = llvh::cast<MatchChar8Insn>(base);
803
0
      return c == insn->c;
804
0
    }
805
806
0
    case Width1Opcode::MatchChar16: {
807
0
      const auto *insn = llvh::cast<MatchChar16Insn>(base);
808
0
      return c == insn->c;
809
0
    }
810
811
0
    case Width1Opcode::MatchCharICase8: {
812
0
      const auto *insn = llvh::cast<MatchCharICase8Insn>(base);
813
0
      return c == (CodePoint)insn->c ||
814
0
          (CodePoint)traits_.canonicalize(c, syntaxFlags_.unicode) ==
815
0
          (CodePoint)insn->c;
816
0
    }
817
818
0
    case Width1Opcode::MatchCharICase16: {
819
0
      const auto *insn = llvh::cast<MatchCharICase16Insn>(base);
820
0
      return c == insn->c ||
821
0
          (char32_t)traits_.canonicalize(c, syntaxFlags_.unicode) ==
822
0
          (char32_t)insn->c;
823
0
    }
824
825
0
    case Width1Opcode::MatchAny:
826
0
      return true;
827
828
0
    case Width1Opcode::MatchAnyButNewline:
829
0
      return !isLineTerminator(c);
830
831
122
    case Width1Opcode::Bracket: {
832
      // BracketInsn is followed by a list of BracketRange32s.
833
122
      assert(
834
122
          !(syntaxFlags_.unicode) &&
835
122
          "Unicode should not be set for Width 1 brackets");
836
122
      const BracketInsn *insn = llvh::cast<BracketInsn>(base);
837
122
      const BracketRange32 *ranges =
838
122
          reinterpret_cast<const BracketRange32 *>(insn + 1);
839
122
      return bracketMatchesChar<Traits>(*this, insn, ranges, c);
840
122
    }
841
122
  }
842
122
  llvm_unreachable("Invalid width 1 opcode");
843
122
}
Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)3>(hermes::regex::Insn const*, char16_t) const
Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)5>(hermes::regex::Insn const*, char16_t) const
Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)7>(hermes::regex::Insn const*, char16_t) const
Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)8>(hermes::regex::Insn const*, char16_t) const
Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)12>(hermes::regex::Insn const*, char16_t) const
Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)13>(hermes::regex::Insn const*, char16_t) const
bool hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)17>(hermes::regex::Insn const*, char16_t) const
Line
Count
Source
795
122
bool Context<Traits>::matchWidth1(const Insn *base, CodeUnit c) const {
796
  // Note this switch should resolve at compile time.
797
122
  assert(
798
122
      base->opcode == static_cast<Opcode>(w1opcode) &&
799
122
      "Instruction has wrong opcode");
800
122
  switch (w1opcode) {
801
0
    case Width1Opcode::MatchChar8: {
802
0
      const auto *insn = llvh::cast<MatchChar8Insn>(base);
803
0
      return c == insn->c;
804
0
    }
805
806
0
    case Width1Opcode::MatchChar16: {
807
0
      const auto *insn = llvh::cast<MatchChar16Insn>(base);
808
0
      return c == insn->c;
809
0
    }
810
811
0
    case Width1Opcode::MatchCharICase8: {
812
0
      const auto *insn = llvh::cast<MatchCharICase8Insn>(base);
813
0
      return c == (CodePoint)insn->c ||
814
0
          (CodePoint)traits_.canonicalize(c, syntaxFlags_.unicode) ==
815
0
          (CodePoint)insn->c;
816
0
    }
817
818
0
    case Width1Opcode::MatchCharICase16: {
819
0
      const auto *insn = llvh::cast<MatchCharICase16Insn>(base);
820
0
      return c == insn->c ||
821
0
          (char32_t)traits_.canonicalize(c, syntaxFlags_.unicode) ==
822
0
          (char32_t)insn->c;
823
0
    }
824
825
0
    case Width1Opcode::MatchAny:
826
0
      return true;
827
828
0
    case Width1Opcode::MatchAnyButNewline:
829
0
      return !isLineTerminator(c);
830
831
122
    case Width1Opcode::Bracket: {
832
      // BracketInsn is followed by a list of BracketRange32s.
833
122
      assert(
834
122
          !(syntaxFlags_.unicode) &&
835
122
          "Unicode should not be set for Width 1 brackets");
836
122
      const BracketInsn *insn = llvh::cast<BracketInsn>(base);
837
122
      const BracketRange32 *ranges =
838
122
          reinterpret_cast<const BracketRange32 *>(insn + 1);
839
122
      return bracketMatchesChar<Traits>(*this, insn, ranges, c);
840
122
    }
841
122
  }
842
0
  llvm_unreachable("Invalid width 1 opcode");
843
122
}
Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)3>(hermes::regex::Insn const*, char) const
Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)5>(hermes::regex::Insn const*, char) const
Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)7>(hermes::regex::Insn const*, char) const
Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)8>(hermes::regex::Insn const*, char) const
Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)12>(hermes::regex::Insn const*, char) const
Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)13>(hermes::regex::Insn const*, char) const
Unexecuted instantiation: bool hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1<(hermes::regex::Width1Opcode)17>(hermes::regex::Insn const*, char) const
844
845
template <class Traits>
846
template <Width1Opcode w1opcode>
847
uint32_t Context<Traits>::matchWidth1LoopBody(
848
    const Insn *insn,
849
    Cursor<Traits> c,
850
86
    uint32_t max) {
851
86
  uint32_t iters = 0;
852
122
  for (; iters < max; iters++) {
853
122
    if (!matchWidth1<w1opcode>(insn, c.consume()))
854
86
      break;
855
122
  }
856
86
  return iters;
857
86
}
Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)7>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>, unsigned int)
Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)8>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>, unsigned int)
Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)12>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>, unsigned int)
Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)13>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>, unsigned int)
Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)3>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>, unsigned int)
Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)5>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>, unsigned int)
unsigned int hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)17>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::UTF16RegexTraits>, unsigned int)
Line
Count
Source
850
86
    uint32_t max) {
851
86
  uint32_t iters = 0;
852
122
  for (; iters < max; iters++) {
853
122
    if (!matchWidth1<w1opcode>(insn, c.consume()))
854
86
      break;
855
122
  }
856
86
  return iters;
857
86
}
Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)7>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>, unsigned int)
Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)8>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>, unsigned int)
Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)12>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>, unsigned int)
Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)13>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>, unsigned int)
Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)3>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>, unsigned int)
Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)5>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>, unsigned int)
Unexecuted instantiation: unsigned int hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1LoopBody<(hermes::regex::Width1Opcode)17>(hermes::regex::Insn const*, hermes::regex::Cursor<hermes::regex::ASCIIRegexTraits>, unsigned int)
858
859
template <class Traits>
860
ExecutorResult<bool> Context<Traits>::matchWidth1Loop(
861
    const Width1LoopInsn *insn,
862
    State<Traits> *s,
863
86
    BacktrackStack &bts) {
864
  // Note we copy the cursor here.
865
86
  Cursor<Traits> c = s->cursor_;
866
86
  uint32_t matched = 0, minMatch = insn->min, maxMatch = insn->max;
867
868
  // Limit our max to the smaller of the maximum in the loop and number of
869
  // number of characters remaining. This allows us to avoid having to test for
870
  // end of input in the loop body.
871
86
  maxMatch = std::min(c.remaining(), maxMatch);
872
873
  // The loop body follows the loop instruction.
874
86
  const Insn *body = static_cast<const Insn *>(&insn[1]);
875
876
  // Match as far as we can up to maxMatch. Note we do this even if the loop is
877
  // non-greedy: we compute how far we might conceivably have to backtrack
878
  // (except in non-greedy loops we're "backtracking" by moving forwards).
879
86
  using W1 = Width1Opcode;
880
86
  switch (static_cast<Width1Opcode>(body->opcode)) {
881
0
    case W1::MatchChar8:
882
0
      matched = matchWidth1LoopBody<W1::MatchChar8>(body, c, maxMatch);
883
0
      break;
884
0
    case W1::MatchChar16:
885
0
      matched = matchWidth1LoopBody<W1::MatchChar16>(body, c, maxMatch);
886
0
      break;
887
0
    case W1::MatchCharICase8:
888
0
      matched = matchWidth1LoopBody<W1::MatchCharICase8>(body, c, maxMatch);
889
0
      break;
890
0
    case W1::MatchCharICase16:
891
0
      matched = matchWidth1LoopBody<W1::MatchCharICase16>(body, c, maxMatch);
892
0
      break;
893
0
    case W1::MatchAny:
894
0
      matched = matchWidth1LoopBody<W1::MatchAny>(body, c, maxMatch);
895
0
      break;
896
0
    case W1::MatchAnyButNewline:
897
0
      matched = matchWidth1LoopBody<W1::MatchAnyButNewline>(body, c, maxMatch);
898
0
      break;
899
86
    case W1::Bracket:
900
86
      matched = matchWidth1LoopBody<W1::Bracket>(body, c, maxMatch);
901
86
      break;
902
86
  }
903
904
  // If we iterated less than the minimum, we failed to match.
905
86
  if (matched < minMatch) {
906
0
    return false;
907
0
  }
908
86
  assert(
909
86
      minMatch <= matched && matched <= maxMatch &&
910
86
      "matched should be between min and max match count");
911
912
  // Now we know the valid match range.
913
  // Compute the beginning and end pointers in this range.
914
86
  bool forwards = s->cursor_.forwards();
915
86
  const CodeUnit *pos = s->cursor_.currentPointer();
916
86
  const CodeUnit *minPos = forwards ? pos + minMatch : pos - minMatch;
917
86
  const CodeUnit *maxPos = forwards ? pos + matched : pos - matched;
918
919
  // If min == max (e.g. /a{3}/) then no backtracking is possible. If min < max,
920
  // backtracking is possible and we need to add a backtracking instruction.
921
86
  if (minMatch < matched) {
922
36
    BacktrackInsn backtrack{
923
36
        insn->greedy ? BacktrackOp::GreedyWidth1Loop
924
36
                     : BacktrackOp::NongreedyWidth1Loop};
925
36
    backtrack.width1Loop.continuation = insn->notTakenTarget;
926
36
    backtrack.width1Loop.min = minPos;
927
36
    backtrack.width1Loop.max = maxPos;
928
36
    auto res = pushBacktrack(bts, backtrack);
929
36
    if (res != ExecutionStatus::RETURNED)
930
0
      return res;
931
36
  }
932
  // Set the state's current position to either the minimum or maximum location,
933
  // and point it to the exit of the loop.
934
86
  s->cursor_.setCurrentPointer(insn->greedy ? maxPos : minPos);
935
86
  s->ip_ = insn->notTakenTarget;
936
86
  return true;
937
86
}
hermes::regex::Context<hermes::regex::UTF16RegexTraits>::matchWidth1Loop(hermes::regex::Width1LoopInsn const*, hermes::regex::State<hermes::regex::UTF16RegexTraits>*, llvh::SmallVector<hermes::regex::Context<hermes::regex::UTF16RegexTraits>::BacktrackInsn, 64u>&)
Line
Count
Source
863
86
    BacktrackStack &bts) {
864
  // Note we copy the cursor here.
865
86
  Cursor<Traits> c = s->cursor_;
866
86
  uint32_t matched = 0, minMatch = insn->min, maxMatch = insn->max;
867
868
  // Limit our max to the smaller of the maximum in the loop and number of
869
  // number of characters remaining. This allows us to avoid having to test for
870
  // end of input in the loop body.
871
86
  maxMatch = std::min(c.remaining(), maxMatch);
872
873
  // The loop body follows the loop instruction.
874
86
  const Insn *body = static_cast<const Insn *>(&insn[1]);
875
876
  // Match as far as we can up to maxMatch. Note we do this even if the loop is
877
  // non-greedy: we compute how far we might conceivably have to backtrack
878
  // (except in non-greedy loops we're "backtracking" by moving forwards).
879
86
  using W1 = Width1Opcode;
880
86
  switch (static_cast<Width1Opcode>(body->opcode)) {
881
0
    case W1::MatchChar8:
882
0
      matched = matchWidth1LoopBody<W1::MatchChar8>(body, c, maxMatch);
883
0
      break;
884
0
    case W1::MatchChar16:
885
0
      matched = matchWidth1LoopBody<W1::MatchChar16>(body, c, maxMatch);
886
0
      break;
887
0
    case W1::MatchCharICase8:
888
0
      matched = matchWidth1LoopBody<W1::MatchCharICase8>(body, c, maxMatch);
889
0
      break;
890
0
    case W1::MatchCharICase16:
891
0
      matched = matchWidth1LoopBody<W1::MatchCharICase16>(body, c, maxMatch);
892
0
      break;
893
0
    case W1::MatchAny:
894
0
      matched = matchWidth1LoopBody<W1::MatchAny>(body, c, maxMatch);
895
0
      break;
896
0
    case W1::MatchAnyButNewline:
897
0
      matched = matchWidth1LoopBody<W1::MatchAnyButNewline>(body, c, maxMatch);
898
0
      break;
899
86
    case W1::Bracket:
900
86
      matched = matchWidth1LoopBody<W1::Bracket>(body, c, maxMatch);
901
86
      break;
902
86
  }
903
904
  // If we iterated less than the minimum, we failed to match.
905
86
  if (matched < minMatch) {
906
0
    return false;
907
0
  }
908
86
  assert(
909
86
      minMatch <= matched && matched <= maxMatch &&
910
86
      "matched should be between min and max match count");
911
912
  // Now we know the valid match range.
913
  // Compute the beginning and end pointers in this range.
914
86
  bool forwards = s->cursor_.forwards();
915
86
  const CodeUnit *pos = s->cursor_.currentPointer();
916
86
  const CodeUnit *minPos = forwards ? pos + minMatch : pos - minMatch;
917
86
  const CodeUnit *maxPos = forwards ? pos + matched : pos - matched;
918
919
  // If min == max (e.g. /a{3}/) then no backtracking is possible. If min < max,
920
  // backtracking is possible and we need to add a backtracking instruction.
921
86
  if (minMatch < matched) {
922
36
    BacktrackInsn backtrack{
923
36
        insn->greedy ? BacktrackOp::GreedyWidth1Loop
924
36
                     : BacktrackOp::NongreedyWidth1Loop};
925
36
    backtrack.width1Loop.continuation = insn->notTakenTarget;
926
36
    backtrack.width1Loop.min = minPos;
927
36
    backtrack.width1Loop.max = maxPos;
928
36
    auto res = pushBacktrack(bts, backtrack);
929
36
    if (res != ExecutionStatus::RETURNED)
930
0
      return res;
931
36
  }
932
  // Set the state's current position to either the minimum or maximum location,
933
  // and point it to the exit of the loop.
934
86
  s->cursor_.setCurrentPointer(insn->greedy ? maxPos : minPos);
935
86
  s->ip_ = insn->notTakenTarget;
936
86
  return true;
937
86
}
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::matchWidth1Loop(hermes::regex::Width1LoopInsn const*, hermes::regex::State<hermes::regex::ASCIIRegexTraits>*, llvh::SmallVector<hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::BacktrackInsn, 64u>&)
938
939
/// ES6 21.2.5.2.3. Effectively this skips surrogate pairs if the regexp has the
940
/// Unicode flag set.
941
template <class Traits>
942
inline size_t Context<Traits>::advanceStringIndex(
943
    const CodeUnit *start,
944
    size_t index,
945
86
    size_t length) const {
946
86
  if (sizeof(CodeUnit) == 1) {
947
    // The input string is ASCII and therefore cannot have surrogate pairs.
948
0
    return index + 1;
949
0
  }
950
  // "If unicode is false, return index+1."
951
  // "If index+1 >= length, return index+1."
952
86
  if (LLVM_LIKELY(!(syntaxFlags_.unicode)) || (index + 1 >= length))
953
86
    return index + 1;
954
955
  // Let first be the code unit value at index index in S
956
  // If first < 0xD800 or first > 0xDBFF, return index+1
957
  // Let second be the code unit value at index index+1 in S.
958
  // If second < 0xDC00 or second > 0xDFFF, return index+1.
959
0
  CodeUnit first = start[index];
960
0
  CodeUnit second = start[index + 1];
961
0
  if (LLVM_LIKELY(!isHighSurrogate(first)) ||
962
0
      LLVM_LIKELY(!isLowSurrogate(second))) {
963
0
    return index + 1;
964
0
  }
965
  // Return index+2.
966
0
  return index + 2;
967
0
}
hermes::regex::Context<hermes::regex::UTF16RegexTraits>::advanceStringIndex(char16_t const*, unsigned long, unsigned long) const
Line
Count
Source
945
86
    size_t length) const {
946
86
  if (sizeof(CodeUnit) == 1) {
947
    // The input string is ASCII and therefore cannot have surrogate pairs.
948
0
    return index + 1;
949
0
  }
950
  // "If unicode is false, return index+1."
951
  // "If index+1 >= length, return index+1."
952
86
  if (LLVM_LIKELY(!(syntaxFlags_.unicode)) || (index + 1 >= length))
953
86
    return index + 1;
954
955
  // Let first be the code unit value at index index in S
956
  // If first < 0xD800 or first > 0xDBFF, return index+1
957
  // Let second be the code unit value at index index+1 in S.
958
  // If second < 0xDC00 or second > 0xDFFF, return index+1.
959
0
  CodeUnit first = start[index];
960
0
  CodeUnit second = start[index + 1];
961
0
  if (LLVM_LIKELY(!isHighSurrogate(first)) ||
962
0
      LLVM_LIKELY(!isLowSurrogate(second))) {
963
0
    return index + 1;
964
0
  }
965
  // Return index+2.
966
0
  return index + 2;
967
0
}
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::advanceStringIndex(char const*, unsigned long, unsigned long) const
968
969
template <class Traits>
970
auto Context<Traits>::match(State<Traits> *s, bool onlyAtStart)
971
86
    -> ExecutorResult<const CodeUnit *> {
972
86
  using State = State<Traits>;
973
86
  BacktrackStack backtrackStack;
974
975
  // We'll refer to the cursor often.
976
86
  Cursor<Traits> &c = s->cursor_;
977
978
  // Pull out the instruction portion of the bytecode, following the header.
979
86
  const uint8_t *const bytecode = &bytecodeStream_[sizeof(RegexBytecodeHeader)];
980
981
  // Save the incoming IP in case we have to loop.
982
86
  const auto startIp = s->ip_;
983
984
86
  const CodeUnit *const startLoc = c.currentPointer();
985
986
  // Use offsetFromRight() instead of remaining() here so that the length passed
987
  // to advanceStringIndex is accurate even when the cursor is going backwards.
988
86
  const size_t charsToRight = c.offsetFromRight();
989
990
  // Decide how many locations we'll need to check.
991
  // Note that we do want to check the empty range at the end, so add one to
992
  // charsToRight.
993
86
  const size_t locsToCheckCount = onlyAtStart ? 1 : 1 + charsToRight;
994
995
  // If we are tracking backwards, we should only ever have one potential match
996
  // location. This is because advanceStringIndex only ever tracks forwards.
997
86
  assert(
998
86
      (c.forwards() || locsToCheckCount == 1) &&
999
86
      "Can only check one location when cursor is backwards");
1000
1001
#ifndef HERMES_CHECK_NATIVE_STACK
1002
  ++overflowGuard_.callDepth;
1003
  auto decrement =
1004
      llvh::make_scope_exit([this] { --overflowGuard_.callDepth; });
1005
#endif
1006
1007
  // Make sure we are not exceeding the set limit of the amount of times we can
1008
  // recurse.
1009
86
  if (overflowGuard_.isOverflowing()) {
1010
0
    return ExecutionStatus::STACK_OVERFLOW;
1011
0
  }
1012
1013
  // Macro used when a state fails to match.
1014
86
#define BACKTRACK()                            \
1015
122
  do {                                         \
1016
122
    auto btRes = backtrack(backtrackStack, s); \
1017
122
    if (LLVM_UNLIKELY(!btRes))                 \
1018
122
      return btRes.getStatus();                \
1019
122
    if (*btRes)                                \
1020
122
      goto backtrackingSucceeded;              \
1021
122
    goto backtrackingExhausted;                \
1022
122
  } while (0)
1023
1024
172
  for (size_t locIndex = 0; locIndex < locsToCheckCount;
1025
86
       locIndex = advanceStringIndex(startLoc, locIndex, charsToRight)) {
1026
86
    const CodeUnit *potentialMatchLocation = startLoc + locIndex;
1027
86
    c.setCurrentPointer(potentialMatchLocation);
1028
86
    s->ip_ = startIp;
1029
122
  backtrackingSucceeded:
1030
294
    for (;;) {
1031
294
      const Insn *base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]);
1032
294
      switch (base->opcode) {
1033
0
        case Opcode::Goal:
1034
0
          return potentialMatchLocation;
1035
1036
86
        case Opcode::LeftAnchor:
1037
86
          if (!matchesLeftAnchor(*this, *s))
1038
0
            BACKTRACK();
1039
86
          s->ip_ += sizeof(LeftAnchorInsn);
1040
86
          break;
1041
1042
0
        case Opcode::RightAnchor:
1043
0
          if (!matchesRightAnchor(*this, *s))
1044
0
            BACKTRACK();
1045
0
          s->ip_ += sizeof(RightAnchorInsn);
1046
0
          break;
1047
1048
0
        case Opcode::MatchAny:
1049
0
          if (c.atEnd() ||
1050
0
              !matchWidth1<Width1Opcode::MatchAny>(base, c.consume()))
1051
0
            BACKTRACK();
1052
0
          s->ip_ += sizeof(MatchAnyInsn);
1053
0
          break;
1054
1055
0
        case Opcode::U16MatchAny:
1056
0
          if (c.atEnd())
1057
0
            BACKTRACK();
1058
0
          c.consumeUTF16();
1059
0
          s->ip_ += sizeof(U16MatchAnyInsn);
1060
0
          break;
1061
1062
0
        case Opcode::MatchAnyButNewline:
1063
0
          if (c.atEnd() ||
1064
0
              !matchWidth1<Width1Opcode::MatchAnyButNewline>(base, c.consume()))
1065
0
            BACKTRACK();
1066
0
          s->ip_ += sizeof(MatchAnyButNewlineInsn);
1067
0
          break;
1068
1069
0
        case Opcode::U16MatchAnyButNewline:
1070
0
          if (c.atEnd() || isLineTerminator(c.consumeUTF16()))
1071
0
            BACKTRACK();
1072
0
          s->ip_ += sizeof(U16MatchAnyButNewlineInsn);
1073
0
          break;
1074
1075
0
        case Opcode::MatchChar8: {
1076
0
          if (c.atEnd() ||
1077
0
              !matchWidth1<Width1Opcode::MatchChar8>(base, c.consume()))
1078
0
            BACKTRACK();
1079
0
          s->ip_ += sizeof(MatchChar8Insn);
1080
0
          break;
1081
0
        }
1082
1083
0
        case Opcode::MatchChar16: {
1084
0
          if (c.atEnd() ||
1085
0
              !matchWidth1<Width1Opcode::MatchChar16>(base, c.consume()))
1086
0
            BACKTRACK();
1087
0
          s->ip_ += sizeof(MatchChar16Insn);
1088
0
          break;
1089
0
        }
1090
1091
0
        case Opcode::U16MatchChar32: {
1092
0
          const auto *insn = llvh::cast<U16MatchChar32Insn>(base);
1093
0
          if (c.atEnd() || c.consumeUTF16() != (CodePoint)insn->c)
1094
0
            BACKTRACK();
1095
0
          s->ip_ += sizeof(U16MatchChar32Insn);
1096
0
          break;
1097
0
        }
1098
1099
0
        case Opcode::MatchCharICase8: {
1100
0
          if (c.atEnd() ||
1101
0
              !matchWidth1<Width1Opcode::MatchCharICase8>(base, c.consume()))
1102
0
            BACKTRACK();
1103
0
          s->ip_ += sizeof(MatchCharICase8Insn);
1104
0
          break;
1105
0
        }
1106
1107
0
        case Opcode::MatchCharICase16: {
1108
0
          if (c.atEnd() ||
1109
0
              !matchWidth1<Width1Opcode::MatchCharICase16>(base, c.consume()))
1110
0
            BACKTRACK();
1111
0
          s->ip_ += sizeof(MatchCharICase16Insn);
1112
0
          break;
1113
0
        }
1114
1115
0
        case Opcode::U16MatchCharICase32: {
1116
0
          const auto *insn = llvh::cast<U16MatchCharICase32Insn>(base);
1117
0
          bool matched = false;
1118
0
          if (!c.atEnd()) {
1119
0
            CodePoint cp = c.consumeUTF16();
1120
0
            matched =
1121
0
                (cp == (CodePoint)insn->c ||
1122
0
                 traits_.canonicalize(cp, true) == (CodePoint)insn->c);
1123
0
          }
1124
0
          if (!matched)
1125
0
            BACKTRACK();
1126
0
          s->ip_ += sizeof(U16MatchCharICase32Insn);
1127
0
          break;
1128
0
        }
1129
1130
122
        case Opcode::MatchNChar8: {
1131
122
          const auto *insn = llvh::cast<MatchNChar8Insn>(base);
1132
122
          if (c.remaining() < insn->charCount || !matchesNChar8(insn, *s))
1133
122
            BACKTRACK();
1134
0
          s->ip_ += insn->totalWidth();
1135
0
          break;
1136
122
        }
1137
1138
0
        case Opcode::MatchNCharICase8: {
1139
0
          const auto *insn = llvh::cast<MatchNCharICase8Insn>(base);
1140
0
          if (c.remaining() < insn->charCount || !matchesNCharICase8(insn, *s))
1141
0
            BACKTRACK();
1142
0
          s->ip_ += insn->totalWidth();
1143
0
          break;
1144
0
        }
1145
1146
0
        case Opcode::Alternation: {
1147
          // We have an alternation. Determine which of our first and second
1148
          // branches are viable. If both are, we have to split our state.
1149
0
          const AlternationInsn *alt = llvh::cast<AlternationInsn>(base);
1150
0
          bool primaryViable =
1151
0
              c.satisfiesConstraints(flags_, alt->primaryConstraints);
1152
0
          bool secondaryViable =
1153
0
              c.satisfiesConstraints(flags_, alt->secondaryConstraints);
1154
0
          if (primaryViable && secondaryViable) {
1155
            // We need to explore both branches. Explore the primary branch
1156
            // first, backtrack to the secondary one.
1157
0
            s->ip_ += sizeof(AlternationInsn);
1158
0
            auto res = pushBacktrack(
1159
0
                backtrackStack,
1160
0
                BacktrackInsn::makeSetPosition(
1161
0
                    alt->secondaryBranch, c.currentPointer()));
1162
0
            if (res != ExecutionStatus::RETURNED) {
1163
0
              return res;
1164
0
            }
1165
0
          } else if (primaryViable) {
1166
0
            s->ip_ += sizeof(AlternationInsn);
1167
0
          } else if (secondaryViable) {
1168
0
            s->ip_ = alt->secondaryBranch;
1169
0
          } else {
1170
0
            BACKTRACK();
1171
0
          }
1172
0
          break;
1173
0
        }
1174
1175
0
        case Opcode::Jump32:
1176
0
          s->ip_ = llvh::cast<Jump32Insn>(base)->target;
1177
0
          break;
1178
1179
0
        case Opcode::Bracket: {
1180
0
          if (c.atEnd() ||
1181
0
              !matchWidth1<Width1Opcode::Bracket>(base, c.consume()))
1182
0
            BACKTRACK();
1183
0
          s->ip_ += llvh::cast<BracketInsn>(base)->totalWidth();
1184
0
          break;
1185
0
        }
1186
1187
0
        case Opcode::U16Bracket: {
1188
0
          const U16BracketInsn *insn = llvh::cast<U16BracketInsn>(base);
1189
          // U16BracketInsn is followed by a list of BracketRange32s.
1190
0
          const BracketRange32 *ranges =
1191
0
              reinterpret_cast<const BracketRange32 *>(insn + 1);
1192
0
          if (c.atEnd() ||
1193
0
              !bracketMatchesChar<Traits>(
1194
0
                  *this, insn, ranges, c.consumeUTF16()))
1195
0
            BACKTRACK();
1196
0
          s->ip_ += insn->totalWidth();
1197
0
          break;
1198
0
        }
1199
1200
0
        case Opcode::WordBoundary: {
1201
0
          const WordBoundaryInsn *insn = llvh::cast<WordBoundaryInsn>(base);
1202
0
          const auto *charPointer = c.currentPointer();
1203
1204
0
          bool prevIsWordchar = false;
1205
0
          if (!c.atLeft())
1206
0
            prevIsWordchar = traits_.characterHasType(
1207
0
                charPointer[-1], CharacterClass::Words);
1208
1209
0
          bool currentIsWordchar = false;
1210
0
          if (!c.atRight())
1211
0
            currentIsWordchar =
1212
0
                traits_.characterHasType(charPointer[0], CharacterClass::Words);
1213
1214
0
          bool isWordBoundary = (prevIsWordchar != currentIsWordchar);
1215
0
          if (isWordBoundary ^ insn->invert)
1216
0
            s->ip_ += sizeof(WordBoundaryInsn);
1217
0
          else
1218
0
            BACKTRACK();
1219
0
          break;
1220
0
        }
1221
1222
0
        case Opcode::BeginMarkedSubexpression: {
1223
0
          const auto *insn = llvh::cast<BeginMarkedSubexpressionInsn>(base);
1224
0
          auto res = pushBacktrack(
1225
0
              backtrackStack,
1226
0
              BacktrackInsn::makeSetCaptureGroup(
1227
0
                  insn->mexp, {kNotMatched, kNotMatched}));
1228
0
          if (res != ExecutionStatus::RETURNED) {
1229
0
            return res;
1230
0
          }
1231
          // When tracking backwards (in a lookbehind assertion) we traverse our
1232
          // input backwards, so set the end before the start.
1233
0
          auto &range = s->getCapturedRange(insn->mexp);
1234
0
          if (c.forwards()) {
1235
0
            range.start = c.offsetFromLeft();
1236
0
          } else {
1237
0
            range.end = c.offsetFromLeft();
1238
0
          }
1239
0
          s->ip_ += sizeof(BeginMarkedSubexpressionInsn);
1240
0
          break;
1241
0
        }
1242
1243
0
        case Opcode::EndMarkedSubexpression: {
1244
0
          const auto *insn = llvh::cast<EndMarkedSubexpressionInsn>(base);
1245
0
          auto &range = s->getCapturedRange(insn->mexp);
1246
0
          if (c.forwards()) {
1247
0
            assert(
1248
0
                range.start != kNotMatched && "Capture group was not entered");
1249
0
            range.end = c.offsetFromLeft();
1250
0
          } else {
1251
0
            assert(range.end != kNotMatched && "Capture group was not entered");
1252
0
            range.start = c.offsetFromLeft();
1253
0
          }
1254
0
          assert(range.start <= range.end && "Captured range end before start");
1255
0
          s->ip_ += sizeof(EndMarkedSubexpressionInsn);
1256
0
          break;
1257
0
        }
1258
1259
        // ES10 21.2.2.9.1
1260
0
        case Opcode::BackRef: {
1261
0
          const auto insn = llvh::cast<BackRefInsn>(base);
1262
          // a. Let cap be x's captures List.
1263
          // b. Let s be cap[n].
1264
0
          CapturedRange cr = s->getCapturedRange(insn->mexp);
1265
1266
          // c. If s is undefined, return c(x).
1267
          // Note we have to check both cr.start and cr.end here. If we are
1268
          // currently in the middle of matching a capture group (going either
1269
          // forwards or backwards) we should just return success.
1270
0
          if (cr.start == kNotMatched || cr.end == kNotMatched) {
1271
            // Backreferences to a capture group that did not match always
1272
            // succeed (ES10 21.2.2.9)
1273
0
            s->ip_ += sizeof(BackRefInsn);
1274
0
            break;
1275
0
          }
1276
1277
          // TODO: this can be optimized by hoisting the branches out of the
1278
          // loop.
1279
0
          bool icase = syntaxFlags_.ignoreCase;
1280
0
          bool unicode = syntaxFlags_.unicode;
1281
0
          auto capturedStart = first_ + cr.start;
1282
0
          auto capturedEnd = first_ + cr.end;
1283
0
          Cursor<Traits> cursor2(
1284
0
              capturedStart,
1285
0
              c.forwards() ? capturedStart : capturedEnd,
1286
0
              capturedEnd,
1287
0
              c.forwards());
1288
0
          Cursor<Traits> cursor1 = c;
1289
0
          bool matched = true;
1290
0
          while (matched && !cursor2.atEnd()) {
1291
0
            if (cursor1.atEnd()) {
1292
0
              matched = false;
1293
0
            } else if (!icase) {
1294
              // Direct comparison. Here we don't need to decode surrogate
1295
              // pairs.
1296
0
              matched = (cursor1.consume() == cursor2.consume());
1297
0
            } else if (!unicode) {
1298
              // Case-insensitive non-Unicode comparison, no decoding of
1299
              // surrogate pairs.
1300
0
              auto c1 = cursor1.consume();
1301
0
              auto c2 = cursor2.consume();
1302
0
              matched =
1303
0
                  (c1 == c2 ||
1304
0
                   traits_.canonicalize(c1, unicode) ==
1305
0
                       traits_.canonicalize(c2, unicode));
1306
0
            } else {
1307
              // Unicode: we do need to decode surrogate pairs.
1308
0
              auto cp1 = cursor1.consumeUTF16();
1309
0
              auto cp2 = cursor2.consumeUTF16();
1310
0
              matched =
1311
0
                  (cp1 == cp2 ||
1312
0
                   traits_.canonicalize(cp1, unicode) ==
1313
0
                       traits_.canonicalize(cp2, unicode));
1314
0
            }
1315
0
          }
1316
0
          if (!matched) {
1317
0
            BACKTRACK();
1318
0
          }
1319
0
          s->ip_ += sizeof(BackRefInsn);
1320
0
          c.setCurrentPointer(cursor1.currentPointer());
1321
0
          break;
1322
0
        }
1323
1324
0
        case Opcode::Lookaround: {
1325
0
          const LookaroundInsn *insn = llvh::cast<LookaroundInsn>(base);
1326
0
          bool matched = false;
1327
0
          if (c.satisfiesConstraints(flags_, insn->constraints)) {
1328
            // Copy the state. This is because if the match fails (or if we are
1329
            // inverted) we need to restore its capture groups.
1330
0
            State savedState{*s};
1331
1332
            // Set the direction of the cursor.
1333
0
            c.setForwards(insn->forwards);
1334
1335
            // Invoke match() recursively with our expression.
1336
            // Save and restore the position because lookaheads do not consume
1337
            // anything.
1338
0
            s->ip_ += sizeof(LookaroundInsn);
1339
0
            auto match = this->match(s, true /* onlyAtStart */);
1340
            // If the match errored out due to stack overflow, then we need to
1341
            // return an error here as well.
1342
0
            if (LLVM_UNLIKELY(!match)) {
1343
0
              return match.getStatus();
1344
0
            }
1345
            // We got a match if the value is non-null.
1346
0
            matched = match.getValue() != nullptr;
1347
0
            c.setCurrentPointer(savedState.cursor_.currentPointer());
1348
0
            c.setForwards(savedState.cursor_.forwards());
1349
1350
            // Restore capture groups unless we are a positive lookaround that
1351
            // successfully matched. If we are a successfully matching positive
1352
            // lookaround, set up backtracking to reset the capture groups. Note
1353
            // we never backtrack INTO a successfully matched lookahead:
1354
            // once a lookahead finds a match it forgets all other ways it could
1355
            // have matched. (ES 5.1 15.10.2.8 Note 2).
1356
0
            if (matched && !insn->invert) {
1357
              // Backtrack capture groups in the lookahead expression.
1358
0
              for (uint32_t i = insn->mexpBegin, e = insn->mexpEnd; i < e;
1359
0
                   i++) {
1360
0
                CapturedRange cr = savedState.getCapturedRange(i);
1361
0
                auto res = pushBacktrack(
1362
0
                    backtrackStack, BacktrackInsn::makeSetCaptureGroup(i, cr));
1363
0
                if (res != ExecutionStatus::RETURNED)
1364
0
                  return res;
1365
0
              }
1366
0
            } else {
1367
              // Restore the saved state.
1368
0
              *s = std::move(savedState);
1369
0
            }
1370
0
          }
1371
1372
          // 'matched' tells us whether the enclosed assertion expression
1373
          // matched the input. This instruction matched the input if it is a
1374
          // positive assertion (invert == false) and the expression matched,
1375
          // or a negative assertion (invert == true) and the expression did
1376
          // not match. Hence xor with invert.
1377
0
          if (matched ^ insn->invert)
1378
0
            s->ip_ = insn->continuation;
1379
0
          else
1380
0
            BACKTRACK();
1381
0
          break;
1382
0
        }
1383
1384
0
        case Opcode::BeginLoop: {
1385
          // Here we are entering a loop from outside, not jumping back into
1386
          // it.
1387
0
          const BeginLoopInsn *loop = llvh::cast<BeginLoopInsn>(base);
1388
0
          s->getLoop(loop->loopId).iterations = 0;
1389
          // Check to see if the loop body is viable. If not, and the loop has
1390
          // a nonzero minimum iteration, then we know we won't match and we
1391
          // can reject the state. If it does have a minimum iteration, we can
1392
          // just skip to the not-taken target. Note that this is a static
1393
          // property of the loop so we don't need to check it on every
1394
          // iteration, only the first one.
1395
0
          if (!c.satisfiesConstraints(flags_, loop->loopeeConstraints)) {
1396
0
            if (loop->min > 0) {
1397
0
              BACKTRACK();
1398
0
            } else {
1399
0
              s->ip_ = loop->notTakenTarget;
1400
0
              break;
1401
0
            }
1402
0
          }
1403
0
          goto runLoop;
1404
0
        }
1405
1406
0
        case Opcode::EndLoop:
1407
          // This is reached after the body of a loop finishes executing.
1408
          // Move the IP to the loop and run it again immediately.
1409
0
          s->ip_ = llvh::cast<EndLoopInsn>(base)->target;
1410
0
          base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]);
1411
          // Note fall through.
1412
1413
0
        runLoop: {
1414
0
          const BeginLoopInsn *loop = llvh::cast<BeginLoopInsn>(base);
1415
0
          auto &loopData = s->getLoop(loop->loopId);
1416
0
          uint32_t iteration = loopData.iterations;
1417
1418
0
          const uint32_t loopTakenIp = s->ip_ + sizeof(BeginLoopInsn);
1419
1420
0
          assert(loop->min <= loop->max && "Inconsistent loop bounds");
1421
1422
          // Check to see if we have looped more than the minimum number of
1423
          // iterations, and if so, whether the subexpression we looped over
1424
          // matched an empty string. ES6 21.2.2.5.1 Note 4: "once the
1425
          // minimum number of repetitions has been satisfied, any more
1426
          // expansions of Atom that match the empty character sequence are
1427
          // not considered for further repetitions."
1428
0
          if (iteration > loop->min &&
1429
0
              loopData.entryPosition == c.offsetFromLeft())
1430
0
            BACKTRACK();
1431
1432
0
          if (iteration < loop->min) {
1433
0
            auto res = prepareToEnterLoopBody(s, loop, backtrackStack);
1434
0
            if (res != ExecutionStatus::RETURNED)
1435
0
              return res;
1436
0
            s->ip_ = loopTakenIp;
1437
0
          } else if (iteration == loop->max) {
1438
0
            s->ip_ = loop->notTakenTarget;
1439
0
          } else {
1440
            // We are within the target iteration range, figure out whether we
1441
            // should continue or exit.
1442
0
            assert(iteration >= loop->min && iteration < loop->max);
1443
0
            if (!loop->greedy) {
1444
              // Backtrack by entering this non-greedy loop.
1445
0
              loopData.entryPosition = c.offsetFromLeft();
1446
0
              auto res = pushBacktrack(
1447
0
                  backtrackStack,
1448
0
                  BacktrackInsn::makeEnterNonGreedyLoop(
1449
0
                      loop, loopTakenIp, loopData));
1450
0
              if (res != ExecutionStatus::RETURNED) {
1451
0
                return res;
1452
0
              }
1453
0
              s->ip_ = loop->notTakenTarget;
1454
0
            } else {
1455
              // Backtrack by exiting this greedy loop.
1456
0
              auto pushRes = pushBacktrack(
1457
0
                  backtrackStack,
1458
0
                  BacktrackInsn::makeSetPosition(
1459
0
                      loop->notTakenTarget, c.currentPointer()));
1460
0
              if (pushRes != ExecutionStatus::RETURNED)
1461
0
                return pushRes;
1462
1463
0
              auto prepRes = prepareToEnterLoopBody(s, loop, backtrackStack);
1464
0
              if (prepRes != ExecutionStatus::RETURNED)
1465
0
                return prepRes;
1466
0
              s->ip_ = loopTakenIp;
1467
0
            }
1468
0
          }
1469
0
          break;
1470
0
        }
1471
1472
0
        case Opcode::BeginSimpleLoop: {
1473
          // Here we are entering a simple loop from outside,
1474
          // not jumping back into it.
1475
0
          const BeginSimpleLoopInsn *loop =
1476
0
              llvh::cast<BeginSimpleLoopInsn>(base);
1477
1478
0
          if (!c.satisfiesConstraints(flags_, loop->loopeeConstraints)) {
1479
0
            s->ip_ = loop->notTakenTarget;
1480
0
            break;
1481
0
          }
1482
1483
0
          goto runSimpleLoop;
1484
0
        }
1485
1486
0
        case Opcode::EndSimpleLoop:
1487
0
          s->ip_ = llvh::cast<EndSimpleLoopInsn>(base)->target;
1488
0
          base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]);
1489
          // Note: fall-through.
1490
1491
0
        runSimpleLoop: {
1492
0
          const BeginSimpleLoopInsn *loop =
1493
0
              llvh::cast<BeginSimpleLoopInsn>(base);
1494
          // Since this is a simple loop, we'll always need to explore both
1495
          // exiting the loop at this point and continuing to loop.
1496
          // Note simple loops are always greedy.
1497
0
          auto res = pushBacktrack(
1498
0
              backtrackStack,
1499
0
              BacktrackInsn::makeSetPosition(
1500
0
                  loop->notTakenTarget, c.currentPointer()));
1501
0
          if (res != ExecutionStatus::RETURNED) {
1502
0
            return res;
1503
0
          }
1504
0
          s->ip_ += sizeof(BeginSimpleLoopInsn);
1505
0
          break;
1506
0
        }
1507
1508
86
        case Opcode::Width1Loop: {
1509
86
          const Width1LoopInsn *loop = llvh::cast<Width1LoopInsn>(base);
1510
86
          auto matchRes = matchWidth1Loop(loop, s, backtrackStack);
1511
86
          if (LLVM_UNLIKELY(!matchRes))
1512
0
            return matchRes.getStatus();
1513
86
          if (!*matchRes)
1514
0
            BACKTRACK();
1515
86
          break;
1516
86
        }
1517
294
      }
1518
294
    }
1519
  // The search failed at this location.
1520
86
  backtrackingExhausted:
1521
86
    continue;
1522
122
  }
1523
86
#undef BACKTRACK
1524
  // The match failed.
1525
86
  return nullptr;
1526
86
}
hermes::regex::Context<hermes::regex::UTF16RegexTraits>::match(hermes::regex::State<hermes::regex::UTF16RegexTraits>*, bool)
Line
Count
Source
971
86
    -> ExecutorResult<const CodeUnit *> {
972
86
  using State = State<Traits>;
973
86
  BacktrackStack backtrackStack;
974
975
  // We'll refer to the cursor often.
976
86
  Cursor<Traits> &c = s->cursor_;
977
978
  // Pull out the instruction portion of the bytecode, following the header.
979
86
  const uint8_t *const bytecode = &bytecodeStream_[sizeof(RegexBytecodeHeader)];
980
981
  // Save the incoming IP in case we have to loop.
982
86
  const auto startIp = s->ip_;
983
984
86
  const CodeUnit *const startLoc = c.currentPointer();
985
986
  // Use offsetFromRight() instead of remaining() here so that the length passed
987
  // to advanceStringIndex is accurate even when the cursor is going backwards.
988
86
  const size_t charsToRight = c.offsetFromRight();
989
990
  // Decide how many locations we'll need to check.
991
  // Note that we do want to check the empty range at the end, so add one to
992
  // charsToRight.
993
86
  const size_t locsToCheckCount = onlyAtStart ? 1 : 1 + charsToRight;
994
995
  // If we are tracking backwards, we should only ever have one potential match
996
  // location. This is because advanceStringIndex only ever tracks forwards.
997
86
  assert(
998
86
      (c.forwards() || locsToCheckCount == 1) &&
999
86
      "Can only check one location when cursor is backwards");
1000
1001
#ifndef HERMES_CHECK_NATIVE_STACK
1002
  ++overflowGuard_.callDepth;
1003
  auto decrement =
1004
      llvh::make_scope_exit([this] { --overflowGuard_.callDepth; });
1005
#endif
1006
1007
  // Make sure we are not exceeding the set limit of the amount of times we can
1008
  // recurse.
1009
86
  if (overflowGuard_.isOverflowing()) {
1010
0
    return ExecutionStatus::STACK_OVERFLOW;
1011
0
  }
1012
1013
  // Macro used when a state fails to match.
1014
86
#define BACKTRACK()                            \
1015
86
  do {                                         \
1016
86
    auto btRes = backtrack(backtrackStack, s); \
1017
86
    if (LLVM_UNLIKELY(!btRes))                 \
1018
86
      return btRes.getStatus();                \
1019
86
    if (*btRes)                                \
1020
86
      goto backtrackingSucceeded;              \
1021
86
    goto backtrackingExhausted;                \
1022
86
  } while (0)
1023
1024
172
  for (size_t locIndex = 0; locIndex < locsToCheckCount;
1025
86
       locIndex = advanceStringIndex(startLoc, locIndex, charsToRight)) {
1026
86
    const CodeUnit *potentialMatchLocation = startLoc + locIndex;
1027
86
    c.setCurrentPointer(potentialMatchLocation);
1028
86
    s->ip_ = startIp;
1029
122
  backtrackingSucceeded:
1030
294
    for (;;) {
1031
294
      const Insn *base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]);
1032
294
      switch (base->opcode) {
1033
0
        case Opcode::Goal:
1034
0
          return potentialMatchLocation;
1035
1036
86
        case Opcode::LeftAnchor:
1037
86
          if (!matchesLeftAnchor(*this, *s))
1038
0
            BACKTRACK();
1039
86
          s->ip_ += sizeof(LeftAnchorInsn);
1040
86
          break;
1041
1042
0
        case Opcode::RightAnchor:
1043
0
          if (!matchesRightAnchor(*this, *s))
1044
0
            BACKTRACK();
1045
0
          s->ip_ += sizeof(RightAnchorInsn);
1046
0
          break;
1047
1048
0
        case Opcode::MatchAny:
1049
0
          if (c.atEnd() ||
1050
0
              !matchWidth1<Width1Opcode::MatchAny>(base, c.consume()))
1051
0
            BACKTRACK();
1052
0
          s->ip_ += sizeof(MatchAnyInsn);
1053
0
          break;
1054
1055
0
        case Opcode::U16MatchAny:
1056
0
          if (c.atEnd())
1057
0
            BACKTRACK();
1058
0
          c.consumeUTF16();
1059
0
          s->ip_ += sizeof(U16MatchAnyInsn);
1060
0
          break;
1061
1062
0
        case Opcode::MatchAnyButNewline:
1063
0
          if (c.atEnd() ||
1064
0
              !matchWidth1<Width1Opcode::MatchAnyButNewline>(base, c.consume()))
1065
0
            BACKTRACK();
1066
0
          s->ip_ += sizeof(MatchAnyButNewlineInsn);
1067
0
          break;
1068
1069
0
        case Opcode::U16MatchAnyButNewline:
1070
0
          if (c.atEnd() || isLineTerminator(c.consumeUTF16()))
1071
0
            BACKTRACK();
1072
0
          s->ip_ += sizeof(U16MatchAnyButNewlineInsn);
1073
0
          break;
1074
1075
0
        case Opcode::MatchChar8: {
1076
0
          if (c.atEnd() ||
1077
0
              !matchWidth1<Width1Opcode::MatchChar8>(base, c.consume()))
1078
0
            BACKTRACK();
1079
0
          s->ip_ += sizeof(MatchChar8Insn);
1080
0
          break;
1081
0
        }
1082
1083
0
        case Opcode::MatchChar16: {
1084
0
          if (c.atEnd() ||
1085
0
              !matchWidth1<Width1Opcode::MatchChar16>(base, c.consume()))
1086
0
            BACKTRACK();
1087
0
          s->ip_ += sizeof(MatchChar16Insn);
1088
0
          break;
1089
0
        }
1090
1091
0
        case Opcode::U16MatchChar32: {
1092
0
          const auto *insn = llvh::cast<U16MatchChar32Insn>(base);
1093
0
          if (c.atEnd() || c.consumeUTF16() != (CodePoint)insn->c)
1094
0
            BACKTRACK();
1095
0
          s->ip_ += sizeof(U16MatchChar32Insn);
1096
0
          break;
1097
0
        }
1098
1099
0
        case Opcode::MatchCharICase8: {
1100
0
          if (c.atEnd() ||
1101
0
              !matchWidth1<Width1Opcode::MatchCharICase8>(base, c.consume()))
1102
0
            BACKTRACK();
1103
0
          s->ip_ += sizeof(MatchCharICase8Insn);
1104
0
          break;
1105
0
        }
1106
1107
0
        case Opcode::MatchCharICase16: {
1108
0
          if (c.atEnd() ||
1109
0
              !matchWidth1<Width1Opcode::MatchCharICase16>(base, c.consume()))
1110
0
            BACKTRACK();
1111
0
          s->ip_ += sizeof(MatchCharICase16Insn);
1112
0
          break;
1113
0
        }
1114
1115
0
        case Opcode::U16MatchCharICase32: {
1116
0
          const auto *insn = llvh::cast<U16MatchCharICase32Insn>(base);
1117
0
          bool matched = false;
1118
0
          if (!c.atEnd()) {
1119
0
            CodePoint cp = c.consumeUTF16();
1120
0
            matched =
1121
0
                (cp == (CodePoint)insn->c ||
1122
0
                 traits_.canonicalize(cp, true) == (CodePoint)insn->c);
1123
0
          }
1124
0
          if (!matched)
1125
0
            BACKTRACK();
1126
0
          s->ip_ += sizeof(U16MatchCharICase32Insn);
1127
0
          break;
1128
0
        }
1129
1130
122
        case Opcode::MatchNChar8: {
1131
122
          const auto *insn = llvh::cast<MatchNChar8Insn>(base);
1132
122
          if (c.remaining() < insn->charCount || !matchesNChar8(insn, *s))
1133
122
            BACKTRACK();
1134
0
          s->ip_ += insn->totalWidth();
1135
0
          break;
1136
122
        }
1137
1138
0
        case Opcode::MatchNCharICase8: {
1139
0
          const auto *insn = llvh::cast<MatchNCharICase8Insn>(base);
1140
0
          if (c.remaining() < insn->charCount || !matchesNCharICase8(insn, *s))
1141
0
            BACKTRACK();
1142
0
          s->ip_ += insn->totalWidth();
1143
0
          break;
1144
0
        }
1145
1146
0
        case Opcode::Alternation: {
1147
          // We have an alternation. Determine which of our first and second
1148
          // branches are viable. If both are, we have to split our state.
1149
0
          const AlternationInsn *alt = llvh::cast<AlternationInsn>(base);
1150
0
          bool primaryViable =
1151
0
              c.satisfiesConstraints(flags_, alt->primaryConstraints);
1152
0
          bool secondaryViable =
1153
0
              c.satisfiesConstraints(flags_, alt->secondaryConstraints);
1154
0
          if (primaryViable && secondaryViable) {
1155
            // We need to explore both branches. Explore the primary branch
1156
            // first, backtrack to the secondary one.
1157
0
            s->ip_ += sizeof(AlternationInsn);
1158
0
            auto res = pushBacktrack(
1159
0
                backtrackStack,
1160
0
                BacktrackInsn::makeSetPosition(
1161
0
                    alt->secondaryBranch, c.currentPointer()));
1162
0
            if (res != ExecutionStatus::RETURNED) {
1163
0
              return res;
1164
0
            }
1165
0
          } else if (primaryViable) {
1166
0
            s->ip_ += sizeof(AlternationInsn);
1167
0
          } else if (secondaryViable) {
1168
0
            s->ip_ = alt->secondaryBranch;
1169
0
          } else {
1170
0
            BACKTRACK();
1171
0
          }
1172
0
          break;
1173
0
        }
1174
1175
0
        case Opcode::Jump32:
1176
0
          s->ip_ = llvh::cast<Jump32Insn>(base)->target;
1177
0
          break;
1178
1179
0
        case Opcode::Bracket: {
1180
0
          if (c.atEnd() ||
1181
0
              !matchWidth1<Width1Opcode::Bracket>(base, c.consume()))
1182
0
            BACKTRACK();
1183
0
          s->ip_ += llvh::cast<BracketInsn>(base)->totalWidth();
1184
0
          break;
1185
0
        }
1186
1187
0
        case Opcode::U16Bracket: {
1188
0
          const U16BracketInsn *insn = llvh::cast<U16BracketInsn>(base);
1189
          // U16BracketInsn is followed by a list of BracketRange32s.
1190
0
          const BracketRange32 *ranges =
1191
0
              reinterpret_cast<const BracketRange32 *>(insn + 1);
1192
0
          if (c.atEnd() ||
1193
0
              !bracketMatchesChar<Traits>(
1194
0
                  *this, insn, ranges, c.consumeUTF16()))
1195
0
            BACKTRACK();
1196
0
          s->ip_ += insn->totalWidth();
1197
0
          break;
1198
0
        }
1199
1200
0
        case Opcode::WordBoundary: {
1201
0
          const WordBoundaryInsn *insn = llvh::cast<WordBoundaryInsn>(base);
1202
0
          const auto *charPointer = c.currentPointer();
1203
1204
0
          bool prevIsWordchar = false;
1205
0
          if (!c.atLeft())
1206
0
            prevIsWordchar = traits_.characterHasType(
1207
0
                charPointer[-1], CharacterClass::Words);
1208
1209
0
          bool currentIsWordchar = false;
1210
0
          if (!c.atRight())
1211
0
            currentIsWordchar =
1212
0
                traits_.characterHasType(charPointer[0], CharacterClass::Words);
1213
1214
0
          bool isWordBoundary = (prevIsWordchar != currentIsWordchar);
1215
0
          if (isWordBoundary ^ insn->invert)
1216
0
            s->ip_ += sizeof(WordBoundaryInsn);
1217
0
          else
1218
0
            BACKTRACK();
1219
0
          break;
1220
0
        }
1221
1222
0
        case Opcode::BeginMarkedSubexpression: {
1223
0
          const auto *insn = llvh::cast<BeginMarkedSubexpressionInsn>(base);
1224
0
          auto res = pushBacktrack(
1225
0
              backtrackStack,
1226
0
              BacktrackInsn::makeSetCaptureGroup(
1227
0
                  insn->mexp, {kNotMatched, kNotMatched}));
1228
0
          if (res != ExecutionStatus::RETURNED) {
1229
0
            return res;
1230
0
          }
1231
          // When tracking backwards (in a lookbehind assertion) we traverse our
1232
          // input backwards, so set the end before the start.
1233
0
          auto &range = s->getCapturedRange(insn->mexp);
1234
0
          if (c.forwards()) {
1235
0
            range.start = c.offsetFromLeft();
1236
0
          } else {
1237
0
            range.end = c.offsetFromLeft();
1238
0
          }
1239
0
          s->ip_ += sizeof(BeginMarkedSubexpressionInsn);
1240
0
          break;
1241
0
        }
1242
1243
0
        case Opcode::EndMarkedSubexpression: {
1244
0
          const auto *insn = llvh::cast<EndMarkedSubexpressionInsn>(base);
1245
0
          auto &range = s->getCapturedRange(insn->mexp);
1246
0
          if (c.forwards()) {
1247
0
            assert(
1248
0
                range.start != kNotMatched && "Capture group was not entered");
1249
0
            range.end = c.offsetFromLeft();
1250
0
          } else {
1251
0
            assert(range.end != kNotMatched && "Capture group was not entered");
1252
0
            range.start = c.offsetFromLeft();
1253
0
          }
1254
0
          assert(range.start <= range.end && "Captured range end before start");
1255
0
          s->ip_ += sizeof(EndMarkedSubexpressionInsn);
1256
0
          break;
1257
0
        }
1258
1259
        // ES10 21.2.2.9.1
1260
0
        case Opcode::BackRef: {
1261
0
          const auto insn = llvh::cast<BackRefInsn>(base);
1262
          // a. Let cap be x's captures List.
1263
          // b. Let s be cap[n].
1264
0
          CapturedRange cr = s->getCapturedRange(insn->mexp);
1265
1266
          // c. If s is undefined, return c(x).
1267
          // Note we have to check both cr.start and cr.end here. If we are
1268
          // currently in the middle of matching a capture group (going either
1269
          // forwards or backwards) we should just return success.
1270
0
          if (cr.start == kNotMatched || cr.end == kNotMatched) {
1271
            // Backreferences to a capture group that did not match always
1272
            // succeed (ES10 21.2.2.9)
1273
0
            s->ip_ += sizeof(BackRefInsn);
1274
0
            break;
1275
0
          }
1276
1277
          // TODO: this can be optimized by hoisting the branches out of the
1278
          // loop.
1279
0
          bool icase = syntaxFlags_.ignoreCase;
1280
0
          bool unicode = syntaxFlags_.unicode;
1281
0
          auto capturedStart = first_ + cr.start;
1282
0
          auto capturedEnd = first_ + cr.end;
1283
0
          Cursor<Traits> cursor2(
1284
0
              capturedStart,
1285
0
              c.forwards() ? capturedStart : capturedEnd,
1286
0
              capturedEnd,
1287
0
              c.forwards());
1288
0
          Cursor<Traits> cursor1 = c;
1289
0
          bool matched = true;
1290
0
          while (matched && !cursor2.atEnd()) {
1291
0
            if (cursor1.atEnd()) {
1292
0
              matched = false;
1293
0
            } else if (!icase) {
1294
              // Direct comparison. Here we don't need to decode surrogate
1295
              // pairs.
1296
0
              matched = (cursor1.consume() == cursor2.consume());
1297
0
            } else if (!unicode) {
1298
              // Case-insensitive non-Unicode comparison, no decoding of
1299
              // surrogate pairs.
1300
0
              auto c1 = cursor1.consume();
1301
0
              auto c2 = cursor2.consume();
1302
0
              matched =
1303
0
                  (c1 == c2 ||
1304
0
                   traits_.canonicalize(c1, unicode) ==
1305
0
                       traits_.canonicalize(c2, unicode));
1306
0
            } else {
1307
              // Unicode: we do need to decode surrogate pairs.
1308
0
              auto cp1 = cursor1.consumeUTF16();
1309
0
              auto cp2 = cursor2.consumeUTF16();
1310
0
              matched =
1311
0
                  (cp1 == cp2 ||
1312
0
                   traits_.canonicalize(cp1, unicode) ==
1313
0
                       traits_.canonicalize(cp2, unicode));
1314
0
            }
1315
0
          }
1316
0
          if (!matched) {
1317
0
            BACKTRACK();
1318
0
          }
1319
0
          s->ip_ += sizeof(BackRefInsn);
1320
0
          c.setCurrentPointer(cursor1.currentPointer());
1321
0
          break;
1322
0
        }
1323
1324
0
        case Opcode::Lookaround: {
1325
0
          const LookaroundInsn *insn = llvh::cast<LookaroundInsn>(base);
1326
0
          bool matched = false;
1327
0
          if (c.satisfiesConstraints(flags_, insn->constraints)) {
1328
            // Copy the state. This is because if the match fails (or if we are
1329
            // inverted) we need to restore its capture groups.
1330
0
            State savedState{*s};
1331
1332
            // Set the direction of the cursor.
1333
0
            c.setForwards(insn->forwards);
1334
1335
            // Invoke match() recursively with our expression.
1336
            // Save and restore the position because lookaheads do not consume
1337
            // anything.
1338
0
            s->ip_ += sizeof(LookaroundInsn);
1339
0
            auto match = this->match(s, true /* onlyAtStart */);
1340
            // If the match errored out due to stack overflow, then we need to
1341
            // return an error here as well.
1342
0
            if (LLVM_UNLIKELY(!match)) {
1343
0
              return match.getStatus();
1344
0
            }
1345
            // We got a match if the value is non-null.
1346
0
            matched = match.getValue() != nullptr;
1347
0
            c.setCurrentPointer(savedState.cursor_.currentPointer());
1348
0
            c.setForwards(savedState.cursor_.forwards());
1349
1350
            // Restore capture groups unless we are a positive lookaround that
1351
            // successfully matched. If we are a successfully matching positive
1352
            // lookaround, set up backtracking to reset the capture groups. Note
1353
            // we never backtrack INTO a successfully matched lookahead:
1354
            // once a lookahead finds a match it forgets all other ways it could
1355
            // have matched. (ES 5.1 15.10.2.8 Note 2).
1356
0
            if (matched && !insn->invert) {
1357
              // Backtrack capture groups in the lookahead expression.
1358
0
              for (uint32_t i = insn->mexpBegin, e = insn->mexpEnd; i < e;
1359
0
                   i++) {
1360
0
                CapturedRange cr = savedState.getCapturedRange(i);
1361
0
                auto res = pushBacktrack(
1362
0
                    backtrackStack, BacktrackInsn::makeSetCaptureGroup(i, cr));
1363
0
                if (res != ExecutionStatus::RETURNED)
1364
0
                  return res;
1365
0
              }
1366
0
            } else {
1367
              // Restore the saved state.
1368
0
              *s = std::move(savedState);
1369
0
            }
1370
0
          }
1371
1372
          // 'matched' tells us whether the enclosed assertion expression
1373
          // matched the input. This instruction matched the input if it is a
1374
          // positive assertion (invert == false) and the expression matched,
1375
          // or a negative assertion (invert == true) and the expression did
1376
          // not match. Hence xor with invert.
1377
0
          if (matched ^ insn->invert)
1378
0
            s->ip_ = insn->continuation;
1379
0
          else
1380
0
            BACKTRACK();
1381
0
          break;
1382
0
        }
1383
1384
0
        case Opcode::BeginLoop: {
1385
          // Here we are entering a loop from outside, not jumping back into
1386
          // it.
1387
0
          const BeginLoopInsn *loop = llvh::cast<BeginLoopInsn>(base);
1388
0
          s->getLoop(loop->loopId).iterations = 0;
1389
          // Check to see if the loop body is viable. If not, and the loop has
1390
          // a nonzero minimum iteration, then we know we won't match and we
1391
          // can reject the state. If it does have a minimum iteration, we can
1392
          // just skip to the not-taken target. Note that this is a static
1393
          // property of the loop so we don't need to check it on every
1394
          // iteration, only the first one.
1395
0
          if (!c.satisfiesConstraints(flags_, loop->loopeeConstraints)) {
1396
0
            if (loop->min > 0) {
1397
0
              BACKTRACK();
1398
0
            } else {
1399
0
              s->ip_ = loop->notTakenTarget;
1400
0
              break;
1401
0
            }
1402
0
          }
1403
0
          goto runLoop;
1404
0
        }
1405
1406
0
        case Opcode::EndLoop:
1407
          // This is reached after the body of a loop finishes executing.
1408
          // Move the IP to the loop and run it again immediately.
1409
0
          s->ip_ = llvh::cast<EndLoopInsn>(base)->target;
1410
0
          base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]);
1411
          // Note fall through.
1412
1413
0
        runLoop: {
1414
0
          const BeginLoopInsn *loop = llvh::cast<BeginLoopInsn>(base);
1415
0
          auto &loopData = s->getLoop(loop->loopId);
1416
0
          uint32_t iteration = loopData.iterations;
1417
1418
0
          const uint32_t loopTakenIp = s->ip_ + sizeof(BeginLoopInsn);
1419
1420
0
          assert(loop->min <= loop->max && "Inconsistent loop bounds");
1421
1422
          // Check to see if we have looped more than the minimum number of
1423
          // iterations, and if so, whether the subexpression we looped over
1424
          // matched an empty string. ES6 21.2.2.5.1 Note 4: "once the
1425
          // minimum number of repetitions has been satisfied, any more
1426
          // expansions of Atom that match the empty character sequence are
1427
          // not considered for further repetitions."
1428
0
          if (iteration > loop->min &&
1429
0
              loopData.entryPosition == c.offsetFromLeft())
1430
0
            BACKTRACK();
1431
1432
0
          if (iteration < loop->min) {
1433
0
            auto res = prepareToEnterLoopBody(s, loop, backtrackStack);
1434
0
            if (res != ExecutionStatus::RETURNED)
1435
0
              return res;
1436
0
            s->ip_ = loopTakenIp;
1437
0
          } else if (iteration == loop->max) {
1438
0
            s->ip_ = loop->notTakenTarget;
1439
0
          } else {
1440
            // We are within the target iteration range, figure out whether we
1441
            // should continue or exit.
1442
0
            assert(iteration >= loop->min && iteration < loop->max);
1443
0
            if (!loop->greedy) {
1444
              // Backtrack by entering this non-greedy loop.
1445
0
              loopData.entryPosition = c.offsetFromLeft();
1446
0
              auto res = pushBacktrack(
1447
0
                  backtrackStack,
1448
0
                  BacktrackInsn::makeEnterNonGreedyLoop(
1449
0
                      loop, loopTakenIp, loopData));
1450
0
              if (res != ExecutionStatus::RETURNED) {
1451
0
                return res;
1452
0
              }
1453
0
              s->ip_ = loop->notTakenTarget;
1454
0
            } else {
1455
              // Backtrack by exiting this greedy loop.
1456
0
              auto pushRes = pushBacktrack(
1457
0
                  backtrackStack,
1458
0
                  BacktrackInsn::makeSetPosition(
1459
0
                      loop->notTakenTarget, c.currentPointer()));
1460
0
              if (pushRes != ExecutionStatus::RETURNED)
1461
0
                return pushRes;
1462
1463
0
              auto prepRes = prepareToEnterLoopBody(s, loop, backtrackStack);
1464
0
              if (prepRes != ExecutionStatus::RETURNED)
1465
0
                return prepRes;
1466
0
              s->ip_ = loopTakenIp;
1467
0
            }
1468
0
          }
1469
0
          break;
1470
0
        }
1471
1472
0
        case Opcode::BeginSimpleLoop: {
1473
          // Here we are entering a simple loop from outside,
1474
          // not jumping back into it.
1475
0
          const BeginSimpleLoopInsn *loop =
1476
0
              llvh::cast<BeginSimpleLoopInsn>(base);
1477
1478
0
          if (!c.satisfiesConstraints(flags_, loop->loopeeConstraints)) {
1479
0
            s->ip_ = loop->notTakenTarget;
1480
0
            break;
1481
0
          }
1482
1483
0
          goto runSimpleLoop;
1484
0
        }
1485
1486
0
        case Opcode::EndSimpleLoop:
1487
0
          s->ip_ = llvh::cast<EndSimpleLoopInsn>(base)->target;
1488
0
          base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]);
1489
          // Note: fall-through.
1490
1491
0
        runSimpleLoop: {
1492
0
          const BeginSimpleLoopInsn *loop =
1493
0
              llvh::cast<BeginSimpleLoopInsn>(base);
1494
          // Since this is a simple loop, we'll always need to explore both
1495
          // exiting the loop at this point and continuing to loop.
1496
          // Note simple loops are always greedy.
1497
0
          auto res = pushBacktrack(
1498
0
              backtrackStack,
1499
0
              BacktrackInsn::makeSetPosition(
1500
0
                  loop->notTakenTarget, c.currentPointer()));
1501
0
          if (res != ExecutionStatus::RETURNED) {
1502
0
            return res;
1503
0
          }
1504
0
          s->ip_ += sizeof(BeginSimpleLoopInsn);
1505
0
          break;
1506
0
        }
1507
1508
86
        case Opcode::Width1Loop: {
1509
86
          const Width1LoopInsn *loop = llvh::cast<Width1LoopInsn>(base);
1510
86
          auto matchRes = matchWidth1Loop(loop, s, backtrackStack);
1511
86
          if (LLVM_UNLIKELY(!matchRes))
1512
0
            return matchRes.getStatus();
1513
86
          if (!*matchRes)
1514
0
            BACKTRACK();
1515
86
          break;
1516
86
        }
1517
294
      }
1518
294
    }
1519
  // The search failed at this location.
1520
86
  backtrackingExhausted:
1521
86
    continue;
1522
122
  }
1523
86
#undef BACKTRACK
1524
  // The match failed.
1525
86
  return nullptr;
1526
86
}
Unexecuted instantiation: hermes::regex::Context<hermes::regex::ASCIIRegexTraits>::match(hermes::regex::State<hermes::regex::ASCIIRegexTraits>*, bool)
1527
1528
/// Entry point for searching a string via regex compiled bytecode.
1529
/// Given the bytecode \p bytecode, search the range starting at \p first up to
1530
/// (not including) \p last with the flags \p matchFlags. If the search
1531
/// succeeds, poopulate MatchResults with the capture groups. \return true if
1532
/// some portion of the string matched the regex represented by the bytecode,
1533
/// false otherwise.
1534
template <typename CharT, class Traits>
1535
MatchRuntimeResult searchWithBytecodeImpl(
1536
    llvh::ArrayRef<uint8_t> bytecode,
1537
    const CharT *first,
1538
    uint32_t start,
1539
    uint32_t length,
1540
    std::vector<CapturedRange> *m,
1541
    constants::MatchFlagType matchFlags,
1542
86
    StackOverflowGuard guard) {
1543
86
  assert(
1544
86
      bytecode.size() >= sizeof(RegexBytecodeHeader) && "Bytecode too small");
1545
86
  auto header = reinterpret_cast<const RegexBytecodeHeader *>(bytecode.data());
1546
1547
  // Check for match impossibility before doing anything else.
1548
86
  Cursor<Traits> cursor{
1549
86
      first, first + start, first + length, true /* forwards */};
1550
86
  if (!cursor.satisfiesConstraints(matchFlags, header->constraints))
1551
0
    return MatchRuntimeResult::NoMatch;
1552
1553
86
  auto markedCount = header->markedCount;
1554
86
  auto loopCount = header->loopCount;
1555
1556
86
  Context<Traits> ctx(
1557
86
      bytecode,
1558
86
      matchFlags,
1559
86
      SyntaxFlags::fromByte(header->syntaxFlags),
1560
86
      first,
1561
86
      first + length,
1562
86
      header->markedCount,
1563
86
      header->loopCount,
1564
86
      guard);
1565
86
  State<Traits> state{cursor, markedCount, loopCount};
1566
1567
  // We check only one location if either the regex pattern constrains us to, or
1568
  // the flags request it (via the sticky flag 'y').
1569
86
  bool onlyAtStart = (header->constraints & MatchConstraintAnchoredAtStart) ||
1570
86
      (matchFlags & constants::matchOnlyAtStart);
1571
1572
86
  auto res = ctx.match(&state, onlyAtStart);
1573
86
  if (!res) {
1574
0
    assert(res.getStatus() == ExecutionStatus::STACK_OVERFLOW);
1575
0
    return MatchRuntimeResult::StackOverflow;
1576
0
  }
1577
86
  if (const CharT *matchStartLoc = res.getValue()) {
1578
    // Match succeeded. Return captured ranges. The first range is the total
1579
    // match, followed by any capture groups.
1580
0
    if (m != nullptr) {
1581
0
      uint32_t totalStart = static_cast<uint32_t>(matchStartLoc - first);
1582
0
      uint32_t totalEnd =
1583
0
          static_cast<uint32_t>(state.cursor_.currentPointer() - first);
1584
0
      m->clear();
1585
0
      m->push_back(CapturedRange{totalStart, totalEnd});
1586
0
      std::copy_n(
1587
0
          state.capturedRanges_.begin(), markedCount, std::back_inserter(*m));
1588
0
    }
1589
0
    return MatchRuntimeResult::Match;
1590
0
  }
1591
86
  return MatchRuntimeResult::NoMatch;
1592
86
}
hermes::regex::MatchRuntimeResult hermes::regex::searchWithBytecodeImpl<char16_t, hermes::regex::UTF16RegexTraits>(llvh::ArrayRef<unsigned char>, char16_t const*, unsigned int, unsigned int, std::__1::vector<hermes::regex::CapturedRange, std::__1::allocator<hermes::regex::CapturedRange> >*, hermes::regex::constants::MatchFlagType, hermes::StackOverflowGuard)
Line
Count
Source
1542
86
    StackOverflowGuard guard) {
1543
86
  assert(
1544
86
      bytecode.size() >= sizeof(RegexBytecodeHeader) && "Bytecode too small");
1545
86
  auto header = reinterpret_cast<const RegexBytecodeHeader *>(bytecode.data());
1546
1547
  // Check for match impossibility before doing anything else.
1548
86
  Cursor<Traits> cursor{
1549
86
      first, first + start, first + length, true /* forwards */};
1550
86
  if (!cursor.satisfiesConstraints(matchFlags, header->constraints))
1551
0
    return MatchRuntimeResult::NoMatch;
1552
1553
86
  auto markedCount = header->markedCount;
1554
86
  auto loopCount = header->loopCount;
1555
1556
86
  Context<Traits> ctx(
1557
86
      bytecode,
1558
86
      matchFlags,
1559
86
      SyntaxFlags::fromByte(header->syntaxFlags),
1560
86
      first,
1561
86
      first + length,
1562
86
      header->markedCount,
1563
86
      header->loopCount,
1564
86
      guard);
1565
86
  State<Traits> state{cursor, markedCount, loopCount};
1566
1567
  // We check only one location if either the regex pattern constrains us to, or
1568
  // the flags request it (via the sticky flag 'y').
1569
86
  bool onlyAtStart = (header->constraints & MatchConstraintAnchoredAtStart) ||
1570
86
      (matchFlags & constants::matchOnlyAtStart);
1571
1572
86
  auto res = ctx.match(&state, onlyAtStart);
1573
86
  if (!res) {
1574
0
    assert(res.getStatus() == ExecutionStatus::STACK_OVERFLOW);
1575
0
    return MatchRuntimeResult::StackOverflow;
1576
0
  }
1577
86
  if (const CharT *matchStartLoc = res.getValue()) {
1578
    // Match succeeded. Return captured ranges. The first range is the total
1579
    // match, followed by any capture groups.
1580
0
    if (m != nullptr) {
1581
0
      uint32_t totalStart = static_cast<uint32_t>(matchStartLoc - first);
1582
0
      uint32_t totalEnd =
1583
0
          static_cast<uint32_t>(state.cursor_.currentPointer() - first);
1584
0
      m->clear();
1585
0
      m->push_back(CapturedRange{totalStart, totalEnd});
1586
0
      std::copy_n(
1587
0
          state.capturedRanges_.begin(), markedCount, std::back_inserter(*m));
1588
0
    }
1589
0
    return MatchRuntimeResult::Match;
1590
0
  }
1591
86
  return MatchRuntimeResult::NoMatch;
1592
86
}
Unexecuted instantiation: hermes::regex::MatchRuntimeResult hermes::regex::searchWithBytecodeImpl<char, hermes::regex::ASCIIRegexTraits>(llvh::ArrayRef<unsigned char>, char const*, unsigned int, unsigned int, std::__1::vector<hermes::regex::CapturedRange, std::__1::allocator<hermes::regex::CapturedRange> >*, hermes::regex::constants::MatchFlagType, hermes::StackOverflowGuard)
1593
1594
MatchRuntimeResult searchWithBytecode(
1595
    llvh::ArrayRef<uint8_t> bytecode,
1596
    const char16_t *first,
1597
    uint32_t start,
1598
    uint32_t length,
1599
    std::vector<CapturedRange> *m,
1600
    constants::MatchFlagType matchFlags,
1601
86
    StackOverflowGuard guard) {
1602
86
  return searchWithBytecodeImpl<char16_t, UTF16RegexTraits>(
1603
86
      bytecode, first, start, length, m, matchFlags, guard);
1604
86
}
1605
1606
MatchRuntimeResult searchWithBytecode(
1607
    llvh::ArrayRef<uint8_t> bytecode,
1608
    const char *first,
1609
    uint32_t start,
1610
    uint32_t length,
1611
    std::vector<CapturedRange> *m,
1612
    constants::MatchFlagType matchFlags,
1613
0
    StackOverflowGuard guard) {
1614
0
  return searchWithBytecodeImpl<char, ASCIIRegexTraits>(
1615
0
      bytecode, first, start, length, m, matchFlags, guard);
1616
0
}
1617
1618
} // namespace regex
1619
} // namespace hermes