Coverage Report

Created: 2025-12-12 07:27

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/hermes/include/hermes/Regex/RegexTypes.h
Line
Count
Source
1
/*
2
 * Copyright (c) Meta Platforms, Inc. and affiliates.
3
 *
4
 * This source code is licensed under the MIT license found in the
5
 * LICENSE file in the root directory of this source tree.
6
 */
7
8
// -*- C++ -*-
9
//===--------------------------- regex ------------------------------------===//
10
//
11
//                     The LLVM Compiler Infrastructure
12
//
13
// This file is dual licensed under the MIT and the University of Illinois Open
14
// Source Licenses. See LICENSE.TXT for details.
15
//
16
//===----------------------------------------------------------------------===//
17
18
#ifndef HERMES_REGEX_TYPES_H
19
#define HERMES_REGEX_TYPES_H
20
21
#include "llvh/ADT/ArrayRef.h"
22
#include "llvh/ADT/SmallString.h"
23
24
namespace hermes {
25
26
struct UnicodeRangePoolRef;
27
28
namespace regex {
29
namespace constants {
30
31
// MatchFlagType
32
33
enum MatchFlagType {
34
  /// Default match options.
35
  matchDefault = 0,
36
37
  /// ^ anchors should not treat the input start as a line start.
38
  matchNotBeginningOfLine = 1 << 0,
39
40
  /// $ anchors should not treat the input end as a line end.
41
  matchNotEndOfLine = 1 << 1,
42
43
  /// Hint that the input is composed entirely of ASCII characters.
44
  matchInputAllAscii = 1 << 2,
45
46
  /// Do not search for a match past the search start location.
47
  matchOnlyAtStart = 1 << 3,
48
};
49
50
0
inline constexpr MatchFlagType operator~(MatchFlagType x) {
51
0
  return MatchFlagType(~int(x) & 0x0FFF);
52
0
}
53
54
0
inline constexpr MatchFlagType operator&(MatchFlagType x, MatchFlagType y) {
55
0
  return MatchFlagType(int(x) & int(y));
56
0
}
57
58
0
inline constexpr MatchFlagType operator|(MatchFlagType x, MatchFlagType y) {
59
0
  return MatchFlagType(int(x) | int(y));
60
0
}
61
62
0
inline constexpr MatchFlagType operator^(MatchFlagType x, MatchFlagType y) {
63
0
  return MatchFlagType(int(x) ^ int(y));
64
0
}
65
66
0
inline MatchFlagType &operator&=(MatchFlagType &x, MatchFlagType y) {
67
0
  x = x & y;
68
0
  return x;
69
0
}
70
71
0
inline MatchFlagType &operator|=(MatchFlagType &x, MatchFlagType y) {
72
0
  x = x | y;
73
0
  return x;
74
0
}
75
76
0
inline MatchFlagType &operator^=(MatchFlagType &x, MatchFlagType y) {
77
0
  x = x ^ y;
78
0
  return x;
79
0
}
80
81
enum class ErrorType {
82
  /// No error occurred.
83
  None,
84
85
  /// An escaped value would overflow: /\xFFFFFFFFFFF/
86
  EscapeOverflow,
87
88
  /// incomplete escape: new RegExp("\\")
89
  EscapeIncomplete,
90
91
  /// Invalid escape: new RegExp("\\123", "u")
92
  EscapeInvalid,
93
94
  /// Mismatched [ and ].
95
  UnbalancedBracket,
96
97
  /// Mismatched ( and ).
98
  UnbalancedParenthesis,
99
100
  /// Braces have valid syntax, but the range is invalid, such as {5,3}.
101
  BraceRange,
102
103
  /// Invalid character range, such as [b-a].
104
  CharacterRange,
105
106
  /// A lone { or } was found.
107
  InvalidQuantifierBracket,
108
109
  /// One of *?+{ was not preceded by a valid regular expression.
110
  InvalidRepeat,
111
112
  /// The pattern exceeded internal limits, such as capture group or loop count.
113
  PatternExceedsParseLimits,
114
115
  /// The flags supplied were either invalid or contained repetition.
116
  InvalidFlags,
117
118
  /// The name for a capture group is invalid e.g. 1id
119
  InvalidCaptureGroupName,
120
121
  /// Duplicate capture group name.
122
  DuplicateCaptureGroupName,
123
124
  /// Syntactically invalid named backreference.
125
  InvalidNamedReference,
126
127
  /// Reference to nonexistent capture group.
128
  NonexistentNamedCaptureReference,
129
130
  /// Invalid Unicode property name or value
131
  InvalidPropertyName,
132
};
133
134
/// \return an error message for the given \p error.
135
2
inline const char *messageForError(ErrorType error) {
136
2
  switch (error) {
137
0
    case ErrorType::EscapeOverflow:
138
0
      return "Escaped value too large";
139
0
    case ErrorType::EscapeIncomplete:
140
0
      return "Incomplete escape";
141
0
    case ErrorType::EscapeInvalid:
142
0
      return "Invalid escape";
143
0
    case ErrorType::UnbalancedBracket:
144
0
      return "Character class not closed";
145
0
    case ErrorType::UnbalancedParenthesis:
146
0
      return "Parenthesized expression not closed";
147
0
    case ErrorType::BraceRange:
148
0
      return "Quantifier range out of order";
149
0
    case ErrorType::CharacterRange:
150
0
      return "Character class range out of order";
151
0
    case ErrorType::InvalidQuantifierBracket:
152
0
      return "Invalid quantifier bracket";
153
1
    case ErrorType::InvalidRepeat:
154
1
      return "Quantifier has nothing to repeat";
155
0
    case ErrorType::PatternExceedsParseLimits:
156
0
      return "Pattern exceeds parse limits";
157
1
    case ErrorType::InvalidFlags:
158
1
      return "Invalid flags";
159
0
    case ErrorType::InvalidCaptureGroupName:
160
0
      return "Invalid capture group name";
161
0
    case ErrorType::DuplicateCaptureGroupName:
162
0
      return "Duplicate capture group name";
163
0
    case ErrorType::InvalidNamedReference:
164
0
      return "Invalid named reference";
165
0
    case ErrorType::NonexistentNamedCaptureReference:
166
0
      return "Nonexistent named capture reference";
167
0
    case ErrorType::InvalidPropertyName:
168
0
      return "Invalid property name";
169
0
    case ErrorType::None:
170
0
      return "No error";
171
2
  }
172
2
  llvm_unreachable("Unknown error");
173
0
  return nullptr;
174
2
}
175
176
/// Maximum number of supported capture groups.
177
/// This is therefore also the maximum valid backreference.
178
constexpr uint16_t kMaxCaptureGroupCount = 65535;
179
180
/// Maximum number of supported loops.
181
constexpr uint16_t kMaxLoopCount = 65535;
182
183
} // namespace constants
184
185
/// After compiling a regex, there are certain properties we can test for that
186
/// enable us to quickly rule out matches. We refer to these as
187
/// MatchConstraints: they constrain the strings that may match the regex.
188
enum MatchConstraintFlags : uint8_t {
189
  /// If set, ASCII strings can never match because we require at least one
190
  /// non-ASCII character.
191
  MatchConstraintNonASCII = 1 << 0,
192
193
  /// If set, the regex can only match at the beginning of the input string, due
194
  /// to ^ anchors.
195
  MatchConstraintAnchoredAtStart = 1 << 1,
196
197
  /// If set, the regex cannot possibly match an empty string, e.g. /a/
198
  MatchConstraintNonEmpty = 1 << 2,
199
};
200
201
/// \return whether a code point \p cp is ASCII.
202
3.74M
inline bool isASCII(uint32_t c) {
203
3.74M
  return c <= 127;
204
3.74M
}
205
206
// Type wrapping up a character class, like \d or \S.
207
struct CharacterClass {
208
  enum Type : uint8_t {
209
    Digits = 1 << 0, // \d \D
210
    Spaces = 1 << 1, // \s \S
211
    Words = 1 << 2, // \w \W
212
  } type_;
213
214
  // Whether the class is inverted (\D instead of \d).
215
  bool inverted_;
216
217
45.4k
  CharacterClass(Type type, bool invert) : type_(type), inverted_(invert) {}
218
};
219
220
// Type wrapping up a Unicode codepoint range array.
221
struct CharacterClassCodepointRanges {
222
  llvh::ArrayRef<UnicodeRangePoolRef> rangeArray;
223
224
  // Whether the class is inverted (\P instead of \p).
225
  bool inverted_;
226
227
  CharacterClassCodepointRanges(
228
      llvh::ArrayRef<UnicodeRangePoolRef> rangeArray,
229
      bool inverted)
230
0
      : rangeArray(rangeArray), inverted_(inverted) {}
231
};
232
233
// Struct representing flags which may be used when constructing the RegExp
234
class SyntaxFlags {
235
 private:
236
  // Note these are encoded into bytecode files, so changing their values is a
237
  // breaking change.
238
  enum : uint8_t {
239
    ICASE = 1 << 0,
240
    GLOBAL = 1 << 1,
241
    MULTILINE = 1 << 2,
242
    UCODE = 1 << 3,
243
    DOTALL = 1 << 4,
244
    STICKY = 1 << 5,
245
    INDICES = 1 << 6,
246
  };
247
248
 public:
249
  // Note: Preserving the order here and ensuring it lines up with the order of
250
  // the offsets above makes the generated assembly more efficient.
251
  // Specifically, it makes the conversion to/from a byte almost free.
252
  uint8_t ignoreCase : 1;
253
  uint8_t global : 1;
254
  uint8_t multiline : 1;
255
  uint8_t unicode : 1;
256
  uint8_t dotAll : 1;
257
  uint8_t sticky : 1;
258
  uint8_t hasIndices : 1;
259
260
  /// \return a byte representing the flags. Bits are set based on the offsets
261
  /// specified above. This is used for serialising the flags to bytecode.
262
1.82k
  uint8_t toByte() const {
263
1.82k
    uint8_t ret = 0;
264
1.82k
    if (global)
265
2
      ret |= GLOBAL;
266
1.82k
    if (ignoreCase)
267
1
      ret |= ICASE;
268
1.82k
    if (multiline)
269
2
      ret |= MULTILINE;
270
1.82k
    if (unicode)
271
1
      ret |= UCODE;
272
1.82k
    if (sticky)
273
323
      ret |= STICKY;
274
1.82k
    if (dotAll)
275
0
      ret |= DOTALL;
276
1.82k
    if (hasIndices)
277
0
      ret |= INDICES;
278
1.82k
    return ret;
279
1.82k
  }
280
281
  /// \return a SyntaxFlags struct generated from the given byte. Bit offsets
282
  /// are specified above. This is used for deserialising the flags from
283
  /// bytecode.
284
1.04k
  static SyntaxFlags fromByte(uint8_t byte) {
285
1.04k
    SyntaxFlags ret = {};
286
1.04k
    if (byte & GLOBAL)
287
1
      ret.global = 1;
288
1.04k
    if (byte & ICASE)
289
1
      ret.ignoreCase = 1;
290
1.04k
    if (byte & MULTILINE)
291
0
      ret.multiline = 1;
292
1.04k
    if (byte & UCODE)
293
0
      ret.unicode = 1;
294
1.04k
    if (byte & STICKY)
295
0
      ret.sticky = 1;
296
1.04k
    if (byte & DOTALL)
297
0
      ret.dotAll = 1;
298
1.04k
    if (byte & INDICES)
299
0
      ret.hasIndices = 1;
300
1.04k
    return ret;
301
1.04k
  }
302
303
  /// \return a string representing the flags
304
  /// The characters are returned in the order given in ES2022 22.2.5.4
305
  /// (specifically hasIndices, global, ignoreCase, multiline, dotAll, unicode,
306
  /// sticky) Note this may differ in order from the string passed in
307
  /// construction
308
0
  llvh::SmallString<7> toString() const {
309
0
    llvh::SmallString<7> result;
310
0
    if (hasIndices)
311
0
      result.push_back('d');
312
0
    if (global)
313
0
      result.push_back('g');
314
0
    if (ignoreCase)
315
0
      result.push_back('i');
316
0
    if (multiline)
317
0
      result.push_back('m');
318
0
    if (dotAll)
319
0
      result.push_back('s');
320
0
    if (unicode)
321
0
      result.push_back('u');
322
0
    if (sticky)
323
0
      result.push_back('y');
324
0
    return result;
325
0
  }
326
327
  /// Given a flags string \p str, generate the corresponding SyntaxFlags
328
  /// \return the flags if the string is valid, an empty optional otherwise
329
  /// See ES 5.1 15.10.4.1 for description of the validation
330
  static llvh::Optional<SyntaxFlags> fromString(
331
1.83k
      const llvh::ArrayRef<char16_t> flags) {
332
    // A flags string may contain i,m,g, in any order, but at most once each
333
1.83k
    auto error = llvh::NoneType::None;
334
1.83k
    SyntaxFlags ret = {};
335
1.83k
    for (auto c : flags) {
336
331
      switch (c) {
337
1
        case u'i':
338
1
          if (ret.ignoreCase)
339
0
            return error;
340
1
          ret.ignoreCase = 1;
341
1
          break;
342
3
        case u'm':
343
3
          if (ret.multiline)
344
0
            return error;
345
3
          ret.multiline = 1;
346
3
          break;
347
2
        case u'g':
348
2
          if (ret.global)
349
0
            return error;
350
2
          ret.global = 1;
351
2
          break;
352
1
        case u'u':
353
1
          if (ret.unicode)
354
0
            return error;
355
1
          ret.unicode = 1;
356
1
          break;
357
323
        case u'y':
358
323
          if (ret.sticky)
359
0
            return error;
360
323
          ret.sticky = 1;
361
323
          break;
362
0
        case u's':
363
0
          if (ret.dotAll)
364
0
            return error;
365
0
          ret.dotAll = 1;
366
0
          break;
367
0
        case u'd':
368
0
          if (ret.hasIndices)
369
0
            return error;
370
0
          ret.hasIndices = 1;
371
0
          break;
372
1
        default:
373
1
          return error;
374
331
      }
375
331
    }
376
1.82k
    return ret;
377
1.83k
  }
378
};
379
} // namespace regex
380
} // namespace hermes
381
#endif // HERMES_REGEX_TYPES_H