Coverage Report

Created: 2025-12-12 07:27

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/hermes/include/hermes/Regex/RegexTraits.h
Line
Count
Source
1
/*
2
 * Copyright (c) Meta Platforms, Inc. and affiliates.
3
 *
4
 * This source code is licensed under the MIT license found in the
5
 * LICENSE file in the root directory of this source tree.
6
 */
7
8
//===----------------------------------------------------------------------===//
9
/// \file
10
/// Regex traits appropriate for Hermes regex.
11
//===----------------------------------------------------------------------===//
12
13
#ifndef HERMES_REGEX_TRAITS_H
14
#define HERMES_REGEX_TRAITS_H
15
16
#include "hermes/Platform/Unicode/CharacterProperties.h"
17
#include "hermes/Platform/Unicode/PlatformUnicode.h"
18
#include "hermes/Regex/RegexBytecode.h"
19
#include "hermes/Regex/RegexTypes.h"
20
#include "llvh/ADT/DenseMap.h"
21
#include "llvh/ADT/Optional.h"
22
23
namespace hermes {
24
namespace regex {
25
26
/// \return whether any range in \p ranges contains the character \p c,
27
/// inclusive of both ends.
28
inline bool anyRangeContainsChar(
29
    llvh::ArrayRef<BracketRange32> ranges,
30
83
    uint32_t c) {
31
83
  for (const auto &r : ranges) {
32
0
    if (r.start <= c && c <= r.end) {
33
0
      return true;
34
0
    }
35
0
  }
36
83
  return false;
37
83
}
38
39
/// Implementation of regex::Traits for UTF-16.
40
struct UTF16RegexTraits {
41
  /// A CodePoint is a 24-bit Unicode code point.
42
  using CodePoint = uint32_t;
43
44
  /// A CodeUnit is either a CodePoint or half of a UTF-16 surrogate pair.
45
  using CodeUnit = char16_t;
46
47
 private:
48
  using CanonicalizeCache = llvh::SmallDenseMap<CodePoint, CodePoint, 16>;
49
  mutable CanonicalizeCache toUpperCache_;
50
51
  /// ES9 11.2
52
83
  static bool isWhiteSpaceChar(CodePoint c) {
53
83
    return c == u'\u0009' || c == u'\u000B' || c == u'\u000C' ||
54
83
        c == u'\u0020' || c == u'\u00A0' || c == u'\uFEFF' || c == u'\u1680' ||
55
83
        (c >= u'\u2000' && c <= u'\u200A') || c == u'\u202F' ||
56
83
        c == u'\u205F' || c == u'\u3000';
57
83
  }
58
59
  /// ES9 11.3
60
83
  static bool isLineTerminatorChar(CodePoint c) {
61
83
    return c == u'\u000A' || c == u'\u000D' || c == u'\u2028' || c == u'\u2029';
62
83
  }
63
64
 public:
65
  /// \return whether the character \p c has the character type \p type.
66
83
  bool characterHasType(CodePoint c, regex::CharacterClass::Type type) const {
67
83
    switch (type) {
68
0
      case regex::CharacterClass::Digits:
69
0
        return u'0' <= c && c <= u'9';
70
83
      case regex::CharacterClass::Spaces:
71
83
        return isWhiteSpaceChar(c) || isLineTerminatorChar(c);
72
0
      case regex::CharacterClass::Words:
73
0
        return (u'a' <= c && c <= u'z') || (u'A' <= c && c <= u'Z') ||
74
0
            (u'0' <= c && c <= u'9') || (c == u'_');
75
83
    }
76
83
    llvm_unreachable("Unknown character type");
77
83
  }
78
79
  /// ES9 21.2.2.8.2
80
  /// \return the canonicalized form of \p c, following either the unicode or
81
  /// non-unicode algorithm according to \p unicode.
82
51.4k
  static CodePoint canonicalize(CodePoint c, bool unicode) {
83
51.4k
    static_assert(
84
51.4k
        std::numeric_limits<CodePoint>::min() == 0,
85
51.4k
        "CodePoint must be unsigned");
86
    // If we are unicode, we want to case-fold, which is effectively lowercase
87
    // form. If we are non-Unicode, we need the uppercase form.
88
51.4k
    if (LLVM_LIKELY(c <= 127)) {
89
      // ASCII fast path.
90
51.4k
      if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) {
91
        // It's a letter. Make it lowercase (uppercase) if Unicode is set
92
        // (unset).
93
50.1k
        c &= ~(1u << 5); // uppercase
94
50.1k
        c |= ((uint32_t)unicode << 5); // lowercase if Unicode is set.
95
50.1k
      }
96
51.4k
      return c;
97
51.4k
    }
98
16
    return hermes::canonicalize(c, unicode);
99
51.4k
  }
100
101
  /// \return whether the character c is contained within the range [first,
102
  /// last]. If ICase is set, perform a canonicalizing membership test as
103
  /// specified in "CharacterSetMatcher" ES5.1 15.10.2.8.
104
83
  bool rangesContain(llvh::ArrayRef<BracketRange32> ranges, CodePoint c) const {
105
83
    return anyRangeContainsChar(ranges, c);
106
83
  }
107
};
108
109
/// Implementation of regex::Traits for 7-bit ASCII.
110
struct ASCIIRegexTraits {
111
  /// CodePoint and CodeUnits are both 7-bit ASCII values.
112
  using CodePoint = uint8_t;
113
  using CodeUnit = char;
114
115
0
  bool characterHasType(CodePoint c, regex::CharacterClass::Type type) const {
116
0
    switch (type) {
117
0
      case regex::CharacterClass::Digits:
118
0
        return '0' <= c && c <= '9';
119
0
      case regex::CharacterClass::Spaces:
120
0
        switch (c) {
121
0
          case ' ':
122
0
          case '\t':
123
0
          case '\r':
124
0
          case '\n':
125
0
          case '\v':
126
0
          case '\f':
127
0
            return true;
128
0
          default:
129
0
            return false;
130
0
        }
131
0
      case regex::CharacterClass::Words:
132
0
        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
133
0
            ('0' <= c && c <= '9') || (c == '_');
134
0
    }
135
0
    llvm_unreachable("Unknown character type");
136
0
  }
137
138
  /// ES6 21.2.2.8.2
139
  /// The Unicode path is lowercase; the non-Unicode path is uppercase.
140
0
  static CodePoint canonicalize(CodePoint c, bool unicode) {
141
0
    if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) {
142
      // It's a letter. Make it lowercase (uppercase) if Unicode is set (unset).
143
0
      c &= ~(1u << 5); // uppercase
144
0
      c |= ((uint32_t)unicode << 5); // lowercase if Unicode is set.
145
0
    }
146
0
    return c;
147
0
  }
148
149
  /// \return whether any of a list of ranges contains \p c.
150
  /// Note that our ranges contain uint32_t, but we test chars for membership.
151
0
  bool rangesContain(llvh::ArrayRef<BracketRange32> ranges, char16_t c) const {
152
0
    return anyRangeContainsChar(ranges, c);
153
0
  }
154
};
155
156
} // end namespace regex
157
} // end namespace hermes
158
159
#endif