/src/hermes/include/hermes/Regex/RegexTraits.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | //===----------------------------------------------------------------------===// |
9 | | /// \file |
10 | | /// Regex traits appropriate for Hermes regex. |
11 | | //===----------------------------------------------------------------------===// |
12 | | |
13 | | #ifndef HERMES_REGEX_TRAITS_H |
14 | | #define HERMES_REGEX_TRAITS_H |
15 | | |
16 | | #include "hermes/Platform/Unicode/CharacterProperties.h" |
17 | | #include "hermes/Platform/Unicode/PlatformUnicode.h" |
18 | | #include "hermes/Regex/RegexBytecode.h" |
19 | | #include "hermes/Regex/RegexTypes.h" |
20 | | #include "llvh/ADT/DenseMap.h" |
21 | | #include "llvh/ADT/Optional.h" |
22 | | |
23 | | namespace hermes { |
24 | | namespace regex { |
25 | | |
26 | | /// \return whether any range in \p ranges contains the character \p c, |
27 | | /// inclusive of both ends. |
28 | | inline bool anyRangeContainsChar( |
29 | | llvh::ArrayRef<BracketRange32> ranges, |
30 | 83 | uint32_t c) { |
31 | 83 | for (const auto &r : ranges) { |
32 | 0 | if (r.start <= c && c <= r.end) { |
33 | 0 | return true; |
34 | 0 | } |
35 | 0 | } |
36 | 83 | return false; |
37 | 83 | } |
38 | | |
39 | | /// Implementation of regex::Traits for UTF-16. |
40 | | struct UTF16RegexTraits { |
41 | | /// A CodePoint is a 24-bit Unicode code point. |
42 | | using CodePoint = uint32_t; |
43 | | |
44 | | /// A CodeUnit is either a CodePoint or half of a UTF-16 surrogate pair. |
45 | | using CodeUnit = char16_t; |
46 | | |
47 | | private: |
48 | | using CanonicalizeCache = llvh::SmallDenseMap<CodePoint, CodePoint, 16>; |
49 | | mutable CanonicalizeCache toUpperCache_; |
50 | | |
51 | | /// ES9 11.2 |
52 | 83 | static bool isWhiteSpaceChar(CodePoint c) { |
53 | 83 | return c == u'\u0009' || c == u'\u000B' || c == u'\u000C' || |
54 | 83 | c == u'\u0020' || c == u'\u00A0' || c == u'\uFEFF' || c == u'\u1680' || |
55 | 83 | (c >= u'\u2000' && c <= u'\u200A') || c == u'\u202F' || |
56 | 83 | c == u'\u205F' || c == u'\u3000'; |
57 | 83 | } |
58 | | |
59 | | /// ES9 11.3 |
60 | 83 | static bool isLineTerminatorChar(CodePoint c) { |
61 | 83 | return c == u'\u000A' || c == u'\u000D' || c == u'\u2028' || c == u'\u2029'; |
62 | 83 | } |
63 | | |
64 | | public: |
65 | | /// \return whether the character \p c has the character type \p type. |
66 | 83 | bool characterHasType(CodePoint c, regex::CharacterClass::Type type) const { |
67 | 83 | switch (type) { |
68 | 0 | case regex::CharacterClass::Digits: |
69 | 0 | return u'0' <= c && c <= u'9'; |
70 | 83 | case regex::CharacterClass::Spaces: |
71 | 83 | return isWhiteSpaceChar(c) || isLineTerminatorChar(c); |
72 | 0 | case regex::CharacterClass::Words: |
73 | 0 | return (u'a' <= c && c <= u'z') || (u'A' <= c && c <= u'Z') || |
74 | 0 | (u'0' <= c && c <= u'9') || (c == u'_'); |
75 | 83 | } |
76 | 83 | llvm_unreachable("Unknown character type"); |
77 | 83 | } |
78 | | |
79 | | /// ES9 21.2.2.8.2 |
80 | | /// \return the canonicalized form of \p c, following either the unicode or |
81 | | /// non-unicode algorithm according to \p unicode. |
82 | 51.4k | static CodePoint canonicalize(CodePoint c, bool unicode) { |
83 | 51.4k | static_assert( |
84 | 51.4k | std::numeric_limits<CodePoint>::min() == 0, |
85 | 51.4k | "CodePoint must be unsigned"); |
86 | | // If we are unicode, we want to case-fold, which is effectively lowercase |
87 | | // form. If we are non-Unicode, we need the uppercase form. |
88 | 51.4k | if (LLVM_LIKELY(c <= 127)) { |
89 | | // ASCII fast path. |
90 | 51.4k | if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) { |
91 | | // It's a letter. Make it lowercase (uppercase) if Unicode is set |
92 | | // (unset). |
93 | 50.1k | c &= ~(1u << 5); // uppercase |
94 | 50.1k | c |= ((uint32_t)unicode << 5); // lowercase if Unicode is set. |
95 | 50.1k | } |
96 | 51.4k | return c; |
97 | 51.4k | } |
98 | 16 | return hermes::canonicalize(c, unicode); |
99 | 51.4k | } |
100 | | |
101 | | /// \return whether the character c is contained within the range [first, |
102 | | /// last]. If ICase is set, perform a canonicalizing membership test as |
103 | | /// specified in "CharacterSetMatcher" ES5.1 15.10.2.8. |
104 | 83 | bool rangesContain(llvh::ArrayRef<BracketRange32> ranges, CodePoint c) const { |
105 | 83 | return anyRangeContainsChar(ranges, c); |
106 | 83 | } |
107 | | }; |
108 | | |
109 | | /// Implementation of regex::Traits for 7-bit ASCII. |
110 | | struct ASCIIRegexTraits { |
111 | | /// CodePoint and CodeUnits are both 7-bit ASCII values. |
112 | | using CodePoint = uint8_t; |
113 | | using CodeUnit = char; |
114 | | |
115 | 0 | bool characterHasType(CodePoint c, regex::CharacterClass::Type type) const { |
116 | 0 | switch (type) { |
117 | 0 | case regex::CharacterClass::Digits: |
118 | 0 | return '0' <= c && c <= '9'; |
119 | 0 | case regex::CharacterClass::Spaces: |
120 | 0 | switch (c) { |
121 | 0 | case ' ': |
122 | 0 | case '\t': |
123 | 0 | case '\r': |
124 | 0 | case '\n': |
125 | 0 | case '\v': |
126 | 0 | case '\f': |
127 | 0 | return true; |
128 | 0 | default: |
129 | 0 | return false; |
130 | 0 | } |
131 | 0 | case regex::CharacterClass::Words: |
132 | 0 | return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || |
133 | 0 | ('0' <= c && c <= '9') || (c == '_'); |
134 | 0 | } |
135 | 0 | llvm_unreachable("Unknown character type"); |
136 | 0 | } |
137 | | |
138 | | /// ES6 21.2.2.8.2 |
139 | | /// The Unicode path is lowercase; the non-Unicode path is uppercase. |
140 | 0 | static CodePoint canonicalize(CodePoint c, bool unicode) { |
141 | 0 | if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) { |
142 | | // It's a letter. Make it lowercase (uppercase) if Unicode is set (unset). |
143 | 0 | c &= ~(1u << 5); // uppercase |
144 | 0 | c |= ((uint32_t)unicode << 5); // lowercase if Unicode is set. |
145 | 0 | } |
146 | 0 | return c; |
147 | 0 | } |
148 | | |
149 | | /// \return whether any of a list of ranges contains \p c. |
150 | | /// Note that our ranges contain uint32_t, but we test chars for membership. |
151 | 0 | bool rangesContain(llvh::ArrayRef<BracketRange32> ranges, char16_t c) const { |
152 | 0 | return anyRangeContainsChar(ranges, c); |
153 | 0 | } |
154 | | }; |
155 | | |
156 | | } // end namespace regex |
157 | | } // end namespace hermes |
158 | | |
159 | | #endif |