/src/hermes/include/hermes/Regex/Regex.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | // -*- C++ -*- |
9 | | //===--------------------------- regex ------------------------------------===// |
10 | | // |
11 | | // The LLVM Compiler Infrastructure |
12 | | // |
13 | | // This file is dual licensed under the MIT and the University of Illinois Open |
14 | | // Source Licenses. See LICENSE.TXT for details. |
15 | | // |
16 | | //===----------------------------------------------------------------------===// |
17 | | |
18 | | #ifndef HERMES_REGEX_COMPILER_H |
19 | | #define HERMES_REGEX_COMPILER_H |
20 | | |
21 | | #include "hermes/Regex/RegexBytecode.h" |
22 | | #include "hermes/Regex/RegexNode.h" |
23 | | #include "hermes/Regex/RegexSupport.h" |
24 | | #include "hermes/Regex/RegexTypes.h" |
25 | | #include "hermes/Support/Algorithms.h" |
26 | | #include "hermes/Support/Compiler.h" |
27 | | |
28 | | #include <deque> |
29 | | #include <string> |
30 | | #include <vector> |
31 | | |
32 | | namespace hermes { |
33 | | namespace regex { |
34 | | |
35 | | template <class Traits> |
36 | | class Regex { |
37 | | // Enable the Parser to add nodes to us. |
38 | | template <class A, class B> |
39 | | friend class Parser; |
40 | | |
41 | | using CharT = typename Traits::CodeUnit; |
42 | | using CodePoint = typename Traits::CodePoint; |
43 | | using Node = regex::Node; |
44 | | using BracketNode = regex::BracketNode<Traits>; |
45 | | |
46 | | private: |
47 | | Traits traits_; |
48 | | SyntaxFlags flags_ = {}; |
49 | | |
50 | | // Number of capture groups encountered so far. |
51 | | uint16_t markedCount_ = 0; |
52 | | |
53 | | // Number of loops encountered so far. |
54 | | uint32_t loopCount_ = 0; |
55 | | |
56 | | // The list of nodes so far. |
57 | | NodeList nodes_; |
58 | | |
59 | | // List of all unique_ptrs to all nodes created by this Regex, used to defer |
60 | | // destructors and avoid a stack overflow. |
61 | | NodeHolder nodeHolder_; |
62 | | |
63 | | // The error, which may be set after parsing. |
64 | | constants::ErrorType error_ = constants::ErrorType::None; |
65 | | |
66 | | // Constraints on the type of strings that can match this regex. |
67 | | MatchConstraintSet matchConstraints_ = 0; |
68 | | |
69 | | // This holds the named capture groups in the order they were defined. |
70 | | std::deque<llvh::SmallVector<char16_t, 5>> orderedGroupNames_{}; |
71 | | |
72 | | ParsedGroupNamesMapping nameMapping_{}; |
73 | | |
74 | | // We can skip double parsing if there were no named backreferences used |
75 | | // before the definition of the first named capture group. |
76 | | bool sawNamedBackrefBeforeGroup_{false}; |
77 | | |
78 | | // Named backrefs that might be invalid. |
79 | | std::vector<std::pair<GroupName, BackRefNode *>> unresolvedNamedBackRefs_; |
80 | | |
81 | | /// Construct and and append a node of type NodeType at the end of the nodes_ |
82 | | /// list. The node should be constructible from \p args. |
83 | | /// \return an observer pointer to the new node. |
84 | | template <typename NodeType, typename... Args> |
85 | 1.75M | NodeType *appendNode(Args &&...args) { |
86 | 1.75M | std::unique_ptr<NodeType> node = |
87 | 1.75M | std::make_unique<NodeType>(std::forward<Args>(args)...); |
88 | 1.75M | NodeType *nodePtr = node.get(); |
89 | 1.75M | nodeHolder_.push_back(std::move(node)); |
90 | 1.75M | nodes_.push_back(nodePtr); |
91 | 1.75M | return nodePtr; |
92 | 1.75M | } hermes::regex::Node* hermes::regex::Regex<hermes::regex::UTF16RegexTraits>::appendNode<hermes::regex::Node>() Line | Count | Source | 85 | 1.82k | NodeType *appendNode(Args &&...args) { | 86 | 1.82k | std::unique_ptr<NodeType> node = | 87 | 1.82k | std::make_unique<NodeType>(std::forward<Args>(args)...); | 88 | 1.82k | NodeType *nodePtr = node.get(); | 89 | 1.82k | nodeHolder_.push_back(std::move(node)); | 90 | 1.82k | nodes_.push_back(nodePtr); | 91 | 1.82k | return nodePtr; | 92 | 1.82k | } |
hermes::regex::GoalNode* hermes::regex::Regex<hermes::regex::UTF16RegexTraits>::appendNode<hermes::regex::GoalNode>() Line | Count | Source | 85 | 1.82k | NodeType *appendNode(Args &&...args) { | 86 | 1.82k | std::unique_ptr<NodeType> node = | 87 | 1.82k | std::make_unique<NodeType>(std::forward<Args>(args)...); | 88 | 1.82k | NodeType *nodePtr = node.get(); | 89 | 1.82k | nodeHolder_.push_back(std::move(node)); | 90 | 1.82k | nodes_.push_back(nodePtr); | 91 | 1.82k | return nodePtr; | 92 | 1.82k | } |
hermes::regex::AlternationNode* hermes::regex::Regex<hermes::regex::UTF16RegexTraits>::appendNode<hermes::regex::AlternationNode, std::__1::vector<std::__1::vector<hermes::regex::Node*, std::__1::allocator<hermes::regex::Node*> >, std::__1::allocator<std::__1::vector<hermes::regex::Node*, std::__1::allocator<hermes::regex::Node*> > > > >(std::__1::vector<std::__1::vector<hermes::regex::Node*, std::__1::allocator<hermes::regex::Node*> >, std::__1::allocator<std::__1::vector<hermes::regex::Node*, std::__1::allocator<hermes::regex::Node*> > > >&&) Line | Count | Source | 85 | 789 | NodeType *appendNode(Args &&...args) { | 86 | 789 | std::unique_ptr<NodeType> node = | 87 | 789 | std::make_unique<NodeType>(std::forward<Args>(args)...); | 88 | 789 | NodeType *nodePtr = node.get(); | 89 | 789 | nodeHolder_.push_back(std::move(node)); | 90 | 789 | nodes_.push_back(nodePtr); | 91 | 789 | return nodePtr; | 92 | 789 | } |
hermes::regex::MarkedSubexpressionNode* hermes::regex::Regex<hermes::regex::UTF16RegexTraits>::appendNode<hermes::regex::MarkedSubexpressionNode, std::__1::vector<hermes::regex::Node*, std::__1::allocator<hermes::regex::Node*> >, unsigned int&>(std::__1::vector<hermes::regex::Node*, std::__1::allocator<hermes::regex::Node*> >&&, unsigned int&) Line | Count | Source | 85 | 60.6k | NodeType *appendNode(Args &&...args) { | 86 | 60.6k | std::unique_ptr<NodeType> node = | 87 | 60.6k | std::make_unique<NodeType>(std::forward<Args>(args)...); | 88 | 60.6k | NodeType *nodePtr = node.get(); | 89 | 60.6k | nodeHolder_.push_back(std::move(node)); | 90 | 60.6k | nodes_.push_back(nodePtr); | 91 | 60.6k | return nodePtr; | 92 | 60.6k | } |
Unexecuted instantiation: hermes::regex::LookaroundNode* hermes::regex::Regex<hermes::regex::UTF16RegexTraits>::appendNode<hermes::regex::LookaroundNode, std::__1::vector<hermes::regex::Node*, std::__1::allocator<hermes::regex::Node*> >, unsigned short&, unsigned short&, bool&, bool&>(std::__1::vector<hermes::regex::Node*, std::__1::allocator<hermes::regex::Node*> >&&, unsigned short&, unsigned short&, bool&, bool&) hermes::regex::LoopNode* hermes::regex::Regex<hermes::regex::UTF16RegexTraits>::appendNode<hermes::regex::LoopNode, unsigned int, unsigned int&, unsigned int&, bool&, unsigned int&, unsigned short&, std::__1::vector<hermes::regex::Node*, std::__1::allocator<hermes::regex::Node*> > >(unsigned int&&, unsigned int&, unsigned int&, bool&, unsigned int&, unsigned short&, std::__1::vector<hermes::regex::Node*, std::__1::allocator<hermes::regex::Node*> >&&) Line | Count | Source | 85 | 79.6k | NodeType *appendNode(Args &&...args) { | 86 | 79.6k | std::unique_ptr<NodeType> node = | 87 | 79.6k | std::make_unique<NodeType>(std::forward<Args>(args)...); | 88 | 79.6k | NodeType *nodePtr = node.get(); | 89 | 79.6k | nodeHolder_.push_back(std::move(node)); | 90 | 79.6k | nodes_.push_back(nodePtr); | 91 | 79.6k | return nodePtr; | 92 | 79.6k | } |
hermes::regex::LeftAnchorNode* hermes::regex::Regex<hermes::regex::UTF16RegexTraits>::appendNode<hermes::regex::LeftAnchorNode, hermes::regex::SyntaxFlags>(hermes::regex::SyntaxFlags&&) Line | Count | Source | 85 | 10.3k | NodeType *appendNode(Args &&...args) { | 86 | 10.3k | std::unique_ptr<NodeType> node = | 87 | 10.3k | std::make_unique<NodeType>(std::forward<Args>(args)...); | 88 | 10.3k | NodeType *nodePtr = node.get(); | 89 | 10.3k | nodeHolder_.push_back(std::move(node)); | 90 | 10.3k | nodes_.push_back(nodePtr); | 91 | 10.3k | return nodePtr; | 92 | 10.3k | } |
hermes::regex::RightAnchorNode* hermes::regex::Regex<hermes::regex::UTF16RegexTraits>::appendNode<hermes::regex::RightAnchorNode>() Line | Count | Source | 85 | 1 | NodeType *appendNode(Args &&...args) { | 86 | 1 | std::unique_ptr<NodeType> node = | 87 | 1 | std::make_unique<NodeType>(std::forward<Args>(args)...); | 88 | 1 | NodeType *nodePtr = node.get(); | 89 | 1 | nodeHolder_.push_back(std::move(node)); | 90 | 1 | nodes_.push_back(nodePtr); | 91 | 1 | return nodePtr; | 92 | 1 | } |
hermes::regex::WordBoundaryNode* hermes::regex::Regex<hermes::regex::UTF16RegexTraits>::appendNode<hermes::regex::WordBoundaryNode, bool&>(bool&) Line | Count | Source | 85 | 42.3k | NodeType *appendNode(Args &&...args) { | 86 | 42.3k | std::unique_ptr<NodeType> node = | 87 | 42.3k | std::make_unique<NodeType>(std::forward<Args>(args)...); | 88 | 42.3k | NodeType *nodePtr = node.get(); | 89 | 42.3k | nodeHolder_.push_back(std::move(node)); | 90 | 42.3k | nodes_.push_back(nodePtr); | 91 | 42.3k | return nodePtr; | 92 | 42.3k | } |
hermes::regex::BracketNode<hermes::regex::UTF16RegexTraits>* hermes::regex::Regex<hermes::regex::UTF16RegexTraits>::appendNode<hermes::regex::BracketNode<hermes::regex::UTF16RegexTraits>, hermes::regex::UTF16RegexTraits&, bool&, hermes::regex::SyntaxFlags&>(hermes::regex::UTF16RegexTraits&, bool&, hermes::regex::SyntaxFlags&) Line | Count | Source | 85 | 45.0k | NodeType *appendNode(Args &&...args) { | 86 | 45.0k | std::unique_ptr<NodeType> node = | 87 | 45.0k | std::make_unique<NodeType>(std::forward<Args>(args)...); | 88 | 45.0k | NodeType *nodePtr = node.get(); | 89 | 45.0k | nodeHolder_.push_back(std::move(node)); | 90 | 45.0k | nodes_.push_back(nodePtr); | 91 | 45.0k | return nodePtr; | 92 | 45.0k | } |
hermes::regex::BackRefNode* hermes::regex::Regex<hermes::regex::UTF16RegexTraits>::appendNode<hermes::regex::BackRefNode, unsigned int&>(unsigned int&) Line | Count | Source | 85 | 110 | NodeType *appendNode(Args &&...args) { | 86 | 110 | std::unique_ptr<NodeType> node = | 87 | 110 | std::make_unique<NodeType>(std::forward<Args>(args)...); | 88 | 110 | NodeType *nodePtr = node.get(); | 89 | 110 | nodeHolder_.push_back(std::move(node)); | 90 | 110 | nodes_.push_back(nodePtr); | 91 | 110 | return nodePtr; | 92 | 110 | } |
Unexecuted instantiation: hermes::regex::BackRefNode* hermes::regex::Regex<hermes::regex::UTF16RegexTraits>::appendNode<hermes::regex::BackRefNode, int>(int&&) Unexecuted instantiation: hermes::regex::BackRefNode* hermes::regex::Regex<hermes::regex::UTF16RegexTraits>::appendNode<hermes::regex::BackRefNode, unsigned int>(unsigned int&&) hermes::regex::MatchAnyNode* hermes::regex::Regex<hermes::regex::UTF16RegexTraits>::appendNode<hermes::regex::MatchAnyNode, hermes::regex::SyntaxFlags>(hermes::regex::SyntaxFlags&&) Line | Count | Source | 85 | 86.2k | NodeType *appendNode(Args &&...args) { | 86 | 86.2k | std::unique_ptr<NodeType> node = | 87 | 86.2k | std::make_unique<NodeType>(std::forward<Args>(args)...); | 88 | 86.2k | NodeType *nodePtr = node.get(); | 89 | 86.2k | nodeHolder_.push_back(std::move(node)); | 90 | 86.2k | nodes_.push_back(nodePtr); | 91 | 86.2k | return nodePtr; | 92 | 86.2k | } |
hermes::regex::MatchCharNode* hermes::regex::Regex<hermes::regex::UTF16RegexTraits>::appendNode<hermes::regex::MatchCharNode, llvh::SmallVector<unsigned int, 5u>, hermes::regex::SyntaxFlags>(llvh::SmallVector<unsigned int, 5u>&&, hermes::regex::SyntaxFlags&&) Line | Count | Source | 85 | 1.42M | NodeType *appendNode(Args &&...args) { | 86 | 1.42M | std::unique_ptr<NodeType> node = | 87 | 1.42M | std::make_unique<NodeType>(std::forward<Args>(args)...); | 88 | 1.42M | NodeType *nodePtr = node.get(); | 89 | 1.42M | nodeHolder_.push_back(std::move(node)); | 90 | 1.42M | nodes_.push_back(nodePtr); | 91 | 1.42M | return nodePtr; | 92 | 1.42M | } |
|
93 | | |
94 | | /// \return the "current" node, which is the last (rightmost) node created. |
95 | 1.92M | Node *currentNode() { |
96 | 1.92M | return nodes_.back(); |
97 | 1.92M | } |
98 | | |
99 | | /// \return the number of marked subexpressions. |
100 | 1.92M | uint16_t markedCount() const { |
101 | 1.92M | return markedCount_; |
102 | 1.92M | } |
103 | | |
104 | | /// Increment the number of marked subexpressions. |
105 | | /// \return the previous value of markedCount_. |
106 | 60.6k | uint16_t incrementMarkedCount() { |
107 | 60.6k | assert( |
108 | 60.6k | markedCount_ < std::numeric_limits<uint16_t>::max() && |
109 | 60.6k | "markedCount_ will overflow"); |
110 | 60.6k | return markedCount_++; |
111 | 60.6k | } |
112 | | |
113 | | /// Given that the node \p splicePoint is in our node list, remove all nodes |
114 | | /// after it. \return a list of the removed nodes. |
115 | 211k | NodeList spliceOut(Node *splicePoint) { |
116 | 211k | assert(splicePoint && "null node in spliceOut"); |
117 | | // Find the index of the splice point. We expect it to be towards the end. |
118 | 211k | size_t spliceIndex = nodes_.size(); |
119 | 1.32M | while (spliceIndex--) { |
120 | 1.32M | if (nodes_[spliceIndex] == splicePoint) |
121 | 211k | break; |
122 | 1.32M | } |
123 | 211k | assert(spliceIndex < nodes_.size() && "Node not in node list"); |
124 | | // Move all nodes after the splice index into a new vector. |
125 | | // Note this may be empty. |
126 | 211k | auto firstToMove = nodes_.begin() + spliceIndex + 1; |
127 | 211k | NodeList result; |
128 | 211k | std::move(firstToMove, nodes_.end(), std::back_inserter(result)); |
129 | 211k | nodes_.erase(firstToMove, nodes_.end()); |
130 | 211k | return result; |
131 | 211k | } |
132 | | |
133 | | public: |
134 | | /// Compile the regex into bytecode. Return the resulting bytecode. |
135 | 1.82k | std::vector<uint8_t> compile() const { |
136 | 1.82k | assert(valid() && "Cannot compile invalid regex."); |
137 | | // TODO: add validation for the loop and reduce the size of loopCount_ to |
138 | | // uint16_t. |
139 | 1.82k | assert( |
140 | 1.82k | markedCount_ <= constants::kMaxCaptureGroupCount && |
141 | 1.82k | "Too many capture groups"); |
142 | 1.82k | assert(loopCount_ <= constants::kMaxLoopCount && "Too many loops"); |
143 | 1.82k | RegexBytecodeHeader header = { |
144 | 1.82k | markedCount_, |
145 | 1.82k | static_cast<uint16_t>(loopCount_), |
146 | 1.82k | flags_.toByte(), |
147 | 1.82k | matchConstraints_}; |
148 | 1.82k | RegexBytecodeStream bcs(header); |
149 | 1.82k | Node::compile(nodes_, bcs); |
150 | 1.82k | return bcs.acquireBytecode(); |
151 | 1.82k | } |
152 | | |
153 | | // Constructors |
154 | | explicit Regex(const CharT *p, const char16_t *f = u"") |
155 | 1 | : Regex( |
156 | 1 | {p, p + std::char_traits<CharT>::length(p)}, |
157 | 1 | {f, f + std::char_traits<CharT>::length(f)}) {} |
158 | | |
159 | | Regex( |
160 | | const llvh::ArrayRef<CharT> pattern, |
161 | 1.83k | const llvh::ArrayRef<char16_t> flags = {}) { |
162 | | // Compute the SyntaxFlags based on the flags string. |
163 | 1.83k | auto sflags = SyntaxFlags::fromString(flags); |
164 | 1.83k | if (!sflags) { |
165 | 1 | error_ = constants::ErrorType::InvalidFlags; |
166 | 1 | return; |
167 | 1 | } |
168 | 1.82k | flags_ = *sflags; |
169 | 1.82k | error_ = parse(pattern.begin(), pattern.end()); |
170 | 1.82k | } |
171 | | |
172 | | // Disallow copy-assignment and copy-construction. |
173 | | Regex &operator=(const Regex &) = delete; |
174 | | Regex(const Regex &) = delete; |
175 | | |
176 | | /// Move-assignment and move-construction. |
177 | | Regex &operator=(Regex &&) = default; |
178 | | Regex(Regex &&) = default; |
179 | | |
180 | | // Accessors. |
181 | | unsigned markCount() const { |
182 | | return markedCount_; |
183 | | } |
184 | 2.99M | SyntaxFlags flags() const { |
185 | 2.99M | return flags_; |
186 | 2.99M | } |
187 | | |
188 | 0 | std::deque<llvh::SmallVector<char16_t, 5>> &getOrderedNamedGroups() { |
189 | 0 | return orderedGroupNames_; |
190 | 0 | } |
191 | | |
192 | 1.82k | std::deque<llvh::SmallVector<char16_t, 5>> acquireOrderedGroupNames() { |
193 | 1.82k | return std::move(orderedGroupNames_); |
194 | 1.82k | } |
195 | | |
196 | 0 | ParsedGroupNamesMapping &getGroupNamesMapping() { |
197 | 0 | return nameMapping_; |
198 | 0 | } |
199 | | |
200 | 1.82k | ParsedGroupNamesMapping acquireGroupNamesMapping() { |
201 | 1.82k | return std::move(nameMapping_); |
202 | 1.82k | } |
203 | | |
204 | 0 | void sawNamedBackrefBeforeGroup() { |
205 | 0 | sawNamedBackrefBeforeGroup_ = true; |
206 | 0 | } |
207 | | |
208 | | /// \return any errors produced during parsing, or ErrorType::None if none. |
209 | 2 | constants::ErrorType getError() const { |
210 | 2 | return error_; |
211 | 2 | } |
212 | | |
213 | | /// \return whether the regex was parsed successfully. |
214 | 3.65k | bool valid() const { |
215 | 3.65k | return error_ == constants::ErrorType::None; |
216 | 3.65k | } |
217 | | |
218 | | /// \return the set of match constraints for the regex. |
219 | | MatchConstraintSet matchConstraints() const { |
220 | | return matchConstraints_; |
221 | | } |
222 | | |
223 | | private: |
224 | | template <class ForwardIterator> |
225 | | constants::ErrorType parse(ForwardIterator first, ForwardIterator last); |
226 | | |
227 | | /// Attempt to parse the regex from the range [\p first, \p last), using |
228 | | /// \p backRefLimit as the maximum decimal escape to interpret as a |
229 | | /// backreference. The maximum backreference that was in fact encountered |
230 | | /// is returned by reference in \p out_max_back_ref, if that is larger than |
231 | | /// its current value. \return an error code. |
232 | | template <class ForwardIterator> |
233 | | constants::ErrorType parseWithBackRefLimit( |
234 | | ForwardIterator first, |
235 | | ForwardIterator last, |
236 | | uint32_t backRefLimit, |
237 | | bool hasNamedGroups, |
238 | | uint32_t *outMaxBackRef); |
239 | | |
240 | | // Note- this method does not insert into the AST, it populates a different |
241 | | // datastructure. Returns false if the given name was already defined. |
242 | | bool addNamedCaptureGroup(GroupName &&identifier, uint32_t groupNum); |
243 | | |
244 | | bool resolveNamedBackRefs(); |
245 | | |
246 | | void pushLeftAnchor(); |
247 | | void pushRightAnchor(); |
248 | | void pushMatchAny(); |
249 | | void pushLoop( |
250 | | uint32_t min, |
251 | | uint32_t max, |
252 | | NodeList loopedList, |
253 | | uint32_t mexp_begin, |
254 | | bool greedy); |
255 | | BracketNode *startBracketList(bool negate); |
256 | | void pushChar(CodePoint c); |
257 | | void pushCharClass(CharacterClass c); |
258 | | void pushBackRef(uint32_t i); |
259 | | void pushNamedBackRef(GroupName &&identifier); |
260 | | void pushAlternation(std::vector<NodeList> alternatives); |
261 | | void pushMarkedSubexpression(NodeList, uint32_t mexp); |
262 | | void pushWordBoundary(bool); |
263 | | void pushLookaround(NodeList, uint16_t, uint16_t, bool, bool); |
264 | | }; |
265 | | |
266 | | template <typename Receiver> |
267 | | constants::ErrorType parseRegex( |
268 | | const char16_t *start, |
269 | | const char16_t *end, |
270 | | Receiver *receiver, |
271 | | SyntaxFlags flags, |
272 | | uint32_t backRefLimit, |
273 | | bool hasNamedGroups, |
274 | | uint32_t *outMaxBackRef); |
275 | | |
276 | | template <class Traits> |
277 | | template <class ForwardIterator> |
278 | | constants::ErrorType Regex<Traits>::parse( |
279 | | ForwardIterator first, |
280 | 1.82k | ForwardIterator last) { |
281 | 1.82k | uint32_t maxBackRef = 0; |
282 | 1.82k | bool hasNamedGroups = flags_.unicode; |
283 | 1.82k | auto result = parseWithBackRefLimit( |
284 | 1.82k | first, |
285 | 1.82k | last, |
286 | 1.82k | constants::kMaxCaptureGroupCount, |
287 | 1.82k | hasNamedGroups, |
288 | 1.82k | &maxBackRef); |
289 | | |
290 | | // Validate loop and capture group count. |
291 | 1.82k | if (loopCount_ > constants::kMaxLoopCount) { |
292 | 0 | return constants::ErrorType::PatternExceedsParseLimits; |
293 | 0 | } |
294 | | |
295 | | // See comment --DecimalEscape-- |
296 | | // We parsed without a backreference limit because we had to parse to discover |
297 | | // the limit. Now we know that we wrongly interpreted a decimal escape as a |
298 | | // backreference. See ES6 Annex B.1.4 DecimalEscape "but only if the integer |
299 | | // value DecimalEscape is <= NCapturingParens". Now that we know the true |
300 | | // capture group count, either produce an error (if Unicode) or re-parse with |
301 | | // that as the limit so overlarge decimal escapes will be ignored. |
302 | 1.82k | bool reparseForNumberedBackref = |
303 | 1.82k | result == constants::ErrorType::None && maxBackRef > markedCount_; |
304 | | // We must also reparse if there were any named capture groups used and it is |
305 | | // not unicode mode. |
306 | 1.82k | bool reparseForNamedBackref = false; |
307 | 1.82k | if (!flags_.unicode && nameMapping_.size() > 0 && |
308 | 0 | sawNamedBackrefBeforeGroup_) { |
309 | 0 | reparseForNamedBackref = true; |
310 | 0 | hasNamedGroups = true; |
311 | 0 | } |
312 | | |
313 | 1.82k | if (reparseForNumberedBackref || reparseForNamedBackref) { |
314 | 0 | if (flags_.unicode) { |
315 | 0 | return constants::ErrorType::EscapeInvalid; |
316 | 0 | } |
317 | | |
318 | 0 | uint32_t backRefLimit = markedCount_; |
319 | 0 | uint32_t reparsedMaxBackRef = 0; |
320 | 0 | loopCount_ = 0; |
321 | 0 | markedCount_ = 0; |
322 | 0 | matchConstraints_ = 0; |
323 | 0 | nameMapping_.clear(); |
324 | 0 | orderedGroupNames_.clear(); |
325 | 0 | result = parseWithBackRefLimit( |
326 | 0 | first, last, backRefLimit, hasNamedGroups, &reparsedMaxBackRef); |
327 | 0 | assert( |
328 | 0 | reparsedMaxBackRef <= backRefLimit && |
329 | 0 | "invalid backreference generated"); |
330 | 0 | (void)reparsedMaxBackRef; |
331 | 0 | } |
332 | 1.82k | return result; |
333 | 1.82k | } |
334 | | |
335 | | template <class Traits> |
336 | | template <class ForwardIterator> |
337 | | constants::ErrorType Regex<Traits>::parseWithBackRefLimit( |
338 | | ForwardIterator first, |
339 | | ForwardIterator last, |
340 | | uint32_t backRefLimit, |
341 | | bool hasNamedGroups, |
342 | 1.82k | uint32_t *outMaxBackRef) { |
343 | | // Initialize our node list with a single no-op node (it must never be empty.) |
344 | 1.82k | nodes_.clear(); |
345 | 1.82k | appendNode<Node>(); |
346 | 1.82k | auto result = parseRegex( |
347 | 1.82k | first, last, this, flags_, backRefLimit, hasNamedGroups, outMaxBackRef); |
348 | | |
349 | | // If we succeeded, add a goal node as the last node and perform optimizations |
350 | | // on the list. |
351 | 1.82k | if (result == constants::ErrorType::None) { |
352 | 1.82k | appendNode<GoalNode>(); |
353 | 1.82k | Node::optimizeNodeList(nodes_, flags_, nodeHolder_); |
354 | 1.82k | if (!resolveNamedBackRefs()) { |
355 | 0 | return constants::ErrorType::NonexistentNamedCaptureReference; |
356 | 0 | } |
357 | 1.82k | } |
358 | | |
359 | | // Compute any match constraints. |
360 | 1.82k | matchConstraints_ = Node::matchConstraintsForList(nodes_); |
361 | | |
362 | 1.82k | return result; |
363 | 1.82k | } |
364 | | |
365 | | template <class Traits> |
366 | 1.82k | bool Regex<Traits>::resolveNamedBackRefs() { |
367 | 1.82k | for (auto &[name, backRef] : unresolvedNamedBackRefs_) { |
368 | 0 | auto search = nameMapping_.find(name); |
369 | 0 | if (search == nameMapping_.end()) { |
370 | 0 | return false; |
371 | 0 | } |
372 | 0 | auto groupNum = search->second; |
373 | 0 | backRef->setBackRef(groupNum - 1); |
374 | 0 | } |
375 | 1.82k | return true; |
376 | 1.82k | } |
377 | | |
378 | | template <class Traits> |
379 | | void Regex<Traits>::pushLoop( |
380 | | uint32_t min, |
381 | | uint32_t max, |
382 | | NodeList loopedExpr, |
383 | | uint32_t mexp_begin, |
384 | 79.6k | bool greedy) { |
385 | 79.6k | appendNode<LoopNode>( |
386 | 79.6k | loopCount_++, |
387 | 79.6k | min, |
388 | 79.6k | max, |
389 | 79.6k | greedy, |
390 | 79.6k | mexp_begin, |
391 | 79.6k | markedCount_, |
392 | 79.6k | std::move(loopedExpr)); |
393 | 79.6k | } |
394 | | |
395 | | template <class Traits> |
396 | 1.42M | void Regex<Traits>::pushChar(CodePoint c) { |
397 | 1.42M | bool icase = flags().ignoreCase; |
398 | 1.42M | if (icase) |
399 | 51.4k | c = traits_.canonicalize(c, flags().unicode); |
400 | 1.42M | appendNode<MatchCharNode>(Node::CodePointList{c}, flags()); |
401 | 1.42M | } |
402 | | |
403 | | template <class Traits> |
404 | 45.0k | void Regex<Traits>::pushCharClass(CharacterClass c) { |
405 | 45.0k | auto bracket = startBracketList(false); |
406 | 45.0k | bracket->addClass(c); |
407 | 45.0k | } |
408 | | |
409 | | template <class Traits> |
410 | 60.6k | void Regex<Traits>::pushMarkedSubexpression(NodeList nodes, uint32_t mexp) { |
411 | 60.6k | appendNode<MarkedSubexpressionNode>(std::move(nodes), mexp); |
412 | 60.6k | } |
413 | | |
414 | | template <class Traits> |
415 | 10.3k | void Regex<Traits>::pushLeftAnchor() { |
416 | 10.3k | appendNode<LeftAnchorNode>(flags()); |
417 | 10.3k | } |
418 | | |
419 | | template <class Traits> |
420 | 1 | void Regex<Traits>::pushRightAnchor() { |
421 | 1 | appendNode<RightAnchorNode>(); |
422 | 1 | } |
423 | | |
424 | | template <class Traits> |
425 | 86.2k | void Regex<Traits>::pushMatchAny() { |
426 | 86.2k | appendNode<MatchAnyNode>(flags()); |
427 | 86.2k | } |
428 | | |
429 | | template <class Traits> |
430 | 42.3k | void Regex<Traits>::pushWordBoundary(bool invert) { |
431 | 42.3k | appendNode<WordBoundaryNode>(invert); |
432 | 42.3k | } |
433 | | |
434 | | template <class Traits> |
435 | 110 | void Regex<Traits>::pushBackRef(uint32_t i) { |
436 | 110 | appendNode<BackRefNode>(i); |
437 | 110 | } |
438 | | |
439 | | template <class Traits> |
440 | 0 | void Regex<Traits>::pushNamedBackRef(GroupName &&identifier) { |
441 | 0 | auto search = nameMapping_.find(identifier); |
442 | 0 | if (search == nameMapping_.end()) { |
443 | | // If this name hasn't been defined yet, we have a case of an ambiguous |
444 | | // named backref. It could be valid or not, because the group name could be |
445 | | // defined in the future. We will revist these nodes at the end to see if |
446 | | // they are valid. |
447 | 0 | BackRefNode *backRef = appendNode<BackRefNode>(0); |
448 | 0 | unresolvedNamedBackRefs_.emplace_back(std::move(identifier), backRef); |
449 | 0 | return; |
450 | 0 | } |
451 | 0 | auto groupNum = search->second; |
452 | 0 | appendNode<BackRefNode>(groupNum - 1); |
453 | 0 | } |
454 | | |
455 | | template <class Traits> |
456 | 789 | void Regex<Traits>::pushAlternation(std::vector<NodeList> alternatives) { |
457 | 789 | appendNode<AlternationNode>(std::move(alternatives)); |
458 | 789 | } |
459 | | |
460 | | template <class Traits> |
461 | 45.0k | BracketNode<Traits> *Regex<Traits>::startBracketList(bool negate) { |
462 | 45.0k | return appendNode<BracketNode>(traits_, negate, flags_); |
463 | 45.0k | } |
464 | | |
465 | | template <class Traits> |
466 | | void Regex<Traits>::pushLookaround( |
467 | | NodeList exp, |
468 | | uint16_t mexpBegin, |
469 | | uint16_t mexpEnd, |
470 | | bool invert, |
471 | 0 | bool forwards) { |
472 | 0 | if (!forwards) { |
473 | 0 | Node::reverseNodeList(exp); |
474 | 0 | } |
475 | 0 | nodeHolder_.push_back(std::make_unique<GoalNode>()); |
476 | 0 | exp.push_back(nodeHolder_.back().get()); |
477 | 0 | appendNode<LookaroundNode>( |
478 | 0 | std::move(exp), mexpBegin, mexpEnd, invert, forwards); |
479 | 0 | } |
480 | | |
481 | | template <class Traits> |
482 | | bool Regex<Traits>::addNamedCaptureGroup( |
483 | | GroupName &&identifier, |
484 | 0 | uint32_t groupNum) { |
485 | | // Add one to the given group number because later on this is used to index |
486 | | // into the whole matches array, which will be prepended with the entire match |
487 | | // string so all these group numbers will be off by one if we don't offset it |
488 | | // here. |
489 | 0 | auto &elm = orderedGroupNames_.emplace_back(std::move(identifier)); |
490 | 0 | auto res = nameMapping_.try_emplace(elm, groupNum + 1); |
491 | 0 | return res.second; |
492 | 0 | } |
493 | | |
494 | | } // namespace regex |
495 | | } // namespace hermes |
496 | | |
497 | | #endif // HERMES_REGEX_COMPILER_H |