/src/serenity/Userland/Libraries/LibRegex/RegexByteCode.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #pragma once |
8 | | |
9 | | #include "RegexBytecodeStreamOptimizer.h" |
10 | | #include "RegexMatch.h" |
11 | | #include "RegexOptions.h" |
12 | | |
13 | | #include <AK/Concepts.h> |
14 | | #include <AK/DisjointChunks.h> |
15 | | #include <AK/Forward.h> |
16 | | #include <AK/HashMap.h> |
17 | | #include <AK/NonnullOwnPtr.h> |
18 | | #include <AK/OwnPtr.h> |
19 | | #include <AK/Traits.h> |
20 | | #include <AK/TypeCasts.h> |
21 | | #include <AK/Types.h> |
22 | | #include <AK/Vector.h> |
23 | | #include <LibUnicode/Forward.h> |
24 | | |
25 | | namespace regex { |
26 | | |
27 | | using ByteCodeValueType = u64; |
28 | | |
29 | | #define ENUMERATE_OPCODES \ |
30 | 6 | __ENUMERATE_OPCODE(Compare) \ |
31 | 6 | __ENUMERATE_OPCODE(Jump) \ |
32 | 6 | __ENUMERATE_OPCODE(JumpNonEmpty) \ |
33 | 6 | __ENUMERATE_OPCODE(ForkJump) \ |
34 | 6 | __ENUMERATE_OPCODE(ForkStay) \ |
35 | 6 | __ENUMERATE_OPCODE(ForkReplaceJump) \ |
36 | 6 | __ENUMERATE_OPCODE(ForkReplaceStay) \ |
37 | 6 | __ENUMERATE_OPCODE(FailForks) \ |
38 | 6 | __ENUMERATE_OPCODE(SaveLeftCaptureGroup) \ |
39 | 6 | __ENUMERATE_OPCODE(SaveRightCaptureGroup) \ |
40 | 6 | __ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \ |
41 | 6 | __ENUMERATE_OPCODE(CheckBegin) \ |
42 | 6 | __ENUMERATE_OPCODE(CheckEnd) \ |
43 | 6 | __ENUMERATE_OPCODE(CheckBoundary) \ |
44 | 6 | __ENUMERATE_OPCODE(Save) \ |
45 | 6 | __ENUMERATE_OPCODE(Restore) \ |
46 | 6 | __ENUMERATE_OPCODE(GoBack) \ |
47 | 6 | __ENUMERATE_OPCODE(ClearCaptureGroup) \ |
48 | 6 | __ENUMERATE_OPCODE(Repeat) \ |
49 | 6 | __ENUMERATE_OPCODE(ResetRepeat) \ |
50 | 6 | __ENUMERATE_OPCODE(Checkpoint) \ |
51 | 6 | __ENUMERATE_OPCODE(Exit) |
52 | | |
53 | | // clang-format off |
54 | | enum class OpCodeId : ByteCodeValueType { |
55 | | #define __ENUMERATE_OPCODE(x) x, |
56 | | ENUMERATE_OPCODES |
57 | | #undef __ENUMERATE_OPCODE |
58 | | |
59 | | First = Compare, |
60 | | Last = Exit, |
61 | | }; |
62 | | // clang-format on |
63 | | |
64 | | #define ENUMERATE_CHARACTER_COMPARE_TYPES \ |
65 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \ |
66 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \ |
67 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \ |
68 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \ |
69 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \ |
70 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(String) \ |
71 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \ |
72 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \ |
73 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \ |
74 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \ |
75 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \ |
76 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \ |
77 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \ |
78 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) \ |
79 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable) \ |
80 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(And) \ |
81 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(Or) \ |
82 | 0 | __ENUMERATE_CHARACTER_COMPARE_TYPE(EndAndOr) |
83 | | |
84 | | enum class CharacterCompareType : ByteCodeValueType { |
85 | | #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x, |
86 | | ENUMERATE_CHARACTER_COMPARE_TYPES |
87 | | #undef __ENUMERATE_CHARACTER_COMPARE_TYPE |
88 | | }; |
89 | | |
90 | | #define ENUMERATE_CHARACTER_CLASSES \ |
91 | 0 | __ENUMERATE_CHARACTER_CLASS(Alnum) \ |
92 | 0 | __ENUMERATE_CHARACTER_CLASS(Cntrl) \ |
93 | 0 | __ENUMERATE_CHARACTER_CLASS(Lower) \ |
94 | 0 | __ENUMERATE_CHARACTER_CLASS(Space) \ |
95 | 0 | __ENUMERATE_CHARACTER_CLASS(Alpha) \ |
96 | 0 | __ENUMERATE_CHARACTER_CLASS(Digit) \ |
97 | 0 | __ENUMERATE_CHARACTER_CLASS(Print) \ |
98 | 0 | __ENUMERATE_CHARACTER_CLASS(Upper) \ |
99 | 0 | __ENUMERATE_CHARACTER_CLASS(Blank) \ |
100 | 0 | __ENUMERATE_CHARACTER_CLASS(Graph) \ |
101 | 0 | __ENUMERATE_CHARACTER_CLASS(Punct) \ |
102 | 0 | __ENUMERATE_CHARACTER_CLASS(Word) \ |
103 | 0 | __ENUMERATE_CHARACTER_CLASS(Xdigit) |
104 | | |
105 | | enum class CharClass : ByteCodeValueType { |
106 | | #define __ENUMERATE_CHARACTER_CLASS(x) x, |
107 | | ENUMERATE_CHARACTER_CLASSES |
108 | | #undef __ENUMERATE_CHARACTER_CLASS |
109 | | }; |
110 | | |
111 | | #define ENUMERATE_BOUNDARY_CHECK_TYPES \ |
112 | 0 | __ENUMERATE_BOUNDARY_CHECK_TYPE(Word) \ |
113 | 0 | __ENUMERATE_BOUNDARY_CHECK_TYPE(NonWord) |
114 | | |
115 | | enum class BoundaryCheckType : ByteCodeValueType { |
116 | | #define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) x, |
117 | | ENUMERATE_BOUNDARY_CHECK_TYPES |
118 | | #undef __ENUMERATE_BOUNDARY_CHECK_TYPE |
119 | | }; |
120 | | |
121 | | struct CharRange { |
122 | | u32 const from; |
123 | | u32 const to; |
124 | | |
125 | | CharRange(u64 value) |
126 | 48.7M | : from(value >> 32) |
127 | 48.7M | , to(value & 0xffffffff) |
128 | 48.7M | { |
129 | 48.7M | } |
130 | | |
131 | | CharRange(u32 from, u32 to) |
132 | 85.8M | : from(from) |
133 | 85.8M | , to(to) |
134 | 85.8M | { |
135 | 85.8M | } |
136 | | |
137 | 15.5M | operator ByteCodeValueType() const { return ((u64)from << 32) | to; } |
138 | | }; |
139 | | |
140 | | struct CompareTypeAndValuePair { |
141 | | CharacterCompareType type; |
142 | | ByteCodeValueType value; |
143 | | }; |
144 | | |
145 | | class OpCode; |
146 | | |
147 | | class ByteCode : public DisjointChunks<ByteCodeValueType> { |
148 | | using Base = DisjointChunks<ByteCodeValueType>; |
149 | | |
150 | | public: |
151 | | ByteCode() |
152 | 96.1M | { |
153 | 96.1M | ensure_opcodes_initialized(); |
154 | 96.1M | } |
155 | | |
156 | 3.63M | ByteCode(ByteCode const&) = default; |
157 | 99.7M | virtual ~ByteCode() = default; |
158 | | |
159 | 1.54M | ByteCode& operator=(ByteCode&&) = default; |
160 | | ByteCode& operator=(Base&& value) |
161 | 0 | { |
162 | 0 | static_cast<Base&>(*this) = move(value); |
163 | 0 | return *this; |
164 | 0 | } |
165 | | |
166 | | template<typename... Args> |
167 | | void empend(Args&&... args) |
168 | 257M | { |
169 | 257M | if (is_empty()) |
170 | 45.5M | Base::append({}); |
171 | 257M | Base::last_chunk().empend(forward<Args>(args)...); |
172 | 257M | } void regex::ByteCode::empend<unsigned long>(unsigned long&&) Line | Count | Source | 168 | 201M | { | 169 | 201M | if (is_empty()) | 170 | 45.5M | Base::append({}); | 171 | 201M | Base::last_chunk().empend(forward<Args>(args)...); | 172 | 201M | } |
void regex::ByteCode::empend<unsigned long&>(unsigned long&) Line | Count | Source | 168 | 56.0M | { | 169 | 56.0M | if (is_empty()) | 170 | 25 | Base::append({}); | 171 | 56.0M | Base::last_chunk().empend(forward<Args>(args)...); | 172 | 56.0M | } |
void regex::ByteCode::empend<unsigned int>(unsigned int&&) Line | Count | Source | 168 | 96.1k | { | 169 | 96.1k | if (is_empty()) | 170 | 0 | Base::append({}); | 171 | 96.1k | Base::last_chunk().empend(forward<Args>(args)...); | 172 | 96.1k | } |
|
173 | | template<typename T> |
174 | | void append(T&& value) |
175 | 92.8M | { |
176 | 92.8M | if (is_empty()) |
177 | 44.0M | Base::append({}); |
178 | 92.8M | Base::last_chunk().append(forward<T>(value)); |
179 | 92.8M | } void regex::ByteCode::append<unsigned long>(unsigned long&&) Line | Count | Source | 175 | 45.1M | { | 176 | 45.1M | if (is_empty()) | 177 | 44.0M | Base::append({}); | 178 | 45.1M | Base::last_chunk().append(forward<T>(value)); | 179 | 45.1M | } |
void regex::ByteCode::append<regex::CharRange>(regex::CharRange&&) Line | Count | Source | 175 | 9.33M | { | 176 | 9.33M | if (is_empty()) | 177 | 0 | Base::append({}); | 178 | 9.33M | Base::last_chunk().append(forward<T>(value)); | 179 | 9.33M | } |
void regex::ByteCode::append<int>(int&&) Line | Count | Source | 175 | 601k | { | 176 | 601k | if (is_empty()) | 177 | 0 | Base::append({}); | 178 | 601k | Base::last_chunk().append(forward<T>(value)); | 179 | 601k | } |
void regex::ByteCode::append<unsigned long&>(unsigned long&) Line | Count | Source | 175 | 37.7M | { | 176 | 37.7M | if (is_empty()) | 177 | 0 | Base::append({}); | 178 | 37.7M | Base::last_chunk().append(forward<T>(value)); | 179 | 37.7M | } |
|
180 | | template<typename T> |
181 | | void prepend(T&& value) |
182 | 925k | { |
183 | 925k | if (is_empty()) |
184 | 0 | return append(forward<T>(value)); |
185 | 925k | Base::first_chunk().prepend(forward<T>(value)); |
186 | 925k | } |
187 | | |
188 | | void append(Span<ByteCodeValueType const> value) |
189 | 346k | { |
190 | 346k | if (is_empty()) |
191 | 0 | Base::append({}); |
192 | 346k | auto& last = Base::last_chunk(); |
193 | 346k | last.ensure_capacity(value.size()); |
194 | 346k | for (auto v : value) |
195 | 22.0M | last.unchecked_append(v); |
196 | 346k | } |
197 | | |
198 | | void ensure_capacity(size_t capacity) |
199 | 347k | { |
200 | 347k | if (is_empty()) |
201 | 399 | Base::append({}); |
202 | 347k | Base::last_chunk().ensure_capacity(capacity); |
203 | 347k | } |
204 | | |
205 | | void last_chunk() const = delete; |
206 | | void first_chunk() const = delete; |
207 | | |
208 | | void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs) |
209 | 43.6M | { |
210 | 43.6M | Optimizer::append_character_class(*this, move(pairs)); |
211 | 43.6M | } |
212 | | |
213 | | void insert_bytecode_check_boundary(BoundaryCheckType type) |
214 | 20.3k | { |
215 | 20.3k | ByteCode bytecode; |
216 | 20.3k | bytecode.empend((ByteCodeValueType)OpCodeId::CheckBoundary); |
217 | 20.3k | bytecode.empend((ByteCodeValueType)type); |
218 | | |
219 | 20.3k | extend(move(bytecode)); |
220 | 20.3k | } |
221 | | |
222 | | void insert_bytecode_clear_capture_group(size_t index) |
223 | 11.0M | { |
224 | 11.0M | empend(static_cast<ByteCodeValueType>(OpCodeId::ClearCaptureGroup)); |
225 | 11.0M | empend(index); |
226 | 11.0M | } |
227 | | |
228 | | void insert_bytecode_compare_string(StringView view) |
229 | 1.25M | { |
230 | 1.25M | empend(static_cast<ByteCodeValueType>(OpCodeId::Compare)); |
231 | 1.25M | empend(static_cast<u64>(1)); // number of arguments |
232 | 1.25M | empend(2 + view.length()); // size of arguments |
233 | 1.25M | empend(static_cast<ByteCodeValueType>(CharacterCompareType::String)); |
234 | 1.25M | insert_string(view); |
235 | 1.25M | } |
236 | | |
237 | | void insert_bytecode_group_capture_left(size_t capture_groups_count) |
238 | 289k | { |
239 | 289k | empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftCaptureGroup)); |
240 | 289k | empend(capture_groups_count); |
241 | 289k | } |
242 | | |
243 | | void insert_bytecode_group_capture_right(size_t capture_groups_count) |
244 | 285k | { |
245 | 285k | empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightCaptureGroup)); |
246 | 285k | empend(capture_groups_count); |
247 | 285k | } |
248 | | |
249 | | void insert_bytecode_group_capture_right(size_t capture_groups_count, StringView name) |
250 | 1.23k | { |
251 | 1.23k | empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightNamedCaptureGroup)); |
252 | 1.23k | empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination())); |
253 | 1.23k | empend(name.length()); |
254 | 1.23k | empend(capture_groups_count); |
255 | 1.23k | } |
256 | | |
257 | | enum class LookAroundType { |
258 | | LookAhead, |
259 | | LookBehind, |
260 | | NegatedLookAhead, |
261 | | NegatedLookBehind, |
262 | | }; |
263 | | void insert_bytecode_lookaround(ByteCode&& lookaround_body, LookAroundType type, size_t match_length = 0) |
264 | 1.37k | { |
265 | | // FIXME: The save stack will grow infinitely with repeated failures |
266 | | // as we do not discard that on failure (we don't necessarily know how many to pop with the current architecture). |
267 | 1.37k | switch (type) { |
268 | 1 | case LookAroundType::LookAhead: { |
269 | | // SAVE |
270 | | // REGEXP BODY |
271 | | // RESTORE |
272 | 1 | empend((ByteCodeValueType)OpCodeId::Save); |
273 | 1 | extend(move(lookaround_body)); |
274 | 1 | empend((ByteCodeValueType)OpCodeId::Restore); |
275 | 1 | return; |
276 | 0 | } |
277 | 7 | case LookAroundType::NegatedLookAhead: { |
278 | | // JUMP _A |
279 | | // LABEL _L |
280 | | // REGEXP BODY |
281 | | // FAIL |
282 | | // LABEL _A |
283 | | // SAVE |
284 | | // FORKJUMP _L |
285 | | // RESTORE |
286 | 7 | auto body_length = lookaround_body.size(); |
287 | 7 | empend((ByteCodeValueType)OpCodeId::Jump); |
288 | 7 | empend((ByteCodeValueType)body_length + 1); // JUMP to label _A |
289 | 7 | extend(move(lookaround_body)); |
290 | 7 | empend((ByteCodeValueType)OpCodeId::FailForks); |
291 | 7 | empend((ByteCodeValueType)OpCodeId::Save); |
292 | 7 | empend((ByteCodeValueType)OpCodeId::ForkJump); |
293 | 7 | empend((ByteCodeValueType) - (body_length + 4)); // JUMP to label _L |
294 | 7 | empend((ByteCodeValueType)OpCodeId::Restore); |
295 | 7 | return; |
296 | 0 | } |
297 | 8 | case LookAroundType::LookBehind: |
298 | | // SAVE |
299 | | // GOBACK match_length(BODY) |
300 | | // REGEXP BODY |
301 | | // RESTORE |
302 | 8 | empend((ByteCodeValueType)OpCodeId::Save); |
303 | 8 | empend((ByteCodeValueType)OpCodeId::GoBack); |
304 | 8 | empend((ByteCodeValueType)match_length); |
305 | 8 | extend(move(lookaround_body)); |
306 | 8 | empend((ByteCodeValueType)OpCodeId::Restore); |
307 | 8 | return; |
308 | 1.36k | case LookAroundType::NegatedLookBehind: { |
309 | | // JUMP _A |
310 | | // LABEL _L |
311 | | // GOBACK match_length(BODY) |
312 | | // REGEXP BODY |
313 | | // FAIL |
314 | | // LABEL _A |
315 | | // SAVE |
316 | | // FORKJUMP _L |
317 | | // RESTORE |
318 | 1.36k | auto body_length = lookaround_body.size(); |
319 | 1.36k | empend((ByteCodeValueType)OpCodeId::Jump); |
320 | 1.36k | empend((ByteCodeValueType)body_length + 3); // JUMP to label _A |
321 | 1.36k | empend((ByteCodeValueType)OpCodeId::GoBack); |
322 | 1.36k | empend((ByteCodeValueType)match_length); |
323 | 1.36k | extend(move(lookaround_body)); |
324 | 1.36k | empend((ByteCodeValueType)OpCodeId::FailForks); |
325 | 1.36k | empend((ByteCodeValueType)OpCodeId::Save); |
326 | 1.36k | empend((ByteCodeValueType)OpCodeId::ForkJump); |
327 | 1.36k | empend((ByteCodeValueType) - (body_length + 6)); // JUMP to label _L |
328 | 1.36k | empend((ByteCodeValueType)OpCodeId::Restore); |
329 | 1.36k | return; |
330 | 0 | } |
331 | 1.37k | } |
332 | | |
333 | 0 | VERIFY_NOT_REACHED(); |
334 | 0 | } |
335 | | |
336 | | void insert_bytecode_alternation(ByteCode&& left, ByteCode&& right) |
337 | 1.18k | { |
338 | | |
339 | | // FORKJUMP _ALT |
340 | | // REGEXP ALT2 |
341 | | // JUMP _END |
342 | | // LABEL _ALT |
343 | | // REGEXP ALT1 |
344 | | // LABEL _END |
345 | | |
346 | | // Optimisation: Eliminate extra work by unifying common pre-and-postfix exprs. |
347 | 1.18k | Optimizer::append_alternation(*this, move(left), move(right)); |
348 | 1.18k | } |
349 | | |
350 | | template<Integral T> |
351 | | static void transform_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, T minimum, Optional<T> maximum, size_t min_repetition_mark_id, size_t max_repetition_mark_id, bool greedy = true) |
352 | 3.70k | { |
353 | 3.70k | if (!maximum.has_value()) { |
354 | 2.05k | if (minimum == 0) |
355 | 221 | return transform_bytecode_repetition_any(bytecode_to_repeat, greedy); |
356 | 1.83k | if (minimum == 1) |
357 | 65 | return transform_bytecode_repetition_min_one(bytecode_to_repeat, greedy); |
358 | 1.83k | } |
359 | | |
360 | 3.42k | ByteCode new_bytecode; |
361 | 3.42k | new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, min_repetition_mark_id); |
362 | | |
363 | 3.42k | if (maximum.has_value()) { |
364 | | // (REPEAT REGEXP MIN) |
365 | | // LABEL _MAX_LOOP | |
366 | | // FORK END | |
367 | | // REGEXP | |
368 | | // REPEAT _MAX_LOOP MAX-MIN | if max > min |
369 | | // FORK END | |
370 | | // REGEXP | |
371 | | // LABEL END | |
372 | | // RESET _MAX_LOOP | |
373 | 1.65k | auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump); |
374 | 1.65k | if (maximum.value() > minimum) { |
375 | 1.58k | new_bytecode.empend(jump_kind); |
376 | 1.58k | new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target. |
377 | 1.58k | auto pre_loop_fork_jump_index = new_bytecode.size(); |
378 | 1.58k | new_bytecode.extend(bytecode_to_repeat); |
379 | 1.58k | auto repetitions = maximum.value() - minimum; |
380 | 1.58k | auto fork_jump_address = new_bytecode.size(); |
381 | 1.58k | if (repetitions > 1) { |
382 | 1.52k | new_bytecode.empend((ByteCodeValueType)OpCodeId::Repeat); |
383 | 1.52k | new_bytecode.empend(bytecode_to_repeat.size() + 2); |
384 | 1.52k | new_bytecode.empend(static_cast<ByteCodeValueType>(repetitions - 1)); |
385 | 1.52k | new_bytecode.empend(max_repetition_mark_id); |
386 | 1.52k | new_bytecode.empend(jump_kind); |
387 | 1.52k | new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target. |
388 | 1.52k | auto post_loop_fork_jump_index = new_bytecode.size(); |
389 | 1.52k | new_bytecode.extend(bytecode_to_repeat); |
390 | 1.52k | fork_jump_address = new_bytecode.size(); |
391 | | |
392 | 1.52k | new_bytecode[post_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - post_loop_fork_jump_index); |
393 | | |
394 | 1.52k | new_bytecode.empend((ByteCodeValueType)OpCodeId::ResetRepeat); |
395 | 1.52k | new_bytecode.empend((ByteCodeValueType)max_repetition_mark_id); |
396 | 1.52k | } |
397 | 1.58k | new_bytecode[pre_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - pre_loop_fork_jump_index); |
398 | 1.58k | } |
399 | 1.77k | } else { |
400 | | // no maximum value set, repeat finding if possible: |
401 | | // (REPEAT REGEXP MIN) |
402 | | // LABEL _START |
403 | | // CHECKPOINT _C |
404 | | // REGEXP |
405 | | // JUMP_NONEMPTY _C _START FORK |
406 | | |
407 | | // Note: This is only safe because REPEAT will leave one iteration outside (see repetition_n) |
408 | 1.77k | auto checkpoint = s_next_checkpoint_serial_id++; |
409 | 1.77k | new_bytecode.insert(new_bytecode.size() - bytecode_to_repeat.size(), (ByteCodeValueType)OpCodeId::Checkpoint); |
410 | 1.77k | new_bytecode.insert(new_bytecode.size() - bytecode_to_repeat.size(), (ByteCodeValueType)checkpoint); |
411 | | |
412 | 1.77k | auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkJump : OpCodeId::ForkStay); |
413 | 1.77k | new_bytecode.empend((ByteCodeValueType)OpCodeId::JumpNonEmpty); |
414 | 1.77k | new_bytecode.empend(-bytecode_to_repeat.size() - 4 - 2); // Jump to the last iteration |
415 | 1.77k | new_bytecode.empend(checkpoint); // if _C is not empty. |
416 | 1.77k | new_bytecode.empend(jump_kind); |
417 | 1.77k | } |
418 | | |
419 | 3.42k | bytecode_to_repeat = move(new_bytecode); |
420 | 3.42k | } _ZN5regex8ByteCode37transform_bytecode_repetition_min_maxITkN2AK8Concepts8IntegralEjEEvRS0_T_NS2_8OptionalIS5_EEmmb Line | Count | Source | 352 | 592 | { | 353 | 592 | if (!maximum.has_value()) { | 354 | 52 | if (minimum == 0) | 355 | 12 | return transform_bytecode_repetition_any(bytecode_to_repeat, greedy); | 356 | 40 | if (minimum == 1) | 357 | 4 | return transform_bytecode_repetition_min_one(bytecode_to_repeat, greedy); | 358 | 40 | } | 359 | | | 360 | 576 | ByteCode new_bytecode; | 361 | 576 | new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, min_repetition_mark_id); | 362 | | | 363 | 576 | if (maximum.has_value()) { | 364 | | // (REPEAT REGEXP MIN) | 365 | | // LABEL _MAX_LOOP | | 366 | | // FORK END | | 367 | | // REGEXP | | 368 | | // REPEAT _MAX_LOOP MAX-MIN | if max > min | 369 | | // FORK END | | 370 | | // REGEXP | | 371 | | // LABEL END | | 372 | | // RESET _MAX_LOOP | | 373 | 540 | auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump); | 374 | 540 | if (maximum.value() > minimum) { | 375 | 539 | new_bytecode.empend(jump_kind); | 376 | 539 | new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target. | 377 | 539 | auto pre_loop_fork_jump_index = new_bytecode.size(); | 378 | 539 | new_bytecode.extend(bytecode_to_repeat); | 379 | 539 | auto repetitions = maximum.value() - minimum; | 380 | 539 | auto fork_jump_address = new_bytecode.size(); | 381 | 539 | if (repetitions > 1) { | 382 | 477 | new_bytecode.empend((ByteCodeValueType)OpCodeId::Repeat); | 383 | 477 | new_bytecode.empend(bytecode_to_repeat.size() + 2); | 384 | 477 | new_bytecode.empend(static_cast<ByteCodeValueType>(repetitions - 1)); | 385 | 477 | new_bytecode.empend(max_repetition_mark_id); | 386 | 477 | new_bytecode.empend(jump_kind); | 387 | 477 | new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target. | 388 | 477 | auto post_loop_fork_jump_index = new_bytecode.size(); | 389 | 477 | new_bytecode.extend(bytecode_to_repeat); | 390 | 477 | fork_jump_address = new_bytecode.size(); | 391 | | | 392 | 477 | new_bytecode[post_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - post_loop_fork_jump_index); | 393 | | | 394 | 477 | new_bytecode.empend((ByteCodeValueType)OpCodeId::ResetRepeat); | 395 | 477 | new_bytecode.empend((ByteCodeValueType)max_repetition_mark_id); | 396 | 477 | } | 397 | 539 | new_bytecode[pre_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - pre_loop_fork_jump_index); | 398 | 539 | } | 399 | 540 | } else { | 400 | | // no maximum value set, repeat finding if possible: | 401 | | // (REPEAT REGEXP MIN) | 402 | | // LABEL _START | 403 | | // CHECKPOINT _C | 404 | | // REGEXP | 405 | | // JUMP_NONEMPTY _C _START FORK | 406 | | | 407 | | // Note: This is only safe because REPEAT will leave one iteration outside (see repetition_n) | 408 | 36 | auto checkpoint = s_next_checkpoint_serial_id++; | 409 | 36 | new_bytecode.insert(new_bytecode.size() - bytecode_to_repeat.size(), (ByteCodeValueType)OpCodeId::Checkpoint); | 410 | 36 | new_bytecode.insert(new_bytecode.size() - bytecode_to_repeat.size(), (ByteCodeValueType)checkpoint); | 411 | | | 412 | 36 | auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkJump : OpCodeId::ForkStay); | 413 | 36 | new_bytecode.empend((ByteCodeValueType)OpCodeId::JumpNonEmpty); | 414 | 36 | new_bytecode.empend(-bytecode_to_repeat.size() - 4 - 2); // Jump to the last iteration | 415 | 36 | new_bytecode.empend(checkpoint); // if _C is not empty. | 416 | 36 | new_bytecode.empend(jump_kind); | 417 | 36 | } | 418 | | | 419 | 576 | bytecode_to_repeat = move(new_bytecode); | 420 | 576 | } |
_ZN5regex8ByteCode37transform_bytecode_repetition_min_maxITkN2AK8Concepts8IntegralEmEEvRS0_T_NS2_8OptionalIS5_EEmmb Line | Count | Source | 352 | 3.11k | { | 353 | 3.11k | if (!maximum.has_value()) { | 354 | 2.00k | if (minimum == 0) | 355 | 209 | return transform_bytecode_repetition_any(bytecode_to_repeat, greedy); | 356 | 1.79k | if (minimum == 1) | 357 | 61 | return transform_bytecode_repetition_min_one(bytecode_to_repeat, greedy); | 358 | 1.79k | } | 359 | | | 360 | 2.84k | ByteCode new_bytecode; | 361 | 2.84k | new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, min_repetition_mark_id); | 362 | | | 363 | 2.84k | if (maximum.has_value()) { | 364 | | // (REPEAT REGEXP MIN) | 365 | | // LABEL _MAX_LOOP | | 366 | | // FORK END | | 367 | | // REGEXP | | 368 | | // REPEAT _MAX_LOOP MAX-MIN | if max > min | 369 | | // FORK END | | 370 | | // REGEXP | | 371 | | // LABEL END | | 372 | | // RESET _MAX_LOOP | | 373 | 1.11k | auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump); | 374 | 1.11k | if (maximum.value() > minimum) { | 375 | 1.04k | new_bytecode.empend(jump_kind); | 376 | 1.04k | new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target. | 377 | 1.04k | auto pre_loop_fork_jump_index = new_bytecode.size(); | 378 | 1.04k | new_bytecode.extend(bytecode_to_repeat); | 379 | 1.04k | auto repetitions = maximum.value() - minimum; | 380 | 1.04k | auto fork_jump_address = new_bytecode.size(); | 381 | 1.04k | if (repetitions > 1) { | 382 | 1.04k | new_bytecode.empend((ByteCodeValueType)OpCodeId::Repeat); | 383 | 1.04k | new_bytecode.empend(bytecode_to_repeat.size() + 2); | 384 | 1.04k | new_bytecode.empend(static_cast<ByteCodeValueType>(repetitions - 1)); | 385 | 1.04k | new_bytecode.empend(max_repetition_mark_id); | 386 | 1.04k | new_bytecode.empend(jump_kind); | 387 | 1.04k | new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target. | 388 | 1.04k | auto post_loop_fork_jump_index = new_bytecode.size(); | 389 | 1.04k | new_bytecode.extend(bytecode_to_repeat); | 390 | 1.04k | fork_jump_address = new_bytecode.size(); | 391 | | | 392 | 1.04k | new_bytecode[post_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - post_loop_fork_jump_index); | 393 | | | 394 | 1.04k | new_bytecode.empend((ByteCodeValueType)OpCodeId::ResetRepeat); | 395 | 1.04k | new_bytecode.empend((ByteCodeValueType)max_repetition_mark_id); | 396 | 1.04k | } | 397 | 1.04k | new_bytecode[pre_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - pre_loop_fork_jump_index); | 398 | 1.04k | } | 399 | 1.73k | } else { | 400 | | // no maximum value set, repeat finding if possible: | 401 | | // (REPEAT REGEXP MIN) | 402 | | // LABEL _START | 403 | | // CHECKPOINT _C | 404 | | // REGEXP | 405 | | // JUMP_NONEMPTY _C _START FORK | 406 | | | 407 | | // Note: This is only safe because REPEAT will leave one iteration outside (see repetition_n) | 408 | 1.73k | auto checkpoint = s_next_checkpoint_serial_id++; | 409 | 1.73k | new_bytecode.insert(new_bytecode.size() - bytecode_to_repeat.size(), (ByteCodeValueType)OpCodeId::Checkpoint); | 410 | 1.73k | new_bytecode.insert(new_bytecode.size() - bytecode_to_repeat.size(), (ByteCodeValueType)checkpoint); | 411 | | | 412 | 1.73k | auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkJump : OpCodeId::ForkStay); | 413 | 1.73k | new_bytecode.empend((ByteCodeValueType)OpCodeId::JumpNonEmpty); | 414 | 1.73k | new_bytecode.empend(-bytecode_to_repeat.size() - 4 - 2); // Jump to the last iteration | 415 | 1.73k | new_bytecode.empend(checkpoint); // if _C is not empty. | 416 | 1.73k | new_bytecode.empend(jump_kind); | 417 | 1.73k | } | 418 | | | 419 | 2.84k | bytecode_to_repeat = move(new_bytecode); | 420 | 2.84k | } |
|
421 | | |
422 | | template<Integral T> |
423 | | void insert_bytecode_repetition_n(ByteCode& bytecode_to_repeat, T n, size_t repetition_mark_id) |
424 | 6.83k | { |
425 | | // LABEL _LOOP |
426 | | // REGEXP |
427 | | // REPEAT _LOOP N-1 |
428 | | // REGEXP |
429 | 6.83k | if (n == 0) |
430 | 663 | return; |
431 | | |
432 | | // Note: this bytecode layout allows callers to repeat the last REGEXP instruction without the |
433 | | // REPEAT instruction forcing another loop. |
434 | 6.17k | extend(bytecode_to_repeat); |
435 | | |
436 | 6.17k | if (n > 1) { |
437 | 5.99k | empend(static_cast<ByteCodeValueType>(OpCodeId::Repeat)); |
438 | 5.99k | empend(bytecode_to_repeat.size()); |
439 | 5.99k | empend(static_cast<ByteCodeValueType>(n - 1)); |
440 | 5.99k | empend(repetition_mark_id); |
441 | | |
442 | 5.99k | extend(bytecode_to_repeat); |
443 | 5.99k | } |
444 | 6.17k | } _ZN5regex8ByteCode28insert_bytecode_repetition_nITkN2AK8Concepts8IntegralEjEEvRS0_T_m Line | Count | Source | 424 | 3.98k | { | 425 | | // LABEL _LOOP | 426 | | // REGEXP | 427 | | // REPEAT _LOOP N-1 | 428 | | // REGEXP | 429 | 3.98k | if (n == 0) | 430 | 657 | return; | 431 | | | 432 | | // Note: this bytecode layout allows callers to repeat the last REGEXP instruction without the | 433 | | // REPEAT instruction forcing another loop. | 434 | 3.33k | extend(bytecode_to_repeat); | 435 | | | 436 | 3.33k | if (n > 1) { | 437 | 3.15k | empend(static_cast<ByteCodeValueType>(OpCodeId::Repeat)); | 438 | 3.15k | empend(bytecode_to_repeat.size()); | 439 | 3.15k | empend(static_cast<ByteCodeValueType>(n - 1)); | 440 | 3.15k | empend(repetition_mark_id); | 441 | | | 442 | 3.15k | extend(bytecode_to_repeat); | 443 | 3.15k | } | 444 | 3.33k | } |
_ZN5regex8ByteCode28insert_bytecode_repetition_nITkN2AK8Concepts8IntegralEmEEvRS0_T_m Line | Count | Source | 424 | 2.84k | { | 425 | | // LABEL _LOOP | 426 | | // REGEXP | 427 | | // REPEAT _LOOP N-1 | 428 | | // REGEXP | 429 | 2.84k | if (n == 0) | 430 | 6 | return; | 431 | | | 432 | | // Note: this bytecode layout allows callers to repeat the last REGEXP instruction without the | 433 | | // REPEAT instruction forcing another loop. | 434 | 2.84k | extend(bytecode_to_repeat); | 435 | | | 436 | 2.84k | if (n > 1) { | 437 | 2.84k | empend(static_cast<ByteCodeValueType>(OpCodeId::Repeat)); | 438 | 2.84k | empend(bytecode_to_repeat.size()); | 439 | 2.84k | empend(static_cast<ByteCodeValueType>(n - 1)); | 440 | 2.84k | empend(repetition_mark_id); | 441 | | | 442 | 2.84k | extend(bytecode_to_repeat); | 443 | 2.84k | } | 444 | 2.84k | } |
|
445 | | |
446 | | static void transform_bytecode_repetition_min_one(ByteCode& bytecode_to_repeat, bool greedy) |
447 | 462k | { |
448 | | // LABEL _START = -bytecode_to_repeat.size() |
449 | | // CHECKPOINT _C |
450 | | // REGEXP |
451 | | // JUMP_NONEMPTY _C _START FORKSTAY (FORKJUMP -> Greedy) |
452 | | |
453 | 462k | auto checkpoint = s_next_checkpoint_serial_id++; |
454 | 462k | bytecode_to_repeat.prepend((ByteCodeValueType)checkpoint); |
455 | 462k | bytecode_to_repeat.prepend((ByteCodeValueType)OpCodeId::Checkpoint); |
456 | | |
457 | 462k | bytecode_to_repeat.empend((ByteCodeValueType)OpCodeId::JumpNonEmpty); |
458 | 462k | bytecode_to_repeat.empend(-bytecode_to_repeat.size() - 3); // Jump to the _START label... |
459 | 462k | bytecode_to_repeat.empend(checkpoint); // ...if _C is not empty |
460 | | |
461 | 462k | if (greedy) |
462 | 462k | bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump)); |
463 | 8 | else |
464 | 8 | bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay)); |
465 | 462k | } |
466 | | |
467 | | static void transform_bytecode_repetition_any(ByteCode& bytecode_to_repeat, bool greedy) |
468 | 270k | { |
469 | | // LABEL _START |
470 | | // FORKJUMP _END (FORKSTAY -> Greedy) |
471 | | // CHECKPOINT _C |
472 | | // REGEXP |
473 | | // JUMP_NONEMPTY _C _START JUMP |
474 | | // LABEL _END |
475 | | |
476 | | // LABEL _START = m_bytes.size(); |
477 | 270k | ByteCode bytecode; |
478 | | |
479 | 270k | if (greedy) |
480 | 270k | bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay)); |
481 | 19 | else |
482 | 19 | bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump)); |
483 | | |
484 | 270k | bytecode.empend(bytecode_to_repeat.size() + 2 + 4); // Jump to the _END label |
485 | | |
486 | 270k | auto checkpoint = s_next_checkpoint_serial_id++; |
487 | 270k | bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Checkpoint)); |
488 | 270k | bytecode.empend(static_cast<ByteCodeValueType>(checkpoint)); |
489 | | |
490 | 270k | bytecode.extend(bytecode_to_repeat); |
491 | | |
492 | 270k | bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::JumpNonEmpty)); |
493 | 270k | bytecode.empend(-bytecode.size() - 3); // Jump(...) to the _START label... |
494 | 270k | bytecode.empend(checkpoint); // ...only if _C passes. |
495 | 270k | bytecode.empend((ByteCodeValueType)OpCodeId::Jump); |
496 | | // LABEL _END = bytecode.size() |
497 | | |
498 | 270k | bytecode_to_repeat = move(bytecode); |
499 | 270k | } |
500 | | |
501 | | static void transform_bytecode_repetition_zero_or_one(ByteCode& bytecode_to_repeat, bool greedy) |
502 | 1.25M | { |
503 | | // FORKJUMP _END (FORKSTAY -> Greedy) |
504 | | // REGEXP |
505 | | // LABEL _END |
506 | 1.25M | ByteCode bytecode; |
507 | | |
508 | 1.25M | if (greedy) |
509 | 1.21M | bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay)); |
510 | 32.4k | else |
511 | 32.4k | bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump)); |
512 | | |
513 | 1.25M | bytecode.empend(bytecode_to_repeat.size()); // Jump to the _END label |
514 | | |
515 | 1.25M | bytecode.extend(move(bytecode_to_repeat)); |
516 | | // LABEL _END = bytecode.size() |
517 | | |
518 | 1.25M | bytecode_to_repeat = move(bytecode); |
519 | 1.25M | } |
520 | | |
521 | | OpCode& get_opcode(MatchState& state) const; |
522 | | |
523 | 14.5k | static void reset_checkpoint_serial_id() { s_next_checkpoint_serial_id = 0; } |
524 | | |
525 | | private: |
526 | | void insert_string(StringView view) |
527 | 1.25M | { |
528 | 1.25M | empend((ByteCodeValueType)view.length()); |
529 | 91.3M | for (size_t i = 0; i < view.length(); ++i) |
530 | 90.0M | empend((ByteCodeValueType)view[i]); |
531 | 1.25M | } |
532 | | |
533 | | void ensure_opcodes_initialized(); |
534 | | ALWAYS_INLINE OpCode& get_opcode_by_id(OpCodeId id) const; |
535 | | static OwnPtr<OpCode> s_opcodes[(size_t)OpCodeId::Last + 1]; |
536 | | static bool s_opcodes_initialized; |
537 | | static size_t s_next_checkpoint_serial_id; |
538 | | }; |
539 | | |
540 | | #define ENUMERATE_EXECUTION_RESULTS \ |
541 | 0 | __ENUMERATE_EXECUTION_RESULT(Continue) \ |
542 | 0 | __ENUMERATE_EXECUTION_RESULT(Fork_PrioHigh) \ |
543 | 0 | __ENUMERATE_EXECUTION_RESULT(Fork_PrioLow) \ |
544 | 0 | __ENUMERATE_EXECUTION_RESULT(Failed) \ |
545 | 0 | __ENUMERATE_EXECUTION_RESULT(Failed_ExecuteLowPrioForks) \ |
546 | 0 | __ENUMERATE_EXECUTION_RESULT(Succeeded) |
547 | | |
548 | | enum class ExecutionResult : u8 { |
549 | | #define __ENUMERATE_EXECUTION_RESULT(x) x, |
550 | | ENUMERATE_EXECUTION_RESULTS |
551 | | #undef __ENUMERATE_EXECUTION_RESULT |
552 | | }; |
553 | | |
554 | | StringView execution_result_name(ExecutionResult result); |
555 | | StringView opcode_id_name(OpCodeId opcode_id); |
556 | | StringView boundary_check_type_name(BoundaryCheckType); |
557 | | StringView character_compare_type_name(CharacterCompareType result); |
558 | | StringView character_class_name(CharClass ch_class); |
559 | | |
560 | | class OpCode { |
561 | | public: |
562 | 132 | OpCode() = default; |
563 | 0 | virtual ~OpCode() = default; |
564 | | |
565 | | virtual OpCodeId opcode_id() const = 0; |
566 | | virtual size_t size() const = 0; |
567 | | virtual ExecutionResult execute(MatchInput const& input, MatchState& state) const = 0; |
568 | | |
569 | | ALWAYS_INLINE ByteCodeValueType argument(size_t offset) const |
570 | 587M | { |
571 | 587M | return m_bytecode->at(state().instruction_position + 1 + offset); |
572 | 587M | } |
573 | | |
574 | | ALWAYS_INLINE StringView name() const; |
575 | | static StringView name(OpCodeId); |
576 | | |
577 | 363M | ALWAYS_INLINE void set_state(MatchState& state) { m_state = &state; } |
578 | | |
579 | 363M | ALWAYS_INLINE void set_bytecode(ByteCode& bytecode) { m_bytecode = &bytecode; } |
580 | | |
581 | | ALWAYS_INLINE MatchState const& state() const |
582 | 608M | { |
583 | 608M | VERIFY(m_state); |
584 | 608M | return *m_state; |
585 | 608M | } |
586 | | |
587 | | ByteString to_byte_string() const |
588 | 0 | { |
589 | 0 | return ByteString::formatted("[{:#02X}] {}", (int)opcode_id(), name(opcode_id())); |
590 | 0 | } |
591 | | |
592 | | virtual ByteString arguments_string() const = 0; |
593 | | |
594 | 0 | ALWAYS_INLINE ByteCode const& bytecode() const { return *m_bytecode; } |
595 | | |
596 | | protected: |
597 | | ByteCode* m_bytecode { nullptr }; |
598 | | MatchState* m_state { nullptr }; |
599 | | }; |
600 | | |
601 | | class OpCode_Exit final : public OpCode { |
602 | | public: |
603 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
604 | 4.30k | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Exit; } |
605 | 770k | ALWAYS_INLINE size_t size() const override { return 1; } |
606 | 0 | ByteString arguments_string() const override { return ByteString::empty(); } |
607 | | }; |
608 | | |
609 | | class OpCode_FailForks final : public OpCode { |
610 | | public: |
611 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
612 | 22.6k | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::FailForks; } |
613 | 39.5k | ALWAYS_INLINE size_t size() const override { return 1; } |
614 | 0 | ByteString arguments_string() const override { return ByteString::empty(); } |
615 | | }; |
616 | | |
617 | | class OpCode_Save final : public OpCode { |
618 | | public: |
619 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
620 | 19.8k | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Save; } |
621 | 19.9k | ALWAYS_INLINE size_t size() const override { return 1; } |
622 | 0 | ByteString arguments_string() const override { return ByteString::empty(); } |
623 | | }; |
624 | | |
625 | | class OpCode_Restore final : public OpCode { |
626 | | public: |
627 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
628 | 19.8k | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Restore; } |
629 | 19.9k | ALWAYS_INLINE size_t size() const override { return 1; } |
630 | 0 | ByteString arguments_string() const override { return ByteString::empty(); } |
631 | | }; |
632 | | |
633 | | class OpCode_GoBack final : public OpCode { |
634 | | public: |
635 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
636 | 1.34k | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::GoBack; } |
637 | 1.41k | ALWAYS_INLINE size_t size() const override { return 2; } |
638 | 0 | ALWAYS_INLINE size_t count() const { return argument(0); } |
639 | 0 | ByteString arguments_string() const override { return ByteString::formatted("count={}", count()); } |
640 | | }; |
641 | | |
642 | | class OpCode_Jump final : public OpCode { |
643 | | public: |
644 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
645 | 2.47M | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Jump; } |
646 | 8.87M | ALWAYS_INLINE size_t size() const override { return 2; } |
647 | 3.83M | ALWAYS_INLINE ssize_t offset() const { return argument(0); } |
648 | | ByteString arguments_string() const override |
649 | 0 | { |
650 | 0 | return ByteString::formatted("offset={} [&{}]", offset(), state().instruction_position + size() + offset()); |
651 | 0 | } |
652 | | }; |
653 | | |
654 | | class OpCode_ForkJump : public OpCode { |
655 | | public: |
656 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
657 | 1.81M | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkJump; } |
658 | 3.54M | ALWAYS_INLINE size_t size() const override { return 2; } |
659 | 1.50M | ALWAYS_INLINE ssize_t offset() const { return argument(0); } |
660 | | ByteString arguments_string() const override |
661 | 0 | { |
662 | 0 | return ByteString::formatted("offset={} [&{}], sp: {}", offset(), state().instruction_position + size() + offset(), state().string_position); |
663 | 0 | } |
664 | | }; |
665 | | |
666 | | class OpCode_ForkReplaceJump final : public OpCode_ForkJump { |
667 | | public: |
668 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
669 | 0 | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkReplaceJump; } |
670 | | }; |
671 | | |
672 | | class OpCode_ForkStay : public OpCode { |
673 | | public: |
674 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
675 | 8.07M | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkStay; } |
676 | 156M | ALWAYS_INLINE size_t size() const override { return 2; } |
677 | 53.4M | ALWAYS_INLINE ssize_t offset() const { return argument(0); } |
678 | | ByteString arguments_string() const override |
679 | 0 | { |
680 | 0 | return ByteString::formatted("offset={} [&{}], sp: {}", offset(), state().instruction_position + size() + offset(), state().string_position); |
681 | 0 | } |
682 | | }; |
683 | | |
684 | | class OpCode_ForkReplaceStay final : public OpCode_ForkStay { |
685 | | public: |
686 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
687 | 0 | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkReplaceStay; } |
688 | | }; |
689 | | |
690 | | class OpCode_CheckBegin final : public OpCode { |
691 | | public: |
692 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
693 | 192k | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBegin; } |
694 | 5.76M | ALWAYS_INLINE size_t size() const override { return 1; } |
695 | 0 | ByteString arguments_string() const override { return ByteString::empty(); } |
696 | | }; |
697 | | |
698 | | class OpCode_CheckEnd final : public OpCode { |
699 | | public: |
700 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
701 | 107k | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckEnd; } |
702 | 3.99M | ALWAYS_INLINE size_t size() const override { return 1; } |
703 | 0 | ByteString arguments_string() const override { return ByteString::empty(); } |
704 | | }; |
705 | | |
706 | | class OpCode_CheckBoundary final : public OpCode { |
707 | | public: |
708 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
709 | 12.2k | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBoundary; } |
710 | 13.3k | ALWAYS_INLINE size_t size() const override { return 2; } |
711 | 0 | ALWAYS_INLINE size_t arguments_count() const { return 1; } |
712 | 0 | ALWAYS_INLINE BoundaryCheckType type() const { return static_cast<BoundaryCheckType>(argument(0)); } |
713 | 0 | ByteString arguments_string() const override { return ByteString::formatted("kind={} ({})", (long unsigned int)argument(0), boundary_check_type_name(type())); } |
714 | | }; |
715 | | |
716 | | class OpCode_ClearCaptureGroup final : public OpCode { |
717 | | public: |
718 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
719 | 11.2M | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ClearCaptureGroup; } |
720 | 14.3M | ALWAYS_INLINE size_t size() const override { return 2; } |
721 | 0 | ALWAYS_INLINE size_t id() const { return argument(0); } |
722 | 0 | ByteString arguments_string() const override { return ByteString::formatted("id={}", id()); } |
723 | | }; |
724 | | |
725 | | class OpCode_SaveLeftCaptureGroup final : public OpCode { |
726 | | public: |
727 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
728 | 796k | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftCaptureGroup; } |
729 | 16.4M | ALWAYS_INLINE size_t size() const override { return 2; } |
730 | 39.2M | ALWAYS_INLINE size_t id() const { return argument(0); } |
731 | 0 | ByteString arguments_string() const override { return ByteString::formatted("id={}", id()); } |
732 | | }; |
733 | | |
734 | | class OpCode_SaveRightCaptureGroup final : public OpCode { |
735 | | public: |
736 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
737 | 739k | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightCaptureGroup; } |
738 | 10.6M | ALWAYS_INLINE size_t size() const override { return 2; } |
739 | 9.48M | ALWAYS_INLINE size_t id() const { return argument(0); } |
740 | 0 | ByteString arguments_string() const override { return ByteString::formatted("id={}", id()); } |
741 | | }; |
742 | | |
743 | | class OpCode_SaveRightNamedCaptureGroup final : public OpCode { |
744 | | public: |
745 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
746 | 57 | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightNamedCaptureGroup; } |
747 | 49 | ALWAYS_INLINE size_t size() const override { return 4; } |
748 | 0 | ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; } |
749 | 0 | ALWAYS_INLINE size_t length() const { return argument(1); } |
750 | 0 | ALWAYS_INLINE size_t id() const { return argument(2); } |
751 | | ByteString arguments_string() const override |
752 | 0 | { |
753 | 0 | return ByteString::formatted("name={}, length={}", name(), length()); |
754 | 0 | } |
755 | | }; |
756 | | |
757 | | class OpCode_Compare final : public OpCode { |
758 | | public: |
759 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
760 | 76.4M | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Compare; } |
761 | 167M | ALWAYS_INLINE size_t size() const override { return arguments_size() + 3; } |
762 | 107M | ALWAYS_INLINE size_t arguments_count() const { return argument(0); } |
763 | 167M | ALWAYS_INLINE size_t arguments_size() const { return argument(1); } |
764 | | ByteString arguments_string() const override; |
765 | | Vector<ByteString> variable_arguments_to_byte_string(Optional<MatchInput const&> input = {}) const; |
766 | | Vector<CompareTypeAndValuePair> flat_compares() const; |
767 | | static bool matches_character_class(CharClass, u32, bool insensitive); |
768 | | |
769 | | private: |
770 | | ALWAYS_INLINE static void compare_char(MatchInput const& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched); |
771 | | ALWAYS_INLINE static bool compare_string(MatchInput const& input, MatchState& state, RegexStringView str, bool& had_zero_length_match); |
772 | | ALWAYS_INLINE static void compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched); |
773 | | ALWAYS_INLINE static void compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched); |
774 | | ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched); |
775 | | ALWAYS_INLINE static void compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched); |
776 | | ALWAYS_INLINE static void compare_script(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched); |
777 | | ALWAYS_INLINE static void compare_script_extension(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched); |
778 | | }; |
779 | | |
780 | | class OpCode_Repeat : public OpCode { |
781 | | public: |
782 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
783 | 1.12M | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Repeat; } |
784 | 2.82M | ALWAYS_INLINE size_t size() const override { return 4; } |
785 | 383k | ALWAYS_INLINE size_t offset() const { return argument(0); } |
786 | 3.79M | ALWAYS_INLINE u64 count() const { return argument(1); } |
787 | 5.66M | ALWAYS_INLINE size_t id() const { return argument(2); } |
788 | | ByteString arguments_string() const override |
789 | 0 | { |
790 | 0 | auto reps = id() < state().repetition_marks.size() ? state().repetition_marks.at(id()) : 0; |
791 | 0 | return ByteString::formatted("offset={} [&{}] count={} id={} rep={}, sp: {}", |
792 | 0 | static_cast<ssize_t>(offset()), |
793 | 0 | state().instruction_position - offset(), |
794 | 0 | count() + 1, |
795 | 0 | id(), |
796 | 0 | reps + 1, |
797 | 0 | state().string_position); |
798 | 0 | } |
799 | | }; |
800 | | |
801 | | class OpCode_ResetRepeat : public OpCode { |
802 | | public: |
803 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
804 | 170k | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ResetRepeat; } |
805 | 5.33M | ALWAYS_INLINE size_t size() const override { return 2; } |
806 | 15.0M | ALWAYS_INLINE size_t id() const { return argument(0); } |
807 | | ByteString arguments_string() const override |
808 | 0 | { |
809 | 0 | auto reps = id() < state().repetition_marks.size() ? state().repetition_marks.at(id()) : 0; |
810 | 0 | return ByteString::formatted("id={} rep={}", id(), reps + 1); |
811 | 0 | } |
812 | | }; |
813 | | |
814 | | class OpCode_Checkpoint final : public OpCode { |
815 | | public: |
816 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
817 | 1.39M | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Checkpoint; } |
818 | 53.4M | ALWAYS_INLINE size_t size() const override { return 2; } |
819 | 51.0M | ALWAYS_INLINE size_t id() const { return argument(0); } |
820 | 0 | ByteString arguments_string() const override { return ByteString::formatted("id={}", id()); } |
821 | | }; |
822 | | |
823 | | class OpCode_JumpNonEmpty final : public OpCode { |
824 | | public: |
825 | | ExecutionResult execute(MatchInput const& input, MatchState& state) const override; |
826 | 1.95M | ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::JumpNonEmpty; } |
827 | 56.9M | ALWAYS_INLINE size_t size() const override { return 4; } |
828 | 43.8M | ALWAYS_INLINE ssize_t offset() const { return argument(0); } |
829 | 42.1M | ALWAYS_INLINE ssize_t checkpoint() const { return argument(1); } |
830 | 42.9M | ALWAYS_INLINE OpCodeId form() const { return (OpCodeId)argument(2); } |
831 | | ByteString arguments_string() const override |
832 | 0 | { |
833 | 0 | return ByteString::formatted("{} offset={} [&{}], cp={}", |
834 | 0 | opcode_id_name(form()), |
835 | 0 | offset(), state().instruction_position + size() + offset(), |
836 | 0 | checkpoint()); |
837 | 0 | } |
838 | | }; |
839 | | |
840 | | ALWAYS_INLINE OpCode& ByteCode::get_opcode(regex::MatchState& state) const |
841 | 363M | { |
842 | 363M | OpCodeId opcode_id; |
843 | 363M | if (auto opcode_ptr = static_cast<DisjointChunks<ByteCodeValueType> const&>(*this).find(state.instruction_position)) |
844 | 363M | opcode_id = (OpCodeId)*opcode_ptr; |
845 | 772k | else |
846 | 772k | opcode_id = OpCodeId::Exit; |
847 | | |
848 | 363M | auto& opcode = get_opcode_by_id(opcode_id); |
849 | 363M | opcode.set_state(state); |
850 | 363M | return opcode; |
851 | 363M | } |
852 | | |
853 | | ALWAYS_INLINE OpCode& ByteCode::get_opcode_by_id(OpCodeId id) const |
854 | 363M | { |
855 | 363M | VERIFY(id >= OpCodeId::First && id <= OpCodeId::Last); |
856 | | |
857 | 363M | auto& opcode = s_opcodes[(u32)id]; |
858 | 363M | opcode->set_bytecode(*const_cast<ByteCode*>(this)); |
859 | 363M | return *opcode; |
860 | 363M | } |
861 | | |
862 | | template<typename T> |
863 | | bool is(OpCode const&); |
864 | | |
865 | | template<typename T> |
866 | | ALWAYS_INLINE bool is(OpCode const&) |
867 | | { |
868 | | return false; |
869 | | } |
870 | | |
871 | | template<typename T> |
872 | | ALWAYS_INLINE bool is(OpCode const* opcode) |
873 | | { |
874 | | return is<T>(*opcode); |
875 | | } |
876 | | |
877 | | template<> |
878 | | ALWAYS_INLINE bool is<OpCode_ForkStay>(OpCode const& opcode) |
879 | 0 | { |
880 | 0 | return opcode.opcode_id() == OpCodeId::ForkStay; |
881 | 0 | } |
882 | | |
883 | | template<> |
884 | | ALWAYS_INLINE bool is<OpCode_Exit>(OpCode const& opcode) |
885 | 0 | { |
886 | 0 | return opcode.opcode_id() == OpCodeId::Exit; |
887 | 0 | } |
888 | | |
889 | | template<> |
890 | | ALWAYS_INLINE bool is<OpCode_Compare>(OpCode const& opcode) |
891 | 0 | { |
892 | 0 | return opcode.opcode_id() == OpCodeId::Compare; |
893 | 0 | } |
894 | | |
895 | | template<typename T> |
896 | | ALWAYS_INLINE T const& to(OpCode const& opcode) |
897 | 0 | { |
898 | 0 | return verify_cast<T>(opcode); |
899 | 0 | } |
900 | | |
901 | | template<typename T> |
902 | | ALWAYS_INLINE T* to(OpCode* opcode) |
903 | | { |
904 | | return verify_cast<T>(opcode); |
905 | | } |
906 | | |
907 | | template<typename T> |
908 | | ALWAYS_INLINE T const* to(OpCode const* opcode) |
909 | | { |
910 | | return verify_cast<T>(opcode); |
911 | | } |
912 | | |
913 | | template<typename T> |
914 | | ALWAYS_INLINE T& to(OpCode& opcode) |
915 | 0 | { |
916 | 0 | return verify_cast<T>(opcode); |
917 | 0 | } |
918 | | |
919 | | } |