/src/serenity/Userland/Libraries/LibRegex/RegexByteCode.cpp
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #include "RegexByteCode.h" |
8 | | #include "RegexDebug.h" |
9 | | #include <AK/BinarySearch.h> |
10 | | #include <AK/CharacterTypes.h> |
11 | | #include <AK/StringBuilder.h> |
12 | | #include <LibUnicode/CharacterTypes.h> |
13 | | |
14 | | // U+2028 LINE SEPARATOR |
15 | | constexpr static u32 const LineSeparator { 0x2028 }; |
16 | | // U+2029 PARAGRAPH SEPARATOR |
17 | | constexpr static u32 const ParagraphSeparator { 0x2029 }; |
18 | | |
19 | | namespace regex { |
20 | | |
21 | | StringView OpCode::name(OpCodeId opcode_id) |
22 | 0 | { |
23 | 0 | switch (opcode_id) { |
24 | 0 | #define __ENUMERATE_OPCODE(x) \ |
25 | 0 | case OpCodeId::x: \ |
26 | 0 | return #x##sv; |
27 | 0 | ENUMERATE_OPCODES |
28 | 0 | #undef __ENUMERATE_OPCODE |
29 | 0 | default: |
30 | 0 | VERIFY_NOT_REACHED(); |
31 | 0 | return "<Unknown>"sv; |
32 | 0 | } |
33 | 0 | } |
34 | | |
35 | | StringView OpCode::name() const |
36 | 0 | { |
37 | 0 | return name(opcode_id()); |
38 | 0 | } |
39 | | |
40 | | StringView execution_result_name(ExecutionResult result) |
41 | 0 | { |
42 | 0 | switch (result) { |
43 | 0 | #define __ENUMERATE_EXECUTION_RESULT(x) \ |
44 | 0 | case ExecutionResult::x: \ |
45 | 0 | return #x##sv; |
46 | 0 | ENUMERATE_EXECUTION_RESULTS |
47 | 0 | #undef __ENUMERATE_EXECUTION_RESULT |
48 | 0 | default: |
49 | 0 | VERIFY_NOT_REACHED(); |
50 | 0 | return "<Unknown>"sv; |
51 | 0 | } |
52 | 0 | } |
53 | | |
54 | | StringView opcode_id_name(OpCodeId opcode) |
55 | 0 | { |
56 | 0 | switch (opcode) { |
57 | 0 | #define __ENUMERATE_OPCODE(x) \ |
58 | 0 | case OpCodeId::x: \ |
59 | 0 | return #x##sv; |
60 | | |
61 | 0 | ENUMERATE_OPCODES |
62 | | |
63 | 0 | #undef __ENUMERATE_OPCODE |
64 | 0 | default: |
65 | 0 | VERIFY_NOT_REACHED(); |
66 | 0 | return "<Unknown>"sv; |
67 | 0 | } |
68 | 0 | } |
69 | | |
70 | | StringView boundary_check_type_name(BoundaryCheckType ty) |
71 | 0 | { |
72 | 0 | switch (ty) { |
73 | 0 | #define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) \ |
74 | 0 | case BoundaryCheckType::x: \ |
75 | 0 | return #x##sv; |
76 | 0 | ENUMERATE_BOUNDARY_CHECK_TYPES |
77 | 0 | #undef __ENUMERATE_BOUNDARY_CHECK_TYPE |
78 | 0 | default: |
79 | 0 | VERIFY_NOT_REACHED(); |
80 | 0 | return "<Unknown>"sv; |
81 | 0 | } |
82 | 0 | } |
83 | | |
84 | | StringView character_compare_type_name(CharacterCompareType ch_compare_type) |
85 | 0 | { |
86 | 0 | switch (ch_compare_type) { |
87 | 0 | #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) \ |
88 | 0 | case CharacterCompareType::x: \ |
89 | 0 | return #x##sv; |
90 | 0 | ENUMERATE_CHARACTER_COMPARE_TYPES |
91 | 0 | #undef __ENUMERATE_CHARACTER_COMPARE_TYPE |
92 | 0 | default: |
93 | 0 | VERIFY_NOT_REACHED(); |
94 | 0 | return "<Unknown>"sv; |
95 | 0 | } |
96 | 0 | } |
97 | | |
98 | | StringView character_class_name(CharClass ch_class) |
99 | 0 | { |
100 | 0 | switch (ch_class) { |
101 | 0 | #define __ENUMERATE_CHARACTER_CLASS(x) \ |
102 | 0 | case CharClass::x: \ |
103 | 0 | return #x##sv; |
104 | 0 | ENUMERATE_CHARACTER_CLASSES |
105 | 0 | #undef __ENUMERATE_CHARACTER_CLASS |
106 | 0 | default: |
107 | 0 | VERIFY_NOT_REACHED(); |
108 | 0 | return "<Unknown>"sv; |
109 | 0 | } |
110 | 0 | } |
111 | | |
112 | | static void advance_string_position(MatchState& state, RegexStringView view, Optional<u32> code_point = {}) |
113 | 59.6M | { |
114 | 59.6M | ++state.string_position; |
115 | | |
116 | 59.6M | if (view.unicode()) { |
117 | 0 | if (!code_point.has_value() && (state.string_position_in_code_units < view.length_in_code_units())) |
118 | 0 | code_point = view[state.string_position_in_code_units]; |
119 | 0 | if (code_point.has_value()) |
120 | 0 | state.string_position_in_code_units += view.length_of_code_point(*code_point); |
121 | 59.6M | } else { |
122 | 59.6M | ++state.string_position_in_code_units; |
123 | 59.6M | } |
124 | 59.6M | } |
125 | | |
126 | | static void advance_string_position(MatchState& state, RegexStringView, RegexStringView advance_by) |
127 | 0 | { |
128 | 0 | state.string_position += advance_by.length(); |
129 | 0 | state.string_position_in_code_units += advance_by.length_in_code_units(); |
130 | 0 | } |
131 | | |
132 | | static void reverse_string_position(MatchState& state, RegexStringView view, size_t amount) |
133 | 0 | { |
134 | 0 | VERIFY(state.string_position >= amount); |
135 | 0 | state.string_position -= amount; |
136 | |
|
137 | 0 | if (view.unicode()) |
138 | 0 | state.string_position_in_code_units = view.code_unit_offset_of(state.string_position); |
139 | 0 | else |
140 | 0 | state.string_position_in_code_units -= amount; |
141 | 0 | } |
142 | | |
143 | | static void save_string_position(MatchInput const& input, MatchState const& state) |
144 | 0 | { |
145 | 0 | input.saved_positions.append(state.string_position); |
146 | 0 | input.saved_forks_since_last_save.append(state.forks_since_last_save); |
147 | 0 | input.saved_code_unit_positions.append(state.string_position_in_code_units); |
148 | 0 | } |
149 | | |
150 | | static bool restore_string_position(MatchInput const& input, MatchState& state) |
151 | 0 | { |
152 | 0 | if (input.saved_positions.is_empty()) |
153 | 0 | return false; |
154 | | |
155 | 0 | state.string_position = input.saved_positions.take_last(); |
156 | 0 | state.string_position_in_code_units = input.saved_code_unit_positions.take_last(); |
157 | 0 | state.forks_since_last_save = input.saved_forks_since_last_save.take_last(); |
158 | 0 | return true; |
159 | 0 | } |
160 | | |
161 | | OwnPtr<OpCode> ByteCode::s_opcodes[(size_t)OpCodeId::Last + 1]; |
162 | | bool ByteCode::s_opcodes_initialized { false }; |
163 | | size_t ByteCode::s_next_checkpoint_serial_id { 0 }; |
164 | | |
165 | | void ByteCode::ensure_opcodes_initialized() |
166 | 78.8M | { |
167 | 78.8M | if (s_opcodes_initialized) |
168 | 78.8M | return; |
169 | 138 | for (u32 i = (u32)OpCodeId::First; i <= (u32)OpCodeId::Last; ++i) { |
170 | 132 | switch ((OpCodeId)i) { |
171 | 0 | #define __ENUMERATE_OPCODE(OpCode) \ |
172 | 132 | case OpCodeId::OpCode: \ |
173 | 132 | s_opcodes[i] = make<OpCode_##OpCode>(); \ |
174 | 132 | break; |
175 | | |
176 | 132 | ENUMERATE_OPCODES |
177 | | |
178 | 132 | #undef __ENUMERATE_OPCODE |
179 | 132 | } |
180 | 132 | } |
181 | 6 | s_opcodes_initialized = true; |
182 | 6 | } |
183 | | |
184 | | ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(MatchInput const& input, MatchState& state) const |
185 | 1.01M | { |
186 | 1.01M | if (state.string_position > input.view.length() || state.instruction_position >= m_bytecode->size()) |
187 | 1.01M | return ExecutionResult::Succeeded; |
188 | | |
189 | 0 | return ExecutionResult::Failed; |
190 | 1.01M | } |
191 | | |
192 | | ALWAYS_INLINE ExecutionResult OpCode_Save::execute(MatchInput const& input, MatchState& state) const |
193 | 0 | { |
194 | 0 | save_string_position(input, state); |
195 | 0 | state.forks_since_last_save = 0; |
196 | 0 | return ExecutionResult::Continue; |
197 | 0 | } |
198 | | |
199 | | ALWAYS_INLINE ExecutionResult OpCode_Restore::execute(MatchInput const& input, MatchState& state) const |
200 | 0 | { |
201 | 0 | if (!restore_string_position(input, state)) |
202 | 0 | return ExecutionResult::Failed; |
203 | 0 | return ExecutionResult::Continue; |
204 | 0 | } |
205 | | |
206 | | ALWAYS_INLINE ExecutionResult OpCode_GoBack::execute(MatchInput const& input, MatchState& state) const |
207 | 0 | { |
208 | 0 | if (count() > state.string_position) |
209 | 0 | return ExecutionResult::Failed_ExecuteLowPrioForks; |
210 | | |
211 | 0 | reverse_string_position(state, input.view, count()); |
212 | 0 | return ExecutionResult::Continue; |
213 | 0 | } |
214 | | |
215 | | ALWAYS_INLINE ExecutionResult OpCode_FailForks::execute(MatchInput const& input, MatchState& state) const |
216 | 0 | { |
217 | 0 | input.fail_counter += state.forks_since_last_save; |
218 | 0 | return ExecutionResult::Failed_ExecuteLowPrioForks; |
219 | 0 | } |
220 | | |
221 | | ALWAYS_INLINE ExecutionResult OpCode_Jump::execute(MatchInput const&, MatchState& state) const |
222 | 0 | { |
223 | 0 | state.instruction_position += offset(); |
224 | 0 | return ExecutionResult::Continue; |
225 | 0 | } |
226 | | |
227 | | ALWAYS_INLINE ExecutionResult OpCode_ForkJump::execute(MatchInput const&, MatchState& state) const |
228 | 0 | { |
229 | 0 | state.fork_at_position = state.instruction_position + size() + offset(); |
230 | 0 | state.forks_since_last_save++; |
231 | 0 | return ExecutionResult::Fork_PrioHigh; |
232 | 0 | } |
233 | | |
234 | | ALWAYS_INLINE ExecutionResult OpCode_ForkReplaceJump::execute(MatchInput const& input, MatchState& state) const |
235 | 0 | { |
236 | 0 | state.fork_at_position = state.instruction_position + size() + offset(); |
237 | 0 | input.fork_to_replace = state.instruction_position; |
238 | 0 | state.forks_since_last_save++; |
239 | 0 | return ExecutionResult::Fork_PrioHigh; |
240 | 0 | } |
241 | | |
242 | | ALWAYS_INLINE ExecutionResult OpCode_ForkStay::execute(MatchInput const&, MatchState& state) const |
243 | 52.7M | { |
244 | 52.7M | state.fork_at_position = state.instruction_position + size() + offset(); |
245 | 52.7M | state.forks_since_last_save++; |
246 | 52.7M | return ExecutionResult::Fork_PrioLow; |
247 | 52.7M | } |
248 | | |
249 | | ALWAYS_INLINE ExecutionResult OpCode_ForkReplaceStay::execute(MatchInput const& input, MatchState& state) const |
250 | 12.5k | { |
251 | 12.5k | state.fork_at_position = state.instruction_position + size() + offset(); |
252 | 12.5k | input.fork_to_replace = state.instruction_position; |
253 | 12.5k | return ExecutionResult::Fork_PrioLow; |
254 | 12.5k | } |
255 | | |
256 | | ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(MatchInput const& input, MatchState& state) const |
257 | 4.82M | { |
258 | 4.82M | auto is_at_line_boundary = [&] { |
259 | 4.82M | if (state.string_position == 0) |
260 | 4.82M | return true; |
261 | | |
262 | 0 | if (input.regex_options.has_flag_set(AllFlags::Multiline) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline)) { |
263 | 0 | auto input_view = input.view.substring_view(state.string_position - 1, 1)[0]; |
264 | 0 | return input_view == '\r' || input_view == '\n' || input_view == LineSeparator || input_view == ParagraphSeparator; |
265 | 0 | } |
266 | | |
267 | 0 | return false; |
268 | 0 | }(); |
269 | 4.82M | if (is_at_line_boundary && (input.regex_options & AllFlags::MatchNotBeginOfLine)) |
270 | 0 | return ExecutionResult::Failed_ExecuteLowPrioForks; |
271 | | |
272 | 4.82M | if ((is_at_line_boundary && !(input.regex_options & AllFlags::MatchNotBeginOfLine)) |
273 | 0 | || (!is_at_line_boundary && (input.regex_options & AllFlags::MatchNotBeginOfLine)) |
274 | 0 | || (is_at_line_boundary && (input.regex_options & AllFlags::Global))) |
275 | 4.82M | return ExecutionResult::Continue; |
276 | | |
277 | 0 | return ExecutionResult::Failed_ExecuteLowPrioForks; |
278 | 4.82M | } |
279 | | |
280 | | ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(MatchInput const& input, MatchState& state) const |
281 | 0 | { |
282 | 0 | auto isword = [](auto ch) { return is_ascii_alphanumeric(ch) || ch == '_'; }; |
283 | 0 | auto is_word_boundary = [&] { |
284 | 0 | if (state.string_position == input.view.length()) { |
285 | 0 | return (state.string_position > 0 && isword(input.view[state.string_position_in_code_units - 1])); |
286 | 0 | } |
287 | | |
288 | 0 | if (state.string_position == 0) { |
289 | 0 | return (isword(input.view[0])); |
290 | 0 | } |
291 | | |
292 | 0 | return !!(isword(input.view[state.string_position_in_code_units]) ^ isword(input.view[state.string_position_in_code_units - 1])); |
293 | 0 | }; |
294 | 0 | switch (type()) { |
295 | 0 | case BoundaryCheckType::Word: { |
296 | 0 | if (is_word_boundary()) |
297 | 0 | return ExecutionResult::Continue; |
298 | 0 | return ExecutionResult::Failed_ExecuteLowPrioForks; |
299 | 0 | } |
300 | 0 | case BoundaryCheckType::NonWord: { |
301 | 0 | if (!is_word_boundary()) |
302 | 0 | return ExecutionResult::Continue; |
303 | 0 | return ExecutionResult::Failed_ExecuteLowPrioForks; |
304 | 0 | } |
305 | 0 | } |
306 | 0 | VERIFY_NOT_REACHED(); |
307 | 0 | } |
308 | | |
309 | | ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(MatchInput const& input, MatchState& state) const |
310 | 4.04M | { |
311 | 4.04M | auto is_at_line_boundary = [&] { |
312 | 4.04M | if (state.string_position == input.view.length()) |
313 | 1.01M | return true; |
314 | | |
315 | 3.02M | if (input.regex_options.has_flag_set(AllFlags::Multiline) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline)) { |
316 | 0 | auto input_view = input.view.substring_view(state.string_position, 1)[0]; |
317 | 0 | return input_view == '\r' || input_view == '\n' || input_view == LineSeparator || input_view == ParagraphSeparator; |
318 | 0 | } |
319 | | |
320 | 3.02M | return false; |
321 | 3.02M | }(); |
322 | 4.04M | if (is_at_line_boundary && (input.regex_options & AllFlags::MatchNotEndOfLine)) |
323 | 0 | return ExecutionResult::Failed_ExecuteLowPrioForks; |
324 | | |
325 | 4.04M | if ((is_at_line_boundary && !(input.regex_options & AllFlags::MatchNotEndOfLine)) |
326 | 3.02M | || (!is_at_line_boundary && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine))) |
327 | 1.01M | return ExecutionResult::Continue; |
328 | | |
329 | 3.02M | return ExecutionResult::Failed_ExecuteLowPrioForks; |
330 | 4.04M | } |
331 | | |
332 | | ALWAYS_INLINE ExecutionResult OpCode_ClearCaptureGroup::execute(MatchInput const& input, MatchState& state) const |
333 | 2.63M | { |
334 | 2.63M | if (input.match_index < state.capture_group_matches.size()) { |
335 | 0 | auto& group = state.capture_group_matches.mutable_at(input.match_index); |
336 | 0 | auto group_id = id(); |
337 | 0 | if (group_id >= group.size()) |
338 | 0 | group.resize(group_id + 1); |
339 | |
|
340 | 0 | group[group_id].reset(); |
341 | 0 | } |
342 | 2.63M | return ExecutionResult::Continue; |
343 | 2.63M | } |
344 | | |
345 | | ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(MatchInput const& input, MatchState& state) const |
346 | 16.6M | { |
347 | 16.6M | if (input.match_index >= state.capture_group_matches.size()) { |
348 | 4.92M | state.capture_group_matches.ensure_capacity(input.match_index); |
349 | 4.92M | auto capacity = state.capture_group_matches.capacity(); |
350 | 9.84M | for (size_t i = state.capture_group_matches.size(); i <= capacity; ++i) |
351 | 4.92M | state.capture_group_matches.empend(); |
352 | 4.92M | } |
353 | | |
354 | 16.6M | if (id() >= state.capture_group_matches.at(input.match_index).size()) { |
355 | 8.91M | state.capture_group_matches.mutable_at(input.match_index).ensure_capacity(id()); |
356 | 8.91M | auto capacity = state.capture_group_matches.at(input.match_index).capacity(); |
357 | 33.7M | for (size_t i = state.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i) |
358 | 24.8M | state.capture_group_matches.mutable_at(input.match_index).empend(); |
359 | 8.91M | } |
360 | | |
361 | 16.6M | state.capture_group_matches.mutable_at(input.match_index).at(id()).left_column = state.string_position; |
362 | 16.6M | return ExecutionResult::Continue; |
363 | 16.6M | } |
364 | | |
365 | | ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(MatchInput const& input, MatchState& state) const |
366 | 10.5M | { |
367 | 10.5M | auto& match = state.capture_group_matches.mutable_at(input.match_index).at(id()); |
368 | 10.5M | auto start_position = match.left_column; |
369 | 10.5M | if (state.string_position < start_position) { |
370 | 0 | dbgln("Right capture group {} is before left capture group {}!", state.string_position, start_position); |
371 | 0 | return ExecutionResult::Failed_ExecuteLowPrioForks; |
372 | 0 | } |
373 | | |
374 | 10.5M | auto length = state.string_position - start_position; |
375 | | |
376 | 10.5M | if (start_position < match.column) |
377 | 0 | return ExecutionResult::Continue; |
378 | | |
379 | 10.5M | VERIFY(start_position + length <= input.view.length()); |
380 | | |
381 | 10.5M | auto view = input.view.substring_view(start_position, length); |
382 | | |
383 | 10.5M | if (input.regex_options & AllFlags::StringCopyMatches) { |
384 | 0 | match = { view.to_byte_string(), input.line, start_position, input.global_offset + start_position }; // create a copy of the original string |
385 | 10.5M | } else { |
386 | 10.5M | match = { view, input.line, start_position, input.global_offset + start_position }; // take view to original string |
387 | 10.5M | } |
388 | | |
389 | 10.5M | return ExecutionResult::Continue; |
390 | 10.5M | } |
391 | | |
392 | | ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(MatchInput const& input, MatchState& state) const |
393 | 0 | { |
394 | 0 | auto& match = state.capture_group_matches.mutable_at(input.match_index).at(id()); |
395 | 0 | auto start_position = match.left_column; |
396 | 0 | if (state.string_position < start_position) |
397 | 0 | return ExecutionResult::Failed_ExecuteLowPrioForks; |
398 | | |
399 | 0 | auto length = state.string_position - start_position; |
400 | |
|
401 | 0 | if (start_position < match.column) |
402 | 0 | return ExecutionResult::Continue; |
403 | | |
404 | 0 | VERIFY(start_position + length <= input.view.length()); |
405 | | |
406 | 0 | auto view = input.view.substring_view(start_position, length); |
407 | |
|
408 | 0 | if (input.regex_options & AllFlags::StringCopyMatches) { |
409 | 0 | match = { view.to_byte_string(), name(), input.line, start_position, input.global_offset + start_position }; // create a copy of the original string |
410 | 0 | } else { |
411 | 0 | match = { view, name(), input.line, start_position, input.global_offset + start_position }; // take view to original string |
412 | 0 | } |
413 | |
|
414 | 0 | return ExecutionResult::Continue; |
415 | 0 | } |
416 | | |
417 | | ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, MatchState& state) const |
418 | 69.5M | { |
419 | 69.5M | auto argument_count = arguments_count(); |
420 | 69.5M | auto has_single_argument = argument_count == 1; |
421 | | |
422 | 69.5M | bool inverse { false }; |
423 | 69.5M | bool temporary_inverse { false }; |
424 | 69.5M | bool reset_temp_inverse { false }; |
425 | 69.5M | struct DisjunctionState { |
426 | 69.5M | bool active { false }; |
427 | 69.5M | bool is_conjunction { false }; |
428 | 69.5M | bool fail { false }; |
429 | 69.5M | bool inverse_matched { false }; |
430 | 69.5M | size_t initial_position; |
431 | 69.5M | size_t initial_code_unit_position; |
432 | 69.5M | Optional<size_t> last_accepted_position {}; |
433 | 69.5M | Optional<size_t> last_accepted_code_unit_position {}; |
434 | 69.5M | }; |
435 | | |
436 | 69.5M | Vector<DisjunctionState, 4> disjunction_states; |
437 | 69.5M | disjunction_states.empend(); |
438 | | |
439 | 99.0M | auto current_disjunction_state = [&]() -> DisjunctionState& { return disjunction_states.last(); }; |
440 | | |
441 | 249M | auto current_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; }; |
442 | | |
443 | 69.5M | size_t string_position = state.string_position; |
444 | 69.5M | bool inverse_matched { false }; |
445 | 69.5M | bool had_zero_length_match { false }; |
446 | | |
447 | 69.5M | state.string_position_before_match = state.string_position; |
448 | | |
449 | 69.5M | size_t offset { state.instruction_position + 3 }; |
450 | 182M | for (size_t i = 0; i < argument_count; ++i) { |
451 | 118M | if (state.string_position > string_position) |
452 | 0 | break; |
453 | | |
454 | 118M | if (reset_temp_inverse) { |
455 | 24.3M | reset_temp_inverse = false; |
456 | 24.3M | temporary_inverse = false; |
457 | 93.9M | } else { |
458 | 93.9M | reset_temp_inverse = true; |
459 | 93.9M | } |
460 | | |
461 | 118M | auto compare_type = (CharacterCompareType)m_bytecode->at(offset++); |
462 | | |
463 | 118M | switch (compare_type) { |
464 | 12.6M | case CharacterCompareType::Inverse: |
465 | 12.6M | inverse = !inverse; |
466 | 12.6M | continue; |
467 | 0 | case CharacterCompareType::TemporaryInverse: |
468 | | // If "TemporaryInverse" is given, negate the current inversion state only for the next opcode. |
469 | | // it follows that this cannot be the last compare element. |
470 | 0 | VERIFY(i != arguments_count() - 1); |
471 | | |
472 | 0 | temporary_inverse = true; |
473 | 0 | reset_temp_inverse = false; |
474 | 0 | continue; |
475 | 4.91M | case CharacterCompareType::Char: { |
476 | 4.91M | u32 ch = m_bytecode->at(offset++); |
477 | | |
478 | | // We want to compare a string that is longer or equal in length to the available string |
479 | 4.91M | if (input.view.length() <= state.string_position) |
480 | 0 | return ExecutionResult::Failed_ExecuteLowPrioForks; |
481 | | |
482 | 4.91M | compare_char(input, state, ch, current_inversion_state(), inverse_matched); |
483 | 4.91M | break; |
484 | 4.91M | } |
485 | 23.7M | case CharacterCompareType::AnyChar: { |
486 | | // We want to compare a string that is definitely longer than the available string |
487 | 23.7M | if (input.view.length() <= state.string_position) |
488 | 931k | return ExecutionResult::Failed_ExecuteLowPrioForks; |
489 | | |
490 | 22.8M | auto input_view = input.view.substring_view(state.string_position, 1)[0]; |
491 | 22.8M | auto is_equivalent_to_newline = input_view == '\n' |
492 | 22.8M | || (input.regex_options.has_flag_set(AllFlags::Internal_ECMA262DotSemantics) |
493 | 22.8M | ? (input_view == '\r' || input_view == LineSeparator || input_view == ParagraphSeparator) |
494 | 22.8M | : false); |
495 | | |
496 | 22.8M | if (!is_equivalent_to_newline || (input.regex_options.has_flag_set(AllFlags::SingleLine) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline))) { |
497 | 22.8M | if (current_inversion_state()) |
498 | 0 | inverse_matched = true; |
499 | 22.8M | else |
500 | 22.8M | advance_string_position(state, input.view, input_view); |
501 | 22.8M | } |
502 | 22.8M | break; |
503 | 23.7M | } |
504 | 0 | case CharacterCompareType::String: { |
505 | 0 | VERIFY(!current_inversion_state()); |
506 | | |
507 | 0 | auto const& length = m_bytecode->at(offset++); |
508 | | |
509 | | // We want to compare a string that is definitely longer than the available string |
510 | 0 | if (input.view.length() < state.string_position + length) |
511 | 0 | return ExecutionResult::Failed_ExecuteLowPrioForks; |
512 | | |
513 | 0 | Optional<ByteString> str; |
514 | 0 | Utf16Data utf16; |
515 | 0 | Vector<u32> data; |
516 | 0 | data.ensure_capacity(length); |
517 | 0 | for (size_t i = offset; i < offset + length; ++i) |
518 | 0 | data.unchecked_append(m_bytecode->at(i)); |
519 | |
|
520 | 0 | auto view = input.view.construct_as_same(data, str, utf16); |
521 | 0 | offset += length; |
522 | 0 | if (compare_string(input, state, view, had_zero_length_match)) { |
523 | 0 | if (current_inversion_state()) |
524 | 0 | inverse_matched = true; |
525 | 0 | } |
526 | 0 | break; |
527 | 0 | } |
528 | 22.0M | case CharacterCompareType::CharClass: { |
529 | 22.0M | if (input.view.length() <= state.string_position_in_code_units) |
530 | 2.86M | return ExecutionResult::Failed_ExecuteLowPrioForks; |
531 | | |
532 | 19.2M | auto character_class = (CharClass)m_bytecode->at(offset++); |
533 | 19.2M | auto ch = input.view[state.string_position_in_code_units]; |
534 | | |
535 | 19.2M | compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched); |
536 | 19.2M | break; |
537 | 22.0M | } |
538 | 18.6M | case CharacterCompareType::LookupTable: { |
539 | 18.6M | if (input.view.length() <= state.string_position) |
540 | 925k | return ExecutionResult::Failed_ExecuteLowPrioForks; |
541 | | |
542 | 17.7M | auto count = m_bytecode->at(offset++); |
543 | 17.7M | auto range_data = m_bytecode->template spans<4>().slice(offset, count); |
544 | 17.7M | offset += count; |
545 | | |
546 | 17.7M | auto ch = input.view[state.string_position_in_code_units]; |
547 | | |
548 | 35.8M | auto const* matching_range = binary_search(range_data, ch, nullptr, [insensitive = input.regex_options & AllFlags::Insensitive](auto needle, CharRange range) { |
549 | 35.8M | auto upper_case_needle = needle; |
550 | 35.8M | auto lower_case_needle = needle; |
551 | 35.8M | if (insensitive) { |
552 | 0 | upper_case_needle = to_ascii_uppercase(needle); |
553 | 0 | lower_case_needle = to_ascii_lowercase(needle); |
554 | 0 | } |
555 | | |
556 | 35.8M | if (lower_case_needle >= range.from && lower_case_needle <= range.to) |
557 | 5.42M | return 0; |
558 | 30.4M | if (upper_case_needle >= range.from && upper_case_needle <= range.to) |
559 | 0 | return 0; |
560 | 30.4M | if (lower_case_needle > range.to || upper_case_needle > range.to) |
561 | 25.4M | return 1; |
562 | 4.97M | return -1; |
563 | 30.4M | }); |
564 | | |
565 | 17.7M | if (matching_range) { |
566 | 2.92M | if (current_inversion_state()) |
567 | 524 | inverse_matched = true; |
568 | 2.92M | else |
569 | 2.92M | advance_string_position(state, input.view, ch); |
570 | 2.92M | } |
571 | 17.7M | break; |
572 | 18.6M | } |
573 | 0 | case CharacterCompareType::CharRange: { |
574 | 0 | if (input.view.length() <= state.string_position) |
575 | 0 | return ExecutionResult::Failed_ExecuteLowPrioForks; |
576 | | |
577 | 0 | auto value = (CharRange)m_bytecode->at(offset++); |
578 | |
|
579 | 0 | auto from = value.from; |
580 | 0 | auto to = value.to; |
581 | 0 | auto ch = input.view[state.string_position_in_code_units]; |
582 | |
|
583 | 0 | compare_character_range(input, state, from, to, ch, current_inversion_state(), inverse_matched); |
584 | 0 | break; |
585 | 0 | } |
586 | 11.8M | case CharacterCompareType::Reference: { |
587 | 11.8M | auto reference_number = (size_t)m_bytecode->at(offset++); |
588 | 11.8M | auto& groups = state.capture_group_matches.at(input.match_index); |
589 | 11.8M | if (groups.size() <= reference_number) |
590 | 0 | return ExecutionResult::Failed_ExecuteLowPrioForks; |
591 | | |
592 | 11.8M | auto str = groups.at(reference_number).view; |
593 | | |
594 | | // We want to compare a string that is definitely longer than the available string |
595 | 11.8M | if (input.view.length() < state.string_position + str.length()) |
596 | 1.01M | return ExecutionResult::Failed_ExecuteLowPrioForks; |
597 | | |
598 | 10.8M | if (compare_string(input, state, str, had_zero_length_match)) { |
599 | 10.8M | if (current_inversion_state()) |
600 | 0 | inverse_matched = true; |
601 | 10.8M | } |
602 | 10.8M | break; |
603 | 11.8M | } |
604 | 0 | case CharacterCompareType::Property: { |
605 | 0 | auto property = static_cast<Unicode::Property>(m_bytecode->at(offset++)); |
606 | 0 | compare_property(input, state, property, current_inversion_state(), inverse_matched); |
607 | 0 | break; |
608 | 11.8M | } |
609 | 0 | case CharacterCompareType::GeneralCategory: { |
610 | 0 | auto general_category = static_cast<Unicode::GeneralCategory>(m_bytecode->at(offset++)); |
611 | 0 | compare_general_category(input, state, general_category, current_inversion_state(), inverse_matched); |
612 | 0 | break; |
613 | 11.8M | } |
614 | 0 | case CharacterCompareType::Script: { |
615 | 0 | auto script = static_cast<Unicode::Script>(m_bytecode->at(offset++)); |
616 | 0 | compare_script(input, state, script, current_inversion_state(), inverse_matched); |
617 | 0 | break; |
618 | 11.8M | } |
619 | 0 | case CharacterCompareType::ScriptExtension: { |
620 | 0 | auto script = static_cast<Unicode::Script>(m_bytecode->at(offset++)); |
621 | 0 | compare_script_extension(input, state, script, current_inversion_state(), inverse_matched); |
622 | 0 | break; |
623 | 11.8M | } |
624 | 0 | case CharacterCompareType::And: |
625 | 0 | disjunction_states.append({ |
626 | 0 | .active = true, |
627 | 0 | .is_conjunction = current_inversion_state(), |
628 | 0 | .fail = current_inversion_state(), |
629 | 0 | .inverse_matched = current_inversion_state(), |
630 | 0 | .initial_position = state.string_position, |
631 | 0 | .initial_code_unit_position = state.string_position_in_code_units, |
632 | 0 | }); |
633 | 0 | continue; |
634 | 12.6M | case CharacterCompareType::Or: |
635 | 12.6M | disjunction_states.append({ |
636 | 12.6M | .active = true, |
637 | 12.6M | .is_conjunction = !current_inversion_state(), |
638 | 12.6M | .fail = !current_inversion_state(), |
639 | 12.6M | .inverse_matched = !current_inversion_state(), |
640 | 12.6M | .initial_position = state.string_position, |
641 | 12.6M | .initial_code_unit_position = state.string_position_in_code_units, |
642 | 12.6M | }); |
643 | 12.6M | continue; |
644 | 11.7M | case CharacterCompareType::EndAndOr: { |
645 | 11.7M | auto disjunction_state = disjunction_states.take_last(); |
646 | 11.7M | if (!disjunction_state.fail) { |
647 | 11.7M | state.string_position = disjunction_state.last_accepted_position.value_or(disjunction_state.initial_position); |
648 | 11.7M | state.string_position_in_code_units = disjunction_state.last_accepted_code_unit_position.value_or(disjunction_state.initial_code_unit_position); |
649 | 11.7M | } |
650 | 11.7M | inverse_matched = disjunction_state.inverse_matched || disjunction_state.fail; |
651 | 11.7M | break; |
652 | 11.8M | } |
653 | 0 | default: |
654 | 0 | warnln("Undefined comparison: {}", (int)compare_type); |
655 | 0 | VERIFY_NOT_REACHED(); |
656 | 0 | break; |
657 | 118M | } |
658 | | |
659 | 87.3M | auto& new_disjunction_state = current_disjunction_state(); |
660 | 87.3M | if (current_inversion_state() && (!inverse || new_disjunction_state.active) && !inverse_matched) { |
661 | 23.4M | advance_string_position(state, input.view); |
662 | 23.4M | inverse_matched = true; |
663 | 23.4M | } |
664 | | |
665 | 87.3M | if (!has_single_argument && new_disjunction_state.active) { |
666 | 23.4M | auto failed = (!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length(); |
667 | | |
668 | 23.4M | if (!failed) { |
669 | 23.4M | new_disjunction_state.last_accepted_position = state.string_position; |
670 | 23.4M | new_disjunction_state.last_accepted_code_unit_position = state.string_position_in_code_units; |
671 | 23.4M | new_disjunction_state.inverse_matched |= inverse_matched; |
672 | 23.4M | } |
673 | | |
674 | 23.4M | if (new_disjunction_state.is_conjunction) |
675 | 0 | new_disjunction_state.fail = failed && new_disjunction_state.fail; |
676 | 23.4M | else |
677 | 23.4M | new_disjunction_state.fail = failed || new_disjunction_state.fail; |
678 | | |
679 | 23.4M | state.string_position = new_disjunction_state.initial_position; |
680 | 23.4M | state.string_position_in_code_units = new_disjunction_state.initial_code_unit_position; |
681 | 23.4M | inverse_matched = false; |
682 | 23.4M | } |
683 | 87.3M | } |
684 | | |
685 | 63.8M | if (!has_single_argument) { |
686 | 11.7M | auto& new_disjunction_state = current_disjunction_state(); |
687 | 11.7M | if (new_disjunction_state.active) { |
688 | 0 | if (!new_disjunction_state.fail) { |
689 | 0 | state.string_position = new_disjunction_state.last_accepted_position.value_or(new_disjunction_state.initial_position); |
690 | 0 | state.string_position_in_code_units = new_disjunction_state.last_accepted_code_unit_position.value_or(new_disjunction_state.initial_code_unit_position); |
691 | 0 | } |
692 | 0 | } |
693 | 11.7M | } |
694 | | |
695 | 63.8M | if (current_inversion_state() && !inverse_matched) |
696 | 0 | advance_string_position(state, input.view); |
697 | | |
698 | 63.8M | if ((!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length()) |
699 | 15.9M | return ExecutionResult::Failed_ExecuteLowPrioForks; |
700 | | |
701 | 47.9M | return ExecutionResult::Continue; |
702 | 63.8M | } |
703 | | |
704 | | ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched) |
705 | 15.7M | { |
706 | 15.7M | if (state.string_position == input.view.length()) |
707 | 0 | return; |
708 | | |
709 | | // FIXME: Figure out how to do this if unicode() without performing a substring split first. |
710 | 15.7M | auto input_view = input.view.unicode() |
711 | 15.7M | ? input.view.substring_view(state.string_position, 1)[0] |
712 | 15.7M | : input.view.code_unit_at(state.string_position_in_code_units); |
713 | | |
714 | 15.7M | bool equal; |
715 | 15.7M | if (input.regex_options & AllFlags::Insensitive) { |
716 | 0 | if (input.view.unicode()) |
717 | 0 | equal = Unicode::equals_ignoring_case(Utf32View { &input_view, 1 }, Utf32View { &ch1, 1 }); |
718 | 0 | else |
719 | 0 | equal = to_ascii_lowercase(input_view) == to_ascii_lowercase(ch1); |
720 | 15.7M | } else { |
721 | 15.7M | equal = input_view == ch1; |
722 | 15.7M | } |
723 | | |
724 | 15.7M | if (equal) { |
725 | 6.47M | if (inverse) |
726 | 0 | inverse_matched = true; |
727 | 6.47M | else |
728 | 6.47M | advance_string_position(state, input.view, ch1); |
729 | 6.47M | } |
730 | 15.7M | } |
731 | | |
732 | | ALWAYS_INLINE bool OpCode_Compare::compare_string(MatchInput const& input, MatchState& state, RegexStringView str, bool& had_zero_length_match) |
733 | 10.8M | { |
734 | 10.8M | if (state.string_position + str.length() > input.view.length()) { |
735 | 0 | if (str.is_empty()) { |
736 | 0 | had_zero_length_match = true; |
737 | 0 | return true; |
738 | 0 | } |
739 | 0 | return false; |
740 | 0 | } |
741 | | |
742 | 10.8M | if (str.length() == 0) { |
743 | 0 | had_zero_length_match = true; |
744 | 0 | return true; |
745 | 0 | } |
746 | | |
747 | 10.8M | if (str.length() == 1) { |
748 | 10.8M | auto inverse_matched = false; |
749 | 10.8M | compare_char(input, state, str[0], false, inverse_matched); |
750 | 10.8M | return !inverse_matched; |
751 | 10.8M | } |
752 | | |
753 | 0 | auto subject = input.view.substring_view(state.string_position, str.length()); |
754 | 0 | bool equals; |
755 | 0 | if (input.regex_options & AllFlags::Insensitive) |
756 | 0 | equals = subject.equals_ignoring_case(str); |
757 | 0 | else |
758 | 0 | equals = subject.equals(str); |
759 | |
|
760 | 0 | if (equals) |
761 | 0 | advance_string_position(state, input.view, str); |
762 | |
|
763 | 0 | return equals; |
764 | 10.8M | } |
765 | | |
766 | | ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched) |
767 | 19.2M | { |
768 | 19.2M | if (matches_character_class(character_class, ch, input.regex_options & AllFlags::Insensitive)) { |
769 | 3.93M | if (inverse) |
770 | 842 | inverse_matched = true; |
771 | 3.93M | else |
772 | 3.93M | advance_string_position(state, input.view, ch); |
773 | 3.93M | } |
774 | 19.2M | } |
775 | | |
776 | | bool OpCode_Compare::matches_character_class(CharClass character_class, u32 ch, bool insensitive) |
777 | 19.3M | { |
778 | 19.3M | constexpr auto is_space_or_line_terminator = [](u32 code_point) { |
779 | 19.2M | if ((code_point == 0x0a) || (code_point == 0x0d) || (code_point == 0x2028) || (code_point == 0x2029)) |
780 | 539 | return true; |
781 | 19.2M | if ((code_point == 0x09) || (code_point == 0x0b) || (code_point == 0x0c) || (code_point == 0xfeff)) |
782 | 627k | return true; |
783 | 18.6M | return Unicode::code_point_has_space_separator_general_category(code_point); |
784 | 19.2M | }; |
785 | | |
786 | 19.3M | switch (character_class) { |
787 | 21.4k | case CharClass::Alnum: |
788 | 21.4k | return is_ascii_alphanumeric(ch); |
789 | 6.79k | case CharClass::Alpha: |
790 | 6.79k | return is_ascii_alpha(ch); |
791 | 0 | case CharClass::Blank: |
792 | 0 | return is_ascii_blank(ch); |
793 | 113 | case CharClass::Cntrl: |
794 | 113 | return is_ascii_control(ch); |
795 | 910 | case CharClass::Digit: |
796 | 910 | return is_ascii_digit(ch); |
797 | 0 | case CharClass::Graph: |
798 | 0 | return is_ascii_graphical(ch); |
799 | 52.3k | case CharClass::Lower: |
800 | 52.3k | return is_ascii_lower_alpha(ch) || (insensitive && is_ascii_upper_alpha(ch)); |
801 | 4.67k | case CharClass::Print: |
802 | 4.67k | return is_ascii_printable(ch); |
803 | 21.4k | case CharClass::Punct: |
804 | 21.4k | return is_ascii_punctuation(ch); |
805 | 19.2M | case CharClass::Space: |
806 | 19.2M | return is_space_or_line_terminator(ch); |
807 | 17.2k | case CharClass::Upper: |
808 | 17.2k | return is_ascii_upper_alpha(ch) || (insensitive && is_ascii_lower_alpha(ch)); |
809 | 99 | case CharClass::Word: |
810 | 99 | return is_ascii_alphanumeric(ch) || ch == '_'; |
811 | 6.11k | case CharClass::Xdigit: |
812 | 6.11k | return is_ascii_hex_digit(ch); |
813 | 19.3M | } |
814 | | |
815 | 0 | VERIFY_NOT_REACHED(); |
816 | 0 | } |
817 | | |
818 | | ALWAYS_INLINE void OpCode_Compare::compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched) |
819 | 0 | { |
820 | 0 | if (input.regex_options & AllFlags::Insensitive) { |
821 | 0 | from = to_ascii_lowercase(from); |
822 | 0 | to = to_ascii_lowercase(to); |
823 | 0 | ch = to_ascii_lowercase(ch); |
824 | 0 | } |
825 | |
|
826 | 0 | if (ch >= from && ch <= to) { |
827 | 0 | if (inverse) |
828 | 0 | inverse_matched = true; |
829 | 0 | else |
830 | 0 | advance_string_position(state, input.view, ch); |
831 | 0 | } |
832 | 0 | } |
833 | | |
834 | | ALWAYS_INLINE void OpCode_Compare::compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched) |
835 | 0 | { |
836 | 0 | if (state.string_position == input.view.length()) |
837 | 0 | return; |
838 | | |
839 | 0 | u32 code_point = input.view[state.string_position_in_code_units]; |
840 | 0 | bool equal = Unicode::code_point_has_property(code_point, property); |
841 | |
|
842 | 0 | if (equal) { |
843 | 0 | if (inverse) |
844 | 0 | inverse_matched = true; |
845 | 0 | else |
846 | 0 | advance_string_position(state, input.view, code_point); |
847 | 0 | } |
848 | 0 | } |
849 | | |
850 | | ALWAYS_INLINE void OpCode_Compare::compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched) |
851 | 0 | { |
852 | 0 | if (state.string_position == input.view.length()) |
853 | 0 | return; |
854 | | |
855 | 0 | u32 code_point = input.view[state.string_position_in_code_units]; |
856 | 0 | bool equal = Unicode::code_point_has_general_category(code_point, general_category); |
857 | |
|
858 | 0 | if (equal) { |
859 | 0 | if (inverse) |
860 | 0 | inverse_matched = true; |
861 | 0 | else |
862 | 0 | advance_string_position(state, input.view, code_point); |
863 | 0 | } |
864 | 0 | } |
865 | | |
866 | | ALWAYS_INLINE void OpCode_Compare::compare_script(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched) |
867 | 0 | { |
868 | 0 | if (state.string_position == input.view.length()) |
869 | 0 | return; |
870 | | |
871 | 0 | u32 code_point = input.view[state.string_position_in_code_units]; |
872 | 0 | bool equal = Unicode::code_point_has_script(code_point, script); |
873 | |
|
874 | 0 | if (equal) { |
875 | 0 | if (inverse) |
876 | 0 | inverse_matched = true; |
877 | 0 | else |
878 | 0 | advance_string_position(state, input.view, code_point); |
879 | 0 | } |
880 | 0 | } |
881 | | |
882 | | ALWAYS_INLINE void OpCode_Compare::compare_script_extension(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched) |
883 | 0 | { |
884 | 0 | if (state.string_position == input.view.length()) |
885 | 0 | return; |
886 | | |
887 | 0 | u32 code_point = input.view[state.string_position_in_code_units]; |
888 | 0 | bool equal = Unicode::code_point_has_script_extension(code_point, script); |
889 | |
|
890 | 0 | if (equal) { |
891 | 0 | if (inverse) |
892 | 0 | inverse_matched = true; |
893 | 0 | else |
894 | 0 | advance_string_position(state, input.view, code_point); |
895 | 0 | } |
896 | 0 | } |
897 | | |
898 | | ByteString OpCode_Compare::arguments_string() const |
899 | 0 | { |
900 | 0 | return ByteString::formatted("argc={}, args={} ", arguments_count(), arguments_size()); |
901 | 0 | } |
902 | | |
903 | | Vector<CompareTypeAndValuePair> OpCode_Compare::flat_compares() const |
904 | 23.7M | { |
905 | 23.7M | Vector<CompareTypeAndValuePair> result; |
906 | | |
907 | 23.7M | size_t offset { state().instruction_position + 3 }; |
908 | | |
909 | 47.9M | for (size_t i = 0; i < arguments_count(); ++i) { |
910 | 24.1M | auto compare_type = (CharacterCompareType)m_bytecode->at(offset++); |
911 | | |
912 | 24.1M | if (compare_type == CharacterCompareType::Char) { |
913 | 20.8M | auto ch = m_bytecode->at(offset++); |
914 | 20.8M | result.append({ compare_type, ch }); |
915 | 20.8M | } else if (compare_type == CharacterCompareType::Reference) { |
916 | 769 | auto ref = m_bytecode->at(offset++); |
917 | 769 | result.append({ compare_type, ref }); |
918 | 3.35M | } else if (compare_type == CharacterCompareType::String) { |
919 | 750k | auto& length = m_bytecode->at(offset++); |
920 | 60.5M | for (size_t k = 0; k < length; ++k) |
921 | 59.7M | result.append({ CharacterCompareType::Char, m_bytecode->at(offset + k) }); |
922 | 750k | offset += length; |
923 | 2.60M | } else if (compare_type == CharacterCompareType::CharClass) { |
924 | 322k | auto character_class = m_bytecode->at(offset++); |
925 | 322k | result.append({ compare_type, character_class }); |
926 | 2.28M | } else if (compare_type == CharacterCompareType::CharRange) { |
927 | 508 | auto value = m_bytecode->at(offset++); |
928 | 508 | result.append({ compare_type, value }); |
929 | 2.28M | } else if (compare_type == CharacterCompareType::LookupTable) { |
930 | 86.8k | auto count = m_bytecode->at(offset++); |
931 | 9.04M | for (size_t i = 0; i < count; ++i) |
932 | 8.95M | result.append({ CharacterCompareType::CharRange, m_bytecode->at(offset++) }); |
933 | 2.19M | } else if (compare_type == CharacterCompareType::GeneralCategory |
934 | 2.19M | || compare_type == CharacterCompareType::Property |
935 | 2.19M | || compare_type == CharacterCompareType::Script |
936 | 2.19M | || compare_type == CharacterCompareType::ScriptExtension) { |
937 | 0 | auto value = m_bytecode->at(offset++); |
938 | 0 | result.append({ compare_type, value }); |
939 | 2.19M | } else { |
940 | 2.19M | result.append({ compare_type, 0 }); |
941 | 2.19M | } |
942 | 24.1M | } |
943 | 23.7M | return result; |
944 | 23.7M | } |
945 | | |
946 | | Vector<ByteString> OpCode_Compare::variable_arguments_to_byte_string(Optional<MatchInput const&> input) const |
947 | 0 | { |
948 | 0 | Vector<ByteString> result; |
949 | |
|
950 | 0 | size_t offset { state().instruction_position + 3 }; |
951 | 0 | RegexStringView const& view = ((input.has_value()) ? input.value().view : StringView {}); |
952 | |
|
953 | 0 | for (size_t i = 0; i < arguments_count(); ++i) { |
954 | 0 | auto compare_type = (CharacterCompareType)m_bytecode->at(offset++); |
955 | 0 | result.empend(ByteString::formatted("type={} [{}]", (size_t)compare_type, character_compare_type_name(compare_type))); |
956 | |
|
957 | 0 | auto string_start_offset = state().string_position_before_match; |
958 | |
|
959 | 0 | if (compare_type == CharacterCompareType::Char) { |
960 | 0 | auto ch = m_bytecode->at(offset++); |
961 | 0 | auto is_ascii = is_ascii_printable(ch); |
962 | 0 | if (is_ascii) |
963 | 0 | result.empend(ByteString::formatted(" value='{:c}'", static_cast<char>(ch))); |
964 | 0 | else |
965 | 0 | result.empend(ByteString::formatted(" value={:x}", ch)); |
966 | |
|
967 | 0 | if (!view.is_null() && view.length() > string_start_offset) { |
968 | 0 | if (is_ascii) { |
969 | 0 | result.empend(ByteString::formatted( |
970 | 0 | " compare against: '{}'", |
971 | 0 | view.substring_view(string_start_offset, string_start_offset > view.length() ? 0 : 1).to_byte_string())); |
972 | 0 | } else { |
973 | 0 | auto str = view.substring_view(string_start_offset, string_start_offset > view.length() ? 0 : 1).to_byte_string(); |
974 | 0 | u8 buf[8] { 0 }; |
975 | 0 | __builtin_memcpy(buf, str.characters(), min(str.length(), sizeof(buf))); |
976 | 0 | result.empend(ByteString::formatted(" compare against: {:x},{:x},{:x},{:x},{:x},{:x},{:x},{:x}", |
977 | 0 | buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7])); |
978 | 0 | } |
979 | 0 | } |
980 | 0 | } else if (compare_type == CharacterCompareType::Reference) { |
981 | 0 | auto ref = m_bytecode->at(offset++); |
982 | 0 | result.empend(ByteString::formatted(" number={}", ref)); |
983 | 0 | if (input.has_value()) { |
984 | 0 | if (state().capture_group_matches.size() > input->match_index) { |
985 | 0 | auto& match = state().capture_group_matches[input->match_index]; |
986 | 0 | if (match.size() > ref) { |
987 | 0 | auto& group = match[ref]; |
988 | 0 | result.empend(ByteString::formatted(" left={}", group.left_column)); |
989 | 0 | result.empend(ByteString::formatted(" right={}", group.left_column + group.view.length_in_code_units())); |
990 | 0 | result.empend(ByteString::formatted(" contents='{}'", group.view)); |
991 | 0 | } else { |
992 | 0 | result.empend(ByteString::formatted(" (invalid ref, max={})", match.size() - 1)); |
993 | 0 | } |
994 | 0 | } else { |
995 | 0 | result.empend(ByteString::formatted(" (invalid index {}, max={})", input->match_index, state().capture_group_matches.size() - 1)); |
996 | 0 | } |
997 | 0 | } |
998 | 0 | } else if (compare_type == CharacterCompareType::String) { |
999 | 0 | auto& length = m_bytecode->at(offset++); |
1000 | 0 | StringBuilder str_builder; |
1001 | 0 | for (size_t i = 0; i < length; ++i) |
1002 | 0 | str_builder.append(m_bytecode->at(offset++)); |
1003 | 0 | result.empend(ByteString::formatted(" value=\"{}\"", str_builder.string_view().substring_view(0, length))); |
1004 | 0 | if (!view.is_null() && view.length() > state().string_position) |
1005 | 0 | result.empend(ByteString::formatted( |
1006 | 0 | " compare against: \"{}\"", |
1007 | 0 | input.value().view.substring_view(string_start_offset, string_start_offset + length > view.length() ? 0 : length).to_byte_string())); |
1008 | 0 | } else if (compare_type == CharacterCompareType::CharClass) { |
1009 | 0 | auto character_class = (CharClass)m_bytecode->at(offset++); |
1010 | 0 | result.empend(ByteString::formatted(" ch_class={} [{}]", (size_t)character_class, character_class_name(character_class))); |
1011 | 0 | if (!view.is_null() && view.length() > state().string_position) |
1012 | 0 | result.empend(ByteString::formatted( |
1013 | 0 | " compare against: '{}'", |
1014 | 0 | input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_byte_string())); |
1015 | 0 | } else if (compare_type == CharacterCompareType::CharRange) { |
1016 | 0 | auto value = (CharRange)m_bytecode->at(offset++); |
1017 | 0 | result.empend(ByteString::formatted(" ch_range={:x}-{:x}", value.from, value.to)); |
1018 | 0 | if (!view.is_null() && view.length() > state().string_position) |
1019 | 0 | result.empend(ByteString::formatted( |
1020 | 0 | " compare against: '{}'", |
1021 | 0 | input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_byte_string())); |
1022 | 0 | } else if (compare_type == CharacterCompareType::LookupTable) { |
1023 | 0 | auto count = m_bytecode->at(offset++); |
1024 | 0 | for (size_t j = 0; j < count; ++j) { |
1025 | 0 | auto range = (CharRange)m_bytecode->at(offset++); |
1026 | 0 | result.append(ByteString::formatted(" {:x}-{:x}", range.from, range.to)); |
1027 | 0 | } |
1028 | 0 | if (!view.is_null() && view.length() > state().string_position) |
1029 | 0 | result.empend(ByteString::formatted( |
1030 | 0 | " compare against: '{}'", |
1031 | 0 | input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_byte_string())); |
1032 | 0 | } else if (compare_type == CharacterCompareType::GeneralCategory |
1033 | 0 | || compare_type == CharacterCompareType::Property |
1034 | 0 | || compare_type == CharacterCompareType::Script |
1035 | 0 | || compare_type == CharacterCompareType::ScriptExtension) { |
1036 | |
|
1037 | 0 | auto value = m_bytecode->at(offset++); |
1038 | 0 | result.empend(ByteString::formatted(" value={}", value)); |
1039 | 0 | } |
1040 | 0 | } |
1041 | 0 | return result; |
1042 | 0 | } |
1043 | | |
1044 | | ALWAYS_INLINE ExecutionResult OpCode_Repeat::execute(MatchInput const&, MatchState& state) const |
1045 | 2.58M | { |
1046 | 2.58M | VERIFY(count() > 0); |
1047 | | |
1048 | 2.58M | if (id() >= state.repetition_marks.size()) |
1049 | 2.54M | state.repetition_marks.resize(id() + 1); |
1050 | 2.58M | auto& repetition_mark = state.repetition_marks.mutable_at(id()); |
1051 | | |
1052 | 2.58M | if (repetition_mark == count() - 1) { |
1053 | 2.53M | repetition_mark = 0; |
1054 | 2.53M | } else { |
1055 | 48.6k | state.instruction_position -= offset() + size(); |
1056 | 48.6k | ++repetition_mark; |
1057 | 48.6k | } |
1058 | | |
1059 | 2.58M | return ExecutionResult::Continue; |
1060 | 2.58M | } |
1061 | | |
1062 | | ALWAYS_INLINE ExecutionResult OpCode_ResetRepeat::execute(MatchInput const&, MatchState& state) const |
1063 | 4.92M | { |
1064 | 4.92M | if (id() >= state.repetition_marks.size()) |
1065 | 4.82M | state.repetition_marks.resize(id() + 1); |
1066 | | |
1067 | 4.92M | state.repetition_marks.mutable_at(id()) = 0; |
1068 | 4.92M | return ExecutionResult::Continue; |
1069 | 4.92M | } |
1070 | | |
1071 | | ALWAYS_INLINE ExecutionResult OpCode_Checkpoint::execute(MatchInput const&, MatchState& state) const |
1072 | 56.6M | { |
1073 | 56.6M | auto id = this->id(); |
1074 | 56.6M | if (id >= state.checkpoints.size()) |
1075 | 7.36M | state.checkpoints.resize(id + 1); |
1076 | | |
1077 | 56.6M | state.checkpoints[id] = state.string_position + 1; |
1078 | 56.6M | return ExecutionResult::Continue; |
1079 | 56.6M | } |
1080 | | |
1081 | | ALWAYS_INLINE ExecutionResult OpCode_JumpNonEmpty::execute(MatchInput const& input, MatchState& state) const |
1082 | 45.8M | { |
1083 | 45.8M | u64 current_position = state.string_position; |
1084 | 45.8M | auto checkpoint_position = state.checkpoints.get(checkpoint()).value_or(0); |
1085 | | |
1086 | 45.8M | if (checkpoint_position != 0 && checkpoint_position != current_position + 1) { |
1087 | 45.8M | auto form = this->form(); |
1088 | | |
1089 | 45.8M | if (form == OpCodeId::Jump) { |
1090 | 39.5M | state.instruction_position += offset(); |
1091 | 39.5M | return ExecutionResult::Continue; |
1092 | 39.5M | } |
1093 | | |
1094 | 6.29M | state.fork_at_position = state.instruction_position + size() + offset(); |
1095 | | |
1096 | 6.29M | if (form == OpCodeId::ForkJump) { |
1097 | 6.29M | state.forks_since_last_save++; |
1098 | 6.29M | return ExecutionResult::Fork_PrioHigh; |
1099 | 6.29M | } |
1100 | | |
1101 | 0 | if (form == OpCodeId::ForkStay) { |
1102 | 0 | state.forks_since_last_save++; |
1103 | 0 | return ExecutionResult::Fork_PrioLow; |
1104 | 0 | } |
1105 | | |
1106 | 0 | if (form == OpCodeId::ForkReplaceStay) { |
1107 | 0 | input.fork_to_replace = state.instruction_position; |
1108 | 0 | return ExecutionResult::Fork_PrioLow; |
1109 | 0 | } |
1110 | | |
1111 | 0 | if (form == OpCodeId::ForkReplaceJump) { |
1112 | 0 | input.fork_to_replace = state.instruction_position; |
1113 | 0 | return ExecutionResult::Fork_PrioHigh; |
1114 | 0 | } |
1115 | 0 | } |
1116 | | |
1117 | 0 | return ExecutionResult::Continue; |
1118 | 45.8M | } |
1119 | | |
1120 | | } |