Line data Source code
1 : // Copyright 2012 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/regexp/regexp-macro-assembler.h"
6 :
7 : #include "src/assembler.h"
8 : #include "src/isolate-inl.h"
9 : #include "src/regexp/regexp-stack.h"
10 : #include "src/simulator.h"
11 : #include "src/unicode-inl.h"
12 :
13 : #ifdef V8_INTL_SUPPORT
14 : #include "unicode/uchar.h"
15 : #endif // V8_INTL_SUPPORT
16 :
17 : namespace v8 {
18 : namespace internal {
19 :
20 0 : RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
21 : : slow_safe_compiler_(false),
22 : global_mode_(NOT_GLOBAL),
23 : isolate_(isolate),
24 93584 : zone_(zone) {}
25 :
26 :
27 93584 : RegExpMacroAssembler::~RegExpMacroAssembler() {
28 0 : }
29 :
30 :
31 462264 : int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
32 : Address byte_offset2,
33 : size_t byte_length,
34 : Isolate* isolate) {
35 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
36 462264 : isolate->regexp_macro_assembler_canonicalize();
37 : // This function is not allowed to cause a garbage collection.
38 : // A GC might move the calling generated code and invalidate the
39 : // return address on the stack.
40 : DCHECK_EQ(0, byte_length % 2);
41 : uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
42 : uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
43 462264 : size_t length = byte_length >> 1;
44 :
45 : #ifdef V8_INTL_SUPPORT
46 462264 : if (isolate == nullptr) {
47 35 : for (size_t i = 0; i < length; i++) {
48 35 : uc32 c1 = substring1[i];
49 35 : uc32 c2 = substring2[i];
50 35 : if (unibrow::Utf16::IsLeadSurrogate(c1)) {
51 : // Non-BMP characters do not have case-equivalents in the BMP.
52 : // Both have to be non-BMP for them to be able to match.
53 9 : if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
54 9 : if (i + 1 < length) {
55 9 : uc16 c1t = substring1[i + 1];
56 9 : uc16 c2t = substring2[i + 1];
57 27 : if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
58 9 : unibrow::Utf16::IsTrailSurrogate(c2t)) {
59 : c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
60 : c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
61 : i++;
62 : }
63 : }
64 : }
65 35 : c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
66 35 : c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
67 35 : if (c1 != c2) return 0;
68 : }
69 : return 1;
70 : }
71 : #endif // V8_INTL_SUPPORT
72 : DCHECK_NOT_NULL(isolate);
73 27217 : for (size_t i = 0; i < length; i++) {
74 462289 : unibrow::uchar c1 = substring1[i];
75 462289 : unibrow::uchar c2 = substring2[i];
76 462289 : if (c1 != c2) {
77 435176 : unibrow::uchar s1[1] = {c1};
78 435176 : canonicalize->get(c1, '\0', s1);
79 435176 : if (s1[0] != c2) {
80 435156 : unibrow::uchar s2[1] = {c2};
81 435156 : canonicalize->get(c2, '\0', s2);
82 435156 : if (s1[0] != s2[0]) {
83 435072 : return 0;
84 : }
85 : }
86 : }
87 : }
88 : return 1;
89 : }
90 :
91 :
92 85 : void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
93 : Label* on_failure) {
94 : Label ok;
95 : // Check that current character is not a trail surrogate.
96 85 : LoadCurrentCharacter(cp_offset, &ok);
97 85 : CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
98 : // Check that previous character is not a lead surrogate.
99 85 : LoadCurrentCharacter(cp_offset - 1, &ok);
100 85 : CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
101 85 : Bind(&ok);
102 85 : }
103 :
104 0 : void RegExpMacroAssembler::CheckPosition(int cp_offset,
105 : Label* on_outside_input) {
106 0 : LoadCurrentCharacter(cp_offset, on_outside_input, true);
107 0 : }
108 :
109 0 : bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
110 : Label* on_no_match) {
111 0 : return false;
112 : }
113 :
114 : #ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM.
115 :
116 93584 : NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
117 : Zone* zone)
118 93584 : : RegExpMacroAssembler(isolate, zone) {}
119 :
120 :
121 93584 : NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {
122 93584 : }
123 :
124 :
125 245782 : bool NativeRegExpMacroAssembler::CanReadUnaligned() {
126 245782 : return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
127 : }
128 :
129 598979 : const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
130 : String* subject,
131 : int start_index) {
132 598979 : if (subject->IsConsString()) {
133 : subject = ConsString::cast(subject)->first();
134 598979 : } else if (subject->IsSlicedString()) {
135 0 : start_index += SlicedString::cast(subject)->offset();
136 : subject = SlicedString::cast(subject)->parent();
137 : }
138 598979 : if (subject->IsThinString()) {
139 : subject = ThinString::cast(subject)->actual();
140 : }
141 : DCHECK_LE(0, start_index);
142 : DCHECK_LE(start_index, subject->length());
143 598979 : if (subject->IsSeqOneByteString()) {
144 : return reinterpret_cast<const byte*>(
145 504272 : SeqOneByteString::cast(subject)->GetChars() + start_index);
146 94707 : } else if (subject->IsSeqTwoByteString()) {
147 : return reinterpret_cast<const byte*>(
148 94565 : SeqTwoByteString::cast(subject)->GetChars() + start_index);
149 142 : } else if (subject->IsExternalOneByteString()) {
150 : return reinterpret_cast<const byte*>(
151 26 : ExternalOneByteString::cast(subject)->GetChars() + start_index);
152 : } else {
153 : DCHECK(subject->IsExternalTwoByteString());
154 : return reinterpret_cast<const byte*>(
155 116 : ExternalTwoByteString::cast(subject)->GetChars() + start_index);
156 : }
157 : }
158 :
159 :
160 379 : int NativeRegExpMacroAssembler::CheckStackGuardState(
161 : Isolate* isolate, int start_index, bool is_direct_call,
162 : Address* return_address, Code* re_code, String** subject,
163 : const byte** input_start, const byte** input_end) {
164 : DCHECK(re_code->instruction_start() <= *return_address);
165 : DCHECK(*return_address <= re_code->instruction_end());
166 : int return_value = 0;
167 : // Prepare for possible GC.
168 : HandleScope handles(isolate);
169 : Handle<Code> code_handle(re_code);
170 379 : Handle<String> subject_handle(*subject);
171 379 : bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();
172 :
173 : StackLimitCheck check(isolate);
174 379 : bool js_has_overflowed = check.JsHasOverflowed();
175 :
176 379 : if (is_direct_call) {
177 : // Direct calls from JavaScript can be interrupted in two ways:
178 : // 1. A real stack overflow, in which case we let the caller throw the
179 : // exception.
180 : // 2. The stack guard was used to interrupt execution for another purpose,
181 : // forcing the call through the runtime system.
182 64 : return_value = js_has_overflowed ? EXCEPTION : RETRY;
183 315 : } else if (js_has_overflowed) {
184 229 : isolate->StackOverflow();
185 : return_value = EXCEPTION;
186 : } else {
187 86 : Object* result = isolate->stack_guard()->HandleInterrupts();
188 86 : if (result->IsException(isolate)) return_value = EXCEPTION;
189 : }
190 :
191 : DisallowHeapAllocation no_gc;
192 :
193 379 : if (*code_handle != re_code) { // Return address no longer valid
194 0 : intptr_t delta = code_handle->address() - re_code->address();
195 : // Overwrite the return address on the stack.
196 0 : *return_address += delta;
197 : }
198 :
199 : // If we continue, we need to update the subject string addresses.
200 379 : if (return_value == 0) {
201 : // String encoding might have changed.
202 81 : if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
203 : // If we changed between an LATIN1 and an UC16 string, the specialized
204 : // code cannot be used, and we need to restart regexp matching from
205 : // scratch (including, potentially, compiling a new version of the code).
206 : return_value = RETRY;
207 : } else {
208 81 : *subject = *subject_handle;
209 81 : intptr_t byte_length = *input_end - *input_start;
210 81 : *input_start = StringCharacterPosition(*subject, start_index);
211 81 : *input_end = *input_start + byte_length;
212 : }
213 : }
214 379 : return return_value;
215 : }
216 :
217 :
218 598898 : NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
219 : Handle<Code> regexp_code,
220 : Handle<String> subject,
221 : int* offsets_vector,
222 : int offsets_vector_length,
223 : int previous_index,
224 : Isolate* isolate) {
225 :
226 : DCHECK(subject->IsFlat());
227 : DCHECK_LE(0, previous_index);
228 : DCHECK_LE(previous_index, subject->length());
229 :
230 : // No allocations before calling the regexp, but we can't use
231 : // DisallowHeapAllocation, since regexps might be preempted, and another
232 : // thread might do allocation anyway.
233 :
234 : String* subject_ptr = *subject;
235 : // Character offsets into string.
236 : int start_offset = previous_index;
237 598898 : int char_length = subject_ptr->length() - start_offset;
238 : int slice_offset = 0;
239 :
240 : // The string has been flattened, so if it is a cons string it contains the
241 : // full string in the first part.
242 598898 : if (StringShape(subject_ptr).IsCons()) {
243 : DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
244 : subject_ptr = ConsString::cast(subject_ptr)->first();
245 598898 : } else if (StringShape(subject_ptr).IsSliced()) {
246 : SlicedString* slice = SlicedString::cast(subject_ptr);
247 : subject_ptr = slice->parent();
248 : slice_offset = slice->offset();
249 : }
250 598898 : if (StringShape(subject_ptr).IsThin()) {
251 : subject_ptr = ThinString::cast(subject_ptr)->actual();
252 : }
253 : // Ensure that an underlying string has the same representation.
254 : bool is_one_byte = subject_ptr->IsOneByteRepresentation();
255 : DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
256 : // String is now either Sequential or External
257 598898 : int char_size_shift = is_one_byte ? 0 : 1;
258 :
259 : const byte* input_start =
260 598898 : StringCharacterPosition(subject_ptr, start_offset + slice_offset);
261 598898 : int byte_length = char_length << char_size_shift;
262 598898 : const byte* input_end = input_start + byte_length;
263 : Result res = Execute(*regexp_code,
264 : *subject,
265 : start_offset,
266 : input_start,
267 : input_end,
268 : offsets_vector,
269 : offsets_vector_length,
270 598898 : isolate);
271 598898 : return res;
272 : }
273 :
274 :
275 598982 : NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
276 : Code* code,
277 : String* input, // This needs to be the unpacked (sliced, cons) string.
278 : int start_offset,
279 : const byte* input_start,
280 : const byte* input_end,
281 : int* output,
282 : int output_size,
283 : Isolate* isolate) {
284 : // Ensure that the minimum stack has been allocated.
285 598982 : RegExpStackScope stack_scope(isolate);
286 598982 : Address stack_base = stack_scope.stack()->stack_base();
287 :
288 : int direct_call = 0;
289 1197964 : int result = CALL_GENERATED_REGEXP_CODE(
290 : isolate, code->entry(), input, start_offset, input_start, input_end,
291 : output, output_size, stack_base, direct_call, isolate);
292 : DCHECK(result >= RETRY);
293 :
294 598982 : if (result == EXCEPTION && !isolate->has_pending_exception()) {
295 : // We detected a stack overflow (on the backtrack stack) in RegExp code,
296 : // but haven't created the exception yet.
297 16 : isolate->StackOverflow();
298 : }
299 598982 : return static_cast<Result>(result);
300 : }
301 :
302 :
303 : const byte NativeRegExpMacroAssembler::word_character_map[] = {
304 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
305 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
306 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
307 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
308 :
309 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
310 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
311 : 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // '0' - '7'
312 : 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9'
313 :
314 : 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'A' - 'G'
315 : 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'H' - 'O'
316 : 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'P' - 'W'
317 : 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu, // 'X' - 'Z', '_'
318 :
319 : 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'a' - 'g'
320 : 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o'
321 : 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w'
322 : 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
323 : // Latin-1 range
324 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
325 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
326 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
327 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
328 :
329 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
330 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
331 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
332 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
333 :
334 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
335 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
336 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
337 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
338 :
339 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
340 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
341 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
342 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
343 : };
344 :
345 :
346 443 : Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
347 : Address* stack_base,
348 443 : Isolate* isolate) {
349 443 : RegExpStack* regexp_stack = isolate->regexp_stack();
350 : size_t size = regexp_stack->stack_capacity();
351 : Address old_stack_base = regexp_stack->stack_base();
352 : DCHECK(old_stack_base == *stack_base);
353 : DCHECK(stack_pointer <= old_stack_base);
354 : DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
355 443 : Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
356 443 : if (new_stack_base == nullptr) {
357 : return nullptr;
358 : }
359 437 : *stack_base = new_stack_base;
360 437 : intptr_t stack_content_size = old_stack_base - stack_pointer;
361 437 : return new_stack_base - stack_content_size;
362 : }
363 :
364 : #endif // V8_INTERPRETED_REGEXP
365 :
366 : } // namespace internal
367 : } // namespace v8
|