Line data Source code
1 : // Copyright 2012 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/regexp/regexp-macro-assembler.h"
6 :
7 : #include "src/assembler.h"
8 : #include "src/isolate-inl.h"
9 : #include "src/regexp/regexp-stack.h"
10 : #include "src/simulator.h"
11 :
12 : #ifdef V8_INTL_SUPPORT
13 : #include "unicode/uchar.h"
14 : #endif // V8_INTL_SUPPORT
15 :
16 : namespace v8 {
17 : namespace internal {
18 :
19 0 : RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
20 : : slow_safe_compiler_(false),
21 : global_mode_(NOT_GLOBAL),
22 : isolate_(isolate),
23 92258 : zone_(zone) {}
24 :
25 :
26 92258 : RegExpMacroAssembler::~RegExpMacroAssembler() {
27 0 : }
28 :
29 :
30 693412 : int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
31 : Address byte_offset2,
32 : size_t byte_length,
33 : Isolate* isolate) {
34 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
35 693412 : isolate->regexp_macro_assembler_canonicalize();
36 : // This function is not allowed to cause a garbage collection.
37 : // A GC might move the calling generated code and invalidate the
38 : // return address on the stack.
39 : DCHECK(byte_length % 2 == 0);
40 : uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
41 : uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
42 693412 : size_t length = byte_length >> 1;
43 :
44 : #ifdef V8_INTL_SUPPORT
45 693412 : if (isolate == nullptr) {
46 58 : for (size_t i = 0; i < length; i++) {
47 58 : uc32 c1 = substring1[i];
48 58 : uc32 c2 = substring2[i];
49 58 : if (unibrow::Utf16::IsLeadSurrogate(c1)) {
50 : // Non-BMP characters do not have case-equivalents in the BMP.
51 : // Both have to be non-BMP for them to be able to match.
52 14 : if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
53 14 : if (i + 1 < length) {
54 14 : uc16 c1t = substring1[i + 1];
55 14 : uc16 c2t = substring2[i + 1];
56 42 : if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
57 14 : unibrow::Utf16::IsTrailSurrogate(c2t)) {
58 : c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
59 : c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
60 : i++;
61 : }
62 : }
63 : }
64 58 : c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
65 58 : c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
66 58 : if (c1 != c2) return 0;
67 : }
68 : return 1;
69 : }
70 : #endif // V8_INTL_SUPPORT
71 : DCHECK_NOT_NULL(isolate);
72 40830 : for (size_t i = 0; i < length; i++) {
73 693444 : unibrow::uchar c1 = substring1[i];
74 693444 : unibrow::uchar c2 = substring2[i];
75 693444 : if (c1 != c2) {
76 652772 : unibrow::uchar s1[1] = {c1};
77 652772 : canonicalize->get(c1, '\0', s1);
78 652772 : if (s1[0] != c2) {
79 652742 : unibrow::uchar s2[1] = {c2};
80 652742 : canonicalize->get(c2, '\0', s2);
81 652742 : if (s1[0] != s2[0]) {
82 652614 : return 0;
83 : }
84 : }
85 : }
86 : }
87 : return 1;
88 : }
89 :
90 :
91 102 : void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
92 : Label* on_failure) {
93 : Label ok;
94 : // Check that current character is not a trail surrogate.
95 102 : LoadCurrentCharacter(cp_offset, &ok);
96 102 : CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
97 : // Check that previous character is not a lead surrogate.
98 102 : LoadCurrentCharacter(cp_offset - 1, &ok);
99 102 : CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
100 102 : Bind(&ok);
101 102 : }
102 :
103 0 : void RegExpMacroAssembler::CheckPosition(int cp_offset,
104 : Label* on_outside_input) {
105 0 : LoadCurrentCharacter(cp_offset, on_outside_input, true);
106 0 : }
107 :
108 0 : bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
109 : Label* on_no_match) {
110 0 : return false;
111 : }
112 :
113 : #ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM.
114 :
115 92258 : NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
116 : Zone* zone)
117 92258 : : RegExpMacroAssembler(isolate, zone) {}
118 :
119 :
120 92258 : NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {
121 92258 : }
122 :
123 :
124 268841 : bool NativeRegExpMacroAssembler::CanReadUnaligned() {
125 268841 : return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
126 : }
127 :
128 700474 : const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
129 : String* subject,
130 : int start_index) {
131 700474 : if (subject->IsConsString()) {
132 : subject = ConsString::cast(subject)->first();
133 700474 : } else if (subject->IsSlicedString()) {
134 11 : start_index += SlicedString::cast(subject)->offset();
135 : subject = SlicedString::cast(subject)->parent();
136 : }
137 700474 : if (subject->IsThinString()) {
138 : subject = ThinString::cast(subject)->actual();
139 : }
140 : DCHECK(start_index >= 0);
141 : DCHECK(start_index <= subject->length());
142 700474 : if (subject->IsSeqOneByteString()) {
143 : return reinterpret_cast<const byte*>(
144 590157 : SeqOneByteString::cast(subject)->GetChars() + start_index);
145 110317 : } else if (subject->IsSeqTwoByteString()) {
146 : return reinterpret_cast<const byte*>(
147 110132 : SeqTwoByteString::cast(subject)->GetChars() + start_index);
148 185 : } else if (subject->IsExternalOneByteString()) {
149 : return reinterpret_cast<const byte*>(
150 37 : ExternalOneByteString::cast(subject)->GetChars() + start_index);
151 : } else {
152 : DCHECK(subject->IsExternalTwoByteString());
153 : return reinterpret_cast<const byte*>(
154 148 : ExternalTwoByteString::cast(subject)->GetChars() + start_index);
155 : }
156 : }
157 :
158 :
159 440 : int NativeRegExpMacroAssembler::CheckStackGuardState(
160 : Isolate* isolate, int start_index, bool is_direct_call,
161 : Address* return_address, Code* re_code, String** subject,
162 : const byte** input_start, const byte** input_end) {
163 : DCHECK(re_code->instruction_start() <= *return_address);
164 : DCHECK(*return_address <= re_code->instruction_end());
165 : int return_value = 0;
166 : // Prepare for possible GC.
167 : HandleScope handles(isolate);
168 : Handle<Code> code_handle(re_code);
169 440 : Handle<String> subject_handle(*subject);
170 440 : bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();
171 :
172 : StackLimitCheck check(isolate);
173 440 : bool js_has_overflowed = check.JsHasOverflowed();
174 :
175 440 : if (is_direct_call) {
176 : // Direct calls from JavaScript can be interrupted in two ways:
177 : // 1. A real stack overflow, in which case we let the caller throw the
178 : // exception.
179 : // 2. The stack guard was used to interrupt execution for another purpose,
180 : // forcing the call through the runtime system.
181 81 : return_value = js_has_overflowed ? EXCEPTION : RETRY;
182 359 : } else if (js_has_overflowed) {
183 259 : isolate->StackOverflow();
184 : return_value = EXCEPTION;
185 : } else {
186 100 : Object* result = isolate->stack_guard()->HandleInterrupts();
187 100 : if (result->IsException(isolate)) return_value = EXCEPTION;
188 : }
189 :
190 : DisallowHeapAllocation no_gc;
191 :
192 440 : if (*code_handle != re_code) { // Return address no longer valid
193 0 : intptr_t delta = code_handle->address() - re_code->address();
194 : // Overwrite the return address on the stack.
195 0 : *return_address += delta;
196 : }
197 :
198 : // If we continue, we need to update the subject string addresses.
199 440 : if (return_value == 0) {
200 : // String encoding might have changed.
201 94 : if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
202 : // If we changed between an LATIN1 and an UC16 string, the specialized
203 : // code cannot be used, and we need to restart regexp matching from
204 : // scratch (including, potentially, compiling a new version of the code).
205 : return_value = RETRY;
206 : } else {
207 94 : *subject = *subject_handle;
208 94 : intptr_t byte_length = *input_end - *input_start;
209 94 : *input_start = StringCharacterPosition(*subject, start_index);
210 94 : *input_end = *input_start + byte_length;
211 : }
212 : }
213 440 : return return_value;
214 : }
215 :
216 :
217 700380 : NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
218 : Handle<Code> regexp_code,
219 : Handle<String> subject,
220 : int* offsets_vector,
221 : int offsets_vector_length,
222 : int previous_index,
223 : Isolate* isolate) {
224 :
225 : DCHECK(subject->IsFlat());
226 : DCHECK(previous_index >= 0);
227 : DCHECK(previous_index <= subject->length());
228 :
229 : // No allocations before calling the regexp, but we can't use
230 : // DisallowHeapAllocation, since regexps might be preempted, and another
231 : // thread might do allocation anyway.
232 :
233 : String* subject_ptr = *subject;
234 : // Character offsets into string.
235 : int start_offset = previous_index;
236 700380 : int char_length = subject_ptr->length() - start_offset;
237 : int slice_offset = 0;
238 :
239 : // The string has been flattened, so if it is a cons string it contains the
240 : // full string in the first part.
241 700380 : if (StringShape(subject_ptr).IsCons()) {
242 : DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
243 : subject_ptr = ConsString::cast(subject_ptr)->first();
244 700380 : } else if (StringShape(subject_ptr).IsSliced()) {
245 : SlicedString* slice = SlicedString::cast(subject_ptr);
246 : subject_ptr = slice->parent();
247 : slice_offset = slice->offset();
248 : }
249 700380 : if (StringShape(subject_ptr).IsThin()) {
250 : subject_ptr = ThinString::cast(subject_ptr)->actual();
251 : }
252 : // Ensure that an underlying string has the same representation.
253 : bool is_one_byte = subject_ptr->IsOneByteRepresentation();
254 : DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
255 : // String is now either Sequential or External
256 700380 : int char_size_shift = is_one_byte ? 0 : 1;
257 :
258 : const byte* input_start =
259 700380 : StringCharacterPosition(subject_ptr, start_offset + slice_offset);
260 700380 : int byte_length = char_length << char_size_shift;
261 700380 : const byte* input_end = input_start + byte_length;
262 : Result res = Execute(*regexp_code,
263 : *subject,
264 : start_offset,
265 : input_start,
266 : input_end,
267 : offsets_vector,
268 : offsets_vector_length,
269 700380 : isolate);
270 700380 : return res;
271 : }
272 :
273 :
274 700478 : NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
275 : Code* code,
276 : String* input, // This needs to be the unpacked (sliced, cons) string.
277 : int start_offset,
278 : const byte* input_start,
279 : const byte* input_end,
280 : int* output,
281 : int output_size,
282 : Isolate* isolate) {
283 : // Ensure that the minimum stack has been allocated.
284 700478 : RegExpStackScope stack_scope(isolate);
285 700478 : Address stack_base = stack_scope.stack()->stack_base();
286 :
287 : int direct_call = 0;
288 1400956 : int result = CALL_GENERATED_REGEXP_CODE(
289 : isolate, code->entry(), input, start_offset, input_start, input_end,
290 : output, output_size, stack_base, direct_call, isolate);
291 : DCHECK(result >= RETRY);
292 :
293 700478 : if (result == EXCEPTION && !isolate->has_pending_exception()) {
294 : // We detected a stack overflow (on the backtrack stack) in RegExp code,
295 : // but haven't created the exception yet.
296 17 : isolate->StackOverflow();
297 : }
298 700478 : return static_cast<Result>(result);
299 : }
300 :
301 :
// Lookup table with one entry per character code 0x00 - 0xff. An entry is
// 0xff for the characters '0'-'9', 'A'-'Z', 'a'-'z' and '_', and 0x00 for
// every other code. Rows hold eight consecutive codes each.
302 : const byte NativeRegExpMacroAssembler::word_character_map[] = {
303 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
304 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
305 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
306 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
307 :
308 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
309 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
310 : 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // '0' - '7'
311 : 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9'
312 :
313 : 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // '@' (unset), 'A' - 'G'
314 : 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'H' - 'O'
315 : 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'P' - 'W'
316 : 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu, // 'X' - 'Z', '_'
317 :
318 : 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // '`' (unset), 'a' - 'g'
319 : 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o'
320 : 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w'
321 : 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
322 : // Latin-1 range (0x80 - 0xff): no entry is set.
323 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
324 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
325 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
326 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
327 :
328 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
329 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
330 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
331 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
332 :
333 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
334 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
335 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
336 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
337 :
338 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
339 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
340 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
341 : 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
342 : };
343 :
344 :
345 505 : Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
346 : Address* stack_base,
347 505 : Isolate* isolate) {
348 505 : RegExpStack* regexp_stack = isolate->regexp_stack();
349 : size_t size = regexp_stack->stack_capacity();
350 : Address old_stack_base = regexp_stack->stack_base();
351 : DCHECK(old_stack_base == *stack_base);
352 : DCHECK(stack_pointer <= old_stack_base);
353 : DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
354 505 : Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
355 505 : if (new_stack_base == NULL) {
356 : return NULL;
357 : }
358 498 : *stack_base = new_stack_base;
359 498 : intptr_t stack_content_size = old_stack_base - stack_pointer;
360 498 : return new_stack_base - stack_content_size;
361 : }
362 :
363 : #endif // V8_INTERPRETED_REGEXP
364 :
365 : } // namespace internal
366 : } // namespace v8
|