LCOV - code coverage report
Current view: top level - src/regexp - regexp-macro-assembler.cc (source / functions) Hit Total Coverage
Test: app.info Lines: 79 81 97.5 %
Date: 2019-04-17 Functions: 13 13 100.0 %

          Line data    Source code
       1             : // Copyright 2012 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #include "src/regexp/regexp-macro-assembler.h"
       6             : 
       7             : #include "src/assembler.h"
       8             : #include "src/isolate-inl.h"
       9             : #include "src/regexp/regexp-stack.h"
      10             : #include "src/simulator.h"
      11             : #include "src/unicode-inl.h"
      12             : 
      13             : #ifdef V8_INTL_SUPPORT
      14             : #include "unicode/uchar.h"
      15             : #include "unicode/unistr.h"
      16             : #endif  // V8_INTL_SUPPORT
      17             : 
      18             : namespace v8 {
      19             : namespace internal {
      20             : 
      21        3275 : RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
      22             :     : slow_safe_compiler_(false),
      23             :       global_mode_(NOT_GLOBAL),
      24             :       isolate_(isolate),
      25       85448 :       zone_(zone) {}
      26             : 
      27             : RegExpMacroAssembler::~RegExpMacroAssembler() = default;
      28             : 
      29      416053 : int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
      30             :                                                      Address byte_offset2,
      31             :                                                      size_t byte_length,
      32             :                                                      Isolate* isolate) {
      33             :   // This function is not allowed to cause a garbage collection.
      34             :   // A GC might move the calling generated code and invalidate the
      35             :   // return address on the stack.
      36             :   DCHECK_EQ(0, byte_length % 2);
      37             : 
      38             : #ifdef V8_INTL_SUPPORT
      39      416053 :   int32_t length = (int32_t)(byte_length >> 1);
      40             :   icu::UnicodeString uni_str_1(reinterpret_cast<const char16_t*>(byte_offset1),
      41      832106 :                                length);
      42     1248159 :   return uni_str_1.caseCompare(reinterpret_cast<const char16_t*>(byte_offset2),
      43      832106 :                                length, U_FOLD_CASE_DEFAULT) == 0;
      44             : #else
      45             :   uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
      46             :   uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
      47             :   size_t length = byte_length >> 1;
      48             :   DCHECK_NOT_NULL(isolate);
      49             :   unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
      50             :       isolate->regexp_macro_assembler_canonicalize();
      51             :   for (size_t i = 0; i < length; i++) {
      52             :     unibrow::uchar c1 = substring1[i];
      53             :     unibrow::uchar c2 = substring2[i];
      54             :     if (c1 != c2) {
      55             :       unibrow::uchar s1[1] = {c1};
      56             :       canonicalize->get(c1, '\0', s1);
      57             :       if (s1[0] != c2) {
      58             :         unibrow::uchar s2[1] = {c2};
      59             :         canonicalize->get(c2, '\0', s2);
      60             :         if (s1[0] != s2[0]) {
      61             :           return 0;
      62             :         }
      63             :       }
      64             :     }
      65             :   }
      66             :   return 1;
      67             : #endif  // V8_INTL_SUPPORT
      68             : }
      69             : 
      70             : 
      71          84 : void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
      72             :                                                    Label* on_failure) {
      73          84 :   Label ok;
      74             :   // Check that current character is not a trail surrogate.
      75          84 :   LoadCurrentCharacter(cp_offset, &ok);
      76          84 :   CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
      77             :   // Check that previous character is not a lead surrogate.
      78          84 :   LoadCurrentCharacter(cp_offset - 1, &ok);
      79          84 :   CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
      80          84 :   Bind(&ok);
      81          84 : }
      82             : 
      83        1288 : void RegExpMacroAssembler::CheckPosition(int cp_offset,
      84             :                                          Label* on_outside_input) {
      85        1288 :   LoadCurrentCharacter(cp_offset, on_outside_input, true);
      86        1288 : }
      87             : 
      88        1927 : bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
      89             :                                                       Label* on_no_match) {
      90        1927 :   return false;
      91             : }
      92             : 
      93       82173 : NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
      94             :                                                        Zone* zone)
      95       82173 :     : RegExpMacroAssembler(isolate, zone) {}
      96             : 
      97             : NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default;
      98             : 
      99      185392 : bool NativeRegExpMacroAssembler::CanReadUnaligned() {
     100      185392 :   return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
     101             : }
     102             : 
     103      111568 : const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
     104             :     String subject, int start_index, const DisallowHeapAllocation& no_gc) {
     105      111568 :   if (subject->IsConsString()) {
     106           0 :     subject = ConsString::cast(subject)->first();
     107      111568 :   } else if (subject->IsSlicedString()) {
     108           2 :     start_index += SlicedString::cast(subject)->offset();
     109           2 :     subject = SlicedString::cast(subject)->parent();
     110             :   }
     111      111568 :   if (subject->IsThinString()) {
     112           0 :     subject = ThinString::cast(subject)->actual();
     113             :   }
     114             :   DCHECK_LE(0, start_index);
     115             :   DCHECK_LE(start_index, subject->length());
     116      111568 :   if (subject->IsSeqOneByteString()) {
     117             :     return reinterpret_cast<const byte*>(
     118       40899 :         SeqOneByteString::cast(subject)->GetChars(no_gc) + start_index);
     119       70669 :   } else if (subject->IsSeqTwoByteString()) {
     120             :     return reinterpret_cast<const byte*>(
     121       70600 :         SeqTwoByteString::cast(subject)->GetChars(no_gc) + start_index);
     122          69 :   } else if (subject->IsExternalOneByteString()) {
     123             :     return reinterpret_cast<const byte*>(
     124          24 :         ExternalOneByteString::cast(subject)->GetChars() + start_index);
     125             :   } else {
     126             :     DCHECK(subject->IsExternalTwoByteString());
     127             :     return reinterpret_cast<const byte*>(
     128          45 :         ExternalTwoByteString::cast(subject)->GetChars() + start_index);
     129             :   }
     130             : }
     131             : 
     132         413 : int NativeRegExpMacroAssembler::CheckStackGuardState(
     133             :     Isolate* isolate, int start_index, bool is_direct_call,
     134             :     Address* return_address, Code re_code, Address* subject,
     135             :     const byte** input_start, const byte** input_end) {
     136             :   AllowHeapAllocation allow_allocation;
     137             :   DCHECK(re_code->raw_instruction_start() <= *return_address);
     138             :   DCHECK(*return_address <= re_code->raw_instruction_end());
     139             :   int return_value = 0;
     140             :   // Prepare for possible GC.
     141             :   HandleScope handles(isolate);
     142             :   Handle<Code> code_handle(re_code, isolate);
     143         413 :   Handle<String> subject_handle(String::cast(Object(*subject)), isolate);
     144         413 :   bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle);
     145             : 
     146             :   StackLimitCheck check(isolate);
     147         413 :   bool js_has_overflowed = check.JsHasOverflowed();
     148             : 
     149         413 :   if (is_direct_call) {
     150             :     // Direct calls from JavaScript can be interrupted in two ways:
     151             :     // 1. A real stack overflow, in which case we let the caller throw the
     152             :     //    exception.
     153             :     // 2. The stack guard was used to interrupt execution for another purpose,
     154             :     //    forcing the call through the runtime system.
     155         158 :     return_value = js_has_overflowed ? EXCEPTION : RETRY;
     156         255 :   } else if (js_has_overflowed) {
     157          80 :     isolate->StackOverflow();
     158             :     return_value = EXCEPTION;
     159             :   } else {
     160         175 :     Object result = isolate->stack_guard()->HandleInterrupts();
     161         175 :     if (result->IsException(isolate)) return_value = EXCEPTION;
     162             :   }
     163             : 
     164             :   DisallowHeapAllocation no_gc;
     165             : 
     166         413 :   if (*code_handle != re_code) {  // Return address no longer valid
     167           1 :     intptr_t delta = code_handle->address() - re_code->address();
     168             :     // Overwrite the return address on the stack.
     169           1 :     *return_address += delta;
     170             :   }
     171             : 
     172             :   // If we continue, we need to update the subject string addresses.
     173         413 :   if (return_value == 0) {
     174             :     // String encoding might have changed.
     175         159 :     if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
     176             :         is_one_byte) {
     177             :       // If we changed between an LATIN1 and an UC16 string, the specialized
     178             :       // code cannot be used, and we need to restart regexp matching from
     179             :       // scratch (including, potentially, compiling a new version of the code).
     180             :       return_value = RETRY;
     181             :     } else {
     182         155 :       *subject = subject_handle->ptr();
     183         155 :       intptr_t byte_length = *input_end - *input_start;
     184             :       *input_start =
     185         155 :           StringCharacterPosition(*subject_handle, start_index, no_gc);
     186         155 :       *input_end = *input_start + byte_length;
     187             :     }
     188             :   }
     189         413 :   return return_value;
     190             : }
     191             : 
     192             : // Returns a {Result} sentinel, or the number of successful matches.
     193      111413 : int NativeRegExpMacroAssembler::Match(Handle<Code> regexp_code,
     194             :                                       Handle<String> subject,
     195             :                                       int* offsets_vector,
     196             :                                       int offsets_vector_length,
     197             :                                       int previous_index, Isolate* isolate) {
     198             :   DCHECK(subject->IsFlat());
     199             :   DCHECK_LE(0, previous_index);
     200             :   DCHECK_LE(previous_index, subject->length());
     201             : 
     202             :   // No allocations before calling the regexp, but we can't use
     203             :   // DisallowHeapAllocation, since regexps might be preempted, and another
     204             :   // thread might do allocation anyway.
     205             : 
     206             :   String subject_ptr = *subject;
     207             :   // Character offsets into string.
     208             :   int start_offset = previous_index;
     209      111413 :   int char_length = subject_ptr->length() - start_offset;
     210             :   int slice_offset = 0;
     211             : 
     212             :   // The string has been flattened, so if it is a cons string it contains the
     213             :   // full string in the first part.
     214      111413 :   if (StringShape(subject_ptr).IsCons()) {
     215             :     DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
     216             :     subject_ptr = ConsString::cast(subject_ptr)->first();
     217      111413 :   } else if (StringShape(subject_ptr).IsSliced()) {
     218             :     SlicedString slice = SlicedString::cast(subject_ptr);
     219             :     subject_ptr = slice->parent();
     220             :     slice_offset = slice->offset();
     221             :   }
     222      111413 :   if (StringShape(subject_ptr).IsThin()) {
     223             :     subject_ptr = ThinString::cast(subject_ptr)->actual();
     224             :   }
     225             :   // Ensure that an underlying string has the same representation.
     226             :   bool is_one_byte = subject_ptr->IsOneByteRepresentation();
     227             :   DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
     228             :   // String is now either Sequential or External
     229      111413 :   int char_size_shift = is_one_byte ? 0 : 1;
     230             : 
     231             :   DisallowHeapAllocation no_gc;
     232             :   const byte* input_start =
     233      111413 :       StringCharacterPosition(subject_ptr, start_offset + slice_offset, no_gc);
     234      111413 :   int byte_length = char_length << char_size_shift;
     235      111413 :   const byte* input_end = input_start + byte_length;
     236             :   return Execute(*regexp_code, *subject, start_offset, input_start, input_end,
     237      111413 :                  offsets_vector, offsets_vector_length, isolate);
     238             : }
     239             : 
     240             : // Returns a {Result} sentinel, or the number of successful matches.
     241      111469 : int NativeRegExpMacroAssembler::Execute(
     242             :     Code code,
     243             :     String input,  // This needs to be the unpacked (sliced, cons) string.
     244             :     int start_offset, const byte* input_start, const byte* input_end,
     245             :     int* output, int output_size, Isolate* isolate) {
     246             :   // Ensure that the minimum stack has been allocated.
     247      222938 :   RegExpStackScope stack_scope(isolate);
     248             :   Address stack_base = stack_scope.stack()->stack_base();
     249             : 
     250             :   int direct_call = 0;
     251             : 
     252             :   using RegexpMatcherSig = int(
     253             :       Address input_string, int start_offset,  // NOLINT(readability/casting)
     254             :       const byte* input_start, const byte* input_end, int* output,
     255             :       int output_size, Address stack_base, int direct_call, Isolate* isolate);
     256             : 
     257             :   auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
     258             :   int result =
     259             :       fn.CallIrregexp(input.ptr(), start_offset, input_start, input_end, output,
     260             :                       output_size, stack_base, direct_call, isolate);
     261             :   DCHECK(result >= RETRY);
     262             : 
     263      111569 :   if (result == EXCEPTION && !isolate->has_pending_exception()) {
     264             :     // We detected a stack overflow (on the backtrack stack) in RegExp code,
     265             :     // but haven't created the exception yet. Additionally, we allow heap
     266             :     // allocation because even though it invalidates {input_start} and
     267             :     // {input_end}, we are about to return anyway.
     268             :     AllowHeapAllocation allow_allocation;
     269           4 :     isolate->StackOverflow();
     270             :   }
     271      111469 :   return result;
     272             : }
     273             : 
     274             : // clang-format off
     275             : const byte NativeRegExpMacroAssembler::word_character_map[] = {
     276             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     277             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     278             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     279             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     280             : 
     281             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     282             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     283             :     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // '0' - '7'
     284             :     0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'
     285             : 
     286             :     0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'A' - 'G'
     287             :     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'H' - 'O'
     288             :     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'P' - 'W'
     289             :     0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu,  // 'X' - 'Z', '_'
     290             : 
     291             :     0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'a' - 'g'
     292             :     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'h' - 'o'
     293             :     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'p' - 'w'
     294             :     0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
     295             :     // Latin-1 range
     296             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     297             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     298             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     299             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     300             : 
     301             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     302             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     303             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     304             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     305             : 
     306             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     307             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     308             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     309             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     310             : 
     311             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     312             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     313             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     314             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     315             : };
     316             : // clang-format on
     317             : 
     318         350 : Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
     319             :                                               Address* stack_base,
     320             :                                               Isolate* isolate) {
     321             :   RegExpStack* regexp_stack = isolate->regexp_stack();
     322             :   size_t size = regexp_stack->stack_capacity();
     323             :   Address old_stack_base = regexp_stack->stack_base();
     324             :   DCHECK(old_stack_base == *stack_base);
     325             :   DCHECK(stack_pointer <= old_stack_base);
     326             :   DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
     327         350 :   Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
     328         350 :   if (new_stack_base == kNullAddress) {
     329             :     return kNullAddress;
     330             :   }
     331         346 :   *stack_base = new_stack_base;
     332         346 :   intptr_t stack_content_size = old_stack_base - stack_pointer;
     333         346 :   return new_stack_base - stack_content_size;
     334             : }
     335             : 
     336             : }  // namespace internal
     337      121996 : }  // namespace v8

Generated by: LCOV version 1.10