LCOV - code coverage report
Current view: top level - src/regexp - regexp-macro-assembler.cc (source / functions) Hit Total Coverage
Test: app.info Lines: 104 115 90.4 %
Date: 2019-01-20 Functions: 11 14 78.6 %

          Line data    Source code
       1             : // Copyright 2012 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #include "src/regexp/regexp-macro-assembler.h"
       6             : 
       7             : #include "src/assembler.h"
       8             : #include "src/isolate-inl.h"
       9             : #include "src/regexp/regexp-stack.h"
      10             : #include "src/simulator.h"
      11             : #include "src/unicode-inl.h"
      12             : 
      13             : #ifdef V8_INTL_SUPPORT
      14             : #include "unicode/uchar.h"
      15             : #endif  // V8_INTL_SUPPORT
      16             : 
      17             : namespace v8 {
      18             : namespace internal {
      19             : 
      // Constructs the base macro assembler. Stores `isolate` and `zone` in the
      // corresponding members and starts with slow_safe_compiler_ == false and
      // global_mode_ == NOT_GLOBAL.
      20           0 : RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
      21             :     : slow_safe_compiler_(false),
      22             :       global_mode_(NOT_GLOBAL),
      23             :       isolate_(isolate),
      24       85333 :       zone_(zone) {}
      25             : 
      // Defaulted destructor: this class performs no explicit cleanup here.
      26             : RegExpMacroAssembler::~RegExpMacroAssembler() = default;
      27             : 
      // Compares two UC16 (two-byte) substrings of equal length, ignoring case.
      //
      // byte_offset1, byte_offset2: addresses of the first code unit of each
      //     substring; both are reinterpreted as uc16*.
      // byte_length: length of each substring in bytes; must be even (DCHECKed).
      // isolate: when built with V8_INTL_SUPPORT, nullptr selects the ICU
      //     u_foldCase path, which also combines surrogate pairs into code
      //     points. Otherwise it must be non-null and supplies the
      //     Ecma262Canonicalize mapping used for the comparison.
      //     NOTE(review): presumably callers pass nullptr for unicode-mode
      //     regexps - confirm against the call sites.
      // Returns 1 if the substrings are case-insensitively equal, 0 otherwise.
      28      416053 : int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
      29             :                                                      Address byte_offset2,
      30             :                                                      size_t byte_length,
      31             :                                                      Isolate* isolate) {
      32             :   // This function is not allowed to cause a garbage collection.
      33             :   // A GC might move the calling generated code and invalidate the
      34             :   // return address on the stack.
      35             :   DCHECK_EQ(0, byte_length % 2);
      36      416053 :   uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
      37      416053 :   uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
      // Convert the byte count into a count of 16-bit code units.
      38      416053 :   size_t length = byte_length >> 1;
      39             : 
      40             : #ifdef V8_INTL_SUPPORT
      // A null isolate selects the ICU case-folding path; this path returns
      // before the canonicalize-based loop further down.
      41      416053 :   if (isolate == nullptr) {
      42          35 :     for (size_t i = 0; i < length; i++) {
      43          35 :       uc32 c1 = substring1[i];
      44          35 :       uc32 c2 = substring2[i];
      45          35 :       if (unibrow::Utf16::IsLeadSurrogate(c1)) {
      46             :         // Non-BMP characters do not have case-equivalents in the BMP.
      47             :         // Both have to be non-BMP for them to be able to match.
      48           9 :         if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
      // Only combine when a following code unit exists and both sides have a
      // trail surrogate; otherwise the lone lead surrogates are folded as-is.
      49           9 :         if (i + 1 < length) {
      50           9 :           uc16 c1t = substring1[i + 1];
      51           9 :           uc16 c2t = substring2[i + 1];
      52          18 :           if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
      53             :               unibrow::Utf16::IsTrailSurrogate(c2t)) {
      54             :             c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
      55             :             c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
      // Skip the trail surrogate that was just consumed.
      56             :             i++;
      57             :           }
      58             :         }
      59             :       }
      60          35 :       c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
      61          35 :       c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
      62          35 :       if (c1 != c2) return 0;
      63             :     }
      64             :     return 1;
      65             :   }
      66             : #endif  // V8_INTL_SUPPORT
      67             :   DCHECK_NOT_NULL(isolate);
      68             :   unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
      69      416018 :       isolate->regexp_macro_assembler_canonicalize();
      70      440542 :   for (size_t i = 0; i < length; i++) {
      71      416090 :     unibrow::uchar c1 = substring1[i];
      72      416090 :     unibrow::uchar c2 = substring2[i];
      // Identical code units need no canonicalization.
      73      416090 :     if (c1 != c2) {
      // s1 is pre-seeded with c1, so it still holds c1 if get() writes nothing.
      74      391687 :       unibrow::uchar s1[1] = {c1};
      75      391687 :       canonicalize->get(c1, '\0', s1);
      // If canonical(c1) already equals c2 they match; otherwise canonicalize
      // c2 as well and compare the two canonical forms.
      76      391687 :       if (s1[0] != c2) {
      77      391660 :         unibrow::uchar s2[1] = {c2};
      78      391660 :         canonicalize->get(c2, '\0', s2);
      79      391660 :         if (s1[0] != s2[0]) {
      80      391566 :           return 0;
      81             :         }
      82             :       }
      83             :     }
      84             :   }
      85             :   return 1;
      86             : }
      87             : 
      88             : 
      // Emits code that verifies the position at `cp_offset` is not in the
      // middle of a surrogate pair. Control reaches `on_failure` only when the
      // character at cp_offset is a trail surrogate and the character at
      // cp_offset - 1 is a lead surrogate; every other case falls through to
      // the locally bound `ok` label.
      // NOTE(review): presumably the label passed to LoadCurrentCharacter is
      // taken when the load position is outside the input - confirm against
      // the LoadCurrentCharacter declaration.
      89          85 : void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
      90             :                                                    Label* on_failure) {
      91          85 :   Label ok;
      92             :   // Check that current character is not a trail surrogate.
      93          85 :   LoadCurrentCharacter(cp_offset, &ok);
      94          85 :   CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
      95             :   // Check that previous character is not a lead surrogate.
      96          85 :   LoadCurrentCharacter(cp_offset - 1, &ok);
      97          85 :   CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
      98          85 :   Bind(&ok);
      99          85 : }
     100             : 
     // Base implementation of the position check: simply delegates to
     // LoadCurrentCharacter(cp_offset, on_outside_input, true).
     // NOTE(review): the meaning of the trailing `true` argument is defined by
     // LoadCurrentCharacter's declaration, which is not visible here.
     101           0 : void RegExpMacroAssembler::CheckPosition(int cp_offset,
     102             :                                          Label* on_outside_input) {
     103           0 :   LoadCurrentCharacter(cp_offset, on_outside_input, true);
     104           0 : }
     105             : 
     // Base implementation: ignores `type` and `on_no_match`, emits nothing,
     // and always returns false.
     // NOTE(review): presumably false tells the caller that no specialized
     // check was generated and it must fall back to a generic one - confirm
     // against the header documentation.
     106           0 : bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
     107             :                                                       Label* on_no_match) {
     108           0 :   return false;
     109             : }
     110             : 
     111             : #ifndef V8_INTERPRETED_REGEXP  // Avoid unused code, e.g., on ARM.
     112             : 
     // Forwards `isolate` and `zone` to the RegExpMacroAssembler base
     // constructor; initializes no additional members here.
     113       85333 : NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
     114             :                                                        Zone* zone)
     115       85333 :     : RegExpMacroAssembler(isolate, zone) {}
     116             : 
     // Defaulted destructor: this class performs no explicit cleanup here.
     117             : NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default;
     118             : 
     // Returns true only when FLAG_enable_regexp_unaligned_accesses is set and
     // this assembler is not in slow-safe mode (slow_safe() is false).
     119      213650 : bool NativeRegExpMacroAssembler::CanReadUnaligned() {
     120      213650 :   return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
     121             : }
     122             : 
     // Returns a byte pointer to the character at `start_index` inside the
     // character storage that backs `subject`.
     //
     // subject: the string to address. One level of cons (first()) or sliced
     //     (parent(), with offset() added to start_index) wrapping is removed,
     //     followed by one level of thin-string indirection (actual()).
     // start_index: character index, not a byte offset - it is added to the
     //     typed GetChars() pointer before the cast to const byte*.
     // no_gc: passed to GetChars() for the sequential string representations.
     //
     // After unwrapping, the string is expected to be one of: sequential
     // one-byte, sequential two-byte, external one-byte, or external two-byte
     // (the last case is only DCHECKed).
     123      118258 : const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
     124             :     String subject, int start_index, const DisallowHeapAllocation& no_gc) {
     125      118258 :   if (subject->IsConsString()) {
     126           0 :     subject = ConsString::cast(subject)->first();
     127      118258 :   } else if (subject->IsSlicedString()) {
     128           2 :     start_index += SlicedString::cast(subject)->offset();
     129           2 :     subject = SlicedString::cast(subject)->parent();
     130             :   }
     131      118258 :   if (subject->IsThinString()) {
     132           0 :     subject = ThinString::cast(subject)->actual();
     133             :   }
     134             :   DCHECK_LE(0, start_index);
     135             :   DCHECK_LE(start_index, subject->length());
     136      118258 :   if (subject->IsSeqOneByteString()) {
     137             :     return reinterpret_cast<const byte*>(
     138       46941 :         SeqOneByteString::cast(subject)->GetChars(no_gc) + start_index);
     139       71317 :   } else if (subject->IsSeqTwoByteString()) {
     140             :     return reinterpret_cast<const byte*>(
     141       71249 :         SeqTwoByteString::cast(subject)->GetChars(no_gc) + start_index);
     142          68 :   } else if (subject->IsExternalOneByteString()) {
     143             :     return reinterpret_cast<const byte*>(
     144          23 :         ExternalOneByteString::cast(subject)->GetChars() + start_index);
     145             :   } else {
     146             :     DCHECK(subject->IsExternalTwoByteString());
     147             :     return reinterpret_cast<const byte*>(
     148          45 :         ExternalTwoByteString::cast(subject)->GetChars() + start_index);
     149             :   }
     150             : }
     151             : 
     // Decides how regexp execution should proceed after a stack-limit check
     // and repairs any raw pointers that may have been invalidated meanwhile.
     //
     // isolate: isolate used for handles, the stack-limit check and interrupts.
     // start_index: character index used to recompute *input_start.
     // is_direct_call: true when the regexp code was called directly from
     //     JavaScript; in that case no interrupt handling is done here.
     // return_address: in/out; adjusted by the code object's address delta if
     //     the code object differs from `re_code` afterwards.
     // re_code: the regexp code object; *return_address must lie within its
     //     instruction range (DCHECKed).
     // subject: in/out; address of the subject string, refreshed from the
     //     handle when execution continues.
     // input_start, input_end: in/out; recomputed for the (possibly moved)
     //     subject, preserving the original byte length between them.
     //
     // Returns 0 to continue matching, RETRY to restart matching from scratch,
     // or EXCEPTION when a stack overflow or an exception from interrupt
     // handling occurred.
     152         400 : int NativeRegExpMacroAssembler::CheckStackGuardState(
     153             :     Isolate* isolate, int start_index, bool is_direct_call,
     154             :     Address* return_address, Code re_code, Address* subject,
     155             :     const byte** input_start, const byte** input_end) {
     156             :   AllowHeapAllocation allow_allocation;
     157             :   DCHECK(re_code->raw_instruction_start() <= *return_address);
     158             :   DCHECK(*return_address <= re_code->raw_instruction_end());
     159             :   int return_value = 0;
     160             :   // Prepare for possible GC.
     161             :   HandleScope handles(isolate);
     162             :   Handle<Code> code_handle(re_code, isolate);
     163         400 :   Handle<String> subject_handle(String::cast(Object(*subject)), isolate);
     // Remember the representation now so a change can be detected below.
     164         400 :   bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle);
     165             : 
     166             :   StackLimitCheck check(isolate);
     167         400 :   bool js_has_overflowed = check.JsHasOverflowed();
     168             : 
     169         400 :   if (is_direct_call) {
     170             :     // Direct calls from JavaScript can be interrupted in two ways:
     171             :     // 1. A real stack overflow, in which case we let the caller throw the
     172             :     //    exception.
     173             :     // 2. The stack guard was used to interrupt execution for another purpose,
     174             :     //    forcing the call through the runtime system.
     175         141 :     return_value = js_has_overflowed ? EXCEPTION : RETRY;
     176         259 :   } else if (js_has_overflowed) {
     177          90 :     isolate->StackOverflow();
     178             :     return_value = EXCEPTION;
     179             :   } else {
     180         169 :     Object result = isolate->stack_guard()->HandleInterrupts();
     181         169 :     if (result->IsException(isolate)) return_value = EXCEPTION;
     182             :   }
     183             : 
     // From here on only raw-pointer fix-ups happen; allocation is disallowed.
     184             :   DisallowHeapAllocation no_gc;
     185             : 
     186         400 :   if (*code_handle != re_code) {  // Return address no longer valid
     187           0 :     intptr_t delta = code_handle->address() - re_code->address();
     188             :     // Overwrite the return address on the stack.
     189           0 :     *return_address += delta;
     190             :   }
     191             : 
     192             :   // If we continue, we need to update the subject string addresses.
     193         400 :   if (return_value == 0) {
     194             :     // String encoding might have changed.
     195         159 :     if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
     196             :         is_one_byte) {
     197             :       // If we changed between an LATIN1 and an UC16 string, the specialized
     198             :       // code cannot be used, and we need to restart regexp matching from
     199             :       // scratch (including, potentially, compiling a new version of the code).
     200             :       return_value = RETRY;
     201             :     } else {
     202         159 :       *subject = subject_handle->ptr();
     // Capture the length before *input_start is overwritten.
     203         159 :       intptr_t byte_length = *input_end - *input_start;
     204             :       *input_start =
     205         159 :           StringCharacterPosition(*subject_handle, start_index, no_gc);
     206         159 :       *input_end = *input_start + byte_length;
     207             :     }
     208             :   }
     209         400 :   return return_value;
     210             : }
     211             : 
     // Runs the compiled `regexp_code` against `subject`, starting at character
     // index `previous_index`, and returns the Result produced by Execute().
     //
     // regexp_code: handle to the generated regexp code.
     // subject: flat subject string (DCHECKed); may be a cons, sliced or thin
     //     wrapper, which is unwrapped locally to locate the character data.
     // offsets_vector, offsets_vector_length: output buffer forwarded
     //     unchanged to Execute().
     // previous_index: start position; must be within [0, subject->length()]
     //     (DCHECKed).
     // isolate: forwarded to Execute().
     //
     // Note that Execute() is given *subject (the original string), while
     // input_start/input_end are computed from the unwrapped string.
     212      118099 : NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
     213             :     Handle<Code> regexp_code,
     214             :     Handle<String> subject,
     215             :     int* offsets_vector,
     216             :     int offsets_vector_length,
     217             :     int previous_index,
     218             :     Isolate* isolate) {
     219             : 
     220             :   DCHECK(subject->IsFlat());
     221             :   DCHECK_LE(0, previous_index);
     222             :   DCHECK_LE(previous_index, subject->length());
     223             : 
     224             :   // No allocations before calling the regexp, but we can't use
     225             :   // DisallowHeapAllocation, since regexps might be preempted, and another
     226             :   // thread might do allocation anyway.
     227             : 
     228      118099 :   String subject_ptr = *subject;
     229             :   // Character offsets into string.
     230             :   int start_offset = previous_index;
     // Number of characters from the start position to the end of the subject.
     231      118099 :   int char_length = subject_ptr->length() - start_offset;
     232             :   int slice_offset = 0;
     233             : 
     234             :   // The string has been flattened, so if it is a cons string it contains the
     235             :   // full string in the first part.
     236      118099 :   if (StringShape(subject_ptr).IsCons()) {
     237             :     DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
     238           0 :     subject_ptr = ConsString::cast(subject_ptr)->first();
     239      118099 :   } else if (StringShape(subject_ptr).IsSliced()) {
     240         754 :     SlicedString slice = SlicedString::cast(subject_ptr);
     241         754 :     subject_ptr = slice->parent();
     242             :     slice_offset = slice->offset();
     243             :   }
     244      118099 :   if (StringShape(subject_ptr).IsThin()) {
     245           5 :     subject_ptr = ThinString::cast(subject_ptr)->actual();
     246             :   }
     247             :   // Ensure that an underlying string has the same representation.
     248      118099 :   bool is_one_byte = subject_ptr->IsOneByteRepresentation();
     249             :   DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
     250             :   // String is now either Sequential or External
     // Shift that converts a character count to bytes: 0 for one-byte
     // strings, 1 for two-byte strings.
     251      118099 :   int char_size_shift = is_one_byte ? 0 : 1;
     252             : 
     253             :   DisallowHeapAllocation no_gc;
     254             :   const byte* input_start =
     255      118099 :       StringCharacterPosition(subject_ptr, start_offset + slice_offset, no_gc);
     256      118099 :   int byte_length = char_length << char_size_shift;
     257      118099 :   const byte* input_end = input_start + byte_length;
     258             :   Result res = Execute(*regexp_code,
     259             :                        *subject,
     260             :                        start_offset,
     261             :                        input_start,
     262             :                        input_end,
     263             :                        offsets_vector,
     264             :                        offsets_vector_length,
     265      118099 :                        isolate);
     266      118099 :   return res;
     267             : }
     268             : 
     // Invokes the generated regexp `code` with the given input range and
     // returns its integer result converted to a Result.
     //
     // code: generated code, called through GeneratedCode<RegexpMatcherSig>.
     // input: subject string; its raw address (input.ptr()) is passed to the
     //     generated code.
     // start_offset: start position passed through to the generated code.
     // input_start, input_end: byte pointers delimiting the text to match.
     // output, output_size: output buffer passed through to the generated code.
     // isolate: used for the regexp stack scope and exception state.
     //
     // The generated code is always called with direct_call == 0. If it
     // reports EXCEPTION but no exception is pending on the isolate, a stack
     // overflow is raised here via isolate->StackOverflow().
     269      118169 : NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
     270             :     Code code,
     271             :     String input,  // This needs to be the unpacked (sliced, cons) string.
     272             :     int start_offset, const byte* input_start, const byte* input_end,
     273             :     int* output, int output_size, Isolate* isolate) {
     274             :   // Ensure that the minimum stack has been allocated.
     275      118169 :   RegExpStackScope stack_scope(isolate);
     276      118169 :   Address stack_base = stack_scope.stack()->stack_base();
     277             : 
     278             :   int direct_call = 0;
     279             : 
     280             :   using RegexpMatcherSig = int(
     281             :       Address input_string, int start_offset,  // NOLINT(readability/casting)
     282             :       const byte* input_start, const byte* input_end, int* output,
     283             :       int output_size, Address stack_base, int direct_call, Isolate* isolate);
     284             : 
     285             :   auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
     286             :   int result = fn.Call(input.ptr(), start_offset, input_start, input_end,
     287             :                        output, output_size, stack_base, direct_call, isolate);
     288             :   DCHECK(result >= RETRY);
     289             : 
     290      118169 :   if (result == EXCEPTION && !isolate->has_pending_exception()) {
     291             :     // We detected a stack overflow (on the backtrack stack) in RegExp code,
     292             :     // but haven't created the exception yet. Additionally, we allow heap
     293             :     // allocation because even though it invalidates {input_start} and
     294             :     // {input_end}, we are about to return anyway.
     295             :     AllowHeapAllocation allow_allocation;
     296           6 :     isolate->StackOverflow();
     297             :   }
     298      118169 :   return static_cast<Result>(result);
     299             : }
     300             : 
     301             : // clang-format off
     // 256-entry lookup table indexed by character code (32 rows of 8).
     // An entry is 0xFF for the word characters '0'-'9', 'A'-'Z', '_' and
     // 'a'-'z', and 0x00 for every other code, including the whole upper
     // (Latin-1, 0x80-0xFF) half.
     302             : const byte NativeRegExpMacroAssembler::word_character_map[] = {
     303             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     304             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     305             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     306             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     307             : 
     308             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     309             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     310             :     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // '0' - '7'
     311             :     0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'
     312             : 
     313             :     0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'A' - 'G'
     314             :     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'H' - 'O'
     315             :     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'P' - 'W'
     316             :     0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu,  // 'X' - 'Z', '_'
     317             : 
     318             :     0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'a' - 'g'
     319             :     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'h' - 'o'
     320             :     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'p' - 'w'
     321             :     0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
     322             :     // Latin-1 range
     323             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     324             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     325             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     326             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     327             : 
     328             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     329             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     330             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     331             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     332             : 
     333             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     334             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     335             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     336             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     337             : 
     338             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     339             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     340             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     341             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     342             : };
     343             : // clang-format on
     344             : 
     // Doubles the capacity of the isolate's regexp stack.
     //
     // stack_pointer: current stack pointer; must be at or below the old stack
     //     base and within the old capacity (DCHECKed).
     // stack_base: in/out; must equal the current stack base on entry
     //     (DCHECKed) and is overwritten with the new base on success.
     // isolate: supplies the RegExpStack to grow.
     //
     // Returns the stack pointer relocated relative to the new base, i.e. the
     // same distance below it as before. Returns kNullAddress, leaving
     // *stack_base untouched, if EnsureCapacity() reports failure.
     345         329 : Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
     346             :                                               Address* stack_base,
     347         329 :                                               Isolate* isolate) {
     348         329 :   RegExpStack* regexp_stack = isolate->regexp_stack();
     349             :   size_t size = regexp_stack->stack_capacity();
     350             :   Address old_stack_base = regexp_stack->stack_base();
     351             :   DCHECK(old_stack_base == *stack_base);
     352             :   DCHECK(stack_pointer <= old_stack_base);
     353             :   DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
     354         329 :   Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
     355         329 :   if (new_stack_base == kNullAddress) {
     356             :     return kNullAddress;
     357             :   }
     358         324 :   *stack_base = new_stack_base;
     // Amount of stack in use, measured downward from the old base.
     359         324 :   intptr_t stack_content_size = old_stack_base - stack_pointer;
     360         324 :   return new_stack_base - stack_content_size;
     361             : }
     362             : 
     363             : #endif  // V8_INTERPRETED_REGEXP
     364             : 
     365             : }  // namespace internal
     366      183867 : }  // namespace v8

Generated by: LCOV version 1.10