LCOV - code coverage report
Current view: top level - src/regexp - regexp-macro-assembler.cc (source / functions) Hit Total Coverage
Test: app.info Lines: 107 109 98.2 %
Date: 2019-03-21 Functions: 13 13 100.0 %

          Line data    Source code
       1             : // Copyright 2012 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #include "src/regexp/regexp-macro-assembler.h"
       6             : 
       7             : #include "src/assembler.h"
       8             : #include "src/isolate-inl.h"
       9             : #include "src/regexp/regexp-stack.h"
      10             : #include "src/simulator.h"
      11             : #include "src/unicode-inl.h"
      12             : 
      13             : #ifdef V8_INTL_SUPPORT
      14             : #include "unicode/uchar.h"
      15             : #endif  // V8_INTL_SUPPORT
      16             : 
      17             : namespace v8 {
      18             : namespace internal {
      19             : 
      20        3262 : RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
      21             :     : slow_safe_compiler_(false),
      22             :       global_mode_(NOT_GLOBAL),
      23             :       isolate_(isolate),
      24       85361 :       zone_(zone) {}
      25             : 
      26             : RegExpMacroAssembler::~RegExpMacroAssembler() = default;
      27             : 
      28      416053 : int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
      29             :                                                      Address byte_offset2,
      30             :                                                      size_t byte_length,
      31             :                                                      Isolate* isolate) {
      32             :   // This function is not allowed to cause a garbage collection.
      33             :   // A GC might move the calling generated code and invalidate the
      34             :   // return address on the stack.
      35             :   DCHECK_EQ(0, byte_length % 2);
      36      416053 :   uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
      37      416053 :   uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
      38      416053 :   size_t length = byte_length >> 1;
      39             : 
      40             : #ifdef V8_INTL_SUPPORT
      41      416053 :   if (isolate == nullptr) {
      42         105 :     for (size_t i = 0; i < length; i++) {
      43          35 :       uc32 c1 = substring1[i];
      44          35 :       uc32 c2 = substring2[i];
      45          35 :       if (unibrow::Utf16::IsLeadSurrogate(c1)) {
      46             :         // Non-BMP characters do not have case-equivalents in the BMP.
      47             :         // Both have to be non-BMP for them to be able to match.
      48           9 :         if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
      49           9 :         if (i + 1 < length) {
      50           9 :           uc16 c1t = substring1[i + 1];
      51           9 :           uc16 c2t = substring2[i + 1];
      52           9 :           if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
      53             :               unibrow::Utf16::IsTrailSurrogate(c2t)) {
      54             :             c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
      55             :             c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
      56             :             i++;
      57             :           }
      58             :         }
      59             :       }
      60          35 :       c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
      61          35 :       c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
      62          35 :       if (c1 != c2) return 0;
      63             :     }
      64             :     return 1;
      65             :   }
      66             : #endif  // V8_INTL_SUPPORT
      67             :   DCHECK_NOT_NULL(isolate);
      68             :   unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
      69             :       isolate->regexp_macro_assembler_canonicalize();
      70      465066 :   for (size_t i = 0; i < length; i++) {
      71      416090 :     unibrow::uchar c1 = substring1[i];
      72      416090 :     unibrow::uchar c2 = substring2[i];
      73      416090 :     if (c1 != c2) {
      74      391687 :       unibrow::uchar s1[1] = {c1};
      75      391687 :       canonicalize->get(c1, '\0', s1);
      76      391687 :       if (s1[0] != c2) {
      77      391660 :         unibrow::uchar s2[1] = {c2};
      78      391660 :         canonicalize->get(c2, '\0', s2);
      79      391660 :         if (s1[0] != s2[0]) {
      80      391566 :           return 0;
      81             :         }
      82             :       }
      83             :     }
      84             :   }
      85             :   return 1;
      86             : }
      87             : 
      88             : 
      89          84 : void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
      90             :                                                    Label* on_failure) {
      91          84 :   Label ok;
      92             :   // Check that current character is not a trail surrogate.
      93          84 :   LoadCurrentCharacter(cp_offset, &ok);
      94          84 :   CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
      95             :   // Check that previous character is not a lead surrogate.
      96          84 :   LoadCurrentCharacter(cp_offset - 1, &ok);
      97          84 :   CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
      98          84 :   Bind(&ok);
      99          84 : }
     100             : 
     101        1283 : void RegExpMacroAssembler::CheckPosition(int cp_offset,
     102             :                                          Label* on_outside_input) {
     103        1283 :   LoadCurrentCharacter(cp_offset, on_outside_input, true);
     104        1283 : }
     105             : 
     106        1879 : bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
     107             :                                                       Label* on_no_match) {
     108        1879 :   return false;
     109             : }
     110             : 
     111       82099 : NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
     112             :                                                        Zone* zone)
     113       82099 :     : RegExpMacroAssembler(isolate, zone) {}
     114             : 
     115             : NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default;
     116             : 
     117      185250 : bool NativeRegExpMacroAssembler::CanReadUnaligned() {
     118      185250 :   return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
     119             : }
     120             : 
     121      113659 : const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
     122             :     String subject, int start_index, const DisallowHeapAllocation& no_gc) {
     123      113659 :   if (subject->IsConsString()) {
     124           0 :     subject = ConsString::cast(subject)->first();
     125      113659 :   } else if (subject->IsSlicedString()) {
     126           4 :     start_index += SlicedString::cast(subject)->offset();
     127           4 :     subject = SlicedString::cast(subject)->parent();
     128             :   }
     129      113659 :   if (subject->IsThinString()) {
     130           1 :     subject = ThinString::cast(subject)->actual();
     131             :   }
     132             :   DCHECK_LE(0, start_index);
     133             :   DCHECK_LE(start_index, subject->length());
     134      113659 :   if (subject->IsSeqOneByteString()) {
     135             :     return reinterpret_cast<const byte*>(
     136       40804 :         SeqOneByteString::cast(subject)->GetChars(no_gc) + start_index);
     137       72855 :   } else if (subject->IsSeqTwoByteString()) {
     138             :     return reinterpret_cast<const byte*>(
     139       70586 :         SeqTwoByteString::cast(subject)->GetChars(no_gc) + start_index);
     140        2269 :   } else if (subject->IsExternalOneByteString()) {
     141             :     return reinterpret_cast<const byte*>(
     142          96 :         ExternalOneByteString::cast(subject)->GetChars() + start_index);
     143             :   } else {
     144             :     DCHECK(subject->IsExternalTwoByteString());
     145             :     return reinterpret_cast<const byte*>(
     146        2173 :         ExternalTwoByteString::cast(subject)->GetChars() + start_index);
     147             :   }
     148             : }
     149             : 
     150         391 : int NativeRegExpMacroAssembler::CheckStackGuardState(
     151             :     Isolate* isolate, int start_index, bool is_direct_call,
     152             :     Address* return_address, Code re_code, Address* subject,
     153             :     const byte** input_start, const byte** input_end) {
     154             :   AllowHeapAllocation allow_allocation;
     155             :   DCHECK(re_code->raw_instruction_start() <= *return_address);
     156             :   DCHECK(*return_address <= re_code->raw_instruction_end());
     157             :   int return_value = 0;
     158             :   // Prepare for possible GC.
     159             :   HandleScope handles(isolate);
     160             :   Handle<Code> code_handle(re_code, isolate);
     161         391 :   Handle<String> subject_handle(String::cast(Object(*subject)), isolate);
     162         391 :   bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle);
     163             : 
     164             :   StackLimitCheck check(isolate);
     165         391 :   bool js_has_overflowed = check.JsHasOverflowed();
     166             : 
     167         391 :   if (is_direct_call) {
     168             :     // Direct calls from JavaScript can be interrupted in two ways:
     169             :     // 1. A real stack overflow, in which case we let the caller throw the
     170             :     //    exception.
     171             :     // 2. The stack guard was used to interrupt execution for another purpose,
     172             :     //    forcing the call through the runtime system.
     173         143 :     return_value = js_has_overflowed ? EXCEPTION : RETRY;
     174         248 :   } else if (js_has_overflowed) {
     175          92 :     isolate->StackOverflow();
     176             :     return_value = EXCEPTION;
     177             :   } else {
     178         156 :     Object result = isolate->stack_guard()->HandleInterrupts();
     179         156 :     if (result->IsException(isolate)) return_value = EXCEPTION;
     180             :   }
     181             : 
     182             :   DisallowHeapAllocation no_gc;
     183             : 
     184         391 :   if (*code_handle != re_code) {  // Return address no longer valid
     185           1 :     intptr_t delta = code_handle->address() - re_code->address();
     186             :     // Overwrite the return address on the stack.
     187           1 :     *return_address += delta;
     188             :   }
     189             : 
     190             :   // If we continue, we need to update the subject string addresses.
     191         391 :   if (return_value == 0) {
     192             :     // String encoding might have changed.
     193         140 :     if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
     194             :         is_one_byte) {
     195             :       // If we changed between an LATIN1 and an UC16 string, the specialized
     196             :       // code cannot be used, and we need to restart regexp matching from
     197             :       // scratch (including, potentially, compiling a new version of the code).
     198             :       return_value = RETRY;
     199             :     } else {
     200         136 :       *subject = subject_handle->ptr();
     201         136 :       intptr_t byte_length = *input_end - *input_start;
     202             :       *input_start =
     203         136 :           StringCharacterPosition(*subject_handle, start_index, no_gc);
     204         136 :       *input_end = *input_start + byte_length;
     205             :     }
     206             :   }
     207         391 :   return return_value;
     208             : }
     209             : 
     210             : // Returns a {Result} sentinel, or the number of successful matches.
     211      113523 : int NativeRegExpMacroAssembler::Match(Handle<Code> regexp_code,
     212             :                                       Handle<String> subject,
     213             :                                       int* offsets_vector,
     214             :                                       int offsets_vector_length,
     215             :                                       int previous_index, Isolate* isolate) {
     216             :   DCHECK(subject->IsFlat());
     217             :   DCHECK_LE(0, previous_index);
     218             :   DCHECK_LE(previous_index, subject->length());
     219             : 
     220             :   // No allocations before calling the regexp, but we can't use
     221             :   // DisallowHeapAllocation, since regexps might be preempted, and another
     222             :   // thread might do allocation anyway.
     223             : 
     224      113523 :   String subject_ptr = *subject;
     225             :   // Character offsets into string.
     226             :   int start_offset = previous_index;
     227      113523 :   int char_length = subject_ptr->length() - start_offset;
     228             :   int slice_offset = 0;
     229             : 
     230             :   // The string has been flattened, so if it is a cons string it contains the
     231             :   // full string in the first part.
     232      227046 :   if (StringShape(subject_ptr).IsCons()) {
     233             :     DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
     234           0 :     subject_ptr = ConsString::cast(subject_ptr)->first();
     235      227046 :   } else if (StringShape(subject_ptr).IsSliced()) {
     236             :     SlicedString slice = SlicedString::cast(subject_ptr);
     237         808 :     subject_ptr = slice->parent();
     238             :     slice_offset = slice->offset();
     239             :   }
     240      227046 :   if (StringShape(subject_ptr).IsThin()) {
     241           4 :     subject_ptr = ThinString::cast(subject_ptr)->actual();
     242             :   }
     243             :   // Ensure that an underlying string has the same representation.
     244      113523 :   bool is_one_byte = subject_ptr->IsOneByteRepresentation();
     245             :   DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
     246             :   // String is now either Sequential or External
     247      113523 :   int char_size_shift = is_one_byte ? 0 : 1;
     248             : 
     249             :   DisallowHeapAllocation no_gc;
     250             :   const byte* input_start =
     251      113523 :       StringCharacterPosition(subject_ptr, start_offset + slice_offset, no_gc);
     252      113523 :   int byte_length = char_length << char_size_shift;
     253      113523 :   const byte* input_end = input_start + byte_length;
     254             :   return Execute(*regexp_code, *subject, start_offset, input_start, input_end,
     255      113523 :                  offsets_vector, offsets_vector_length, isolate);
     256             : }
     257             : 
     258             : // Returns a {Result} sentinel, or the number of successful matches.
     259      113579 : int NativeRegExpMacroAssembler::Execute(
     260             :     Code code,
     261             :     String input,  // This needs to be the unpacked (sliced, cons) string.
     262             :     int start_offset, const byte* input_start, const byte* input_end,
     263             :     int* output, int output_size, Isolate* isolate) {
     264             :   // Ensure that the minimum stack has been allocated.
     265      227158 :   RegExpStackScope stack_scope(isolate);
     266             :   Address stack_base = stack_scope.stack()->stack_base();
     267             : 
     268             :   int direct_call = 0;
     269             : 
     270             :   using RegexpMatcherSig = int(
     271             :       Address input_string, int start_offset,  // NOLINT(readability/casting)
     272             :       const byte* input_start, const byte* input_end, int* output,
     273             :       int output_size, Address stack_base, int direct_call, Isolate* isolate);
     274             : 
     275             :   auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
     276             :   int result =
     277             :       fn.CallIrregexp(input.ptr(), start_offset, input_start, input_end, output,
     278             :                       output_size, stack_base, direct_call, isolate);
     279             :   DCHECK(result >= RETRY);
     280             : 
     281      113692 :   if (result == EXCEPTION && !isolate->has_pending_exception()) {
     282             :     // We detected a stack overflow (on the backtrack stack) in RegExp code,
     283             :     // but haven't created the exception yet. Additionally, we allow heap
     284             :     // allocation because even though it invalidates {input_start} and
     285             :     // {input_end}, we are about to return anyway.
     286             :     AllowHeapAllocation allow_allocation;
     287           5 :     isolate->StackOverflow();
     288             :   }
     289      113579 :   return result;
     290             : }
     291             : 
     292             : // clang-format off
     293             : const byte NativeRegExpMacroAssembler::word_character_map[] = {
     294             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     295             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     296             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     297             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     298             : 
     299             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     300             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     301             :     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // '0' - '7'
     302             :     0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'
     303             : 
     304             :     0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'A' - 'G'
     305             :     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'H' - 'O'
     306             :     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'P' - 'W'
     307             :     0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu,  // 'X' - 'Z', '_'
     308             : 
     309             :     0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'a' - 'g'
     310             :     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'h' - 'o'
     311             :     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'p' - 'w'
     312             :     0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
     313             :     // Latin-1 range
     314             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     315             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     316             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     317             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     318             : 
     319             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     320             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     321             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     322             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     323             : 
     324             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     325             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     326             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     327             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     328             : 
     329             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     330             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     331             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     332             :     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
     333             : };
     334             : // clang-format on
     335             : 
     336         350 : Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
     337             :                                               Address* stack_base,
     338             :                                               Isolate* isolate) {
     339             :   RegExpStack* regexp_stack = isolate->regexp_stack();
     340             :   size_t size = regexp_stack->stack_capacity();
     341             :   Address old_stack_base = regexp_stack->stack_base();
     342             :   DCHECK(old_stack_base == *stack_base);
     343             :   DCHECK(stack_pointer <= old_stack_base);
     344             :   DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
     345         350 :   Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
     346         350 :   if (new_stack_base == kNullAddress) {
     347             :     return kNullAddress;
     348             :   }
     349         346 :   *stack_base = new_stack_base;
     350         346 :   intptr_t stack_content_size = old_stack_base - stack_pointer;
     351         346 :   return new_stack_base - stack_content_size;
     352             : }
     353             : 
     354             : }  // namespace internal
     355      120216 : }  // namespace v8

Generated by: LCOV version 1.10