LCOV - code coverage report
Current view: top level - src - unicode-inl.h (source / functions) Hit Total Coverage
Test: app.info Lines: 66 66 100.0 %
Date: 2019-03-21 Functions: 8 8 100.0 %

          Line data    Source code
       1             : // Copyright 2007-2010 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #ifndef V8_UNICODE_INL_H_
       6             : #define V8_UNICODE_INL_H_
       7             : 
       8             : #include "src/unicode.h"
       9             : #include "src/base/logging.h"
      10             : #include "src/utils.h"
      11             : 
      12             : namespace unibrow {
      13             : 
      14             : template <class T, int s> bool Predicate<T, s>::get(uchar code_point) {
      15             :   CacheEntry entry = entries_[code_point & kMask];
      16             :   if (entry.code_point() == code_point) return entry.value();
      17             :   return CalculateValue(code_point);
      18             : }
      19             : 
      20             : template <class T, int s> bool Predicate<T, s>::CalculateValue(
      21             :     uchar code_point) {
      22             :   bool result = T::Is(code_point);
      23             :   entries_[code_point & kMask] = CacheEntry(code_point, result);
      24             :   return result;
      25             : }
      26             : 
      27    16725399 : template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n,
      28             :     uchar* result) {
      29    16725399 :   CacheEntry entry = entries_[c & kMask];
      30    16725399 :   if (entry.code_point_ == c) {
      31      960219 :     if (entry.offset_ == 0) {
      32             :       return 0;
      33             :     } else {
      34       59717 :       result[0] = c + entry.offset_;
      35       59717 :       return 1;
      36             :     }
      37             :   } else {
      38    15765180 :     return CalculateValue(c, n, result);
      39             :   }
      40             : }
      41             : 
      42    15765180 : template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
      43             :     uchar* result) {
      44    15765180 :   bool allow_caching = true;
      45    15765180 :   int length = T::Convert(c, n, result, &allow_caching);
      46    15765180 :   if (allow_caching) {
      47    15547970 :     if (length == 1) {
      48        7351 :       entries_[c & kMask] = CacheEntry(c, result[0] - c);
      49        7351 :       return 1;
      50             :     } else {
      51    15540619 :       entries_[c & kMask] = CacheEntry(c, 0);
      52    15540619 :       return 0;
      53             :     }
      54             :   } else {
      55             :     return length;
      56             :   }
      57             : }
      58             : 
      59             : // Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they
      60             : // stream in. This **must** be followed by a call to ValueOfIncrementalFinish
      61             : // when the stream is complete, to ensure incomplete sequences are handled.
      62   206563497 : uchar Utf8::ValueOfIncremental(const byte** cursor, State* state,
      63             :                                Utf8IncrementalBuffer* buffer) {
      64             :   DCHECK_NOT_NULL(buffer);
      65   206563497 :   State old_state = *state;
      66   206563497 :   byte next = **cursor;
      67   206563497 :   *cursor += 1;
      68             : 
      69   206563497 :   if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) {
      70             :     DCHECK_EQ(0u, *buffer);
      71   203041076 :     return static_cast<uchar>(next);
      72             :   }
      73             : 
      74             :   // So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation
      75             :   // char in that sequence.
      76             :   Utf8DfaDecoder::Decode(next, state, buffer);
      77             : 
      78     3522421 :   switch (*state) {
      79             :     case State::kAccept: {
      80             :       uchar t = *buffer;
      81     1330102 :       *buffer = 0;
      82     1330102 :       return t;
      83             :     }
      84             : 
      85             :     case State::kReject:
      86        3139 :       *state = State::kAccept;
      87        3139 :       *buffer = 0;
      88             : 
      89             :       // If we hit a bad byte, we need to determine if we were trying to start
      90             :       // a sequence or continue one. If we were trying to start a sequence,
      91             :       // that means it's just an invalid lead byte and we need to continue to
      92             :       // the next (which we already did above). If we were already in a
      93             :       // sequence, we need to reprocess this same byte after resetting to the
      94             :       // initial state.
      95        3139 :       if (old_state != State::kAccept) {
      96             :         // We were trying to continue a sequence, so let's reprocess this byte
      97             :         // next time.
      98         869 :         *cursor -= 1;
      99             :       }
     100             :       return kBadChar;
     101             : 
     102             :     default:
     103             :       return kIncomplete;
     104             :   }
     105             : }
     106             : 
     107             : unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
     108             :   static const int kMask = ~(1 << 6);
     109        6378 :   if (c <= kMaxOneByteChar) {
     110        6026 :     str[0] = c;
     111             :     return 1;
     112             :   }
     113         352 :   str[0] = 0xC0 | (c >> 6);
     114         352 :   str[1] = 0x80 | (c & kMask);
     115             :   return 2;
     116             : }
     117             : 
     118             : // Encode encodes the UTF-16 code units c and previous into the given str
     119             : // buffer, and combines surrogate code units into single code points. If
     120             : // replace_invalid is set to true, orphan surrogate code units will be replaced
     121             : // with kBadChar.
     122   183084906 : unsigned Utf8::Encode(char* str,
     123             :                       uchar c,
     124             :                       int previous,
     125             :                       bool replace_invalid) {
     126             :   static const int kMask = ~(1 << 6);
     127   183084906 :   if (c <= kMaxOneByteChar) {
     128    32706773 :     str[0] = c;
     129    32706773 :     return 1;
     130   150378133 :   } else if (c <= kMaxTwoByteChar) {
     131    45124746 :     str[0] = 0xC0 | (c >> 6);
     132    45124746 :     str[1] = 0x80 | (c & kMask);
     133    45124746 :     return 2;
     134   105253387 :   } else if (c <= kMaxThreeByteChar) {
     135             :     DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter));
     136   180505492 :     if (Utf16::IsSurrogatePair(previous, c)) {
     137             :       const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
     138    30000654 :       return Encode(str - kUnmatchedSize,
     139    15000327 :                     Utf16::CombineSurrogatePair(previous, c),
     140             :                     Utf16::kNoPreviousCharacter,
     141    15000327 :                     replace_invalid) - kUnmatchedSize;
     142   150504872 :     } else if (replace_invalid &&
     143          12 :                (Utf16::IsLeadSurrogate(c) ||
     144             :                Utf16::IsTrailSurrogate(c))) {
     145             :       c = kBadChar;
     146             :     }
     147    75252419 :     str[0] = 0xE0 | (c >> 12);
     148    75252419 :     str[1] = 0x80 | ((c >> 6) & kMask);
     149    75252419 :     str[2] = 0x80 | (c & kMask);
     150    75252419 :     return 3;
     151             :   } else {
     152    15000641 :     str[0] = 0xF0 | (c >> 18);
     153    15000641 :     str[1] = 0x80 | ((c >> 12) & kMask);
     154    15000641 :     str[2] = 0x80 | ((c >> 6) & kMask);
     155    15000641 :     str[3] = 0x80 | (c & kMask);
     156    15000641 :     return 4;
     157             :   }
     158             : }
     159             : 
     160             : 
     161             : uchar Utf8::ValueOf(const byte* bytes, size_t length, size_t* cursor) {
     162   207546607 :   if (length <= 0) return kBadChar;
     163   181330169 :   byte first = bytes[0];
     164             :   // Characters between 0000 and 007F are encoded as a single character
     165   181330169 :   if (V8_LIKELY(first <= kMaxOneByteChar)) {
     166   181273010 :     *cursor += 1;
     167   181273010 :     return first;
     168             :   }
     169       57159 :   return CalculateValue(bytes, length, cursor);
     170             : }
     171             : 
     172             : unsigned Utf8::Length(uchar c, int previous) {
     173    30102856 :   if (c <= kMaxOneByteChar) {
     174             :     return 1;
     175      372736 :   } else if (c <= kMaxTwoByteChar) {
     176             :     return 2;
     177             :   } else if (c <= kMaxThreeByteChar) {
     178             :     DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter));
     179      248470 :     if (Utf16::IsSurrogatePair(previous, c)) {
     180             :       return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
     181             :     }
     182             :     return 3;
     183             :   } else {
     184             :     return 4;
     185             :   }
     186             : }
     187             : 
     188             : bool Utf8::IsValidCharacter(uchar c) {
     189             :   return c < 0xD800u || (c >= 0xE000u && c < 0xFDD0u) ||
     190             :          (c > 0xFDEFu && c <= 0x10FFFFu && (c & 0xFFFEu) != 0xFFFEu &&
     191             :           c != kBadChar);
     192             : }
     193             : 
     194             : }  // namespace unibrow
     195             : 
     196             : #endif  // V8_UNICODE_INL_H_

Generated by: LCOV version 1.10