LCOV - code coverage report
Current view: top level - src - unicode-inl.h (source / functions) Hit Total Coverage
Test: app.info Lines: 50 50 100.0 %
Date: 2019-04-19 Functions: 2 2 100.0 %

          Line data    Source code
       1             : // Copyright 2007-2010 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #ifndef V8_UNICODE_INL_H_
       6             : #define V8_UNICODE_INL_H_
       7             : 
       8             : #include "src/unicode.h"
       9             : #include "src/base/logging.h"
      10             : #include "src/utils.h"
      11             : 
      12             : namespace unibrow {
      13             : 
      14             : #ifndef V8_INTL_SUPPORT
      15             : template <class T, int s> bool Predicate<T, s>::get(uchar code_point) {
      16             :   CacheEntry entry = entries_[code_point & kMask];
      17             :   if (entry.code_point() == code_point) return entry.value();
      18             :   return CalculateValue(code_point);
      19             : }
      20             : 
      21             : template <class T, int s> bool Predicate<T, s>::CalculateValue(
      22             :     uchar code_point) {
      23             :   bool result = T::Is(code_point);
      24             :   entries_[code_point & kMask] = CacheEntry(code_point, result);
      25             :   return result;
      26             : }
      27             : 
      28             : template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n,
      29             :     uchar* result) {
      30             :   CacheEntry entry = entries_[c & kMask];
      31             :   if (entry.code_point_ == c) {
      32             :     if (entry.offset_ == 0) {
      33             :       return 0;
      34             :     } else {
      35             :       result[0] = c + entry.offset_;
      36             :       return 1;
      37             :     }
      38             :   } else {
      39             :     return CalculateValue(c, n, result);
      40             :   }
      41             : }
      42             : 
      43             : template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
      44             :     uchar* result) {
      45             :   bool allow_caching = true;
      46             :   int length = T::Convert(c, n, result, &allow_caching);
      47             :   if (allow_caching) {
      48             :     if (length == 1) {
      49             :       entries_[c & kMask] = CacheEntry(c, result[0] - c);
      50             :       return 1;
      51             :     } else {
      52             :       entries_[c & kMask] = CacheEntry(c, 0);
      53             :       return 0;
      54             :     }
      55             :   } else {
      56             :     return length;
      57             :   }
      58             : }
      59             : #endif  // !V8_INTL_SUPPORT
      60             : 
      61             : // Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they
      62             : // stream in. This **must** be followed by a call to ValueOfIncrementalFinish
      63             : // when the stream is complete, to ensure incomplete sequences are handled.
      64   209229038 : uchar Utf8::ValueOfIncremental(const byte** cursor, State* state,
      65             :                                Utf8IncrementalBuffer* buffer) {
      66             :   DCHECK_NOT_NULL(buffer);
      67   209229038 :   State old_state = *state;
      68   209229038 :   byte next = **cursor;
      69   209229038 :   *cursor += 1;
      70             : 
      71   209229038 :   if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) {
      72             :     DCHECK_EQ(0u, *buffer);
      73   205706429 :     return static_cast<uchar>(next);
      74             :   }
      75             : 
      76             :   // So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation
      77             :   // char in that sequence.
      78             :   Utf8DfaDecoder::Decode(next, state, buffer);
      79             : 
      80     3522609 :   switch (*state) {
      81             :     case State::kAccept: {
      82             :       uchar t = *buffer;
      83     1330198 :       *buffer = 0;
      84     1330198 :       return t;
      85             :     }
      86             : 
      87             :     case State::kReject:
      88        3139 :       *state = State::kAccept;
      89        3139 :       *buffer = 0;
      90             : 
      91             :       // If we hit a bad byte, we need to determine if we were trying to start
      92             :       // a sequence or continue one. If we were trying to start a sequence,
      93             :       // that means it's just an invalid lead byte and we need to continue to
      94             :       // the next (which we already did above). If we were already in a
      95             :       // sequence, we need to reprocess this same byte after resetting to the
      96             :       // initial state.
      97        3139 :       if (old_state != State::kAccept) {
      98             :         // We were trying to continue a sequence, so let's reprocess this byte
      99             :         // next time.
     100         869 :         *cursor -= 1;
     101             :       }
     102             :       return kBadChar;
     103             : 
     104             :     default:
     105             :       return kIncomplete;
     106             :   }
     107             : }
     108             : 
     109             : unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
     110             :   static const int kMask = ~(1 << 6);
     111        8123 :   if (c <= kMaxOneByteChar) {
     112        7751 :     str[0] = c;
     113             :     return 1;
     114             :   }
     115         372 :   str[0] = 0xC0 | (c >> 6);
     116         372 :   str[1] = 0x80 | (c & kMask);
     117             :   return 2;
     118             : }
     119             : 
     120             : // Encode encodes the UTF-16 code units c and previous into the given str
     121             : // buffer, and combines surrogate code units into single code points. If
     122             : // replace_invalid is set to true, orphan surrogate code units will be replaced
     123             : // with kBadChar.
     124   183218433 : unsigned Utf8::Encode(char* str,
     125             :                       uchar c,
     126             :                       int previous,
     127             :                       bool replace_invalid) {
     128             :   static const int kMask = ~(1 << 6);
     129   183218433 :   if (c <= kMaxOneByteChar) {
     130    32840272 :     str[0] = c;
     131    32840272 :     return 1;
     132   150378161 :   } else if (c <= kMaxTwoByteChar) {
     133    45124756 :     str[0] = 0xC0 | (c >> 6);
     134    45124756 :     str[1] = 0x80 | (c & kMask);
     135    45124756 :     return 2;
     136   105253405 :   } else if (c <= kMaxThreeByteChar) {
     137             :     DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter));
     138   180505528 :     if (Utf16::IsSurrogatePair(previous, c)) {
     139             :       const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
     140    30000654 :       return Encode(str - kUnmatchedSize,
     141    15000327 :                     Utf16::CombineSurrogatePair(previous, c),
     142             :                     Utf16::kNoPreviousCharacter,
     143    15000327 :                     replace_invalid) - kUnmatchedSize;
     144   150504908 :     } else if (replace_invalid &&
     145          12 :                (Utf16::IsLeadSurrogate(c) ||
     146             :                Utf16::IsTrailSurrogate(c))) {
     147             :       c = kBadChar;
     148             :     }
     149    75252437 :     str[0] = 0xE0 | (c >> 12);
     150    75252437 :     str[1] = 0x80 | ((c >> 6) & kMask);
     151    75252437 :     str[2] = 0x80 | (c & kMask);
     152    75252437 :     return 3;
     153             :   } else {
     154    15000641 :     str[0] = 0xF0 | (c >> 18);
     155    15000641 :     str[1] = 0x80 | ((c >> 12) & kMask);
     156    15000641 :     str[2] = 0x80 | ((c >> 6) & kMask);
     157    15000641 :     str[3] = 0x80 | (c & kMask);
     158    15000641 :     return 4;
     159             :   }
     160             : }
     161             : 
     162             : 
     163             : uchar Utf8::ValueOf(const byte* bytes, size_t length, size_t* cursor) {
     164   208026574 :   if (length <= 0) return kBadChar;
     165   181753530 :   byte first = bytes[0];
     166             :   // Characters between 0000 and 007F are encoded as a single character
     167   181753530 :   if (V8_LIKELY(first <= kMaxOneByteChar)) {
     168   181696555 :     *cursor += 1;
     169   181696555 :     return first;
     170             :   }
     171       56975 :   return CalculateValue(bytes, length, cursor);
     172             : }
     173             : 
     174             : unsigned Utf8::Length(uchar c, int previous) {
     175    30235892 :   if (c <= kMaxOneByteChar) {
     176             :     return 1;
     177      372764 :   } else if (c <= kMaxTwoByteChar) {
     178             :     return 2;
     179             :   } else if (c <= kMaxThreeByteChar) {
     180             :     DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter));
     181      248488 :     if (Utf16::IsSurrogatePair(previous, c)) {
     182             :       return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
     183             :     }
     184             :     return 3;
     185             :   } else {
     186             :     return 4;
     187             :   }
     188             : }
     189             : 
     190             : bool Utf8::IsValidCharacter(uchar c) {
     191             :   return c < 0xD800u || (c >= 0xE000u && c < 0xFDD0u) ||
     192             :          (c > 0xFDEFu && c <= 0x10FFFFu && (c & 0xFFFEu) != 0xFFFEu &&
     193             :           c != kBadChar);
     194             : }
     195             : 
     196             : }  // namespace unibrow
     197             : 
     198             : #endif  // V8_UNICODE_INL_H_

Generated by: LCOV version 1.10