LCOV - code coverage report
Current view: top level - src - unicode.h (source / functions) Hit Total Coverage
Test: app.info Lines: 10 10 100.0 %
Date: 2019-03-21 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2011 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #ifndef V8_UNICODE_H_
       6             : #define V8_UNICODE_H_
       7             : 
       8             : #include <sys/types.h>
       9             : #include "src/globals.h"
      10             : #include "src/third_party/utf8-decoder/utf8-decoder.h"
      11             : #include "src/utils.h"
      12             : /**
      13             :  * \file
      14             :  * Definitions and convenience functions for working with unicode.
      15             :  */
      16             : 
      17             : namespace unibrow {
      18             : 
      19             : typedef unsigned int uchar;
      20             : typedef unsigned char byte;
      21             : 
      22             : /**
      23             :  * The max length of the result of converting the case of a single
      24             :  * character.
      25             :  */
      26             : const int kMaxMappingSize = 4;
      27             : 
      28             : template <class T, int size = 256>
      29             : class Predicate {
      30             :  public:
      31             :   inline Predicate() = default;
      32             :   inline bool get(uchar c);
      33             : 
      34             :  private:
      35             :   friend class Test;
      36             :   bool CalculateValue(uchar c);
      37             :   class CacheEntry {
      38             :    public:
      39             :     inline CacheEntry()
      40             :         : bit_field_(CodePointField::encode(0) | ValueField::encode(0)) {}
      41             :     inline CacheEntry(uchar code_point, bool value)
      42             :         : bit_field_(
      43             :               CodePointField::encode(CodePointField::kMask & code_point) |
      44             :               ValueField::encode(value)) {
      45             :       DCHECK_IMPLIES((CodePointField::kMask & code_point) != code_point,
      46             :                      code_point == static_cast<uchar>(-1));
      47             :     }
      48             : 
      49             :     uchar code_point() const { return CodePointField::decode(bit_field_); }
      50             :     bool value() const { return ValueField::decode(bit_field_); }
      51             : 
      52             :    private:
      53             :     class CodePointField : public v8::internal::BitField<uchar, 0, 21> {};
      54             :     class ValueField : public v8::internal::BitField<bool, 21, 1> {};
      55             : 
      56             :     uint32_t bit_field_;
      57             :   };
      58             :   static const int kSize = size;
      59             :   static const int kMask = kSize - 1;
      60             :   CacheEntry entries_[kSize];
      61             : };
      62             : 
      63             : 
      64             : // A cache used in case conversion.  It caches the value for characters
      65             : // that either have no mapping or map to a single character independent
      66             : // of context.  Characters that map to more than one character or that
      67             : // map differently depending on context are always looked up.
      68             : template <class T, int size = 256>
      69             : class Mapping {
      70             :  public:
      71    47264621 :   inline Mapping() = default;
      72             :   inline int get(uchar c, uchar n, uchar* result);
      73             :  private:
      74             :   friend class Test;
      75             :   int CalculateValue(uchar c, uchar n, uchar* result);
      76             :   struct CacheEntry {
      77    47199232 :     inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
      78             :     inline CacheEntry(uchar code_point, signed offset)
      79             :       : code_point_(code_point),
      80             :         offset_(offset) { }
      81             :     uchar code_point_;
      82             :     signed offset_;
      83             :     static const int kNoChar = (1 << 21) - 1;
      84             :   };
      85             :   static const int kSize = size;
      86             :   static const int kMask = kSize - 1;
      87             :   CacheEntry entries_[kSize];
      88             : };
      89             : 
      90             : 
      91             : class UnicodeData {
      92             :  private:
      93             :   friend class Test;
      94             :   static int GetByteCount();
      95             :   static const uchar kMaxCodePoint;
      96             : };
      97             : 
      98             : 
      99             : class Utf16 {
     100             :  public:
     101             :   static const int kNoPreviousCharacter = -1;
     102             :   static inline bool IsSurrogatePair(int lead, int trail) {
     103   105541475 :     return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
     104             :   }
     105             :   static inline bool IsLeadSurrogate(int code) {
     106    96229454 :     return (code & 0xfc00) == 0xd800;
     107             :   }
     108             :   static inline bool IsTrailSurrogate(int code) {
     109    15064760 :     return (code & 0xfc00) == 0xdc00;
     110             :   }
     111             : 
     112             :   static inline int CombineSurrogatePair(uchar lead, uchar trail) {
     113    15075059 :     return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
     114             :   }
     115             :   static const uchar kMaxNonSurrogateCharCode = 0xffff;
     116             :   // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
     117             :   // of UTF-8 data.  The special case where the unit is a surrogate
     118             :   // trail produces 1 byte net, because the encoding of the pair is
     119             :   // 4 bytes and the 3 bytes that were used to encode the lead surrogate
     120             :   // can be reclaimed.
     121             :   static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
     122             :   // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.
     123             :   // The illegality stems from the surrogate not being part of a pair.
     124             :   static const int kUtf8BytesToCodeASurrogate = 3;
     125             :   static inline uint16_t LeadSurrogate(uint32_t char_code) {
     126     3240356 :     return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
     127             :   }
     128             :   static inline uint16_t TrailSurrogate(uint32_t char_code) {
     129     3240356 :     return 0xdc00 + (char_code & 0x3ff);
     130             :   }
     131             : };
     132             : 
     133             : class V8_EXPORT_PRIVATE Utf8 {
     134             :  public:
     135             :   using State = Utf8DfaDecoder::State;
     136             : 
     137             :   static inline uchar Length(uchar chr, int previous);
     138             :   static inline unsigned EncodeOneByte(char* out, uint8_t c);
     139             :   static inline unsigned Encode(char* out,
     140             :                                 uchar c,
     141             :                                 int previous,
     142             :                                 bool replace_invalid = false);
     143             :   static uchar CalculateValue(const byte* str, size_t length, size_t* cursor);
     144             : 
     145             :   // The unicode replacement character, used to signal invalid unicode
     146             :   // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding.
     147             :   static const uchar kBadChar = 0xFFFD;
     148             :   static const uchar kBufferEmpty = 0x0;
     149             :   static const uchar kIncomplete = 0xFFFFFFFC;  // any non-valid code point.
     150             :   static const unsigned kMaxEncodedSize   = 4;
     151             :   static const unsigned kMaxOneByteChar   = 0x7f;
     152             :   static const unsigned kMaxTwoByteChar   = 0x7ff;
     153             :   static const unsigned kMaxThreeByteChar = 0xffff;
     154             :   static const unsigned kMaxFourByteChar  = 0x1fffff;
     155             : 
     156             :   // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
     157             :   // that match are coded as a 4 byte UTF-8 sequence.
     158             :   static const unsigned kBytesSavedByCombiningSurrogates = 2;
     159             :   static const unsigned kSizeOfUnmatchedSurrogate = 3;
     160             :   // The maximum size a single UTF-16 code unit may take up when encoded as
     161             :   // UTF-8.
     162             :   static const unsigned kMax16BitCodeUnitSize  = 3;
     163             :   static inline uchar ValueOf(const byte* str, size_t length, size_t* cursor);
     164             : 
     165             :   typedef uint32_t Utf8IncrementalBuffer;
     166             :   static inline uchar ValueOfIncremental(const byte** cursor, State* state,
     167             :                                          Utf8IncrementalBuffer* buffer);
     168             :   static uchar ValueOfIncrementalFinish(State* state);
     169             : 
     170             :   // Excludes non-characters from the set of valid code points.
     171             :   static inline bool IsValidCharacter(uchar c);
     172             : 
     173             :   // Validate if the input has a valid utf-8 encoding. Unlike JS source code
     174             :   // this validation function will accept any unicode code point, including
     175             :   // kBadChar and BOMs.
     176             :   //
     177             :   // This method checks for:
     178             :   // - valid utf-8 encoding (e.g. no over-long encodings),
     179             :   // - absence of surrogates,
     180             :   // - valid code point range.
     181             :   static bool ValidateEncoding(const byte* str, size_t length);
     182             : };
     183             : 
     184             : struct Uppercase {
     185             :   static bool Is(uchar c);
     186             : };
     187             : struct Letter {
     188             :   static bool Is(uchar c);
     189             : };
     190             : #ifndef V8_INTL_SUPPORT
     191             : struct V8_EXPORT_PRIVATE ID_Start {
     192             :   static bool Is(uchar c);
     193             : };
     194             : struct V8_EXPORT_PRIVATE ID_Continue {
     195             :   static bool Is(uchar c);
     196             : };
     197             : struct V8_EXPORT_PRIVATE WhiteSpace {
     198             :   static bool Is(uchar c);
     199             : };
     200             : #endif  // !V8_INTL_SUPPORT
     201             : 
     202             : // LineTerminator:       'JS_Line_Terminator' in point.properties
     203             : // ES#sec-line-terminators lists exactly 4 code points:
     204             : // LF (U+000A), CR (U+000D), LS (U+2028), PS (U+2029)
     205             : V8_INLINE bool IsLineTerminator(uchar c) {
     206   744523429 :   return c == 0x000A || c == 0x000D || c == 0x2028 || c == 0x2029;
     207             : }
     208             : 
     209             : V8_INLINE bool IsStringLiteralLineTerminator(uchar c) {
     210     9168290 :   return c == 0x000A || c == 0x000D;
     211             : }
     212             : 
     213             : #ifndef V8_INTL_SUPPORT
     214             : struct ToLowercase {
     215             :   static const int kMaxWidth = 3;
     216             :   static const bool kIsToLower = true;
     217             :   static int Convert(uchar c,
     218             :                      uchar n,
     219             :                      uchar* result,
     220             :                      bool* allow_caching_ptr);
     221             : };
     222             : struct ToUppercase {
     223             :   static const int kMaxWidth = 3;
     224             :   static const bool kIsToLower = false;
     225             :   static int Convert(uchar c,
     226             :                      uchar n,
     227             :                      uchar* result,
     228             :                      bool* allow_caching_ptr);
     229             : };
     230             : #endif
     231             : struct Ecma262Canonicalize {
     232             :   static const int kMaxWidth = 1;
     233             :   static int Convert(uchar c,
     234             :                      uchar n,
     235             :                      uchar* result,
     236             :                      bool* allow_caching_ptr);
     237             : };
     238             : struct Ecma262UnCanonicalize {
     239             :   static const int kMaxWidth = 4;
     240             :   static int Convert(uchar c,
     241             :                      uchar n,
     242             :                      uchar* result,
     243             :                      bool* allow_caching_ptr);
     244             : };
     245             : struct CanonicalizationRange {
     246             :   static const int kMaxWidth = 1;
     247             :   static int Convert(uchar c,
     248             :                      uchar n,
     249             :                      uchar* result,
     250             :                      bool* allow_caching_ptr);
     251             : };
     252             : 
     253             : }  // namespace unibrow
     254             : 
     255             : #endif  // V8_UNICODE_H_

Generated by: LCOV version 1.10