LCOV - code coverage report
Current view: top level - src - unicode.h (source / functions) Hit Total Coverage
Test: app.info Lines: 8 8 100.0 %
Date: 2019-04-19 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2011 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #ifndef V8_UNICODE_H_
       6             : #define V8_UNICODE_H_
       7             : 
       8             : #include <sys/types.h>
       9             : #include "src/globals.h"
      10             : #include "src/third_party/utf8-decoder/utf8-decoder.h"
      11             : #include "src/utils.h"
      12             : /**
      13             :  * \file
      14             :  * Definitions and convenience functions for working with unicode.
      15             :  */
      16             : 
      17             : namespace unibrow {
      18             : 
      19             : typedef unsigned int uchar;
      20             : typedef unsigned char byte;
      21             : 
      22             : /**
      23             :  * The max length of the result of converting the case of a single
      24             :  * character.
      25             :  */
      26             : const int kMaxMappingSize = 4;
      27             : 
      28             : #ifndef V8_INTL_SUPPORT
      29             : template <class T, int size = 256>
      30             : class Predicate {
      31             :  public:
      32             :   inline Predicate() = default;
      33             :   inline bool get(uchar c);
      34             : 
      35             :  private:
      36             :   friend class Test;
      37             :   bool CalculateValue(uchar c);
      38             :   class CacheEntry {
      39             :    public:
      40             :     inline CacheEntry()
      41             :         : bit_field_(CodePointField::encode(0) | ValueField::encode(0)) {}
      42             :     inline CacheEntry(uchar code_point, bool value)
      43             :         : bit_field_(
      44             :               CodePointField::encode(CodePointField::kMask & code_point) |
      45             :               ValueField::encode(value)) {
      46             :       DCHECK_IMPLIES((CodePointField::kMask & code_point) != code_point,
      47             :                      code_point == static_cast<uchar>(-1));
      48             :     }
      49             : 
      50             :     uchar code_point() const { return CodePointField::decode(bit_field_); }
      51             :     bool value() const { return ValueField::decode(bit_field_); }
      52             : 
      53             :    private:
      54             :     class CodePointField : public v8::internal::BitField<uchar, 0, 21> {};
      55             :     class ValueField : public v8::internal::BitField<bool, 21, 1> {};
      56             : 
      57             :     uint32_t bit_field_;
      58             :   };
      59             :   static const int kSize = size;
      60             :   static const int kMask = kSize - 1;
      61             :   CacheEntry entries_[kSize];
      62             : };
      63             : 
      64             : 
      65             : // A cache used in case conversion.  It caches the value for characters
      66             : // that either have no mapping or map to a single character independent
      67             : // of context.  Characters that map to more than one character or that
      68             : // map differently depending on context are always looked up.
      69             : template <class T, int size = 256>
      70             : class Mapping {
      71             :  public:
      72             :   inline Mapping() = default;
      73             :   inline int get(uchar c, uchar n, uchar* result);
      74             :  private:
      75             :   friend class Test;
      76             :   int CalculateValue(uchar c, uchar n, uchar* result);
      77             :   struct CacheEntry {
      78             :     inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
      79             :     inline CacheEntry(uchar code_point, signed offset)
      80             :       : code_point_(code_point),
      81             :         offset_(offset) { }
      82             :     uchar code_point_;
      83             :     signed offset_;
      84             :     static const int kNoChar = (1 << 21) - 1;
      85             :   };
      86             :   static const int kSize = size;
      87             :   static const int kMask = kSize - 1;
      88             :   CacheEntry entries_[kSize];
      89             : };
      90             : 
      91             : class UnicodeData {
      92             :  private:
      93             :   friend class Test;
      94             :   static int GetByteCount();
      95             :   static const uchar kMaxCodePoint;
      96             : };
      97             : 
      98             : #endif  // !V8_INTL_SUPPORT
      99             : 
     100             : class Utf16 {
     101             :  public:
     102             :   static const int kNoPreviousCharacter = -1;
     103             :   static inline bool IsSurrogatePair(int lead, int trail) {
     104   105542034 :     return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
     105             :   }
     106             :   static inline bool IsLeadSurrogate(int code) {
     107    96255679 :     return (code & 0xfc00) == 0xd800;
     108             :   }
     109             :   static inline bool IsTrailSurrogate(int code) {
     110    15064760 :     return (code & 0xfc00) == 0xdc00;
     111             :   }
     112             : 
     113             :   static inline int CombineSurrogatePair(uchar lead, uchar trail) {
     114    15075050 :     return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
     115             :   }
     116             :   static const uchar kMaxNonSurrogateCharCode = 0xffff;
     117             :   // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
     118             :   // of UTF-8 data.  The special case where the unit is a surrogate
     119             :   // trail produces 1 byte net, because the encoding of the pair is
     120             :   // 4 bytes and the 3 bytes that were used to encode the lead surrogate
     121             :   // can be reclaimed.
     122             :   static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
     123             :   // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.
     124             :   // The illegality stems from the surrogate not being part of a pair.
     125             :   static const int kUtf8BytesToCodeASurrogate = 3;
     126             :   static inline uint16_t LeadSurrogate(uint32_t char_code) {
     127     3244313 :     return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
     128             :   }
     129             :   static inline uint16_t TrailSurrogate(uint32_t char_code) {
     130     3244313 :     return 0xdc00 + (char_code & 0x3ff);
     131             :   }
     132             : };
     133             : 
     134             : class V8_EXPORT_PRIVATE Utf8 {
     135             :  public:
     136             :   using State = Utf8DfaDecoder::State;
     137             : 
     138             :   static inline uchar Length(uchar chr, int previous);
     139             :   static inline unsigned EncodeOneByte(char* out, uint8_t c);
     140             :   static inline unsigned Encode(char* out,
     141             :                                 uchar c,
     142             :                                 int previous,
     143             :                                 bool replace_invalid = false);
     144             :   static uchar CalculateValue(const byte* str, size_t length, size_t* cursor);
     145             : 
     146             :   // The unicode replacement character, used to signal invalid unicode
     147             :   // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding.
     148             :   static const uchar kBadChar = 0xFFFD;
     149             :   static const uchar kBufferEmpty = 0x0;
     150             :   static const uchar kIncomplete = 0xFFFFFFFC;  // any non-valid code point.
     151             :   static const unsigned kMaxEncodedSize   = 4;
     152             :   static const unsigned kMaxOneByteChar   = 0x7f;
     153             :   static const unsigned kMaxTwoByteChar   = 0x7ff;
     154             :   static const unsigned kMaxThreeByteChar = 0xffff;
     155             :   static const unsigned kMaxFourByteChar  = 0x1fffff;
     156             : 
     157             :   // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
     158             :   // that match are coded as a 4 byte UTF-8 sequence.
     159             :   static const unsigned kBytesSavedByCombiningSurrogates = 2;
     160             :   static const unsigned kSizeOfUnmatchedSurrogate = 3;
     161             :   // The maximum size a single UTF-16 code unit may take up when encoded as
     162             :   // UTF-8.
     163             :   static const unsigned kMax16BitCodeUnitSize  = 3;
     164             :   static inline uchar ValueOf(const byte* str, size_t length, size_t* cursor);
     165             : 
     166             :   typedef uint32_t Utf8IncrementalBuffer;
     167             :   static inline uchar ValueOfIncremental(const byte** cursor, State* state,
     168             :                                          Utf8IncrementalBuffer* buffer);
     169             :   static uchar ValueOfIncrementalFinish(State* state);
     170             : 
     171             :   // Excludes non-characters from the set of valid code points.
     172             :   static inline bool IsValidCharacter(uchar c);
     173             : 
     174             :   // Validate if the input has a valid utf-8 encoding. Unlike JS source code
     175             :   // this validation function will accept any unicode code point, including
     176             :   // kBadChar and BOMs.
     177             :   //
     178             :   // This method checks for:
     179             :   // - valid utf-8 encoding (e.g. no over-long encodings),
     180             :   // - absence of surrogates,
     181             :   // - valid code point range.
     182             :   static bool ValidateEncoding(const byte* str, size_t length);
     183             : };
     184             : 
     185             : struct Uppercase {
     186             :   static bool Is(uchar c);
     187             : };
     188             : struct Letter {
     189             :   static bool Is(uchar c);
     190             : };
     191             : #ifndef V8_INTL_SUPPORT
     192             : struct V8_EXPORT_PRIVATE ID_Start {
     193             :   static bool Is(uchar c);
     194             : };
     195             : struct V8_EXPORT_PRIVATE ID_Continue {
     196             :   static bool Is(uchar c);
     197             : };
     198             : struct V8_EXPORT_PRIVATE WhiteSpace {
     199             :   static bool Is(uchar c);
     200             : };
     201             : #endif  // !V8_INTL_SUPPORT
     202             : 
     203             : // LineTerminator:       'JS_Line_Terminator' in point.properties
     204             : // ES#sec-line-terminators lists exactly 4 code points:
     205             : // LF (U+000A), CR (U+000D), LS(U+2028), PS(U+2029)
     206             : V8_INLINE bool IsLineTerminator(uchar c) {
     207   749485623 :   return c == 0x000A || c == 0x000D || c == 0x2028 || c == 0x2029;
     208             : }
     209             : 
     210             : V8_INLINE bool IsStringLiteralLineTerminator(uchar c) {
     211     9153650 :   return c == 0x000A || c == 0x000D;
     212             : }
     213             : 
     214             : #ifndef V8_INTL_SUPPORT
     215             : struct ToLowercase {
     216             :   static const int kMaxWidth = 3;
     217             :   static const bool kIsToLower = true;
     218             :   static int Convert(uchar c,
     219             :                      uchar n,
     220             :                      uchar* result,
     221             :                      bool* allow_caching_ptr);
     222             : };
     223             : struct ToUppercase {
     224             :   static const int kMaxWidth = 3;
     225             :   static const bool kIsToLower = false;
     226             :   static int Convert(uchar c,
     227             :                      uchar n,
     228             :                      uchar* result,
     229             :                      bool* allow_caching_ptr);
     230             : };
     231             : struct V8_EXPORT_PRIVATE Ecma262Canonicalize {
     232             :   static const int kMaxWidth = 1;
     233             :   static int Convert(uchar c,
     234             :                      uchar n,
     235             :                      uchar* result,
     236             :                      bool* allow_caching_ptr);
     237             : };
     238             : struct V8_EXPORT_PRIVATE Ecma262UnCanonicalize {
     239             :   static const int kMaxWidth = 4;
     240             :   static int Convert(uchar c,
     241             :                      uchar n,
     242             :                      uchar* result,
     243             :                      bool* allow_caching_ptr);
     244             : };
     245             : struct V8_EXPORT_PRIVATE CanonicalizationRange {
     246             :   static const int kMaxWidth = 1;
     247             :   static int Convert(uchar c,
     248             :                      uchar n,
     249             :                      uchar* result,
     250             :                      bool* allow_caching_ptr);
     251             : };
     252             : #endif  // !V8_INTL_SUPPORT
     253             : 
     254             : }  // namespace unibrow
     255             : 
     256             : #endif  // V8_UNICODE_H_

Generated by: LCOV version 1.10