LCOV - code coverage report
Current view: top level - src - unicode.h (source / functions) Hit Total Coverage
Test: app.info Lines: 13 13 100.0 %
Date: 2017-04-26 Functions: 0 0 -

          Line data    Source code
       1             : // Copyright 2011 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #ifndef V8_UNICODE_H_
       6             : #define V8_UNICODE_H_
       7             : 
       8             : #include <sys/types.h>
       9             : #include "src/globals.h"
      10             : #include "src/utils.h"
      11             : /**
      12             :  * \file
      13             :  * Definitions and convenience functions for working with unicode.
      14             :  */
      15             : 
      16             : namespace unibrow {
      17             : 
      18             : typedef unsigned int uchar;
      19             : typedef unsigned char byte;
      20             : 
      21             : /**
      22             :  * The max length of the result of converting the case of a single
      23             :  * character.
      24             :  */
      25             : const int kMaxMappingSize = 4;
      26             : 
      27             : template <class T, int size = 256>
      28             : class Predicate {
      29             :  public:
      30    93733430 :   inline Predicate() { }
      31             :   inline bool get(uchar c);
      32             : 
      33             :  private:
      34             :   friend class Test;
      35             :   bool CalculateValue(uchar c);
      36             :   class CacheEntry {
      37             :    public:
      38             :     inline CacheEntry()
      39    93587200 :         : bit_field_(CodePointField::encode(0) | ValueField::encode(0)) {}
      40             :     inline CacheEntry(uchar code_point, bool value)
      41             :         : bit_field_(CodePointField::encode(code_point) |
      42     5388768 :                      ValueField::encode(value)) {}
      43             : 
      44             :     uchar code_point() const { return CodePointField::decode(bit_field_); }
      45             :     bool value() const { return ValueField::decode(bit_field_); }
      46             : 
      47             :    private:
      48             :     class CodePointField : public v8::internal::BitField<uchar, 0, 21> {};
      49             :     class ValueField : public v8::internal::BitField<bool, 21, 1> {};
      50             : 
      51             :     uint32_t bit_field_;
      52             :   };
      53             :   static const int kSize = size;
      54             :   static const int kMask = kSize - 1;
      55             :   CacheEntry entries_[kSize];
      56             : };
      57             : 
      58             : 
      59             : // A cache used in case conversion.  It caches the value for characters
      60             : // that either have no mapping or map to a single character independent
      61             : // of context.  Characters that map to more than one character or that
      62             : // map differently depending on context are always looked up.
      63             : template <class T, int size = 256>
      64             : class Mapping {
      65             :  public:
      66    62292188 :   inline Mapping() { }
      67             :   inline int get(uchar c, uchar n, uchar* result);
      68             :  private:
      69             :   friend class Test;
      70             :   int CalculateValue(uchar c, uchar n, uchar* result);
      71             :   struct CacheEntry {
      72    62170624 :     inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
      73             :     inline CacheEntry(uchar code_point, signed offset)
      74             :       : code_point_(code_point),
      75             :         offset_(offset) { }
      76             :     uchar code_point_;
      77             :     signed offset_;
      78             :     static const int kNoChar = (1 << 21) - 1;
      79             :   };
      80             :   static const int kSize = size;
      81             :   static const int kMask = kSize - 1;
      82             :   CacheEntry entries_[kSize];
      83             : };
      84             : 
      85             : 
      86             : class UnicodeData {
      87             :  private:
      88             :   friend class Test;
      89             :   static int GetByteCount();
      90             :   static const uchar kMaxCodePoint;
      91             : };
      92             : 
      93             : 
      94             : class Utf16 {
      95             :  public:
      96             :   static inline bool IsSurrogatePair(int lead, int trail) {
      97      199187 :     return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
      98             :   }
      99             :   static inline bool IsLeadSurrogate(int code) {
     100  1218881334 :     if (code == kNoPreviousCharacter) return false;
     101  1215733974 :     return (code & 0xfc00) == 0xd800;
     102             :   }
     103             :   static inline bool IsTrailSurrogate(int code) {
     104    34735258 :     if (code == kNoPreviousCharacter) return false;
     105    34736049 :     return (code & 0xfc00) == 0xdc00;
     106             :   }
     107             : 
     108             :   static inline int CombineSurrogatePair(uchar lead, uchar trail) {
     109        3234 :     return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
     110             :   }
     111             :   static const int kNoPreviousCharacter = -1;
     112             :   static const uchar kMaxNonSurrogateCharCode = 0xffff;
     113             :   // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
     114             :   // of UTF-8 data.  The special case where the unit is a surrogate
     115             :   // trail produces 1 byte net, because the encoding of the pair is
     116             :   // 4 bytes and the 3 bytes that were used to encode the lead surrogate
     117             :   // can be reclaimed.
     118             :   static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
     119             :   // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.
     120             :   // The illegality stems from the surrogate not being part of a pair.
     121             :   static const int kUtf8BytesToCodeASurrogate = 3;
     122             :   static inline uint16_t LeadSurrogate(uint32_t char_code) {
     123    15543986 :     return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
     124             :   }
     125             :   static inline uint16_t TrailSurrogate(uint32_t char_code) {
     126    15543993 :     return 0xdc00 + (char_code & 0x3ff);
     127             :   }
     128             : };
     129             : 
     130             : 
     131             : class Utf8 {
     132             :  public:
     133             :   static inline uchar Length(uchar chr, int previous);
     134             :   static inline unsigned EncodeOneByte(char* out, uint8_t c);
     135             :   static inline unsigned Encode(char* out,
     136             :                                 uchar c,
     137             :                                 int previous,
     138             :                                 bool replace_invalid = false);
     139             :   static uchar CalculateValue(const byte* str, size_t length, size_t* cursor);
     140             : 
     141             :   // The unicode replacement character, used to signal invalid unicode
     142             :   // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding.
     143             :   static const uchar kBadChar = 0xFFFD;
     144             :   static const uchar kBufferEmpty = 0x0;
     145             :   static const uchar kIncomplete = 0xFFFFFFFC;  // any non-valid code point.
     146             :   static const unsigned kMaxEncodedSize   = 4;
     147             :   static const unsigned kMaxOneByteChar   = 0x7f;
     148             :   static const unsigned kMaxTwoByteChar   = 0x7ff;
     149             :   static const unsigned kMaxThreeByteChar = 0xffff;
     150             :   static const unsigned kMaxFourByteChar  = 0x1fffff;
     151             : 
     152             :   // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
     153             :   // that match are coded as a 4 byte UTF-8 sequence.
     154             :   static const unsigned kBytesSavedByCombiningSurrogates = 2;
     155             :   static const unsigned kSizeOfUnmatchedSurrogate = 3;
     156             :   // The maximum size a single UTF-16 code unit may take up when encoded as
     157             :   // UTF-8.
     158             :   static const unsigned kMax16BitCodeUnitSize  = 3;
     159             :   static inline uchar ValueOf(const byte* str, size_t length, size_t* cursor);
     160             : 
     161             :   typedef uint32_t Utf8IncrementalBuffer;
     162             :   static uchar ValueOfIncremental(byte next_byte,
     163             :                                   Utf8IncrementalBuffer* buffer);
     164             :   static uchar ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer);
     165             : 
     166             :   // Excludes non-characters from the set of valid code points.
     167             :   static inline bool IsValidCharacter(uchar c);
     168             : 
     169             :   static bool Validate(const byte* str, size_t length);
     170             : };
     171             : 
     172             : struct Uppercase {
     173             :   static bool Is(uchar c);
     174             : };
     175             : struct Lowercase {
     176             :   static bool Is(uchar c);
     177             : };
     178             : struct Letter {
     179             :   static bool Is(uchar c);
     180             : };
     181             : struct V8_EXPORT_PRIVATE ID_Start {
     182             :   static bool Is(uchar c);
     183             : };
     184             : struct V8_EXPORT_PRIVATE ID_Continue {
     185             :   static bool Is(uchar c);
     186             : };
     187             : struct V8_EXPORT_PRIVATE WhiteSpace {
     188             :   static bool Is(uchar c);
     189             : };
     190             : struct V8_EXPORT_PRIVATE LineTerminator {
     191             :   static bool Is(uchar c);
     192             : };
     193             : struct ToLowercase {
     194             :   static const int kMaxWidth = 3;
     195             :   static const bool kIsToLower = true;
     196             :   static int Convert(uchar c,
     197             :                      uchar n,
     198             :                      uchar* result,
     199             :                      bool* allow_caching_ptr);
     200             : };
     201             : struct ToUppercase {
     202             :   static const int kMaxWidth = 3;
     203             :   static const bool kIsToLower = false;
     204             :   static int Convert(uchar c,
     205             :                      uchar n,
     206             :                      uchar* result,
     207             :                      bool* allow_caching_ptr);
     208             : };
     209             : struct Ecma262Canonicalize {
     210             :   static const int kMaxWidth = 1;
     211             :   static int Convert(uchar c,
     212             :                      uchar n,
     213             :                      uchar* result,
     214             :                      bool* allow_caching_ptr);
     215             : };
     216             : struct Ecma262UnCanonicalize {
     217             :   static const int kMaxWidth = 4;
     218             :   static int Convert(uchar c,
     219             :                      uchar n,
     220             :                      uchar* result,
     221             :                      bool* allow_caching_ptr);
     222             : };
     223             : struct CanonicalizationRange {
     224             :   static const int kMaxWidth = 1;
     225             :   static int Convert(uchar c,
     226             :                      uchar n,
     227             :                      uchar* result,
     228             :                      bool* allow_caching_ptr);
     229             : };
     230             : 
     231             : }  // namespace unibrow
     232             : 
     233             : #endif  // V8_UNICODE_H_

Generated by: LCOV version 1.10