LCOV - app.info - src/unicode.h

LCOV - code coverage report

Current view:	top level - src - unicode.h (source / functions)		Hit	Total	Coverage
Test:	app.info	Lines:	14	14	100.0 %
Date:	2017-10-20	Functions:	0	0	-

          Line data    Source code

       1             : // Copyright 2011 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #ifndef V8_UNICODE_H_
       6             : #define V8_UNICODE_H_
       7             : 
       8             : #include <sys/types.h>
       9             : #include "src/globals.h"
      10             : #include "src/utils.h"
      11             : /**
      12             :  * \file
      13             :  * Definitions and convenience functions for working with unicode.
      14             :  */
      15             : 
      16             : namespace unibrow {
      17             : 
      18             : typedef unsigned int uchar;
      19             : typedef unsigned char byte;
      20             : 
      21             : /**
      22             :  * The max length of the result of converting the case of a single
      23             :  * character.
      24             :  */
      25             : const int kMaxMappingSize = 4;
      26             : 
      27             : template <class T, int size = 256>
      28             : class Predicate {
      29             :  public:
      30   546125949 :   inline Predicate() { }
      31             :   inline bool get(uchar c);
      32             : 
      33             :  private:
      34             :   friend class Test;
      35             :   bool CalculateValue(uchar c);
      36             :   class CacheEntry {
      37             :    public:
      38             :     inline CacheEntry()
      39   545061376 :         : bit_field_(CodePointField::encode(0) | ValueField::encode(0)) {}
      40             :     inline CacheEntry(uchar code_point, bool value)
      41             :         : bit_field_(CodePointField::encode(code_point) |
      42     3200659 :                      ValueField::encode(value)) {}
      43             : 
      44             :     uchar code_point() const { return CodePointField::decode(bit_field_); }
      45             :     bool value() const { return ValueField::decode(bit_field_); }
      46             : 
      47             :    private:
      48             :     class CodePointField : public v8::internal::BitField<uchar, 0, 21> {};
      49             :     class ValueField : public v8::internal::BitField<bool, 21, 1> {};
      50             : 
      51             :     uint32_t bit_field_;
      52             :   };
      53             :   static const int kSize = size;
      54             :   static const int kMask = kSize - 1;
      55             :   CacheEntry entries_[kSize];
      56             : };
      57             : 
      58             : 
      59             : // A cache used in case conversion.  It caches the value for characters
      60             : // that either have no mapping or map to a single character independent
      61             : // of context.  Characters that map to more than one character or that
      62             : // map differently depending on context are always looked up.
      63             : template <class T, int size = 256>
      64             : class Mapping {
      65             :  public:
      66    42298839 :   inline Mapping() { }
      67             :   inline int get(uchar c, uchar n, uchar* result);
      68             :  private:
      69             :   friend class Test;
      70             :   int CalculateValue(uchar c, uchar n, uchar* result);
      71             :   struct CacheEntry {
      72    42243840 :     inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
      73             :     inline CacheEntry(uchar code_point, signed offset)
      74             :       : code_point_(code_point),
      75             :         offset_(offset) { }
      76             :     uchar code_point_;
      77             :     signed offset_;
      78             :     static const int kNoChar = (1 << 21) - 1;
      79             :   };
      80             :   static const int kSize = size;
      81             :   static const int kMask = kSize - 1;
      82             :   CacheEntry entries_[kSize];
      83             : };
      84             : 
      85             : 
      86             : class UnicodeData {
      87             :  private:
      88             :   friend class Test;
      89             :   static int GetByteCount();
      90             :   static const uchar kMaxCodePoint;
      91             : };
      92             : 
      93             : 
      94             : class Utf16 {
      95             :  public:
      96             :   static inline bool IsSurrogatePair(int lead, int trail) {
      97      169164 :     return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
      98             :   }
      99             :   static inline bool IsLeadSurrogate(int code) {
     100  1012831102 :     if (code == kNoPreviousCharacter) return false;
     101  1010798289 :     return (code & 0xfc00) == 0xd800;
     102             :   }
     103             :   static inline bool IsTrailSurrogate(int code) {
     104    23588611 :     if (code == kNoPreviousCharacter) return false;
     105    23589214 :     return (code & 0xfc00) == 0xdc00;
     106             :   }
     107             : 
     108             :   static inline int CombineSurrogatePair(uchar lead, uchar trail) {
     109        2410 :     return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
     110             :   }
     111             :   static const int kNoPreviousCharacter = -1;
     112             :   static const uchar kMaxNonSurrogateCharCode = 0xffff;
     113             :   // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
     114             :   // of UTF-8 data.  The special case where the unit is a surrogate
     115             :   // trail produces 1 byte net, because the encoding of the pair is
     116             :   // 4 bytes and the 3 bytes that were used to encode the lead surrogate
     117             :   // can be reclaimed.
     118             :   static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
     119             :   // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.
     120             :   // The illegality stems from the surrogate not being part of a pair.
     121             :   static const int kUtf8BytesToCodeASurrogate = 3;
     122             :   static inline uint16_t LeadSurrogate(uint32_t char_code) {
     123    10032050 :     return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
     124             :   }
     125             :   static inline uint16_t TrailSurrogate(uint32_t char_code) {
     126    10032050 :     return 0xdc00 + (char_code & 0x3ff);
     127             :   }
     128             : };
     129             : 
     130             : class V8_EXPORT_PRIVATE Utf8 {
     131             :  public:
     132             :   static inline uchar Length(uchar chr, int previous);
     133             :   static inline unsigned EncodeOneByte(char* out, uint8_t c);
     134             :   static inline unsigned Encode(char* out,
     135             :                                 uchar c,
     136             :                                 int previous,
     137             :                                 bool replace_invalid = false);
     138             :   static uchar CalculateValue(const byte* str, size_t length, size_t* cursor);
     139             : 
     140             :   // The unicode replacement character, used to signal invalid unicode
     141             :   // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding.
     142             :   static const uchar kBadChar = 0xFFFD;
     143             :   static const uchar kBufferEmpty = 0x0;
     144             :   static const uchar kIncomplete = 0xFFFFFFFC;  // any non-valid code point.
     145             :   static const unsigned kMaxEncodedSize   = 4;
     146             :   static const unsigned kMaxOneByteChar   = 0x7f;
     147             :   static const unsigned kMaxTwoByteChar   = 0x7ff;
     148             :   static const unsigned kMaxThreeByteChar = 0xffff;
     149             :   static const unsigned kMaxFourByteChar  = 0x1fffff;
     150             : 
     151             :   // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
     152             :   // that match are coded as a 4 byte UTF-8 sequence.
     153             :   static const unsigned kBytesSavedByCombiningSurrogates = 2;
     154             :   static const unsigned kSizeOfUnmatchedSurrogate = 3;
     155             :   // The maximum size a single UTF-16 code unit may take up when encoded as
     156             :   // UTF-8.
     157             :   static const unsigned kMax16BitCodeUnitSize  = 3;
     158             :   static inline uchar ValueOf(const byte* str, size_t length, size_t* cursor);
     159             : 
     160             :   typedef uint32_t Utf8IncrementalBuffer;
     161             :   static uchar ValueOfIncremental(byte next_byte,
     162             :                                   Utf8IncrementalBuffer* buffer);
     163             :   static uchar ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer);
     164             : 
     165             :   // Excludes non-characters from the set of valid code points.
     166             :   static inline bool IsValidCharacter(uchar c);
     167             : 
     168             :   // Validate if the input has a valid utf-8 encoding. Unlike JS source code
     169             :   // this validation function will accept any unicode code point, including
     170             :   // kBadChar and BOMs.
     171             :   //
     172             :   // This method checks for:
     173             :   // - valid utf-8 encoding (e.g. no over-long encodings),
     174             :   // - absence of surrogates,
     175             :   // - valid code point range.
     176             :   static bool ValidateEncoding(const byte* str, size_t length);
     177             : };
     178             : 
     179             : struct Uppercase {
     180             :   static bool Is(uchar c);
     181             : };
     182             : struct Letter {
     183             :   static bool Is(uchar c);
     184             : };
     185             : #ifndef V8_INTL_SUPPORT
     186             : struct V8_EXPORT_PRIVATE ID_Start {
     187             :   static bool Is(uchar c);
     188             : };
     189             : struct V8_EXPORT_PRIVATE ID_Continue {
     190             :   static bool Is(uchar c);
     191             : };
     192             : struct V8_EXPORT_PRIVATE WhiteSpace {
     193             :   static bool Is(uchar c);
     194             : };
     195             : #endif  // !V8_INTL_SUPPORT
     196             : 
     197             : // LineTerminator:       'JS_Line_Terminator' in point.properties
     198             : // ES#sec-line-terminators lists exactly 4 code points:
     199             : // LF (U+000A), CR (U+000D), LS(U+2028), PS(U+2029)
     200             : V8_INLINE bool IsLineTerminator(uchar c) {
     201   216006189 :   return c == 0x000A || c == 0x000D || c == 0x2028 || c == 0x2029;
     202             : }
     203             : 
     204             : #ifndef V8_INTL_SUPPORT
     205             : struct ToLowercase {
     206             :   static const int kMaxWidth = 3;
     207             :   static const bool kIsToLower = true;
     208             :   static int Convert(uchar c,
     209             :                      uchar n,
     210             :                      uchar* result,
     211             :                      bool* allow_caching_ptr);
     212             : };
     213             : struct ToUppercase {
     214             :   static const int kMaxWidth = 3;
     215             :   static const bool kIsToLower = false;
     216             :   static int Convert(uchar c,
     217             :                      uchar n,
     218             :                      uchar* result,
     219             :                      bool* allow_caching_ptr);
     220             : };
     221             : #endif
     222             : struct Ecma262Canonicalize {
     223             :   static const int kMaxWidth = 1;
     224             :   static int Convert(uchar c,
     225             :                      uchar n,
     226             :                      uchar* result,
     227             :                      bool* allow_caching_ptr);
     228             : };
     229             : struct Ecma262UnCanonicalize {
     230             :   static const int kMaxWidth = 4;
     231             :   static int Convert(uchar c,
     232             :                      uchar n,
     233             :                      uchar* result,
     234             :                      bool* allow_caching_ptr);
     235             : };
     236             : struct CanonicalizationRange {
     237             :   static const int kMaxWidth = 1;
     238             :   static int Convert(uchar c,
     239             :                      uchar n,
     240             :                      uchar* result,
     241             :                      bool* allow_caching_ptr);
     242             : };
     243             : 
     244             : }  // namespace unibrow
     245             : 
     246             : #endif  // V8_UNICODE_H_

Generated by: LCOV version 1.10