LCOV - code coverage report
Current view: top level - src/inspector - string-16.cc (source / functions) Hit Total Coverage
Test: app.info Lines: 54 191 28.3 %
Date: 2017-04-26 Functions: 15 25 60.0 %

          Line data    Source code
       1             : // Copyright 2016 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #include "src/inspector/string-16.h"
       6             : 
       7             : #include <algorithm>
       8             : #include <cctype>
       9             : #include <cstdlib>
      10             : #include <cstring>
      11             : #include <limits>
      12             : #include <string>
      13             : 
      14             : #include "src/base/platform/platform.h"
      15             : #include "src/conversions.h"
      16             : 
      17             : namespace v8_inspector {
      18             : 
      19             : namespace {
      20             : 
      21      405592 : bool isASCII(UChar c) { return !(c & ~0x7F); }
      22             : 
      23             : bool isSpaceOrNewLine(UChar c) {
      24         168 :   return isASCII(c) && c <= ' ' && (c == ' ' || (c <= 0xD && c >= 0x9));
      25             : }
      26             : 
      27      194186 : int charactersToInteger(const UChar* characters, size_t length,
      28             :                         bool* ok = nullptr) {
      29             :   std::vector<char> buffer;
      30      194186 :   buffer.reserve(length + 1);
      31      599610 :   for (size_t i = 0; i < length; ++i) {
      32      810848 :     if (!isASCII(characters[i])) {
      33           0 :       if (ok) *ok = false;
      34             :       return 0;
      35             :     }
      36      810848 :     buffer.push_back(static_cast<char>(characters[i]));
      37             :   }
      38      388372 :   buffer.push_back('\0');
      39             : 
      40             :   char* endptr;
      41             :   int64_t result =
      42      194186 :       static_cast<int64_t>(std::strtol(buffer.data(), &endptr, 10));
      43      194186 :   if (ok) {
      44      194186 :     *ok = !(*endptr) && result <= std::numeric_limits<int>::max() &&
      45      194186 :           result >= std::numeric_limits<int>::min();
      46             :   }
      47      194186 :   return static_cast<int>(result);
      48             : }
      49             : 
      50             : const UChar replacementCharacter = 0xFFFD;
      51             : using UChar32 = uint32_t;
      52             : 
      53             : inline int inlineUTF8SequenceLengthNonASCII(char b0) {
      54           0 :   if ((b0 & 0xC0) != 0xC0) return 0;
      55           0 :   if ((b0 & 0xE0) == 0xC0) return 2;
      56           0 :   if ((b0 & 0xF0) == 0xE0) return 3;
      57           0 :   if ((b0 & 0xF8) == 0xF0) return 4;
      58             :   return 0;
      59             : }
      60             : 
      61           0 : inline int inlineUTF8SequenceLength(char b0) {
      62           0 :   return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
      63             : }
      64             : 
      65             : // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
      66             : // into the first byte, depending on how many bytes follow.  There are
      67             : // as many entries in this table as there are UTF-8 sequence types.
      68             : // (I.e., one byte sequence, two byte... etc.). Remember that sequences
      69             : // for *legal* UTF-8 will be 4 or fewer bytes total.
      70             : static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0,
      71             :                                                0xF0, 0xF8, 0xFC};
      72             : 
      73             : typedef enum {
      74             :   conversionOK,     // conversion successful
      75             :   sourceExhausted,  // partial character in source, but hit end
      76             :   targetExhausted,  // insuff. room in target for conversion
      77             :   sourceIllegal     // source sequence is illegal/malformed
      78             : } ConversionResult;
      79             : 
      80           0 : ConversionResult convertUTF16ToUTF8(const UChar** sourceStart,
      81             :                                     const UChar* sourceEnd, char** targetStart,
      82             :                                     char* targetEnd, bool strict) {
      83             :   ConversionResult result = conversionOK;
      84           0 :   const UChar* source = *sourceStart;
      85           0 :   char* target = *targetStart;
      86           0 :   while (source < sourceEnd) {
      87             :     UChar32 ch;
      88             :     uint32_t bytesToWrite = 0;
      89             :     const UChar32 byteMask = 0xBF;
      90             :     const UChar32 byteMark = 0x80;
      91             :     const UChar* oldSource =
      92             :         source;  // In case we have to back up because of target overflow.
      93           0 :     ch = static_cast<uint16_t>(*source++);
      94             :     // If we have a surrogate pair, convert to UChar32 first.
      95           0 :     if (ch >= 0xD800 && ch <= 0xDBFF) {
      96             :       // If the 16 bits following the high surrogate are in the source buffer...
      97           0 :       if (source < sourceEnd) {
      98           0 :         UChar32 ch2 = static_cast<uint16_t>(*source);
      99             :         // If it's a low surrogate, convert to UChar32.
     100           0 :         if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
     101           0 :           ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
     102           0 :           ++source;
     103           0 :         } else if (strict) {  // it's an unpaired high surrogate
     104             :           --source;           // return to the illegal value itself
     105             :           result = sourceIllegal;
     106             :           break;
     107             :         }
     108             :       } else {     // We don't have the 16 bits following the high surrogate.
     109             :         --source;  // return to the high surrogate
     110             :         result = sourceExhausted;
     111             :         break;
     112             :       }
     113           0 :     } else if (strict) {
     114             :       // UTF-16 surrogate values are illegal in UTF-32
     115           0 :       if (ch >= 0xDC00 && ch <= 0xDFFF) {
     116             :         --source;  // return to the illegal value itself
     117             :         result = sourceIllegal;
     118             :         break;
     119             :       }
     120             :     }
     121             :     // Figure out how many bytes the result will require
     122           0 :     if (ch < (UChar32)0x80) {
     123             :       bytesToWrite = 1;
     124           0 :     } else if (ch < (UChar32)0x800) {
     125             :       bytesToWrite = 2;
     126           0 :     } else if (ch < (UChar32)0x10000) {
     127             :       bytesToWrite = 3;
     128           0 :     } else if (ch < (UChar32)0x110000) {
     129             :       bytesToWrite = 4;
     130             :     } else {
     131             :       bytesToWrite = 3;
     132             :       ch = replacementCharacter;
     133             :     }
     134             : 
     135           0 :     target += bytesToWrite;
     136           0 :     if (target > targetEnd) {
     137             :       source = oldSource;  // Back up source pointer!
     138           0 :       target -= bytesToWrite;
     139             :       result = targetExhausted;
     140           0 :       break;
     141             :     }
     142           0 :     switch (bytesToWrite) {  // note: everything falls through.
     143             :       case 4:
     144           0 :         *--target = static_cast<char>((ch | byteMark) & byteMask);
     145           0 :         ch >>= 6;
     146             :       case 3:
     147           0 :         *--target = static_cast<char>((ch | byteMark) & byteMask);
     148           0 :         ch >>= 6;
     149             :       case 2:
     150           0 :         *--target = static_cast<char>((ch | byteMark) & byteMask);
     151           0 :         ch >>= 6;
     152             :       case 1:
     153           0 :         *--target = static_cast<char>(ch | firstByteMark[bytesToWrite]);
     154             :     }
     155           0 :     target += bytesToWrite;
     156             :   }
     157           0 :   *sourceStart = source;
     158           0 :   *targetStart = target;
     159           0 :   return result;
     160             : }
     161             : 
     162             : /**
     163             :  * Is this code point a BMP code point (U+0000..U+ffff)?
     164             :  * @param c 32-bit code point
     165             :  * @return TRUE or FALSE
     166             :  * @stable ICU 2.8
     167             :  */
     168             : #define U_IS_BMP(c) ((uint32_t)(c) <= 0xffff)
     169             : 
     170             : /**
     171             :  * Is this code point a supplementary code point (U+10000..U+10ffff)?
     172             :  * @param c 32-bit code point
     173             :  * @return TRUE or FALSE
     174             :  * @stable ICU 2.8
     175             :  */
     176             : #define U_IS_SUPPLEMENTARY(c) ((uint32_t)((c)-0x10000) <= 0xfffff)
     177             : 
     178             : /**
     179             :  * Is this code point a surrogate (U+d800..U+dfff)?
     180             :  * @param c 32-bit code point
     181             :  * @return TRUE or FALSE
     182             :  * @stable ICU 2.4
     183             :  */
     184             : #define U_IS_SURROGATE(c) (((c)&0xfffff800) == 0xd800)
     185             : 
     186             : /**
     187             :  * Get the lead surrogate (0xd800..0xdbff) for a
     188             :  * supplementary code point (0x10000..0x10ffff).
     189             :  * @param supplementary 32-bit code point (U+10000..U+10ffff)
     190             :  * @return lead surrogate (U+d800..U+dbff) for supplementary
     191             :  * @stable ICU 2.4
     192             :  */
     193             : #define U16_LEAD(supplementary) (UChar)(((supplementary) >> 10) + 0xd7c0)
     194             : 
     195             : /**
     196             :  * Get the trail surrogate (0xdc00..0xdfff) for a
     197             :  * supplementary code point (0x10000..0x10ffff).
     198             :  * @param supplementary 32-bit code point (U+10000..U+10ffff)
     199             :  * @return trail surrogate (U+dc00..U+dfff) for supplementary
     200             :  * @stable ICU 2.4
     201             :  */
     202             : #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff) | 0xdc00)
     203             : 
     204             : // This must be called with the length pre-determined by the first byte.
     205             : // If presented with a length > 4, this returns false.  The Unicode
     206             : // definition of UTF-8 goes up to 4-byte sequences.
     207           0 : static bool isLegalUTF8(const unsigned char* source, int length) {
     208             :   unsigned char a;
     209           0 :   const unsigned char* srcptr = source + length;
     210           0 :   switch (length) {
     211             :     default:
     212             :       return false;
     213             :     // Everything else falls through when "true"...
     214             :     case 4:
     215           0 :       if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
     216             :     case 3:
     217           0 :       if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
     218             :     case 2:
     219           0 :       if ((a = (*--srcptr)) > 0xBF) return false;
     220             : 
     221             :       // no fall-through in this inner switch
     222           0 :       switch (*source) {
     223             :         case 0xE0:
     224           0 :           if (a < 0xA0) return false;
     225             :           break;
     226             :         case 0xED:
     227           0 :           if (a > 0x9F) return false;
     228             :           break;
     229             :         case 0xF0:
     230           0 :           if (a < 0x90) return false;
     231             :           break;
     232             :         case 0xF4:
     233           0 :           if (a > 0x8F) return false;
     234             :           break;
     235             :         default:
     236           0 :           if (a < 0x80) return false;
     237             :       }
     238             : 
     239             :     case 1:
     240           0 :       if (*source >= 0x80 && *source < 0xC2) return false;
     241             :   }
     242           0 :   if (*source > 0xF4) return false;
     243           0 :   return true;
     244             : }
     245             : 
     246             : // Magic values subtracted from a buffer value during UTF8 conversion.
     247             : // This table contains as many values as there might be trailing bytes
     248             : // in a UTF-8 sequence.
     249             : static const UChar32 offsetsFromUTF8[6] = {0x00000000UL,
     250             :                                            0x00003080UL,
     251             :                                            0x000E2080UL,
     252             :                                            0x03C82080UL,
     253             :                                            static_cast<UChar32>(0xFA082080UL),
     254             :                                            static_cast<UChar32>(0x82082080UL)};
     255             : 
     256           0 : static inline UChar32 readUTF8Sequence(const char*& sequence, size_t length) {
     257             :   UChar32 character = 0;
     258             : 
     259             :   // The cases all fall through.
     260           0 :   switch (length) {
     261             :     case 6:
     262           0 :       character += static_cast<unsigned char>(*sequence++);
     263           0 :       character <<= 6;
     264             :     case 5:
     265           0 :       character += static_cast<unsigned char>(*sequence++);
     266           0 :       character <<= 6;
     267             :     case 4:
     268           0 :       character += static_cast<unsigned char>(*sequence++);
     269           0 :       character <<= 6;
     270             :     case 3:
     271           0 :       character += static_cast<unsigned char>(*sequence++);
     272           0 :       character <<= 6;
     273             :     case 2:
     274           0 :       character += static_cast<unsigned char>(*sequence++);
     275           0 :       character <<= 6;
     276             :     case 1:
     277           0 :       character += static_cast<unsigned char>(*sequence++);
     278             :   }
     279             : 
     280           0 :   return character - offsetsFromUTF8[length - 1];
     281             : }
     282             : 
     283           0 : ConversionResult convertUTF8ToUTF16(const char** sourceStart,
     284             :                                     const char* sourceEnd, UChar** targetStart,
     285             :                                     UChar* targetEnd, bool* sourceAllASCII,
     286             :                                     bool strict) {
     287             :   ConversionResult result = conversionOK;
     288           0 :   const char* source = *sourceStart;
     289           0 :   UChar* target = *targetStart;
     290             :   UChar orAllData = 0;
     291           0 :   while (source < sourceEnd) {
     292           0 :     int utf8SequenceLength = inlineUTF8SequenceLength(*source);
     293           0 :     if (sourceEnd - source < utf8SequenceLength) {
     294             :       result = sourceExhausted;
     295             :       break;
     296             :     }
     297             :     // Do this check whether lenient or strict
     298           0 :     if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source),
     299           0 :                      utf8SequenceLength)) {
     300             :       result = sourceIllegal;
     301             :       break;
     302             :     }
     303             : 
     304           0 :     UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
     305             : 
     306           0 :     if (target >= targetEnd) {
     307           0 :       source -= utf8SequenceLength;  // Back up source pointer!
     308             :       result = targetExhausted;
     309           0 :       break;
     310             :     }
     311             : 
     312           0 :     if (U_IS_BMP(character)) {
     313             :       // UTF-16 surrogate values are illegal in UTF-32
     314           0 :       if (U_IS_SURROGATE(character)) {
     315           0 :         if (strict) {
     316           0 :           source -= utf8SequenceLength;  // return to the illegal value itself
     317             :           result = sourceIllegal;
     318           0 :           break;
     319             :         }
     320           0 :         *target++ = replacementCharacter;
     321           0 :         orAllData |= replacementCharacter;
     322             :       } else {
     323           0 :         *target++ = static_cast<UChar>(character);  // normal case
     324           0 :         orAllData |= character;
     325             :       }
     326           0 :     } else if (U_IS_SUPPLEMENTARY(character)) {
     327             :       // target is a character in range 0x10000 - 0x10FFFF
     328           0 :       if (target + 1 >= targetEnd) {
     329           0 :         source -= utf8SequenceLength;  // Back up source pointer!
     330             :         result = targetExhausted;
     331           0 :         break;
     332             :       }
     333           0 :       *target++ = U16_LEAD(character);
     334           0 :       *target++ = U16_TRAIL(character);
     335             :       orAllData = 0xffff;
     336             :     } else {
     337           0 :       if (strict) {
     338           0 :         source -= utf8SequenceLength;  // return to the start
     339             :         result = sourceIllegal;
     340           0 :         break;  // Bail out; shouldn't continue
     341             :       } else {
     342           0 :         *target++ = replacementCharacter;
     343           0 :         orAllData |= replacementCharacter;
     344             :       }
     345             :     }
     346             :   }
     347           0 :   *sourceStart = source;
     348           0 :   *targetStart = target;
     349             : 
     350           0 :   if (sourceAllASCII) *sourceAllASCII = !(orAllData & ~0x7f);
     351             : 
     352           0 :   return result;
     353             : }
     354             : 
     355             : // Helper to write a three-byte UTF-8 code point to the buffer, caller must
     356             : // check room is available.
     357           0 : static inline void putUTF8Triple(char*& buffer, UChar ch) {
     358           0 :   *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
     359           0 :   *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
     360           0 :   *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
     361           0 : }
     362             : 
     363             : }  // namespace
     364             : 
     365             : // static
     366     2480423 : String16 String16::fromInteger(int number) {
     367             :   char arr[50];
     368             :   v8::internal::Vector<char> buffer(arr, arraysize(arr));
     369     2480423 :   return String16(IntToCString(number, buffer));
     370             : }
     371             : 
     372             : // static
     373     4157309 : String16 String16::fromInteger(size_t number) {
     374             :   const size_t kBufferSize = 50;
     375             :   char buffer[kBufferSize];
     376             : #if !defined(_WIN32) && !defined(_WIN64)
     377     4157309 :   v8::base::OS::SNPrintF(buffer, kBufferSize, "%zu", number);
     378             : #else
     379             :   v8::base::OS::SNPrintF(buffer, kBufferSize, "%Iu", number);
     380             : #endif
     381     4157309 :   return String16(buffer);
     382             : }
     383             : 
     384             : // static
     385       34134 : String16 String16::fromDouble(double number) {
     386             :   char arr[50];
     387             :   v8::internal::Vector<char> buffer(arr, arraysize(arr));
     388       34134 :   return String16(DoubleToCString(number, buffer));
     389             : }
     390             : 
     391             : // static
     392           6 : String16 String16::fromDouble(double number, int precision) {
     393             :   std::unique_ptr<char[]> str(
     394           6 :       v8::internal::DoubleToPrecisionCString(number, precision));
     395          12 :   return String16(str.get());
     396             : }
     397             : 
     398      194186 : int String16::toInteger(bool* ok) const {
     399      194186 :   return charactersToInteger(characters16(), length(), ok);
     400             : }
     401             : 
     402         144 : String16 String16::stripWhiteSpace() const {
     403         144 :   if (!length()) return String16();
     404             : 
     405             :   size_t start = 0;
     406          66 :   size_t end = length() - 1;
     407             : 
     408             :   // skip white space from start
     409         162 :   while (start <= end && isSpaceOrNewLine(characters16()[start])) ++start;
     410             : 
     411             :   // only white space
     412          66 :   if (start > end) return String16();
     413             : 
     414             :   // skip white space from end
     415          78 :   while (end && isSpaceOrNewLine(characters16()[end])) --end;
     416             : 
     417          66 :   if (!start && end == length() - 1) return *this;
     418          12 :   return String16(characters16() + start, end + 1 - start);
     419             : }
     420             : 
     421     4707376 : String16Builder::String16Builder() {}
     422             : 
     423    60928841 : void String16Builder::append(const String16& s) {
     424             :   m_buffer.insert(m_buffer.end(), s.characters16(),
     425   121857682 :                   s.characters16() + s.length());
     426    60928841 : }
     427             : 
     428  1953602476 : void String16Builder::append(UChar c) { m_buffer.push_back(c); }
     429             : 
     430        6476 : void String16Builder::append(char c) {
     431        6476 :   UChar u = c;
     432        6476 :   m_buffer.push_back(u);
     433        6476 : }
     434             : 
     435           0 : void String16Builder::append(const UChar* characters, size_t length) {
     436           0 :   m_buffer.insert(m_buffer.end(), characters, characters + length);
     437           0 : }
     438             : 
     439    14014670 : void String16Builder::append(const char* characters, size_t length) {
     440    28029340 :   m_buffer.insert(m_buffer.end(), characters, characters + length);
     441    14014670 : }
     442             : 
     443        5332 : void String16Builder::appendNumber(int number) {
     444             :   const int kBufferSize = 11;
     445             :   char buffer[kBufferSize];
     446        5332 :   int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%d", number);
     447             :   DCHECK_GT(kBufferSize, chars);
     448       10664 :   m_buffer.insert(m_buffer.end(), buffer, buffer + chars);
     449        5332 : }
     450             : 
     451           0 : void String16Builder::appendNumber(size_t number) {
     452             :   const int kBufferSize = 20;
     453             :   char buffer[kBufferSize];
     454             : #if !defined(_WIN32) && !defined(_WIN64)
     455           0 :   int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%zu", number);
     456             : #else
     457             :   int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%Iu", number);
     458             : #endif
     459             :   DCHECK_GT(kBufferSize, chars);
     460           0 :   m_buffer.insert(m_buffer.end(), buffer, buffer + chars);
     461           0 : }
     462             : 
     463     2353688 : String16 String16Builder::toString() {
     464     4707376 :   return String16(m_buffer.data(), m_buffer.size());
     465             : }
     466             : 
     467     2306064 : void String16Builder::reserveCapacity(size_t capacity) {
     468     2306064 :   m_buffer.reserve(capacity);
     469     2306064 : }
     470             : 
     471           0 : String16 String16::fromUTF8(const char* stringStart, size_t length) {
     472           0 :   if (!stringStart || !length) return String16();
     473             : 
     474           0 :   std::vector<UChar> buffer(length);
     475             :   UChar* bufferStart = buffer.data();
     476             : 
     477           0 :   UChar* bufferCurrent = bufferStart;
     478           0 :   const char* stringCurrent = stringStart;
     479           0 :   if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent,
     480           0 :                          bufferCurrent + buffer.size(), 0,
     481           0 :                          true) != conversionOK)
     482             :     return String16();
     483             : 
     484           0 :   size_t utf16Length = bufferCurrent - bufferStart;
     485             :   return String16(bufferStart, utf16Length);
     486             : }
     487             : 
     488           0 : std::string String16::utf8() const {
     489             :   size_t length = this->length();
     490             : 
     491           0 :   if (!length) return std::string("");
     492             : 
     493             :   // Allocate a buffer big enough to hold all the characters
     494             :   // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
     495             :   // Optimization ideas, if we find this function is hot:
     496             :   //  * We could speculatively create a CStringBuffer to contain 'length'
     497             :   //    characters, and resize if necessary (i.e. if the buffer contains
     498             :   //    non-ascii characters). (Alternatively, scan the buffer first for
     499             :   //    ascii characters, so we know this will be sufficient).
     500             :   //  * We could allocate a CStringBuffer with an appropriate size to
     501             :   //    have a good chance of being able to write the string into the
     502             :   //    buffer without reallocing (say, 1.5 x length).
     503           0 :   if (length > std::numeric_limits<unsigned>::max() / 3) return std::string();
     504           0 :   std::vector<char> bufferVector(length * 3);
     505           0 :   char* buffer = bufferVector.data();
     506           0 :   const UChar* characters = m_impl.data();
     507             : 
     508             :   ConversionResult result =
     509             :       convertUTF16ToUTF8(&characters, characters + length, &buffer,
     510           0 :                          buffer + bufferVector.size(), false);
     511             :   DCHECK(
     512             :       result !=
     513             :       targetExhausted);  // (length * 3) should be sufficient for any conversion
     514             : 
     515             :   // Only produced from strict conversion.
     516             :   DCHECK(result != sourceIllegal);
     517             : 
     518             :   // Check for an unconverted high surrogate.
     519           0 :   if (result == sourceExhausted) {
     520             :     // This should be one unpaired high surrogate. Treat it the same
     521             :     // way as an unpaired high surrogate would have been handled in
     522             :     // the middle of a string with non-strict conversion - which is
     523             :     // to say, simply encode it to UTF-8.
     524             :     DCHECK((characters + 1) == (m_impl.data() + length));
     525             :     DCHECK((*characters >= 0xD800) && (*characters <= 0xDBFF));
     526             :     // There should be room left, since one UChar hasn't been
     527             :     // converted.
     528             :     DCHECK((buffer + 3) <= (buffer + bufferVector.size()));
     529           0 :     putUTF8Triple(buffer, *characters);
     530             :   }
     531             : 
     532           0 :   return std::string(bufferVector.data(), buffer - bufferVector.data());
     533             : }
     534             : 
     535             : }  // namespace v8_inspector

Generated by: LCOV version 1.10