LCOV - code coverage report
Current view: top level - src - intl.cc (source / functions) Hit Total Coverage
Test: app.info Lines: 121 137 88.3 %
Date: 2017-10-20 Functions: 17 22 77.3 %

          Line data    Source code
       1             : // Copyright 2013 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #ifndef V8_INTL_SUPPORT
       6             : #error Internationalization is expected to be enabled.
       7             : #endif  // V8_INTL_SUPPORT
       8             : 
       9             : #include "src/intl.h"
      10             : 
      11             : #include <memory>
      12             : 
      13             : #include "src/factory.h"
      14             : #include "src/isolate.h"
      15             : #include "src/objects-inl.h"
      16             : #include "src/string-case.h"
      17             : #include "unicode/calendar.h"
      18             : #include "unicode/gregocal.h"
      19             : #include "unicode/timezone.h"
      20             : #include "unicode/ustring.h"
      21             : #include "unicode/uvernum.h"
      22             : #include "unicode/uversion.h"
      23             : 
      24             : namespace v8 {
      25             : namespace internal {
      26             : 
      27             : namespace {
      28        6018 : inline bool IsASCIIUpper(uint16_t ch) { return ch >= 'A' && ch <= 'Z'; }  // True iff ch is in the ASCII range 'A'..'Z'.
      29             : 
      30             : const uint8_t kToLower[256] = {  // Lowercase lookup table: entry i is the lowercase form of one-byte code unit i.
      31             :     0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,
      32             :     0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
      33             :     0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23,
      34             :     0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
      35             :     0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B,
      36             :     0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,  // From index 0x41 ('A') the entries are 0x61 ('a') onward.
      37             :     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73,
      38             :     0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,  // Index 0x5A ('Z') -> 0x7A ('z'); 0x5B..0x5F map to themselves.
      39             :     0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B,
      40             :     0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
      41             :     0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83,
      42             :     0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
      43             :     0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
      44             :     0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
      45             :     0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,
      46             :     0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
      47             :     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB,  // Indices 0xC0..0xCB map to 0xE0..0xEB.
      48             :     0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7,  // Index 0xD7 maps to itself.
      49             :     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3,  // Indices 0xD8..0xDE map to 0xF8..0xFE; index 0xDF maps to itself.
      50             :     0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
      51             :     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB,
      52             :     0xFC, 0xFD, 0xFE, 0xFF,
      53             : };
      54             : 
      55             : inline uint16_t ToLatin1Lower(uint16_t ch) {  // Lowercases ch by looking it up in kToLower.
      56        1710 :   return static_cast<uint16_t>(kToLower[ch]);  // No range check here: kToLower has 256 entries, so callers must pass ch <= 0xFF.
      57             : }
      58             : 
      59             : inline uint16_t ToASCIIUpper(uint16_t ch) {  // Uppercases 'a'..'z'; every other value is returned unchanged.
      60           0 :   return ch & ~((ch >= 'a' && ch <= 'z') << 5);  // The range test yields 0 or 1; shifted left by 5 it becomes a 0x20 mask that is cleared from ch.
      61             : }
      62             : 
      63             : // Does not work for U+00DF (sharp-s), U+00B5 (micro sign), U+00FF.
      64             : inline uint16_t ToLatin1Upper(uint16_t ch) {  // Uppercases a one-byte code unit by clearing bit 0x20 where applicable.
      65             :   DCHECK(ch != 0xDF && ch != 0xB5 && ch != 0xFF);  // Callers must filter out the three unsupported code units first.
      66             :   return ch &
      67        1373 :          ~(((ch >= 'a' && ch <= 'z') || (((ch & 0xE0) == 0xE0) && ch != 0xF7))  // Applies to 'a'..'z' and to values with (ch & 0xE0) == 0xE0, except 0xF7.
      68        1373 :            << 5);  // The 0-or-1 condition shifted by 5 is the 0x20 bit to clear.
      69             : }
      70             : 
      71             : template <typename Char>  // Writes the ASCII-uppercased form of src into result; returns true iff every element of src was ASCII.
      72           0 : bool ToUpperFastASCII(const Vector<const Char>& src,
      73             :                       Handle<SeqOneByteString> result) {
      74             :   // Do a faster loop for the case where all the characters are ASCII.
      75             :   uint16_t ored = 0;  // Bitwise OR of every code unit seen, used for a single ASCII check at the end.
      76             :   int32_t index = 0;  // Next write position in result.
      77           0 :   for (auto it = src.begin(); it != src.end(); ++it) {
      78           0 :     uint16_t ch = static_cast<uint16_t>(*it);
      79           0 :     ored |= ch;
      80           0 :     result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));  // Written unconditionally, so result is also modified when the function ends up returning false.
      81             :   }
      82           0 :   return !(ored & ~0x7F);  // True only if no element had a bit set outside the low seven bits.
      83             : }
      84             : 
      85             : const uint16_t sharp_s = 0xDF;  // Code unit U+00DF; ToUpperOneByte counts it and ToUpperWithSharpS writes it out as two 'S' characters.
      86             : 
      87             : template <typename Char>  // Uppercases src into dest; returns false if an element's uppercase form cannot be stored in one byte.
      88         428 : bool ToUpperOneByte(const Vector<const Char>& src, uint8_t* dest,
      89             :                     int* sharp_s_count) {  // *sharp_s_count receives the number of sharp-s elements encountered.
      90             :   // Still pretty-fast path for the input with non-ASCII Latin-1 characters.
      91             : 
      92             :   // There are two special cases.
      93             :   //  1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.
      94             :   //  2. Lower case sharp-S converts to "SS" (two characters)
      95         214 :   *sharp_s_count = 0;
      96        2348 :   for (auto it = src.begin(); it != src.end(); ++it) {
      97        1026 :     uint16_t ch = static_cast<uint16_t>(*it);
      98        1026 :     if (V8_UNLIKELY(ch == sharp_s)) {
      99         105 :       ++(*sharp_s_count);
     100         105 :       continue;  // Nothing is written for a sharp-s, so dest ends up shorter than src whenever the count is non-zero.
     101             :     }
     102         921 :     if (V8_UNLIKELY(ch == 0xB5 || ch == 0xFF)) {
     103             :       // Since this upper-cased character does not fit in an 8-bit string, we
     104             :       // need to take the 16-bit path.
     105             :       return false;  // dest and *sharp_s_count hold only partial results at this point.
     106             :     }
     107        1710 :     *dest++ = ToLatin1Upper(ch);  // The three values ToLatin1Upper cannot handle were filtered out above.
     108             :   }
     109             : 
     110             :   return true;
     111             : }
     112             : 
     113             : template <typename Char>  // Uppercases src into result, expanding each sharp-s into "SS".
     114          85 : void ToUpperWithSharpS(const Vector<const Char>& src,
     115             :                        Handle<SeqOneByteString> result) {  // Writes one extra character per sharp-s in src; result must have room for them.
     116             :   int32_t dest_index = 0;  // Next write position in result; runs ahead of the source position after each sharp-s.
     117        1416 :   for (auto it = src.begin(); it != src.end(); ++it) {
     118         623 :     uint16_t ch = static_cast<uint16_t>(*it);
     119         623 :     if (ch == sharp_s) {
     120         105 :       result->SeqOneByteStringSet(dest_index++, 'S');
     121         105 :       result->SeqOneByteStringSet(dest_index++, 'S');
     122             :     } else {
     123         518 :       result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));  // No check for 0xB5/0xFF here; ToLatin1Upper only DCHECKs against them.
     124             :     }
     125             :   }
     126          85 : }
     127             : 
     128        1500 : inline int FindFirstUpperOrNonAscii(String* s, int length) {  // Index of the first character among the first `length` of s that is 'A'..'Z' or non-ASCII.
     129        6749 :   for (int index = 0; index < length; ++index) {
     130             :     uint16_t ch = s->Get(index);
     131        6018 :     if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {  // ch & ~0x7F is non-zero for any code unit above 0x7F.
     132             :       return index;
     133             :     }
     134             :   }
     135             :   return length;  // No such character found: the scanned prefix is ASCII with no uppercase letters.
     136             : }
     137             : 
     138             : }  // namespace
     139             : 
     140       55341 : const uint8_t* ToLatin1LowerTable() { return &kToLower[0]; }  // Exposes the file-local 256-entry kToLower table as a pointer to its first element.
     141             : 
     142      375880 : const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat,  // Returns a 16-bit character buffer for flat, widening one-byte content into *dest when needed.
     143             :                                     std::unique_ptr<uc16[]>* dest,  // Scratch buffer owner; only touched when flat is one-byte.
     144             :                                     int32_t length) {  // Number of characters to allocate and copy in the one-byte case.
     145             :   DCHECK(flat.IsFlat());
     146      187976 :   if (flat.IsOneByte()) {
     147      182619 :     if (!*dest) {  // A buffer already held by *dest is reused as-is, without copying again.
     148      182547 :       dest->reset(NewArray<uc16>(length));
     149             :       CopyChars(dest->get(), flat.ToOneByteVector().start(), length);
     150             :     }
     151      182619 :     return reinterpret_cast<const UChar*>(dest->get());  // The returned pointer is owned by *dest.
     152             :   } else {
     153             :     return reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());  // Points directly at flat's own two-byte data; *dest is left untouched.
     154             :   }
     155             : }
     156             : 
     157        2150 : MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,  // Case-converts s with ICU into a new two-byte string.
     158             :                                           bool is_to_upper, const char* lang) {  // is_to_upper picks u_strToUpper over u_strToLower; lang is passed to ICU as the locale argument.
     159        2150 :   auto case_converter = is_to_upper ? u_strToUpper : u_strToLower;
     160             :   int32_t src_length = s->length();
     161             :   int32_t dest_length = src_length;  // First attempt assumes the output is as long as the input.
     162             :   UErrorCode status;  // Assigned inside the loop below, which always runs at least once.
     163             :   Handle<SeqTwoByteString> result;
     164        2150 :   std::unique_ptr<uc16[]> sap;  // Scratch 16-bit copy of s for the one-byte case; filled on the first pass and reused on the second.
     165             : 
     166        2150 :   if (dest_length == 0) return isolate->heap()->empty_string();
     167             : 
     168             :   // This is not a real loop. It'll be executed only once (no overflow) or
     169             :   // twice (overflow).
     170         186 :   for (int i = 0; i < 2; ++i) {
     171             :     // Case conversion can increase the string length (e.g. sharp-S => SS) so
     172             :     // that we have to handle RangeError exceptions here.
     173        4672 :     ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
     174             :         isolate, result, isolate->factory()->NewRawTwoByteString(dest_length));
     175             :     DisallowHeapAllocation no_gc;
     176             :     DCHECK(s->IsFlat());
     177        2336 :     String::FlatContent flat = s->GetFlatContent();
     178        2336 :     const UChar* src = GetUCharBufferFromFlat(flat, &sap, src_length);
     179        2336 :     status = U_ZERO_ERROR;
     180        2336 :     dest_length = case_converter(reinterpret_cast<UChar*>(result->GetChars()),  // Converts straight into the new string's character storage.
     181        2336 :                                  dest_length, src, src_length, lang, &status);  // The returned length sizes the retry allocation if the buffer was too small.
     182        2336 :     if (status != U_BUFFER_OVERFLOW_ERROR) break;  // Only a buffer overflow triggers a second pass; success or any other error ends the loop.
     183             :   }
     184             : 
     185             :   // In most cases, the output will fill the destination buffer completely
     186             :   // leading to an unterminated string (U_STRING_NOT_TERMINATED_WARNING).
     187             :   // Only in rare cases, it'll be shorter than the destination buffer and
     188             :   // |result| has to be truncated.
     189             :   DCHECK(U_SUCCESS(status));
     190        2150 :   if (V8_LIKELY(status == U_STRING_NOT_TERMINATED_WARNING)) {
     191             :     DCHECK(dest_length == result->length());
     192        2015 :     return *result;
     193             :   }
     194         135 :   if (U_SUCCESS(status)) {
     195             :     DCHECK(dest_length < result->length());
     196             :     return *Handle<SeqTwoByteString>::cast(
     197         270 :         SeqString::Truncate(result, dest_length));
     198             :   }
     199           0 :   return *s;  // Reached only when status is a failure code: the input string is returned unchanged.
     200             : }
     201             : 
     202             : // A stripped-down version of ConvertToLower that can only handle flat one-byte
     203             : // strings and does not allocate. Note that {src} could still be, e.g., a
     204             : // one-byte sliced string with a two-byte parent string.
     205             : // Called from TF builtins.
     206        3666 : MUST_USE_RESULT Object* ConvertOneByteToLower(String* src, String* dst,  // Returns src when it is already lowercase, otherwise dst filled with the lowercased text.
     207             :                                               Isolate* isolate) {  // isolate is not referenced in this body.
     208             :   DCHECK_EQ(src->length(), dst->length());
     209             :   DCHECK(src->HasOnlyOneByteChars());
     210             :   DCHECK(src->IsFlat());
     211             :   DCHECK(dst->IsSeqOneByteString());
     212             : 
     213             :   DisallowHeapAllocation no_gc;
     214             : 
     215             :   const int length = src->length();
     216        3666 :   String::FlatContent src_flat = src->GetFlatContent();
     217        3666 :   uint8_t* dst_data = SeqOneByteString::cast(dst)->GetChars();
     218             : 
     219        3666 :   if (src_flat.IsOneByte()) {
     220             :     const uint8_t* src_data = src_flat.ToOneByteVector().start();
     221             : 
     222        3660 :     bool has_changed_character = false;
     223             :     int index_to_first_unprocessed =
     224             :         FastAsciiConvert<true>(reinterpret_cast<char*>(dst_data),  // NOTE(review): presumably lowercases the leading ASCII run into dst_data and returns where it stopped; confirm in src/string-case.h.
     225             :                                reinterpret_cast<const char*>(src_data), length,
     226        3660 :                                &has_changed_character);
     227             : 
     228        3660 :     if (index_to_first_unprocessed == length) {
     229        3642 :       return has_changed_character ? dst : src;  // Whole string was handled by the fast converter; src itself is returned if nothing changed.
     230             :     }
     231             : 
     232             :     // If not ASCII, we keep the result up to index_to_first_unprocessed and
     233             :     // process the rest.
     234        1710 :     for (int index = index_to_first_unprocessed; index < length; ++index) {
     235        3420 :       dst_data[index] = ToLatin1Lower(static_cast<uint16_t>(src_data[index]));
     236             :     }
     237             :   } else {
     238             :     DCHECK(src_flat.IsTwoByte());
     239           6 :     int index_to_first_unprocessed = FindFirstUpperOrNonAscii(src, length);
     240           6 :     if (index_to_first_unprocessed == length) return src;  // Only lowercase-or-caseless ASCII found: nothing to convert.
     241             : 
     242             :     const uint16_t* src_data = src_flat.ToUC16Vector().start();
     243           0 :     CopyChars(dst_data, src_data, index_to_first_unprocessed);  // Copies the prefix that needs no conversion from the 16-bit source into the 8-bit destination.
     244           0 :     for (int index = index_to_first_unprocessed; index < length; ++index) {
     245           0 :       dst_data[index] = ToLatin1Lower(static_cast<uint16_t>(src_data[index]));  // Indexes kToLower unchecked; relies on the HasOnlyOneByteChars precondition DCHECKed above.
     246             :     }
     247             :   }
     248             : 
     249          18 :   return dst;
     250             : }
     251             : 
     252        4882 : MUST_USE_RESULT Object* ConvertToLower(Handle<String> s, Isolate* isolate) {  // Lowercases s, returning s itself when a short string is found to need no change.
     253        4882 :   if (!s->HasOnlyOneByteChars()) {
     254             :     // Use a slower implementation for strings with characters beyond U+00FF.
     255        1415 :     return LocaleConvertCase(s, isolate, false, "");
     256             :   }
     257             : 
     258             :   int length = s->length();
     259             : 
     260             :   // We rely here on the fact that lowercasing a Latin1 string never
     261             :   // changes its length and that the result always fits in the Latin1
     262             :   // range in the *root locale*. Neither guarantee holds for ToUpperCase,
     263             :   // even in the root locale.
     264             : 
     265             :   // Scan the string for uppercase and non-ASCII characters for strings
     266             :   // shorter than a machine-word without any memory allocation overhead.
     267             :   // TODO(jshin): Apply this to a longer input by breaking FastAsciiConvert()
     268             :   // to two parts, one for scanning the prefix with no change and the other for
     269             :   // handling ASCII-only characters.
     270             : 
     271             :   bool is_short = length < static_cast<int>(sizeof(uintptr_t));  // "Short" means fewer characters than the byte size of a pointer-sized integer.
     272        3467 :   if (is_short) {
     273        1494 :     bool is_lower_ascii = FindFirstUpperOrNonAscii(*s, length) == length;
     274        2219 :     if (is_lower_ascii) return *s;  // Already lowercase ASCII: skip the allocation below.
     275             :   }
     276             : 
     277             :   Handle<SeqOneByteString> result =
     278        5484 :       isolate->factory()->NewRawOneByteString(length).ToHandleChecked();  // Same length as the input, per the length guarantee described above.
     279             : 
     280        2742 :   return ConvertOneByteToLower(*s, *result, isolate);  // May still return s rather than result if no character changed.
     281             : }
     282             : 
     283        6306 : MUST_USE_RESULT Object* ConvertToUpper(Handle<String> s, Isolate* isolate) {  // Uppercases s, using one-byte fast paths where possible and ICU otherwise.
     284             :   int32_t length = s->length();
     285        6306 :   if (s->HasOnlyOneByteChars() && length > 0) {  // Empty strings and strings with characters above U+00FF skip to the LocaleConvertCase call at the bottom.
     286             :     Handle<SeqOneByteString> result =
     287       12048 :         isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
     288             : 
     289             :     DCHECK(s->IsFlat());
     290             :     int sharp_s_count;  // Set by ToUpperOneByte on every path that reaches the uses after the inner scope.
     291             :     bool is_result_single_byte;  // Likewise assigned from ToUpperOneByte's return value.
     292             :     {
     293             :       DisallowHeapAllocation no_gc;
     294        6024 :       String::FlatContent flat = s->GetFlatContent();
     295        6024 :       uint8_t* dest = result->GetChars();
     296        6024 :       if (flat.IsOneByte()) {
     297             :         Vector<const uint8_t> src = flat.ToOneByteVector();
     298        6024 :         bool has_changed_character = false;
     299             :         int index_to_first_unprocessed =
     300        6024 :             FastAsciiConvert<false>(reinterpret_cast<char*>(result->GetChars()),  // NOTE(review): presumably uppercases the leading ASCII run and returns where it stopped; confirm in src/string-case.h.
     301             :                                     reinterpret_cast<const char*>(src.start()),
     302        6024 :                                     length, &has_changed_character);
     303        6024 :         if (index_to_first_unprocessed == length)
     304       11620 :           return has_changed_character ? *result : *s;  // Whole string handled by the fast converter; s itself is returned if nothing changed.
     305             :         // If not ASCII, we keep the result up to index_to_first_unprocessed and
     306             :         // process the rest.
     307             :         is_result_single_byte =
     308         214 :             ToUpperOneByte(src.SubVector(index_to_first_unprocessed, length),
     309         428 :                            dest + index_to_first_unprocessed, &sharp_s_count);
     310             :       } else {
     311             :         DCHECK(flat.IsTwoByte());
     312           0 :         Vector<const uint16_t> src = flat.ToUC16Vector();
     313           0 :         if (ToUpperFastASCII(src, result)) return *result;  // On false, result was still written; ToUpperOneByte below overwrites it from the start.
     314           0 :         is_result_single_byte = ToUpperOneByte(src, dest, &sharp_s_count);
     315             :       }
     316             :     }
     317             : 
     318             :     // Go to the full Unicode path if there are characters whose uppercase
     319             :     // is beyond the Latin-1 range (cannot be represented in OneByteString).
     320         214 :     if (V8_UNLIKELY(!is_result_single_byte)) {
     321          66 :       return LocaleConvertCase(s, isolate, true, "");
     322             :     }
     323             : 
     324         211 :     if (sharp_s_count == 0) return *result;  // No sharp-s was skipped, so result already holds the complete output.
     325             : 
     326             :     // We have sharp_s_count sharp-s characters, but the result is still
     327             :     // in the Latin-1 range.
     328         170 :     ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
     329             :         isolate, result,
     330             :         isolate->factory()->NewRawOneByteString(length + sharp_s_count));  // One extra character per sharp-s, since each becomes "SS".
     331             :     DisallowHeapAllocation no_gc;
     332          85 :     String::FlatContent flat = s->GetFlatContent();  // Fetched again: the earlier FlatContent was local to the inner scope, and an allocation has happened since.
     333          85 :     if (flat.IsOneByte()) {
     334          85 :       ToUpperWithSharpS(flat.ToOneByteVector(), result);  // Redoes the conversion from the start of s into the larger string.
     335             :     } else {
     336           0 :       ToUpperWithSharpS(flat.ToUC16Vector(), result);
     337             :     }
     338             : 
     339          85 :     return *result;
     340             :   }
     341             : 
     342         282 :   return LocaleConvertCase(s, isolate, true, "");
     343             : }
     344             : 
     345        5261 : MUST_USE_RESULT Object* ConvertCase(Handle<String> s, bool is_upper,  // Dispatches to ConvertToUpper or ConvertToLower according to is_upper.
     346             :                                     Isolate* isolate) {
     347        5261 :   return is_upper ? ConvertToUpper(s, isolate) : ConvertToLower(s, isolate);
     348             : }
     349             : 
     350          36 : ICUTimezoneCache::ICUTimezoneCache() : timezone_(nullptr) { Clear(); }  // timezone_ starts null; Clear() then empties both cached name buffers.
     351             : 
     352          24 : ICUTimezoneCache::~ICUTimezoneCache() { Clear(); }  // Clear() deletes timezone_, so the object this cache created is released here.
     353             : 
     354          24 : const char* ICUTimezoneCache::LocalTimezone(double time_ms) {  // Returns the cached display name of the time zone, filling the cache on first use.
     355          24 :   bool is_dst = DaylightSavingsOffset(time_ms) != 0;  // A non-zero DST offset at time_ms selects the daylight-saving name.
     356          24 :   char* name = is_dst ? dst_timezone_name_ : timezone_name_;  // Two separate buffers, one per DST state.
     357          24 :   if (name[0] == '\0') {  // Empty buffer means not computed yet (Clear() resets it to this state).
     358             :     icu::UnicodeString result;
     359          48 :     GetTimeZone()->getDisplayName(is_dst, icu::TimeZone::LONG, result);
     360             :     result += '\0';  // Append a NUL so the UTF-8 bytes written into name form a terminated C string.
     361             : 
     362          48 :     icu::CheckedArrayByteSink byte_sink(name, kMaxTimezoneChars);
     363          24 :     result.toUTF8(byte_sink);
     364          48 :     CHECK(!byte_sink.Overflowed());  // A name that does not fit in kMaxTimezoneChars bytes fails this CHECK.
     365             :   }
     366          24 :   return const_cast<const char*>(name);  // Points into this object's own buffer.
     367             : }
     368             : 
     369           0 : icu::TimeZone* ICUTimezoneCache::GetTimeZone() {  // Returns the cached time zone object, creating it on first use.
     370          90 :   if (timezone_ == nullptr) {
     371          12 :     timezone_ = icu::TimeZone::createDefault();  // Stored in timezone_ and deleted later by Clear().
     372             :   }
     373          90 :   return timezone_;
     374             : }
     375             : 
     376          66 : bool ICUTimezoneCache::GetOffsets(double time_ms, int32_t* raw_offset,  // Queries the cached time zone for its raw and DST offsets at time_ms.
     377             :                                   int32_t* dst_offset) {  // Both pointers are dereferenced unconditionally and must be non-null.
     378          66 :   UErrorCode status = U_ZERO_ERROR;
     379          66 :   GetTimeZone()->getOffset(time_ms, false, *raw_offset, *dst_offset, status);  // NOTE(review): the `false` argument presumably marks time_ms as UTC rather than local time; confirm against ICU's TimeZone::getOffset.
     380         132 :   return U_SUCCESS(status);  // True when ICU reported no error.
     381             : }
     382             : 
     383          54 : double ICUTimezoneCache::DaylightSavingsOffset(double time_ms) {  // Returns the DST offset GetOffsets reports for time_ms, or 0 on failure.
     384             :   int32_t raw_offset, dst_offset;
     385          54 :   if (!GetOffsets(time_ms, &raw_offset, &dst_offset)) return 0;
     386          54 :   return dst_offset;  // Returned in whatever unit getOffset uses; raw_offset is ignored here.
     387             : }
     388             : 
     389          12 : double ICUTimezoneCache::LocalTimeOffset() {  // Returns the raw offset GetOffsets reports for the current time, or 0 on failure.
     390             :   int32_t raw_offset, dst_offset;
     391          12 :   if (!GetOffsets(icu::Calendar::getNow(), &raw_offset, &dst_offset)) return 0;
     392          12 :   return raw_offset;  // dst_offset is ignored here.
     393             : }
     394             : 
     395          12 : void ICUTimezoneCache::Clear() {  // Drops the cached time zone object and both cached names.
     396          24 :   delete timezone_;  // Deleting a null pointer is a no-op, which covers the call from the constructor.
     397          36 :   timezone_ = nullptr;  // GetTimeZone() recreates the object on the next request.
     398          36 :   timezone_name_[0] = '\0';  // An empty string marks the name as not yet computed; see LocalTimezone.
     399          36 :   dst_timezone_name_[0] = '\0';
     400          12 : }
     401             : 
     402             : }  // namespace internal
     403             : }  // namespace v8

Generated by: LCOV version 1.10