LCOV - code coverage report
Current view: top level - test/unittests - unicode-unittest.cc (source / functions) Hit Total Coverage
Test: app.info Lines: 69 69 100.0 %
Date: 2019-01-20 Functions: 15 18 83.3 %

          Line data    Source code
       1             : // Copyright 2016 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #include <memory>
       6             : #include <string>
       7             : #include <vector>
       8             : 
       9             : #include "src/unicode-decoder.h"
      10             : #include "src/unicode-inl.h"
      11             : #include "src/vector.h"
      12             : #include "testing/gtest/include/gtest/gtest.h"
      13             : 
      14             : namespace v8 {
      15             : namespace internal {
      16             : 
      17             : namespace {
      18             : 
      19         959 : void DecodeNormally(const std::vector<byte>& bytes,
      20             :                     std::vector<unibrow::uchar>* output) {
      21         132 :   size_t cursor = 0;
      22         959 :   while (cursor < bytes.size()) {
      23             :     output->push_back(
      24        1390 :         unibrow::Utf8::ValueOf(bytes.data() + cursor, bytes.size(), &cursor));
      25             :   }
      26         132 : }
      27             : 
      28             : template <size_t kBufferSize>
      29         140 : void DecodeUtf16(unibrow::Utf8Decoder<kBufferSize>* decoder,
      30             :                  const std::vector<byte>& bytes,
      31             :                  std::vector<unibrow::uchar>* output) {
      32         140 :   auto vector = Vector<const char>::cast(VectorOf(bytes));
      33             :   decoder->Reset(vector);
      34             : 
      35         140 :   std::vector<uint16_t> utf16(decoder->Utf16Length());
      36         140 :   decoder->WriteUtf16(&(*utf16.begin()), decoder->Utf16Length(), vector);
      37             : 
      38             :   // Decode back into code points
      39        1690 :   for (size_t i = 0; i < utf16.size(); i++) {
      40         705 :     uint16_t b = utf16[i];
      41         705 :     if (unibrow::Utf16::IsLeadSurrogate(b)) {
      42          80 :       output->push_back(unibrow::Utf16::CombineSurrogatePair(b, utf16[++i]));
      43             :     } else {
      44        1370 :       output->push_back(b);
      45             :     }
      46             :   }
      47         140 : }
      48             : 
      49        1292 : void DecodeIncrementally(const std::vector<byte>& bytes,
      50             :                          std::vector<unibrow::uchar>* output) {
      51         132 :   unibrow::Utf8::Utf8IncrementalBuffer buffer = 0;
      52         132 :   unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
      53        1292 :   for (size_t i = 0; i < bytes.size();) {
      54             :     unibrow::uchar result =
      55        1028 :         unibrow::Utf8::ValueOfIncremental(bytes[i], &i, &state, &buffer);
      56        1028 :     if (result != unibrow::Utf8::kIncomplete) {
      57         689 :       output->push_back(result);
      58             :     }
      59             :   }
      60         132 :   unibrow::uchar result = unibrow::Utf8::ValueOfIncrementalFinish(&state);
      61         132 :   if (result != unibrow::Utf8::kBufferEmpty) {
      62           6 :     output->push_back(result);
      63             :   }
      64         132 : }
      65             : 
      66             : }  // namespace
      67             : 
      68       15128 : TEST(UnicodeTest, Utf16BufferReuse) {
      69             :   unibrow::Utf8Decoder<4> utf16_decoder;
      70             : 
      71             :   // Not enough continuation bytes before string ends.
      72             :   typedef struct {
      73             :     std::vector<byte> bytes;
      74             :     std::vector<unibrow::uchar> unicode_expected;
      75          35 :   } TestCase;
      76             : 
      77             :   TestCase data[] = {
      78             :       {{0x00}, {0x0}},
      79             :       {{0xC2, 0x80}, {0x80}},
      80             :       {{0xE0, 0xA0, 0x80}, {0x800}},
      81             :       {{0xF0, 0x90, 0x80, 0x80}, {0x10000}},
      82             :       {{0xE0, 0xA0, 0x80}, {0x800}},
      83             :       {{0xC2, 0x80}, {0x80}},
      84             :       {{0x00}, {0x0}},
      85           7 :   };
      86           8 :   for (auto test : data) {
      87             :     // For figuring out which test fails:
      88           7 :     fprintf(stderr, "test: ");
      89          30 :     for (auto b : test.bytes) {
      90          16 :       fprintf(stderr, "%x ", b);
      91             :     }
      92           7 :     fprintf(stderr, "\n");
      93             : 
      94             :     std::vector<unibrow::uchar> output_utf16;
      95           7 :     DecodeUtf16(&utf16_decoder, test.bytes, &output_utf16);
      96             : 
      97          21 :     CHECK_EQ(output_utf16.size(), test.unicode_expected.size());
      98          21 :     for (size_t i = 0; i < output_utf16.size(); ++i) {
      99          14 :       CHECK_EQ(output_utf16[i], test.unicode_expected[i]);
     100             :     }
     101          22 :   }
     102           1 : }
     103             : 
     104       15128 : TEST(UnicodeTest, SurrogateOverrunsBuffer) {
     105             :   unibrow::Utf8Decoder<2> utf16_decoder;
     106             : 
     107             :   std::vector<unibrow::uchar> output_utf16;
     108             :   // Not enough continuation bytes before string ends.
     109             :   DecodeUtf16(&utf16_decoder, {0x00, 0xF0, 0x90, 0x80, 0x80, 0x00},
     110           2 :               &output_utf16);
     111           1 :   CHECK_EQ(output_utf16[0], 0x00);
     112           1 :   CHECK_EQ(output_utf16[1], 0x10000);
     113           1 :   CHECK_EQ(output_utf16[0], 0x00);
     114           1 : }
     115             : 
     116       15128 : TEST(UnicodeTest, IncrementalUTF8DecodingVsNonIncrementalUtf8Decoding) {
     117             :   // Unfortunately, V8 has two UTF-8 decoders. This test checks that they
     118             :   // produce the same result. This test was inspired by
     119             :   // https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt .
     120             :   typedef struct {
     121             :     std::vector<byte> bytes;
     122             :     std::vector<unibrow::uchar> unicode_expected;
     123         660 :   } TestCase;
     124             : 
     125             :   TestCase data[] = {
     126             :       // Correct UTF-8 text.
     127             :       {{0xCE, 0xBA, 0xE1, 0xBD, 0xB9, 0xCF, 0x83, 0xCE, 0xBC, 0xCE, 0xB5},
     128             :        {0x3BA, 0x1F79, 0x3C3, 0x3BC, 0x3B5}},
     129             : 
     130             :       // First possible sequence of a certain length:
     131             :       // 1 byte
     132             :       {{0x00}, {0x0}},
     133             :       // 2 bytes
     134             :       {{0xC2, 0x80}, {0x80}},
     135             :       // 3 bytes
     136             :       {{0xE0, 0xA0, 0x80}, {0x800}},
     137             :       // 4 bytes
     138             :       {{0xF0, 0x90, 0x80, 0x80}, {0x10000}},
     139             :       // 5 bytes (not supported)
     140             :       {{0xF8, 0x88, 0x80, 0x80, 0x80},
     141             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     142             :       // 6 bytes (not supported)
     143             :       {{0xFC, 0x84, 0x80, 0x80, 0x80, 0x80},
     144             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     145             : 
     146             :       // Last possible sequence of certain length:
     147             :       // 1 byte
     148             :       {{0x7F}, {0x7F}},
     149             :       // 2 bytes
     150             :       {{0xDF, 0xBF}, {0x7FF}},
     151             :       // 3 bytes
     152             :       {{0xEF, 0xBF, 0xBF}, {0xFFFF}},
     153             :       // 4 bytes (this sequence is not a valid code point)
     154             :       {{0xF7, 0xBF, 0xBF, 0xBF}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     155             :       // 5 bytes (not supported)
     156             :       {{0xFB, 0xBF, 0xBF, 0xBF, 0xBF},
     157             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     158             :       // 6 bytes (not supported)
     159             :       {{0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF},
     160             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     161             :       // Other boundary conditions:
     162             :       {{0xED, 0x9F, 0xBF}, {0xD7FF}},
     163             :       {{0xEE, 0x80, 0x80}, {0xE000}},
     164             :       // U+fffd (invalid code point)
     165             :       {{0xEF, 0xBF, 0xBD}, {0xFFFD}},
     166             :       // U+10ffff (last valid code point)
     167             :       {{0xF4, 0x8F, 0xBF, 0xBF}, {0x10FFFF}},
     168             :       // First invalid (too large) code point
     169             :       {{0xF4, 0x90, 0x80, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     170             : 
     171             :       // Malformed sequences:
     172             :       // Unexpected continuation bytes:
     173             :       // First continuation byte
     174             :       {{0x80}, {0xFFFD}},
     175             :       // Last continuation byte
     176             :       {{0xBF}, {0xFFFD}},
     177             :       // 2 continuation bytes
     178             :       {{0x80, 0xBF}, {0xFFFD, 0xFFFD}},
     179             :       // 3 continuation bytes
     180             :       {{0x80, 0xBF, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD}},
     181             :       // 4 continuation bytes
     182             :       {{0x80, 0xBF, 0x80, 0xBF}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     183             :       // 5 continuation bytes
     184             :       {{0x80, 0xBF, 0x80, 0xBF, 0x80},
     185             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     186             :       // 6 continuation bytes
     187             :       {{0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF},
     188             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     189             :       // 7 continuation bytes
     190             :       {{0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0xBF},
     191             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     192             :       // Sequence of all 64 possible continuation bytes
     193             :       {{0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A,
     194             :         0x8B, 0x8C, 0x8D, 0x8E, 0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
     195             :         0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, 0xA0,
     196             :         0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB,
     197             :         0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6,
     198             :         0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF},
     199             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
     200             :         0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
     201             :         0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
     202             :         0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
     203             :         0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
     204             :         0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
     205             :         0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
     206             :         0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     207             :       // Using each possible continuation byte in a two-byte sequence:
     208             :       {{0xD0, 0x80, 0xD0, 0x81, 0xD0, 0x82, 0xD0, 0x83, 0xD0, 0x84, 0xD0, 0x85,
     209             :         0xD0, 0x86, 0xD0, 0x87, 0xD0, 0x88, 0xD0, 0x89, 0xD0, 0x8A, 0xD0, 0x8B,
     210             :         0xD0, 0x8C, 0xD0, 0x8D, 0xD0, 0x8E, 0xD0, 0x8F, 0xD0, 0x90, 0xD0, 0x91,
     211             :         0xD0, 0x92, 0xD0, 0x93, 0xD0, 0x94, 0xD0, 0x95, 0xD0, 0x96, 0xD0, 0x97,
     212             :         0xD0, 0x98, 0xD0, 0x99, 0xD0, 0x9A, 0xD0, 0x9B, 0xD0, 0x9C, 0xD0, 0x9D,
     213             :         0xD0, 0x9E, 0xD0, 0x9F, 0xD0, 0xA0, 0xD0, 0xA1, 0xD0, 0xA2, 0xD0, 0xA3,
     214             :         0xD0, 0xA4, 0xD0, 0xA5, 0xD0, 0xA6, 0xD0, 0xA7, 0xD0, 0xA8, 0xD0, 0xA9,
     215             :         0xD0, 0xAA, 0xD0, 0xAB, 0xD0, 0xAC, 0xD0, 0xAD, 0xD0, 0xAE, 0xD0, 0xAF,
     216             :         0xD0, 0xB0, 0xD0, 0xB1, 0xD0, 0xB2, 0xD0, 0xB3, 0xD0, 0xB4, 0xD0, 0xB5,
     217             :         0xD0, 0xB6, 0xD0, 0xB7, 0xD0, 0xB8, 0xD0, 0xB9, 0xD0, 0xBA, 0xD0, 0xBB,
     218             :         0xD0, 0xBC, 0xD0, 0xBD, 0xD0, 0xBE, 0xD0, 0xBF},
     219             :        {0x400, 0x401, 0x402, 0x403, 0x404, 0x405, 0x406, 0x407, 0x408, 0x409,
     220             :         0x40A, 0x40B, 0x40C, 0x40D, 0x40E, 0x40F, 0x410, 0x411, 0x412, 0x413,
     221             :         0x414, 0x415, 0x416, 0x417, 0x418, 0x419, 0x41A, 0x41B, 0x41C, 0x41D,
     222             :         0x41E, 0x41F, 0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427,
     223             :         0x428, 0x429, 0x42A, 0x42B, 0x42C, 0x42D, 0x42E, 0x42F, 0x430, 0x431,
     224             :         0x432, 0x433, 0x434, 0x435, 0x436, 0x437, 0x438, 0x439, 0x43A, 0x43B,
     225             :         0x43C, 0x43D, 0x43E, 0x43F}},
     226             : 
     227             :       // Lonely first bytes:
     228             :       // All 32 first bytes of 32-byte sequences, each followed by a space
     229             :       // (generates 32 invalid char + space sequences.
     230             :       {{0xC0, 0x20, 0xC1, 0x20, 0xC2, 0x20, 0xC3, 0x20, 0xC4, 0x20, 0xC5,
     231             :         0x20, 0xC6, 0x20, 0xC7, 0x20, 0xC8, 0x20, 0xC9, 0x20, 0xCA, 0x20,
     232             :         0xCB, 0x20, 0xCC, 0x20, 0xCD, 0x20, 0xCE, 0x20, 0xCF, 0x20, 0xD0,
     233             :         0x20, 0xD1, 0x20, 0xD2, 0x20, 0xD3, 0x20, 0xD4, 0x20, 0xD5, 0x20,
     234             :         0xD6, 0x20, 0xD7, 0x20, 0xD8, 0x20, 0xD9, 0x20, 0xDA, 0x20, 0xDB,
     235             :         0x20, 0xDC, 0x20, 0xDD, 0x20, 0xDE, 0x20, 0xDF, 0x20},
     236             :        {0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
     237             :         0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
     238             :         0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
     239             :         0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
     240             :         0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
     241             :         0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
     242             :         0xFFFD, 0x20, 0xFFFD, 0x20}},
     243             :       // All 16 first bytes of 3-byte sequences, each followed by a space
     244             :       // (generates 16 invalid char + space sequences):
     245             :       {{0xE0, 0x20, 0xE1, 0x20, 0xE2, 0x20, 0xE3, 0x20, 0xE4, 0x20, 0xE5,
     246             :         0x20, 0xE6, 0x20, 0xE7, 0x20, 0xE8, 0x20, 0xE9, 0x20, 0xEA, 0x20,
     247             :         0xEB, 0x20, 0xEC, 0x20, 0xED, 0x20, 0xEE, 0x20, 0xEF, 0x20},
     248             :        {0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
     249             :         0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
     250             :         0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
     251             :         0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20}},
     252             :       // All 8 first bytes of 4-byte sequences, each followed by a space
     253             :       // (generates 8 invalid char + space sequences):
     254             :       {{0xF0, 0x20, 0xF1, 0x20, 0xF2, 0x20, 0xF3, 0x20, 0xF4, 0x20, 0xF5, 0x20,
     255             :         0xF6, 0x20, 0xF7, 0x20},
     256             :        {0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
     257             :         0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20}},
     258             :       // All 4 first bytes of 5-byte sequences (not supported), each followed by
     259             :       // a space (generates 4 invalid char + space sequences):
     260             :       {{0xF8, 0x20, 0xF9, 0x20, 0xFA, 0x20, 0xFB, 0x20},
     261             :        {0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20}},
     262             :       // All 2 first bytes of 6-byte sequences (not supported), each followed by
     263             :       // a space (generates 2 invalid char + space sequences):
     264             :       {{0xFC, 0x20, 0xFD, 0x20}, {0xFFFD, 0x20, 0xFFFD, 0x20}},
     265             : 
     266             :       // Sequences with last continuation byte missing. Normally the whole
     267             :       // incomplete sequence generates a single invalid character (exceptions
     268             :       // explained below).
     269             : 
     270             :       // 2-byte sequences with last byte missing
     271             :       {{0xC0}, {0xFFFD}},
     272             :       {{0xDF}, {0xFFFD}},
     273             :       // 3-byte sequences with last byte missing.
     274             :       {{0xE8, 0x80}, {0xFFFD}},
     275             :       {{0xE0, 0xBF}, {0xFFFD}},
     276             :       {{0xEF, 0xBF}, {0xFFFD}},
     277             :       // Start of an overlong sequence. The first "maximal subpart" is the first
     278             :       // byte; it creates an invalid character. Each following byte generates an
     279             :       // invalid character too.
     280             :       {{0xE0, 0x80}, {0xFFFD, 0xFFFD}},
     281             :       // 4-byte sequences with last byte missing
     282             :       {{0xF1, 0x80, 0x80}, {0xFFFD}},
     283             :       {{0xF4, 0x8F, 0xBF}, {0xFFFD}},
     284             :       // Start of an overlong sequence. The first "maximal subpart" is the first
     285             :       // byte; it creates an invalid character. Each following byte generates an
     286             :       // invalid character too.
     287             :       {{0xF0, 0x80, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD}},
     288             :       // 5-byte sequences (not supported) with last byte missing
     289             :       {{0xF8, 0x80, 0x80, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     290             :       {{0xFB, 0xBF, 0xBF, 0xBF}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     291             :       // 6-byte sequences (not supported) with last byte missing
     292             :       {{0xFC, 0x80, 0x80, 0x80, 0x80},
     293             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     294             :       {{0xFD, 0xBF, 0xBF, 0xBF, 0xBF},
     295             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     296             : 
     297             :       // Concatenation of incomplete sequences: above incomplete sequences
     298             :       // concatenated.
     299             :       {{0xC0, 0xDF, 0xE8, 0x80, 0xE0, 0xBF, 0xEF, 0xBF, 0xE0, 0x80,
     300             :         0xF1, 0x80, 0x80, 0xF4, 0x8F, 0xBF, 0xF0, 0x80, 0x80, 0xF8,
     301             :         0x80, 0x80, 0x80, 0xFB, 0xBF, 0xBF, 0xBF, 0xFC, 0x80, 0x80,
     302             :         0x80, 0x80, 0xFD, 0xBF, 0xBF, 0xBF, 0xBF},
     303             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
     304             :         0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
     305             :         0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
     306             :         0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     307             : 
     308             :       // Incomplete sequence tests repeated with a space after the incomplete
     309             :       // sequence.
     310             : 
     311             :       // 2-byte sequences with last byte missing
     312             :       {{0xC0, 0x20}, {0xFFFD, 0x20}},
     313             :       {{0xDF, 0x20}, {0xFFFD, 0x20}},
     314             :       // 3-byte sequences with last byte missing
     315             :       {{0xE8, 0x80, 0x20}, {0xFFFD, 0x20}},
     316             :       {{0xE0, 0xBF, 0x20}, {0xFFFD, 0x20}},
     317             :       {{0xEF, 0xBF, 0x20}, {0xFFFD, 0x20}},
     318             :       // Start of overlong 3-byte sequence with last byte missing
     319             :       {{0xE0, 0x80, 0x20}, {0xFFFD, 0xFFFD, 0x20}},
     320             :       // 4-byte sequences with last byte missing
     321             :       {{0xF1, 0x80, 0x80, 0x20}, {0xFFFD, 0x20}},
     322             :       {{0xF4, 0x8F, 0xBF, 0x20}, {0xFFFD, 0x20}},
     323             :       // Start of overlong 4-byte sequence with last byte missing
     324             :       {{0xF0, 0x80, 0x80, 0x20}, {0xFFFD, 0xFFFD, 0xFFFD, 0x20}},
     325             :       // 5-byte sequences (not supported) with last byte missing
     326             :       {{0xF8, 0x80, 0x80, 0x80, 0x20}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x20}},
     327             :       {{0xFB, 0xBF, 0xBF, 0xBF, 0x20}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x20}},
     328             :       // 6-byte sequences (not supported) with last byte missing
     329             :       {{0xFC, 0x80, 0x80, 0x80, 0x80, 0x20},
     330             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x20}},
     331             :       {{0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0x20},
     332             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x20}},
     333             : 
     334             :       // Impossible bytes
     335             :       {{0xFE}, {0xFFFD}},
     336             :       {{0xFF}, {0xFFFD}},
     337             :       {{0xFE, 0xFE, 0xFF, 0xFF}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     338             :       // Lead-byte-like bytes which aren't valid lead bytes.
     339             :       {{0xC0}, {0xFFFD}},
     340             :       {{0xC0, 0xAA}, {0xFFFD, 0xFFFD}},
     341             :       {{0xC1}, {0xFFFD}},
     342             :       {{0xC1, 0xAA}, {0xFFFD, 0xFFFD}},
     343             :       {{0xF5}, {0xFFFD}},
     344             :       {{0xF5, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     345             :       {{0xF6}, {0xFFFD}},
     346             :       {{0xF6, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     347             :       {{0xF7}, {0xFFFD}},
     348             :       {{0xF7, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     349             :       {{0xF8}, {0xFFFD}},
     350             :       {{0xF8, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     351             :       {{0xF9}, {0xFFFD}},
     352             :       {{0xF9, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     353             :       {{0xFA}, {0xFFFD}},
     354             :       {{0xFA, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     355             :       {{0xFB}, {0xFFFD}},
     356             :       {{0xFB, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     357             :       {{0xFC}, {0xFFFD}},
     358             :       {{0xFC, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     359             :       {{0xFD}, {0xFFFD}},
     360             :       {{0xFD, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     361             :       {{0xFE}, {0xFFFD}},
     362             :       {{0xFE, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     363             :       {{0xFF}, {0xFFFD}},
     364             :       {{0xFF, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     365             : 
     366             :       // Overlong sequences:
     367             : 
     368             :       // Overlong encodings for "/"
     369             :       {{0xC0, 0xAF}, {0xFFFD, 0xFFFD}},
     370             :       {{0xE0, 0x80, 0xAF}, {0xFFFD, 0xFFFD, 0xFFFD}},
     371             :       {{0xF0, 0x80, 0x80, 0xAF}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     372             :       // 5-byte sequence (not supported anyway)
     373             :       {{0xF8, 0x80, 0x80, 0x80, 0xAF},
     374             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     375             :       // 6-byte sequence (not supported anyway)
     376             :       {{0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF},
     377             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     378             : 
     379             :       // Maximum overlong sequences
     380             :       {{0xC1, 0xBF}, {0xFFFD, 0xFFFD}},
     381             :       {{0xE0, 0x9F, 0xBF}, {0xFFFD, 0xFFFD, 0xFFFD}},
     382             :       {{0xF0, 0x8F, 0xBF, 0xBF}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     383             :       // 5-byte sequence (not supported anyway)
     384             :       {{0xF8, 0x87, 0xBF, 0xBF, 0xBF},
     385             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     386             :       // 6-byte sequence (not supported anyway)
     387             :       {{0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF},
     388             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     389             : 
     390             :       // Overlong encodings for 0
     391             :       {{0xC0, 0x80}, {0xFFFD, 0xFFFD}},
     392             :       {{0xE0, 0x80, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD}},
     393             :       {{0xF0, 0x80, 0x80, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     394             :       // 5-byte sequence (not supported anyway)
     395             :       {{0xF8, 0x80, 0x80, 0x80, 0x80},
     396             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     397             :       // 6-byte sequence (not supported anyway)
     398             :       {{0xFC, 0x80, 0x80, 0x80, 0x80, 0x80},
     399             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     400             : 
     401             :       // Illegal code positions:
     402             : 
     403             :       // Single UTF-16 surrogates
     404             :       {{0xED, 0xA0, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD}},
     405             :       {{0xED, 0xA0, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD}},
     406             :       {{0xED, 0xAD, 0xBF}, {0xFFFD, 0xFFFD, 0xFFFD}},
     407             :       {{0xED, 0xAE, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD}},
     408             :       {{0xED, 0xAF, 0xBF}, {0xFFFD, 0xFFFD, 0xFFFD}},
     409             :       {{0xED, 0xB0, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD}},
     410             :       {{0xED, 0xBE, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD}},
     411             :       {{0xED, 0xBF, 0xBF}, {0xFFFD, 0xFFFD, 0xFFFD}},
     412             : 
     413             :       // Paired surrogates
     414             :       {{0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80},
     415             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     416             :       {{0xED, 0xA0, 0x80, 0xED, 0xBF, 0xBF},
     417             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     418             :       {{0xED, 0xAD, 0xBF, 0xED, 0xB0, 0x80},
     419             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     420             :       {{0xED, 0xAD, 0xBF, 0xED, 0xBF, 0xBF},
     421             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     422             :       {{0xED, 0xAE, 0x80, 0xED, 0xB0, 0x80},
     423             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     424             :       {{0xED, 0xAE, 0x80, 0xED, 0xBF, 0xBF},
     425             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     426             :       {{0xED, 0xAF, 0xBF, 0xED, 0xB0, 0x80},
     427             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     428             :       {{0xED, 0xAF, 0xBF, 0xED, 0xBF, 0xBF},
     429             :        {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
     430             : 
     431             :       // Surrogates with the last byte missing.
     432             :       {{0xED, 0xA0}, {0xFFFD, 0xFFFD}},
     433             :       {{0xED, 0xA0}, {0xFFFD, 0xFFFD}},
     434             :       {{0xED, 0xAD}, {0xFFFD, 0xFFFD}},
     435             :       {{0xED, 0xAE}, {0xFFFD, 0xFFFD}},
     436             :       {{0xED, 0xAF}, {0xFFFD, 0xFFFD}},
     437             :       {{0xED, 0xB0}, {0xFFFD, 0xFFFD}},
     438             :       {{0xED, 0xBE}, {0xFFFD, 0xFFFD}},
     439             :       {{0xED, 0xBF}, {0xFFFD, 0xFFFD}},
     440             : 
     441             :       // Other non-characters
     442             :       {{0xEF, 0xBF, 0xBE}, {0xFFFE}},
     443             :       {{0xEF, 0xBF, 0xBF}, {0xFFFF}},
     444             :       {{0xEF, 0xB7, 0x90, 0xEF, 0xB7, 0x91, 0xEF, 0xB7, 0x92, 0xEF, 0xB7, 0x93,
     445             :         0xEF, 0xB7, 0x94, 0xEF, 0xB7, 0x95, 0xEF, 0xB7, 0x96, 0xEF, 0xB7, 0x97,
     446             :         0xEF, 0xB7, 0x98, 0xEF, 0xB7, 0x99, 0xEF, 0xB7, 0x9A, 0xEF, 0xB7, 0x9B,
     447             :         0xEF, 0xB7, 0x9C, 0xEF, 0xB7, 0x9D, 0xEF, 0xB7, 0x9E, 0xEF, 0xB7, 0x9F,
     448             :         0xEF, 0xB7, 0xA0, 0xEF, 0xB7, 0xA1, 0xEF, 0xB7, 0xA2, 0xEF, 0xB7, 0xA3,
     449             :         0xEF, 0xB7, 0xA4, 0xEF, 0xB7, 0xA5, 0xEF, 0xB7, 0xA6, 0xEF, 0xB7, 0xA7,
     450             :         0xEF, 0xB7, 0xA8, 0xEF, 0xB7, 0xA9, 0xEF, 0xB7, 0xAA, 0xEF, 0xB7, 0xAB,
     451             :         0xEF, 0xB7, 0xAC, 0xEF, 0xB7, 0xAD, 0xEF, 0xB7, 0xAE, 0xEF, 0xB7, 0xAF},
     452             :        {0xFDD0, 0xFDD1, 0xFDD2, 0xFDD3, 0xFDD4, 0xFDD5, 0xFDD6, 0xFDD7,
     453             :         0xFDD8, 0xFDD9, 0xFDDA, 0xFDDB, 0xFDDC, 0xFDDD, 0xFDDE, 0xFDDF,
     454             :         0xFDE0, 0xFDE1, 0xFDE2, 0xFDE3, 0xFDE4, 0xFDE5, 0xFDE6, 0xFDE7,
     455             :         0xFDE8, 0xFDE9, 0xFDEA, 0xFDEB, 0xFDEC, 0xFDED, 0xFDEE, 0xFDEF}},
     456             :       {{0xF0, 0x9F, 0xBF, 0xBE, 0xF0, 0x9F, 0xBF, 0xBF, 0xF0, 0xAF, 0xBF,
     457             :         0xBE, 0xF0, 0xAF, 0xBF, 0xBF, 0xF0, 0xBF, 0xBF, 0xBE, 0xF0, 0xBF,
     458             :         0xBF, 0xBF, 0xF1, 0x8F, 0xBF, 0xBE, 0xF1, 0x8F, 0xBF, 0xBF, 0xF1,
     459             :         0x9F, 0xBF, 0xBE, 0xF1, 0x9F, 0xBF, 0xBF, 0xF1, 0xAF, 0xBF, 0xBE,
     460             :         0xF1, 0xAF, 0xBF, 0xBF, 0xF1, 0xBF, 0xBF, 0xBE, 0xF1, 0xBF, 0xBF,
     461             :         0xBF, 0xF2, 0x8F, 0xBF, 0xBE, 0xF2, 0x8F, 0xBF, 0xBF},
     462             :        {0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF,
     463             :         0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
     464             :         0x8FFFF}},
     465         132 :   };
     466             : 
     467             :   unibrow::Utf8Decoder<50> utf16_decoder;
     468             : 
     469         133 :   for (auto test : data) {
     470             :     // For figuring out which test fails:
     471         132 :     fprintf(stderr, "test: ");
     472        1184 :     for (auto b : test.bytes) {
     473         920 :       fprintf(stderr, "%x ", b);
     474             :     }
     475         132 :     fprintf(stderr, "\n");
     476             : 
     477             :     std::vector<unibrow::uchar> output_normal;
     478         132 :     DecodeNormally(test.bytes, &output_normal);
     479             : 
     480         396 :     CHECK_EQ(output_normal.size(), test.unicode_expected.size());
     481        1522 :     for (size_t i = 0; i < output_normal.size(); ++i) {
     482        1390 :       CHECK_EQ(output_normal[i], test.unicode_expected[i]);
     483             :     }
     484             : 
     485             :     std::vector<unibrow::uchar> output_incremental;
     486         132 :     DecodeIncrementally(test.bytes, &output_incremental);
     487             : 
     488         396 :     CHECK_EQ(output_incremental.size(), test.unicode_expected.size());
     489        1522 :     for (size_t i = 0; i < output_incremental.size(); ++i) {
     490        1390 :       CHECK_EQ(output_incremental[i], test.unicode_expected[i]);
     491             :     }
     492             : 
     493             :     std::vector<unibrow::uchar> output_utf16;
     494         132 :     DecodeUtf16(&utf16_decoder, test.bytes, &output_utf16);
     495             : 
     496         396 :     CHECK_EQ(output_utf16.size(), test.unicode_expected.size());
     497        1522 :     for (size_t i = 0; i < output_utf16.size(); ++i) {
     498        1390 :       CHECK_EQ(output_utf16[i], test.unicode_expected[i]);
     499             :     }
     500         397 :   }
     501           1 : }
     502             : 
     503             : }  // namespace internal
     504        9075 : }  // namespace v8

Generated by: LCOV version 1.10