Line data Source code
1 : // Copyright 2014 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 :
6 : #include "src/unicode-inl.h"
7 : #include "src/unicode-decoder.h"
8 : #include <stdio.h>
9 : #include <stdlib.h>
10 :
11 : namespace unibrow {
12 :
13 2106 : void Utf8DecoderBase::Reset(uint16_t* buffer, size_t buffer_length,
14 : const uint8_t* stream, size_t stream_length) {
15 : // Assume everything will fit in the buffer and stream won't be needed.
16 2106 : last_byte_of_buffer_unused_ = false;
17 2106 : unbuffered_start_ = NULL;
18 2106 : unbuffered_length_ = 0;
19 : bool writing_to_buffer = true;
20 : // Loop until stream is read, writing to buffer as long as buffer has space.
21 : size_t utf16_length = 0;
22 8489246 : while (stream_length != 0) {
23 8485034 : size_t cursor = 0;
24 8485034 : uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);
25 : DCHECK(cursor > 0 && cursor <= stream_length);
26 8485034 : stream += cursor;
27 8485034 : stream_length -= cursor;
28 8485034 : bool is_two_characters = character > Utf16::kMaxNonSurrogateCharCode;
29 8485034 : utf16_length += is_two_characters ? 2 : 1;
30 : // Don't need to write to the buffer, but still need utf16_length.
31 16970068 : if (!writing_to_buffer) continue;
32 : // Write out the characters to the buffer.
33 : // Must check for equality with buffer_length as we've already updated it.
34 358992 : if (utf16_length <= buffer_length) {
35 358992 : if (is_two_characters) {
36 3048 : *buffer++ = Utf16::LeadSurrogate(character);
37 6096 : *buffer++ = Utf16::TrailSurrogate(character);
38 : } else {
39 355944 : *buffer++ = character;
40 : }
41 358992 : if (utf16_length == buffer_length) {
42 : // Just wrote last character of buffer
43 : writing_to_buffer = false;
44 526 : unbuffered_start_ = stream;
45 526 : unbuffered_length_ = stream_length;
46 : }
47 : continue;
48 : }
49 : // Have gone over buffer.
50 : // Last char of buffer is unused, set cursor back.
51 : DCHECK(is_two_characters);
52 : writing_to_buffer = false;
53 0 : last_byte_of_buffer_unused_ = true;
54 0 : unbuffered_start_ = stream - cursor;
55 0 : unbuffered_length_ = stream_length + cursor;
56 : }
57 2106 : utf16_length_ = utf16_length;
58 2106 : }
59 :
60 :
61 526 : void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream,
62 : size_t stream_length, uint16_t* data,
63 : size_t data_length) {
64 8127094 : while (data_length != 0) {
65 8126042 : size_t cursor = 0;
66 8126042 : uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);
67 : // There's a total lack of bounds checking for stream
68 : // as it was already done in Reset.
69 8126042 : stream += cursor;
70 : DCHECK(stream_length >= cursor);
71 8126042 : stream_length -= cursor;
72 8126042 : if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {
73 113183 : *data++ = Utf16::LeadSurrogate(character);
74 226366 : *data++ = Utf16::TrailSurrogate(character);
75 : DCHECK(data_length > 1);
76 113183 : data_length -= 2;
77 : } else {
78 8012859 : *data++ = character;
79 8012859 : data_length -= 1;
80 : }
81 : }
82 526 : }
83 :
84 : } // namespace unibrow
|