Line data Source code
1 : // Copyright 2014 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_UNICODE_DECODER_H_
6 : #define V8_UNICODE_DECODER_H_
7 :
8 : #include <sys/types.h>
9 : #include <algorithm>
10 : #include "src/globals.h"
11 : #include "src/memcopy.h"
12 : #include "src/unicode.h"
13 : #include "src/vector.h"
14 :
15 : namespace unibrow {
16 :
17 : class Utf8Iterator {
18 : public:
19 : explicit Utf8Iterator(const v8::internal::Vector<const char>& stream)
20 : : Utf8Iterator(stream, 0, false) {}
21 : Utf8Iterator(const v8::internal::Vector<const char>& stream, size_t offset,
22 : bool trailing)
23 : : stream_(stream),
24 : cursor_(offset),
25 : offset_(0),
26 : char_(0),
27 30416398 : trailing_(false) {
28 : DCHECK_LE(offset, stream.length());
29 : // Read the first char, setting offset_ to offset in the process.
30 30416398 : ++*this;
31 :
32 : // This must be set after reading the first char, since the offset marks
33 : // the start of the octet sequence that the trailing char is part of.
34 30416372 : trailing_ = trailing;
35 : if (trailing) {
36 : DCHECK_GT(char_, Utf16::kMaxNonSurrogateCharCode);
37 : }
38 : }
39 :
40 : uint16_t operator*();
41 : Utf8Iterator& operator++();
42 : Utf8Iterator operator++(int);
43 : bool Done();
44 : bool Trailing() { return trailing_; }
45 : size_t Offset() { return offset_; }
46 :
47 : private:
48 : const v8::internal::Vector<const char>& stream_;
49 : size_t cursor_;
50 : size_t offset_;
51 : uint32_t char_;
52 : bool trailing_;
53 : };
54 :
55 : class V8_EXPORT_PRIVATE Utf8DecoderBase {
56 : public:
57 : // Initialization done in subclass.
58 : inline Utf8DecoderBase();
59 : inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
60 : const v8::internal::Vector<const char>& stream);
61 : inline size_t Utf16Length() const { return utf16_length_; }
62 :
63 : protected:
64 : // This reads all characters and sets the utf16_length_.
65 : // The first buffer_length utf16 chars are cached in the buffer.
66 : void Reset(uint16_t* buffer, size_t buffer_length,
67 : const v8::internal::Vector<const char>& vector);
68 : static void WriteUtf16Slow(uint16_t* data, size_t length,
69 : const v8::internal::Vector<const char>& stream,
70 : size_t offset, bool trailing);
71 :
72 : size_t bytes_read_;
73 : size_t chars_written_;
74 : size_t utf16_length_;
75 : bool trailing_;
76 :
77 : private:
78 : DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
79 : };
80 :
81 : template <size_t kBufferSize>
82 : class Utf8Decoder : public Utf8DecoderBase {
83 : public:
84 : inline Utf8Decoder() = default;
85 : explicit inline Utf8Decoder(const v8::internal::Vector<const char>& stream);
86 : inline void Reset(const v8::internal::Vector<const char>& stream);
87 : inline size_t WriteUtf16(
88 : uint16_t* data, size_t length,
89 : const v8::internal::Vector<const char>& stream) const;
90 :
91 : private:
92 : uint16_t buffer_[kBufferSize];
93 : };
94 :
95 : Utf8DecoderBase::Utf8DecoderBase()
96 62428 : : bytes_read_(0), chars_written_(0), utf16_length_(0), trailing_(false) {}
97 :
98 : Utf8DecoderBase::Utf8DecoderBase(
99 : uint16_t* buffer, size_t buffer_length,
100 : const v8::internal::Vector<const char>& stream) {
101 : Reset(buffer, buffer_length, stream);
102 : }
103 :
104 : template <size_t kBufferSize>
105 : Utf8Decoder<kBufferSize>::Utf8Decoder(
106 : const v8::internal::Vector<const char>& stream)
107 : : Utf8DecoderBase(buffer_, kBufferSize, stream) {}
108 :
109 : template <size_t kBufferSize>
110 : void Utf8Decoder<kBufferSize>::Reset(
111 : const v8::internal::Vector<const char>& stream) {
112 145 : Utf8DecoderBase::Reset(buffer_, kBufferSize, stream);
113 : }
114 :
115 : template <size_t kBufferSize>
116 145 : size_t Utf8Decoder<kBufferSize>::WriteUtf16(
117 : uint16_t* data, size_t data_length,
118 : const v8::internal::Vector<const char>& stream) const {
119 : DCHECK_GT(data_length, 0);
120 290 : data_length = std::min(data_length, utf16_length_);
121 :
122 : // memcpy everything in buffer.
123 290 : size_t memcpy_length = std::min(data_length, chars_written_);
124 145 : v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
125 :
126 145 : if (data_length <= chars_written_) return data_length;
127 :
128 : // Copy the rest the slow way.
129 9 : WriteUtf16Slow(data + chars_written_, data_length - chars_written_, stream,
130 : bytes_read_, trailing_);
131 9 : return data_length;
132 : }
133 :
// Helpers for the Latin-1 range (code points up to kMaxChar).
class Latin1 {
 public:
  // Largest code point representable in Latin-1.
  static const unsigned kMaxChar = 0xff;
  // Convert the character to its Latin-1 case equivalent if possible;
  // otherwise the character is returned unchanged.
  static inline uint16_t TryConvertToLatin1(uint16_t c);
};

uint16_t Latin1::TryConvertToLatin1(uint16_t c) {
  // These are equivalent characters in Unicode: U+039C (Greek capital mu) and
  // U+03BC (Greek small mu) both map to U+00B5 (micro sign).
  if (c == 0x39c || c == 0x3bc) return 0xb5;
  // U+0178 is the uppercase of the Latin-1 character U+00FF, but lies outside
  // of Latin-1 itself.
  if (c == 0x178) return 0xff;
  // Everything else has no special Latin-1 equivalent.
  return c;
}
154 :
155 :
156 : } // namespace unibrow
157 :
158 : #endif // V8_UNICODE_DECODER_H_
|