Line data Source code
1 : // Copyright 2014 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_UNICODE_DECODER_H_
6 : #define V8_UNICODE_DECODER_H_
7 :
8 : #include <sys/types.h>
9 : #include "src/globals.h"
10 : #include "src/utils.h"
11 :
12 : namespace unibrow {
13 :
14 : class V8_EXPORT_PRIVATE Utf8DecoderBase {
15 : public:
16 : // Initialization done in subclass.
17 : inline Utf8DecoderBase();
18 : inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
19 : const uint8_t* stream, size_t stream_length);
20 : inline size_t Utf16Length() const { return utf16_length_; }
21 :
22 : protected:
23 : // This reads all characters and sets the utf16_length_.
24 : // The first buffer_length utf16 chars are cached in the buffer.
25 : void Reset(uint16_t* buffer, size_t buffer_length, const uint8_t* stream,
26 : size_t stream_length);
27 : static void WriteUtf16Slow(const uint8_t* stream, size_t stream_length,
28 : uint16_t* data, size_t length);
29 : const uint8_t* unbuffered_start_;
30 : size_t unbuffered_length_;
31 : size_t utf16_length_;
32 : bool last_byte_of_buffer_unused_;
33 :
34 : private:
35 : DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
36 : };
37 :
38 : template <size_t kBufferSize>
39 : class Utf8Decoder : public Utf8DecoderBase {
40 : public:
41 : inline Utf8Decoder() {}
42 : inline Utf8Decoder(const char* stream, size_t length);
43 : inline void Reset(const char* stream, size_t length);
44 : inline size_t WriteUtf16(uint16_t* data, size_t length) const;
45 :
46 : private:
47 : uint16_t buffer_[kBufferSize];
48 : };
49 :
50 :
51 : Utf8DecoderBase::Utf8DecoderBase()
52 : : unbuffered_start_(NULL),
53 : unbuffered_length_(0),
54 : utf16_length_(0),
55 146230 : last_byte_of_buffer_unused_(false) {}
56 :
57 :
58 : Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
59 : const uint8_t* stream, size_t stream_length) {
60 : Reset(buffer, buffer_length, stream, stream_length);
61 : }
62 :
63 :
64 : template <size_t kBufferSize>
65 : Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, size_t length)
66 : : Utf8DecoderBase(buffer_, kBufferSize,
67 : reinterpret_cast<const uint8_t*>(stream), length) {}
68 :
69 :
70 : template <size_t kBufferSize>
71 : void Utf8Decoder<kBufferSize>::Reset(const char* stream, size_t length) {
72 2102 : Utf8DecoderBase::Reset(buffer_, kBufferSize,
73 1816 : reinterpret_cast<const uint8_t*>(stream), length);
74 : }
75 :
76 :
77 : template <size_t kBufferSize>
78 1816 : size_t Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
79 : size_t length) const {
80 : DCHECK(length > 0);
81 1816 : if (length > utf16_length_) length = utf16_length_;
82 : // memcpy everything in buffer.
83 : size_t buffer_length =
84 1816 : last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
85 1816 : size_t memcpy_length = length <= buffer_length ? length : buffer_length;
86 1816 : v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
87 1816 : if (length <= buffer_length) return length;
88 : DCHECK(unbuffered_start_ != NULL);
89 : // Copy the rest the slow way.
90 526 : WriteUtf16Slow(unbuffered_start_, unbuffered_length_, data + buffer_length,
91 : length - buffer_length);
92 526 : return length;
93 : }
94 :
// Helpers for the Latin-1 character range.
class Latin1 {
 public:
  // Largest character value that still counts as Latin-1.
  static const unsigned kMaxChar = 0xff;

  // Maps a character above kMaxChar to a single Latin-1 character. Returns 0
  // if the character does not convert to a single Latin-1 character, or if it
  // does not convert back to Latin-1 via the inverse operation (upper to
  // lower, etc).
  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t c);
};
103 :
104 :
105 : uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
106 : DCHECK(c > Latin1::kMaxChar);
107 139 : switch (c) {
108 : // This are equivalent characters in unicode.
109 : case 0x39c:
110 : case 0x3bc:
111 : return 0xb5;
112 : // This is an uppercase of a Latin-1 character
113 : // outside of Latin-1.
114 : case 0x178:
115 : return 0xff;
116 : }
117 : return 0;
118 : }
119 :
120 :
121 : } // namespace unibrow
122 :
123 : #endif // V8_UNICODE_DECODER_H_
|