/src/rapidjson/include/rapidjson/encodedstream.h
Line | Count | Source |
1 | | // Tencent is pleased to support the open source community by making RapidJSON available. |
2 | | // |
3 | | // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. |
4 | | // |
5 | | // Licensed under the MIT License (the "License"); you may not use this file except |
6 | | // in compliance with the License. You may obtain a copy of the License at |
7 | | // |
8 | | // http://opensource.org/licenses/MIT |
9 | | // |
10 | | // Unless required by applicable law or agreed to in writing, software distributed |
11 | | // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR |
12 | | // CONDITIONS OF ANY KIND, either express or implied. See the License for the |
13 | | // specific language governing permissions and limitations under the License. |
14 | | |
15 | | #ifndef RAPIDJSON_ENCODEDSTREAM_H_ |
16 | | #define RAPIDJSON_ENCODEDSTREAM_H_ |
17 | | |
18 | | #include "stream.h" |
19 | | #include "memorystream.h" |
20 | | |
21 | | #ifdef __GNUC__ |
22 | | RAPIDJSON_DIAG_PUSH |
23 | | RAPIDJSON_DIAG_OFF(effc++) |
24 | | #endif |
25 | | |
26 | | #ifdef __clang__ |
27 | | RAPIDJSON_DIAG_PUSH |
28 | | RAPIDJSON_DIAG_OFF(padded) |
29 | | #endif |
30 | | |
31 | | RAPIDJSON_NAMESPACE_BEGIN |
32 | | |
33 | | //! Input byte stream wrapper with a statically bound encoding. |
34 | | /*! |
35 | | \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE. |
36 | | \tparam InputByteStream Type of input byte stream. For example, FileReadStream. |
37 | | */ |
38 | | template <typename Encoding, typename InputByteStream> |
39 | | class EncodedInputStream { |
40 | | RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); |
41 | | public: |
42 | | typedef typename Encoding::Ch Ch; |
43 | | |
44 | | EncodedInputStream(InputByteStream& is) : is_(is) { |
45 | | current_ = Encoding::TakeBOM(is_); |
46 | | } |
47 | | |
48 | | Ch Peek() const { return current_; } |
49 | | Ch Take() { Ch c = current_; current_ = Encoding::Take(is_); return c; } |
50 | | size_t Tell() const { return is_.Tell(); } |
51 | | |
52 | | // Not implemented |
53 | | void Put(Ch) { RAPIDJSON_ASSERT(false); } |
54 | | void Flush() { RAPIDJSON_ASSERT(false); } |
55 | | Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } |
56 | | size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } |
57 | | |
58 | | private: |
59 | | EncodedInputStream(const EncodedInputStream&); |
60 | | EncodedInputStream& operator=(const EncodedInputStream&); |
61 | | |
62 | | InputByteStream& is_; |
63 | | Ch current_; |
64 | | }; |
65 | | |
66 | | //! Specialized for UTF8 MemoryStream. |
67 | | template <> |
68 | | class EncodedInputStream<UTF8<>, MemoryStream> { |
69 | | public: |
70 | | typedef UTF8<>::Ch Ch; |
71 | | |
72 | 0 | EncodedInputStream(MemoryStream& is) : is_(is) { |
73 | 0 | if (static_cast<unsigned char>(is_.Peek()) == 0xEFu) is_.Take(); |
74 | 0 | if (static_cast<unsigned char>(is_.Peek()) == 0xBBu) is_.Take(); |
75 | 0 | if (static_cast<unsigned char>(is_.Peek()) == 0xBFu) is_.Take(); |
76 | 0 | } |
77 | 0 | Ch Peek() const { return is_.Peek(); } |
78 | 0 | Ch Take() { return is_.Take(); } |
79 | 0 | size_t Tell() const { return is_.Tell(); } |
80 | | |
81 | | // Not implemented |
82 | 0 | void Put(Ch) {} |
83 | 0 | void Flush() {} |
84 | 0 | Ch* PutBegin() { return 0; } |
85 | 0 | size_t PutEnd(Ch*) { return 0; } |
86 | | |
87 | | MemoryStream& is_; |
88 | | |
89 | | private: |
90 | | EncodedInputStream(const EncodedInputStream&); |
91 | | EncodedInputStream& operator=(const EncodedInputStream&); |
92 | | }; |
93 | | |
94 | | //! Output byte stream wrapper with statically bound encoding. |
95 | | /*! |
96 | | \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE. |
97 | | \tparam OutputByteStream Type of input byte stream. For example, FileWriteStream. |
98 | | */ |
99 | | template <typename Encoding, typename OutputByteStream> |
100 | | class EncodedOutputStream { |
101 | | RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); |
102 | | public: |
103 | | typedef typename Encoding::Ch Ch; |
104 | | |
105 | | EncodedOutputStream(OutputByteStream& os, bool putBOM = true) : os_(os) { |
106 | | if (putBOM) |
107 | | Encoding::PutBOM(os_); |
108 | | } |
109 | | |
110 | | void Put(Ch c) { Encoding::Put(os_, c); } |
111 | | void Flush() { os_.Flush(); } |
112 | | |
113 | | // Not implemented |
114 | | Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;} |
115 | | Ch Take() { RAPIDJSON_ASSERT(false); return 0;} |
116 | | size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; } |
117 | | Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } |
118 | | size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } |
119 | | |
120 | | private: |
121 | | EncodedOutputStream(const EncodedOutputStream&); |
122 | | EncodedOutputStream& operator=(const EncodedOutputStream&); |
123 | | |
124 | | OutputByteStream& os_; |
125 | | }; |
126 | | |
//! Expands to the list of static member \c x taken from each supported UTF encoding.
/*!
    Used to initialize function-pointer tables that are indexed by a UTFType
    value, so the order of the entries is expected to follow the numeric
    order of the UTFType enumerators. \c Ch must name the character type at
    the point of expansion.
*/
#define RAPIDJSON_ENCODINGS_FUNC(x) \
    UTF8<Ch>::x, \
    UTF16LE<Ch>::x, \
    UTF16BE<Ch>::x, \
    UTF32LE<Ch>::x, \
    UTF32BE<Ch>::x
128 | | |
129 | | //! Input stream wrapper with dynamically bound encoding and automatic encoding detection. |
130 | | /*! |
131 | | \tparam CharType Type of character for reading. |
132 | | \tparam InputByteStream type of input byte stream to be wrapped. |
133 | | */ |
134 | | template <typename CharType, typename InputByteStream> |
135 | | class AutoUTFInputStream { |
136 | | RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); |
137 | | public: |
138 | | typedef CharType Ch; |
139 | | |
140 | | //! Constructor. |
141 | | /*! |
142 | | \param is input stream to be wrapped. |
143 | | \param type UTF encoding type if it is not detected from the stream. |
144 | | */ |
145 | | AutoUTFInputStream(InputByteStream& is, UTFType type = kUTF8) : is_(&is), type_(type), hasBOM_(false) { |
146 | | RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE); |
147 | | DetectType(); |
148 | | static const TakeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Take) }; |
149 | | takeFunc_ = f[type_]; |
150 | | current_ = takeFunc_(*is_); |
151 | | } |
152 | | |
153 | | UTFType GetType() const { return type_; } |
154 | | bool HasBOM() const { return hasBOM_; } |
155 | | |
156 | | Ch Peek() const { return current_; } |
157 | | Ch Take() { Ch c = current_; current_ = takeFunc_(*is_); return c; } |
158 | | size_t Tell() const { return is_->Tell(); } |
159 | | |
160 | | // Not implemented |
161 | | void Put(Ch) { RAPIDJSON_ASSERT(false); } |
162 | | void Flush() { RAPIDJSON_ASSERT(false); } |
163 | | Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } |
164 | | size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } |
165 | | |
166 | | private: |
167 | | AutoUTFInputStream(const AutoUTFInputStream&); |
168 | | AutoUTFInputStream& operator=(const AutoUTFInputStream&); |
169 | | |
170 | | // Detect encoding type with BOM or RFC 4627 |
171 | | void DetectType() { |
172 | | // BOM (Byte Order Mark): |
173 | | // 00 00 FE FF UTF-32BE |
174 | | // FF FE 00 00 UTF-32LE |
175 | | // FE FF UTF-16BE |
176 | | // FF FE UTF-16LE |
177 | | // EF BB BF UTF-8 |
178 | | |
179 | | const unsigned char* c = reinterpret_cast<const unsigned char *>(is_->Peek4()); |
180 | | if (!c) |
181 | | return; |
182 | | |
183 | | unsigned bom = static_cast<unsigned>(c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24)); |
184 | | hasBOM_ = false; |
185 | | if (bom == 0xFFFE0000) { type_ = kUTF32BE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); } |
186 | | else if (bom == 0x0000FEFF) { type_ = kUTF32LE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); } |
187 | | else if ((bom & 0xFFFF) == 0xFFFE) { type_ = kUTF16BE; hasBOM_ = true; is_->Take(); is_->Take(); } |
188 | | else if ((bom & 0xFFFF) == 0xFEFF) { type_ = kUTF16LE; hasBOM_ = true; is_->Take(); is_->Take(); } |
189 | | else if ((bom & 0xFFFFFF) == 0xBFBBEF) { type_ = kUTF8; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); } |
190 | | |
191 | | // RFC 4627: Section 3 |
192 | | // "Since the first two characters of a JSON text will always be ASCII |
193 | | // characters [RFC0020], it is possible to determine whether an octet |
194 | | // stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking |
195 | | // at the pattern of nulls in the first four octets." |
196 | | // 00 00 00 xx UTF-32BE |
197 | | // 00 xx 00 xx UTF-16BE |
198 | | // xx 00 00 00 UTF-32LE |
199 | | // xx 00 xx 00 UTF-16LE |
200 | | // xx xx xx xx UTF-8 |
201 | | |
202 | | if (!hasBOM_) { |
203 | | int pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0); |
204 | | switch (pattern) { |
205 | | case 0x08: type_ = kUTF32BE; break; |
206 | | case 0x0A: type_ = kUTF16BE; break; |
207 | | case 0x01: type_ = kUTF32LE; break; |
208 | | case 0x05: type_ = kUTF16LE; break; |
209 | | case 0x0F: type_ = kUTF8; break; |
210 | | default: break; // Use type defined by user. |
211 | | } |
212 | | } |
213 | | |
214 | | // Runtime check whether the size of character type is sufficient. It only perform checks with assertion. |
215 | | if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2); |
216 | | if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4); |
217 | | } |
218 | | |
219 | | typedef Ch (*TakeFunc)(InputByteStream& is); |
220 | | InputByteStream* is_; |
221 | | UTFType type_; |
222 | | Ch current_; |
223 | | TakeFunc takeFunc_; |
224 | | bool hasBOM_; |
225 | | }; |
226 | | |
227 | | //! Output stream wrapper with dynamically bound encoding and automatic encoding detection. |
228 | | /*! |
229 | | \tparam CharType Type of character for writing. |
230 | | \tparam OutputByteStream type of output byte stream to be wrapped. |
231 | | */ |
232 | | template <typename CharType, typename OutputByteStream> |
233 | | class AutoUTFOutputStream { |
234 | | RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); |
235 | | public: |
236 | | typedef CharType Ch; |
237 | | |
238 | | //! Constructor. |
239 | | /*! |
240 | | \param os output stream to be wrapped. |
241 | | \param type UTF encoding type. |
242 | | \param putBOM Whether to write BOM at the beginning of the stream. |
243 | | */ |
244 | | AutoUTFOutputStream(OutputByteStream& os, UTFType type, bool putBOM) : os_(&os), type_(type) { |
245 | | RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE); |
246 | | |
247 | | // Runtime check whether the size of character type is sufficient. It only perform checks with assertion. |
248 | | if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2); |
249 | | if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4); |
250 | | |
251 | | static const PutFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Put) }; |
252 | | putFunc_ = f[type_]; |
253 | | |
254 | | if (putBOM) |
255 | | PutBOM(); |
256 | | } |
257 | | |
258 | | UTFType GetType() const { return type_; } |
259 | | |
260 | | void Put(Ch c) { putFunc_(*os_, c); } |
261 | | void Flush() { os_->Flush(); } |
262 | | |
263 | | // Not implemented |
264 | | Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;} |
265 | | Ch Take() { RAPIDJSON_ASSERT(false); return 0;} |
266 | | size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; } |
267 | | Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } |
268 | | size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } |
269 | | |
270 | | private: |
271 | | AutoUTFOutputStream(const AutoUTFOutputStream&); |
272 | | AutoUTFOutputStream& operator=(const AutoUTFOutputStream&); |
273 | | |
274 | | void PutBOM() { |
275 | | typedef void (*PutBOMFunc)(OutputByteStream&); |
276 | | static const PutBOMFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(PutBOM) }; |
277 | | f[type_](*os_); |
278 | | } |
279 | | |
280 | | typedef void (*PutFunc)(OutputByteStream&, Ch); |
281 | | |
282 | | OutputByteStream* os_; |
283 | | UTFType type_; |
284 | | PutFunc putFunc_; |
285 | | }; |
286 | | |
287 | | #undef RAPIDJSON_ENCODINGS_FUNC |
288 | | |
289 | | RAPIDJSON_NAMESPACE_END |
290 | | |
291 | | #ifdef __clang__ |
292 | | RAPIDJSON_DIAG_POP |
293 | | #endif |
294 | | |
295 | | #ifdef __GNUC__ |
296 | | RAPIDJSON_DIAG_POP |
297 | | #endif |
298 | | |
299 | | #endif // RAPIDJSON_ENCODEDSTREAM_H_ |