/work/obj-fuzz/dist/include/mozilla/Encoding.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT |
2 | | // file at the top-level directory of this distribution. |
3 | | // |
4 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
7 | | // option. This file may not be copied, modified, or distributed |
8 | | // except according to those terms. |
9 | | |
10 | | // Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the |
11 | | // "top-level directory" in the above notice refers to |
12 | | // third_party/rust/encoding_c/. |
13 | | |
14 | | #ifndef mozilla_Encoding_h |
15 | | #define mozilla_Encoding_h |
16 | | |
17 | | #include "mozilla/CheckedInt.h" |
18 | | #include "mozilla/NotNull.h" |
19 | | #include "mozilla/Span.h" |
20 | | #include "mozilla/Tuple.h" |
21 | | #include "nsString.h" |
22 | | |
23 | | namespace mozilla { |
24 | | class Encoding; |
25 | | class Decoder; |
26 | | class Encoder; |
27 | | }; // namespace mozilla |
28 | | |
29 | | #define ENCODING_RS_ENCODING mozilla::Encoding |
30 | | #define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR mozilla::NotNull<const mozilla::Encoding*> |
31 | | #define ENCODING_RS_ENCODER mozilla::Encoder |
32 | | #define ENCODING_RS_DECODER mozilla::Decoder |
33 | | |
34 | | #include "encoding_rs.h" |
35 | | |
36 | | extern "C" { |
37 | | |
38 | | nsresult |
39 | | mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding, |
40 | | uint8_t const* src, |
41 | | size_t src_len, |
42 | | nsAString* dst); |
43 | | |
44 | | nsresult |
45 | | mozilla_encoding_decode_to_nsstring_with_bom_removal( |
46 | | mozilla::Encoding const* encoding, |
47 | | uint8_t const* src, |
48 | | size_t src_len, |
49 | | nsAString* dst); |
50 | | |
51 | | nsresult |
52 | | mozilla_encoding_decode_to_nsstring_without_bom_handling( |
53 | | mozilla::Encoding const* encoding, |
54 | | uint8_t const* src, |
55 | | size_t src_len, |
56 | | nsAString* dst); |
57 | | |
58 | | nsresult |
59 | | mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement( |
60 | | mozilla::Encoding const* encoding, |
61 | | uint8_t const* src, |
62 | | size_t src_len, |
63 | | nsAString* dst); |
64 | | |
65 | | nsresult |
66 | | mozilla_encoding_encode_from_utf16(mozilla::Encoding const** encoding, |
67 | | char16_t const* src, |
68 | | size_t src_len, |
69 | | nsACString* dst); |
70 | | |
71 | | nsresult |
72 | | mozilla_encoding_decode_to_nscstring(mozilla::Encoding const** encoding, |
73 | | nsACString const* src, |
74 | | nsACString* dst); |
75 | | |
76 | | nsresult |
77 | | mozilla_encoding_decode_to_nscstring_with_bom_removal( |
78 | | mozilla::Encoding const* encoding, |
79 | | nsACString const* src, |
80 | | nsACString* dst); |
81 | | |
82 | | nsresult |
83 | | mozilla_encoding_decode_to_nscstring_without_bom_handling( |
84 | | mozilla::Encoding const* encoding, |
85 | | nsACString const* src, |
86 | | nsACString* dst); |
87 | | |
88 | | nsresult |
89 | | mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling( |
90 | | mozilla::Encoding const* encoding, |
91 | | uint8_t const* src, |
92 | | size_t src_len, |
93 | | nsACString* dst, |
94 | | size_t already_validated); |
95 | | |
96 | | nsresult |
97 | | mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement( |
98 | | mozilla::Encoding const* encoding, |
99 | | nsACString const* src, |
100 | | nsACString* dst); |
101 | | |
102 | | nsresult |
103 | | mozilla_encoding_encode_from_nscstring(mozilla::Encoding const** encoding, |
104 | | nsACString const* src, |
105 | | nsACString* dst); |
106 | | |
107 | | } // extern "C" |
108 | | |
109 | | namespace mozilla { |
110 | | |
111 | | /** |
112 | | * Return value from `Decoder`/`Encoder` to indicate that input |
113 | | * was exhausted. |
114 | | */ |
115 | | const uint32_t kInputEmpty = INPUT_EMPTY; |
116 | | |
117 | | /** |
118 | | * Return value from `Decoder`/`Encoder` to indicate that output |
119 | | * space was insufficient. |
120 | | */ |
121 | | const uint32_t kOutputFull = OUTPUT_FULL; |
122 | | |
123 | | /** |
124 | | * An encoding as defined in the Encoding Standard |
125 | | * (https://encoding.spec.whatwg.org/). |
126 | | * |
127 | | * See https://docs.rs/encoding_rs/ for the Rust API docs. |
128 | | * |
129 | | * An _encoding_ defines a mapping from a byte sequence to a Unicode code point |
130 | | * sequence and, in most cases, vice versa. Each encoding has a name, an output |
131 | | * encoding, and one or more labels. |
132 | | * |
133 | | * _Labels_ are ASCII-case-insensitive strings that are used to identify an |
134 | | * encoding in formats and protocols. The _name_ of the encoding is the |
135 | | * preferred label in the case appropriate for returning from the |
136 | | * `characterSet` property of the `Document` DOM interface, except for |
137 | | * the replacement encoding whose name is not one of its labels. |
138 | | * |
139 | | * The _output encoding_ is the encoding used for form submission and URL |
140 | | * parsing on Web pages in the encoding. This is UTF-8 for the replacement, |
141 | | * UTF-16LE and UTF-16BE encodings and the encoding itself for other |
142 | | * encodings. |
143 | | * |
144 | | * # Streaming vs. Non-Streaming |
145 | | * |
146 | | * When you have the entire input in a single buffer, you can use the |
147 | | * methods `Decode()`, `DecodeWithBOMRemoval()`, |
148 | | * `DecodeWithoutBOMHandling()`, |
149 | | * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and |
150 | | * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and |
151 | | * `NewEncoder()` methods), these methods perform heap allocations. You should |
152 | | * use the `Decoder` and `Encoder` objects when your input is split into multiple |
153 | | * buffers or when you want to control the allocation of the output buffers. |
154 | | * |
155 | | * # Instances |
156 | | * |
157 | | * All instances of `Encoding` are statically allocated and have the process's |
158 | | * lifetime. There is precisely one unique `Encoding` instance for each |
159 | | * encoding defined in the Encoding Standard. |
160 | | * |
161 | | * To obtain a reference to a particular encoding whose identity you know at |
162 | | * compile time, use a `static` that refers to the encoding. There is a `static` |
163 | | * for each encoding. The `static`s are named in all caps with hyphens |
164 | | * replaced with underscores and with `_ENCODING` appended to the |
165 | | * name. For example, if you know at compile time that you will want to |
166 | | * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`. |
167 | | * |
168 | | * If you don't know what encoding you need at compile time and need to |
169 | | * dynamically get an encoding by label, use `Encoding::ForLabel()`. |
170 | | * |
171 | | * Pointers to `Encoding` can be compared with `==` to check for the sameness |
172 | | * of two encodings. |
173 | | * |
174 | | * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer |
175 | | * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use |
176 | | * `const mozilla::Encoding*` in the C signature and |
177 | | * `*const encoding_rs::Encoding` in the corresponding Rust signature. |
178 | | */ |
179 | | class Encoding final |
180 | | { |
181 | | public: |
182 | | /** |
183 | | * Implements the _get an encoding_ algorithm |
184 | | * (https://encoding.spec.whatwg.org/#concept-encoding-get). |
185 | | * |
186 | | * If, after ASCII-lowercasing and removing leading and trailing |
187 | | * whitespace, the argument matches a label defined in the Encoding |
188 | | * Standard, `const Encoding*` representing the corresponding |
189 | | * encoding is returned. If there is no match, `nullptr` is returned. |
190 | | * |
191 | | * This is the right method to use if the action upon the method returning |
192 | | * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`) |
193 | | * instead. When the action upon the method returning `nullptr` is not to |
194 | | * proceed with a fallback but to refuse processing, |
195 | | * `ForLabelNoReplacement()` is more appropriate. |
196 | | */ |
197 | | static inline const Encoding* ForLabel(Span<const char> aLabel) |
198 | 0 | { |
199 | 0 | return encoding_for_label( |
200 | 0 | reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length()); |
201 | 0 | } |
202 | | |
203 | | /** |
204 | | * `nsAString` argument version. See above for docs. |
205 | | */ |
206 | | static inline const Encoding* ForLabel(const nsAString& aLabel) |
207 | 0 | { |
208 | 0 | return Encoding::ForLabel(NS_ConvertUTF16toUTF8(aLabel)); |
209 | 0 | } |
210 | | |
211 | | /** |
212 | | * This method behaves the same as `ForLabel()`, except when `ForLabel()` |
213 | | * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead. |
214 | | * |
215 | | * This method is useful in scenarios where a fatal error is required |
216 | | * upon invalid label, because in those cases the caller typically wishes |
217 | | * to treat the labels that map to the replacement encoding as fatal |
218 | | * errors, too. |
219 | | * |
220 | | * It is not OK to use this method when the action upon the method returning |
221 | | * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In |
222 | | * such a case, the `ForLabel()` method should be used instead in order to avoid |
223 | | * unsafe fallback for labels that `ForLabel()` maps to `REPLACEMENT_ENCODING`. |
224 | | */ |
225 | | static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel) |
226 | 1.11M | { |
227 | 1.11M | return encoding_for_label_no_replacement( |
228 | 1.11M | reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length()); |
229 | 1.11M | } |
230 | | |
231 | | /** |
232 | | * `nsAString` argument version. See above for docs. |
233 | | */ |
234 | | static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel) |
235 | 0 | { |
236 | 0 | return Encoding::ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel)); |
237 | 0 | } |
238 | | |
239 | | /** |
240 | | * Performs non-incremental BOM sniffing. |
241 | | * |
242 | | * The argument must either be a buffer representing the entire input |
243 | | * stream (non-streaming case) or a buffer representing at least the first |
244 | | * three bytes of the input stream (streaming case). |
245 | | * |
246 | | * Returns `MakeTuple(UTF_8_ENCODING, 3)`, `MakeTuple(UTF_16LE_ENCODING, 2)` |
247 | | * or `MakeTuple(UTF_16BE_ENCODING, 2)` if the argument starts with the |
248 | | * UTF-8, UTF-16LE or UTF-16BE BOM or `MakeTuple(nullptr, 0)` otherwise. |
249 | | */ |
250 | | static inline Tuple<const Encoding*, size_t> ForBOM( |
251 | | Span<const uint8_t> aBuffer) |
252 | 0 | { |
253 | 0 | size_t len = aBuffer.Length(); |
254 | 0 | const Encoding* encoding = encoding_for_bom(aBuffer.Elements(), &len); |
255 | 0 | return MakeTuple(encoding, len); |
256 | 0 | } |
257 | | |
258 | | /** |
259 | | * Writes the name of this encoding into `aName`. |
260 | | * |
261 | | * This name is appropriate to return as-is from the DOM |
262 | | * `document.characterSet` property. |
263 | | */ |
264 | | inline void Name(nsACString& aName) const |
265 | 0 | { |
266 | 0 | aName.SetLength(ENCODING_NAME_MAX_LENGTH); |
267 | 0 | size_t length = |
268 | 0 | encoding_name(this, reinterpret_cast<uint8_t*>(aName.BeginWriting())); |
269 | 0 | aName.SetLength(length); // truncation in the 64-bit case is OK |
270 | 0 | } |
271 | | |
272 | | /** |
273 | | * Checks whether the _output encoding_ of this encoding can encode every |
274 | | * Unicode code point. (Only true if the output encoding is UTF-8.) |
275 | | */ |
276 | | inline bool CanEncodeEverything() const |
277 | 0 | { |
278 | 0 | return encoding_can_encode_everything(this); |
279 | 0 | } |
280 | | |
281 | | /** |
282 | | * Checks whether the bytes 0x00...0x7F map exclusively to the characters |
283 | | * U+0000...U+007F and vice versa. |
284 | | */ |
285 | | inline bool IsAsciiCompatible() const |
286 | 0 | { |
287 | 0 | return encoding_is_ascii_compatible(this); |
288 | 0 | } |
289 | | |
290 | | /** |
291 | | * Returns the _output encoding_ of this encoding. This is UTF-8 for |
292 | | * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise. |
293 | | */ |
294 | | inline NotNull<const mozilla::Encoding*> OutputEncoding() const |
295 | 0 | { |
296 | 0 | return WrapNotNull(encoding_output_encoding(this)); |
297 | 0 | } |
298 | | |
299 | | /** |
300 | | * Decode complete input to `nsACString` _with BOM sniffing_ and with |
301 | | * malformed sequences replaced with the REPLACEMENT CHARACTER when the |
302 | | * entire input is available as a single buffer (i.e. the end of the |
303 | | * buffer marks the end of the stream). |
304 | | * |
305 | | * This method implements the (non-streaming version of) the |
306 | | * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept. |
307 | | * |
308 | | * The second item in the returned tuple is the encoding that was actually |
309 | | * used (which may differ from this encoding thanks to BOM sniffing). |
310 | | * |
311 | | * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` |
312 | | * if there were malformed sequences (that were replaced with the |
313 | | * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the |
314 | | * tuple. |
315 | | * |
316 | | * The backing buffer of the string isn't copied if the input buffer |
317 | | * is heap-allocated and decoding from UTF-8 and the input is valid |
318 | | * BOMless UTF-8, decoding from an ASCII-compatible encoding and |
319 | | * the input is valid ASCII or decoding from ISO-2022-JP and the |
320 | | * input stays in the ASCII state of ISO-2022-JP. It is OK to pass |
321 | | * the same string as both arguments. |
322 | | * |
323 | | * _Note:_ It is wrong to use this when the input buffer represents only |
324 | | * a segment of the input instead of the whole input. Use `NewDecoder()` |
325 | | * when decoding segmented input. |
326 | | */ |
327 | | inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode( |
328 | | const nsACString& aBytes, |
329 | | nsACString& aOut) const |
330 | 0 | { |
331 | 0 | const Encoding* encoding = this; |
332 | 0 | const nsACString* bytes = &aBytes; |
333 | 0 | nsACString* out = &aOut; |
334 | 0 | nsresult rv; |
335 | 0 | if (bytes == out) { |
336 | 0 | nsAutoCString temp(aBytes); |
337 | 0 | rv = mozilla_encoding_decode_to_nscstring(&encoding, &temp, out); |
338 | 0 | } else { |
339 | 0 | rv = mozilla_encoding_decode_to_nscstring(&encoding, bytes, out); |
340 | 0 | } |
341 | 0 | return MakeTuple(rv, WrapNotNull(encoding)); |
342 | 0 | } |
343 | | |
344 | | /** |
345 | | * Decode complete input to `nsAString` _with BOM sniffing_ and with |
346 | | * malformed sequences replaced with the REPLACEMENT CHARACTER when the |
347 | | * entire input is available as a single buffer (i.e. the end of the |
348 | | * buffer marks the end of the stream). |
349 | | * |
350 | | * This method implements the (non-streaming version of) the |
351 | | * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept. |
352 | | * |
353 | | * The second item in the returned tuple is the encoding that was actually |
354 | | * used (which may differ from this encoding thanks to BOM sniffing). |
355 | | * |
356 | | * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` |
357 | | * if there were malformed sequences (that were replaced with the |
358 | | * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the |
359 | | * tuple. |
360 | | * |
361 | | * _Note:_ It is wrong to use this when the input buffer represents only |
362 | | * a segment of the input instead of the whole input. Use `NewDecoder()` |
363 | | * when decoding segmented input. |
364 | | */ |
365 | | inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode( |
366 | | Span<const uint8_t> aBytes, |
367 | | nsAString& aOut) const |
368 | 0 | { |
369 | 0 | const Encoding* encoding = this; |
370 | 0 | nsresult rv = mozilla_encoding_decode_to_nsstring( |
371 | 0 | &encoding, aBytes.Elements(), aBytes.Length(), &aOut); |
372 | 0 | return MakeTuple(rv, WrapNotNull(encoding)); |
373 | 0 | } |
374 | | |
375 | | /** |
376 | | * Decode complete input to `nsACString` _with BOM removal_ and with |
377 | | * malformed sequences replaced with the REPLACEMENT CHARACTER when the |
378 | | * entire input is available as a single buffer (i.e. the end of the |
379 | | * buffer marks the end of the stream). |
380 | | * |
381 | | * When invoked on `UTF_8`, this method implements the (non-streaming |
382 | | * version of) the _UTF-8 decode_ |
383 | | * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept. |
384 | | * |
385 | | * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` |
386 | | * if there were malformed sequences (that were replaced with the |
387 | | * REPLACEMENT CHARACTER) and `NS_OK` otherwise. |
388 | | * |
389 | | * The backing buffer of the string isn't copied if the input buffer |
390 | | * is heap-allocated and decoding from UTF-8 and the input is valid |
391 | | * BOMless UTF-8, decoding from an ASCII-compatible encoding and |
392 | | * the input is valid ASCII or decoding from ISO-2022-JP and the |
393 | | * input stays in the ASCII state of ISO-2022-JP. It is OK to pass |
394 | | * the same string as both arguments. |
395 | | * |
396 | | * _Note:_ It is wrong to use this when the input buffer represents only |
397 | | * a segment of the input instead of the whole input. Use |
398 | | * `NewDecoderWithBOMRemoval()` when decoding segmented input. |
399 | | */ |
400 | | inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes, |
401 | | nsACString& aOut) const |
402 | 0 | { |
403 | 0 | const nsACString* bytes = &aBytes; |
404 | 0 | nsACString* out = &aOut; |
405 | 0 | if (bytes == out) { |
406 | 0 | nsAutoCString temp(aBytes); |
407 | 0 | return mozilla_encoding_decode_to_nscstring_with_bom_removal( |
408 | 0 | this, &temp, out); |
409 | 0 | } |
410 | 0 | return mozilla_encoding_decode_to_nscstring_with_bom_removal( |
411 | 0 | this, bytes, out); |
412 | 0 | } |
413 | | |
414 | | /** |
415 | | * Decode complete input to `nsAString` _with BOM removal_ and with |
416 | | * malformed sequences replaced with the REPLACEMENT CHARACTER when the |
417 | | * entire input is available as a single buffer (i.e. the end of the |
418 | | * buffer marks the end of the stream). |
419 | | * |
420 | | * When invoked on `UTF_8`, this method implements the (non-streaming |
421 | | * version of) the _UTF-8 decode_ |
422 | | * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept. |
423 | | * |
424 | | * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` |
425 | | * if there were malformed sequences (that were replaced with the |
426 | | * REPLACEMENT CHARACTER) and `NS_OK` otherwise. |
427 | | * |
428 | | * _Note:_ It is wrong to use this when the input buffer represents only |
429 | | * a segment of the input instead of the whole input. Use |
430 | | * `NewDecoderWithBOMRemoval()` when decoding segmented input. |
431 | | */ |
432 | | inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes, |
433 | | nsAString& aOut) const |
434 | 0 | { |
435 | 0 | return mozilla_encoding_decode_to_nsstring_with_bom_removal( |
436 | 0 | this, aBytes.Elements(), aBytes.Length(), &aOut); |
437 | 0 | } |
438 | | |
439 | | /** |
440 | | * Decode complete input to `nsACString` _without BOM handling_ and |
441 | | * with malformed sequences replaced with the REPLACEMENT CHARACTER when |
442 | | * the entire input is available as a single buffer (i.e. the end of the |
443 | | * buffer marks the end of the stream). |
444 | | * |
445 | | * When invoked on `UTF_8`, this method implements the (non-streaming |
446 | | * version of) the _UTF-8 decode without BOM_ |
447 | | * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept. |
448 | | * |
449 | | * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` |
450 | | * if there were malformed sequences (that were replaced with the |
451 | | * REPLACEMENT CHARACTER) and `NS_OK` otherwise. |
452 | | * |
453 | | * The backing buffer of the string isn't copied if the input buffer |
454 | | * is heap-allocated and decoding from UTF-8 and the input is valid |
455 | | * UTF-8, decoding from an ASCII-compatible encoding and the input |
456 | | * is valid ASCII or decoding from ISO-2022-JP and the input stays |
457 | | * in the ASCII state of ISO-2022-JP. It is OK to pass the same string |
458 | | * as both arguments. |
459 | | * |
460 | | * _Note:_ It is wrong to use this when the input buffer represents only |
461 | | * a segment of the input instead of the whole input. Use |
462 | | * `NewDecoderWithoutBOMHandling()` when decoding segmented input. |
463 | | */ |
464 | | inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes, |
465 | | nsACString& aOut) const |
466 | 0 | { |
467 | 0 | const nsACString* bytes = &aBytes; |
468 | 0 | nsACString* out = &aOut; |
469 | 0 | if (bytes == out) { |
470 | 0 | nsAutoCString temp(aBytes); |
471 | 0 | return mozilla_encoding_decode_to_nscstring_without_bom_handling( |
472 | 0 | this, &temp, out); |
473 | 0 | } |
474 | 0 | return mozilla_encoding_decode_to_nscstring_without_bom_handling( |
475 | 0 | this, bytes, out); |
476 | 0 | } |
477 | | |
478 | | /** |
479 | | * Decode complete input to `nsAString` _without BOM handling_ and |
480 | | * with malformed sequences replaced with the REPLACEMENT CHARACTER when |
481 | | * the entire input is available as a single buffer (i.e. the end of the |
482 | | * buffer marks the end of the stream). |
483 | | * |
484 | | * When invoked on `UTF_8`, this method implements the (non-streaming |
485 | | * version of) the _UTF-8 decode without BOM_ |
486 | | * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept. |
487 | | * |
488 | | * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` |
489 | | * if there were malformed sequences (that were replaced with the |
490 | | * REPLACEMENT CHARACTER) and `NS_OK` otherwise. |
491 | | * |
492 | | * _Note:_ It is wrong to use this when the input buffer represents only |
493 | | * a segment of the input instead of the whole input. Use |
494 | | * `NewDecoderWithoutBOMHandling()` when decoding segmented input. |
495 | | */ |
496 | | inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes, |
497 | | nsAString& aOut) const |
498 | 0 | { |
499 | 0 | return mozilla_encoding_decode_to_nsstring_without_bom_handling( |
500 | 0 | this, aBytes.Elements(), aBytes.Length(), &aOut); |
501 | 0 | } |
502 | | |
503 | | /** |
504 | | * Decode complete input to `nsACString` _without BOM handling_ and |
505 | | * _with malformed sequences treated as fatal_ when the entire input is |
506 | | * available as a single buffer (i.e. the end of the buffer marks the end |
507 | | * of the stream). |
508 | | * |
509 | | * When invoked on `UTF_8`, this method implements the (non-streaming |
510 | | * version of) the _UTF-8 decode without BOM or fail_ |
511 | | * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail) |
512 | | * spec concept. |
513 | | * |
514 | | * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT` |
515 | | * if a malformed sequence was encountered and `NS_OK` otherwise. |
516 | | * |
517 | | * The backing buffer of the string isn't copied if the input buffer |
518 | | * is heap-allocated and decoding from UTF-8 and the input is valid |
519 | | * UTF-8, decoding from an ASCII-compatible encoding and the input |
520 | | * is valid ASCII or decoding from ISO-2022-JP and the input stays |
521 | | * in the ASCII state of ISO-2022-JP. It is OK to pass the same string |
522 | | * as both arguments. |
523 | | * |
524 | | * _Note:_ It is wrong to use this when the input buffer represents only |
525 | | * a segment of the input instead of the whole input. Use |
526 | | * `NewDecoderWithoutBOMHandling()` when decoding segmented input. |
527 | | */ |
528 | | inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement( |
529 | | const nsACString& aBytes, |
530 | | nsACString& aOut) const |
531 | 0 | { |
532 | 0 | const nsACString* bytes = &aBytes; |
533 | 0 | nsACString* out = &aOut; |
534 | 0 | if (bytes == out) { |
535 | 0 | nsAutoCString temp(aBytes); |
536 | 0 | return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement( |
537 | 0 | this, &temp, out); |
538 | 0 | } |
539 | 0 | return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement( |
540 | 0 | this, bytes, out); |
541 | 0 | } |
542 | | |
543 | | /** |
544 | | * Decode complete input to `nsACString` _without BOM handling_ and |
545 | | * with malformed sequences replaced with the REPLACEMENT CHARACTER when |
546 | | * the entire input is available as a single buffer (i.e. the end of the |
547 | | * buffer marks the end of the stream) _asserting that a number of bytes |
548 | | * from the start are already known to be valid UTF-8_. |
549 | | * |
550 | | * The use case for this method is avoiding copying when dealing with |
551 | | * input that has a UTF-8 BOM. _When in doubt, do not use this method._ |
552 | | * |
553 | | * When invoked on `UTF_8`, this method implements the (non-streaming |
554 | | * version of) the _UTF-8 decode without BOM_ |
555 | | * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept. |
556 | | * |
557 | | * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` |
558 | | * if there were malformed sequences (that were replaced with the |
559 | | * REPLACEMENT CHARACTER) and `NS_OK` otherwise. |
560 | | * |
561 | | * _Note:_ It is wrong to use this when the input buffer represents only |
562 | | * a segment of the input instead of the whole input. Use |
563 | | * `NewDecoderWithoutBOMHandling()` when decoding segmented input. |
564 | | * |
565 | | * # Safety |
566 | | * |
567 | | * The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8. |
568 | | * `aBytes` _must not_ alias the buffer (if any) of `aOut`. |
569 | | */ |
570 | | inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes, |
571 | | nsACString& aOut, |
572 | | size_t aAlreadyValidated) const |
573 | 0 | { |
574 | 0 | return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling( |
575 | 0 | this, aBytes.Elements(), aBytes.Length(), &aOut, aAlreadyValidated); |
576 | 0 | } |
577 | | |
578 | | /** |
579 | | * Decode complete input to `nsAString` _without BOM handling_ and |
580 | | * _with malformed sequences treated as fatal_ when the entire input is |
581 | | * available as a single buffer (i.e. the end of the buffer marks the end |
582 | | * of the stream). |
583 | | * |
584 | | * When invoked on `UTF_8`, this method implements the (non-streaming |
585 | | * version of) the _UTF-8 decode without BOM or fail_ |
586 | | * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail) |
587 | | * spec concept. |
588 | | * |
589 | | * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT` |
590 | | * if a malformed sequence was encountered and `NS_OK` otherwise. |
591 | | * |
592 | | * _Note:_ It is wrong to use this when the input buffer represents only |
593 | | * a segment of the input instead of the whole input. Use |
594 | | * `NewDecoderWithoutBOMHandling()` when decoding segmented input. |
595 | | */ |
596 | | inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement( |
597 | | Span<const uint8_t> aBytes, |
598 | | nsAString& aOut) const |
599 | 0 | { |
600 | 0 | return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement( |
601 | 0 | this, aBytes.Elements(), aBytes.Length(), &aOut); |
602 | 0 | } |
603 | | |
604 | | /** |
605 | | * Encode complete input to `nsACString` with unmappable characters |
606 | | * replaced with decimal numeric character references when the entire input |
607 | | * is available as a single buffer (i.e. the end of the buffer marks the |
608 | | * end of the stream). |
609 | | * |
610 | | * This method implements the (non-streaming version of) the |
611 | | * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept. |
612 | | * |
613 | | * The second item in the returned tuple is the encoding that was actually |
614 | | * used (which may differ from this encoding thanks to some encodings |
615 | | * having UTF-8 as their output encoding). |
616 | | * |
617 | | * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if |
618 | | * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM, |
619 | | * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were |
620 | | * replaced with numeric character references) and `NS_OK` otherwise. |
621 | | * |
622 | | * The backing buffer of the string isn't copied if the input buffer |
623 | | * is heap-allocated and encoding to UTF-8 and the input is valid |
624 | | * UTF-8, encoding to an ASCII-compatible encoding and the input |
625 | | * is valid ASCII or encoding to ISO-2022-JP and the input stays |
626 | | * in the ASCII state of ISO-2022-JP. It is OK to pass the same string |
627 | | * as both arguments. |
628 | | * |
629 | | * _Note:_ It is wrong to use this when the input buffer represents only |
630 | | * a segment of the input instead of the whole input. Use `NewEncoder()` |
631 | | * when encoding segmented input. |
632 | | */ |
633 | | inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode( |
634 | | const nsACString& aString, |
635 | | nsACString& aOut) const |
636 | 0 | { |
637 | 0 | const Encoding* encoding = this; |
638 | 0 | const nsACString* string = &aString; |
639 | 0 | nsACString* out = &aOut; |
640 | 0 | nsresult rv; |
641 | 0 | if (string == out) { |
642 | 0 | nsAutoCString temp(aString); |
643 | 0 | rv = mozilla_encoding_encode_from_nscstring(&encoding, &temp, out); |
644 | 0 | } else { |
645 | 0 | rv = mozilla_encoding_encode_from_nscstring(&encoding, string, out); |
646 | 0 | } |
647 | 0 | return MakeTuple(rv, WrapNotNull(encoding)); |
648 | 0 | } |
649 | | |
650 | | /** |
651 | | * Encode complete input to `nsACString` with unmappable characters |
652 | | * replaced with decimal numeric character references when the entire input |
653 | | * is available as a single buffer (i.e. the end of the buffer marks the |
654 | | * end of the stream). |
655 | | * |
656 | | * This method implements the (non-streaming version of) the |
657 | | * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept. |
658 | | * |
659 | | * The second item in the returned tuple is the encoding that was actually |
660 | | * used (which may differ from this encoding thanks to some encodings |
661 | | * having UTF-8 as their output encoding). |
662 | | * |
663 | | * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon |
664 | | * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that |
665 | | * were replaced with numeric character references) and `NS_OK` otherwise. |
666 | | * |
667 | | * _Note:_ It is wrong to use this when the input buffer represents only |
668 | | * a segment of the input instead of the whole input. Use `NewEncoder()` |
669 | | * when encoding segmented input. |
670 | | */ |
671 | | inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode( |
672 | | Span<const char16_t> aString, |
673 | | nsACString& aOut) const |
674 | 0 | { |
675 | 0 | const Encoding* encoding = this; |
676 | 0 | nsresult rv = mozilla_encoding_encode_from_utf16( |
677 | 0 | &encoding, aString.Elements(), aString.Length(), &aOut); |
678 | 0 | return MakeTuple(rv, WrapNotNull(encoding)); |
679 | 0 | } |
680 | | |
681 | | /** |
682 | | * Instantiates a new decoder for this encoding with BOM sniffing enabled. |
683 | | * |
684 | | * BOM sniffing may cause the returned decoder to morph into a decoder |
685 | | * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding. |
686 | | */ |
687 | | inline UniquePtr<Decoder> NewDecoder() const |
688 | 0 | { |
689 | 0 | UniquePtr<Decoder> decoder(encoding_new_decoder(this)); |
690 | 0 | return decoder; |
691 | 0 | } |
692 | | |
693 | | /** |
694 | | * Instantiates a new decoder for this encoding with BOM sniffing enabled |
695 | | * into memory occupied by a previously-instantiated decoder. |
696 | | * |
697 | | * BOM sniffing may cause the instantiated decoder to morph into a decoder |
698 | | * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding. |
699 | | */ |
700 | | inline void NewDecoderInto(Decoder& aDecoder) const |
701 | 0 | { |
702 | 0 | encoding_new_decoder_into(this, &aDecoder); |
703 | 0 | } |
704 | | |
705 | | /** |
706 | | * Instantiates a new decoder for this encoding with BOM removal. |
707 | | * |
708 | | * If the input starts with bytes that are the BOM for this encoding, |
709 | | * those bytes are removed. However, the decoder never morphs into a |
710 | | * decoder for another encoding: A BOM for another encoding is treated as |
711 | | * (potentially malformed) input to the decoding algorithm for this |
712 | | * encoding. |
713 | | */ |
714 | | inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const |
715 | 0 | { |
716 | 0 | UniquePtr<Decoder> decoder(encoding_new_decoder_with_bom_removal(this)); |
717 | 0 | return decoder; |
718 | 0 | } |
719 | | |
720 | | /** |
721 | | * Instantiates a new decoder for this encoding with BOM removal |
722 | | * into memory occupied by a previously-instantiated decoder. |
723 | | * |
724 | | * If the input starts with bytes that are the BOM for this encoding, |
725 | | * those bytes are removed. However, the decoder never morphs into a |
726 | | * decoder for another encoding: A BOM for another encoding is treated as |
727 | | * (potentially malformed) input to the decoding algorithm for this |
728 | | * encoding. |
729 | | */ |
730 | | inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const |
731 | 0 | { |
732 | 0 | encoding_new_decoder_with_bom_removal_into(this, &aDecoder); |
733 | 0 | } |
734 | | |
735 | | /** |
736 | | * Instantiates a new decoder for this encoding with BOM handling disabled. |
737 | | * |
738 | | * If the input starts with bytes that look like a BOM, those bytes are |
739 | | * not treated as a BOM. (Hence, the decoder never morphs into a decoder |
740 | | * for another encoding.) |
741 | | * |
742 | | * _Note:_ If the caller has performed BOM sniffing on its own but has not |
743 | | * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()` |
744 | | * instead of this method to cause the BOM to be removed. |
745 | | */ |
746 | | inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const |
747 | 0 | { |
748 | 0 | UniquePtr<Decoder> decoder(encoding_new_decoder_without_bom_handling(this)); |
749 | 0 | return decoder; |
750 | 0 | } |
751 | | |
752 | | /** |
753 | | * Instantiates a new decoder for this encoding with BOM handling disabled |
754 | | * into memory occupied by a previously-instantiated decoder. |
755 | | * |
756 | | * If the input starts with bytes that look like a BOM, those bytes are |
757 | | * not treated as a BOM. (Hence, the decoder never morphs into a decoder |
758 | | * for another encoding.) |
759 | | * |
760 | | * _Note:_ If the caller has performed BOM sniffing on its own but has not |
761 | | * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()` |
762 | | * instead of this method to cause the BOM to be removed. |
763 | | */ |
764 | | inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const |
765 | 0 | { |
766 | 0 | encoding_new_decoder_without_bom_handling_into(this, &aDecoder); |
767 | 0 | } |
768 | | |
769 | | /** |
770 | | * Instantiates a new encoder for the output encoding of this encoding. |
771 | | */ |
772 | | inline UniquePtr<Encoder> NewEncoder() const |
773 | 0 | { |
774 | 0 | UniquePtr<Encoder> encoder(encoding_new_encoder(this)); |
775 | 0 | return encoder; |
776 | 0 | } |
777 | | |
778 | | /** |
779 | | * Instantiates a new encoder for the output encoding of this encoding |
780 | | * into memory occupied by a previously-instantiated encoder. |
781 | | */ |
782 | | inline void NewEncoderInto(Encoder& aEncoder) const |
783 | 0 | { |
784 | 0 | encoding_new_encoder_into(this, &aEncoder); |
785 | 0 | } |
786 | | |
787 | | /** |
788 | | * Validates UTF-8. |
789 | | * |
790 | | * Returns the index of the first byte that makes the input malformed as |
791 | | * UTF-8 or the length of the input if the input is entirely valid. |
792 | | */ |
793 | | static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer) |
794 | 0 | { |
795 | 0 | return encoding_utf8_valid_up_to(aBuffer.Elements(), aBuffer.Length()); |
796 | 0 | } |
797 | | |
798 | | /** |
799 | | * Validates ASCII. |
800 | | * |
801 | | * Returns the index of the first byte that makes the input malformed as |
802 | | * ASCII or the length of the input if the input is entirely valid. |
803 | | */ |
804 | | static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer) |
805 | 0 | { |
806 | 0 | return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length()); |
807 | 0 | } |
808 | | |
809 | | /** |
810 | | * Validates ISO-2022-JP ASCII-state data. |
811 | | * |
812 | | * Returns the index of the first byte that makes the input not |
813 | | * representable in the ASCII state of ISO-2022-JP or the length of the |
814 | | * input if the input is entirely representable in the ASCII state of |
815 | | * ISO-2022-JP. |
816 | | */ |
817 | | static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer) |
818 | 0 | { |
819 | 0 | return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(), |
820 | 0 | aBuffer.Length()); |
821 | 0 | } |
822 | | |
823 | | private: |
824 | | Encoding() = delete; |
825 | | Encoding(const Encoding&) = delete; |
826 | | Encoding& operator=(const Encoding&) = delete; |
827 | | ~Encoding() = delete; |
828 | | |
829 | | }; |
830 | | |
831 | | /** |
832 | | * A converter that decodes a byte stream into Unicode according to a |
833 | | * character encoding in a streaming (incremental) manner. |
834 | | * |
835 | | * The various `Decode*` methods take an input buffer (`aSrc`) and an output |
836 | | * buffer `aDst` both of which are caller-allocated. There are variants for |
837 | | * both UTF-8 and UTF-16 output buffers. |
838 | | * |
839 | | * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored |
840 | | * into `aDst` until one of the following three things happens: |
841 | | * |
842 | | * 1. A malformed byte sequence is encountered (`*WithoutReplacement` |
843 | | * variants only). |
844 | | * |
845 | | * 2. The output buffer has been filled so near capacity that the decoder |
846 | | * cannot be sure that processing an additional byte of input wouldn't |
847 | | * cause so much output that the output buffer would overflow. |
848 | | * |
849 | | * 3. All the input bytes have been processed. |
850 | | * |
851 | | * The `Decode*` method then returns a tuple of a status indicating which one |
852 | | * of the three reasons to return happened, how many input bytes were read, |
853 | | * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t` |
854 | | * when decoding to UTF-16) were written, and in the case of the |
855 | | * variants performing replacement, a boolean indicating whether an error was |
856 | | * replaced with the REPLACEMENT CHARACTER during the call. |
857 | | * |
858 | | * The number of bytes "written" is what's logically written. Garbage may be |
859 | | * written in the output buffer beyond the point logically written to. |
860 | | * |
861 | | * In the case of the `*WithoutReplacement` variants, the status is a |
862 | | * `uint32_t` whose possible values are packed info about a malformed byte |
863 | | * sequence, `kOutputFull` and `kInputEmpty` (corresponding to the three cases |
864 | | * listed above). |
865 | | * |
866 | | * Packed info about malformed sequences has the following format: |
867 | | * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3, |
868 | | * indicate the number of bytes that were consumed after the malformed |
869 | | * sequence, and the next-lowest 8 bits, when shifted right by 8, indicate |
870 | | * the length of the malformed byte sequence (possible decimal values 1, 2, |
871 | | * 3 or 4). The maximum possible sum of the two is 6. |
872 | | * |
873 | | * In the case of methods whose name does not end with |
874 | | * `*WithoutReplacement`, malformed sequences are automatically replaced |
875 | | * with the REPLACEMENT CHARACTER and errors do not cause the methods to |
876 | | * return early. |
877 | | * |
878 | | * When decoding to UTF-8, the output buffer must have at least 4 bytes of |
879 | | * space. When decoding to UTF-16, the output buffer must have at least two |
880 | | * UTF-16 code units (`char16_t`) of space. |
881 | | * |
882 | | * When decoding to UTF-8 without replacement, the methods are guaranteed |
883 | | * not to return indicating that more output space is needed if the length |
884 | | * of the output buffer is at least the length returned by |
885 | | * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8 |
886 | | * with replacement, the length of the output buffer that guarantees the |
887 | | * methods not to return indicating that more output space is needed is given |
888 | | * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with |
889 | | * or without replacement, the length of the output buffer that guarantees |
890 | | * the methods not to return indicating that more output space is needed is |
891 | | * given by `MaxUTF16BufferLength()`. |
892 | | * |
893 | | * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16, |
894 | | * and the output after each `Decode*` call is guaranteed to consist of |
895 | | * complete characters. (I.e. the code unit sequence for the last character is |
896 | | * guaranteed not to be split across output buffers.) |
897 | | * |
898 | | * The boolean argument `aLast` indicates that the end of the stream is reached |
899 | | * when all the bytes in `aSrc` have been consumed. |
900 | | * |
901 | | * A `Decoder` object can be used to incrementally decode a byte stream. |
902 | | * |
903 | | * During the processing of a single stream, the caller must call `Decode*` |
904 | | * zero or more times with `aLast` set to `false` and then call `Decode*` at |
905 | | * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`, |
906 | | * the processing of the stream has ended. Otherwise, the caller must call |
907 | | * `Decode*` again with `aLast` set to `true` (or treat a malformed result, |
908 | | * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error). |
909 | | * |
910 | | * Once the stream has ended, the `Decoder` object must not be used anymore. |
911 | | * That is, you need to create another one to process another stream. |
912 | | * |
913 | | * When the decoder returns `kOutputFull` or the decoder returns a malformed |
914 | | * result and the caller does not wish to treat it as a fatal error, the input |
915 | | * buffer `aSrc` may not have been completely consumed. In that case, the caller |
916 | | * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next |
917 | | * call. |
918 | | * |
919 | | * # Infinite loops |
920 | | * |
921 | | * When converting with a fixed-size output buffer whose size is too small to |
922 | | * accommodate one character of output, an infinite loop ensues. When |
923 | | * converting with a fixed-size output buffer, it generally makes sense to |
924 | | * make the buffer fairly large (e.g. couple of kilobytes). |
925 | | */ |
926 | | class Decoder final |
927 | | { |
928 | | public: |
929 | 0 | ~Decoder() {} |
930 | | static void operator delete(void* aDecoder) |
931 | 0 | { |
932 | 0 | decoder_free(reinterpret_cast<Decoder*>(aDecoder)); |
933 | 0 | } |
934 | | |
935 | | /** |
936 | | * The `Encoding` this `Decoder` is for. |
937 | | * |
938 | | * BOM sniffing can change the return value of this method during the life |
939 | | * of the decoder. |
940 | | */ |
941 | | inline NotNull<const mozilla::Encoding*> Encoding() const |
942 | 0 | { |
943 | 0 | return WrapNotNull(decoder_encoding(this)); |
944 | 0 | } |
945 | | |
946 | | /** |
947 | | * Query the worst-case UTF-8 output size _with replacement_. |
948 | | * |
949 | | * Returns the size of the output buffer in UTF-8 code units (`uint8_t`) |
950 | | * that will not overflow given the current state of the decoder and |
951 | | * `aByteLength` number of additional input bytes when decoding with |
952 | | * errors handled by outputting a REPLACEMENT CHARACTER for each malformed |
953 | | * sequence. |
954 | | */ |
955 | | inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const |
956 | 0 | { |
957 | 0 | CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength)); |
958 | 0 | if (max.value() == MaxValue<size_t>::value) { |
959 | 0 | // Mark invalid by overflowing |
960 | 0 | max++; |
961 | 0 | MOZ_ASSERT(!max.isValid()); |
962 | 0 | } |
963 | 0 | return max; |
964 | 0 | } |
965 | | |
966 | | /** |
967 | | * Query the worst-case UTF-8 output size _without replacement_. |
968 | | * |
969 | | * Returns the size of the output buffer in UTF-8 code units (`uint8_t`) |
970 | | * that will not overflow given the current state of the decoder and |
971 | | * `aByteLength` number of additional input bytes when decoding without |
972 | | * replacement error handling. |
973 | | * |
974 | | * Note that this value may be too small for the `WithReplacement` case. |
975 | | * Use `MaxUTF8BufferLength()` for that case. |
976 | | */ |
977 | | inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement( |
978 | | size_t aByteLength) const |
979 | 0 | { |
980 | 0 | CheckedInt<size_t> max( |
981 | 0 | decoder_max_utf8_buffer_length_without_replacement(this, aByteLength)); |
982 | 0 | if (max.value() == MaxValue<size_t>::value) { |
983 | 0 | // Mark invalid by overflowing |
984 | 0 | max++; |
985 | 0 | MOZ_ASSERT(!max.isValid()); |
986 | 0 | } |
987 | 0 | return max; |
988 | 0 | } |
989 | | |
990 | | /** |
991 | | * Incrementally decode a byte stream into UTF-8 with malformed sequences |
992 | | * replaced with the REPLACEMENT CHARACTER. |
993 | | * |
994 | | * See the documentation of the class for documentation for `Decode*` |
995 | | * methods collectively. |
996 | | */ |
997 | | inline Tuple<uint32_t, size_t, size_t, bool> |
998 | | DecodeToUTF8(Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) |
999 | 0 | { |
1000 | 0 | size_t srcRead = aSrc.Length(); |
1001 | 0 | size_t dstWritten = aDst.Length(); |
1002 | 0 | bool hadReplacements; |
1003 | 0 | uint32_t result = decoder_decode_to_utf8(this, |
1004 | 0 | aSrc.Elements(), |
1005 | 0 | &srcRead, |
1006 | 0 | aDst.Elements(), |
1007 | 0 | &dstWritten, |
1008 | 0 | aLast, |
1009 | 0 | &hadReplacements); |
1010 | 0 | return MakeTuple(result, srcRead, dstWritten, hadReplacements); |
1011 | 0 | } |
1012 | | |
1013 | | /** |
1014 | | * Incrementally decode a byte stream into UTF-8 _without replacement_. |
1015 | | * |
1016 | | * See the documentation of the class for documentation for `Decode*` |
1017 | | * methods collectively. |
1018 | | */ |
1019 | | inline Tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement( |
1020 | | Span<const uint8_t> aSrc, |
1021 | | Span<uint8_t> aDst, |
1022 | | bool aLast) |
1023 | 0 | { |
1024 | 0 | size_t srcRead = aSrc.Length(); |
1025 | 0 | size_t dstWritten = aDst.Length(); |
1026 | 0 | uint32_t result = decoder_decode_to_utf8_without_replacement( |
1027 | 0 | this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast); |
1028 | 0 | return MakeTuple(result, srcRead, dstWritten); |
1029 | 0 | } |
1030 | | |
1031 | | /** |
1032 | | * Query the worst-case UTF-16 output size (with or without replacement). |
1033 | | * |
1034 | | * Returns the size of the output buffer in UTF-16 code units (`char16_t`) |
1035 | | * that will not overflow given the current state of the decoder and |
1036 | | * `aU16Length` number of additional input bytes (bytes, despite the name). |
1037 | | * |
1038 | | * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the |
1039 | | * return value of this method applies also in the |
1040 | | * `*WithoutReplacement` case. |
1041 | | */ |
1042 | | inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aU16Length) const |
1043 | 0 | { |
1044 | 0 | CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length)); |
1045 | 0 | if (max.value() == MaxValue<size_t>::value) { |
1046 | 0 | // Mark invalid by overflowing |
1047 | 0 | max++; |
1048 | 0 | MOZ_ASSERT(!max.isValid()); |
1049 | 0 | } |
1050 | 0 | return max; |
1051 | 0 | } |
1052 | | |
1053 | | /** |
1054 | | * Incrementally decode a byte stream into UTF-16 with malformed sequences |
1055 | | * replaced with the REPLACEMENT CHARACTER. |
1056 | | * |
1057 | | * See the documentation of the class for documentation for `Decode*` |
1058 | | * methods collectively. |
1059 | | */ |
1060 | | inline Tuple<uint32_t, size_t, size_t, bool> |
1061 | | DecodeToUTF16(Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) |
1062 | 0 | { |
1063 | 0 | size_t srcRead = aSrc.Length(); |
1064 | 0 | size_t dstWritten = aDst.Length(); |
1065 | 0 | bool hadReplacements; |
1066 | 0 | uint32_t result = decoder_decode_to_utf16(this, |
1067 | 0 | aSrc.Elements(), |
1068 | 0 | &srcRead, |
1069 | 0 | aDst.Elements(), |
1070 | 0 | &dstWritten, |
1071 | 0 | aLast, |
1072 | 0 | &hadReplacements); |
1073 | 0 | return MakeTuple(result, srcRead, dstWritten, hadReplacements); |
1074 | 0 | } |
1075 | | |
1076 | | /** |
1077 | | * Incrementally decode a byte stream into UTF-16 _without replacement_. |
1078 | | * |
1079 | | * See the documentation of the class for documentation for `Decode*` |
1080 | | * methods collectively. |
1081 | | */ |
1082 | | inline Tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement( |
1083 | | Span<const uint8_t> aSrc, |
1084 | | Span<char16_t> aDst, |
1085 | | bool aLast) |
1086 | 0 | { |
1087 | 0 | size_t srcRead = aSrc.Length(); |
1088 | 0 | size_t dstWritten = aDst.Length(); |
1089 | 0 | uint32_t result = decoder_decode_to_utf16_without_replacement( |
1090 | 0 | this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast); |
1091 | 0 | return MakeTuple(result, srcRead, dstWritten); |
1092 | 0 | } |
1093 | | |
1094 | | private: |
1095 | | Decoder() = delete; |
1096 | | Decoder(const Decoder&) = delete; |
1097 | | Decoder& operator=(const Decoder&) = delete; |
1098 | | }; |
1099 | | |
1100 | | /** |
1101 | | * A converter that encodes a Unicode stream into bytes according to a |
1102 | | * character encoding in a streaming (incremental) manner. |
1103 | | * |
1104 | | * The various `Encode*` methods take an input buffer (`aSrc`) and an output |
1105 | | * buffer `aDst` both of which are caller-allocated. There are variants for |
1106 | | * both UTF-8 and UTF-16 input buffers. |
1107 | | * |
1108 | | * An `Encode*` method encodes characters from `aSrc` into bytes |
1109 | | * stored into `aDst` until one of the following three things happens: |
1110 | | * |
1111 | | * 1. An unmappable character is encountered (`*WithoutReplacement` variants |
1112 | | * only). |
1113 | | * |
1114 | | * 2. The output buffer has been filled so near capacity that the encoder |
1115 | | * cannot be sure that processing an additional character of input wouldn't |
1116 | | * cause so much output that the output buffer would overflow. |
1117 | | * |
1118 | | * 3. All the input characters have been processed. |
1119 | | * |
1120 | | * The `Encode*` method then returns a tuple of a status indicating which one |
1121 | | * of the three reasons to return happened, how many input code units (`uint8_t` |
1122 | | * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read, |
1123 | | * how many output bytes were written, and in the case of the variants that |
1124 | | * perform replacement, a boolean indicating whether an unmappable |
1125 | | * character was replaced with a numeric character reference during the call. |
1126 | | * |
1127 | | * The number of bytes "written" is what's logically written. Garbage may be |
1128 | | * written in the output buffer beyond the point logically written to. |
1129 | | * |
1130 | | * In the case of the methods whose name ends with |
1131 | | * `*WithoutReplacement`, the status is a `uint32_t` whose possible values |
1132 | | * are an unmappable code point, `kOutputFull` and `kInputEmpty` (corresponding |
1133 | | * to the three cases listed above). |
1134 | | * |
1135 | | * In the case of methods whose name does not end with |
1136 | | * `*WithoutReplacement`, unmappable characters are automatically replaced |
1137 | | * with the corresponding numeric character references and unmappable |
1138 | | * characters do not cause the methods to return early. |
1139 | | * |
1140 | | * When encoding from UTF-8 without replacement, the methods are guaranteed |
1141 | | * not to return indicating that more output space is needed if the length |
1142 | | * of the output buffer is at least the length returned by |
1143 | | * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from |
1144 | | * UTF-8 with replacement, the length of the output buffer that guarantees the |
1145 | | * methods not to return indicating that more output space is needed in the |
1146 | | * absence of unmappable characters is given by |
1147 | | * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from |
1148 | | * UTF-16 without replacement, the methods are guaranteed not to return |
1149 | | * indicating that more output space is needed if the length of the output |
1150 | | * buffer is at least the length returned by |
1151 | | * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding |
1152 | | * from UTF-16 with replacement, the length of the output buffer that |
1153 | | * guarantees the methods not to return indicating that more output space is |
1154 | | * needed in the absence of unmappable characters is given by |
1155 | | * `MaxBufferLengthFromUTF16IfNoUnmappables()`. |
1156 | | * When encoding with replacement, applications are not expected to size the |
1157 | | * buffer for the worst case ahead of time but to resize the buffer if there |
1158 | | * are unmappable characters. This is why max length queries are only available |
1159 | | * for the case where there are no unmappable characters. |
1160 | | * |
1161 | | * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When |
1162 | | * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD |
1163 | | * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to |
1164 | | * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that |
1165 | | * surrogate pairs are not split across input buffer boundaries. |
1166 | | * |
1167 | | * After an `Encode*` call returns, the output produced so far, taken as a |
1168 | | * whole from the start of the stream, is guaranteed to consist of a valid |
1169 | | * byte sequence in the target encoding. (I.e. the code unit sequence for a |
1170 | | * character is guaranteed not to be split across output buffers. However, due |
1171 | | * to the stateful nature of ISO-2022-JP, the stream needs to be considered |
1172 | | * from the start for it to be valid. For other encodings, the validity holds |
1173 | | * on a per-output buffer basis.) |
1174 | | * |
1175 | | * The boolean argument `aLast` indicates that the end of the stream is reached |
1176 | | * when all the characters in `aSrc` have been consumed. This argument is needed |
1177 | | * for ISO-2022-JP and is ignored for other encodings. |
1178 | | * |
1179 | | * An `Encoder` object can be used to incrementally encode a Unicode stream. |
1180 | | * |
1181 | | * During the processing of a single stream, the caller must call `Encode*` |
1182 | | * zero or more times with `aLast` set to `false` and then call `Encode*` at |
1183 | | * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`, |
1184 | | * the processing of the stream has ended. Otherwise, the caller must call |
1185 | | * `Encode*` again with `aLast` set to `true` (or treat an unmappable result, |
1186 | | * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error). |
1187 | | * |
1188 | | * Once the stream has ended, the `Encoder` object must not be used anymore. |
1189 | | * That is, you need to create another one to process another stream. |
1190 | | * |
1191 | | * When the encoder returns `kOutputFull` or the encoder returns an unmappable |
1192 | | * result and the caller does not wish to treat it as a fatal error, the input |
1193 | | * buffer `aSrc` may not have been completely consumed. In that case, the caller |
1194 | | * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next |
1195 | | * call. |
1196 | | * |
1197 | | * # Infinite loops |
1198 | | * |
1199 | | * When converting with a fixed-size output buffer whose size is too small to |
1200 | | * accommodate one character of output, an infinite loop ensues. When |
1201 | | * converting with a fixed-size output buffer, it generally makes sense to |
1202 | | * make the buffer fairly large (e.g. couple of kilobytes). |
1203 | | */ |
1204 | | class Encoder final |
1205 | | { |
1206 | | public: |
1207 | 0 | ~Encoder() {} |
1208 | | |
1209 | | static void operator delete(void* aEncoder) |
1210 | 0 | { |
1211 | 0 | encoder_free(reinterpret_cast<Encoder*>(aEncoder)); |
1212 | 0 | } |
1213 | | |
1214 | | /** |
1215 | | * The `Encoding` this `Encoder` is for. |
1216 | | */ |
1217 | | inline NotNull<const mozilla::Encoding*> Encoding() const |
1218 | 0 | { |
1219 | 0 | return WrapNotNull(encoder_encoding(this)); |
1220 | 0 | } |
1221 | | |
1222 | | /** |
1223 | | * Returns `true` if this is an ISO-2022-JP encoder that's not in the |
1224 | | * ASCII state and `false` otherwise. |
1225 | | */ |
1226 | | inline bool HasPendingState() const |
1227 | 0 | { |
1228 | 0 | return encoder_has_pending_state(this); |
1229 | 0 | } |
1230 | | |
1231 | | /** |
1232 | | * Query the worst-case output size when encoding from UTF-8 with |
1233 | | * replacement. |
1234 | | * |
1235 | | * Returns the size of the output buffer in bytes that will not overflow |
1236 | | * given the current state of the encoder and `aByteLength` number of |
1237 | | * additional input code units if there are no unmappable characters in |
1238 | | * the input. |
1239 | | */ |
1240 | | inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables( |
1241 | | size_t aByteLength) const |
1242 | 0 | { |
1243 | 0 | CheckedInt<size_t> max( |
1244 | 0 | encoder_max_buffer_length_from_utf8_if_no_unmappables(this, aByteLength)); |
1245 | 0 | if (max.value() == MaxValue<size_t>::value) { |
1246 | 0 | // Mark invalid by overflowing |
1247 | 0 | max++; |
1248 | 0 | MOZ_ASSERT(!max.isValid()); |
1249 | 0 | } |
1250 | 0 | return max; |
1251 | 0 | } |
1252 | | |
1253 | | /** |
1254 | | * Query the worst-case output size when encoding from UTF-8 without |
1255 | | * replacement. |
1256 | | * |
1257 | | * Returns the size of the output buffer in bytes that will not overflow |
1258 | | * given the current state of the encoder and `aByteLength` number of |
1259 | | * additional input code units. |
1260 | | */ |
1261 | | inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement( |
1262 | | size_t aByteLength) const |
1263 | 0 | { |
1264 | 0 | CheckedInt<size_t> max( |
1265 | 0 | encoder_max_buffer_length_from_utf8_without_replacement(this, |
1266 | 0 | aByteLength)); |
1267 | 0 | if (max.value() == MaxValue<size_t>::value) { |
1268 | 0 | // Mark invalid by overflowing |
1269 | 0 | max++; |
1270 | 0 | MOZ_ASSERT(!max.isValid()); |
1271 | 0 | } |
1272 | 0 | return max; |
1273 | 0 | } |
1274 | | |
1275 | | /** |
1276 | | * Incrementally encode into byte stream from UTF-8 with unmappable |
1277 | | * characters replaced with HTML (decimal) numeric character references. |
1278 | | * |
1279 | | * See the documentation of the class for documentation for `Encode*` |
1280 | | * methods collectively. |
1281 | | * |
1282 | | * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING: |
1283 | | * The input ***MUST*** be valid UTF-8 or bad things happen! Unless |
1284 | | * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check. |
1285 | | */ |
1286 | | inline Tuple<uint32_t, size_t, size_t, bool> |
1287 | | EncodeFromUTF8(Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) |
1288 | 0 | { |
1289 | 0 | size_t srcRead = aSrc.Length(); |
1290 | 0 | size_t dstWritten = aDst.Length(); |
1291 | 0 | bool hadReplacements; |
1292 | 0 | uint32_t result = encoder_encode_from_utf8(this, |
1293 | 0 | aSrc.Elements(), |
1294 | 0 | &srcRead, |
1295 | 0 | aDst.Elements(), |
1296 | 0 | &dstWritten, |
1297 | 0 | aLast, |
1298 | 0 | &hadReplacements); |
1299 | 0 | return MakeTuple(result, srcRead, dstWritten, hadReplacements); |
1300 | 0 | } |
1301 | | |
1302 | | /** |
1303 | | * Incrementally encode into byte stream from UTF-8 _without replacement_. |
1304 | | * |
1305 | | * See the documentation of the class for documentation for `Encode*` |
1306 | | * methods collectively. |
1307 | | * |
1308 | | * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING: |
1309 | | * The input ***MUST*** be valid UTF-8 or bad things happen! Unless |
1310 | | * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check. |
1311 | | */ |
1312 | | inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement( |
1313 | | Span<const uint8_t> aSrc, |
1314 | | Span<uint8_t> aDst, |
1315 | | bool aLast) |
1316 | 0 | { |
1317 | 0 | size_t srcRead = aSrc.Length(); |
1318 | 0 | size_t dstWritten = aDst.Length(); |
1319 | 0 | uint32_t result = encoder_encode_from_utf8_without_replacement( |
1320 | 0 | this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast); |
1321 | 0 | return MakeTuple(result, srcRead, dstWritten); |
1322 | 0 | } |
1323 | | |
1324 | | /** |
1325 | | * Query the worst-case output size when encoding from UTF-16 with |
1326 | | * replacement. |
1327 | | * |
1328 | | * Returns the size of the output buffer in bytes that will not overflow |
1329 | | * given the current state of the encoder and `aU16Length` number of |
1330 | | * additional input code units if there are no unmappable characters in |
1331 | | * the input. |
1332 | | */ |
1333 | | inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables( |
1334 | | size_t aU16Length) const |
1335 | 0 | { |
1336 | 0 | CheckedInt<size_t> max( |
1337 | 0 | encoder_max_buffer_length_from_utf16_if_no_unmappables(this, aU16Length)); |
1338 | 0 | if (max.value() == MaxValue<size_t>::value) { |
1339 | 0 | // Mark invalid by overflowing |
1340 | 0 | max++; |
1341 | 0 | MOZ_ASSERT(!max.isValid()); |
1342 | 0 | } |
1343 | 0 | return max; |
1344 | 0 | } |
1345 | | |
1346 | | /** |
1347 | | * Query the worst-case output size when encoding from UTF-16 without |
1348 | | * replacement. |
1349 | | * |
1350 | | * Returns the size of the output buffer in bytes that will not overflow |
1351 | | * given the current state of the encoder and `aU16Length` additional |
1352 | | * input code units; invalid if the query yields `MaxValue<size_t>::value`. |
1353 | | */ |
1354 | | inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement( |
1355 | | size_t aU16Length) const |
1356 | 0 | { |
1357 | 0 | CheckedInt<size_t> max( |
1358 | 0 | encoder_max_buffer_length_from_utf16_without_replacement(this, |
1359 | 0 | aU16Length)); |
1360 | 0 | if (max.value() == MaxValue<size_t>::value) { |
1361 | 0 | // Max value reached: mark invalid by overflowing the CheckedInt |
1362 | 0 | max++; |
1363 | 0 | MOZ_ASSERT(!max.isValid()); |
1364 | 0 | } |
1365 | 0 | return max; |
1366 | 0 | } |
1367 | | |
1368 | | /** |
1369 | | * Incrementally encode into byte stream from UTF-16 with unmappable |
1370 | | * characters replaced with HTML (decimal) numeric character references. |
1371 | | * |
1372 | | * See the class documentation, which describes the `Encode*` methods |
1373 | | * collectively. |
1374 | | */ |
1375 | | inline Tuple<uint32_t, size_t, size_t, bool> |
1376 | | EncodeFromUTF16(Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) |
1377 | 0 | { |
1378 | 0 | size_t srcRead = aSrc.Length(); /* starts as the source length; passed by address below */ |
1379 | 0 | size_t dstWritten = aDst.Length(); /* starts as the destination capacity; passed by address below */ |
1380 | 0 | bool hadReplacements; /* not initialized here; its address is handed to the call below */ |
1381 | 0 | uint32_t result = encoder_encode_from_utf16(this, |
1382 | 0 | aSrc.Elements(), |
1383 | 0 | &srcRead, |
1384 | 0 | aDst.Elements(), |
1385 | 0 | &dstWritten, |
1386 | 0 | aLast, |
1387 | 0 | &hadReplacements); /* NOTE(review): callee presumably fills srcRead, dstWritten and hadReplacements - confirm in encoding_rs.h */ |
1388 | 0 | return MakeTuple(result, srcRead, dstWritten, hadReplacements); /* (result code, srcRead, dstWritten, hadReplacements) */ |
1389 | 0 | } |
1390 | | |
1391 | | /** |
1392 | | * Incrementally encode into byte stream from UTF-16 _without replacement_. |
1393 | | * |
1394 | | * See the class documentation, which describes the `Encode*` methods |
1395 | | * collectively. |
1396 | | */ |
1397 | | inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement( |
1398 | | Span<const char16_t> aSrc, |
1399 | | Span<uint8_t> aDst, |
1400 | | bool aLast) |
1401 | 0 | { |
1402 | 0 | size_t srcRead = aSrc.Length(); /* starts as the source length; passed by address below */ |
1403 | 0 | size_t dstWritten = aDst.Length(); /* starts as the destination capacity; passed by address below */ |
1404 | 0 | uint32_t result = encoder_encode_from_utf16_without_replacement( |
1405 | 0 | this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast); /* NOTE(review): callee presumably overwrites srcRead/dstWritten with the amounts read/written - confirm in encoding_rs.h */ |
1406 | 0 | return MakeTuple(result, srcRead, dstWritten); /* (result code, srcRead, dstWritten) as left by the call */ |
1407 | 0 | } |
1408 | | |
1409 | | private: /* Default construction, copy construction and copy assignment are all deleted. */ |
1410 | | Encoder() = delete; /* NOTE(review): instances presumably originate from the encoding_rs FFI rather than C++ construction - confirm */ |
1411 | | Encoder(const Encoder&) = delete; |
1412 | | Encoder& operator=(const Encoder&) = delete; |
1413 | | }; |
1414 | | |
1415 | | }; // namespace mozilla |
1416 | | |
1417 | | #endif // mozilla_Encoding_h |