Coverage Report

Created: 2018-09-25 14:53

/work/obj-fuzz/dist/include/mozilla/Encoding.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2
// file at the top-level directory of this distribution.
3
//
4
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7
// option. This file may not be copied, modified, or distributed
8
// except according to those terms.
9
10
// Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the
11
// "top-level directory" in the above notice refers to
12
// third_party/rust/encoding_c/.
13
14
#ifndef mozilla_Encoding_h
15
#define mozilla_Encoding_h
16
17
#include "mozilla/CheckedInt.h"
18
#include "mozilla/NotNull.h"
19
#include "mozilla/Span.h"
20
#include "mozilla/Tuple.h"
21
#include "nsString.h"
22
23
namespace mozilla {
24
class Encoding;
25
class Decoder;
26
class Encoder;
27
}; // namespace mozilla
28
29
#define ENCODING_RS_ENCODING mozilla::Encoding
30
#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR mozilla::NotNull<const mozilla::Encoding*>
31
#define ENCODING_RS_ENCODER mozilla::Encoder
32
#define ENCODING_RS_DECODER mozilla::Decoder
33
34
#include "encoding_rs.h"
35
36
extern "C" {
37
38
nsresult
39
mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding,
40
                                    uint8_t const* src,
41
                                    size_t src_len,
42
                                    nsAString* dst);
43
44
nsresult
45
mozilla_encoding_decode_to_nsstring_with_bom_removal(
46
  mozilla::Encoding const* encoding,
47
  uint8_t const* src,
48
  size_t src_len,
49
  nsAString* dst);
50
51
nsresult
52
mozilla_encoding_decode_to_nsstring_without_bom_handling(
53
  mozilla::Encoding const* encoding,
54
  uint8_t const* src,
55
  size_t src_len,
56
  nsAString* dst);
57
58
nsresult
59
mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
60
  mozilla::Encoding const* encoding,
61
  uint8_t const* src,
62
  size_t src_len,
63
  nsAString* dst);
64
65
nsresult
66
mozilla_encoding_encode_from_utf16(mozilla::Encoding const** encoding,
67
                                   char16_t const* src,
68
                                   size_t src_len,
69
                                   nsACString* dst);
70
71
nsresult
72
mozilla_encoding_decode_to_nscstring(mozilla::Encoding const** encoding,
73
                                     nsACString const* src,
74
                                     nsACString* dst);
75
76
nsresult
77
mozilla_encoding_decode_to_nscstring_with_bom_removal(
78
  mozilla::Encoding const* encoding,
79
  nsACString const* src,
80
  nsACString* dst);
81
82
nsresult
83
mozilla_encoding_decode_to_nscstring_without_bom_handling(
84
  mozilla::Encoding const* encoding,
85
  nsACString const* src,
86
  nsACString* dst);
87
88
nsresult
89
mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
90
  mozilla::Encoding const* encoding,
91
  uint8_t const* src,
92
  size_t src_len,
93
  nsACString* dst,
94
  size_t already_validated);
95
96
nsresult
97
mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
98
  mozilla::Encoding const* encoding,
99
  nsACString const* src,
100
  nsACString* dst);
101
102
nsresult
103
mozilla_encoding_encode_from_nscstring(mozilla::Encoding const** encoding,
104
                                       nsACString const* src,
105
                                       nsACString* dst);
106
107
} // extern "C"
108
109
namespace mozilla {
110
111
/**
112
 * Return value from `Decoder`/`Encoder` to indicate that input
113
 * was exhausted.
114
 */
115
const uint32_t kInputEmpty = INPUT_EMPTY;
116
117
/**
118
 * Return value from `Decoder`/`Encoder` to indicate that output
119
 * space was insufficient.
120
 */
121
const uint32_t kOutputFull = OUTPUT_FULL;
122
123
/**
124
 * An encoding as defined in the Encoding Standard
125
 * (https://encoding.spec.whatwg.org/).
126
 *
127
 * See https://docs.rs/encoding_rs/ for the Rust API docs.
128
 *
129
 * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
130
 * sequence and, in most cases, vice versa. Each encoding has a name, an output
131
 * encoding, and one or more labels.
132
 *
133
 * _Labels_ are ASCII-case-insensitive strings that are used to identify an
134
 * encoding in formats and protocols. The _name_ of the encoding is the
135
 * preferred label in the case appropriate for returning from the
136
 * `characterSet` property of the `Document` DOM interface, except for
137
 * the replacement encoding whose name is not one of its labels.
138
 *
139
 * The _output encoding_ is the encoding used for form submission and URL
140
 * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
141
 * UTF-16LE and UTF-16BE encodings and the encoding itself for other
142
 * encodings.
143
 *
144
 * # Streaming vs. Non-Streaming
145
 *
146
 * When you have the entire input in a single buffer, you can use the
147
 * methods `Decode()`, `DecodeWithBOMRemoval()`,
148
 * `DecodeWithoutBOMHandling()`,
149
 * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
150
 * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
151
 * `NewEncoder()` methods), these methods perform heap allocations. You should
152
 * use the `Decoder` and `Encoder` objects when your input is split into multiple
153
 * buffers or when you want to control the allocation of the output buffers.
154
 *
155
 * # Instances
156
 *
157
 * All instances of `Encoding` are statically allocated and have the process's
158
 * lifetime. There is precisely one unique `Encoding` instance for each
159
 * encoding defined in the Encoding Standard.
160
 *
161
 * To obtain a reference to a particular encoding whose identity you know at
162
 * compile time, use a `static` that refers to the encoding. There is a `static`
163
 * for each encoding. The `static`s are named in all caps with hyphens
164
 * replaced with underscores and with `_ENCODING` appended to the
165
 * name. For example, if you know at compile time that you will want to
166
 * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
167
 *
168
 * If you don't know what encoding you need at compile time and need to
169
 * dynamically get an encoding by label, use `Encoding::ForLabel()`.
170
 *
171
 * Pointers to `Encoding` can be compared with `==` to check for the sameness
172
 * of two encodings.
173
 *
174
 * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer
175
 * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use
176
 * `const mozilla::Encoding*` in the C signature and
177
 * `*const encoding_rs::Encoding` in the corresponding Rust signature.
178
 */
179
class Encoding final
180
{
181
public:
182
  /**
183
   * Implements the _get an encoding_ algorithm
184
   * (https://encoding.spec.whatwg.org/#concept-encoding-get).
185
   *
186
   * If, after ASCII-lowercasing and removing leading and trailing
187
   * whitespace, the argument matches a label defined in the Encoding
188
   * Standard, `const Encoding*` representing the corresponding
189
   * encoding is returned. If there is no match, `nullptr` is returned.
190
   *
191
   * This is the right method to use if the action upon the method returning
192
   * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
193
   * instead. When the action upon the method returning `nullptr` is not to
194
   * proceed with a fallback but to refuse processing,
195
   * `ForLabelNoReplacement()` is more appropriate.
196
  */
197
  static inline const Encoding* ForLabel(Span<const char> aLabel)
198
0
  {
199
0
    return encoding_for_label(
200
0
      reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
201
0
  }
202
203
  /**
204
   * `nsAString` argument version. See above for docs.
205
   */
206
  static inline const Encoding* ForLabel(const nsAString& aLabel)
207
0
  {
208
0
    return Encoding::ForLabel(NS_ConvertUTF16toUTF8(aLabel));
209
0
  }
210
211
  /**
212
   * This method behaves the same as `ForLabel()`, except when `ForLabel()`
213
   * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
214
   *
215
   * This method is useful in scenarios where a fatal error is required
216
   * upon invalid label, because in those cases the caller typically wishes
217
   * to treat the labels that map to the replacement encoding as fatal
218
   * errors, too.
219
   *
220
   * It is not OK to use this method when the action upon the method returning
221
   * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
222
   * such a case, the `ForLabel()` method should be used instead in order to avoid
223
   * unsafe fallback for labels that `ForLabel()` maps to `REPLACEMENT_ENCODING`.
224
   */
225
  static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel)
226
1.11M
  {
227
1.11M
    return encoding_for_label_no_replacement(
228
1.11M
      reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
229
1.11M
  }
230
231
  /**
232
   * `nsAString` argument version. See above for docs.
233
   */
234
  static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel)
235
0
  {
236
0
    return Encoding::ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel));
237
0
  }
238
239
  /**
240
   * Performs non-incremental BOM sniffing.
241
   *
242
   * The argument must either be a buffer representing the entire input
243
   * stream (non-streaming case) or a buffer representing at least the first
244
   * three bytes of the input stream (streaming case).
245
   *
246
   * Returns `MakeTuple(UTF_8_ENCODING, 3)`, `MakeTuple(UTF_16LE_ENCODING, 2)`
247
   * or `MakeTuple(UTF_16BE_ENCODING, 2)` if the argument starts with the
248
   * UTF-8, UTF-16LE or UTF-16BE BOM or `MakeTuple(nullptr, 0)` otherwise.
249
   */
250
  static inline Tuple<const Encoding*, size_t> ForBOM(
251
    Span<const uint8_t> aBuffer)
252
0
  {
253
0
    size_t len = aBuffer.Length();
254
0
    const Encoding* encoding = encoding_for_bom(aBuffer.Elements(), &len);
255
0
    return MakeTuple(encoding, len);
256
0
  }
257
258
  /**
259
   * Writes the name of this encoding into `aName`.
260
   *
261
   * This name is appropriate to return as-is from the DOM
262
   * `document.characterSet` property.
263
   */
264
  inline void Name(nsACString& aName) const
265
0
  {
266
0
    aName.SetLength(ENCODING_NAME_MAX_LENGTH);
267
0
    size_t length =
268
0
      encoding_name(this, reinterpret_cast<uint8_t*>(aName.BeginWriting()));
269
0
    aName.SetLength(length); // truncation is the 64-bit case is OK
270
0
  }
271
272
  /**
273
   * Checks whether the _output encoding_ of this encoding can encode every
274
   * Unicode code point. (Only true if the output encoding is UTF-8.)
275
   */
276
  inline bool CanEncodeEverything() const
277
0
  {
278
0
    return encoding_can_encode_everything(this);
279
0
  }
280
281
  /**
282
   * Checks whether the bytes 0x00...0x7F map exclusively to the characters
283
   * U+0000...U+007F and vice versa.
284
   */
285
  inline bool IsAsciiCompatible() const
286
0
  {
287
0
    return encoding_is_ascii_compatible(this);
288
0
  }
289
290
  /**
291
   * Returns the _output encoding_ of this encoding. This is UTF-8 for
292
   * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
293
   */
294
  inline NotNull<const mozilla::Encoding*> OutputEncoding() const
295
0
  {
296
0
    return WrapNotNull(encoding_output_encoding(this));
297
0
  }
298
299
  /**
300
   * Decode complete input to `nsACString` _with BOM sniffing_ and with
301
   * malformed sequences replaced with the REPLACEMENT CHARACTER when the
302
   * entire input is available as a single buffer (i.e. the end of the
303
   * buffer marks the end of the stream).
304
   *
305
   * This method implements the (non-streaming version of) the
306
   * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
307
   *
308
   * The second item in the returned tuple is the encoding that was actually
309
   * used (which may differ from this encoding thanks to BOM sniffing).
310
   *
311
   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
312
   * if there were malformed sequences (that were replaced with the
313
   * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
314
   * tuple.
315
   *
316
   * The backing buffer of the string isn't copied if the input buffer
317
   * is heap-allocated and decoding from UTF-8 and the input is valid
318
   * BOMless UTF-8, decoding from an ASCII-compatible encoding and
319
   * the input is valid ASCII or decoding from ISO-2022-JP and the
320
   * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
321
   * the same string as both arguments.
322
   *
323
   * _Note:_ It is wrong to use this when the input buffer represents only
324
   * a segment of the input instead of the whole input. Use `NewDecoder()`
325
   * when decoding segmented input.
326
   */
327
  inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
328
    const nsACString& aBytes,
329
    nsACString& aOut) const
330
0
  {
331
0
    const Encoding* encoding = this;
332
0
    const nsACString* bytes = &aBytes;
333
0
    nsACString* out = &aOut;
334
0
    nsresult rv;
335
0
    if (bytes == out) {
336
0
      nsAutoCString temp(aBytes);
337
0
      rv = mozilla_encoding_decode_to_nscstring(&encoding, &temp, out);
338
0
    } else {
339
0
      rv = mozilla_encoding_decode_to_nscstring(&encoding, bytes, out);
340
0
    }
341
0
    return MakeTuple(rv, WrapNotNull(encoding));
342
0
  }
343
344
  /**
345
   * Decode complete input to `nsAString` _with BOM sniffing_ and with
346
   * malformed sequences replaced with the REPLACEMENT CHARACTER when the
347
   * entire input is available as a single buffer (i.e. the end of the
348
   * buffer marks the end of the stream).
349
   *
350
   * This method implements the (non-streaming version of) the
351
   * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
352
   *
353
   * The second item in the returned tuple is the encoding that was actually
354
   * used (which may differ from this encoding thanks to BOM sniffing).
355
   *
356
   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
357
   * if there were malformed sequences (that were replaced with the
358
   * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
359
   * tuple.
360
   *
361
   * _Note:_ It is wrong to use this when the input buffer represents only
362
   * a segment of the input instead of the whole input. Use `NewDecoder()`
363
   * when decoding segmented input.
364
   */
365
  inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
366
    Span<const uint8_t> aBytes,
367
    nsAString& aOut) const
368
0
  {
369
0
    const Encoding* encoding = this;
370
0
    nsresult rv = mozilla_encoding_decode_to_nsstring(
371
0
      &encoding, aBytes.Elements(), aBytes.Length(), &aOut);
372
0
    return MakeTuple(rv, WrapNotNull(encoding));
373
0
  }
374
375
  /**
376
   * Decode complete input to `nsACString` _with BOM removal_ and with
377
   * malformed sequences replaced with the REPLACEMENT CHARACTER when the
378
   * entire input is available as a single buffer (i.e. the end of the
379
   * buffer marks the end of the stream).
380
   *
381
   * When invoked on `UTF_8`, this method implements the (non-streaming
382
   * version of) the _UTF-8 decode_
383
   * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
384
   *
385
   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
386
   * if there were malformed sequences (that were replaced with the
387
   * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
388
   *
389
   * The backing buffer of the string isn't copied if the input buffer
390
   * is heap-allocated and decoding from UTF-8 and the input is valid
391
   * BOMless UTF-8, decoding from an ASCII-compatible encoding and
392
   * the input is valid ASCII or decoding from ISO-2022-JP and the
393
   * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
394
   * the same string as both arguments.
395
   *
396
   * _Note:_ It is wrong to use this when the input buffer represents only
397
   * a segment of the input instead of the whole input. Use
398
   * `NewDecoderWithBOMRemoval()` when decoding segmented input.
399
   */
400
  inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes,
401
                                       nsACString& aOut) const
402
0
  {
403
0
    const nsACString* bytes = &aBytes;
404
0
    nsACString* out = &aOut;
405
0
    if (bytes == out) {
406
0
      nsAutoCString temp(aBytes);
407
0
      return mozilla_encoding_decode_to_nscstring_with_bom_removal(
408
0
        this, &temp, out);
409
0
    }
410
0
    return mozilla_encoding_decode_to_nscstring_with_bom_removal(
411
0
      this, bytes, out);
412
0
  }
413
414
  /**
415
   * Decode complete input to `nsAString` _with BOM removal_ and with
416
   * malformed sequences replaced with the REPLACEMENT CHARACTER when the
417
   * entire input is available as a single buffer (i.e. the end of the
418
   * buffer marks the end of the stream).
419
   *
420
   * When invoked on `UTF_8`, this method implements the (non-streaming
421
   * version of) the _UTF-8 decode_
422
   * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
423
   *
424
   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
425
   * if there were malformed sequences (that were replaced with the
426
   * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
427
   *
428
   * _Note:_ It is wrong to use this when the input buffer represents only
429
   * a segment of the input instead of the whole input. Use
430
   * `NewDecoderWithBOMRemoval()` when decoding segmented input.
431
   */
432
  inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes,
433
                                       nsAString& aOut) const
434
0
  {
435
0
    return mozilla_encoding_decode_to_nsstring_with_bom_removal(
436
0
      this, aBytes.Elements(), aBytes.Length(), &aOut);
437
0
  }
438
439
  /**
440
   * Decode complete input to `nsACString` _without BOM handling_ and
441
   * with malformed sequences replaced with the REPLACEMENT CHARACTER when
442
   * the entire input is available as a single buffer (i.e. the end of the
443
   * buffer marks the end of the stream).
444
   *
445
   * When invoked on `UTF_8`, this method implements the (non-streaming
446
   * version of) the _UTF-8 decode without BOM_
447
   * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
448
   *
449
   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
450
   * if there were malformed sequences (that were replaced with the
451
   * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
452
   *
453
   * The backing buffer of the string isn't copied if the input buffer
454
   * is heap-allocated and decoding from UTF-8 and the input is valid
455
   * UTF-8, decoding from an ASCII-compatible encoding and the input
456
   * is valid ASCII or decoding from ISO-2022-JP and the input stays
457
   * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
458
   * as both arguments.
459
   *
460
   * _Note:_ It is wrong to use this when the input buffer represents only
461
   * a segment of the input instead of the whole input. Use
462
   * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
463
   */
464
  inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes,
465
                                           nsACString& aOut) const
466
0
  {
467
0
    const nsACString* bytes = &aBytes;
468
0
    nsACString* out = &aOut;
469
0
    if (bytes == out) {
470
0
      nsAutoCString temp(aBytes);
471
0
      return mozilla_encoding_decode_to_nscstring_without_bom_handling(
472
0
        this, &temp, out);
473
0
    }
474
0
    return mozilla_encoding_decode_to_nscstring_without_bom_handling(
475
0
      this, bytes, out);
476
0
  }
477
478
  /**
479
   * Decode complete input to `nsAString` _without BOM handling_ and
480
   * with malformed sequences replaced with the REPLACEMENT CHARACTER when
481
   * the entire input is available as a single buffer (i.e. the end of the
482
   * buffer marks the end of the stream).
483
   *
484
   * When invoked on `UTF_8`, this method implements the (non-streaming
485
   * version of) the _UTF-8 decode without BOM_
486
   * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
487
   *
488
   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
489
   * if there were malformed sequences (that were replaced with the
490
   * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
491
   *
492
   * _Note:_ It is wrong to use this when the input buffer represents only
493
   * a segment of the input instead of the whole input. Use
494
   * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
495
   */
496
  inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
497
                                           nsAString& aOut) const
498
0
  {
499
0
    return mozilla_encoding_decode_to_nsstring_without_bom_handling(
500
0
      this, aBytes.Elements(), aBytes.Length(), &aOut);
501
0
  }
502
503
  /**
504
   * Decode complete input to `nsACString` _without BOM handling_ and
505
   * _with malformed sequences treated as fatal_ when the entire input is
506
   * available as a single buffer (i.e. the end of the buffer marks the end
507
   * of the stream).
508
   *
509
   * When invoked on `UTF_8`, this method implements the (non-streaming
510
   * version of) the _UTF-8 decode without BOM or fail_
511
   * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
512
   * spec concept.
513
   *
514
   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
515
   * if a malformed sequence was encountered and `NS_OK` otherwise.
516
   *
517
   * The backing buffer of the string isn't copied if the input buffer
518
   * is heap-allocated and decoding from UTF-8 and the input is valid
519
   * UTF-8, decoding from an ASCII-compatible encoding and the input
520
   * is valid ASCII or decoding from ISO-2022-JP and the input stays
521
   * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
522
   * as both arguments.
523
   *
524
   * _Note:_ It is wrong to use this when the input buffer represents only
525
   * a segment of the input instead of the whole input. Use
526
   * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
527
   */
528
  inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
529
    const nsACString& aBytes,
530
    nsACString& aOut) const
531
0
  {
532
0
    const nsACString* bytes = &aBytes;
533
0
    nsACString* out = &aOut;
534
0
    if (bytes == out) {
535
0
      nsAutoCString temp(aBytes);
536
0
      return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
537
0
        this, &temp, out);
538
0
    }
539
0
    return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
540
0
      this, bytes, out);
541
0
  }
542
543
  /**
544
   * Decode complete input to `nsACString` _without BOM handling_ and
545
   * with malformed sequences replaced with the REPLACEMENT CHARACTER when
546
   * the entire input is available as a single buffer (i.e. the end of the
547
   * buffer marks the end of the stream) _asserting that a number of bytes
548
   * from the start are already known to be valid UTF-8_.
549
   *
550
   * The use case for this method is avoiding copying when dealing with
551
   * input that has a UTF-8 BOM. _When in doubt, do not use this method._
552
   *
553
   * When invoked on `UTF_8`, this method implements the (non-streaming
554
   * version of) the _UTF-8 decode without BOM_
555
   * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
556
   *
557
   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
558
   * if there were malformed sequences (that were replaced with the
559
   * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
560
   *
561
   * _Note:_ It is wrong to use this when the input buffer represents only
562
   * a segment of the input instead of the whole input. Use
563
   * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
564
   *
565
   * # Safety
566
   *
567
   * The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8.
568
   * `aBytes` _must not_ alias the buffer (if any) of `aOut`.
569
   */
570
  inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
571
                                           nsACString& aOut,
572
                                           size_t aAlreadyValidated) const
573
0
  {
574
0
    return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
575
0
      this, aBytes.Elements(), aBytes.Length(), &aOut, aAlreadyValidated);
576
0
  }
577
578
  /**
579
   * Decode complete input to `nsAString` _without BOM handling_ and
580
   * _with malformed sequences treated as fatal_ when the entire input is
581
   * available as a single buffer (i.e. the end of the buffer marks the end
582
   * of the stream).
583
   *
584
   * When invoked on `UTF_8`, this method implements the (non-streaming
585
   * version of) the _UTF-8 decode without BOM or fail_
586
   * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
587
   * spec concept.
588
   *
589
   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
590
   * if a malformed sequence was encountered and `NS_OK` otherwise.
591
   *
592
   * _Note:_ It is wrong to use this when the input buffer represents only
593
   * a segment of the input instead of the whole input. Use
594
   * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
595
   */
596
  inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
597
    Span<const uint8_t> aBytes,
598
    nsAString& aOut) const
599
0
  {
600
0
    return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
601
0
      this, aBytes.Elements(), aBytes.Length(), &aOut);
602
0
  }
603
604
  /**
605
   * Encode complete input to `nsACString` with unmappable characters
606
   * replaced with decimal numeric character references when the entire input
607
   * is available as a single buffer (i.e. the end of the buffer marks the
608
   * end of the stream).
609
   *
610
   * This method implements the (non-streaming version of) the
611
   * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
612
   *
613
   * The second item in the returned tuple is the encoding that was actually
614
   * used (which may differ from this encoding thanks to some encodings
615
   * having UTF-8 as their output encoding).
616
   *
617
   * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if
618
   * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM,
619
   * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were
620
   * replaced with numeric character references) and `NS_OK` otherwise.
621
   *
622
   * The backing buffer of the string isn't copied if the input buffer
623
   * is heap-allocated and encoding to UTF-8 and the input is valid
624
   * UTF-8, encoding to an ASCII-compatible encoding and the input
625
   * is valid ASCII or encoding from ISO-2022-JP and the input stays
626
   * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
627
   * as both arguments.
628
   *
629
   * _Note:_ It is wrong to use this when the input buffer represents only
630
   * a segment of the input instead of the whole input. Use `NewEncoder()`
631
   * when encoding segmented input.
632
   */
633
  inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
634
    const nsACString& aString,
635
    nsACString& aOut) const
636
0
  {
637
0
    const Encoding* encoding = this;
638
0
    const nsACString* string = &aString;
639
0
    nsACString* out = &aOut;
640
0
    nsresult rv;
641
0
    if (string == out) {
642
0
      nsAutoCString temp(aString);
643
0
      rv = mozilla_encoding_encode_from_nscstring(&encoding, &temp, out);
644
0
    } else {
645
0
      rv = mozilla_encoding_encode_from_nscstring(&encoding, string, out);
646
0
    }
647
0
    return MakeTuple(rv, WrapNotNull(encoding));
648
0
  }
649
650
  /**
651
   * Encode complete input to `nsACString` with unmappable characters
652
   * replaced with decimal numeric character references when the entire input
653
   * is available as a single buffer (i.e. the end of the buffer marks the
654
   * end of the stream).
655
   *
656
   * This method implements the (non-streaming version of) the
657
   * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
658
   *
659
   * The second item in the returned tuple is the encoding that was actually
660
   * used (which may differ from this encoding thanks to some encodings
661
   * having UTF-8 as their output encoding).
662
   *
663
   * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon
664
   * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that
665
   * were replaced with numeric character references) and `NS_OK` otherwise.
666
667
   * _Note:_ It is wrong to use this when the input buffer represents only
668
   * a segment of the input instead of the whole input. Use `NewEncoder()`
669
   * when encoding segmented input.
670
   */
671
  inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
672
    Span<const char16_t> aString,
673
    nsACString& aOut) const
674
0
  {
675
0
    const Encoding* encoding = this;
676
0
    nsresult rv = mozilla_encoding_encode_from_utf16(
677
0
      &encoding, aString.Elements(), aString.Length(), &aOut);
678
0
    return MakeTuple(rv, WrapNotNull(encoding));
679
0
  }
680
681
  /**
682
   * Instantiates a new decoder for this encoding with BOM sniffing enabled.
683
   *
684
   * BOM sniffing may cause the returned decoder to morph into a decoder
685
   * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
686
   */
687
  inline UniquePtr<Decoder> NewDecoder() const
688
0
  {
689
0
    UniquePtr<Decoder> decoder(encoding_new_decoder(this));
690
0
    return decoder;
691
0
  }
692
693
  /**
694
   * Instantiates a new decoder for this encoding with BOM sniffing enabled
695
   * into memory occupied by a previously-instantiated decoder.
696
   *
697
   * BOM sniffing may cause the returned decoder to morph into a decoder
698
   * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
699
   */
700
  inline void NewDecoderInto(Decoder& aDecoder) const
701
0
  {
702
0
    encoding_new_decoder_into(this, &aDecoder);
703
0
  }
704
705
  /**
706
   * Instantiates a new decoder for this encoding with BOM removal.
707
   *
708
   * If the input starts with bytes that are the BOM for this encoding,
709
   * those bytes are removed. However, the decoder never morphs into a
710
   * decoder for another encoding: A BOM for another encoding is treated as
711
   * (potentially malformed) input to the decoding algorithm for this
712
   * encoding.
713
   */
714
  inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const
715
0
  {
716
0
    UniquePtr<Decoder> decoder(encoding_new_decoder_with_bom_removal(this));
717
0
    return decoder;
718
0
  }
719
720
  /**
721
   * Instantiates a new decoder for this encoding with BOM removal
722
   * into memory occupied by a previously-instantiated decoder.
723
   *
724
   * If the input starts with bytes that are the BOM for this encoding,
725
   * those bytes are removed. However, the decoder never morphs into a
726
   * decoder for another encoding: A BOM for another encoding is treated as
727
   * (potentially malformed) input to the decoding algorithm for this
728
   * encoding.
729
   */
730
  inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const
731
0
  {
732
0
    encoding_new_decoder_with_bom_removal_into(this, &aDecoder); // the existing object's address is passed as the target
733
0
  }
734
735
  /**
736
   * Instantiates a new decoder for this encoding with BOM handling disabled.
737
   *
738
   * If the input starts with bytes that look like a BOM, those bytes are
739
   * not treated as a BOM. (Hence, the decoder never morphs into a decoder
740
   * for another encoding.)
741
   *
742
   * _Note:_ If the caller has performed BOM sniffing on its own but has not
743
   * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()`
744
   * instead of this method to cause the BOM to be removed.
745
   */
746
  inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const
747
0
  {
748
0
    UniquePtr<Decoder> decoder(encoding_new_decoder_without_bom_handling(this)); // wrap the returned raw pointer
749
0
    return decoder; // the UniquePtr is handed to the caller
750
0
  }
751
752
  /**
753
   * Instantiates a new decoder for this encoding with BOM handling disabled
754
   * into memory occupied by a previously-instantiated decoder.
755
   *
756
   * If the input starts with bytes that look like a BOM, those bytes are
757
   * not treated as a BOM. (Hence, the decoder never morphs into a decoder
758
   * for another encoding.)
759
   *
760
   * _Note:_ If the caller has performed BOM sniffing on its own but has not
761
   * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()`
762
   * instead of this method to cause the BOM to be removed.
763
   */
764
  inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const
765
0
  {
766
0
    encoding_new_decoder_without_bom_handling_into(this, &aDecoder); // the existing object's address is passed as the target
767
0
  }
768
769
  /**
770
   * Instantiates a new encoder for the output encoding of this encoding.
771
   */
772
  inline UniquePtr<Encoder> NewEncoder() const
773
0
  {
774
0
    UniquePtr<Encoder> encoder(encoding_new_encoder(this)); // wrap the raw pointer returned by encoding_new_encoder()
775
0
    return encoder; // the UniquePtr is handed to the caller
776
0
  }
777
778
  /**
779
   * Instantiates a new encoder for the output encoding of this encoding
780
   * into memory occupied by a previously-instantiated encoder.
781
   */
782
  inline void NewEncoderInto(Encoder& aEncoder) const
783
0
  {
784
0
    encoding_new_encoder_into(this, &aEncoder); // the existing object's address is passed as the target
785
0
  }
786
787
  /**
788
   * Validates UTF-8.
789
   *
790
   * Returns the index of the first byte that makes the input malformed as
791
   * UTF-8 or the length of the input if the input is entirely valid.
792
   */
793
  static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer)
794
0
  {
795
0
    return encoding_utf8_valid_up_to(aBuffer.Elements(), aBuffer.Length()); // the span is unpacked into pointer + length
796
0
  }
797
798
  /**
799
   * Validates ASCII.
800
   *
801
   * Returns the index of the first byte that makes the input malformed as
802
   * ASCII or the length of the input if the input is entirely valid.
803
   */
804
  static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer)
805
0
  {
806
0
    return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length()); // the span is unpacked into pointer + length
807
0
  }
808
809
  /**
810
   * Validates ISO-2022-JP ASCII-state data.
811
   *
812
   * Returns the index of the first byte that makes the input not
813
   * representable in the ASCII state of ISO-2022-JP or the length of the
814
   * input if the input is entirely representable in the ASCII state of
815
   * ISO-2022-JP.
816
   */
817
  static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer)
818
0
  {
819
0
    return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(), // the span is unpacked into pointer + length
820
0
                                                  aBuffer.Length());
821
0
  }
822
823
private:
824
  Encoding() = delete; // no default construction from C++
825
  Encoding(const Encoding&) = delete; // not copy-constructible
826
  Encoding& operator=(const Encoding&) = delete; // not copy-assignable
827
  ~Encoding() = delete; // C++ code cannot destroy an Encoding
828
829
};
830
831
/**
832
 * A converter that decodes a byte stream into Unicode according to a
833
 * character encoding in a streaming (incremental) manner.
834
 *
835
 * The various `Decode*` methods take an input buffer (`aSrc`) and an output
836
 * buffer `aDst` both of which are caller-allocated. There are variants for
837
 * both UTF-8 and UTF-16 output buffers.
838
 *
839
 * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
840
 * into `aDst` until one of the following three things happens:
841
 *
842
 * 1. A malformed byte sequence is encountered (`*WithoutReplacement`
843
 *    variants only).
844
 *
845
 * 2. The output buffer has been filled so near capacity that the decoder
846
 *    cannot be sure that processing an additional byte of input wouldn't
847
 *    cause so much output that the output buffer would overflow.
848
 *
849
 * 3. All the input bytes have been processed.
850
 *
851
 * The `Decode*` method then returns a tuple of a status indicating which one
852
 * of the three reasons to return happened, how many input bytes were read,
853
 * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
854
 * when decoding to UTF-16) were written, and in the case of the
855
 * variants performing replacement, a boolean indicating whether an error was
856
 * replaced with the REPLACEMENT CHARACTER during the call.
857
 *
858
 * The number of bytes "written" is what's logically written. Garbage may be
859
 * written in the output buffer beyond the point logically written to.
860
 *
861
 * In the case of the `*WithoutReplacement` variants, the status is a
862
 * `uint32_t` whose possible values are packed info about a malformed byte
863
 * sequence, `kOutputFull` and `kInputEmpty` (corresponding to the three cases
864
 * listed above).
865
 *
866
 * Packed info about malformed sequences has the following format:
867
 * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
868
 * indicate the number of bytes that were consumed after the malformed
869
 * sequence, and the next-lowest 8 bits, when shifted right by 8, indicate
870
 * the length of the malformed byte sequence (possible decimal values 1, 2,
871
 * 3 or 4). The maximum possible sum of the two is 6.
872
 *
873
 * In the case of methods whose name does not end with
874
 * `*WithoutReplacement`, malformed sequences are automatically replaced
875
 * with the REPLACEMENT CHARACTER and errors do not cause the methods to
876
 * return early.
877
 *
878
 * When decoding to UTF-8, the output buffer must have at least 4 bytes of
879
 * space. When decoding to UTF-16, the output buffer must have at least two
880
 * UTF-16 code units (`char16_t`) of space.
881
 *
882
 * When decoding to UTF-8 without replacement, the methods are guaranteed
883
 * not to return indicating that more output space is needed if the length
884
 * of the output buffer is at least the length returned by
885
 * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
886
 * with replacement, the length of the output buffer that guarantees the
887
 * methods not to return indicating that more output space is needed is given
888
 * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
889
 * or without replacement, the length of the output buffer that guarantees
890
 * the methods not to return indicating that more output space is needed is
891
 * given by `MaxUTF16BufferLength()`.
892
 *
893
 * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
894
 * and the output after each `Decode*` call is guaranteed to consist of
895
 * complete characters. (I.e. the code unit sequence for the last character is
896
 * guaranteed not to be split across output buffers.)
897
 *
898
 * The boolean argument `aLast` indicates that the end of the stream is reached
899
 * when all the bytes in `aSrc` have been consumed.
900
 *
901
 * A `Decoder` object can be used to incrementally decode a byte stream.
902
 *
903
 * During the processing of a single stream, the caller must call `Decode*`
904
 * zero or more times with `aLast` set to `false` and then call `Decode*` at
905
 * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
906
 * the processing of the stream has ended. Otherwise, the caller must call
907
 * `Decode*` again with `aLast` set to `true` (or treat a malformed result,
908
 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
909
 *
910
 * Once the stream has ended, the `Decoder` object must not be used anymore.
911
 * That is, you need to create another one to process another stream.
912
 *
913
 * When the decoder returns `kOutputFull` or the decoder returns a malformed
914
 * result and the caller does not wish to treat it as a fatal error, the input
915
 * buffer `aSrc` may not have been completely consumed. In that case, the caller
916
 * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
917
 * call.
918
 *
919
 * # Infinite loops
920
 *
921
 * When converting with a fixed-size output buffer whose size is too small to
922
 * accommodate one character of output, an infinite loop ensues. When
923
 * converting with a fixed-size output buffer, it generally makes sense to
924
 * make the buffer fairly large (e.g. a couple of kilobytes).
925
 */
926
class Decoder final
927
{
928
public:
929
0
  ~Decoder() {} // intentionally empty; the class-specific operator delete below calls decoder_free()
930
  static void operator delete(void* aDecoder)
931
0
  {
932
0
    decoder_free(reinterpret_cast<Decoder*>(aDecoder)); // deallocation is routed to decoder_free()
933
0
  }
934
935
  /**
936
   * The `Encoding` this `Decoder` is for.
937
   *
938
   * BOM sniffing can change the return value of this method during the life
939
   * of the decoder.
940
   */
941
  inline NotNull<const mozilla::Encoding*> Encoding() const
942
0
  {
943
0
    return WrapNotNull(decoder_encoding(this)); // the pointer from decoder_encoding() is wrapped as NotNull
944
0
  }
945
946
  /**
947
   * Query the worst-case UTF-8 output size _with replacement_.
948
   *
949
   * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
950
   * that will not overflow given the current state of the decoder and
951
   * `aByteLength` number of additional input bytes when decoding with
952
   * errors handled by outputting a REPLACEMENT CHARACTER for each malformed
953
   * sequence.
954
   */
955
  inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const
956
0
  {
957
0
    CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength)); // wrap the raw size_t result
958
0
    if (max.value() == MaxValue<size_t>::value) { // the largest size_t is treated as "no valid length"
959
0
      // Mark invalid by overflowing
960
0
      max++;
961
0
      MOZ_ASSERT(!max.isValid());
962
0
    }
963
0
    return max; // callers must check isValid() before using the value
964
0
  }
965
966
  /**
967
   * Query the worst-case UTF-8 output size _without replacement_.
968
   *
969
   * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
970
   * that will not overflow given the current state of the decoder and
971
   * `aByteLength` number of additional input bytes when decoding without
972
   * replacement error handling.
973
   *
974
   * Note that this value may be too small for the `WithReplacement` case.
975
   * Use `MaxUTF8BufferLength()` for that case.
976
   */
977
  inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement(
978
    size_t aByteLength) const
979
0
  {
980
0
    CheckedInt<size_t> max( // wrap the raw size_t result
981
0
      decoder_max_utf8_buffer_length_without_replacement(this, aByteLength));
982
0
    if (max.value() == MaxValue<size_t>::value) { // the largest size_t is treated as "no valid length"
983
0
      // Mark invalid by overflowing
984
0
      max++;
985
0
      MOZ_ASSERT(!max.isValid());
986
0
    }
987
0
    return max; // callers must check isValid() before using the value
988
0
  }
989
990
  /**
991
   * Incrementally decode a byte stream into UTF-8 with malformed sequences
992
   * replaced with the REPLACEMENT CHARACTER.
993
   *
994
   * See the documentation of the class for documentation for `Decode*`
995
   * methods collectively.
996
   */
997
  inline Tuple<uint32_t, size_t, size_t, bool>
998
  DecodeToUTF8(Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast)
999
0
  {
1000
0
    size_t srcRead = aSrc.Length(); // starts as the input length; passed by pointer below
1001
0
    size_t dstWritten = aDst.Length(); // starts as the output capacity; passed by pointer below
1002
0
    bool hadReplacements; // not initialized here; its address is passed to the call below
1003
0
    uint32_t result = decoder_decode_to_utf8(this,
1004
0
                                             aSrc.Elements(),
1005
0
                                             &srcRead,
1006
0
                                             aDst.Elements(),
1007
0
                                             &dstWritten,
1008
0
                                             aLast,
1009
0
                                             &hadReplacements);
1010
0
    return MakeTuple(result, srcRead, dstWritten, hadReplacements); // (status, read, written, replaced)
1011
0
  }
1012
1013
  /**
1014
   * Incrementally decode a byte stream into UTF-8 _without replacement_.
1015
   *
1016
   * See the documentation of the class for documentation for `Decode*`
1017
   * methods collectively.
1018
   */
1019
  inline Tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement(
1020
    Span<const uint8_t> aSrc,
1021
    Span<uint8_t> aDst,
1022
    bool aLast)
1023
0
  {
1024
0
    size_t srcRead = aSrc.Length(); // starts as the input length; passed by pointer below
1025
0
    size_t dstWritten = aDst.Length(); // starts as the output capacity; passed by pointer below
1026
0
    uint32_t result = decoder_decode_to_utf8_without_replacement(
1027
0
      this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1028
0
    return MakeTuple(result, srcRead, dstWritten); // (status, read, written)
1029
0
  }
1030
1031
  /**
1032
   * Query the worst-case UTF-16 output size (with or without replacement).
1033
   *
1034
   * Returns the size of the output buffer in UTF-16 code units (`char16_t`)
1035
   * that will not overflow given the current state of the decoder and
1036
   * `aByteLength` number of additional input bytes.
1037
   *
1038
   * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
1039
   * return value of this method applies also in the
1040
   * `*WithoutReplacement` case.
1041
   */
1042
  inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aByteLength) const
1043
0
  {
1044
0
    CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aByteLength)); // aByteLength counts input bytes; wrap the raw size_t result
1045
0
    if (max.value() == MaxValue<size_t>::value) { // the largest size_t is treated as "no valid length"
1046
0
      // Mark invalid by overflowing
1047
0
      max++;
1048
0
      MOZ_ASSERT(!max.isValid());
1049
0
    }
1050
0
    return max; // callers must check isValid() before using the value
1051
0
  }
1052
1053
  /**
1054
   * Incrementally decode a byte stream into UTF-16 with malformed sequences
1055
   * replaced with the REPLACEMENT CHARACTER.
1056
   *
1057
   * See the documentation of the class for documentation for `Decode*`
1058
   * methods collectively.
1059
   */
1060
  inline Tuple<uint32_t, size_t, size_t, bool>
1061
  DecodeToUTF16(Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast)
1062
0
  {
1063
0
    size_t srcRead = aSrc.Length(); // starts as the input length; passed by pointer below
1064
0
    size_t dstWritten = aDst.Length(); // starts as the output capacity; passed by pointer below
1065
0
    bool hadReplacements; // not initialized here; its address is passed to the call below
1066
0
    uint32_t result = decoder_decode_to_utf16(this,
1067
0
                                              aSrc.Elements(),
1068
0
                                              &srcRead,
1069
0
                                              aDst.Elements(),
1070
0
                                              &dstWritten,
1071
0
                                              aLast,
1072
0
                                              &hadReplacements);
1073
0
    return MakeTuple(result, srcRead, dstWritten, hadReplacements); // (status, read, written, replaced)
1074
0
  }
1075
1076
  /**
1077
   * Incrementally decode a byte stream into UTF-16 _without replacement_.
1078
   *
1079
   * See the documentation of the class for documentation for `Decode*`
1080
   * methods collectively.
1081
   */
1082
  inline Tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement(
1083
    Span<const uint8_t> aSrc,
1084
    Span<char16_t> aDst,
1085
    bool aLast)
1086
0
  {
1087
0
    size_t srcRead = aSrc.Length(); // starts as the input length; passed by pointer below
1088
0
    size_t dstWritten = aDst.Length(); // starts as the output capacity; passed by pointer below
1089
0
    uint32_t result = decoder_decode_to_utf16_without_replacement(
1090
0
      this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1091
0
    return MakeTuple(result, srcRead, dstWritten); // (status, read, written)
1092
0
  }
1093
1094
private:
1095
  Decoder() = delete; // no default construction from C++
1096
  Decoder(const Decoder&) = delete; // not copy-constructible
1097
  Decoder& operator=(const Decoder&) = delete; // not copy-assignable
1098
};
1099
1100
/**
1101
 * A converter that encodes a Unicode stream into bytes according to a
1102
 * character encoding in a streaming (incremental) manner.
1103
 *
1104
 * The various `Encode*` methods take an input buffer (`aSrc`) and an output
1105
 * buffer `aDst` both of which are caller-allocated. There are variants for
1106
 * both UTF-8 and UTF-16 input buffers.
1107
 *
1108
 * An `Encode*` method encodes characters from `aSrc` into bytes
1109
 * stored into `aDst` until one of the following three things happens:
1110
 *
1111
 * 1. An unmappable character is encountered (`*WithoutReplacement` variants
1112
 *    only).
1113
 *
1114
 * 2. The output buffer has been filled so near capacity that the encoder
1115
 *    cannot be sure that processing an additional character of input wouldn't
1116
 *    cause so much output that the output buffer would overflow.
1117
 *
1118
 * 3. All the input characters have been processed.
1119
 *
1120
 * The `Encode*` method then returns a tuple of a status indicating which one
1121
 * of the three reasons to return happened, how many input code units (`uint8_t`
1122
 * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
1123
 * how many output bytes were written, and in the case of the variants that
1124
 * perform replacement, a boolean indicating whether an unmappable
1125
 * character was replaced with a numeric character reference during the call.
1126
 *
1127
 * The number of bytes "written" is what's logically written. Garbage may be
1128
 * written in the output buffer beyond the point logically written to.
1129
 *
1130
 * In the case of the methods whose name ends with
1131
 * `*WithoutReplacement`, the status is a `uint32_t` whose possible values
1132
 * are an unmappable code point, `kOutputFull` and `kInputEmpty` (corresponding
1133
 * to the three cases listed above).
1134
 *
1135
 * In the case of methods whose name does not end with
1136
 * `*WithoutReplacement`, unmappable characters are automatically replaced
1137
 * with the corresponding numeric character references and unmappable
1138
 * characters do not cause the methods to return early.
1139
 *
1140
 * When encoding from UTF-8 without replacement, the methods are guaranteed
1141
 * not to return indicating that more output space is needed if the length
1142
 * of the output buffer is at least the length returned by
1143
 * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
1144
 * UTF-8 with replacement, the length of the output buffer that guarantees the
1145
 * methods not to return indicating that more output space is needed in the
1146
 * absence of unmappable characters is given by
1147
 * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
1148
 * UTF-16 without replacement, the methods are guaranteed not to return
1149
 * indicating that more output space is needed if the length of the output
1150
 * buffer is at least the length returned by
1151
 * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
1152
 * from UTF-16 with replacement, the length of the output buffer that
1153
 * guarantees the methods not to return indicating that more output space is
1154
 * needed in the absence of unmappable characters is given by
1155
 * `MaxBufferLengthFromUTF16IfNoUnmappables()`.
1156
 * When encoding with replacement, applications are not expected to size the
1157
 * buffer for the worst case ahead of time but to resize the buffer if there
1158
 * are unmappable characters. This is why max length queries are only available
1159
 * for the case where there are no unmappable characters.
1160
 *
1161
 * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
1162
 * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
1163
 * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
1164
 * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
1165
 * surrogate pairs are not split across input buffer boundaries.
1166
 *
1167
 * After an `Encode*` call returns, the output produced so far, taken as a
1168
 * whole from the start of the stream, is guaranteed to consist of a valid
1169
 * byte sequence in the target encoding. (I.e. the code unit sequence for a
1170
 * character is guaranteed not to be split across output buffers. However, due
1171
 * to the stateful nature of ISO-2022-JP, the stream needs to be considered
1172
 * from the start for it to be valid. For other encodings, the validity holds
1173
 * on a per-output buffer basis.)
1174
 *
1175
 * The boolean argument `aLast` indicates that the end of the stream is reached
1176
 * when all the characters in `aSrc` have been consumed. This argument is needed
1177
 * for ISO-2022-JP and is ignored for other encodings.
1178
 *
1179
 * An `Encoder` object can be used to incrementally encode a Unicode stream.
1180
 *
1181
 * During the processing of a single stream, the caller must call `Encode*`
1182
 * zero or more times with `aLast` set to `false` and then call `Encode*` at
1183
 * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
1184
 * the processing of the stream has ended. Otherwise, the caller must call
1185
 * `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
1186
 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
1187
 *
1188
 * Once the stream has ended, the `Encoder` object must not be used anymore.
1189
 * That is, you need to create another one to process another stream.
1190
 *
1191
 * When the encoder returns `kOutputFull` or the encoder returns an unmappable
1192
 * result and the caller does not wish to treat it as a fatal error, the input
1193
 * buffer `aSrc` may not have been completely consumed. In that case, the caller
1194
 * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next
1195
 * call.
1196
 *
1197
 * # Infinite loops
1198
 *
1199
 * When converting with a fixed-size output buffer whose size is too small to
1200
 * accommodate one character of output, an infinite loop ensues. When
1201
 * converting with a fixed-size output buffer, it generally makes sense to
1202
 * make the buffer fairly large (e.g. a couple of kilobytes).
1203
 */
1204
class Encoder final
1205
{
1206
public:
1207
0
  ~Encoder() {} // intentionally empty; the class-specific operator delete below calls encoder_free()
1208
1209
  static void operator delete(void* aEncoder)
1210
0
  {
1211
0
    encoder_free(reinterpret_cast<Encoder*>(aEncoder)); // deallocation is routed to encoder_free()
1212
0
  }
1213
1214
  /**
1215
   * The `Encoding` this `Encoder` is for.
1216
   */
1217
  inline NotNull<const mozilla::Encoding*> Encoding() const
1218
0
  {
1219
0
    return WrapNotNull(encoder_encoding(this)); // the pointer from encoder_encoding() is wrapped as NotNull
1220
0
  }
1221
1222
  /**
1223
   * Returns `true` if this is an ISO-2022-JP encoder that's not in the
1224
   * ASCII state and `false` otherwise.
1225
   */
1226
  inline bool HasPendingState() const
1227
0
  {
1228
0
    return encoder_has_pending_state(this); // forwards directly; no state is cached on the C++ side
1229
0
  }
1230
1231
  /**
1232
   * Query the worst-case output size when encoding from UTF-8 with
1233
   * replacement.
1234
   *
1235
   * Returns the size of the output buffer in bytes that will not overflow
1236
   * given the current state of the encoder and `aByteLength` number of
1237
   * additional input code units if there are no unmappable characters in
1238
   * the input.
1239
   */
1240
  inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables(
1241
    size_t aByteLength) const
1242
0
  {
1243
0
    CheckedInt<size_t> max( // wrap the raw size_t result
1244
0
      encoder_max_buffer_length_from_utf8_if_no_unmappables(this, aByteLength));
1245
0
    if (max.value() == MaxValue<size_t>::value) { // the largest size_t is treated as "no valid length"
1246
0
      // Mark invalid by overflowing
1247
0
      max++;
1248
0
      MOZ_ASSERT(!max.isValid());
1249
0
    }
1250
0
    return max; // callers must check isValid() before using the value
1251
0
  }
1252
1253
  /**
1254
   * Query the worst-case output size when encoding from UTF-8 without
1255
   * replacement.
1256
   *
1257
   * Returns the size of the output buffer in bytes that will not overflow
1258
   * given the current state of the encoder and `aByteLength` number of
1259
   * additional input code units.
1260
   */
1261
  inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement(
1262
    size_t aByteLength) const
1263
0
  {
1264
0
    CheckedInt<size_t> max( // wrap the raw size_t result
1265
0
      encoder_max_buffer_length_from_utf8_without_replacement(this,
1266
0
                                                              aByteLength));
1267
0
    if (max.value() == MaxValue<size_t>::value) { // the largest size_t is treated as "no valid length"
1268
0
      // Mark invalid by overflowing
1269
0
      max++;
1270
0
      MOZ_ASSERT(!max.isValid());
1271
0
    }
1272
0
    return max; // callers must check isValid() before using the value
1273
0
  }
1274
1275
  /**
1276
   * Incrementally encode into byte stream from UTF-8 with unmappable
1277
   * characters replaced with HTML (decimal) numeric character references.
1278
   *
1279
   * See the documentation of the class for documentation for `Encode*`
1280
   * methods collectively.
1281
   *
1282
   * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1283
   * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1284
   * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1285
   */
1286
  inline Tuple<uint32_t, size_t, size_t, bool>
1287
  EncodeFromUTF8(Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast)
1288
0
  {
1289
0
    size_t srcRead = aSrc.Length(); // starts as the input length; passed by pointer below
1290
0
    size_t dstWritten = aDst.Length(); // starts as the output capacity; passed by pointer below
1291
0
    bool hadReplacements; // not initialized here; its address is passed to the call below
1292
0
    uint32_t result = encoder_encode_from_utf8(this,
1293
0
                                               aSrc.Elements(),
1294
0
                                               &srcRead,
1295
0
                                               aDst.Elements(),
1296
0
                                               &dstWritten,
1297
0
                                               aLast,
1298
0
                                               &hadReplacements);
1299
0
    return MakeTuple(result, srcRead, dstWritten, hadReplacements); // (status, read, written, replaced)
1300
0
  }
1301
1302
  /**
1303
   * Incrementally encode into byte stream from UTF-8 _without replacement_.
1304
   *
1305
   * See the documentation of the class for documentation for `Encode*`
1306
   * methods collectively.
1307
   *
1308
   * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1309
   * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1310
   * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1311
   */
1312
  inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement(
1313
    Span<const uint8_t> aSrc,
1314
    Span<uint8_t> aDst,
1315
    bool aLast)
1316
0
  {
1317
0
    size_t srcRead = aSrc.Length(); // starts as the input length; passed by pointer below
1318
0
    size_t dstWritten = aDst.Length(); // starts as the output capacity; passed by pointer below
1319
0
    uint32_t result = encoder_encode_from_utf8_without_replacement(
1320
0
      this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1321
0
    return MakeTuple(result, srcRead, dstWritten); // (status, read, written)
1322
0
  }
1323
1324
  /**
1325
   * Query the worst-case output size when encoding from UTF-16 with
1326
   * replacement.
1327
   *
1328
   * Returns the size of the output buffer in bytes that will not overflow
1329
   * given the current state of the encoder and `aU16Length` number of
1330
   * additional input code units if there are no unmappable characters in
1331
   * the input.
1332
   */
1333
  inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables(
1334
    size_t aU16Length) const
1335
0
  {
1336
0
    CheckedInt<size_t> max( // wrap the raw size_t result
1337
0
      encoder_max_buffer_length_from_utf16_if_no_unmappables(this, aU16Length));
1338
0
    if (max.value() == MaxValue<size_t>::value) { // the largest size_t is treated as "no valid length"
1339
0
      // Mark invalid by overflowing
1340
0
      max++;
1341
0
      MOZ_ASSERT(!max.isValid());
1342
0
    }
1343
0
    return max; // callers must check isValid() before using the value
1344
0
  }
1345
1346
  /**
1347
   * Query the worst-case output size when encoding from UTF-16 without
1348
   * replacement.
1349
   *
1350
   * Returns the size of the output buffer in bytes that will not overflow
1351
   * given the current state of the encoder and `aU16Length` number of
1352
   * additional input code units.
1353
   */
1354
  inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement(
1355
    size_t aU16Length) const
1356
0
  {
1357
0
    CheckedInt<size_t> max( // wrap the raw size_t result
1358
0
      encoder_max_buffer_length_from_utf16_without_replacement(this,
1359
0
                                                               aU16Length));
1360
0
    if (max.value() == MaxValue<size_t>::value) { // the largest size_t is treated as "no valid length"
1361
0
      // Mark invalid by overflowing
1362
0
      max++;
1363
0
      MOZ_ASSERT(!max.isValid());
1364
0
    }
1365
0
    return max; // callers must check isValid() before using the value
1366
0
  }
1367
1368
  /**
1369
   * Incrementally encode into byte stream from UTF-16 with unmappable
1370
   * characters replaced with HTML (decimal) numeric character references.
1371
   *
1372
   * See the documentation of the class for documentation for `Encode*`
1373
   * methods collectively.
1374
   */
1375
  inline Tuple<uint32_t, size_t, size_t, bool>
1376
  EncodeFromUTF16(Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast)
1377
0
  {
1378
0
    size_t srcRead = aSrc.Length(); // starts as the input length; passed by pointer below
1379
0
    size_t dstWritten = aDst.Length(); // starts as the output capacity; passed by pointer below
1380
0
    bool hadReplacements; // not initialized here; its address is passed to the call below
1381
0
    uint32_t result = encoder_encode_from_utf16(this,
1382
0
                                                aSrc.Elements(),
1383
0
                                                &srcRead,
1384
0
                                                aDst.Elements(),
1385
0
                                                &dstWritten,
1386
0
                                                aLast,
1387
0
                                                &hadReplacements);
1388
0
    return MakeTuple(result, srcRead, dstWritten, hadReplacements); // (status, read, written, replaced)
1389
0
  }
1390
1391
  /**
1392
   * Incrementally encode into byte stream from UTF-16 _without replacement_.
1393
   *
1394
   * See the documentation of the class for documentation for `Encode*`
1395
   * methods collectively.
1396
   */
1397
  inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement(
1398
    Span<const char16_t> aSrc,
1399
    Span<uint8_t> aDst,
1400
    bool aLast)
1401
0
  {
1402
0
    size_t srcRead = aSrc.Length(); // starts as the input length; passed by pointer below
1403
0
    size_t dstWritten = aDst.Length(); // starts as the output capacity; passed by pointer below
1404
0
    uint32_t result = encoder_encode_from_utf16_without_replacement(
1405
0
      this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1406
0
    return MakeTuple(result, srcRead, dstWritten); // (status, read, written)
1407
0
  }
1408
1409
private:
1410
  Encoder() = delete; // no default construction from C++
1411
  Encoder(const Encoder&) = delete; // not copy-constructible
1412
  Encoder& operator=(const Encoder&) = delete; // not copy-assignable
1413
};
1414
1415
}; // namespace mozilla
1416
1417
#endif // mozilla_Encoding_h