Coverage Report

Created: 2025-12-31 06:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/include/simdutf/implementation.h
Line
Count
Source
1
#ifndef SIMDUTF_IMPLEMENTATION_H
2
#define SIMDUTF_IMPLEMENTATION_H
3
#if !defined(SIMDUTF_NO_THREADS)
4
  #include <atomic>
5
#endif
6
#include <string>
7
#ifdef SIMDUTF_INTERNAL_TESTS
8
  #include <vector>
9
#endif
10
#include "simdutf/common_defs.h"
11
#include "simdutf/compiler_check.h"
12
#include "simdutf/encoding_types.h"
13
#include "simdutf/error.h"
14
#include "simdutf/internal/isadetection.h"
15
16
#if SIMDUTF_SPAN
17
  #include <concepts>
18
  #include <type_traits>
19
  #include <span>
20
  #include <tuple>
21
#endif
22
#if SIMDUTF_CPLUSPLUS17
23
  #include <string_view>
24
#endif
25
// The following defines are conditionally enabled/disabled during amalgamation.
26
// By default all features are enabled, regular code shouldn't check them. Only
27
// when user code really relies on a selected subset, it's good to verify these
28
// flags, like:
29
//
30
//      #if !SIMDUTF_FEATURE_UTF16
31
//      #   error("Please amalgamate simdutf with UTF-16 support")
32
//      #endif
33
//
34
#define SIMDUTF_FEATURE_DETECT_ENCODING 1
35
#define SIMDUTF_FEATURE_ASCII 1
36
#define SIMDUTF_FEATURE_LATIN1 1
37
#define SIMDUTF_FEATURE_UTF8 1
38
#define SIMDUTF_FEATURE_UTF16 1
39
#define SIMDUTF_FEATURE_UTF32 1
40
#define SIMDUTF_FEATURE_BASE64 1
41
42
namespace simdutf {
43
44
/// Default line length for base64 encoding with lines.
constexpr size_t default_line_length = 76;
46
47
#if SIMDUTF_SPAN
48
/// helpers placed in namespace detail are not a part of the public API
49
namespace detail {
50
/**
51
 * matches a byte, in the many ways C++ allows. note that these
52
 * are all distinct types.
53
 */
54
template <typename T>
55
concept byte_like = std::is_same_v<T, std::byte> ||     //
56
                    std::is_same_v<T, char> ||          //
57
                    std::is_same_v<T, signed char> ||   //
58
                    std::is_same_v<T, unsigned char> || //
59
                    std::is_same_v<T, char8_t>;
60
61
template <typename T>
62
concept is_byte_like = byte_like<std::remove_cvref_t<T>>;
63
64
template <typename T>
65
concept is_pointer = std::is_pointer_v<T>;
66
67
/**
68
 * matches anything that behaves like std::span and points to character-like
69
 * data such as: std::byte, char, unsigned char, signed char, std::int8_t,
70
 * std::uint8_t
71
 */
72
template <typename T>
73
concept input_span_of_byte_like = requires(const T &t) {
74
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
75
  { t.data() } noexcept -> is_pointer;
76
  { *t.data() } noexcept -> is_byte_like;
77
};
78
79
template <typename T>
80
concept is_mutable = !std::is_const_v<std::remove_reference_t<T>>;
81
82
/**
83
 * like span_of_byte_like, but for an output span (intended to be written to)
84
 */
85
template <typename T>
86
concept output_span_of_byte_like = requires(T &t) {
87
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
88
  { t.data() } noexcept -> is_pointer;
89
  { *t.data() } noexcept -> is_byte_like;
90
  { *t.data() } noexcept -> is_mutable;
91
};
92
} // namespace detail
93
#endif
94
95
#if SIMDUTF_FEATURE_DETECT_ENCODING
96
/**
97
 * Autodetect the encoding of the input, a single encoding is recommended.
98
 * E.g., the function might return simdutf::encoding_type::UTF8,
99
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
100
 * simdutf::encoding_type::UTF32_LE.
101
 *
102
 * @param input the string to analyze.
103
 * @param length the length of the string in bytes.
104
 * @return the detected encoding type
105
 */
106
simdutf_warn_unused simdutf::encoding_type
107
autodetect_encoding(const char *input, size_t length) noexcept;
108
/// Overload for uint8_t buffers: views the bytes as char and forwards to the
/// const char * overload with the same length.
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
autodetect_encoding(const uint8_t *input, size_t length) noexcept {
  const char *const as_chars = reinterpret_cast<const char *>(input);
  return autodetect_encoding(as_chars, length);
}
112
  #if SIMDUTF_SPAN
113
/**
114
 * Autodetect the encoding of the input, a single encoding is recommended.
115
 * E.g., the function might return simdutf::encoding_type::UTF8,
116
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
117
 * simdutf::encoding_type::UTF32_LE.
118
 *
119
 * @param input the string to analyze. can be anything span-like that has a
120
 * data() and size() that points to character data: std::string,
121
 * std::string_view, std::vector<char>, std::span<const std::byte> etc.
122
 * @return the detected encoding type
123
 */
124
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
autodetect_encoding(
    const detail::input_span_of_byte_like auto &input) noexcept {
  // View the span's storage as char and defer to the pointer/length overload.
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  return autodetect_encoding(bytes, input.size());
}
130
  #endif // SIMDUTF_SPAN
131
132
/**
133
 * Autodetect the possible encodings of the input in one pass.
134
 * E.g., if the input might be UTF-16LE or UTF-8, this function returns
135
 * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE).
136
 *
137
 * Overridden by each implementation.
138
 *
139
 * @param input the string to analyze.
140
 * @param length the length of the string in bytes.
141
 * @return the detected encoding type
142
 */
143
simdutf_warn_unused int detect_encodings(const char *input,
144
                                         size_t length) noexcept;
145
/// Overload for uint8_t buffers: views the bytes as char and forwards to the
/// const char * overload with the same length.
simdutf_really_inline simdutf_warn_unused int
detect_encodings(const uint8_t *input, size_t length) noexcept {
  const char *const as_chars = reinterpret_cast<const char *>(input);
  return detect_encodings(as_chars, length);
}
149
  #if SIMDUTF_SPAN
150
/// Span-like overload: forwards data() (viewed as const char *) and size() to
/// the pointer/length overload.
simdutf_really_inline simdutf_warn_unused int
detect_encodings(const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  return detect_encodings(bytes, input.size());
}
155
  #endif // SIMDUTF_SPAN
156
#endif   // SIMDUTF_FEATURE_DETECT_ENCODING
157
158
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
159
/**
160
 * Validate the UTF-8 string. This function may be best when you expect
161
 * the input to be almost always valid. Otherwise, consider using
162
 * validate_utf8_with_errors.
163
 *
164
 * Overridden by each implementation.
165
 *
166
 * @param buf the UTF-8 string to validate.
167
 * @param len the length of the string in bytes.
168
 * @return true if and only if the string is valid UTF-8.
169
 */
170
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
171
  #if SIMDUTF_SPAN
172
/// Span-like overload: forwards data() (viewed as const char *) and size() to
/// the pointer/length overload.
simdutf_really_inline simdutf_warn_unused bool
validate_utf8(const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  return validate_utf8(bytes, input.size());
}
177
  #endif // SIMDUTF_SPAN
178
#endif   // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
179
180
#if SIMDUTF_FEATURE_UTF8
181
/**
182
 * Validate the UTF-8 string and stop on error.
183
 *
184
 * Overridden by each implementation.
185
 *
186
 * @param buf the UTF-8 string to validate.
187
 * @param len the length of the string in bytes.
188
 * @return a result pair struct (of type simdutf::result containing the two
189
 * fields error and count) with an error code and either position of the error
190
 * (in the input in code units) if any, or the number of code units validated if
191
 * successful.
192
 */
193
simdutf_warn_unused result validate_utf8_with_errors(const char *buf,
194
                                                     size_t len) noexcept;
195
  #if SIMDUTF_SPAN
196
/// Span-like overload: forwards data() (viewed as const char *) and size() to
/// the pointer/length overload.
simdutf_really_inline simdutf_warn_unused result validate_utf8_with_errors(
    const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  return validate_utf8_with_errors(bytes, input.size());
}
201
  #endif // SIMDUTF_SPAN
202
#endif   // SIMDUTF_FEATURE_UTF8
203
204
#if SIMDUTF_FEATURE_ASCII
205
/**
206
 * Validate the ASCII string.
207
 *
208
 * Overridden by each implementation.
209
 *
210
 * @param buf the ASCII string to validate.
211
 * @param len the length of the string in bytes.
212
 * @return true if and only if the string is valid ASCII.
213
 */
214
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept;
215
  #if SIMDUTF_SPAN
216
/// Span-like overload: forwards data() (viewed as const char *) and size() to
/// the pointer/length overload.
simdutf_really_inline simdutf_warn_unused bool
validate_ascii(const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  return validate_ascii(bytes, input.size());
}
221
  #endif // SIMDUTF_SPAN
222
223
/**
224
 * Validate the ASCII string and stop on error. It might be faster than
225
 * validate_ascii when an error is expected to occur early.
226
 *
227
 * Overridden by each implementation.
228
 *
229
 * @param buf the ASCII string to validate.
230
 * @param len the length of the string in bytes.
231
 * @return a result pair struct (of type simdutf::result containing the two
232
 * fields error and count) with an error code and either position of the error
233
 * (in the input in code units) if any, or the number of code units validated if
234
 * successful.
235
 */
236
simdutf_warn_unused result validate_ascii_with_errors(const char *buf,
237
                                                      size_t len) noexcept;
238
  #if SIMDUTF_SPAN
239
/// Span-like overload: forwards data() (viewed as const char *) and size() to
/// the pointer/length overload.
simdutf_really_inline simdutf_warn_unused result validate_ascii_with_errors(
    const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  return validate_ascii_with_errors(bytes, input.size());
}
244
  #endif // SIMDUTF_SPAN
245
#endif   // SIMDUTF_FEATURE_ASCII
246
247
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
248
/**
249
 * Validate the ASCII string as a UTF-16 sequence.
250
 * An UTF-16 sequence is considered an ASCII sequence
251
 * if it could be converted to an ASCII string losslessly.
252
 *
253
 * Overridden by each implementation.
254
 *
255
 * @param buf the UTF-16 string to validate.
256
 * @param len the length of the string in 2-byte code units (char16_t).
257
 * @return true if and only if the string is valid ASCII.
258
 */
259
simdutf_warn_unused bool validate_utf16_as_ascii(const char16_t *buf,
260
                                                 size_t len) noexcept;
261
  #if SIMDUTF_SPAN
262
/// std::span overload: forwards the span's pointer and element count to the
/// pointer/length overload.
simdutf_really_inline simdutf_warn_unused bool
validate_utf16_as_ascii(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16_as_ascii(units, unit_count);
}
266
  #endif // SIMDUTF_SPAN
267
268
/**
269
 * Validate the ASCII string as a UTF-16BE sequence.
270
 * An UTF-16 sequence is considered an ASCII sequence
271
 * if it could be converted to an ASCII string losslessly.
272
 *
273
 * Overridden by each implementation.
274
 *
275
 * @param buf the UTF-16BE string to validate.
276
 * @param len the length of the string in 2-byte code units (char16_t).
277
 * @return true if and only if the string is valid ASCII.
278
 */
279
simdutf_warn_unused bool validate_utf16be_as_ascii(const char16_t *buf,
280
                                                   size_t len) noexcept;
281
  #if SIMDUTF_SPAN
282
/// std::span overload: forwards the span's pointer and element count to the
/// pointer/length overload.
simdutf_really_inline simdutf_warn_unused bool
validate_utf16be_as_ascii(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16be_as_ascii(units, unit_count);
}
286
  #endif // SIMDUTF_SPAN
287
288
/**
289
 * Validate the ASCII string as a UTF-16LE sequence.
290
 * An UTF-16 sequence is considered an ASCII sequence
291
 * if it could be converted to an ASCII string losslessly.
292
 *
293
 * Overridden by each implementation.
294
 *
295
 * @param buf the UTF-16LE string to validate.
296
 * @param len the length of the string in 2-byte code units (char16_t).
297
 * @return true if and only if the string is valid ASCII.
298
 */
299
simdutf_warn_unused bool validate_utf16le_as_ascii(const char16_t *buf,
300
                                                   size_t len) noexcept;
301
  #if SIMDUTF_SPAN
302
/// std::span overload: forwards the span's pointer and element count to the
/// pointer/length overload.
simdutf_really_inline simdutf_warn_unused bool
validate_utf16le_as_ascii(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16le_as_ascii(units, unit_count);
}
306
  #endif // SIMDUTF_SPAN
307
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
308
309
#if SIMDUTF_FEATURE_UTF16
310
/**
311
 * Using native endianness; Validate the UTF-16 string.
312
 * This function may be best when you expect the input to be almost always
313
 * valid. Otherwise, consider using validate_utf16_with_errors.
314
 *
315
 * Overridden by each implementation.
316
 *
317
 * This function is not BOM-aware.
318
 *
319
 * @param buf the UTF-16 string to validate.
320
 * @param len the length of the string in number of 2-byte code units
321
 * (char16_t).
322
 * @return true if and only if the string is valid UTF-16.
323
 */
324
simdutf_warn_unused bool validate_utf16(const char16_t *buf,
325
                                        size_t len) noexcept;
326
  #if SIMDUTF_SPAN
327
/// std::span overload: forwards the span's pointer and element count to the
/// pointer/length overload.
simdutf_really_inline simdutf_warn_unused bool
validate_utf16(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16(units, unit_count);
}
331
  #endif // SIMDUTF_SPAN
332
#endif   // SIMDUTF_FEATURE_UTF16
333
334
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
335
/**
336
 * Validate the UTF-16LE string. This function may be best when you expect
337
 * the input to be almost always valid. Otherwise, consider using
338
 * validate_utf16le_with_errors.
339
 *
340
 * Overridden by each implementation.
341
 *
342
 * This function is not BOM-aware.
343
 *
344
 * @param buf the UTF-16LE string to validate.
345
 * @param len the length of the string in number of 2-byte code units
346
 * (char16_t).
347
 * @return true if and only if the string is valid UTF-16LE.
348
 */
349
simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
350
                                          size_t len) noexcept;
351
  #if SIMDUTF_SPAN
352
/// std::span overload: forwards the span's pointer and element count to the
/// pointer/length overload.
simdutf_really_inline simdutf_warn_unused bool
validate_utf16le(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16le(units, unit_count);
}
356
  #endif // SIMDUTF_SPAN
357
#endif   // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
358
359
#if SIMDUTF_FEATURE_UTF16
360
/**
361
 * Validate the UTF-16BE string. This function may be best when you expect
362
 * the input to be almost always valid. Otherwise, consider using
363
 * validate_utf16be_with_errors.
364
 *
365
 * Overridden by each implementation.
366
 *
367
 * This function is not BOM-aware.
368
 *
369
 * @param buf the UTF-16BE string to validate.
370
 * @param len the length of the string in number of 2-byte code units
371
 * (char16_t).
372
 * @return true if and only if the string is valid UTF-16BE.
373
 */
374
simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
375
                                          size_t len) noexcept;
376
  #if SIMDUTF_SPAN
377
/// std::span overload: forwards the span's pointer and element count to the
/// pointer/length overload.
simdutf_really_inline simdutf_warn_unused bool
validate_utf16be(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16be(units, unit_count);
}
381
  #endif // SIMDUTF_SPAN
382
383
/**
384
 * Using native endianness; Validate the UTF-16 string and stop on error.
385
 * It might be faster than validate_utf16 when an error is expected to occur
386
 * early.
387
 *
388
 * Overridden by each implementation.
389
 *
390
 * This function is not BOM-aware.
391
 *
392
 * @param buf the UTF-16 string to validate.
393
 * @param len the length of the string in number of 2-byte code units
394
 * (char16_t).
395
 * @return a result pair struct (of type simdutf::result containing the two
396
 * fields error and count) with an error code and either position of the error
397
 * (in the input in code units) if any, or the number of code units validated if
398
 * successful.
399
 */
400
simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf,
401
                                                      size_t len) noexcept;
402
  #if SIMDUTF_SPAN
403
/// std::span overload: forwards the span's pointer and element count to the
/// pointer/length overload.
simdutf_really_inline simdutf_warn_unused result
validate_utf16_with_errors(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16_with_errors(units, unit_count);
}
407
  #endif // SIMDUTF_SPAN
408
409
/**
410
 * Validate the UTF-16LE string and stop on error. It might be faster than
411
 * validate_utf16le when an error is expected to occur early.
412
 *
413
 * Overridden by each implementation.
414
 *
415
 * This function is not BOM-aware.
416
 *
417
 * @param buf the UTF-16LE string to validate.
418
 * @param len the length of the string in number of 2-byte code units
419
 * (char16_t).
420
 * @return a result pair struct (of type simdutf::result containing the two
421
 * fields error and count) with an error code and either position of the error
422
 * (in the input in code units) if any, or the number of code units validated if
423
 * successful.
424
 */
425
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf,
426
                                                        size_t len) noexcept;
427
  #if SIMDUTF_SPAN
428
/// std::span overload: forwards the span's pointer and element count to the
/// pointer/length overload.
simdutf_really_inline simdutf_warn_unused result
validate_utf16le_with_errors(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16le_with_errors(units, unit_count);
}
432
  #endif // SIMDUTF_SPAN
433
434
/**
435
 * Validate the UTF-16BE string and stop on error. It might be faster than
436
 * validate_utf16be when an error is expected to occur early.
437
 *
438
 * Overridden by each implementation.
439
 *
440
 * This function is not BOM-aware.
441
 *
442
 * @param buf the UTF-16BE string to validate.
443
 * @param len the length of the string in number of 2-byte code units
444
 * (char16_t).
445
 * @return a result pair struct (of type simdutf::result containing the two
446
 * fields error and count) with an error code and either position of the error
447
 * (in the input in code units) if any, or the number of code units validated if
448
 * successful.
449
 */
450
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf,
451
                                                        size_t len) noexcept;
452
  #if SIMDUTF_SPAN
453
/// std::span overload: forwards the span's pointer and element count to the
/// pointer/length overload.
simdutf_really_inline simdutf_warn_unused result
validate_utf16be_with_errors(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16be_with_errors(units, unit_count);
}
457
  #endif // SIMDUTF_SPAN
458
459
/**
460
 * Fixes an ill-formed UTF-16LE string by replacing mismatched surrogates with
461
 * the Unicode replacement character U+FFFD. If input and output points to
462
 * different memory areas, the procedure copies string, and it's expected that
463
 * output memory is at least as big as the input. It's also possible to set
464
 * input equal output, that makes replacements an in-place operation.
465
 *
466
 * @param input the UTF-16LE string to correct.
467
 * @param len the length of the string in number of 2-byte code units
468
 * (char16_t).
469
 * @param output the output buffer.
470
 */
471
void to_well_formed_utf16le(const char16_t *input, size_t len,
472
                            char16_t *output) noexcept;
473
  #if SIMDUTF_SPAN
474
/// std::span overload: forwards input.data(), input.size() and output.data()
/// to the pointer-based overload. output.size() is not consulted here, so the
/// caller has to supply an output at least as large as the input.
simdutf_really_inline void
to_well_formed_utf16le(std::span<const char16_t> input,
                       std::span<char16_t> output) noexcept {
  const char16_t *const source = input.data();
  const size_t unit_count = input.size();
  char16_t *const destination = output.data();
  to_well_formed_utf16le(source, unit_count, destination);
}
479
  #endif // SIMDUTF_SPAN
480
481
/**
482
 * Fixes an ill-formed UTF-16BE string by replacing mismatched surrogates with
483
 * the Unicode replacement character U+FFFD. If input and output points to
484
 * different memory areas, the procedure copies string, and it's expected that
485
 * output memory is at least as big as the input. It's also possible to set
486
 * input equal output, that makes replacements an in-place operation.
487
 *
488
 * @param input the UTF-16BE string to correct.
489
 * @param len the length of the string in number of 2-byte code units
490
 * (char16_t).
491
 * @param output the output buffer.
492
 */
493
void to_well_formed_utf16be(const char16_t *input, size_t len,
494
                            char16_t *output) noexcept;
495
  #if SIMDUTF_SPAN
496
/// std::span overload: forwards input.data(), input.size() and output.data()
/// to the pointer-based overload. output.size() is not consulted here, so the
/// caller has to supply an output at least as large as the input.
simdutf_really_inline void
to_well_formed_utf16be(std::span<const char16_t> input,
                       std::span<char16_t> output) noexcept {
  const char16_t *const source = input.data();
  const size_t unit_count = input.size();
  char16_t *const destination = output.data();
  to_well_formed_utf16be(source, unit_count, destination);
}
501
  #endif // SIMDUTF_SPAN
502
503
/**
504
 * Fixes an ill-formed UTF-16 string by replacing mismatched surrogates with the
505
 * Unicode replacement character U+FFFD. If input and output points to different
506
 * memory areas, the procedure copies string, and it's expected that output
507
 * memory is at least as big as the input. It's also possible to set input equal
508
 * output, that makes replacements an in-place operation.
509
 *
510
 * @param input the UTF-16 string to correct.
511
 * @param len the length of the string in number of 2-byte code units
512
 * (char16_t).
513
 * @param output the output buffer.
514
 */
515
void to_well_formed_utf16(const char16_t *input, size_t len,
516
                          char16_t *output) noexcept;
517
  #if SIMDUTF_SPAN
518
/// std::span overload: forwards input.data(), input.size() and output.data()
/// to the pointer-based overload. output.size() is not consulted here, so the
/// caller has to supply an output at least as large as the input.
simdutf_really_inline void
to_well_formed_utf16(std::span<const char16_t> input,
                     std::span<char16_t> output) noexcept {
  const char16_t *const source = input.data();
  const size_t unit_count = input.size();
  char16_t *const destination = output.data();
  to_well_formed_utf16(source, unit_count, destination);
}
523
  #endif // SIMDUTF_SPAN
524
525
#endif // SIMDUTF_FEATURE_UTF16
526
527
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
528
/**
529
 * Validate the UTF-32 string. This function may be best when you expect
530
 * the input to be almost always valid. Otherwise, consider using
531
 * validate_utf32_with_errors.
532
 *
533
 * Overridden by each implementation.
534
 *
535
 * This function is not BOM-aware.
536
 *
537
 * @param buf the UTF-32 string to validate.
538
 * @param len the length of the string in number of 4-byte code units
539
 * (char32_t).
540
 * @return true if and only if the string is valid UTF-32.
541
 */
542
simdutf_warn_unused bool validate_utf32(const char32_t *buf,
543
                                        size_t len) noexcept;
544
  #if SIMDUTF_SPAN
545
/// std::span overload: forwards the span's pointer and element count to the
/// pointer/length overload.
simdutf_really_inline simdutf_warn_unused bool
validate_utf32(std::span<const char32_t> input) noexcept {
  const char32_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf32(units, unit_count);
}
549
  #endif // SIMDUTF_SPAN
550
#endif   // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
551
552
#if SIMDUTF_FEATURE_UTF32
553
/**
554
 * Validate the UTF-32 string and stop on error. It might be faster than
555
 * validate_utf32 when an error is expected to occur early.
556
 *
557
 * Overridden by each implementation.
558
 *
559
 * This function is not BOM-aware.
560
 *
561
 * @param buf the UTF-32 string to validate.
562
 * @param len the length of the string in number of 4-byte code units
563
 * (char32_t).
564
 * @return a result pair struct (of type simdutf::result containing the two
565
 * fields error and count) with an error code and either position of the error
566
 * (in the input in code units) if any, or the number of code units validated if
567
 * successful.
568
 */
569
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf,
570
                                                      size_t len) noexcept;
571
  #if SIMDUTF_SPAN
572
/// std::span overload: forwards the span's pointer and element count to the
/// pointer/length overload.
simdutf_really_inline simdutf_warn_unused result
validate_utf32_with_errors(std::span<const char32_t> input) noexcept {
  const char32_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf32_with_errors(units, unit_count);
}
576
  #endif // SIMDUTF_SPAN
577
#endif   // SIMDUTF_FEATURE_UTF32
578
579
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
580
/**
581
 * Convert Latin1 string into UTF-8 string.
582
 *
583
 * This function is suitable to work with inputs from untrusted sources.
584
 *
585
 * @param input         the Latin1 string to convert
586
 * @param length        the length of the string in bytes
587
 * @param utf8_output   the pointer to buffer that can hold conversion result
588
 * @return the number of written char; 0 if conversion is not possible
589
 */
590
simdutf_warn_unused size_t convert_latin1_to_utf8(const char *input,
591
                                                  size_t length,
592
                                                  char *utf8_output) noexcept;
593
  #if SIMDUTF_SPAN
594
/**
 * Span-like overload of convert_latin1_to_utf8.
 *
 * Forwards the input's data()/size() and the output's data() to the
 * pointer-based overload. utf8_output.size() is not consulted by this wrapper.
 *
 * @param latin1_input  any span-like object over byte-like elements holding
 * the Latin1 text
 * @param utf8_output   any writable span-like object over byte-like elements
 * that receives the UTF-8 result
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf8(
    const detail::input_span_of_byte_like auto &latin1_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  // Both pointers are cast to the char types of the pointer-based overload:
  // the concepts also admit std::byte, signed/unsigned char and char8_t
  // elements, and those pointers do not convert to char * implicitly.
  return convert_latin1_to_utf8(
      reinterpret_cast<const char *>(latin1_input.data()), latin1_input.size(),
      reinterpret_cast<char *>(utf8_output.data()));
}
601
  #endif // SIMDUTF_SPAN
602
603
/**
604
 * Convert Latin1 string into UTF-8 string with output limit.
605
 *
606
 * This function is suitable to work with inputs from untrusted sources.
607
 *
608
 * We write as many characters as possible.
609
 *
610
 * @param input         the Latin1 string to convert
611
 * @param length        the length of the string in bytes
612
 * @param utf8_output   the pointer to buffer that can hold conversion result
613
 * @param utf8_len      the maximum output length
614
 * @return the number of written char; 0 if conversion is not possible
615
 */
616
simdutf_warn_unused size_t
617
convert_latin1_to_utf8_safe(const char *input, size_t length, char *utf8_output,
618
                            size_t utf8_len) noexcept;
619
  #if SIMDUTF_SPAN
620
/**
 * Span-like overload of convert_latin1_to_utf8_safe.
 *
 * Forwards the input's data()/size() and the output's data()/size() to the
 * pointer-based overload; utf8_output.size() is passed as the output limit.
 *
 * @param input         any span-like object over byte-like elements holding
 * the Latin1 text
 * @param utf8_output   any writable span-like object over byte-like elements
 * that receives the UTF-8 result
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf8_safe(
    const detail::input_span_of_byte_like auto &input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  // implementation note: outputspan is a forwarding ref to avoid copying and
  // allow both lvalues and rvalues. std::span can be copied without problems,
  // but std::vector should not, and this function should accept both. it will
  // allow using an owning rvalue ref (example: passing a temporary std::string)
  // as output, but the user will quickly find out that he has no way of getting
  // the data out of the object in that case.
  //
  // The input pointer is cast as well: the input concept also admits
  // std::byte, signed/unsigned char and char8_t elements, whose pointers do
  // not convert to const char * implicitly.
  return convert_latin1_to_utf8_safe(
      reinterpret_cast<const char *>(input.data()), input.size(),
      reinterpret_cast<char *>(utf8_output.data()), utf8_output.size());
}
633
  #endif // SIMDUTF_SPAN
634
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
635
636
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
637
/**
638
 * Convert Latin1 string into UTF-16LE string.
639
 *
640
 * This function is suitable to work with inputs from untrusted sources.
641
 *
642
 * @param input         the Latin1 string to convert
643
 * @param length        the length of the string in bytes
644
 * @param utf16_output  the pointer to buffer that can hold conversion result
645
 * @return the number of written char16_t; 0 if conversion is not possible
646
 */
647
simdutf_warn_unused size_t convert_latin1_to_utf16le(
648
    const char *input, size_t length, char16_t *utf16_output) noexcept;
649
  #if SIMDUTF_SPAN
650
/// Span overload: forwards the input's data() (viewed as const char *) and
/// size() plus the output's data() to the pointer-based overload.
/// utf16_output.size() is not consulted by this wrapper.
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf16le(
    const detail::input_span_of_byte_like auto &latin1_input,
    std::span<char16_t> utf16_output) noexcept {
  const char *const bytes =
      reinterpret_cast<const char *>(latin1_input.data());
  char16_t *const destination = utf16_output.data();
  return convert_latin1_to_utf16le(bytes, latin1_input.size(), destination);
}
657
  #endif // SIMDUTF_SPAN
658
659
/**
660
 * Convert Latin1 string into UTF-16BE string.
661
 *
662
 * This function is suitable to work with inputs from untrusted sources.
663
 *
664
 * @param input         the Latin1 string to convert
665
 * @param length        the length of the string in bytes
666
 * @param utf16_output  the pointer to buffer that can hold conversion result
667
 * @return the number of written char16_t; 0 if conversion is not possible
668
 */
669
simdutf_warn_unused size_t convert_latin1_to_utf16be(
670
    const char *input, size_t length, char16_t *utf16_output) noexcept;
671
  #if SIMDUTF_SPAN
672
/// Span overload: forwards the input's data() (viewed as const char *) and
/// size() plus the output's data() to the pointer-based overload.
/// output.size() is not consulted by this wrapper.
simdutf_really_inline simdutf_warn_unused size_t
convert_latin1_to_utf16be(const detail::input_span_of_byte_like auto &input,
                          std::span<char16_t> output) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  char16_t *const destination = output.data();
  return convert_latin1_to_utf16be(bytes, input.size(), destination);
}
678
  #endif // SIMDUTF_SPAN
679
/**
680
 * Compute the number of bytes that this UTF-16 string would require in Latin1
681
 * format.
682
 *
683
 * @param length        the length of the string in 2-byte code units (char16_t)
684
 * @return the length of the string in Latin1 code units (char) required to
685
 * encode the UTF-16 string as Latin1
686
 */
687
simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
688
689
/**
690
 * Compute the number of code units that this Latin1 string would require in
691
 * UTF-16 format.
692
 *
693
 * @param length        the length of the string in Latin1 code units (char)
694
 * @return the length of the string in 2-byte code units (char16_t) required to
695
 * encode the Latin1 string as UTF-16
696
 */
697
simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept;
698
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
699
700
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
701
/**
702
 * Convert Latin1 string into UTF-32 string.
703
 *
704
 * This function is suitable to work with inputs from untrusted sources.
705
 *
706
 * @param input         the Latin1 string to convert
707
 * @param length        the length of the string in bytes
708
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
709
 * @return the number of written char32_t; 0 if conversion is not possible
710
 */
711
simdutf_warn_unused size_t convert_latin1_to_utf32(
712
    const char *input, size_t length, char32_t *utf32_buffer) noexcept;
713
  #if SIMDUTF_SPAN
714
/// Span overload: forwards the input's data() (viewed as const char *) and
/// size() plus the output's data() to the pointer-based overload.
/// utf32_output.size() is not consulted by this wrapper.
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf32(
    const detail::input_span_of_byte_like auto &latin1_input,
    std::span<char32_t> utf32_output) noexcept {
  const char *const bytes =
      reinterpret_cast<const char *>(latin1_input.data());
  char32_t *const destination = utf32_output.data();
  return convert_latin1_to_utf32(bytes, latin1_input.size(), destination);
}
721
  #endif // SIMDUTF_SPAN
722
#endif   // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
723
724
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
725
/**
726
 * Convert possibly broken UTF-8 string into latin1 string.
727
 *
728
 * During the conversion also validation of the input string is done.
729
 * This function is suitable to work with inputs from untrusted sources.
730
 *
731
 * @param input         the UTF-8 string to convert
732
 * @param length        the length of the string in bytes
733
 * @param latin1_output  the pointer to buffer that can hold conversion result
734
 * @return the number of written char; 0 if the input was not valid UTF-8 string
735
 * or if it cannot be represented as Latin1
736
 */
737
simdutf_warn_unused size_t convert_utf8_to_latin1(const char *input,
738
                                                  size_t length,
739
                                                  char *latin1_output) noexcept;
740
  #if SIMDUTF_SPAN
741
/// Span-like overload: forwards the input's data()/size() and the output's
/// data(), both viewed as char pointers, to the pointer-based overload.
/// output.size() is not consulted by this wrapper.
simdutf_really_inline simdutf_warn_unused size_t convert_utf8_to_latin1(
    const detail::input_span_of_byte_like auto &input,
    detail::output_span_of_byte_like auto &&output) noexcept {
  const char *const source = reinterpret_cast<const char *>(input.data());
  char *const destination = reinterpret_cast<char *>(output.data());
  return convert_utf8_to_latin1(source, input.size(), destination);
}
748
  #endif // SIMDUTF_SPAN
749
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
750
751
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
752
/**
753
 * Using native endianness, convert possibly broken UTF-8 string into a UTF-16
754
 * string.
755
 *
756
 * During the conversion also validation of the input string is done.
757
 * This function is suitable to work with inputs from untrusted sources.
758
 *
759
 * @param input         the UTF-8 string to convert
760
 * @param length        the length of the string in bytes
761
 * @param utf16_output  the pointer to buffer that can hold conversion result
762
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
763
 * string
764
 */
765
simdutf_warn_unused size_t convert_utf8_to_utf16(
766
    const char *input, size_t length, char16_t *utf16_output) noexcept;
767
  #if SIMDUTF_SPAN
768
simdutf_really_inline simdutf_warn_unused size_t
convert_utf8_to_utf16(const detail::input_span_of_byte_like auto &input,
                      std::span<char16_t> output) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload. Only output.data() is forwarded;
  // the output span's size is not passed along.
  const char *const utf8_src = reinterpret_cast<const char *>(input.data());
  const size_t utf8_len = input.size();
  return convert_utf8_to_utf16(utf8_src, utf8_len, output.data());
}
774
  #endif // SIMDUTF_SPAN
775
776
/**
777
 * Compute the number of bytes that this UTF-16LE string would require in UTF-8
778
 * format even when the UTF-16LE content contains mismatched surrogates
779
 * that have to be replaced by the replacement character (0xFFFD).
780
 *
781
 * @param input         the UTF-16LE string to convert
782
 * @param length        the length of the string in 2-byte code units (char16_t)
783
 * @return a result pair struct (of type simdutf::result containing the two
784
 * fields error and count) where the count is the number of bytes required to
785
 * encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS or
786
 * SURROGATE. The count is correct regardless of the error field.
787
 * When SURROGATE is returned, it does not indicate an error in the case of this
788
 * function: it indicates that at least one surrogate has been encountered: the
789
 * surrogates may be matched or not (thus this function does not validate). If
790
 * the returned error code is SUCCESS, then the input contains no surrogate, is
791
 * in the Basic Multilingual Plane, and is necessarily valid.
792
 */
793
simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(
794
    const char16_t *input, size_t length) noexcept;
795
  #if SIMDUTF_SPAN
796
simdutf_really_inline simdutf_warn_unused result
utf8_length_from_utf16le_with_replacement(
    std::span<const char16_t> valid_utf16_input) noexcept {
  // Span convenience overload: unpack the span into pointer + length (in
  // char16_t code units) and delegate to the pointer-based overload.
  const char16_t *const utf16_units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return utf8_length_from_utf16le_with_replacement(utf16_units, unit_count);
}
802
  #endif // SIMDUTF_SPAN
803
804
/**
805
 * Compute the number of bytes that this UTF-16BE string would require in UTF-8
806
 * format even when the UTF-16BE content contains mismatched surrogates
807
 * that have to be replaced by the replacement character (0xFFFD).
808
 *
809
 * @param input         the UTF-16BE string to convert
810
 * @param length        the length of the string in 2-byte code units (char16_t)
811
 * @return a result pair struct (of type simdutf::result containing the two
812
 * fields error and count) where the count is the number of bytes required to
813
 * encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS or
814
 * SURROGATE. The count is correct regardless of the error field.
815
 * When SURROGATE is returned, it does not indicate an error in the case of this
816
 * function: it indicates that at least one surrogate has been encountered: the
817
 * surrogates may be matched or not (thus this function does not validate). If
818
 * the returned error code is SUCCESS, then the input contains no surrogate, is
819
 * in the Basic Multilingual Plane, and is necessarily valid.
820
 */
821
simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(
822
    const char16_t *input, size_t length) noexcept;
823
  #if SIMDUTF_SPAN
824
simdutf_really_inline simdutf_warn_unused result
utf8_length_from_utf16be_with_replacement(
    std::span<const char16_t> valid_utf16_input) noexcept {
  // Span convenience overload: unpack the span into pointer + length (in
  // char16_t code units) and delegate to the pointer-based overload.
  const char16_t *const utf16_units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return utf8_length_from_utf16be_with_replacement(utf16_units, unit_count);
}
830
  #endif // SIMDUTF_SPAN
831
832
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
833
834
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
835
/**
836
 * Using native endianness, convert a Latin1 string into a UTF-16 string.
837
 *
838
 * @param input         the Latin1 string to convert
839
 * @param length        the length of the string in bytes
840
 * @param utf16_output  the pointer to buffer that can hold conversion result
841
 * @return the number of written char16_t.
842
 */
843
simdutf_warn_unused size_t convert_latin1_to_utf16(
844
    const char *input, size_t length, char16_t *utf16_output) noexcept;
845
  #if SIMDUTF_SPAN
846
simdutf_really_inline simdutf_warn_unused size_t
convert_latin1_to_utf16(const detail::input_span_of_byte_like auto &input,
                        std::span<char16_t> output) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload. Only output.data() is forwarded;
  // the output span's size is not passed along.
  const char *const latin1_src = reinterpret_cast<const char *>(input.data());
  const size_t latin1_len = input.size();
  return convert_latin1_to_utf16(latin1_src, latin1_len, output.data());
}
852
  #endif // SIMDUTF_SPAN
853
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
854
855
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
856
/**
857
 * Convert possibly broken UTF-8 string into UTF-16LE string.
858
 *
859
 * During the conversion also validation of the input string is done.
860
 * This function is suitable to work with inputs from untrusted sources.
861
 *
862
 * @param input         the UTF-8 string to convert
863
 * @param length        the length of the string in bytes
864
 * @param utf16_output  the pointer to buffer that can hold conversion result
865
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
866
 * string
867
 */
868
simdutf_warn_unused size_t convert_utf8_to_utf16le(
869
    const char *input, size_t length, char16_t *utf16_output) noexcept;
870
  #if SIMDUTF_SPAN
871
simdutf_really_inline simdutf_warn_unused size_t
convert_utf8_to_utf16le(const detail::input_span_of_byte_like auto &utf8_input,
                        std::span<char16_t> utf16_output) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload. Only utf16_output.data() is
  // forwarded; the output span's size is not passed along.
  const char *const utf8_chars =
      reinterpret_cast<const char *>(utf8_input.data());
  const size_t utf8_length = utf8_input.size();
  return convert_utf8_to_utf16le(utf8_chars, utf8_length, utf16_output.data());
}
878
  #endif // SIMDUTF_SPAN
879
880
/**
881
 * Convert possibly broken UTF-8 string into UTF-16BE string.
882
 *
883
 * During the conversion also validation of the input string is done.
884
 * This function is suitable to work with inputs from untrusted sources.
885
 *
886
 * @param input         the UTF-8 string to convert
887
 * @param length        the length of the string in bytes
888
 * @param utf16_output  the pointer to buffer that can hold conversion result
889
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
890
 * string
891
 */
892
simdutf_warn_unused size_t convert_utf8_to_utf16be(
893
    const char *input, size_t length, char16_t *utf16_output) noexcept;
894
  #if SIMDUTF_SPAN
895
simdutf_really_inline simdutf_warn_unused size_t
convert_utf8_to_utf16be(const detail::input_span_of_byte_like auto &utf8_input,
                        std::span<char16_t> utf16_output) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload. Only utf16_output.data() is
  // forwarded; the output span's size is not passed along.
  const char *const utf8_chars =
      reinterpret_cast<const char *>(utf8_input.data());
  const size_t utf8_length = utf8_input.size();
  return convert_utf8_to_utf16be(utf8_chars, utf8_length, utf16_output.data());
}
902
  #endif // SIMDUTF_SPAN
903
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
904
905
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
906
/**
907
 * Convert possibly broken UTF-8 string into latin1 string with errors.
908
 * If the string cannot be represented as Latin1, an error
909
 * code is returned.
910
 *
911
 * During the conversion also validation of the input string is done.
912
 * This function is suitable to work with inputs from untrusted sources.
913
 *
914
 * @param input         the UTF-8 string to convert
915
 * @param length        the length of the string in bytes
916
 * @param latin1_output  the pointer to buffer that can hold conversion result
917
 * @return a result pair struct (of type simdutf::result containing the two
918
 * fields error and count) with an error code and either position of the error
919
 * (in the input in code units) if any, or the number of code units validated if
920
 * successful.
921
 */
922
simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
923
    const char *input, size_t length, char *latin1_output) noexcept;
924
  #if SIMDUTF_SPAN
925
simdutf_really_inline simdutf_warn_unused result
convert_utf8_to_latin1_with_errors(
    const detail::input_span_of_byte_like auto &utf8_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  // Span convenience overload: both spans are byte-like, so view them as char
  // buffers and delegate to the pointer-based overload. latin1_output.size()
  // is not forwarded.
  const char *const utf8_chars =
      reinterpret_cast<const char *>(utf8_input.data());
  char *const latin1_chars = reinterpret_cast<char *>(latin1_output.data());
  return convert_utf8_to_latin1_with_errors(utf8_chars, utf8_input.size(),
                                            latin1_chars);
}
933
  #endif // SIMDUTF_SPAN
934
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
935
936
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
937
/**
938
 * Using native endianness, convert possibly broken UTF-8 string into UTF-16
939
 * string and stop on error.
940
 *
941
 * During the conversion also validation of the input string is done.
942
 * This function is suitable to work with inputs from untrusted sources.
943
 *
944
 * @param input         the UTF-8 string to convert
945
 * @param length        the length of the string in bytes
946
 * @param utf16_output  the pointer to buffer that can hold conversion result
947
 * @return a result pair struct (of type simdutf::result containing the two
948
 * fields error and count) with an error code and either position of the error
949
 * (in the input in code units) if any, or the number of char16_t written if
950
 * successful.
951
 */
952
simdutf_warn_unused result convert_utf8_to_utf16_with_errors(
953
    const char *input, size_t length, char16_t *utf16_output) noexcept;
954
  #if SIMDUTF_SPAN
955
simdutf_really_inline simdutf_warn_unused result
convert_utf8_to_utf16_with_errors(
    const detail::input_span_of_byte_like auto &utf8_input,
    std::span<char16_t> utf16_output) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload. Only utf16_output.data() is
  // forwarded; the output span's size is not passed along.
  const char *const utf8_chars =
      reinterpret_cast<const char *>(utf8_input.data());
  const size_t utf8_length = utf8_input.size();
  return convert_utf8_to_utf16_with_errors(utf8_chars, utf8_length,
                                           utf16_output.data());
}
963
  #endif // SIMDUTF_SPAN
964
965
/**
966
 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
967
 *
968
 * During the conversion also validation of the input string is done.
969
 * This function is suitable to work with inputs from untrusted sources.
970
 *
971
 * @param input         the UTF-8 string to convert
972
 * @param length        the length of the string in bytes
973
 * @param utf16_output  the pointer to buffer that can hold conversion result
974
 * @return a result pair struct (of type simdutf::result containing the two
975
 * fields error and count) with an error code and either position of the error
976
 * (in the input in code units) if any, or the number of char16_t written if
977
 * successful.
978
 */
979
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
980
    const char *input, size_t length, char16_t *utf16_output) noexcept;
981
  #if SIMDUTF_SPAN
982
simdutf_really_inline simdutf_warn_unused result
convert_utf8_to_utf16le_with_errors(
    const detail::input_span_of_byte_like auto &utf8_input,
    std::span<char16_t> utf16_output) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload. Only utf16_output.data() is
  // forwarded; the output span's size is not passed along.
  const char *const utf8_chars =
      reinterpret_cast<const char *>(utf8_input.data());
  const size_t utf8_length = utf8_input.size();
  return convert_utf8_to_utf16le_with_errors(utf8_chars, utf8_length,
                                             utf16_output.data());
}
990
  #endif // SIMDUTF_SPAN
991
992
/**
993
 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
994
 *
995
 * During the conversion also validation of the input string is done.
996
 * This function is suitable to work with inputs from untrusted sources.
997
 *
998
 * @param input         the UTF-8 string to convert
999
 * @param length        the length of the string in bytes
1000
 * @param utf16_output  the pointer to buffer that can hold conversion result
1001
 * @return a result pair struct (of type simdutf::result containing the two
1002
 * fields error and count) with an error code and either position of the error
1003
 * (in the input in code units) if any, or the number of char16_t written if
1004
 * successful.
1005
 */
1006
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
1007
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1008
  #if SIMDUTF_SPAN
1009
simdutf_really_inline simdutf_warn_unused result
convert_utf8_to_utf16be_with_errors(
    const detail::input_span_of_byte_like auto &utf8_input,
    std::span<char16_t> utf16_output) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload. Only utf16_output.data() is
  // forwarded; the output span's size is not passed along.
  const char *const utf8_chars =
      reinterpret_cast<const char *>(utf8_input.data());
  const size_t utf8_length = utf8_input.size();
  return convert_utf8_to_utf16be_with_errors(utf8_chars, utf8_length,
                                             utf16_output.data());
}
1017
  #endif // SIMDUTF_SPAN
1018
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1019
1020
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1021
/**
1022
 * Convert possibly broken UTF-8 string into UTF-32 string.
1023
 *
1024
 * During the conversion also validation of the input string is done.
1025
 * This function is suitable to work with inputs from untrusted sources.
1026
 *
1027
 * @param input         the UTF-8 string to convert
1028
 * @param length        the length of the string in bytes
1029
 * @param utf32_output  the pointer to buffer that can hold conversion result
1030
 * @return the number of written char32_t; 0 if the input was not valid UTF-8
1031
 * string
1032
 */
1033
simdutf_warn_unused size_t convert_utf8_to_utf32(
1034
    const char *input, size_t length, char32_t *utf32_output) noexcept;
1035
  #if SIMDUTF_SPAN
1036
simdutf_really_inline simdutf_warn_unused size_t
convert_utf8_to_utf32(const detail::input_span_of_byte_like auto &utf8_input,
                      std::span<char32_t> utf32_output) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload. Only utf32_output.data() is
  // forwarded; the output span's size is not passed along.
  const char *const utf8_chars =
      reinterpret_cast<const char *>(utf8_input.data());
  const size_t utf8_length = utf8_input.size();
  return convert_utf8_to_utf32(utf8_chars, utf8_length, utf32_output.data());
}
1043
  #endif // SIMDUTF_SPAN
1044
1045
/**
1046
 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
1047
 *
1048
 * During the conversion also validation of the input string is done.
1049
 * This function is suitable to work with inputs from untrusted sources.
1050
 *
1051
 * @param input         the UTF-8 string to convert
1052
 * @param length        the length of the string in bytes
1053
 * @param utf32_output  the pointer to buffer that can hold conversion result
1054
 * @return a result pair struct (of type simdutf::result containing the two
1055
 * fields error and count) with an error code and either position of the error
1056
 * (in the input in code units) if any, or the number of char32_t written if
1057
 * successful.
1058
 */
1059
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
1060
    const char *input, size_t length, char32_t *utf32_output) noexcept;
1061
  #if SIMDUTF_SPAN
1062
simdutf_really_inline simdutf_warn_unused result
convert_utf8_to_utf32_with_errors(
    const detail::input_span_of_byte_like auto &utf8_input,
    std::span<char32_t> utf32_output) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload. Only utf32_output.data() is
  // forwarded; the output span's size is not passed along.
  const char *const utf8_chars =
      reinterpret_cast<const char *>(utf8_input.data());
  const size_t utf8_length = utf8_input.size();
  return convert_utf8_to_utf32_with_errors(utf8_chars, utf8_length,
                                           utf32_output.data());
}
1070
  #endif // SIMDUTF_SPAN
1071
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1072
1073
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1074
/**
1075
 * Convert valid UTF-8 string into latin1 string.
1076
 *
1077
 * This function assumes that the input string is valid UTF-8 and that it can be
1078
 * represented as Latin1. If you violate this assumption, the result is
1079
 * implementation defined and may include system-dependent behavior such as
1080
 * crashes.
1081
 *
1082
 * This function is for expert users only and not part of our public API. Use
1083
 * convert_utf8_to_latin1 instead. The function may be removed from the library
1084
 * in the future.
1085
 *
1086
 * This function is not BOM-aware.
1087
 *
1088
 * @param input         the UTF-8 string to convert
1089
 * @param length        the length of the string in bytes
1090
 * @param latin1_output  the pointer to buffer that can hold conversion result
1091
 * @return the number of written char; 0 if the input was not valid UTF-8 string
1092
 */
1093
simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
1094
    const char *input, size_t length, char *latin1_output) noexcept;
1095
  #if SIMDUTF_SPAN
1096
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
    const detail::input_span_of_byte_like auto &valid_utf8_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  // Span convenience overload: view both byte-like spans as char buffers and
  // delegate to the pointer-based overload. latin1_output.size() is not
  // forwarded.
  //
  // The output pointer is cast explicitly, as the sibling span overloads
  // (e.g. convert_utf8_to_latin1) do: the output is only required to be
  // byte-like, and a data() pointer to a non-char byte type such as std::byte
  // does not convert implicitly to char*.
  return convert_valid_utf8_to_latin1(
      reinterpret_cast<const char *>(valid_utf8_input.data()),
      valid_utf8_input.size(),
      reinterpret_cast<char *>(latin1_output.data()));
}
1103
  #endif // SIMDUTF_SPAN
1104
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1105
1106
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1107
/**
1108
 * Using native endianness, convert valid UTF-8 string into a UTF-16 string.
1109
 *
1110
 * This function assumes that the input string is valid UTF-8.
1111
 *
1112
 * @param input         the UTF-8 string to convert
1113
 * @param length        the length of the string in bytes
1114
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1115
 * @return the number of written char16_t
1116
 */
1117
simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
1118
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1119
  #if SIMDUTF_SPAN
1120
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
    const detail::input_span_of_byte_like auto &valid_utf8_input,
    std::span<char16_t> utf16_output) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload. Only utf16_output.data() is
  // forwarded; the output span's size is not passed along.
  const char *const utf8_chars =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t utf8_length = valid_utf8_input.size();
  return convert_valid_utf8_to_utf16(utf8_chars, utf8_length,
                                     utf16_output.data());
}
1127
  #endif // SIMDUTF_SPAN
1128
1129
/**
1130
 * Convert valid UTF-8 string into UTF-16LE string.
1131
 *
1132
 * This function assumes that the input string is valid UTF-8.
1133
 *
1134
 * @param input         the UTF-8 string to convert
1135
 * @param length        the length of the string in bytes
1136
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1137
 * @return the number of written char16_t
1138
 */
1139
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
1140
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1141
  #if SIMDUTF_SPAN
1142
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
    const detail::input_span_of_byte_like auto &valid_utf8_input,
    std::span<char16_t> utf16_output) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload. Only utf16_output.data() is
  // forwarded; the output span's size is not passed along.
  const char *const utf8_chars =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t utf8_length = valid_utf8_input.size();
  return convert_valid_utf8_to_utf16le(utf8_chars, utf8_length,
                                       utf16_output.data());
}
1149
  #endif // SIMDUTF_SPAN
1150
1151
/**
1152
 * Convert valid UTF-8 string into UTF-16BE string.
1153
 *
1154
 * This function assumes that the input string is valid UTF-8.
1155
 *
1156
 * @param input         the UTF-8 string to convert
1157
 * @param length        the length of the string in bytes
1158
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1159
 * @return the number of written char16_t
1160
 */
1161
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
1162
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1163
  #if SIMDUTF_SPAN
1164
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
    const detail::input_span_of_byte_like auto &valid_utf8_input,
    std::span<char16_t> utf16_output) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload. Only utf16_output.data() is
  // forwarded; the output span's size is not passed along.
  const char *const utf8_chars =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t utf8_length = valid_utf8_input.size();
  return convert_valid_utf8_to_utf16be(utf8_chars, utf8_length,
                                       utf16_output.data());
}
1171
  #endif // SIMDUTF_SPAN
1172
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1173
1174
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1175
/**
1176
 * Convert valid UTF-8 string into UTF-32 string.
1177
 *
1178
 * This function assumes that the input string is valid UTF-8.
1179
 *
1180
 * @param input         the UTF-8 string to convert
1181
 * @param length        the length of the string in bytes
1182
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
1183
 * @return the number of written char32_t
1184
 */
1185
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
1186
    const char *input, size_t length, char32_t *utf32_buffer) noexcept;
1187
  #if SIMDUTF_SPAN
1188
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
    const detail::input_span_of_byte_like auto &valid_utf8_input,
    std::span<char32_t> utf32_output) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload. Only utf32_output.data() is
  // forwarded; the output span's size is not passed along.
  const char *const utf8_chars =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t utf8_length = valid_utf8_input.size();
  return convert_valid_utf8_to_utf32(utf8_chars, utf8_length,
                                     utf32_output.data());
}
1195
  #endif // SIMDUTF_SPAN
1196
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1197
1198
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1199
/**
1200
 * Return the number of bytes that this Latin1 string would require in UTF-8
1201
 * format.
1202
 *
1203
 * @param input         the Latin1 string to convert
1204
 * @param length        the length of the string in bytes
1205
 * @return the number of bytes required to encode the Latin1 string as UTF-8
1206
 */
1207
simdutf_warn_unused size_t utf8_length_from_latin1(const char *input,
1208
                                                   size_t length) noexcept;
1209
  #if SIMDUTF_SPAN
1210
simdutf_really_inline simdutf_warn_unused size_t utf8_length_from_latin1(
    const detail::input_span_of_byte_like auto &latin1_input) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload.
  const char *const latin1_chars =
      reinterpret_cast<const char *>(latin1_input.data());
  const size_t latin1_length = latin1_input.size();
  return utf8_length_from_latin1(latin1_chars, latin1_length);
}
1215
  #endif // SIMDUTF_SPAN
1216
1217
/**
1218
 * Compute the number of bytes that this UTF-8 string would require in Latin1
1219
 * format.
1220
 *
1221
 * This function does not validate the input. It is acceptable to pass invalid
1222
 * UTF-8 strings but in such cases the result is implementation defined.
1223
 *
1224
 * This function is not BOM-aware.
1225
 *
1226
 * @param input         the UTF-8 string to convert
1227
 * @param length        the length of the string in bytes
1228
 * @return the number of bytes required to encode the UTF-8 string as Latin1
1229
 */
1230
simdutf_warn_unused size_t latin1_length_from_utf8(const char *input,
1231
                                                   size_t length) noexcept;
1232
  #if SIMDUTF_SPAN
1233
simdutf_really_inline simdutf_warn_unused size_t latin1_length_from_utf8(
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload.
  const char *const utf8_chars =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t utf8_length = valid_utf8_input.size();
  return latin1_length_from_utf8(utf8_chars, utf8_length);
}
1239
  #endif // SIMDUTF_SPAN
1240
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1241
1242
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1243
/**
1244
 * Compute the number of 2-byte code units that this UTF-8 string would require
1245
 * in UTF-16LE format.
1246
 *
1247
 * This function does not validate the input. It is acceptable to pass invalid
1248
 * UTF-8 strings but in such cases the result is implementation defined.
1249
 *
1250
 * This function is not BOM-aware.
1251
 *
1252
 * @param input         the UTF-8 string to process
1253
 * @param length        the length of the string in bytes
1254
 * @return the number of char16_t code units required to encode the UTF-8 string
1255
 * as UTF-16LE
1256
 */
1257
simdutf_warn_unused size_t utf16_length_from_utf8(const char *input,
1258
                                                  size_t length) noexcept;
1259
  #if SIMDUTF_SPAN
1260
simdutf_really_inline simdutf_warn_unused size_t utf16_length_from_utf8(
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload.
  const char *const utf8_chars =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t utf8_length = valid_utf8_input.size();
  return utf16_length_from_utf8(utf8_chars, utf8_length);
}
1266
  #endif // SIMDUTF_SPAN
1267
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1268
1269
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1270
/**
1271
 * Compute the number of 4-byte code units that this UTF-8 string would require
1272
 * in UTF-32 format.
1273
 *
1274
 * This function is equivalent to count_utf8
1275
 *
1276
 * This function does not validate the input. It is acceptable to pass invalid
1277
 * UTF-8 strings but in such cases the result is implementation defined.
1278
 *
1279
 * This function is not BOM-aware.
1280
 *
1281
 * @param input         the UTF-8 string to process
1282
 * @param length        the length of the string in bytes
1283
 * @return the number of char32_t code units required to encode the UTF-8 string
1284
 * as UTF-32
1285
 */
1286
simdutf_warn_unused size_t utf32_length_from_utf8(const char *input,
1287
                                                  size_t length) noexcept;
1288
  #if SIMDUTF_SPAN
1289
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf8(
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
  // Span convenience overload: view the byte-like input as const char* and
  // delegate to the pointer-based overload.
  const char *const utf8_chars =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t utf8_length = valid_utf8_input.size();
  return utf32_length_from_utf8(utf8_chars, utf8_length);
}
1295
  #endif // SIMDUTF_SPAN
1296
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1297
1298
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1299
/**
1300
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1301
 * string.
1302
 *
1303
 * During the conversion also validation of the input string is done.
1304
 * This function is suitable to work with inputs from untrusted sources.
1305
 *
1306
 * This function is not BOM-aware.
1307
 *
1308
 * @param input         the UTF-16 string to convert
1309
 * @param length        the length of the string in 2-byte code units (char16_t)
1310
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1311
 * @return number of written code units; 0 if input is not a valid UTF-16
1312
 * string
1313
 */
1314
simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *input,
1315
                                                 size_t length,
1316
                                                 char *utf8_buffer) noexcept;
1317
  #if SIMDUTF_SPAN
1318
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_utf8(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  // Span convenience overload: view the byte-like output as char* and delegate
  // to the pointer-based overload. utf8_output.size() is not forwarded.
  char *const utf8_chars = reinterpret_cast<char *>(utf8_output.data());
  const char16_t *const utf16_units = utf16_input.data();
  return convert_utf16_to_utf8(utf16_units, utf16_input.size(), utf8_chars);
}
1324
  #endif // SIMDUTF_SPAN
1325
1326
/**
1327
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1328
 * string with output limit.
1329
 *
1330
 * We write as many characters as possible into the output buffer.
1331
 *
1332
 * During the conversion also validation of the input string is done.
1333
 * This function is suitable to work with inputs from untrusted sources.
1334
 *
1335
 * This function is not BOM-aware.
1336
 *
1337
 *
1338
 * @param input         the UTF-16 string to convert
1339
 * @param length        the length of the string in 16-bit code units (char16_t)
1340
 * @param utf8_output   the pointer to buffer that can hold conversion result
1341
 * @param utf8_len      the maximum output length
1342
 * @return the number of written char; 0 if conversion is not possible
1343
 */
1344
simdutf_warn_unused size_t convert_utf16_to_utf8_safe(const char16_t *input,
1345
                                                      size_t length,
1346
                                                      char *utf8_output,
1347
                                                      size_t utf8_len) noexcept;
1348
  #if SIMDUTF_SPAN
1349
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_utf8_safe(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  // Implementation note: utf8_output is a forwarding reference so that both
  // lvalues and rvalues bind without a copy. Copying a std::span is harmless,
  // but copying a std::vector is not, and both must be accepted here. As a
  // side effect an owning temporary (e.g. a temporary std::string) is also
  // accepted as output, although the caller then has no way to retrieve the
  // converted data from it.
  //
  // Unlike the unbounded overloads, the output span's size is forwarded as the
  // maximum output length.
  char *const utf8_chars = reinterpret_cast<char *>(utf8_output.data());
  const size_t utf8_capacity = utf8_output.size();
  return convert_utf16_to_utf8_safe(utf16_input.data(), utf16_input.size(),
                                    utf8_chars, utf8_capacity);
}
1362
  #endif // SIMDUTF_SPAN
1363
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1364
1365
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1366
/**
1367
 * Using native endianness, convert possibly broken UTF-16 string into Latin1
1368
 * string.
1369
 *
1370
 * During the conversion also validation of the input string is done.
1371
 * This function is suitable to work with inputs from untrusted sources.
1372
 *
1373
 * This function is not BOM-aware.
1374
 *
1375
 * @param input         the UTF-16 string to convert
1376
 * @param length        the length of the string in 2-byte code units (char16_t)
1377
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1378
 * @return number of written code units; 0 if input is not a valid UTF-16 string
1379
 * or if it cannot be represented as Latin1
1380
 */
1381
simdutf_warn_unused size_t convert_utf16_to_latin1(
1382
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1383
  #if SIMDUTF_SPAN
1384
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_latin1(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  // Span convenience overload: view the byte-like output as char* and delegate
  // to the pointer-based overload. latin1_output.size() is not forwarded.
  char *const latin1_chars = reinterpret_cast<char *>(latin1_output.data());
  const char16_t *const utf16_units = utf16_input.data();
  return convert_utf16_to_latin1(utf16_units, utf16_input.size(),
                                 latin1_chars);
}
1391
  #endif // SIMDUTF_SPAN
1392
1393
/**
1394
 * Convert possibly broken UTF-16LE string into Latin1 string.
1395
 * If the string cannot be represented as Latin1, an error
1396
 * is returned.
1397
 *
1398
 * During the conversion also validation of the input string is done.
1399
 * This function is suitable to work with inputs from untrusted sources.
1400
 *
1401
 * This function is not BOM-aware.
1402
 *
1403
 * @param input         the UTF-16LE string to convert
1404
 * @param length        the length of the string in 2-byte code units (char16_t)
1405
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1406
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1407
 * string or if it cannot be represented as Latin1
1408
 */
1409
simdutf_warn_unused size_t convert_utf16le_to_latin1(
1410
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1411
  #if SIMDUTF_SPAN
1412
simdutf_really_inline simdutf_warn_unused size_t convert_utf16le_to_latin1(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  // Span convenience overload: view the byte-like output as char* and delegate
  // to the pointer-based overload. latin1_output.size() is not forwarded.
  char *const latin1_chars = reinterpret_cast<char *>(latin1_output.data());
  const char16_t *const utf16_units = utf16_input.data();
  return convert_utf16le_to_latin1(utf16_units, utf16_input.size(),
                                   latin1_chars);
}
1419
  #endif // SIMDUTF_SPAN
1420
1421
/**
1422
 * Convert possibly broken UTF-16BE string into Latin1 string.
1423
 *
1424
 * During the conversion also validation of the input string is done.
1425
 * This function is suitable to work with inputs from untrusted sources.
1426
 *
1427
 * This function is not BOM-aware.
1428
 *
1429
 * @param input         the UTF-16BE string to convert
1430
 * @param length        the length of the string in 2-byte code units (char16_t)
1431
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1432
 * @return number of written code units; 0 if input is not a valid UTF-16BE
1433
 * string or if it cannot be represented as Latin1
1434
 */
1435
simdutf_warn_unused size_t convert_utf16be_to_latin1(
1436
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1437
  #if SIMDUTF_SPAN
1438
simdutf_really_inline simdutf_warn_unused size_t convert_utf16be_to_latin1(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  // Span convenience overload: view the byte-like output as char* and delegate
  // to the pointer-based overload. latin1_output.size() is not forwarded.
  char *const latin1_chars = reinterpret_cast<char *>(latin1_output.data());
  const char16_t *const utf16_units = utf16_input.data();
  return convert_utf16be_to_latin1(utf16_units, utf16_input.size(),
                                   latin1_chars);
}
1445
  #endif // SIMDUTF_SPAN
1446
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1447
1448
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1449
/**
1450
 * Convert possibly broken UTF-16LE string into UTF-8 string.
1451
 *
1452
 * During the conversion also validation of the input string is done.
1453
 * This function is suitable to work with inputs from untrusted sources.
1454
 *
1455
 * This function is not BOM-aware.
1456
 *
1457
 * @param input         the UTF-16LE string to convert
1458
 * @param length        the length of the string in 2-byte code units (char16_t)
1459
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1460
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1461
 * string
1462
 */
1463
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *input,
1464
                                                   size_t length,
1465
                                                   char *utf8_buffer) noexcept;
1466
  #if SIMDUTF_SPAN
1467
simdutf_really_inline simdutf_warn_unused size_t convert_utf16le_to_utf8(
1468
    std::span<const char16_t> utf16_input,
1469
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1470
  return convert_utf16le_to_utf8(utf16_input.data(), utf16_input.size(),
1471
                                 reinterpret_cast<char *>(utf8_output.data()));
1472
}
1473
  #endif // SIMDUTF_SPAN
1474
1475
/**
1476
 * Convert possibly broken UTF-16BE string into UTF-8 string.
1477
 *
1478
 * During the conversion also validation of the input string is done.
1479
 * This function is suitable to work with inputs from untrusted sources.
1480
 *
1481
 * This function is not BOM-aware.
1482
 *
1483
 * @param input         the UTF-16BE string to convert
1484
 * @param length        the length of the string in 2-byte code units (char16_t)
1485
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1486
 * @return number of written code units; 0 if input is not a valid UTF-16BE
1487
 * string
1488
 */
1489
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *input,
1490
                                                   size_t length,
1491
                                                   char *utf8_buffer) noexcept;
1492
  #if SIMDUTF_SPAN
1493
simdutf_really_inline simdutf_warn_unused size_t convert_utf16be_to_utf8(
1494
    std::span<const char16_t> utf16_input,
1495
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1496
  return convert_utf16be_to_utf8(utf16_input.data(), utf16_input.size(),
1497
                                 reinterpret_cast<char *>(utf8_output.data()));
1498
}
1499
  #endif // SIMDUTF_SPAN
1500
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1501
1502
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1503
/**
1504
 * Using native endianness, convert possibly broken UTF-16 string into Latin1
1505
 * string.
1506
 *
1507
 * During the conversion also validation of the input string is done.
1508
 * This function is suitable to work with inputs from untrusted sources.
1509
 * This function is not BOM-aware.
1510
 *
1511
 * @param input         the UTF-16 string to convert
1512
 * @param length        the length of the string in 2-byte code units (char16_t)
1513
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1514
 * @return a result pair struct (of type simdutf::result containing the two
1515
 * fields error and count) with an error code and either position of the error
1516
 * (in the input in code units) if any, or the number of char written if
1517
 * successful.
1518
 */
1519
simdutf_warn_unused result convert_utf16_to_latin1_with_errors(
1520
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1521
  #if SIMDUTF_SPAN
1522
simdutf_really_inline simdutf_warn_unused result
1523
convert_utf16_to_latin1_with_errors(
1524
    std::span<const char16_t> utf16_input,
1525
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1526
  return convert_utf16_to_latin1_with_errors(
1527
      utf16_input.data(), utf16_input.size(),
1528
      reinterpret_cast<char *>(latin1_output.data()));
1529
}
1530
  #endif // SIMDUTF_SPAN
1531
1532
/**
1533
 * Convert possibly broken UTF-16LE string into Latin1 string.
1534
 *
1535
 * During the conversion also validation of the input string is done.
1536
 * This function is suitable to work with inputs from untrusted sources.
1537
 * This function is not BOM-aware.
1538
 *
1539
 * @param input         the UTF-16LE string to convert
1540
 * @param length        the length of the string in 2-byte code units (char16_t)
1541
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1542
 * @return a result pair struct (of type simdutf::result containing the two
1543
 * fields error and count) with an error code and either position of the error
1544
 * (in the input in code units) if any, or the number of char written if
1545
 * successful.
1546
 */
1547
simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
1548
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1549
  #if SIMDUTF_SPAN
1550
simdutf_really_inline simdutf_warn_unused result
1551
convert_utf16le_to_latin1_with_errors(
1552
    std::span<const char16_t> utf16_input,
1553
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1554
  return convert_utf16le_to_latin1_with_errors(
1555
      utf16_input.data(), utf16_input.size(),
1556
      reinterpret_cast<char *>(latin1_output.data()));
1557
}
1558
  #endif // SIMDUTF_SPAN
1559
1560
/**
1561
 * Convert possibly broken UTF-16BE string into Latin1 string.
1562
 * If the string cannot be represented as Latin1, an error
1563
 * is returned.
1564
 *
1565
 * During the conversion also validation of the input string is done.
1566
 * This function is suitable to work with inputs from untrusted sources.
1567
 * This function is not BOM-aware.
1568
 *
1569
 * @param input         the UTF-16BE string to convert
1570
 * @param length        the length of the string in 2-byte code units (char16_t)
1571
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1572
 * @return a result pair struct (of type simdutf::result containing the two
1573
 * fields error and count) with an error code and either position of the error
1574
 * (in the input in code units) if any, or the number of char written if
1575
 * successful.
1576
 */
1577
simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
1578
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1579
  #if SIMDUTF_SPAN
1580
simdutf_really_inline simdutf_warn_unused result
1581
convert_utf16be_to_latin1_with_errors(
1582
    std::span<const char16_t> utf16_input,
1583
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1584
  return convert_utf16be_to_latin1_with_errors(
1585
      utf16_input.data(), utf16_input.size(),
1586
      reinterpret_cast<char *>(latin1_output.data()));
1587
}
1588
  #endif // SIMDUTF_SPAN
1589
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1590
1591
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1592
/**
1593
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1594
 * string and stop on error.
1595
 *
1596
 * During the conversion also validation of the input string is done.
1597
 * This function is suitable to work with inputs from untrusted sources.
1598
 *
1599
 * This function is not BOM-aware.
1600
 *
1601
 * @param input         the UTF-16 string to convert
1602
 * @param length        the length of the string in 2-byte code units (char16_t)
1603
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1604
 * @return a result pair struct (of type simdutf::result containing the two
1605
 * fields error and count) with an error code and either position of the error
1606
 * (in the input in code units) if any, or the number of char written if
1607
 * successful.
1608
 */
1609
simdutf_warn_unused result convert_utf16_to_utf8_with_errors(
1610
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1611
  #if SIMDUTF_SPAN
1612
simdutf_really_inline simdutf_warn_unused result
1613
convert_utf16_to_utf8_with_errors(
1614
    std::span<const char16_t> utf16_input,
1615
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1616
  return convert_utf16_to_utf8_with_errors(
1617
      utf16_input.data(), utf16_input.size(),
1618
      reinterpret_cast<char *>(utf8_output.data()));
1619
}
1620
  #endif // SIMDUTF_SPAN
1621
1622
/**
1623
 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
1624
 *
1625
 * During the conversion also validation of the input string is done.
1626
 * This function is suitable to work with inputs from untrusted sources.
1627
 *
1628
 * This function is not BOM-aware.
1629
 *
1630
 * @param input         the UTF-16LE string to convert
1631
 * @param length        the length of the string in 2-byte code units (char16_t)
1632
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1633
 * @return a result pair struct (of type simdutf::result containing the two
1634
 * fields error and count) with an error code and either position of the error
1635
 * (in the input in code units) if any, or the number of char written if
1636
 * successful.
1637
 */
1638
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
1639
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1640
  #if SIMDUTF_SPAN
1641
simdutf_really_inline simdutf_warn_unused result
1642
convert_utf16le_to_utf8_with_errors(
1643
    std::span<const char16_t> utf16_input,
1644
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1645
  return convert_utf16le_to_utf8_with_errors(
1646
      utf16_input.data(), utf16_input.size(),
1647
      reinterpret_cast<char *>(utf8_output.data()));
1648
}
1649
  #endif // SIMDUTF_SPAN
1650
1651
/**
1652
 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
1653
 *
1654
 * During the conversion also validation of the input string is done.
1655
 * This function is suitable to work with inputs from untrusted sources.
1656
 *
1657
 * This function is not BOM-aware.
1658
 *
1659
 * @param input         the UTF-16BE string to convert
1660
 * @param length        the length of the string in 2-byte code units (char16_t)
1661
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1662
 * @return a result pair struct (of type simdutf::result containing the two
1663
 * fields error and count) with an error code and either position of the error
1664
 * (in the input in code units) if any, or the number of char written if
1665
 * successful.
1666
 */
1667
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
1668
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1669
  #if SIMDUTF_SPAN
1670
simdutf_really_inline simdutf_warn_unused result
1671
convert_utf16be_to_utf8_with_errors(
1672
    std::span<const char16_t> utf16_input,
1673
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1674
  return convert_utf16be_to_utf8_with_errors(
1675
      utf16_input.data(), utf16_input.size(),
1676
      reinterpret_cast<char *>(utf8_output.data()));
1677
}
1678
  #endif // SIMDUTF_SPAN
1679
1680
/**
1681
 * Using native endianness, convert valid UTF-16 string into UTF-8 string.
1682
 *
1683
 * This function assumes that the input string is valid native-endian UTF-16.
1684
 *
1685
 * This function is not BOM-aware.
1686
 *
1687
 * @param input         the UTF-16 string to convert
1688
 * @param length        the length of the string in 2-byte code units (char16_t)
1689
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
1690
 * result
1691
 * @return number of written code units; 0 if conversion is not possible
1692
 */
1693
simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
1694
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1695
  #if SIMDUTF_SPAN
1696
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
1697
    std::span<const char16_t> valid_utf16_input,
1698
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1699
  return convert_valid_utf16_to_utf8(
1700
      valid_utf16_input.data(), valid_utf16_input.size(),
1701
      reinterpret_cast<char *>(utf8_output.data()));
1702
}
1703
  #endif // SIMDUTF_SPAN
1704
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1705
1706
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1707
/**
1708
 * Using native endianness, convert UTF-16 string into Latin1 string.
1709
 *
1710
 * This function assumes that the input string is valid UTF-16 and that it can
1711
 * be represented as Latin1. If you violate this assumption, the result is
1712
 * implementation defined and may include system-dependent behavior such as
1713
 * crashes.
1714
 *
1715
 * This function is for expert users only and not part of our public API. Use
1716
 * convert_utf16_to_latin1 instead. The function may be removed from the library
1717
 * in the future.
1718
 *
1719
 * This function is not BOM-aware.
1720
 *
1721
 * @param input         the UTF-16 string to convert
1722
 * @param length        the length of the string in 2-byte code units (char16_t)
1723
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1724
 * @return number of written code units; 0 if conversion is not possible
1725
 */
1726
simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
1727
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1728
  #if SIMDUTF_SPAN
1729
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
1730
    std::span<const char16_t> valid_utf16_input,
1731
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1732
  return convert_valid_utf16_to_latin1(
1733
      valid_utf16_input.data(), valid_utf16_input.size(),
1734
      reinterpret_cast<char *>(latin1_output.data()));
1735
}
1736
  #endif // SIMDUTF_SPAN
1737
1738
/**
1739
 * Convert valid UTF-16LE string into Latin1 string.
1740
 *
1741
 * This function assumes that the input string is valid UTF-16LE and that it can
1742
 * be represented as Latin1. If you violate this assumption, the result is
1743
 * implementation defined and may include system-dependent behavior such as
1744
 * crashes.
1745
 *
1746
 * This function is for expert users only and not part of our public API. Use
1747
 * convert_utf16le_to_latin1 instead. The function may be removed from the
1748
 * library in the future.
1749
 *
1750
 * This function is not BOM-aware.
1751
 *
1752
 * @param input         the UTF-16LE string to convert
1753
 * @param length        the length of the string in 2-byte code units (char16_t)
1754
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1755
 * @return number of written code units; 0 if conversion is not possible
1756
 */
1757
simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
1758
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1759
  #if SIMDUTF_SPAN
1760
simdutf_really_inline simdutf_warn_unused size_t
1761
convert_valid_utf16le_to_latin1(
1762
    std::span<const char16_t> valid_utf16_input,
1763
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1764
  return convert_valid_utf16le_to_latin1(
1765
      valid_utf16_input.data(), valid_utf16_input.size(),
1766
      reinterpret_cast<char *>(latin1_output.data()));
1767
}
1768
  #endif // SIMDUTF_SPAN
1769
1770
/**
1771
 * Convert valid UTF-16BE string into Latin1 string.
1772
 *
1773
 * This function assumes that the input string is valid UTF-16BE and that it can
1774
 * be represented as Latin1. If you violate this assumption, the result is
1775
 * implementation defined and may include system-dependent behavior such as
1776
 * crashes.
1777
 *
1778
 * This function is for expert users only and not part of our public API. Use
1779
 * convert_utf16be_to_latin1 instead. The function may be removed from the
1780
 * library in the future.
1781
 *
1782
 * This function is not BOM-aware.
1783
 *
1784
 * @param input         the UTF-16BE string to convert
1785
 * @param length        the length of the string in 2-byte code units (char16_t)
1786
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1787
 * @return number of written code units; 0 if conversion is not possible
1788
 */
1789
simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
1790
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1791
  #if SIMDUTF_SPAN
1792
simdutf_really_inline simdutf_warn_unused size_t
1793
convert_valid_utf16be_to_latin1(
1794
    std::span<const char16_t> valid_utf16_input,
1795
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1796
  return convert_valid_utf16be_to_latin1(
1797
      valid_utf16_input.data(), valid_utf16_input.size(),
1798
      reinterpret_cast<char *>(latin1_output.data()));
1799
}
1800
  #endif // SIMDUTF_SPAN
1801
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1802
1803
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1804
/**
1805
 * Convert valid UTF-16LE string into UTF-8 string.
1806
 *
1807
 * This function assumes that the input string is valid UTF-16LE; every valid
1808
 * UTF-16 string can be represented as UTF-8.
1809
 *
1810
 * This function is not BOM-aware.
1811
 *
1812
 * @param input         the UTF-16LE string to convert
1813
 * @param length        the length of the string in 2-byte code units (char16_t)
1814
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
1815
 * result
1816
 * @return number of written code units; 0 if conversion is not possible
1817
 */
1818
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
1819
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1820
  #if SIMDUTF_SPAN
1821
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
1822
    std::span<const char16_t> valid_utf16_input,
1823
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1824
  return convert_valid_utf16le_to_utf8(
1825
      valid_utf16_input.data(), valid_utf16_input.size(),
1826
      reinterpret_cast<char *>(utf8_output.data()));
1827
}
1828
  #endif // SIMDUTF_SPAN
1829
1830
/**
1831
 * Convert valid UTF-16BE string into UTF-8 string.
1832
 *
1833
 * This function assumes that the input string is valid UTF-16BE.
1834
 *
1835
 * This function is not BOM-aware.
1836
 *
1837
 * @param input         the UTF-16BE string to convert
1838
 * @param length        the length of the string in 2-byte code units (char16_t)
1839
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
1840
 * result
1841
 * @return number of written code units; 0 if conversion is not possible
1842
 */
1843
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
1844
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1845
  #if SIMDUTF_SPAN
1846
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
1847
    std::span<const char16_t> valid_utf16_input,
1848
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1849
  return convert_valid_utf16be_to_utf8(
1850
      valid_utf16_input.data(), valid_utf16_input.size(),
1851
      reinterpret_cast<char *>(utf8_output.data()));
1852
}
1853
  #endif // SIMDUTF_SPAN
1854
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1855
1856
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
1857
/**
1858
 * Using native endianness, convert possibly broken UTF-16 string into UTF-32
1859
 * string.
1860
 *
1861
 * During the conversion also validation of the input string is done.
1862
 * This function is suitable to work with inputs from untrusted sources.
1863
 *
1864
 * This function is not BOM-aware.
1865
 *
1866
 * @param input         the UTF-16 string to convert
1867
 * @param length        the length of the string in 2-byte code units (char16_t)
1868
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1869
 * @return number of written code units; 0 if input is not a valid UTF-16
1870
 * string
1871
 */
1872
simdutf_warn_unused size_t convert_utf16_to_utf32(
1873
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1874
  #if SIMDUTF_SPAN
1875
simdutf_really_inline simdutf_warn_unused size_t
1876
convert_utf16_to_utf32(std::span<const char16_t> utf16_input,
1877
0
                       std::span<char32_t> utf32_output) noexcept {
1878
0
  return convert_utf16_to_utf32(utf16_input.data(), utf16_input.size(),
1879
0
                                utf32_output.data());
1880
0
}
1881
  #endif // SIMDUTF_SPAN
1882
1883
/**
1884
 * Convert possibly broken UTF-16LE string into UTF-32 string.
1885
 *
1886
 * During the conversion also validation of the input string is done.
1887
 * This function is suitable to work with inputs from untrusted sources.
1888
 *
1889
 * This function is not BOM-aware.
1890
 *
1891
 * @param input         the UTF-16LE string to convert
1892
 * @param length        the length of the string in 2-byte code units (char16_t)
1893
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1894
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1895
 * string
1896
 */
1897
simdutf_warn_unused size_t convert_utf16le_to_utf32(
1898
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1899
  #if SIMDUTF_SPAN
1900
simdutf_really_inline simdutf_warn_unused size_t
1901
convert_utf16le_to_utf32(std::span<const char16_t> utf16_input,
1902
0
                         std::span<char32_t> utf32_output) noexcept {
1903
0
  return convert_utf16le_to_utf32(utf16_input.data(), utf16_input.size(),
1904
0
                                  utf32_output.data());
1905
0
}
1906
  #endif // SIMDUTF_SPAN
1907
1908
/**
1909
 * Convert possibly broken UTF-16BE string into UTF-32 string.
1910
 *
1911
 * During the conversion also validation of the input string is done.
1912
 * This function is suitable to work with inputs from untrusted sources.
1913
 *
1914
 * This function is not BOM-aware.
1915
 *
1916
 * @param input         the UTF-16BE string to convert
1917
 * @param length        the length of the string in 2-byte code units (char16_t)
1918
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1919
 * @return number of written code units; 0 if input is not a valid UTF-16BE
1920
 * string
1921
 */
1922
simdutf_warn_unused size_t convert_utf16be_to_utf32(
1923
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1924
  #if SIMDUTF_SPAN
1925
simdutf_really_inline simdutf_warn_unused size_t
1926
convert_utf16be_to_utf32(std::span<const char16_t> utf16_input,
1927
0
                         std::span<char32_t> utf32_output) noexcept {
1928
0
  return convert_utf16be_to_utf32(utf16_input.data(), utf16_input.size(),
1929
0
                                  utf32_output.data());
1930
0
}
1931
  #endif // SIMDUTF_SPAN
1932
1933
/**
1934
 * Using native endianness, convert possibly broken UTF-16 string into
1935
 * UTF-32 string and stop on error.
1936
 *
1937
 * During the conversion also validation of the input string is done.
1938
 * This function is suitable to work with inputs from untrusted sources.
1939
 *
1940
 * This function is not BOM-aware.
1941
 *
1942
 * @param input         the UTF-16 string to convert
1943
 * @param length        the length of the string in 2-byte code units (char16_t)
1944
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1945
 * @return a result pair struct (of type simdutf::result containing the two
1946
 * fields error and count) with an error code and either position of the error
1947
 * (in the input in code units) if any, or the number of char32_t written if
1948
 * successful.
1949
 */
1950
simdutf_warn_unused result convert_utf16_to_utf32_with_errors(
1951
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1952
  #if SIMDUTF_SPAN
1953
simdutf_really_inline simdutf_warn_unused result
1954
convert_utf16_to_utf32_with_errors(std::span<const char16_t> utf16_input,
1955
0
                                   std::span<char32_t> utf32_output) noexcept {
1956
0
  return convert_utf16_to_utf32_with_errors(
1957
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
1958
0
}
1959
  #endif // SIMDUTF_SPAN
1960
1961
/**
1962
 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
1963
 *
1964
 * During the conversion also validation of the input string is done.
1965
 * This function is suitable to work with inputs from untrusted sources.
1966
 *
1967
 * This function is not BOM-aware.
1968
 *
1969
 * @param input         the UTF-16LE string to convert
1970
 * @param length        the length of the string in 2-byte code units (char16_t)
1971
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1972
 * @return a result pair struct (of type simdutf::result containing the two
1973
 * fields error and count) with an error code and either position of the error
1974
 * (in the input in code units) if any, or the number of char32_t written if
1975
 * successful.
1976
 */
1977
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
1978
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1979
  #if SIMDUTF_SPAN
1980
simdutf_really_inline simdutf_warn_unused result
1981
convert_utf16le_to_utf32_with_errors(
1982
    std::span<const char16_t> utf16_input,
1983
0
    std::span<char32_t> utf32_output) noexcept {
1984
0
  return convert_utf16le_to_utf32_with_errors(
1985
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
1986
0
}
1987
  #endif // SIMDUTF_SPAN
1988
1989
/**
1990
 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
1991
 *
1992
 * During the conversion also validation of the input string is done.
1993
 * This function is suitable to work with inputs from untrusted sources.
1994
 *
1995
 * This function is not BOM-aware.
1996
 *
1997
 * @param input         the UTF-16BE string to convert
1998
 * @param length        the length of the string in 2-byte code units (char16_t)
1999
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2000
 * @return a result pair struct (of type simdutf::result containing the two
2001
 * fields error and count) with an error code and either position of the error
2002
 * (in the input in code units) if any, or the number of char32_t written if
2003
 * successful.
2004
 */
2005
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
2006
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2007
  #if SIMDUTF_SPAN
2008
simdutf_really_inline simdutf_warn_unused result
2009
convert_utf16be_to_utf32_with_errors(
2010
    std::span<const char16_t> utf16_input,
2011
0
    std::span<char32_t> utf32_output) noexcept {
2012
0
  return convert_utf16be_to_utf32_with_errors(
2013
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
2014
0
}
2015
  #endif // SIMDUTF_SPAN
2016
2017
/**
2018
 * Using native endianness, convert valid UTF-16 string into UTF-32 string.
2019
 *
2020
 * This function assumes that the input string is valid UTF-16 (native
2021
 * endianness).
2022
 *
2023
 * This function is not BOM-aware.
2024
 *
2025
 * @param input         the UTF-16 string to convert
2026
 * @param length        the length of the string in 2-byte code units (char16_t)
2027
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
2028
 * result
2029
 * @return number of written code units; 0 if conversion is not possible
2030
 */
2031
simdutf_warn_unused size_t convert_valid_utf16_to_utf32(
2032
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2033
  #if SIMDUTF_SPAN
2034
simdutf_really_inline simdutf_warn_unused size_t
2035
convert_valid_utf16_to_utf32(std::span<const char16_t> valid_utf16_input,
2036
0
                             std::span<char32_t> utf32_output) noexcept {
2037
0
  return convert_valid_utf16_to_utf32(
2038
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
2039
0
}
2040
  #endif // SIMDUTF_SPAN
2041
2042
/**
2043
 * Convert valid UTF-16LE string into UTF-32 string.
2044
 *
2045
 * This function assumes that the input string is valid UTF-16LE.
2046
 *
2047
 * This function is not BOM-aware.
2048
 *
2049
 * @param input         the UTF-16LE string to convert
2050
 * @param length        the length of the string in 2-byte code units (char16_t)
2051
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
2052
 * result
2053
 * @return number of written code units; 0 if conversion is not possible
2054
 */
2055
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
2056
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2057
  #if SIMDUTF_SPAN
2058
simdutf_really_inline simdutf_warn_unused size_t
2059
convert_valid_utf16le_to_utf32(std::span<const char16_t> valid_utf16_input,
2060
0
                               std::span<char32_t> utf32_output) noexcept {
2061
0
  return convert_valid_utf16le_to_utf32(
2062
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
2063
0
}
2064
  #endif // SIMDUTF_SPAN
2065
2066
/**
2067
 * Convert valid UTF-16BE string into UTF-32 string.
2068
 *
2069
 * This function assumes that the input string is valid UTF-16BE.
2070
 *
2071
 * This function is not BOM-aware.
2072
 *
2073
 * @param input         the UTF-16BE string to convert
2074
 * @param length        the length of the string in 2-byte code units (char16_t)
2075
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
2076
 * result
2077
 * @return number of written code units; 0 if conversion is not possible
2078
 */
2079
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
2080
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2081
  #if SIMDUTF_SPAN
2082
simdutf_really_inline simdutf_warn_unused size_t
2083
convert_valid_utf16be_to_utf32(std::span<const char16_t> valid_utf16_input,
2084
0
                               std::span<char32_t> utf32_output) noexcept {
2085
0
  return convert_valid_utf16be_to_utf32(
2086
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
2087
0
}
2088
  #endif // SIMDUTF_SPAN
2089
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2090
2091
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2092
/**
2093
 * Compute the number of bytes that this UTF-16LE/BE string would require in
2094
 * Latin1 format.
2095
 *
2096
 * This function does not validate the input. It is acceptable to pass invalid
2097
 * UTF-16 strings but in such cases the result is implementation defined.
2098
 *
2099
 * This function is not BOM-aware.
2100
 *
2101
 * @param length        the length of the string in 2-byte code units (char16_t)
2102
 * @return the number of bytes required to encode the UTF-16 string as Latin1
2103
 */
2104
simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
2105
2106
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2107
2108
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2109
/**
2110
 * Using native endianness; Compute the number of bytes that this UTF-16
2111
 * string would require in UTF-8 format.
2112
 *
2113
 * This function does not validate the input. It is acceptable to pass invalid
2114
 * UTF-16 strings but in such cases the result is implementation defined.
2115
 *
2116
 * @param input         the UTF-16 string to convert
2117
 * @param length        the length of the string in 2-byte code units (char16_t)
2118
 * @return the number of bytes required to encode the UTF-16 string as UTF-8
2119
 */
2120
simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input,
2121
                                                  size_t length) noexcept;
2122
  #if SIMDUTF_SPAN
2123
simdutf_really_inline simdutf_warn_unused size_t
2124
0
utf8_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2125
0
  return utf8_length_from_utf16(valid_utf16_input.data(),
2126
0
                                valid_utf16_input.size());
2127
0
}
2128
  #endif // SIMDUTF_SPAN
2129
2130
/**
2131
 * Using native endianness; compute the number of bytes that this UTF-16
2132
 * string would require in UTF-8 format even when the UTF-16 content contains
2133
 * mismatched surrogates that have to be replaced by the replacement character
2134
 * (0xFFFD).
2135
 *
2136
 * @param input         the UTF-16 string to convert
2137
 * @param length        the length of the string in 2-byte code units (char16_t)
2138
 * @return a result pair struct (of type simdutf::result containing the two
2139
 * fields error and count) where the count is the number of bytes required to
2140
 * encode the UTF-16 string as UTF-8, and the error code is either SUCCESS or
2141
 * SURROGATE. The count is correct regardless of the error field.
2142
 * When SURROGATE is returned, it does not indicate an error in the case of this
2143
 * function: it indicates that at least one surrogate has been encountered: the
2144
 * surrogates may be matched or not (thus this function does not validate). If
2145
 * the returned error code is SUCCESS, then the input contains no surrogate, is
2146
 * in the Basic Multilingual Plane, and is necessarily valid.
2147
 */
2148
simdutf_warn_unused result utf8_length_from_utf16_with_replacement(
2149
    const char16_t *input, size_t length) noexcept;
2150
  #if SIMDUTF_SPAN
2151
simdutf_really_inline simdutf_warn_unused result
2152
utf8_length_from_utf16_with_replacement(
2153
0
    std::span<const char16_t> valid_utf16_input) noexcept {
2154
0
  return utf8_length_from_utf16_with_replacement(valid_utf16_input.data(),
2155
0
                                                 valid_utf16_input.size());
2156
0
}
2157
  #endif // SIMDUTF_SPAN
2158
2159
/**
2160
 * Compute the number of bytes that this UTF-16LE string would require in UTF-8
2161
 * format.
2162
 *
2163
 * This function does not validate the input. It is acceptable to pass invalid
2164
 * UTF-16 strings but in such cases the result is implementation defined.
2165
 *
2166
 * @param input         the UTF-16LE string to convert
2167
 * @param length        the length of the string in 2-byte code units (char16_t)
2168
 * @return the number of bytes required to encode the UTF-16LE string as UTF-8
2169
 */
2170
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input,
2171
                                                    size_t length) noexcept;
2172
  #if SIMDUTF_SPAN
2173
simdutf_really_inline simdutf_warn_unused size_t
2174
0
utf8_length_from_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
2175
0
  return utf8_length_from_utf16le(valid_utf16_input.data(),
2176
0
                                  valid_utf16_input.size());
2177
0
}
2178
  #endif // SIMDUTF_SPAN
2179
2180
/**
2181
 * Compute the number of bytes that this UTF-16BE string would require in UTF-8
2182
 * format.
2183
 *
2184
 * This function does not validate the input. It is acceptable to pass invalid
2185
 * UTF-16 strings but in such cases the result is implementation defined.
2186
 *
2187
 * @param input         the UTF-16BE string to convert
2188
 * @param length        the length of the string in 2-byte code units (char16_t)
2189
 * @return the number of bytes required to encode the UTF-16BE string as UTF-8
2190
 */
2191
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input,
2192
                                                    size_t length) noexcept;
2193
  #if SIMDUTF_SPAN
2194
simdutf_really_inline simdutf_warn_unused size_t
2195
0
utf8_length_from_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
2196
0
  return utf8_length_from_utf16be(valid_utf16_input.data(),
2197
0
                                  valid_utf16_input.size());
2198
0
}
2199
  #endif // SIMDUTF_SPAN
2200
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2201
2202
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2203
/**
2204
 * Convert possibly broken UTF-32 string into UTF-8 string.
2205
 *
2206
 * During the conversion also validation of the input string is done.
2207
 * This function is suitable to work with inputs from untrusted sources.
2208
 *
2209
 * This function is not BOM-aware.
2210
 *
2211
 * @param input         the UTF-32 string to convert
2212
 * @param length        the length of the string in 4-byte code units (char32_t)
2213
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2214
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2215
 */
2216
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *input,
2217
                                                 size_t length,
2218
                                                 char *utf8_buffer) noexcept;
2219
  #if SIMDUTF_SPAN
2220
simdutf_really_inline simdutf_warn_unused size_t convert_utf32_to_utf8(
2221
    std::span<const char32_t> utf32_input,
2222
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2223
  return convert_utf32_to_utf8(utf32_input.data(), utf32_input.size(),
2224
                               reinterpret_cast<char *>(utf8_output.data()));
2225
}
2226
  #endif // SIMDUTF_SPAN
2227
2228
/**
2229
 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
2230
 *
2231
 * During the conversion also validation of the input string is done.
2232
 * This function is suitable to work with inputs from untrusted sources.
2233
 *
2234
 * This function is not BOM-aware.
2235
 *
2236
 * @param input         the UTF-32 string to convert
2237
 * @param length        the length of the string in 4-byte code units (char32_t)
2238
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2239
 * @return a result pair struct (of type simdutf::result containing the two
2240
 * fields error and count) with an error code and either position of the error
2241
 * (in the input in code units) if any, or the number of char written if
2242
 * successful.
2243
 */
2244
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
2245
    const char32_t *input, size_t length, char *utf8_buffer) noexcept;
2246
  #if SIMDUTF_SPAN
2247
simdutf_really_inline simdutf_warn_unused result
2248
convert_utf32_to_utf8_with_errors(
2249
    std::span<const char32_t> utf32_input,
2250
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2251
  return convert_utf32_to_utf8_with_errors(
2252
      utf32_input.data(), utf32_input.size(),
2253
      reinterpret_cast<char *>(utf8_output.data()));
2254
}
2255
  #endif // SIMDUTF_SPAN
2256
2257
/**
2258
 * Convert valid UTF-32 string into UTF-8 string.
2259
 *
2260
 * This function assumes that the input string is valid UTF-32.
2261
 *
2262
 * This function is not BOM-aware.
2263
 *
2264
 * @param input         the UTF-32 string to convert
2265
 * @param length        the length of the string in 4-byte code units (char32_t)
2266
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
2267
 * result
2268
 * @return number of written code units; 0 if conversion is not possible
2269
 */
2270
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
2271
    const char32_t *input, size_t length, char *utf8_buffer) noexcept;
2272
  #if SIMDUTF_SPAN
2273
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
2274
    std::span<const char32_t> valid_utf32_input,
2275
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2276
  return convert_valid_utf32_to_utf8(
2277
      valid_utf32_input.data(), valid_utf32_input.size(),
2278
      reinterpret_cast<char *>(utf8_output.data()));
2279
}
2280
  #endif // SIMDUTF_SPAN
2281
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2282
2283
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2284
/**
2285
 * Using native endianness, convert possibly broken UTF-32 string into a UTF-16
2286
 * string.
2287
 *
2288
 * During the conversion also validation of the input string is done.
2289
 * This function is suitable to work with inputs from untrusted sources.
2290
 *
2291
 * This function is not BOM-aware.
2292
 *
2293
 * @param input         the UTF-32 string to convert
2294
 * @param length        the length of the string in 4-byte code units (char32_t)
2295
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2296
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2297
 */
2298
simdutf_warn_unused size_t convert_utf32_to_utf16(
2299
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2300
  #if SIMDUTF_SPAN
2301
simdutf_really_inline simdutf_warn_unused size_t
2302
convert_utf32_to_utf16(std::span<const char32_t> utf32_input,
2303
0
                       std::span<char16_t> utf16_output) noexcept {
2304
0
  return convert_utf32_to_utf16(utf32_input.data(), utf32_input.size(),
2305
0
                                utf16_output.data());
2306
0
}
2307
  #endif // SIMDUTF_SPAN
2308
2309
/**
2310
 * Convert possibly broken UTF-32 string into UTF-16LE string.
2311
 *
2312
 * During the conversion also validation of the input string is done.
2313
 * This function is suitable to work with inputs from untrusted sources.
2314
 *
2315
 * This function is not BOM-aware.
2316
 *
2317
 * @param input         the UTF-32 string to convert
2318
 * @param length        the length of the string in 4-byte code units (char32_t)
2319
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2320
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2321
 */
2322
simdutf_warn_unused size_t convert_utf32_to_utf16le(
2323
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2324
  #if SIMDUTF_SPAN
2325
simdutf_really_inline simdutf_warn_unused size_t
2326
convert_utf32_to_utf16le(std::span<const char32_t> utf32_input,
2327
0
                         std::span<char16_t> utf16_output) noexcept {
2328
0
  return convert_utf32_to_utf16le(utf32_input.data(), utf32_input.size(),
2329
0
                                  utf16_output.data());
2330
0
}
2331
  #endif // SIMDUTF_SPAN
2332
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2333
2334
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
2335
/**
2336
 * Convert possibly broken UTF-32 string into Latin1 string.
2337
 *
2338
 * During the conversion also validation of the input string is done.
2339
 * This function is suitable to work with inputs from untrusted sources.
2340
 *
2341
 * This function is not BOM-aware.
2342
 *
2343
 * @param input         the UTF-32 string to convert
2344
 * @param length        the length of the string in 4-byte code units (char32_t)
2345
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2346
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2347
 * or if it cannot be represented as Latin1
2348
 */
2349
simdutf_warn_unused size_t convert_utf32_to_latin1(
2350
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
2351
  #if SIMDUTF_SPAN
2352
simdutf_really_inline simdutf_warn_unused size_t convert_utf32_to_latin1(
2353
    std::span<const char32_t> utf32_input,
2354
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2355
  return convert_utf32_to_latin1(
2356
      utf32_input.data(), utf32_input.size(),
2357
      reinterpret_cast<char *>(latin1_output.data()));
2358
}
2359
  #endif // SIMDUTF_SPAN
2360
2361
/**
2362
 * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
2363
 * If the string cannot be represented as Latin1, an error is returned.
2364
 *
2365
 * During the conversion also validation of the input string is done.
2366
 * This function is suitable to work with inputs from untrusted sources.
2367
 *
2368
 * This function is not BOM-aware.
2369
 *
2370
 * @param input         the UTF-32 string to convert
2371
 * @param length        the length of the string in 4-byte code units (char32_t)
2372
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2373
 * @return a result pair struct (of type simdutf::result containing the two
2374
 * fields error and count) with an error code and either position of the error
2375
 * (in the input in code units) if any, or the number of char written if
2376
 * successful.
2377
 */
2378
simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
2379
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
2380
  #if SIMDUTF_SPAN
2381
simdutf_really_inline simdutf_warn_unused result
2382
convert_utf32_to_latin1_with_errors(
2383
    std::span<const char32_t> utf32_input,
2384
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2385
  return convert_utf32_to_latin1_with_errors(
2386
      utf32_input.data(), utf32_input.size(),
2387
      reinterpret_cast<char *>(latin1_output.data()));
2388
}
2389
  #endif // SIMDUTF_SPAN
2390
2391
/**
2392
 * Convert valid UTF-32 string into Latin1 string.
2393
 *
2394
 * This function assumes that the input string is valid UTF-32 and that it can
2395
 * be represented as Latin1. If you violate this assumption, the result is
2396
 * implementation defined and may include system-dependent behavior such as
2397
 * crashes.
2398
 *
2399
 * This function is for expert users only and not part of our public API. Use
2400
 * convert_utf32_to_latin1 instead. The function may be removed from the library
2401
 * in the future.
2402
 *
2403
 * This function is not BOM-aware.
2404
 *
2405
 * @param input         the UTF-32 string to convert
2406
 * @param length        the length of the string in 4-byte code units (char32_t)
2407
 * @param latin1_buffer   the pointer to a buffer that can hold the conversion
2408
 * result
2409
 * @return number of written code units; 0 if conversion is not possible
2410
 */
2411
simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
2412
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
2413
  #if SIMDUTF_SPAN
2414
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
2415
    std::span<const char32_t> valid_utf32_input,
2416
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2417
  return convert_valid_utf32_to_latin1(
2418
      valid_utf32_input.data(), valid_utf32_input.size(),
2419
      reinterpret_cast<char *>(latin1_output.data()));
2420
}
2421
  #endif // SIMDUTF_SPAN
2422
2423
/**
2424
 * Compute the number of bytes that this UTF-32 string would require in Latin1
2425
 * format.
2426
 *
2427
 * This function does not validate the input. It is acceptable to pass invalid
2428
 * UTF-32 strings but in such cases the result is implementation defined.
2429
 *
2430
 * This function is not BOM-aware.
2431
 *
2432
 * @param length        the length of the string in 4-byte code units (char32_t)
2433
 * @return the number of bytes required to encode the UTF-32 string as Latin1
2434
 */
2435
simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) noexcept;
2436
2437
/**
2438
 * Compute the number of bytes that this Latin1 string would require in UTF-32
2439
 * format.
2440
 *
2441
 * @param length        the length of the string in Latin1 code units (char)
2442
 * @return the length of the string in 4-byte code units (char32_t) required to
2443
 * encode the Latin1 string as UTF-32
2444
 */
2445
simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) noexcept;
2446
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
2447
2448
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2449
/**
2450
 * Convert possibly broken UTF-32 string into UTF-16BE string.
2451
 *
2452
 * During the conversion also validation of the input string is done.
2453
 * This function is suitable to work with inputs from untrusted sources.
2454
 *
2455
 * This function is not BOM-aware.
2456
 *
2457
 * @param input         the UTF-32 string to convert
2458
 * @param length        the length of the string in 4-byte code units (char32_t)
2459
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2460
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2461
 */
2462
simdutf_warn_unused size_t convert_utf32_to_utf16be(
2463
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2464
  #if SIMDUTF_SPAN
2465
simdutf_really_inline simdutf_warn_unused size_t
2466
convert_utf32_to_utf16be(std::span<const char32_t> utf32_input,
2467
0
                         std::span<char16_t> utf16_output) noexcept {
2468
0
  return convert_utf32_to_utf16be(utf32_input.data(), utf32_input.size(),
2469
0
                                  utf16_output.data());
2470
0
}
2471
  #endif // SIMDUTF_SPAN
2472
2473
/**
2474
 * Using native endianness, convert possibly broken UTF-32 string into UTF-16
2475
 * string and stop on error.
2476
 *
2477
 * During the conversion also validation of the input string is done.
2478
 * This function is suitable to work with inputs from untrusted sources.
2479
 *
2480
 * This function is not BOM-aware.
2481
 *
2482
 * @param input         the UTF-32 string to convert
2483
 * @param length        the length of the string in 4-byte code units (char32_t)
2484
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2485
 * @return a result pair struct (of type simdutf::result containing the two
2486
 * fields error and count) with an error code and either position of the error
2487
 * (in the input in code units) if any, or the number of char16_t written if
2488
 * successful.
2489
 */
2490
simdutf_warn_unused result convert_utf32_to_utf16_with_errors(
2491
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2492
  #if SIMDUTF_SPAN
2493
simdutf_really_inline simdutf_warn_unused result
2494
convert_utf32_to_utf16_with_errors(std::span<const char32_t> utf32_input,
2495
0
                                   std::span<char16_t> utf16_output) noexcept {
2496
0
  return convert_utf32_to_utf16_with_errors(
2497
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
2498
0
}
2499
  #endif // SIMDUTF_SPAN
2500
2501
/**
2502
 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
2503
 *
2504
 * During the conversion also validation of the input string is done.
2505
 * This function is suitable to work with inputs from untrusted sources.
2506
 *
2507
 * This function is not BOM-aware.
2508
 *
2509
 * @param input         the UTF-32 string to convert
2510
 * @param length        the length of the string in 4-byte code units (char32_t)
2511
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2512
 * @return a result pair struct (of type simdutf::result containing the two
2513
 * fields error and count) with an error code and either position of the error
2514
 * (in the input in code units) if any, or the number of char16_t written if
2515
 * successful.
2516
 */
2517
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
2518
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2519
  #if SIMDUTF_SPAN
2520
simdutf_really_inline simdutf_warn_unused result
2521
convert_utf32_to_utf16le_with_errors(
2522
    std::span<const char32_t> utf32_input,
2523
0
    std::span<char16_t> utf16_output) noexcept {
2524
0
  return convert_utf32_to_utf16le_with_errors(
2525
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
2526
0
}
2527
  #endif // SIMDUTF_SPAN
2528
2529
/**
2530
 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
2531
 *
2532
 * During the conversion also validation of the input string is done.
2533
 * This function is suitable to work with inputs from untrusted sources.
2534
 *
2535
 * This function is not BOM-aware.
2536
 *
2537
 * @param input         the UTF-32 string to convert
2538
 * @param length        the length of the string in 4-byte code units (char32_t)
2539
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2540
 * @return a result pair struct (of type simdutf::result containing the two
2541
 * fields error and count) with an error code and either position of the error
2542
 * (in the input in code units) if any, or the number of char16_t written if
2543
 * successful.
2544
 */
2545
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
2546
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2547
  #if SIMDUTF_SPAN
2548
simdutf_really_inline simdutf_warn_unused result
2549
convert_utf32_to_utf16be_with_errors(
2550
    std::span<const char32_t> utf32_input,
2551
0
    std::span<char16_t> utf16_output) noexcept {
2552
0
  return convert_utf32_to_utf16be_with_errors(
2553
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
2554
0
}
2555
  #endif // SIMDUTF_SPAN
2556
2557
/**
2558
 * Using native endianness, convert valid UTF-32 string into a UTF-16 string.
2559
 *
2560
 * This function assumes that the input string is valid UTF-32.
2561
 *
2562
 * This function is not BOM-aware.
2563
 *
2564
 * @param input         the UTF-32 string to convert
2565
 * @param length        the length of the string in 4-byte code units (char32_t)
2566
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
2567
 * result
2568
 * @return number of written code units; 0 if conversion is not possible
2569
 */
2570
simdutf_warn_unused size_t convert_valid_utf32_to_utf16(
2571
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2572
  #if SIMDUTF_SPAN
2573
simdutf_really_inline simdutf_warn_unused size_t
2574
convert_valid_utf32_to_utf16(std::span<const char32_t> valid_utf32_input,
2575
0
                             std::span<char16_t> utf16_output) noexcept {
2576
0
  return convert_valid_utf32_to_utf16(
2577
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
2578
0
}
2579
  #endif // SIMDUTF_SPAN
2580
2581
/**
2582
 * Convert valid UTF-32 string into UTF-16LE string.
2583
 *
2584
 * This function assumes that the input string is valid UTF-32.
2585
 *
2586
 * This function is not BOM-aware.
2587
 *
2588
 * @param input         the UTF-32 string to convert
2589
 * @param length        the length of the string in 4-byte code units (char32_t)
2590
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
2591
 * result
2592
 * @return number of written code units; 0 if conversion is not possible
2593
 */
2594
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
2595
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2596
  #if SIMDUTF_SPAN
2597
simdutf_really_inline simdutf_warn_unused size_t
2598
convert_valid_utf32_to_utf16le(std::span<const char32_t> valid_utf32_input,
2599
0
                               std::span<char16_t> utf16_output) noexcept {
2600
0
  return convert_valid_utf32_to_utf16le(
2601
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
2602
0
}
2603
  #endif // SIMDUTF_SPAN
2604
2605
/**
2606
 * Convert valid UTF-32 string into UTF-16BE string.
2607
 *
2608
 * This function assumes that the input string is valid UTF-32.
2609
 *
2610
 * This function is not BOM-aware.
2611
 *
2612
 * @param input         the UTF-32 string to convert
2613
 * @param length        the length of the string in 4-byte code units (char32_t)
2614
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
2615
 * result
2616
 * @return number of written code units; 0 if conversion is not possible
2617
 */
2618
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
2619
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2620
  #if SIMDUTF_SPAN
2621
simdutf_really_inline simdutf_warn_unused size_t
2622
convert_valid_utf32_to_utf16be(std::span<const char32_t> valid_utf32_input,
2623
0
                               std::span<char16_t> utf16_output) noexcept {
2624
0
  return convert_valid_utf32_to_utf16be(
2625
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
2626
0
}
2627
  #endif // SIMDUTF_SPAN
2628
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2629
2630
#if SIMDUTF_FEATURE_UTF16
2631
/**
2632
 * Change the endianness of the input. Can be used to go from UTF-16LE to
2633
 * UTF-16BE or from UTF-16BE to UTF-16LE.
2634
 *
2635
 * This function does not validate the input.
2636
 *
2637
 * This function is not BOM-aware.
2638
 *
2639
 * @param input         the UTF-16 string to process
2640
 * @param length        the length of the string in 2-byte code units (char16_t)
2641
 * @param output        the pointer to a buffer that can hold the conversion
2642
 * result
2643
 */
2644
void change_endianness_utf16(const char16_t *input, size_t length,
2645
                             char16_t *output) noexcept;
2646
  #if SIMDUTF_SPAN
2647
simdutf_really_inline void
2648
change_endianness_utf16(std::span<const char16_t> utf16_input,
2649
0
                        std::span<char16_t> utf16_output) noexcept {
2650
0
  return change_endianness_utf16(utf16_input.data(), utf16_input.size(),
2651
0
                                 utf16_output.data());
2652
0
}
2653
  #endif // SIMDUTF_SPAN
2654
#endif   // SIMDUTF_FEATURE_UTF16
2655
2656
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2657
/**
2658
 * Compute the number of bytes that this UTF-32 string would require in UTF-8
2659
 * format.
2660
 *
2661
 * This function does not validate the input. It is acceptable to pass invalid
2662
 * UTF-32 strings but in such cases the result is implementation defined.
2663
 *
2664
 * @param input         the UTF-32 string to convert
2665
 * @param length        the length of the string in 4-byte code units (char32_t)
2666
 * @return the number of bytes required to encode the UTF-32 string as UTF-8
2667
 */
2668
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input,
2669
                                                  size_t length) noexcept;
2670
  #if SIMDUTF_SPAN
2671
simdutf_really_inline simdutf_warn_unused size_t
2672
0
utf8_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
2673
0
  return utf8_length_from_utf32(valid_utf32_input.data(),
2674
0
                                valid_utf32_input.size());
2675
0
}
2676
  #endif // SIMDUTF_SPAN
2677
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2678
2679
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2680
/**
2681
 * Compute the number of two-byte code units that this UTF-32 string would
2682
 * require in UTF-16 format.
2683
 *
2684
 * This function does not validate the input. It is acceptable to pass invalid
2685
 * UTF-32 strings but in such cases the result is implementation defined.
2686
 *
2687
 * @param input         the UTF-32 string to convert
2688
 * @param length        the length of the string in 4-byte code units (char32_t)
2689
 * @return number of char16_t code units needed to encode the string as UTF-16
2690
 */
2691
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input,
2692
                                                   size_t length) noexcept;
2693
  #if SIMDUTF_SPAN
2694
simdutf_really_inline simdutf_warn_unused size_t
2695
0
utf16_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
2696
0
  return utf16_length_from_utf32(valid_utf32_input.data(),
2697
0
                                 valid_utf32_input.size());
2698
0
}
2699
  #endif // SIMDUTF_SPAN
2700
2701
/**
2702
 * Using native endianness; compute the number of code units this UTF-16
2703
 * string would require in UTF-32 format.
2704
 *
2705
 * This function is equivalent to count_utf16.
2706
 *
2707
 * This function does not validate the input. It is acceptable to pass invalid
2708
 * UTF-16 strings but in such cases the result is implementation defined.
2709
 *
2710
 * This function is not BOM-aware.
2711
 *
2712
 * @param input         the UTF-16 string to convert
2713
 * @param length        the length of the string in 2-byte code units (char16_t)
2714
 * @return number of char32_t code units needed to encode the string as UTF-32
2715
 */
2716
simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input,
2717
                                                   size_t length) noexcept;
2718
  #if SIMDUTF_SPAN
2719
simdutf_really_inline simdutf_warn_unused size_t
2720
0
utf32_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2721
0
  return utf32_length_from_utf16(valid_utf16_input.data(),
2722
0
                                 valid_utf16_input.size());
2723
0
}
2724
  #endif // SIMDUTF_SPAN
2725
2726
/**
2727
 * Compute the number of code units this UTF-16LE string would require in UTF-32
2728
 * format.
2729
 *
2730
 * This function is equivalent to count_utf16le.
2731
 *
2732
 * This function does not validate the input. It is acceptable to pass invalid
2733
 * UTF-16 strings but in such cases the result is implementation defined.
2734
 *
2735
 * This function is not BOM-aware.
2736
 *
2737
 * @param input         the UTF-16LE string to convert
2738
 * @param length        the length of the string in 2-byte code units (char16_t)
2739
 * @return number of char32_t code units needed to encode the string as UTF-32
2740
 */
2741
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input,
2742
                                                     size_t length) noexcept;
2743
  #if SIMDUTF_SPAN
2744
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf16le(
2745
0
    std::span<const char16_t> valid_utf16_input) noexcept {
2746
0
  return utf32_length_from_utf16le(valid_utf16_input.data(),
2747
0
                                   valid_utf16_input.size());
2748
0
}
2749
  #endif // SIMDUTF_SPAN
2750
2751
/**
2752
 * Compute the number of code units this UTF-16BE string would require in UTF-32
2753
 * format.
2754
 *
2755
 * This function is equivalent to count_utf16be.
2756
 *
2757
 * This function does not validate the input. It is acceptable to pass invalid
2758
 * UTF-16 strings but in such cases the result is implementation defined.
2759
 *
2760
 * This function is not BOM-aware.
2761
 *
2762
 * @param input         the UTF-16BE string to convert
2763
 * @param length        the length of the string in 2-byte code units (char16_t)
2764
 * @return number of char32_t code units needed to encode the string as UTF-32
2765
 */
2766
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input,
2767
                                                     size_t length) noexcept;
2768
  #if SIMDUTF_SPAN
2769
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf16be(
2770
0
    std::span<const char16_t> valid_utf16_input) noexcept {
2771
0
  return utf32_length_from_utf16be(valid_utf16_input.data(),
2772
0
                                   valid_utf16_input.size());
2773
0
}
2774
  #endif // SIMDUTF_SPAN
2775
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2776
2777
#if SIMDUTF_FEATURE_UTF16
2778
/**
2779
 * Count the number of code points (characters) in the string assuming that
2780
 * it is valid.
2781
 *
2782
 * This function assumes that the input string is valid UTF-16 (native
2783
 * endianness). It is acceptable to pass invalid UTF-16 strings but in such
2784
 * cases the result is implementation defined.
2785
 *
2786
 * This function is not BOM-aware.
2787
 *
2788
 * @param input         the UTF-16 string to process
2789
 * @param length        the length of the string in 2-byte code units (char16_t)
2790
 * @return number of code points
2791
 */
2792
simdutf_warn_unused size_t count_utf16(const char16_t *input,
2793
                                       size_t length) noexcept;
2794
  #if SIMDUTF_SPAN
2795
/// Span overload: forwards the span's data pointer and element count to
/// count_utf16(const char16_t *, size_t).
simdutf_really_inline simdutf_warn_unused size_t
count_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return count_utf16(units, unit_count);
}
2799
  #endif // SIMDUTF_SPAN
2800
2801
/**
2802
 * Count the number of code points (characters) in the string assuming that
2803
 * it is valid.
2804
 *
2805
 * This function assumes that the input string is valid UTF-16LE.
2806
 * It is acceptable to pass invalid UTF-16 strings but in such cases
2807
 * the result is implementation defined.
2808
 *
2809
 * This function is not BOM-aware.
2810
 *
2811
 * @param input         the UTF-16LE string to process
2812
 * @param length        the length of the string in 2-byte code units (char16_t)
2813
 * @return number of code points
2814
 */
2815
simdutf_warn_unused size_t count_utf16le(const char16_t *input,
2816
                                         size_t length) noexcept;
2817
  #if SIMDUTF_SPAN
2818
/// Span overload: forwards the span's data pointer and element count to
/// count_utf16le(const char16_t *, size_t).
simdutf_really_inline simdutf_warn_unused size_t
count_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return count_utf16le(units, unit_count);
}
2822
  #endif // SIMDUTF_SPAN
2823
2824
/**
2825
 * Count the number of code points (characters) in the string assuming that
2826
 * it is valid.
2827
 *
2828
 * This function assumes that the input string is valid UTF-16BE.
2829
 * It is acceptable to pass invalid UTF-16 strings but in such cases
2830
 * the result is implementation defined.
2831
 *
2832
 * This function is not BOM-aware.
2833
 *
2834
 * @param input         the UTF-16BE string to process
2835
 * @param length        the length of the string in 2-byte code units (char16_t)
2836
 * @return number of code points
2837
 */
2838
simdutf_warn_unused size_t count_utf16be(const char16_t *input,
2839
                                         size_t length) noexcept;
2840
  #if SIMDUTF_SPAN
2841
/// Span overload: forwards the span's data pointer and element count to
/// count_utf16be(const char16_t *, size_t).
simdutf_really_inline simdutf_warn_unused size_t
count_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return count_utf16be(units, unit_count);
}
2845
  #endif // SIMDUTF_SPAN
2846
#endif   // SIMDUTF_FEATURE_UTF16
2847
2848
#if SIMDUTF_FEATURE_UTF8
2849
/**
2850
 * Count the number of code points (characters) in the string assuming that
2851
 * it is valid.
2852
 *
2853
 * This function assumes that the input string is valid UTF-8.
2854
 * It is acceptable to pass invalid UTF-8 strings but in such cases
2855
 * the result is implementation defined.
2856
 *
2857
 * @param input         the UTF-8 string to process
2858
 * @param length        the length of the string in bytes
2859
 * @return number of code points
2860
 */
2861
simdutf_warn_unused size_t count_utf8(const char *input,
2862
                                      size_t length) noexcept;
2863
  #if SIMDUTF_SPAN
2864
/// Span overload for any byte-like element type: reinterprets the span's data
/// as const char * and forwards it, with the span's size, to
/// count_utf8(const char *, size_t).
simdutf_really_inline simdutf_warn_unused size_t count_utf8(
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
  const char *const bytes =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t byte_count = valid_utf8_input.size();
  return count_utf8(bytes, byte_count);
}
2869
  #endif // SIMDUTF_SPAN
2870
2871
/**
2872
 * Given a valid UTF-8 string having a possibly truncated last character,
2873
 * this function checks the end of string. If the last character is truncated
2874
 * (or partial), then it returns a shorter length (shorter by 1 to 3 bytes) so
2875
 * that the short UTF-8 strings only contain complete characters. If there is no
2876
 * truncated character, the original length is returned.
2877
 *
2878
 * This function assumes that the input string is valid UTF-8, but possibly
2879
 * truncated.
2880
 *
2881
 * @param input         the UTF-8 string to process
2882
 * @param length        the length of the string in bytes
2883
 * @return the length of the string in bytes, possibly shorter by 1 to 3 bytes
2884
 */
2885
simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length);
2886
  #if SIMDUTF_SPAN
2887
/// Span overload for any byte-like element type: reinterprets the span's data
/// as const char * and forwards it, with the span's size, to
/// trim_partial_utf8(const char *, size_t).
simdutf_really_inline simdutf_warn_unused size_t trim_partial_utf8(
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
  const char *const bytes =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t byte_count = valid_utf8_input.size();
  return trim_partial_utf8(bytes, byte_count);
}
2893
  #endif // SIMDUTF_SPAN
2894
#endif   // SIMDUTF_FEATURE_UTF8
2895
2896
#if SIMDUTF_FEATURE_UTF16
2897
/**
2898
 * Given a valid UTF-16BE string having a possibly truncated last character,
2899
 * this function checks the end of string. If the last character is truncated
2900
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
2901
 * the short UTF-16BE strings only contain complete characters. If there is no
2902
 * truncated character, the original length is returned.
2903
 *
2904
 * This function assumes that the input string is valid UTF-16BE, but possibly
2905
 * truncated.
2906
 *
2907
 * @param input         the UTF-16BE string to process
2908
 * @param length        the length of the string in 2-byte code units (char16_t)
2909
 * @return the length of the string in code units, possibly shorter by 1 unit
2910
 */
2911
simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input,
2912
                                                size_t length);
2913
  #if SIMDUTF_SPAN
2914
/// Span overload: forwards the span's data pointer and element count to
/// trim_partial_utf16be(const char16_t *, size_t).
simdutf_really_inline simdutf_warn_unused size_t
trim_partial_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return trim_partial_utf16be(units, unit_count);
}
2919
  #endif // SIMDUTF_SPAN
2920
2921
/**
2922
 * Given a valid UTF-16LE string having a possibly truncated last character,
2923
 * this function checks the end of string. If the last character is truncated
2924
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
2925
 * the short UTF-16LE strings only contain complete characters. If there is no
2926
 * truncated character, the original length is returned.
2927
 *
2928
 * This function assumes that the input string is valid UTF-16LE, but possibly
2929
 * truncated.
2930
 *
2931
 * @param input         the UTF-16LE string to process
2932
 * @param length        the length of the string in 2-byte code units (char16_t)
2933
 * @return the length of the string in code units, possibly shorter by 1 unit
2934
 */
2935
simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input,
2936
                                                size_t length);
2937
  #if SIMDUTF_SPAN
2938
/// Span overload: forwards the span's data pointer and element count to
/// trim_partial_utf16le(const char16_t *, size_t).
simdutf_really_inline simdutf_warn_unused size_t
trim_partial_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return trim_partial_utf16le(units, unit_count);
}
2943
  #endif // SIMDUTF_SPAN
2944
2945
/**
2946
 * Given a valid UTF-16 string having a possibly truncated last character,
2947
 * this function checks the end of string. If the last character is truncated
2948
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
2949
 * the short UTF-16 strings only contain complete characters. If there is no
2950
 * truncated character, the original length is returned.
2951
 *
2952
 * This function assumes that the input string is valid UTF-16, but possibly
2953
 * truncated. We use the native endianness.
2954
 *
2955
 * @param input         the UTF-16 string to process
2956
 * @param length        the length of the string in 2-byte code units (char16_t)
2957
 * @return the length of the string in code units, possibly shorter by 1 unit
2958
 */
2959
simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input,
2960
                                              size_t length);
2961
  #if SIMDUTF_SPAN
2962
/// Span overload: forwards the span's data pointer and element count to
/// trim_partial_utf16(const char16_t *, size_t).
simdutf_really_inline simdutf_warn_unused size_t
trim_partial_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return trim_partial_utf16(units, unit_count);
}
2966
  #endif // SIMDUTF_SPAN
2967
#endif   // SIMDUTF_FEATURE_UTF16
2968
2969
#if SIMDUTF_FEATURE_BASE64 || SIMDUTF_FEATURE_UTF16 ||                         \
2970
    SIMDUTF_FEATURE_DETECT_ENCODING
2971
  #ifndef SIMDUTF_NEED_TRAILING_ZEROES
2972
    #define SIMDUTF_NEED_TRAILING_ZEROES 1
2973
  #endif
2974
#endif // SIMDUTF_FEATURE_BASE64 || SIMDUTF_FEATURE_UTF16 ||
2975
       // SIMDUTF_FEATURE_DETECT_ENCODING
2976
2977
#if SIMDUTF_FEATURE_BASE64
2978
// base64_options are used to specify the base64 encoding options.
2979
// ASCII spaces are ' ', '\t', '\n', '\r', '\f'
2980
// garbage characters are characters that are not part of the base64 alphabet
2981
// nor ASCII spaces.
2982
// Modifier for base64_default and base64_url: OR-ing it into either one
// yields the variant with the opposite padding behaviour.
constexpr uint64_t base64_reverse_padding = 2;

enum base64_options : uint64_t {
  // standard base64 format (with padding)
  base64_default = 0,
  // base64url format (no padding)
  base64_url = 1,
  // standard base64 format without padding
  base64_default_no_padding = base64_default | base64_reverse_padding,
  // base64url with padding
  base64_url_with_padding = base64_url | base64_reverse_padding,
  // standard base64 format accepting garbage characters, the input stops
  // with the first '=' if any
  base64_default_accept_garbage = 4,
  // base64url format accepting garbage characters, the input stops with the
  // first '=' if any
  base64_url_accept_garbage = 5,
  // standard/base64url hybrid format (only meaningful for decoding!)
  base64_default_or_url = 8,
  // standard/base64url hybrid format accepting garbage characters (only
  // meaningful for decoding!), the input stops with the first '=' if any
  base64_default_or_url_accept_garbage = 12,
};
3005
3006
  #if SIMDUTF_CPLUSPLUS17
3007
0
/**
 * Return the identifier of a base64_options enumerator as a string view.
 *
 * @param options       the base64 option to name
 * @return the enumerator's identifier, or "<unknown>" when the value matches
 * none of the enumerators
 */
inline std::string_view to_string(base64_options options) {
  switch (options) {
  case base64_default:
    return "base64_default";
  case base64_url:
    return "base64_url";
  // Value 2 (base64_default | base64_reverse_padding) is the enumerator
  // base64_default_no_padding. base64_reverse_padding is only the modifier
  // constant, not a member of base64_options, so report the enumerator name
  // like every other case does.
  case base64_default_no_padding:
    return "base64_default_no_padding";
  case base64_url_with_padding:
    return "base64_url_with_padding";
  case base64_default_accept_garbage:
    return "base64_default_accept_garbage";
  case base64_url_accept_garbage:
    return "base64_url_accept_garbage";
  case base64_default_or_url:
    return "base64_default_or_url";
  case base64_default_or_url_accept_garbage:
    return "base64_default_or_url_accept_garbage";
  }
  // Reached only for values that are not one of the enumerators above.
  return "<unknown>";
}
3028
  #endif // SIMDUTF_CPLUSPLUS17
3029
3030
// last_chunk_handling_options are used to specify the handling of the last
3031
// chunk in base64 decoding.
3032
// https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3033
enum last_chunk_handling_options : uint64_t {
  // standard base64 format, decode partial final chunk
  loose = 0,
  // error when the last chunk is partial, 2 or 3 chars, and unpadded, or
  // non-zero bit padding
  strict = 1,
  // if the last chunk is partial, ignore it (no error)
  stop_before_partial = 2,
  // only decode full blocks (4 base64 characters, no padding)
  only_full_chunks = 3
};
3042
3043
/// Returns true exactly for stop_before_partial and only_full_chunks, false
/// for every other value.
inline bool is_partial(last_chunk_handling_options options) {
  switch (options) {
  case stop_before_partial:
  case only_full_chunks:
    return true;
  default:
    return false;
  }
}
3046
3047
  #if SIMDUTF_CPLUSPLUS17
3048
0
/// Returns the identifier of a last_chunk_handling_options enumerator, or
/// "<unknown>" when the value matches none of them.
inline std::string_view to_string(last_chunk_handling_options options) {
  if (options == loose) {
    return "loose";
  }
  if (options == strict) {
    return "strict";
  }
  if (options == stop_before_partial) {
    return "stop_before_partial";
  }
  if (options == only_full_chunks) {
    return "only_full_chunks";
  }
  return "<unknown>";
}
3061
  #endif
3062
3063
/**
3064
 * Provide the maximal binary length in bytes given the base64 input.
3065
 * As long as the input does not contain ignorable characters (e.g., ASCII
3066
 * spaces or linefeed characters), the result is exact. In particular, the
3067
 * function checks for padding characters.
3068
 *
3069
 * The function is fast (constant time). It checks up to two characters at
3070
 * the end of the string. The input is not otherwise validated or read.
3071
 *
3072
 * @param input         the base64 input to process
3073
 * @param length        the length of the base64 input in bytes
3074
 * @return maximum number of binary bytes
3075
 */
3076
simdutf_warn_unused size_t
3077
maximal_binary_length_from_base64(const char *input, size_t length) noexcept;
3078
  #if SIMDUTF_SPAN
3079
/// Span overload for any byte-like element type: reinterprets the span's data
/// as const char * and forwards it, with the span's size, to
/// maximal_binary_length_from_base64(const char *, size_t).
simdutf_really_inline simdutf_warn_unused size_t
maximal_binary_length_from_base64(
    const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const base64_chars = reinterpret_cast<const char *>(input.data());
  const size_t char_count = input.size();
  return maximal_binary_length_from_base64(base64_chars, char_count);
}
3085
  #endif // SIMDUTF_SPAN
3086
3087
/**
3088
 * Provide the maximal binary length in bytes given the base64 input.
3089
 * As long as the input does not contain ignorable characters (e.g., ASCII
3090
 * spaces or linefeed characters), the result is exact. In particular, the
3091
 * function checks for padding characters.
3092
 *
3093
 * The function is fast (constant time). It checks up to two characters at
3094
 * the end of the string. The input is not otherwise validated or read.
3095
 *
3096
 * @param input         the base64 input to process, in ASCII stored as 16-bit
3097
 * units
3098
 * @param length        the length of the base64 input in 16-bit units
3099
 * @return maximal number of binary bytes
3100
 */
3101
simdutf_warn_unused size_t maximal_binary_length_from_base64(
3102
    const char16_t *input, size_t length) noexcept;
3103
  #if SIMDUTF_SPAN
3104
/// Span overload: forwards the span's data pointer and element count to
/// maximal_binary_length_from_base64(const char16_t *, size_t).
simdutf_really_inline simdutf_warn_unused size_t
maximal_binary_length_from_base64(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return maximal_binary_length_from_base64(units, unit_count);
}
3108
  #endif // SIMDUTF_SPAN
3109
3110
/**
3111
 * Convert a base64 input to a binary output.
3112
 *
3113
 * This function follows the WHATWG forgiving-base64 format, which means that it
3114
 * will ignore any ASCII spaces in the input. You may provide a padded input
3115
 * (with one or two equal signs at the end) or an unpadded input (without any
3116
 * equal signs at the end).
3117
 *
3118
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
3119
 *
3120
 * This function will fail in case of invalid input. When last_chunk_options =
3121
 * loose, there are two possible reasons for failure: the input contains a
3122
 * number of base64 characters that when divided by 4, leaves a single remainder
3123
 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
3124
 * not a valid base64 character (INVALID_BASE64_CHARACTER).
3125
 *
3126
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
3127
 * input where the invalid character was found. When the error is
3128
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
3129
 *
3130
 * The default option (simdutf::base64_default) expects the characters `+` and
3131
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
3132
 * characters `-` and `_` as part of its alphabet.
3133
 *
3134
 * The padding (`=`) is validated if present. There may be at most two padding
3135
 * characters at the end of the input. If there are any padding characters, the
3136
 * total number of characters (excluding spaces but including padding
3137
 * characters) must be divisible by four.
3138
 *
3139
 * You should call this function with a buffer that is at least
3140
 * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
3141
 * provide that much space, the function may cause a buffer overflow.
3142
 *
3143
 * Advanced users may want to tailor how the last chunk is handled. By default,
3144
 * we use a loose (forgiving) approach but we also support a strict approach
3145
 * as well as a stop_before_partial approach, as per the following proposal:
3146
 *
3147
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3148
 *
3149
 * @param input         the base64 string to process
3150
 * @param length        the length of the string in bytes
3151
 * @param output        the pointer to a buffer that can hold the conversion
3152
 * result (should be at least maximal_binary_length_from_base64(input, length)
3153
 * bytes long).
3154
 * @param options       the base64 options to use, usually base64_default or
3155
 * base64_url, and base64_default by default.
3156
 * @param last_chunk_options the last chunk handling options,
3157
 * last_chunk_handling_options::loose by default
3158
 * but can also be last_chunk_handling_options::strict or
3159
 * last_chunk_handling_options::stop_before_partial.
3160
 * @return a result pair struct (of type simdutf::result containing the two
3161
 * fields error and count) with an error code and either position of the error
3162
 * (in the input in bytes) if any, or the number of bytes written if successful.
3163
 */
3164
simdutf_warn_unused result base64_to_binary(
3165
    const char *input, size_t length, char *output,
3166
    base64_options options = base64_default,
3167
    last_chunk_handling_options last_chunk_options = loose) noexcept;
3168
  #if SIMDUTF_SPAN
3169
/// Span overload for byte-like input and output: reinterprets both data
/// pointers as char pointers and forwards to the pointer-based
/// base64_to_binary. Only binary_output.data() is passed on; the size of
/// binary_output is not consulted here.
simdutf_really_inline simdutf_warn_unused result base64_to_binary(
    const detail::input_span_of_byte_like auto &input,
    detail::output_span_of_byte_like auto &&binary_output,
    base64_options options = base64_default,
    last_chunk_handling_options last_chunk_options = loose) noexcept {
  const char *const base64_chars = reinterpret_cast<const char *>(input.data());
  const size_t char_count = input.size();
  char *const destination = reinterpret_cast<char *>(binary_output.data());
  return base64_to_binary(base64_chars, char_count, destination, options,
                          last_chunk_options);
}
3179
  #endif // SIMDUTF_SPAN
3180
3181
/**
3182
 * Provide the base64 length in bytes given the length of a binary input.
3183
 *
3184
 * @param length        the length of the input in bytes
 * @param options       the base64 options to use, is base64_default by default.
3185
 * @return number of base64 bytes
3186
 */
3187
simdutf_warn_unused size_t base64_length_from_binary(
3188
    size_t length, base64_options options = base64_default) noexcept;
3189
3190
/**
3191
 * Provide the base64 length in bytes given the length of a binary input,
3192
 * taking into account line breaks.
3193
 *
3194
 * @param length        the length of the input in bytes
3195
 * @param options       the base64 options to use, is base64_default by default.
 * @param line_length   the length of lines, must be at least 4 (otherwise it is
3196
 * interpreted as 4).
3197
 * @return number of base64 bytes
3198
 */
3199
simdutf_warn_unused size_t base64_length_from_binary_with_lines(
3200
    size_t length, base64_options options = base64_default,
3201
    size_t line_length = default_line_length) noexcept;
3202
3203
/**
3204
 * Convert a binary input to a base64 output.
3205
 *
3206
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
3207
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
3208
 * output to ensure that the output length is a multiple of four.
3209
 *
3210
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
3211
 * of its alphabet. No padding is added at the end of the output.
3212
 *
3213
 * This function always succeeds.
3214
 *
3215
 * @param input         the binary to process
3216
 * @param length        the length of the input in bytes
3217
 * @param output        the pointer to a buffer that can hold the conversion
3218
 * result (should be at least base64_length_from_binary(length) bytes long)
3219
 * @param options       the base64 options to use, can be base64_default or
3220
 * base64_url, is base64_default by default.
3221
 * @return number of written bytes, will be equal to
3222
 * base64_length_from_binary(length, options)
3223
 */
3224
size_t binary_to_base64(const char *input, size_t length, char *output,
3225
                        base64_options options = base64_default) noexcept;
3226
  #if SIMDUTF_SPAN
3227
/// Span overload for byte-like input and output: reinterprets both data
/// pointers as char pointers and forwards to the pointer-based
/// binary_to_base64. Only binary_output.data() is passed on; the size of
/// binary_output is not consulted here.
simdutf_really_inline simdutf_warn_unused size_t
binary_to_base64(const detail::input_span_of_byte_like auto &input,
                 detail::output_span_of_byte_like auto &&binary_output,
                 base64_options options = base64_default) noexcept {
  const char *const source = reinterpret_cast<const char *>(input.data());
  const size_t source_size = input.size();
  char *const destination = reinterpret_cast<char *>(binary_output.data());
  return binary_to_base64(source, source_size, destination, options);
}
3235
  #endif // SIMDUTF_SPAN
3236
3237
/**
3238
 * Convert a binary input to a base64 output with line breaks.
3239
 *
3240
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
3241
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
3242
 * output to ensure that the output length is a multiple of four.
3243
 *
3244
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
3245
 * of its alphabet. No padding is added at the end of the output.
3246
 *
3247
 * This function always succeeds.
3248
 *
3249
 * @param input         the binary to process
3250
 * @param length        the length of the input in bytes
3251
 * @param output        the pointer to a buffer that can hold the conversion
3252
 * result (should be at least base64_length_from_binary_with_lines(length,
3253
 * options, line_length) bytes long)
3254
 * @param line_length   the length of lines, must be at least 4 (otherwise it is
3255
 * interpreted as 4).
3256
 * @param options       the base64 options to use, can be base64_default or
3257
 * base64_url, is base64_default by default.
3258
 * @return number of written bytes, will be equal to
3259
 * base64_length_from_binary_with_lines(length, options, line_length)
3260
 */
3261
size_t
3262
binary_to_base64_with_lines(const char *input, size_t length, char *output,
3263
                            size_t line_length = simdutf::default_line_length,
3264
                            base64_options options = base64_default) noexcept;
3265
  #if SIMDUTF_SPAN
3266
/// Span overload for byte-like input and output: reinterprets both data
/// pointers as char pointers and forwards to the pointer-based
/// binary_to_base64_with_lines. Only binary_output.data() is passed on; the
/// size of binary_output is not consulted here.
simdutf_really_inline simdutf_warn_unused size_t binary_to_base64_with_lines(
    const detail::input_span_of_byte_like auto &input,
    detail::output_span_of_byte_like auto &&binary_output,
    size_t line_length = simdutf::default_line_length,
    base64_options options = base64_default) noexcept {
  const char *const source = reinterpret_cast<const char *>(input.data());
  const size_t source_size = input.size();
  char *const destination = reinterpret_cast<char *>(binary_output.data());
  return binary_to_base64_with_lines(source, source_size, destination,
                                     line_length, options);
}
3275
  #endif // SIMDUTF_SPAN
3276
3277
  #if SIMDUTF_ATOMIC_REF
3278
/**
3279
 * Convert a binary input to a base64 output, using atomic accesses.
3280
 * This function comes with a potentially significant performance
3281
 * penalty, but it may be useful in some cases where the input
3282
 * buffers are shared between threads, to avoid undefined
3283
 * behavior in case of data races.
3284
 *
3285
 * The function is for advanced users. Its main use case is
3286
 * to silence sanitizer warnings. We have no documented use case
3287
 * where this function is actually necessary in terms of practical correctness.
3288
 *
3289
 * This function is only available when simdutf is compiled with
3290
 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
3291
 * the availability of this function by checking the macro
3292
 * SIMDUTF_ATOMIC_REF.
3293
 *
3294
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
3295
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
3296
 * output to ensure that the output length is a multiple of four.
3297
 *
3298
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
3299
 * of its alphabet. No padding is added at the end of the output.
3300
 *
3301
 * This function always succeeds.
3302
 *
3303
 * This function is considered experimental. It is not tested by default
3304
 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
3305
 * It is not documented in the public API documentation (README). It is
3306
 * offered on a best effort basis. We rely on the community for further
3307
 * testing and feedback.
3308
 *
3309
 * @brief atomic_binary_to_base64
3310
 * @param input         the binary to process
3311
 * @param length        the length of the input in bytes
3312
 * @param output        the pointer to a buffer that can hold the conversion
3313
 * result (should be at least base64_length_from_binary(length) bytes long)
3314
 * @param options       the base64 options to use, can be base64_default or
3315
 * base64_url, is base64_default by default.
3316
 * @return number of written bytes, will be equal to
3317
 * base64_length_from_binary(length, options)
3318
 */
3319
size_t
3320
atomic_binary_to_base64(const char *input, size_t length, char *output,
3321
                        base64_options options = base64_default) noexcept;
3322
    #if SIMDUTF_SPAN
3323
/// Span overload for byte-like input and output: reinterprets both data
/// pointers as char pointers and forwards to the pointer-based
/// atomic_binary_to_base64. Only binary_output.data() is passed on; the size
/// of binary_output is not consulted here.
simdutf_really_inline simdutf_warn_unused size_t
atomic_binary_to_base64(const detail::input_span_of_byte_like auto &input,
                        detail::output_span_of_byte_like auto &&binary_output,
                        base64_options options = base64_default) noexcept {
  const char *const source = reinterpret_cast<const char *>(input.data());
  const size_t source_size = input.size();
  char *const destination = reinterpret_cast<char *>(binary_output.data());
  return atomic_binary_to_base64(source, source_size, destination, options);
}
3331
    #endif // SIMDUTF_SPAN
3332
  #endif   // SIMDUTF_ATOMIC_REF
3333
3334
/**
3335
 * Convert a base64 input to a binary output.
3336
 *
3337
 * This function follows the WHATWG forgiving-base64 format, which means that it
3338
 * will ignore any ASCII spaces in the input. You may provide a padded input
3339
 * (with one or two equal signs at the end) or an unpadded input (without any
3340
 * equal signs at the end).
3341
 *
3342
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
3343
 *
3344
 * This function will fail in case of invalid input. When last_chunk_options =
3345
 * loose, there are two possible reasons for failure: the input contains a
3346
 * number of base64 characters that when divided by 4, leaves a single remainder
3347
 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
3348
 * not a valid base64 character (INVALID_BASE64_CHARACTER).
3349
 *
3350
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
3351
 * input where the invalid character was found. When the error is
3352
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
3353
 *
3354
 * The default option (simdutf::base64_default) expects the characters `+` and
3355
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
3356
 * characters `-` and `_` as part of its alphabet.
3357
 *
3358
 * The padding (`=`) is validated if present. There may be at most two padding
3359
 * characters at the end of the input. If there are any padding characters, the
3360
 * total number of characters (excluding spaces but including padding
3361
 * characters) must be divisible by four.
3362
 *
3363
 * You should call this function with a buffer that is at least
3364
 * maximal_binary_length_from_base64(input, length) bytes long. If you fail
3365
 * to provide that much space, the function may cause a buffer overflow.
3366
 *
3367
 * Advanced users may want to tailor how the last chunk is handled. By default,
3368
 * we use a loose (forgiving) approach but we also support a strict approach
3369
 * as well as a stop_before_partial approach, as per the following proposal:
3370
 *
3371
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3372
 *
3373
 * @param input         the base64 string to process, in ASCII stored as 16-bit
3374
 * units
3375
 * @param length        the length of the string in 16-bit units
3376
 * @param output        the pointer to a buffer that can hold the conversion
3377
 * result (should be at least maximal_binary_length_from_base64(input, length)
3378
 * bytes long).
3379
 * @param options       the base64 options to use, can be base64_default or
3380
 * base64_url, is base64_default by default.
3381
 * @param last_chunk_options the last chunk handling options,
3382
 * last_chunk_handling_options::loose by default
3383
 * but can also be last_chunk_handling_options::strict or
3384
 * last_chunk_handling_options::stop_before_partial.
3385
 * @return a result pair struct (of type simdutf::result containing the two
3386
 * fields error and count) with an error code and position of the
3387
 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
3388
 * of bytes written if successful.
3389
 */
3390
simdutf_warn_unused result
3391
base64_to_binary(const char16_t *input, size_t length, char *output,
3392
                 base64_options options = base64_default,
3393
                 last_chunk_handling_options last_chunk_options =
3394
                     last_chunk_handling_options::loose) noexcept;
3395
  #if SIMDUTF_SPAN
3396
/// Span overload for char16_t input and byte-like output: forwards the input
/// span's data pointer and element count, and the output reinterpreted as
/// char *, to the pointer-based base64_to_binary. Only binary_output.data() is
/// passed on; the size of binary_output is not consulted here.
simdutf_really_inline simdutf_warn_unused result base64_to_binary(
    std::span<const char16_t> input,
    detail::output_span_of_byte_like auto &&binary_output,
    base64_options options = base64_default,
    last_chunk_handling_options last_chunk_options = loose) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  char *const destination = reinterpret_cast<char *>(binary_output.data());
  return base64_to_binary(units, unit_count, destination, options,
                          last_chunk_options);
}
3405
  #endif // SIMDUTF_SPAN
3406
3407
/**
3408
 * Check if a character is an ignorable base64 character.
3409
 * Checking a large input, character by character, is not computationally
3410
 * efficient.
3411
 *
3412
 * @param input         the character to check
3413
 * @param options       the base64 options to use, is base64_default by default.
3414
 * @return true if the character is an ignorable base64 character, false
3415
 * otherwise.
3416
 */
3417
simdutf_warn_unused bool
3418
base64_ignorable(char input, base64_options options = base64_default) noexcept;
3419
simdutf_warn_unused bool
3420
base64_ignorable(char16_t input,
3421
                 base64_options options = base64_default) noexcept;
3422
3423
/**
3424
 * Check if a character is a valid base64 character.
3425
 * Checking a large input, character by character, is not computationally
3426
 * efficient.
3427
 * Note that padding characters are not considered valid base64 characters in
3428
 * this context, nor are spaces.
3429
 *
3430
 * @param input         the character to check
3431
 * @param options       the base64 options to use, is base64_default by default.
3432
 * @return true if the character is a base64 character, false otherwise.
3433
 */
3434
simdutf_warn_unused bool
3435
base64_valid(char input, base64_options options = base64_default) noexcept;
3436
simdutf_warn_unused bool
3437
base64_valid(char16_t input, base64_options options = base64_default) noexcept;
3438
3439
/**
3440
 * Check if a character is a valid base64 character or the padding character
3441
 * ('='). Checking a large input, character by character, is not computationally
3442
 * efficient.
3443
 *
3444
 * @param input         the character to check
3445
 * @param options       the base64 options to use, is base64_default by default.
3446
 * @return true if the character is a base64 character, false otherwise.
3447
 */
3448
simdutf_warn_unused bool
3449
base64_valid_or_padding(char input,
3450
                        base64_options options = base64_default) noexcept;
3451
simdutf_warn_unused bool
3452
base64_valid_or_padding(char16_t input,
3453
                        base64_options options = base64_default) noexcept;
3454
3455
/**
3456
 * Convert a base64 input to a binary output.
3457
 *
3458
 * This function follows the WHATWG forgiving-base64 format, which means that it
3459
 * will ignore any ASCII spaces in the input. You may provide a padded input
3460
 * (with one or two equal signs at the end) or an unpadded input (without any
3461
 * equal signs at the end).
3462
 *
3463
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
3464
 *
3465
 * This function will fail in case of invalid input. When last_chunk_options =
3466
 * loose, there are three possible reasons for failure: the input contains a
3467
 * number of base64 characters that when divided by 4, leaves a single remainder
3468
 * character (BASE64_INPUT_REMAINDER), the input contains a character that is
3469
 * not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer
3470
 * is too small (OUTPUT_BUFFER_TOO_SMALL).
3471
 *
3472
 * When OUTPUT_BUFFER_TOO_SMALL, we return both the number of bytes written
3473
 * and the number of units processed, see description of the parameters and
3474
 * returned value.
3475
 *
3476
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
3477
 * input where the invalid character was found. When the error is
3478
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
3479
 *
3480
 * The default option (simdutf::base64_default) expects the characters `+` and
3481
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
3482
 * characters `-` and `_` as part of its alphabet.
3483
 *
3484
 * The padding (`=`) is validated if present. There may be at most two padding
3485
 * characters at the end of the input. If there are any padding characters, the
3486
 * total number of characters (excluding spaces but including padding
3487
 * characters) must be divisible by four.
3488
 *
3489
 * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected
3490
 * to discard the output unless the parameter decode_up_to_bad_char is set to
3491
 * true. In that case, the function will decode up to the first invalid
3492
 * character. Extra padding characters ('=') are considered invalid characters.
3493
 *
3494
 * Advanced users may want to tailor how the last chunk is handled. By default,
3495
 * we use a loose (forgiving) approach but we also support a strict approach
3496
 * as well as a stop_before_partial approach, as per the following proposal:
3497
 *
3498
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3499
 *
3500
 * @param input         the base64 string to process, in ASCII stored as 8-bit
3501
 * or 16-bit units
3502
 * @param length        the length of the string in 8-bit or 16-bit units.
3503
 * @param output        the pointer to a buffer that can hold the conversion
3504
 * result.
3505
 * @param outlen        the number of bytes that can be written in the output
3506
 * buffer. Upon return, it is modified to reflect how many bytes were written.
3507
 * @param options       the base64 options to use, can be base64_default or
3508
 * base64_url, is base64_default by default.
3509
 * @param last_chunk_options the last chunk handling options,
3510
 * last_chunk_handling_options::loose by default
3511
 * but can also be last_chunk_handling_options::strict or
3512
 * last_chunk_handling_options::stop_before_partial.
3513
 * @param decode_up_to_bad_char if true, the function will decode up to the
3514
 * first invalid character. By default (false), it is assumed that the output
3515
 * buffer is to be discarded. When there are multiple errors in the input,
3516
 * using decode_up_to_bad_char might trigger a different error.
3517
 * @return a result pair struct (of type simdutf::result containing the two
3518
 * fields error and count) with an error code and position of the
3519
 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
3520
 * of units processed if successful.
3521
 */
3522
simdutf_warn_unused result
3523
base64_to_binary_safe(const char *input, size_t length, char *output,
3524
                      size_t &outlen, base64_options options = base64_default,
3525
                      last_chunk_handling_options last_chunk_options =
3526
                          last_chunk_handling_options::loose,
3527
                      bool decode_up_to_bad_char = false) noexcept;
3528
  #if SIMDUTF_SPAN
3529
/**
3530
 * @brief span overload
3531
 * @return a tuple of result and outlen
3532
 */
3533
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
3534
base64_to_binary_safe(const detail::input_span_of_byte_like auto &input,
3535
                      detail::output_span_of_byte_like auto &&binary_output,
3536
                      base64_options options = base64_default,
3537
                      last_chunk_handling_options last_chunk_options = loose,
3538
                      bool decode_up_to_bad_char = false) noexcept {
3539
  size_t outlen = binary_output.size();
3540
  auto r = base64_to_binary_safe(
3541
      reinterpret_cast<const char *>(input.data()), input.size(),
3542
      reinterpret_cast<char *>(binary_output.data()), outlen, options,
3543
      last_chunk_options, decode_up_to_bad_char);
3544
  return {r, outlen};
3545
}
3546
  #endif // SIMDUTF_SPAN
3547
3548
simdutf_warn_unused result
3549
base64_to_binary_safe(const char16_t *input, size_t length, char *output,
3550
                      size_t &outlen, base64_options options = base64_default,
3551
                      last_chunk_handling_options last_chunk_options =
3552
                          last_chunk_handling_options::loose,
3553
                      bool decode_up_to_bad_char = false) noexcept;
3554
  #if SIMDUTF_SPAN
3555
/**
3556
 * @brief span overload
3557
 * @return a tuple of result and outlen
3558
 */
3559
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
3560
base64_to_binary_safe(std::span<const char16_t> input,
3561
                      detail::output_span_of_byte_like auto &&binary_output,
3562
                      base64_options options = base64_default,
3563
                      last_chunk_handling_options last_chunk_options = loose,
3564
                      bool decode_up_to_bad_char = false) noexcept {
3565
  size_t outlen = binary_output.size();
3566
  auto r = base64_to_binary_safe(input.data(), input.size(),
3567
                                 reinterpret_cast<char *>(binary_output.data()),
3568
                                 outlen, options, last_chunk_options,
3569
                                 decode_up_to_bad_char);
3570
  return {r, outlen};
3571
}
3572
  #endif // SIMDUTF_SPAN
3573
3574
  #if SIMDUTF_ATOMIC_REF
3575
/**
3576
 * Convert a base64 input to a binary output with a size limit and using atomic
3577
 * operations.
3578
 *
3579
 * Like `base64_to_binary_safe` but using atomic operations, this function is
3580
 * thread-safe for concurrent memory access, allowing the output
3581
 * buffers to be shared between threads without undefined behavior in case of
3582
 * data races.
3583
 *
3584
 * This function comes with a potentially significant performance penalty, but
3585
 * is useful when thread safety is needed during base64 decoding.
3586
 *
3587
 * This function is only available when simdutf is compiled with
3588
 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
3589
 * the availability of this function by checking the macro
3590
 * SIMDUTF_ATOMIC_REF.
3591
 *
3592
 * This function is considered experimental. It is not tested by default
3593
 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
3594
 * It is not documented in the public API documentation (README). It is
3595
 * offered on a best effort basis. We rely on the community for further
3596
 * testing and feedback.
3597
 *
3598
 * @param input         the base64 input to decode
3599
 * @param length        the length of the input in bytes
3600
 * @param output        the pointer to buffer that can hold the conversion
3601
 * result
3602
 * @param outlen        the number of bytes that can be written in the output
3603
 * buffer. Upon return, it is modified to reflect how many bytes were written.
3604
 * @param options       the base64 options to use (default, url, etc.)
3605
 * @param last_chunk_options the last chunk handling options (loose, strict,
3606
 * stop_before_partial)
3607
 * @param decode_up_to_bad_char if true, the function will decode up to the
3608
 * first invalid character. By default (false), it is assumed that the output
3609
 * buffer is to be discarded. When there are multiple errors in the input,
3610
 * using decode_up_to_bad_char might trigger a different error.
3611
 * @return a result struct with an error code and count indicating error
3612
 * position or success
3613
 */
3614
simdutf_warn_unused result atomic_base64_to_binary_safe(
3615
    const char *input, size_t length, char *output, size_t &outlen,
3616
    base64_options options = base64_default,
3617
    last_chunk_handling_options last_chunk_options =
3618
        last_chunk_handling_options::loose,
3619
    bool decode_up_to_bad_char = false) noexcept;
3620
simdutf_warn_unused result atomic_base64_to_binary_safe(
3621
    const char16_t *input, size_t length, char *output, size_t &outlen,
3622
    base64_options options = base64_default,
3623
    last_chunk_handling_options last_chunk_options = loose,
3624
    bool decode_up_to_bad_char = false) noexcept;
3625
    #if SIMDUTF_SPAN
3626
/**
3627
 * @brief span overload
3628
 * @return a tuple of result and outlen
3629
 */
3630
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
3631
atomic_base64_to_binary_safe(
3632
    const detail::input_span_of_byte_like auto &binary_input,
3633
    detail::output_span_of_byte_like auto &&output,
3634
    base64_options options = base64_default,
3635
    last_chunk_handling_options last_chunk_options =
3636
        last_chunk_handling_options::loose,
3637
    bool decode_up_to_bad_char = false) noexcept {
3638
  size_t outlen = output.size();
3639
  auto ret = atomic_base64_to_binary_safe(
3640
      reinterpret_cast<const char *>(binary_input.data()), binary_input.size(),
3641
      reinterpret_cast<char *>(output.data()), outlen, options,
3642
      last_chunk_options, decode_up_to_bad_char);
3643
  return {ret, outlen};
3644
}
3645
/**
3646
 * @brief span overload
3647
 * @return a tuple of result and outlen
3648
 */
3649
simdutf_warn_unused std::tuple<result, std::size_t>
3650
atomic_base64_to_binary_safe(
3651
    std::span<const char16_t> base64_input,
3652
    detail::output_span_of_byte_like auto &&binary_output,
3653
    base64_options options = base64_default,
3654
    last_chunk_handling_options last_chunk_options = loose,
3655
    bool decode_up_to_bad_char = false) noexcept {
3656
  size_t outlen = binary_output.size();
3657
  auto ret = atomic_base64_to_binary_safe(
3658
      base64_input.data(), base64_input.size(),
3659
      reinterpret_cast<char *>(binary_output.data()), outlen, options,
3660
      last_chunk_options, decode_up_to_bad_char);
3661
  return {ret, outlen};
3662
}
3663
    #endif // SIMDUTF_SPAN
3664
  #endif   // SIMDUTF_ATOMIC_REF
3665
3666
/**
3667
 * Find the first occurrence of a character in a string. If the character is
3668
 * not found, return a pointer to the end of the string.
3669
 * @param start        the start of the string
3670
 * @param end          the end of the string
3671
 * @param character    the character to find
3672
 * @return a pointer to the first occurrence of the character in the string,
3673
 * or a pointer to the end of the string if the character is not found.
3674
 *
3675
 */
3676
simdutf_warn_unused const char *find(const char *start, const char *end,
3677
                                     char character) noexcept;
3678
simdutf_warn_unused const char16_t *
3679
find(const char16_t *start, const char16_t *end, char16_t character) noexcept;
3680
#endif // SIMDUTF_FEATURE_BASE64
3681
3682
/**
3683
 * An implementation of simdutf for a particular CPU architecture.
3684
 *
3685
 * Also used to maintain the currently active implementation. The active
3686
 * implementation is automatically initialized on first use to the most advanced
3687
 * implementation supported by the host.
3688
 */
3689
class implementation {
3690
public:
3691
  /**
3692
   * The name of this implementation.
3693
   *
3694
   *     const implementation *impl = simdutf::active_implementation;
3695
   *     cout << "simdutf is optimized for " << impl->name() << "(" <<
3696
   * impl->description() << ")" << endl;
3697
   *
3698
   * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
3699
   */
3700
  virtual std::string name() const {
    // Hand back a fresh std::string built from the stored _name.
    std::string implementation_name(_name);
    return implementation_name;
  }
3701
3702
  /**
3703
   * The description of this implementation.
3704
   *
3705
   *     const implementation *impl = simdutf::active_implementation;
3706
   *     cout << "simdutf is optimized for " << impl->name() << "(" <<
3707
   * impl->description() << ")" << endl;
3708
   *
3709
   * @return the description of the implementation
3710
   */
3711
  virtual std::string description() const {
    // Hand back a fresh std::string built from the stored _description.
    std::string implementation_description(_description);
    return implementation_description;
  }
3712
3713
  /**
3714
   * The instruction sets this implementation is compiled against
3715
   * and the current CPU match. This function may poll the current CPU/system
3716
   * and should therefore not be called too often if performance is a concern.
3717
   *
3718
   *
3719
   * @return true if the implementation can be safely used on the current system
3720
   * (determined at runtime)
3721
   */
3722
  bool supported_by_runtime_system() const;
3723
3724
#if SIMDUTF_FEATURE_DETECT_ENCODING
3725
  /**
3726
   * This function will try to detect the encoding
3727
   * @param input the string to identify
3728
   * @param length the length of the string in bytes.
3729
   * @return the encoding type detected
3730
   */
3731
  virtual encoding_type autodetect_encoding(const char *input,
3732
                                            size_t length) const noexcept;
3733
3734
  /**
3735
   * This function will try to detect the possible encodings in one pass
3736
   * @param input the string to identify
3737
   * @param length the length of the string in bytes.
3738
   * @return the encoding type detected
3739
   */
3740
  virtual int detect_encodings(const char *input,
3741
                               size_t length) const noexcept = 0;
3742
#endif // SIMDUTF_FEATURE_DETECT_ENCODING
3743
3744
  /**
3745
   * @private For internal implementation use
3746
   *
3747
   * The instruction sets this implementation is compiled against.
3748
   *
3749
   * @return a mask of all required `internal::instruction_set::` values
3750
   */
3751
  virtual uint32_t required_instruction_sets() const {
3752
    return _required_instruction_sets;
3753
  }
3754
3755
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
3756
  /**
3757
   * Validate the UTF-8 string.
3758
   *
3759
   * Overridden by each implementation.
3760
   *
3761
   * @param buf the UTF-8 string to validate.
3762
   * @param len the length of the string in bytes.
3763
   * @return true if and only if the string is valid UTF-8.
3764
   */
3765
  simdutf_warn_unused virtual bool validate_utf8(const char *buf,
3766
                                                 size_t len) const noexcept = 0;
3767
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
3768
3769
#if SIMDUTF_FEATURE_UTF8
3770
  /**
3771
   * Validate the UTF-8 string and stop on errors.
3772
   *
3773
   * Overridden by each implementation.
3774
   *
3775
   * @param buf the UTF-8 string to validate.
3776
   * @param len the length of the string in bytes.
3777
   * @return a result pair struct (of type simdutf::result containing the two
3778
   * fields error and count) with an error code and either position of the error
3779
   * (in the input in code units) if any, or the number of code units validated
3780
   * if successful.
3781
   */
3782
  simdutf_warn_unused virtual result
3783
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0;
3784
#endif // SIMDUTF_FEATURE_UTF8
3785
3786
#if SIMDUTF_FEATURE_ASCII
3787
  /**
3788
   * Validate the ASCII string.
3789
   *
3790
   * Overridden by each implementation.
3791
   *
3792
   * @param buf the ASCII string to validate.
3793
   * @param len the length of the string in bytes.
3794
   * @return true if and only if the string is valid ASCII.
3795
   */
3796
  simdutf_warn_unused virtual bool
3797
  validate_ascii(const char *buf, size_t len) const noexcept = 0;
3798
3799
  /**
3800
   * Validate the ASCII string and stop on error.
3801
   *
3802
   * Overridden by each implementation.
3803
   *
3804
   * @param buf the ASCII string to validate.
3805
   * @param len the length of the string in bytes.
3806
   * @return a result pair struct (of type simdutf::result containing the two
3807
   * fields error and count) with an error code and either position of the error
3808
   * (in the input in code units) if any, or the number of code units validated
3809
   * if successful.
3810
   */
3811
  simdutf_warn_unused virtual result
3812
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0;
3813
3814
#endif // SIMDUTF_FEATURE_ASCII
3815
3816
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
3817
  /**
3818
   * Validate the ASCII string as a UTF-16BE sequence.
3819
   * An UTF-16 sequence is considered an ASCII sequence
3820
   * if it could be converted to an ASCII string losslessly.
3821
   *
3822
   * Overridden by each implementation.
3823
   *
3824
   * @param buf the UTF-16BE string to validate.
3825
   * @param len the length of the string in 2-byte code units (char16_t).
3826
   * @return true if and only if the string is valid ASCII.
3827
   */
3828
  simdutf_warn_unused virtual bool
3829
  validate_utf16be_as_ascii(const char16_t *buf, size_t len) const noexcept = 0;
3830
3831
  /**
3832
   * Validate the ASCII string as a UTF-16LE sequence.
3833
   * An UTF-16 sequence is considered an ASCII sequence
3834
   * if it could be converted to an ASCII string losslessly.
3835
   *
3836
   * Overridden by each implementation.
3837
   *
3838
   * @param buf the UTF-16LE string to validate.
3839
   * @param len the length of the string in 2-byte code units (char16_t).
3840
   * @return true if and only if the string is valid ASCII.
3841
   */
3842
  simdutf_warn_unused virtual bool
3843
  validate_utf16le_as_ascii(const char16_t *buf, size_t len) const noexcept = 0;
3844
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
3845
3846
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
3847
  /**
3848
   * Validate the UTF-16LE string. This function may be best when you expect
3849
   * the input to be almost always valid. Otherwise, consider using
3850
   * validate_utf16le_with_errors.
3851
   *
3852
   * Overridden by each implementation.
3853
   *
3854
   * This function is not BOM-aware.
3855
   *
3856
   * @param buf the UTF-16LE string to validate.
3857
   * @param len the length of the string in number of 2-byte code units
3858
   * (char16_t).
3859
   * @return true if and only if the string is valid UTF-16LE.
3860
   */
3861
  simdutf_warn_unused virtual bool
3862
  validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0;
3863
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
3864
3865
#if SIMDUTF_FEATURE_UTF16
3866
  /**
3867
   * Validate the UTF-16BE string. This function may be best when you expect
3868
   * the input to be almost always valid. Otherwise, consider using
3869
   * validate_utf16be_with_errors.
3870
   *
3871
   * Overridden by each implementation.
3872
   *
3873
   * This function is not BOM-aware.
3874
   *
3875
   * @param buf the UTF-16BE string to validate.
3876
   * @param len the length of the string in number of 2-byte code units
3877
   * (char16_t).
3878
   * @return true if and only if the string is valid UTF-16BE.
3879
   */
3880
  simdutf_warn_unused virtual bool
3881
  validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0;
3882
3883
  /**
3884
   * Validate the UTF-16LE string and stop on error.  It might be faster than
3885
   * validate_utf16le when an error is expected to occur early.
3886
   *
3887
   * Overridden by each implementation.
3888
   *
3889
   * This function is not BOM-aware.
3890
   *
3891
   * @param buf the UTF-16LE string to validate.
3892
   * @param len the length of the string in number of 2-byte code units
3893
   * (char16_t).
3894
   * @return a result pair struct (of type simdutf::result containing the two
3895
   * fields error and count) with an error code and either position of the error
3896
   * (in the input in code units) if any, or the number of code units validated
3897
   * if successful.
3898
   */
3899
  simdutf_warn_unused virtual result
3900
  validate_utf16le_with_errors(const char16_t *buf,
3901
                               size_t len) const noexcept = 0;
3902
3903
  /**
3904
   * Validate the UTF-16BE string and stop on error. It might be faster than
3905
   * validate_utf16be when an error is expected to occur early.
3906
   *
3907
   * Overridden by each implementation.
3908
   *
3909
   * This function is not BOM-aware.
3910
   *
3911
   * @param buf the UTF-16BE string to validate.
3912
   * @param len the length of the string in number of 2-byte code units
3913
   * (char16_t).
3914
   * @return a result pair struct (of type simdutf::result containing the two
3915
   * fields error and count) with an error code and either position of the error
3916
   * (in the input in code units) if any, or the number of code units validated
3917
   * if successful.
3918
   */
3919
  simdutf_warn_unused virtual result
3920
  validate_utf16be_with_errors(const char16_t *buf,
3921
                               size_t len) const noexcept = 0;
3922
  /**
3923
   * Copies the UTF-16LE string while replacing mismatched surrogates with the
3924
   * Unicode replacement character U+FFFD. We allow the input and output to be
3925
   * the same buffer so that the correction is done in-place.
3926
   *
3927
   * Overridden by each implementation.
3928
   *
3929
   * @param input the UTF-16LE string to correct.
3930
   * @param len the length of the string in number of 2-byte code units
3931
   * (char16_t).
3932
   * @param output the output buffer.
3933
   */
3934
  virtual void to_well_formed_utf16le(const char16_t *input, size_t len,
3935
                                      char16_t *output) const noexcept = 0;
3936
  /**
3937
   * Copies the UTF-16BE string while replacing mismatched surrogates with the
3938
   * Unicode replacement character U+FFFD. We allow the input and output to be
3939
   * the same buffer so that the correction is done in-place.
3940
   *
3941
   * Overridden by each implementation.
3942
   *
3943
   * @param input the UTF-16BE string to correct.
3944
   * @param len the length of the string in number of 2-byte code units
3945
   * (char16_t).
3946
   * @param output the output buffer.
3947
   */
3948
  virtual void to_well_formed_utf16be(const char16_t *input, size_t len,
3949
                                      char16_t *output) const noexcept = 0;
3950
#endif // SIMDUTF_FEATURE_UTF16
3951
3952
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
3953
  /**
3954
   * Validate the UTF-32 string.
3955
   *
3956
   * Overridden by each implementation.
3957
   *
3958
   * This function is not BOM-aware.
3959
   *
3960
   * @param buf the UTF-32 string to validate.
3961
   * @param len the length of the string in number of 4-byte code units
3962
   * (char32_t).
3963
   * @return true if and only if the string is valid UTF-32.
3964
   */
3965
  simdutf_warn_unused virtual bool
3966
  validate_utf32(const char32_t *buf, size_t len) const noexcept = 0;
3967
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
3968
3969
#if SIMDUTF_FEATURE_UTF32
3970
  /**
3971
   * Validate the UTF-32 string and stop on error.
3972
   *
3973
   * Overridden by each implementation.
3974
   *
3975
   * This function is not BOM-aware.
3976
   *
3977
   * @param buf the UTF-32 string to validate.
3978
   * @param len the length of the string in number of 4-byte code units
3979
   * (char32_t).
3980
   * @return a result pair struct (of type simdutf::result containing the two
3981
   * fields error and count) with an error code and either position of the error
3982
   * (in the input in code units) if any, or the number of code units validated
3983
   * if successful.
3984
   */
3985
  simdutf_warn_unused virtual result
3986
  validate_utf32_with_errors(const char32_t *buf,
3987
                             size_t len) const noexcept = 0;
3988
#endif // SIMDUTF_FEATURE_UTF32
3989
3990
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
3991
  /**
3992
   * Convert Latin1 string into UTF-8 string.
3993
   *
3994
   * This function is suitable to work with inputs from untrusted sources.
3995
   *
3996
   * @param input         the Latin1 string to convert
3997
   * @param length        the length of the string in bytes
3998
   * @param utf8_output  the pointer to buffer that can hold conversion result
3999
   * @return the number of written char; 0 if conversion is not possible
4000
   */
4001
  simdutf_warn_unused virtual size_t
4002
  convert_latin1_to_utf8(const char *input, size_t length,
4003
                         char *utf8_output) const noexcept = 0;
4004
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4005
4006
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4007
  /**
4008
   * Convert Latin1 string into UTF-16LE string.
4009
   *
4010
   * This function is suitable to work with inputs from untrusted sources.
4011
   *
4012
   * @param input         the Latin1 string to convert
4013
   * @param length        the length of the string in bytes
4014
   * @param utf16_output  the pointer to buffer that can hold conversion result
4015
   * @return the number of written char16_t; 0 if conversion is not possible
4016
   */
4017
  simdutf_warn_unused virtual size_t
4018
  convert_latin1_to_utf16le(const char *input, size_t length,
4019
                            char16_t *utf16_output) const noexcept = 0;
4020
4021
  /**
4022
   * Convert Latin1 string into UTF-16BE string.
4023
   *
4024
   * This function is suitable to work with inputs from untrusted sources.
4025
   *
4026
   * @param input         the Latin1 string to convert
4027
   * @param length        the length of the string in bytes
4028
   * @param utf16_output  the pointer to buffer that can hold conversion result
4029
   * @return the number of written char16_t; 0 if conversion is not possible
4030
   */
4031
  simdutf_warn_unused virtual size_t
4032
  convert_latin1_to_utf16be(const char *input, size_t length,
4033
                            char16_t *utf16_output) const noexcept = 0;
4034
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4035
4036
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4037
  /**
4038
   * Convert Latin1 string into UTF-32 string.
4039
   *
4040
   * This function is suitable to work with inputs from untrusted sources.
4041
   *
4042
   * @param input         the Latin1 string to convert
4043
   * @param length        the length of the string in bytes
4044
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
4045
   * @return the number of written char32_t; 0 if conversion is not possible
4046
   */
4047
  simdutf_warn_unused virtual size_t
4048
  convert_latin1_to_utf32(const char *input, size_t length,
4049
                          char32_t *utf32_buffer) const noexcept = 0;
4050
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4051
4052
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4053
  /**
4054
   * Convert possibly broken UTF-8 string into latin1 string.
4055
   *
4056
   * During the conversion also validation of the input string is done.
4057
   * This function is suitable to work with inputs from untrusted sources.
4058
   *
4059
   * @param input         the UTF-8 string to convert
4060
   * @param length        the length of the string in bytes
4061
   * @param latin1_output  the pointer to buffer that can hold conversion result
4062
   * @return the number of written char; 0 if the input was not valid UTF-8
4063
   * string or if it cannot be represented as Latin1
4064
   */
4065
  simdutf_warn_unused virtual size_t
4066
  convert_utf8_to_latin1(const char *input, size_t length,
4067
                         char *latin1_output) const noexcept = 0;
4068
4069
  /**
4070
   * Convert possibly broken UTF-8 string into latin1 string with errors.
4071
   * If the string cannot be represented as Latin1, an error
4072
   * code is returned.
4073
   *
4074
   * During the conversion also validation of the input string is done.
4075
   * This function is suitable to work with inputs from untrusted sources.
4076
   *
4077
   * @param input         the UTF-8 string to convert
4078
   * @param length        the length of the string in bytes
4079
   * @param latin1_output  the pointer to buffer that can hold conversion result
4080
   * @return a result pair struct (of type simdutf::result containing the two
4081
   * fields error and count) with an error code and either position of the error
4082
   * (in the input in code units) if any, or the number of code units validated
4083
   * if successful.
4084
   */
4085
  simdutf_warn_unused virtual result
4086
  convert_utf8_to_latin1_with_errors(const char *input, size_t length,
4087
                                     char *latin1_output) const noexcept = 0;
4088
4089
  /**
4090
   * Convert valid UTF-8 string into latin1 string.
4091
   *
4092
   * This function assumes that the input string is valid UTF-8 and that it can
4093
   * be represented as Latin1. If you violate this assumption, the result is
4094
   * implementation defined and may include system-dependent behavior such as
4095
   * crashes.
4096
   *
4097
   * This function is for expert users only and not part of our public API. Use
4098
   * convert_utf8_to_latin1 instead.
4099
   *
4100
   * This function is not BOM-aware.
4101
   *
4102
   * @param input         the UTF-8 string to convert
4103
   * @param length        the length of the string in bytes
4104
   * @param latin1_output  the pointer to buffer that can hold conversion result
4105
   * @return the number of written char; 0 if the input was not valid UTF-8
4106
   * string
4107
   */
4108
  simdutf_warn_unused virtual size_t
4109
  convert_valid_utf8_to_latin1(const char *input, size_t length,
4110
                               char *latin1_output) const noexcept = 0;
4111
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4112
4113
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4114
  /**
4115
   * Convert possibly broken UTF-8 string into UTF-16LE string.
4116
   *
4117
   * During the conversion also validation of the input string is done.
4118
   * This function is suitable to work with inputs from untrusted sources.
4119
   *
4120
   * @param input         the UTF-8 string to convert
4121
   * @param length        the length of the string in bytes
4122
   * @param utf16_output  the pointer to buffer that can hold conversion result
4123
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
4124
   * string
4125
   */
4126
  simdutf_warn_unused virtual size_t
4127
  convert_utf8_to_utf16le(const char *input, size_t length,
4128
                          char16_t *utf16_output) const noexcept = 0;
4129
4130
  /**
4131
   * Convert possibly broken UTF-8 string into UTF-16BE string.
4132
   *
4133
   * During the conversion also validation of the input string is done.
4134
   * This function is suitable to work with inputs from untrusted sources.
4135
   *
4136
   * @param input         the UTF-8 string to convert
4137
   * @param length        the length of the string in bytes
4138
   * @param utf16_output  the pointer to buffer that can hold conversion result
4139
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
4140
   * string
4141
   */
4142
  simdutf_warn_unused virtual size_t
4143
  convert_utf8_to_utf16be(const char *input, size_t length,
4144
                          char16_t *utf16_output) const noexcept = 0;
4145
4146
  /**
4147
   * Convert possibly broken UTF-8 string into UTF-16LE string and stop on
4148
   * error.
4149
   *
4150
   * During the conversion also validation of the input string is done.
4151
   * This function is suitable to work with inputs from untrusted sources.
4152
   *
4153
   * @param input         the UTF-8 string to convert
4154
   * @param length        the length of the string in bytes
4155
   * @param utf16_output  the pointer to buffer that can hold conversion result
4156
   * @return a result pair struct (of type simdutf::result containing the two
4157
   * fields error and count) with an error code and either position of the error
4158
   * (in the input in code units) if any, or the number of char16_t written
4159
   * if successful.
4160
   */
4161
  simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(
4162
      const char *input, size_t length,
4163
      char16_t *utf16_output) const noexcept = 0;
4164
4165
  /**
4166
   * Convert possibly broken UTF-8 string into UTF-16BE string and stop on
4167
   * error.
4168
   *
4169
   * During the conversion also validation of the input string is done.
4170
   * This function is suitable to work with inputs from untrusted sources.
4171
   *
4172
   * @param input         the UTF-8 string to convert
4173
   * @param length        the length of the string in bytes
4174
   * @param utf16_output  the pointer to buffer that can hold conversion result
4175
   * @return a result pair struct (of type simdutf::result containing the two
4176
   * fields error and count) with an error code and either position of the error
4177
   * (in the input in code units) if any, or the number of char16_t written
4178
   * if successful.
4179
   */
4180
  simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(
4181
      const char *input, size_t length,
4182
      char16_t *utf16_output) const noexcept = 0;
4183
  /**
4184
   * Compute the number of bytes that this UTF-16LE string would require in
4185
   * UTF-8 format even when the UTF-16LE content contains mismatched
4186
   * surrogates that have to be replaced by the replacement character (0xFFFD).
4187
   *
4188
   * @param input         the UTF-16LE string to convert
4189
   * @param length        the length of the string in 2-byte code units
4190
   * (char16_t)
4191
   * @return a result pair struct (of type simdutf::result containing the two
4192
   * fields error and count) where the count is the number of bytes required to
4193
   * encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS
4194
   * or SURROGATE. The count is correct regardless of the error field.
4195
   * When SURROGATE is returned, it does not indicate an error in the case of
4196
   * this function: it indicates that at least one surrogate has been
4197
   * encountered: the surrogates may be matched or not (thus this function does
4198
   * not validate). If the returned error code is SUCCESS, then the input
4199
   * contains no surrogate, is in the Basic Multilingual Plane, and is
4200
   * necessarily valid.
4201
   */
4202
  simdutf_warn_unused virtual result utf8_length_from_utf16le_with_replacement(
4203
      const char16_t *input, size_t length) const noexcept = 0;
4204
4205
  /**
4206
   * Compute the number of bytes that this UTF-16BE string would require in
4207
   * UTF-8 format even when the UTF-16BE content contains mismatched
4208
   * surrogates that have to be replaced by the replacement character (0xFFFD).
4209
   *
4210
   * @param input         the UTF-16BE string to convert
4211
   * @param length        the length of the string in 2-byte code units
4212
   * (char16_t)
4213
   * @return a result pair struct (of type simdutf::result containing the two
4214
   * fields error and count) where the count is the number of bytes required to
4215
   * encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS
4216
   * or SURROGATE. The count is correct regardless of the error field.
4217
   * When SURROGATE is returned, it does not indicate an error in the case of
4218
   * this function: it indicates that at least one surrogate has been
4219
   * encountered: the surrogates may be matched or not (thus this function does
4220
   * not validate). If the returned error code is SUCCESS, then the input
4221
   * contains no surrogate, is in the Basic Multilingual Plane, and is
4222
   * necessarily valid.
4223
   */
4224
  simdutf_warn_unused virtual result utf8_length_from_utf16be_with_replacement(
4225
      const char16_t *input, size_t length) const noexcept = 0;
4226
4227
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4228
4229
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4230
  /**
4231
   * Convert possibly broken UTF-8 string into UTF-32 string.
4232
   *
4233
   * During the conversion also validation of the input string is done.
4234
   * This function is suitable to work with inputs from untrusted sources.
4235
   *
4236
   * @param input         the UTF-8 string to convert
4237
   * @param length        the length of the string in bytes
4238
   * @param utf32_output  the pointer to buffer that can hold conversion result
4239
   * @return the number of written char32_t; 0 if the input was not valid UTF-8
4240
   * string
4241
   */
4242
  simdutf_warn_unused virtual size_t
4243
  convert_utf8_to_utf32(const char *input, size_t length,
4244
                        char32_t *utf32_output) const noexcept = 0;
4245
4246
  /**
4247
   * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
4248
   *
4249
   * During the conversion also validation of the input string is done.
4250
   * This function is suitable to work with inputs from untrusted sources.
4251
   *
4252
   * @param input         the UTF-8 string to convert
4253
   * @param length        the length of the string in bytes
4254
   * @param utf32_output  the pointer to buffer that can hold conversion result
4255
   * @return a result pair struct (of type simdutf::result containing the two
4256
   * fields error and count) with an error code and either position of the error
4257
   * (in the input in code units) if any, or the number of char32_t written if
4258
   * successful.
4259
   */
4260
  simdutf_warn_unused virtual result
4261
  convert_utf8_to_utf32_with_errors(const char *input, size_t length,
4262
                                    char32_t *utf32_output) const noexcept = 0;
4263
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4264
4265
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4266
  /**
4267
   * Convert valid UTF-8 string into UTF-16LE string.
4268
   *
4269
   * This function assumes that the input string is valid UTF-8.
4270
   *
4271
   * @param input         the UTF-8 string to convert
4272
   * @param length        the length of the string in bytes
4273
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
4274
   * @return the number of written char16_t
4275
   */
4276
  simdutf_warn_unused virtual size_t
4277
  convert_valid_utf8_to_utf16le(const char *input, size_t length,
4278
                                char16_t *utf16_buffer) const noexcept = 0;
4279
4280
  /**
4281
   * Convert valid UTF-8 string into UTF-16BE string.
4282
   *
4283
   * This function assumes that the input string is valid UTF-8.
4284
   *
4285
   * @param input         the UTF-8 string to convert
4286
   * @param length        the length of the string in bytes
4287
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
4288
   * @return the number of written char16_t
4289
   */
4290
  simdutf_warn_unused virtual size_t
4291
  convert_valid_utf8_to_utf16be(const char *input, size_t length,
4292
                                char16_t *utf16_buffer) const noexcept = 0;
4293
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4294
4295
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4296
  /**
4297
   * Convert valid UTF-8 string into UTF-32 string.
4298
   *
4299
   * This function assumes that the input string is valid UTF-8.
4300
   *
4301
   * @param input         the UTF-8 string to convert
4302
   * @param length        the length of the string in bytes
4303
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
4304
   * @return the number of written char32_t
4305
   */
4306
  simdutf_warn_unused virtual size_t
4307
  convert_valid_utf8_to_utf32(const char *input, size_t length,
4308
                              char32_t *utf32_buffer) const noexcept = 0;
4309
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4310
4311
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4312
  /**
4313
   * Compute the number of 2-byte code units that this UTF-8 string would
4314
   * require in UTF-16LE format.
4315
   *
4316
   * This function does not validate the input. It is acceptable to pass invalid
4317
   * UTF-8 strings but in such cases the result is implementation defined.
4318
   *
4319
   * @param input         the UTF-8 string to process
4320
   * @param length        the length of the string in bytes
4321
   * @return the number of char16_t code units required to encode the UTF-8
4322
   * string as UTF-16LE
4323
   */
4324
  simdutf_warn_unused virtual size_t
4325
  utf16_length_from_utf8(const char *input, size_t length) const noexcept = 0;
4326
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4327
4328
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4329
  /**
4330
   * Compute the number of 4-byte code units that this UTF-8 string would
4331
   * require in UTF-32 format.
4332
   *
4333
   * This function is equivalent to count_utf8. It is acceptable to pass invalid
4334
   * UTF-8 strings but in such cases the result is implementation defined.
4335
   *
4336
   * This function does not validate the input.
4337
   *
4338
   * @param input         the UTF-8 string to process
4339
   * @param length        the length of the string in bytes
4340
   * @return the number of char32_t code units required to encode the UTF-8
4341
   * string as UTF-32
4342
   */
4343
  simdutf_warn_unused virtual size_t
4344
  utf32_length_from_utf8(const char *input, size_t length) const noexcept = 0;
4345
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4346
4347
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4348
  /**
4349
   * Convert possibly broken UTF-16LE string into Latin1 string.
4350
   *
4351
   * During the conversion also validation of the input string is done.
4352
   * This function is suitable to work with inputs from untrusted sources.
4353
   *
4354
   * This function is not BOM-aware.
4355
   *
4356
   * @param input         the UTF-16LE string to convert
4357
   * @param length        the length of the string in 2-byte code units
4358
   * (char16_t)
4359
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4360
   * result
4361
   * @return number of written code units; 0 if input is not a valid UTF-16LE
4362
   * string or if it cannot be represented as Latin1
4363
   */
4364
  simdutf_warn_unused virtual size_t
4365
  convert_utf16le_to_latin1(const char16_t *input, size_t length,
4366
                            char *latin1_buffer) const noexcept = 0;
4367
4368
  /**
4369
   * Convert possibly broken UTF-16BE string into Latin1 string.
4370
   *
4371
   * During the conversion also validation of the input string is done.
4372
   * This function is suitable to work with inputs from untrusted sources.
4373
   *
4374
   * This function is not BOM-aware.
4375
   *
4376
   * @param input         the UTF-16BE string to convert
4377
   * @param length        the length of the string in 2-byte code units
4378
   * (char16_t)
4379
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4380
   * result
4381
   * @return number of written code units; 0 if input is not a valid UTF-16BE
4382
   * string or if it cannot be represented as Latin1
4383
   */
4384
  simdutf_warn_unused virtual size_t
4385
  convert_utf16be_to_latin1(const char16_t *input, size_t length,
4386
                            char *latin1_buffer) const noexcept = 0;
4387
4388
  /**
4389
   * Convert possibly broken UTF-16LE string into Latin1 string.
4390
   * If the string cannot be represented as Latin1, an error
4391
   * is returned.
4392
   *
4393
   * During the conversion also validation of the input string is done.
4394
   * This function is suitable to work with inputs from untrusted sources.
4395
   * This function is not BOM-aware.
4396
   *
4397
   * @param input         the UTF-16LE string to convert
4398
   * @param length        the length of the string in 2-byte code units
4399
   * (char16_t)
4400
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4401
   * result
4402
   * @return a result pair struct (of type simdutf::result containing the two
4403
   * fields error and count) with an error code and either position of the error
4404
   * (in the input in code units) if any, or the number of char written if
4405
   * successful.
4406
   */
4407
  simdutf_warn_unused virtual result
4408
  convert_utf16le_to_latin1_with_errors(const char16_t *input, size_t length,
4409
                                        char *latin1_buffer) const noexcept = 0;
4410
4411
  /**
4412
   * Convert possibly broken UTF-16BE string into Latin1 string.
4413
   * If the string cannot be represented as Latin1, an error
4414
   * is returned.
4415
   *
4416
   * During the conversion also validation of the input string is done.
4417
   * This function is suitable to work with inputs from untrusted sources.
4418
   * This function is not BOM-aware.
4419
   *
4420
   * @param input         the UTF-16BE string to convert
4421
   * @param length        the length of the string in 2-byte code units
4422
   * (char16_t)
4423
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4424
   * result
4425
   * @return a result pair struct (of type simdutf::result containing the two
4426
   * fields error and count) with an error code and either position of the error
4427
   * (in the input in code units) if any, or the number of char written if
4428
   * successful.
4429
   */
4430
  simdutf_warn_unused virtual result
4431
  convert_utf16be_to_latin1_with_errors(const char16_t *input, size_t length,
4432
                                        char *latin1_buffer) const noexcept = 0;
4433
4434
  /**
4435
   * Convert valid UTF-16LE string into Latin1 string.
4436
   *
4437
   * This function assumes that the input string is valid UTF-16LE and that it
4438
   * can be represented as Latin1. If you violate this assumption, the result is
4439
   * implementation defined and may include system-dependent behavior such as
4440
   * crashes.
4441
   *
4442
   * This function is for expert users only and not part of our public API. Use
4443
   * convert_utf16le_to_latin1 instead.
4444
   *
4445
   * This function is not BOM-aware.
4446
   *
4447
   * @param input         the UTF-16LE string to convert
4448
   * @param length        the length of the string in 2-byte code units
4449
   * (char16_t)
4450
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4451
   * result
4452
   * @return number of written code units; 0 if conversion is not possible
4453
   */
4454
  simdutf_warn_unused virtual size_t
4455
  convert_valid_utf16le_to_latin1(const char16_t *input, size_t length,
4456
                                  char *latin1_buffer) const noexcept = 0;
4457
4458
  /**
4459
   * Convert valid UTF-16BE string into Latin1 string.
4460
   *
4461
   * This function assumes that the input string is valid UTF-16BE and that it
4462
   * can be represented as Latin1. If you violate this assumption, the result is
4463
   * implementation defined and may include system-dependent behavior such as
4464
   * crashes.
4465
   *
4466
   * This function is for expert users only and not part of our public API. Use
4467
   * convert_utf16be_to_latin1 instead.
4468
   *
4469
   * This function is not BOM-aware.
4470
   *
4471
   * @param input         the UTF-16BE string to convert
4472
   * @param length        the length of the string in 2-byte code units
4473
   * (char16_t)
4474
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4475
   * result
4476
   * @return number of written code units; 0 if conversion is not possible
4477
   */
4478
  simdutf_warn_unused virtual size_t
4479
  convert_valid_utf16be_to_latin1(const char16_t *input, size_t length,
4480
                                  char *latin1_buffer) const noexcept = 0;
4481
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4482
4483
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4484
  /**
4485
   * Convert possibly broken UTF-16LE string into UTF-8 string.
4486
   *
4487
   * During the conversion also validation of the input string is done.
4488
   * This function is suitable to work with inputs from untrusted sources.
4489
   *
4490
   * This function is not BOM-aware.
4491
   *
4492
   * @param input         the UTF-16LE string to convert
4493
   * @param length        the length of the string in 2-byte code units
4494
   * (char16_t)
4495
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4496
   * @return number of written code units; 0 if input is not a valid UTF-16LE
4497
   * string
4498
   */
4499
  simdutf_warn_unused virtual size_t
4500
  convert_utf16le_to_utf8(const char16_t *input, size_t length,
4501
                          char *utf8_buffer) const noexcept = 0;
4502
4503
  /**
4504
   * Convert possibly broken UTF-16BE string into UTF-8 string.
4505
   *
4506
   * During the conversion also validation of the input string is done.
4507
   * This function is suitable to work with inputs from untrusted sources.
4508
   *
4509
   * This function is not BOM-aware.
4510
   *
4511
   * @param input         the UTF-16BE string to convert
4512
   * @param length        the length of the string in 2-byte code units
4513
   * (char16_t)
4514
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4515
   * @return number of written code units; 0 if input is not a valid UTF-16BE
4516
   * string
4517
   */
4518
  simdutf_warn_unused virtual size_t
4519
  convert_utf16be_to_utf8(const char16_t *input, size_t length,
4520
                          char *utf8_buffer) const noexcept = 0;
4521
4522
  /**
4523
   * Convert possibly broken UTF-16LE string into UTF-8 string and stop on
4524
   * error.
4525
   *
4526
   * During the conversion also validation of the input string is done.
4527
   * This function is suitable to work with inputs from untrusted sources.
4528
   *
4529
   * This function is not BOM-aware.
4530
   *
4531
   * @param input         the UTF-16LE string to convert
4532
   * @param length        the length of the string in 2-byte code units
4533
   * (char16_t)
4534
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4535
   * @return a result pair struct (of type simdutf::result containing the two
4536
   * fields error and count) with an error code and either position of the error
4537
   * (in the input in code units) if any, or the number of char written if
4538
   * successful.
4539
   */
4540
  simdutf_warn_unused virtual result
4541
  convert_utf16le_to_utf8_with_errors(const char16_t *input, size_t length,
4542
                                      char *utf8_buffer) const noexcept = 0;
4543
4544
  /**
4545
   * Convert possibly broken UTF-16BE string into UTF-8 string and stop on
4546
   * error.
4547
   *
4548
   * During the conversion also validation of the input string is done.
4549
   * This function is suitable to work with inputs from untrusted sources.
4550
   *
4551
   * This function is not BOM-aware.
4552
   *
4553
   * @param input         the UTF-16BE string to convert
4554
   * @param length        the length of the string in 2-byte code units
4555
   * (char16_t)
4556
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4557
   * @return a result pair struct (of type simdutf::result containing the two
4558
   * fields error and count) with an error code and either position of the error
4559
   * (in the input in code units) if any, or the number of char written if
4560
   * successful.
4561
   */
4562
  simdutf_warn_unused virtual result
4563
  convert_utf16be_to_utf8_with_errors(const char16_t *input, size_t length,
4564
                                      char *utf8_buffer) const noexcept = 0;
4565
4566
  /**
4567
   * Convert valid UTF-16LE string into UTF-8 string.
4568
   *
4569
   * This function assumes that the input string is valid UTF-16LE.
4570
   *
4571
   * This function is not BOM-aware.
4572
   *
4573
   * @param input         the UTF-16LE string to convert
4574
   * @param length        the length of the string in 2-byte code units
4575
   * (char16_t)
4576
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
4577
   * result
4578
   * @return number of written code units; 0 if conversion is not possible
4579
   */
4580
  simdutf_warn_unused virtual size_t
4581
  convert_valid_utf16le_to_utf8(const char16_t *input, size_t length,
4582
                                char *utf8_buffer) const noexcept = 0;
4583
4584
  /**
4585
   * Convert valid UTF-16BE string into UTF-8 string.
4586
   *
4587
   * This function assumes that the input string is valid UTF-16BE.
4588
   *
4589
   * This function is not BOM-aware.
4590
   *
4591
   * @param input         the UTF-16BE string to convert
4592
   * @param length        the length of the string in 2-byte code units
4593
   * (char16_t)
4594
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
4595
   * result
4596
   * @return number of written code units; 0 if conversion is not possible
4597
   */
4598
  simdutf_warn_unused virtual size_t
4599
  convert_valid_utf16be_to_utf8(const char16_t *input, size_t length,
4600
                                char *utf8_buffer) const noexcept = 0;
4601
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4602
4603
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4604
  /**
4605
   * Convert possibly broken UTF-16LE string into UTF-32 string.
4606
   *
4607
   * During the conversion also validation of the input string is done.
4608
   * This function is suitable to work with inputs from untrusted sources.
4609
   *
4610
   * This function is not BOM-aware.
4611
   *
4612
   * @param input         the UTF-16LE string to convert
4613
   * @param length        the length of the string in 2-byte code units
4614
   * (char16_t)
4615
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4616
   * @return number of written code units; 0 if input is not a valid UTF-16LE
4617
   * string
4618
   */
4619
  simdutf_warn_unused virtual size_t
4620
  convert_utf16le_to_utf32(const char16_t *input, size_t length,
4621
                           char32_t *utf32_buffer) const noexcept = 0;
4622
4623
  /**
4624
   * Convert possibly broken UTF-16BE string into UTF-32 string.
4625
   *
4626
   * During the conversion also validation of the input string is done.
4627
   * This function is suitable to work with inputs from untrusted sources.
4628
   *
4629
   * This function is not BOM-aware.
4630
   *
4631
   * @param input         the UTF-16BE string to convert
4632
   * @param length        the length of the string in 2-byte code units
4633
   * (char16_t)
4634
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4635
   * @return number of written code units; 0 if input is not a valid UTF-16BE
4636
   * string
4637
   */
4638
  simdutf_warn_unused virtual size_t
4639
  convert_utf16be_to_utf32(const char16_t *input, size_t length,
4640
                           char32_t *utf32_buffer) const noexcept = 0;
4641
4642
  /**
4643
   * Convert possibly broken UTF-16LE string into UTF-32 string and stop on
4644
   * error.
4645
   *
4646
   * During the conversion also validation of the input string is done.
4647
   * This function is suitable to work with inputs from untrusted sources.
4648
   *
4649
   * This function is not BOM-aware.
4650
   *
4651
   * @param input         the UTF-16LE string to convert
4652
   * @param length        the length of the string in 2-byte code units
4653
   * (char16_t)
4654
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4655
   * @return a result pair struct (of type simdutf::result containing the two
4656
   * fields error and count) with an error code and either position of the error
4657
   * (in the input in code units) if any, or the number of char32_t written if
4658
   * successful.
4659
   */
4660
  simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(
4661
      const char16_t *input, size_t length,
4662
      char32_t *utf32_buffer) const noexcept = 0;
4663
4664
  /**
4665
   * Convert possibly broken UTF-16BE string into UTF-32 string and stop on
4666
   * error.
4667
   *
4668
   * During the conversion also validation of the input string is done.
4669
   * This function is suitable to work with inputs from untrusted sources.
4670
   *
4671
   * This function is not BOM-aware.
4672
   *
4673
   * @param input         the UTF-16BE string to convert
4674
   * @param length        the length of the string in 2-byte code units
4675
   * (char16_t)
4676
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4677
   * @return a result pair struct (of type simdutf::result containing the two
4678
   * fields error and count) with an error code and either position of the error
4679
   * (in the input in code units) if any, or the number of char32_t written if
4680
   * successful.
4681
   */
4682
  simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(
4683
      const char16_t *input, size_t length,
4684
      char32_t *utf32_buffer) const noexcept = 0;
4685
4686
  /**
4687
   * Convert valid UTF-16LE string into UTF-32 string.
4688
   *
4689
   * This function assumes that the input string is valid UTF-16LE.
4690
   *
4691
   * This function is not BOM-aware.
4692
   *
4693
   * @param input         the UTF-16LE string to convert
4694
   * @param length        the length of the string in 2-byte code units
4695
   * (char16_t)
4696
   * @param utf32_buffer   the pointer to a buffer that can hold the conversion
4697
   * result
4698
   * @return number of written code units; 0 if conversion is not possible
4699
   */
4700
  simdutf_warn_unused virtual size_t
4701
  convert_valid_utf16le_to_utf32(const char16_t *input, size_t length,
4702
                                 char32_t *utf32_buffer) const noexcept = 0;
4703
4704
  /**
4705
   * Convert valid UTF-16BE string into UTF-32 string.
4706
   *
4707
   * This function assumes that the input string is valid UTF-16BE.
4708
   *
4709
   * This function is not BOM-aware.
4710
   *
4711
   * @param input         the UTF-16BE string to convert
4712
   * @param length        the length of the string in 2-byte code units
4713
   * (char16_t)
4714
   * @param utf32_buffer   the pointer to a buffer that can hold the conversion
4715
   * result
4716
   * @return number of written code units; 0 if conversion is not possible
4717
   */
4718
  simdutf_warn_unused virtual size_t
4719
  convert_valid_utf16be_to_utf32(const char16_t *input, size_t length,
4720
                                 char32_t *utf32_buffer) const noexcept = 0;
4721
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4722
4723
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4724
  /**
4725
   * Compute the number of bytes that this UTF-16LE string would require in
4726
   * UTF-8 format.
4727
   *
4728
   * This function does not validate the input. It is acceptable to pass invalid
4729
   * UTF-16 strings but in such cases the result is implementation defined.
4730
   *
4731
   * This function is not BOM-aware.
4732
   *
4733
   * @param input         the UTF-16LE string to convert
4734
   * @param length        the length of the string in 2-byte code units
4735
   * (char16_t)
4736
   * @return the number of bytes required to encode the UTF-16LE string as UTF-8
4737
   */
4738
  simdutf_warn_unused virtual size_t
4739
  utf8_length_from_utf16le(const char16_t *input,
4740
                           size_t length) const noexcept = 0;
4741
4742
  /**
4743
   * Compute the number of bytes that this UTF-16BE string would require in
4744
   * UTF-8 format.
4745
   *
4746
   * This function does not validate the input. It is acceptable to pass invalid
4747
   * UTF-16 strings but in such cases the result is implementation defined.
4748
   *
4749
   * This function is not BOM-aware.
4750
   *
4751
   * @param input         the UTF-16BE string to convert
4752
   * @param length        the length of the string in 2-byte code units
4753
   * (char16_t)
4754
   * @return the number of bytes required to encode the UTF-16BE string as UTF-8
4755
   */
4756
  simdutf_warn_unused virtual size_t
4757
  utf8_length_from_utf16be(const char16_t *input,
4758
                           size_t length) const noexcept = 0;
4759
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4760
4761
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4762
  /**
4763
   * Convert possibly broken UTF-32 string into Latin1 string.
4764
   *
4765
   * During the conversion also validation of the input string is done.
4766
   * This function is suitable to work with inputs from untrusted sources.
4767
   *
4768
   * This function is not BOM-aware.
4769
   *
4770
   * @param input         the UTF-32 string to convert
4771
   * @param length        the length of the string in 4-byte code units
4772
   * (char32_t)
4773
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4774
   * result
4775
   * @return number of written code units; 0 if input is not a valid UTF-32
4776
   * string
4777
   */
4778
  simdutf_warn_unused virtual size_t
4779
  convert_utf32_to_latin1(const char32_t *input, size_t length,
4780
                          char *latin1_buffer) const noexcept = 0;
4781
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4782
4783
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4784
  /**
4785
   * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
4786
   * If the string cannot be represented as Latin1, an error is returned.
4787
   *
4788
   * During the conversion also validation of the input string is done.
4789
   * This function is suitable to work with inputs from untrusted sources.
4790
   *
4791
   * This function is not BOM-aware.
4792
   *
4793
   * @param input         the UTF-32 string to convert
4794
   * @param length        the length of the string in 4-byte code units
4795
   * (char32_t)
4796
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4797
   * result
4798
   * @return a result pair struct (of type simdutf::result containing the two
4799
   * fields error and count) with an error code and either position of the error
4800
   * (in the input in code units) if any, or the number of char written if
4801
   * successful.
4802
   */
4803
  simdutf_warn_unused virtual result
4804
  convert_utf32_to_latin1_with_errors(const char32_t *input, size_t length,
4805
                                      char *latin1_buffer) const noexcept = 0;
4806
4807
  /**
4808
   * Convert valid UTF-32 string into Latin1 string.
4809
   *
4810
   * This function assumes that the input string is valid UTF-32 and can be
4811
   * represented as Latin1. If you violate this assumption, the result is
4812
   * implementation defined and may include system-dependent behavior such as
4813
   * crashes.
4814
   *
4815
   * This function is for expert users only and not part of our public API. Use
4816
   * convert_utf32_to_latin1 instead.
4817
   *
4818
   * This function is not BOM-aware.
4819
   *
4820
   * @param input         the UTF-32 string to convert
4821
   * @param length        the length of the string in 4-byte code units
4822
   * (char32_t)
4823
   * @param latin1_buffer   the pointer to a buffer that can hold the conversion
4824
   * result
4825
   * @return number of written code units; 0 if conversion is not possible
4826
   */
4827
  simdutf_warn_unused virtual size_t
4828
  convert_valid_utf32_to_latin1(const char32_t *input, size_t length,
4829
                                char *latin1_buffer) const noexcept = 0;
4830
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4831
4832
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4833
  /**
4834
   * Convert possibly broken UTF-32 string into UTF-8 string.
4835
   *
4836
   * During the conversion also validation of the input string is done.
4837
   * This function is suitable to work with inputs from untrusted sources.
4838
   *
4839
   * This function is not BOM-aware.
4840
   *
4841
   * @param input         the UTF-32 string to convert
4842
   * @param length        the length of the string in 4-byte code units
4843
   * (char32_t)
4844
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4845
   * @return number of written code units; 0 if input is not a valid UTF-32
4846
   * string
4847
   */
4848
  simdutf_warn_unused virtual size_t
4849
  convert_utf32_to_utf8(const char32_t *input, size_t length,
4850
                        char *utf8_buffer) const noexcept = 0;
4851
4852
  /**
4853
   * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
4854
   *
4855
   * During the conversion also validation of the input string is done.
4856
   * This function is suitable to work with inputs from untrusted sources.
4857
   *
4858
   * This function is not BOM-aware.
4859
   *
4860
   * @param input         the UTF-32 string to convert
4861
   * @param length        the length of the string in 4-byte code units
4862
   * (char32_t)
4863
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4864
   * @return a result pair struct (of type simdutf::result containing the two
4865
   * fields error and count) with an error code and either position of the error
4866
   * (in the input in code units) if any, or the number of char written if
4867
   * successful.
4868
   */
4869
  simdutf_warn_unused virtual result
4870
  convert_utf32_to_utf8_with_errors(const char32_t *input, size_t length,
4871
                                    char *utf8_buffer) const noexcept = 0;
4872
4873
  /**
4874
   * Convert valid UTF-32 string into UTF-8 string.
4875
   *
4876
   * This function assumes that the input string is valid UTF-32.
4877
   *
4878
   * This function is not BOM-aware.
4879
   *
4880
   * @param input         the UTF-32 string to convert
4881
   * @param length        the length of the string in 4-byte code units
4882
   * (char32_t)
4883
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
4884
   * result
4885
   * @return number of written code units; 0 if conversion is not possible
4886
   */
4887
  simdutf_warn_unused virtual size_t
4888
  convert_valid_utf32_to_utf8(const char32_t *input, size_t length,
4889
                              char *utf8_buffer) const noexcept = 0;
4890
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4891
4892
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4893
  /**
4894
   * Compute the number of 2-byte code units (char16_t) that this Latin1
4895
   * string would require in UTF-16 format.
4896
   *
4897
   * The result depends only on the length: it is returned unchanged.
4898
   *
4899
   * @param length        the length of the Latin1 string in bytes
4900
   * @return the number of 2-byte code units required to encode the Latin1
4901
   * string as UTF-16
4902
   */
4903
  simdutf_warn_unused virtual size_t
4904
  utf16_length_from_latin1(size_t length) const noexcept {
4905
    return length;
4906
  }
4907
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4908
4909
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4910
  /**
4911
   * Convert possibly broken UTF-32 string into UTF-16LE string.
4912
   *
4913
   * During the conversion also validation of the input string is done.
4914
   * This function is suitable to work with inputs from untrusted sources.
4915
   *
4916
   * This function is not BOM-aware.
4917
   *
4918
   * @param input         the UTF-32 string to convert
4919
   * @param length        the length of the string in 4-byte code units
4920
   * (char32_t)
4921
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4922
   * @return number of written code units; 0 if input is not a valid UTF-32
4923
   * string
4924
   */
4925
  simdutf_warn_unused virtual size_t
4926
  convert_utf32_to_utf16le(const char32_t *input, size_t length,
4927
                           char16_t *utf16_buffer) const noexcept = 0;
4928
4929
  /**
4930
   * Convert possibly broken UTF-32 string into UTF-16BE string.
4931
   *
4932
   * During the conversion also validation of the input string is done.
4933
   * This function is suitable to work with inputs from untrusted sources.
4934
   *
4935
   * This function is not BOM-aware.
4936
   *
4937
   * @param input         the UTF-32 string to convert
4938
   * @param length        the length of the string in 4-byte code units
4939
   * (char32_t)
4940
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4941
   * @return number of written code units; 0 if input is not a valid UTF-32
4942
   * string
4943
   */
4944
  simdutf_warn_unused virtual size_t
4945
  convert_utf32_to_utf16be(const char32_t *input, size_t length,
4946
                           char16_t *utf16_buffer) const noexcept = 0;
4947
4948
  /**
4949
   * Convert possibly broken UTF-32 string into UTF-16LE string and stop on
4950
   * error.
4951
   *
4952
   * During the conversion also validation of the input string is done.
4953
   * This function is suitable to work with inputs from untrusted sources.
4954
   *
4955
   * This function is not BOM-aware.
4956
   *
4957
   * @param input         the UTF-32 string to convert
4958
   * @param length        the length of the string in 4-byte code units
4959
   * (char32_t)
4960
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4961
   * @return a result pair struct (of type simdutf::result containing the two
4962
   * fields error and count) with an error code and either position of the error
4963
   * (in the input in code units) if any, or the number of char16_t written if
4964
   * successful.
4965
   */
4966
  simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(
4967
      const char32_t *input, size_t length,
4968
      char16_t *utf16_buffer) const noexcept = 0;
4969
4970
  /**
4971
   * Convert possibly broken UTF-32 string into UTF-16BE string and stop on
4972
   * error.
4973
   *
4974
   * During the conversion also validation of the input string is done.
4975
   * This function is suitable to work with inputs from untrusted sources.
4976
   *
4977
   * This function is not BOM-aware.
4978
   *
4979
   * @param input         the UTF-32 string to convert
4980
   * @param length        the length of the string in 4-byte code units
4981
   * (char32_t)
4982
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4983
   * @return a result pair struct (of type simdutf::result containing the two
4984
   * fields error and count) with an error code and either position of the error
4985
   * (in the input in code units) if any, or the number of char16_t written if
4986
   * successful.
4987
   */
4988
  simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(
4989
      const char32_t *input, size_t length,
4990
      char16_t *utf16_buffer) const noexcept = 0;
4991
4992
  /**
4993
   * Convert valid UTF-32 string into UTF-16LE string.
4994
   *
4995
   * This function assumes that the input string is valid UTF-32.
4996
   *
4997
   * This function is not BOM-aware.
4998
   *
4999
   * @param input         the UTF-32 string to convert
5000
   * @param length        the length of the string in 4-byte code units
5001
   * (char32_t)
5002
   * @param utf16_buffer   the pointer to a buffer that can hold the conversion
5003
   * result
5004
   * @return number of written code units; 0 if conversion is not possible
5005
   */
5006
  simdutf_warn_unused virtual size_t
5007
  convert_valid_utf32_to_utf16le(const char32_t *input, size_t length,
5008
                                 char16_t *utf16_buffer) const noexcept = 0;
5009
5010
  /**
5011
   * Convert valid UTF-32 string into UTF-16BE string.
5012
   *
5013
   * This function assumes that the input string is valid UTF-32.
5014
   *
5015
   * This function is not BOM-aware.
5016
   *
5017
   * @param input         the UTF-32 string to convert
5018
   * @param length        the length of the string in 4-byte code units
5019
   * (char32_t)
5020
   * @param utf16_buffer   the pointer to a buffer that can hold the conversion
5021
   * result
5022
   * @return number of written code units; 0 if conversion is not possible
5023
   */
5024
  simdutf_warn_unused virtual size_t
5025
  convert_valid_utf32_to_utf16be(const char32_t *input, size_t length,
5026
                                 char16_t *utf16_buffer) const noexcept = 0;
5027
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5028
5029
#if SIMDUTF_FEATURE_UTF16
5030
  /**
5031
   * Change the endianness of the input. Can be used to go from UTF-16LE to
5032
   * UTF-16BE or from UTF-16BE to UTF-16LE.
5033
   *
5034
   * This function does not validate the input.
5035
   *
5036
   * This function is not BOM-aware.
5037
   *
5038
   * @param input         the UTF-16 string to process
5039
   * @param length        the length of the string in 2-byte code units
5040
   * (char16_t)
5041
   * @param output        the pointer to a buffer that can hold the conversion
5042
   * result
5043
   */
5044
  virtual void change_endianness_utf16(const char16_t *input, size_t length,
5045
                                       char16_t *output) const noexcept = 0;
5046
#endif // SIMDUTF_FEATURE_UTF16
5047
5048
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5049
  /**
5050
   * Return the number of bytes that this Latin1 string would require in UTF-8
5051
   * format.
5052
   *
5053
   * @param input         the Latin1 string to convert
5054
   * @param length        the length of the string in bytes
5055
   * @return the number of bytes required to encode the Latin1 string as UTF-8
5056
   */
5057
  simdutf_warn_unused virtual size_t
5058
  utf8_length_from_latin1(const char *input, size_t length) const noexcept = 0;
5059
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5060
5061
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5062
  /**
5063
   * Compute the number of bytes that this UTF-32 string would require in UTF-8
5064
   * format.
5065
   *
5066
   * This function does not validate the input. It is acceptable to pass invalid
5067
   * UTF-32 strings but in such cases the result is implementation defined.
5068
   *
5069
   * @param input         the UTF-32 string to convert
5070
   * @param length        the length of the string in 4-byte code units
5071
   * (char32_t)
5072
   * @return the number of bytes required to encode the UTF-32 string as UTF-8
5073
   */
5074
  simdutf_warn_unused virtual size_t
5075
  utf8_length_from_utf32(const char32_t *input,
5076
                         size_t length) const noexcept = 0;
5077
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5078
5079
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5080
  /**
5081
   * Compute the number of bytes that this UTF-32 string would require in Latin1
5082
   * format.
5083
   *
5084
   * This function does not validate the input. It is acceptable to pass invalid
5085
   * UTF-32 strings but in such cases the result is implementation defined.
5086
   *
5087
   * @param length        the length of the string in 4-byte code units
5088
   * (char32_t)
5089
   * @return the number of bytes required to encode the UTF-32 string as Latin1
5090
   */
5091
  simdutf_warn_unused virtual size_t
5092
  latin1_length_from_utf32(size_t length) const noexcept {
5093
    return length; // one Latin1 byte per 4-byte code unit: length is unchanged
5094
  }
5095
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5096
5097
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5098
  /**
5099
   * Compute the number of bytes that this UTF-8 string would require in Latin1
5100
   * format.
5101
   *
5102
   * This function does not validate the input. It is acceptable to pass invalid
5103
   * UTF-8 strings but in such cases the result is implementation defined.
5104
   *
5105
   * @param input         the UTF-8 string to convert
5106
   * @param length        the length of the string in bytes
5107
   * @return the number of bytes required to encode the UTF-8 string as Latin1
5108
   */
5109
  simdutf_warn_unused virtual size_t
5110
  latin1_length_from_utf8(const char *input, size_t length) const noexcept = 0;
5111
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5112
5113
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5114
  /**
5115
   * Compute the number of bytes that this UTF-16LE/BE string would require in
5116
   * Latin1 format.
5117
   *
5118
   * This function does not validate the input. It is acceptable to pass invalid
5119
   * UTF-16 strings but in such cases the result is implementation defined.
5120
   *
5121
   * This function is not BOM-aware. Only the length is needed: the string
5122
   * content is never read and the length is returned unchanged.
5123
   *
5124
   * @param length        the length of the string in 2-byte code units
5125
   * (char16_t)
5126
   * @return the number of bytes required to encode the UTF-16LE/BE string as
5127
   * Latin1
5128
   */
5129
  simdutf_warn_unused virtual size_t
5130
  latin1_length_from_utf16(size_t length) const noexcept {
5131
    return length;
5132
  }
5133
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5134
5135
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5136
  /**
5137
   * Compute the number of two-byte code units that this UTF-32 string would
5138
   * require in UTF-16 format.
5139
   *
5140
   * This function does not validate the input. It is acceptable to pass invalid
5141
   * UTF-32 strings but in such cases the result is implementation defined.
5142
   *
5143
   * @param input         the UTF-32 string to convert
5144
   * @param length        the length of the string in 4-byte code units
5145
   * (char32_t)
5146
   * @return the number of 2-byte code units needed to encode it as UTF-16
5147
   */
5148
  simdutf_warn_unused virtual size_t
5149
  utf16_length_from_utf32(const char32_t *input,
5150
                          size_t length) const noexcept = 0;
5151
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5152
5153
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5154
  /**
5155
   * Return the number of 4-byte code units (char32_t) that this Latin1 string
5156
   * would require in UTF-32 format. The length is returned unchanged.
5157
   *
5158
   * @param length        the length of the Latin1 string in bytes
5159
   * @return the number of 4-byte code units required to encode the Latin1
5160
   * string as UTF-32
5161
   */
5162
  simdutf_warn_unused virtual size_t
5163
  utf32_length_from_latin1(size_t length) const noexcept {
5164
    return length;
5165
  }
5166
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5167
5168
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5169
  /**
5170
   * Compute the number of 4-byte code units (char32_t) that this UTF-16LE
5171
   * string would require in UTF-32 format.
5172
   *
5173
   * This function is equivalent to count_utf16le.
5174
   *
5175
   * This function does not validate the input. It is acceptable to pass invalid
5176
   * UTF-16 strings but in such cases the result is implementation defined.
5177
   *
5178
   * This function is not BOM-aware.
5179
   *
5180
   * @param input         the UTF-16LE string to convert
5181
   * @param length        the length of the string in 2-byte code units
5182
   * (char16_t)
5183
   * @return the number of 4-byte code units required to encode the UTF-16LE
5184
   * string as UTF-32
5185
   */
5186
  simdutf_warn_unused virtual size_t
5187
  utf32_length_from_utf16le(const char16_t *input,
5188
                            size_t length) const noexcept = 0;
5189
5190
  /**
5191
   * Compute the number of 4-byte code units (char32_t) that this UTF-16BE
5192
   * string would require in UTF-32 format.
5193
   *
5194
   * This function is equivalent to count_utf16be.
5195
   *
5196
   * This function does not validate the input. It is acceptable to pass invalid
5197
   * UTF-16 strings but in such cases the result is implementation defined.
5198
   *
5199
   * This function is not BOM-aware.
5200
   *
5201
   * @param input         the UTF-16BE string to convert
5202
   * @param length        the length of the string in 2-byte code units
5203
   * (char16_t)
5204
   * @return the number of 4-byte code units required to encode the UTF-16BE
5205
   * string as UTF-32
5206
   */
5207
  simdutf_warn_unused virtual size_t
5208
  utf32_length_from_utf16be(const char16_t *input,
5209
                            size_t length) const noexcept = 0;
5210
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5211
5212
#if SIMDUTF_FEATURE_UTF16
5213
  /**
5214
   * Count the number of code points (characters) in the string assuming that
5215
   * it is valid.
5216
   *
5217
   * This function assumes that the input string is valid UTF-16LE.
5218
   * It is acceptable to pass invalid UTF-16 strings but in such cases
5219
   * the result is implementation defined.
5220
   *
5221
   * This function is not BOM-aware.
5222
   *
5223
   * @param input         the UTF-16LE string to process
5224
   * @param length        the length of the string in 2-byte code units
5225
   * (char16_t)
5226
   * @return number of code points
5227
   */
5228
  simdutf_warn_unused virtual size_t
5229
  count_utf16le(const char16_t *input, size_t length) const noexcept = 0;
5230
5231
  /**
5232
   * Count the number of code points (characters) in the string assuming that
5233
   * it is valid.
5234
   *
5235
   * This function assumes that the input string is valid UTF-16BE.
5236
   * It is acceptable to pass invalid UTF-16 strings but in such cases
5237
   * the result is implementation defined.
5238
   *
5239
   * This function is not BOM-aware.
5240
   *
5241
   * @param input         the UTF-16BE string to process
5242
   * @param length        the length of the string in 2-byte code units
5243
   * (char16_t)
5244
   * @return number of code points
5245
   */
5246
  simdutf_warn_unused virtual size_t
5247
  count_utf16be(const char16_t *input, size_t length) const noexcept = 0;
5248
#endif // SIMDUTF_FEATURE_UTF16
5249
5250
#if SIMDUTF_FEATURE_UTF8
5251
  /**
5252
   * Count the number of code points (characters) in the string assuming that
5253
   * it is valid.
5254
   *
5255
   * This function assumes that the input string is valid UTF-8.
5256
   * It is acceptable to pass invalid UTF-8 strings but in such cases
5257
   * the result is implementation defined.
5258
   *
5259
   * @param input         the UTF-8 string to process
5260
   * @param length        the length of the string in bytes
5261
   * @return number of code points
5262
   */
5263
  simdutf_warn_unused virtual size_t
5264
  count_utf8(const char *input, size_t length) const noexcept = 0;
5265
#endif // SIMDUTF_FEATURE_UTF8
5266
5267
#if SIMDUTF_FEATURE_BASE64
5268
  /**
5269
   * Provide the maximal binary length in bytes given the base64 input.
5270
   * As long as the input does not contain ignorable characters (e.g., ASCII
5271
   * spaces or linefeed characters), the result is exact. In particular, the
5272
   * function checks for padding characters.
5273
   *
5274
   * The function is fast (constant time). It checks up to two characters at
5275
   * the end of the string. The input is not otherwise validated or read.
5276
   *
5277
   * @param input         the base64 input to process
5278
   * @param length        the length of the base64 input in bytes
5279
   * @return maximal number of binary bytes
5280
   */
5281
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
5282
      const char *input, size_t length) const noexcept;
5283
5284
  /**
5285
   * Provide the maximal binary length in bytes given the base64 input.
5286
   * As long as the input does not contain ignorable characters (e.g., ASCII
5287
   * spaces or linefeed characters), the result is exact. In particular, the
5288
   * function checks for padding characters.
5289
   *
5290
   * The function is fast (constant time). It checks up to two characters at
5291
   * the end of the string. The input is not otherwise validated or read.
5292
   *
5293
   * @param input         the base64 input to process, in ASCII stored as 16-bit
5294
   * units
5295
   * @param length        the length of the base64 input in 16-bit units
5296
   * @return maximal number of binary bytes
5297
   */
5298
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
5299
      const char16_t *input, size_t length) const noexcept;
5300
5301
  /**
5302
   * Convert a base64 input to a binary output.
5303
   *
5304
   * This function follows the WHATWG forgiving-base64 format, which means that
5305
   * it will ignore any ASCII spaces in the input. You may provide a padded
5306
   * input (with one or two equal signs at the end) or an unpadded input
5307
   * (without any equal signs at the end).
5308
   *
5309
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5310
   *
5311
   * This function will fail in case of invalid input. When last_chunk_options =
5312
   * loose, there are two possible reasons for failure: the input contains a
5313
   * number of base64 characters that when divided by 4, leaves a single
5314
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5315
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5316
   *
5317
   * You should call this function with a buffer that is at least
5318
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
5319
   * provide that much space, the function may cause a buffer overflow.
5320
   *
5321
   * @param input         the base64 string to process
5322
   * @param length        the length of the string in bytes
5323
   * @param output        the pointer to a buffer that can hold the conversion
5324
   * result (should be at least maximal_binary_length_from_base64(input, length)
5325
   * bytes long).
5326
   * @param options       the base64 options to use, can be base64_default or
5327
   * base64_url, is base64_default by default.
5328
   * @return a result pair struct (of type simdutf::result containing the two
5329
   * fields error and count) with an error code and either position of the error
5330
   * (in the input in bytes) if any, or the number of bytes written if
5331
   * successful.
5332
   */
5333
  simdutf_warn_unused virtual result
5334
  base64_to_binary(const char *input, size_t length, char *output,
5335
                   base64_options options = base64_default,
5336
                   last_chunk_handling_options last_chunk_options =
5337
                       last_chunk_handling_options::loose) const noexcept = 0;
5338
5339
  /**
5340
   * Convert a base64 input to a binary output while returning more details
5341
   * than base64_to_binary.
5342
   *
5343
   * This function follows the WHATWG forgiving-base64 format, which means that
5344
   * it will ignore any ASCII spaces in the input. You may provide a padded
5345
   * input (with one or two equal signs at the end) or an unpadded input
5346
   * (without any equal signs at the end).
5347
   *
5348
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5349
   *
5350
   * This function will fail in case of invalid input. When last_chunk_options =
5351
   * loose, there are two possible reasons for failure: the input contains a
5352
   * number of base64 characters that when divided by 4, leaves a single
5353
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5354
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5355
   *
5356
   * You should call this function with a buffer that is at least
5357
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
5358
   * provide that much space, the function may cause a buffer overflow.
5359
   *
5360
   * @param input         the base64 string to process
5361
   * @param length        the length of the string in bytes
5362
   * @param output        the pointer to a buffer that can hold the conversion
5363
   * result (should be at least maximal_binary_length_from_base64(input, length)
5364
   * bytes long).
5365
   * @param options       the base64 options to use, can be base64_default or
5366
   * base64_url, is base64_default by default.
5367
   * @return a full_result struct (of type simdutf::full_result containing the
5368
   * three fields error, input_count and output_count).
5369
   */
5370
  simdutf_warn_unused virtual full_result base64_to_binary_details(
5371
      const char *input, size_t length, char *output,
5372
      base64_options options = base64_default,
5373
      last_chunk_handling_options last_chunk_options =
5374
          last_chunk_handling_options::loose) const noexcept = 0;
5375
  /**
5376
   * Convert a base64 input to a binary output.
5377
   *
5378
   * This function follows the WHATWG forgiving-base64 format, which means that
5379
   * it will ignore any ASCII spaces in the input. You may provide a padded
5380
   * input (with one or two equal signs at the end) or an unpadded input
5381
   * (without any equal signs at the end).
5382
   *
5383
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5384
   *
5385
   * This function will fail in case of invalid input. When last_chunk_options =
5386
   * loose, there are two possible reasons for failure: the input contains a
5387
   * number of base64 characters that when divided by 4, leaves a single
5388
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5389
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5390
   *
5391
   * You should call this function with a buffer that is at least
5392
   * maximal_binary_length_from_base64(input, length) bytes long. If you
5393
   * fail to provide that much space, the function may cause a buffer overflow.
5394
   *
5395
   * @param input         the base64 string to process, in ASCII stored as
5396
   * 16-bit units
5397
   * @param length        the length of the string in 16-bit units
5398
   * @param output        the pointer to a buffer that can hold the conversion
5399
   * result (should be at least maximal_binary_length_from_base64(input, length)
5400
   * bytes long).
5401
   * @param options       the base64 options to use, can be base64_default or
5402
   * base64_url, is base64_default by default.
5403
   * @return a result pair struct (of type simdutf::result containing the two
5404
   * fields error and count) with an error code and position of the
5405
   * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the
5406
   * number of bytes written if successful.
5407
   */
5408
  simdutf_warn_unused virtual result
5409
  base64_to_binary(const char16_t *input, size_t length, char *output,
5410
                   base64_options options = base64_default,
5411
                   last_chunk_handling_options last_chunk_options =
5412
                       last_chunk_handling_options::loose) const noexcept = 0;
5413
5414
  /**
5415
   * Convert a base64 input to a binary output while returning more details
5416
   * than base64_to_binary.
5417
   *
5418
   * This function follows the WHATWG forgiving-base64 format, which means that
5419
   * it will ignore any ASCII spaces in the input. You may provide a padded
5420
   * input (with one or two equal signs at the end) or an unpadded input
5421
   * (without any equal signs at the end).
5422
   *
5423
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5424
   *
5425
   * This function will fail in case of invalid input. When last_chunk_options =
5426
   * loose, there are two possible reasons for failure: the input contains a
5427
   * number of base64 characters that when divided by 4, leaves a single
5428
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5429
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5430
   *
5431
   * You should call this function with a buffer that is at least
5432
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
5433
   * provide that much space, the function may cause a buffer overflow.
5434
   *
5435
   * @param input         the base64 string to process, as 16-bit units
5436
   * @param length        the length of the string in 16-bit units
5437
   * @param output        the pointer to a buffer that can hold the conversion
5438
   * result (should be at least maximal_binary_length_from_base64(input, length)
5439
   * bytes long).
5440
   * @param options       the base64 options to use, can be base64_default or
5441
   * base64_url, is base64_default by default.
5442
   * @return a full_result struct (of type simdutf::full_result containing the
5443
   * three fields error, input_count and output_count).
5444
   */
5445
  simdutf_warn_unused virtual full_result base64_to_binary_details(
5446
      const char16_t *input, size_t length, char *output,
5447
      base64_options options = base64_default,
5448
      last_chunk_handling_options last_chunk_options =
5449
          last_chunk_handling_options::loose) const noexcept = 0;
5450
5451
  /**
5452
   * Provide the base64 length in bytes given the length of a binary input.
5453
   *
5454
   * @param length        the length of the input in bytes
5455
   * @param options       the base64 options to use, can be base64_default or
5456
   * base64_url, is base64_default by default.
5457
   * @return number of base64 bytes
5458
   */
5459
  simdutf_warn_unused size_t base64_length_from_binary(
5460
      size_t length, base64_options options = base64_default) const noexcept;
5461
5462
  /**
5463
   * Convert a binary input to a base64 output.
5464
   *
5465
   * The default option (simdutf::base64_default) uses the characters `+` and
5466
   * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of
5467
   * the output to ensure that the output length is a multiple of four.
5468
   *
5469
   * The URL option (simdutf::base64_url) uses the characters `-` and `_` as
5470
   * part of its alphabet. No padding is added at the end of the output.
5471
   *
5472
   * This function always succeeds.
5473
   *
5474
   * @param input         the binary to process
5475
   * @param length        the length of the input in bytes
5476
   * @param output        the pointer to a buffer that can hold the conversion
5477
   * result (should be at least base64_length_from_binary(length) bytes long)
5478
   * @param options       the base64 options to use, can be base64_default or
5479
   * base64_url, is base64_default by default.
5480
   * @return number of written bytes, will be equal to
5481
   * base64_length_from_binary(length, options)
5482
   */
5483
  virtual size_t
5484
  binary_to_base64(const char *input, size_t length, char *output,
5485
                   base64_options options = base64_default) const noexcept = 0;
5486
5487
  /**
5488
   * Convert a binary input to a base64 output with lines of given length.
5489
   * Lines are separated by a single linefeed character.
5490
   *
5491
   * The default option (simdutf::base64_default) uses the characters `+` and
5492
   * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of
5493
   * the output to ensure that the output length is a multiple of four.
5494
   *
5495
   * The URL option (simdutf::base64_url) uses the characters `-` and `_` as
5496
   * part of its alphabet. No padding is added at the end of the output.
5497
   *
5498
   * This function always succeeds.
5499
   *
5500
   * @param input         the binary to process
5501
   * @param length        the length of the input in bytes
5502
   * @param output        the pointer to a buffer that can hold the conversion
5503
   * result (should be at least base64_length_from_binary_with_lines(length,
5504
   * options, line_length) bytes long)
5505
   * @param line_length   the length of each line, values smaller than 4 are
5506
   * interpreted as 4
5507
   * @param options       the base64 options to use, can be base64_default or
5508
   * base64_url, is base64_default by default.
5509
   * @return number of written bytes, will be equal to
5510
   * base64_length_from_binary_with_lines(length, options, line_length)
5511
   */
5512
  virtual size_t binary_to_base64_with_lines(
5513
      const char *input, size_t length, char *output,
5514
      size_t line_length = simdutf::default_line_length,
5515
      base64_options options = base64_default) const noexcept = 0;
5516
  /**
5517
   * Find the first occurrence of a character in a string. If the character is
5518
   * not found, return a pointer to the end of the string.
5519
   * @param start        the start of the string
5520
   * @param end          the end of the string
5521
   * @param character    the character to find
5522
   * @return a pointer to the first occurrence of the character in the string,
5523
   * or a pointer to the end of the string if the character is not found.
5524
   *
5525
   */
5526
  virtual const char *find(const char *start, const char *end,
5527
                           char character) const noexcept = 0;
5528
  virtual const char16_t *find(const char16_t *start, const char16_t *end,
5529
                               char16_t character) const noexcept = 0;
5530
#endif // SIMDUTF_FEATURE_BASE64
5531
5532
#ifdef SIMDUTF_INTERNAL_TESTS
5533
  // This method is exported only in developer mode, its purpose
5534
  // is to expose some internal test procedures from the given
5535
  // implementation and then use them through our standard test
5536
  // framework.
5537
  //
5538
  // Regular users should not use it, the tests of the public
5539
  // API are enough.
5540
5541
  struct TestProcedure {
5542
    // display name
5543
    std::string name;
5544
5545
    // procedure should return whether given test pass or not
5546
    void (*procedure)(const implementation &);
5547
  };
5548
5549
  /// Internal test procedures exposed by this implementation. Only declared
  /// when SIMDUTF_INTERNAL_TESTS is defined.
  virtual std::vector<TestProcedure> internal_tests() const;
5550
#endif
5551
5552
protected:
5553
  /** @private Construct an implementation with the given name and description.
5554
   * For subclasses. */
5555
  simdutf_really_inline implementation(const char *name,
5556
                                       const char *description,
5557
                                       uint32_t required_instruction_sets)
5558
      : _name(name), _description(description),
5559
        _required_instruction_sets(required_instruction_sets) {}
5560
5561
protected:
5562
  /// Protected and not declared virtual here: code outside the class
  /// hierarchy cannot destroy an object through an `implementation` pointer.
  ~implementation() = default;
5563
5564
private:
5565
  /**
5566
   * The name of this implementation.
5567
   */
5568
  const char *_name;
5569
5570
  /**
5571
   * The description of this implementation.
5572
   */
5573
  const char *_description;
5574
5575
  /**
5576
   * Instruction sets required for this implementation.
5577
   */
5578
  const uint32_t _required_instruction_sets;
5579
};
5580
5581
/** @private */
5582
namespace internal {
5583
5584
/**
5585
 * The list of available implementations compiled into simdutf.
5586
 */
5587
class available_implementation_list {
5588
public:
5589
  /** Get the list of available implementations compiled into simdutf */
5590
  simdutf_really_inline available_implementation_list() {}
5591
  /** Number of implementations */
5592
  size_t size() const noexcept;
5593
  /** STL const begin() iterator */
5594
  const implementation *const *begin() const noexcept;
5595
  /** STL const end() iterator */
5596
  const implementation *const *end() const noexcept;
5597
5598
  /**
5599
   * Get the implementation with the given name.
5600
   *
5601
   * Case sensitive.
5602
   *
5603
   *     const implementation *impl =
5604
   * simdutf::available_implementations["westmere"]; if (!impl) { exit(1); } if
5605
   * (!imp->supported_by_runtime_system()) { exit(1); }
5606
   *     simdutf::active_implementation = impl;
5607
   *
5608
   * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
5609
   * @return the implementation, or nullptr if the parse failed.
5610
   */
5611
  const implementation *operator[](const std::string &name) const noexcept {
5612
    for (const implementation *impl : *this) {
5613
      if (impl->name() == name) {
5614
        return impl;
5615
      }
5616
    }
5617
    return nullptr;
5618
  }
5619
5620
  /**
5621
   * Detect the most advanced implementation supported by the current host.
5622
   *
5623
   * This is used to initialize the implementation on startup.
5624
   *
5625
   *     const implementation *impl =
5626
   * simdutf::available_implementation::detect_best_supported();
5627
   *     simdutf::active_implementation = impl;
5628
   *
5629
   * @return the most advanced supported implementation for the current host, or
5630
   * an implementation that returns UNSUPPORTED_ARCHITECTURE if there is no
5631
   * supported implementation. Will never return nullptr.
5632
   */
5633
  const implementation *detect_best_supported() const noexcept;
5634
};
5635
5636
/**
 * Minimal pointer wrapper. When SIMDUTF_NO_THREADS is defined the pointer is
 * held as a plain `T *`; otherwise it is held in a `std::atomic<T *>` and
 * every read and write goes through the atomic with its default memory
 * ordering.
 */
template <typename T> class atomic_ptr {
public:
  /** Wrap the given pointer (implicit conversion from `T *` is intended). */
  atomic_ptr(T *_ptr) : ptr{_ptr} {}

  // Read-only access.
  operator const T *() const { return get(); }
  const T &operator*() const { return *get(); }
  const T *operator->() const { return get(); }

  // Mutable access.
  operator T *() { return get(); }
  T &operator*() { return *get(); }
  T *operator->() { return get(); }

  /** Replace the held pointer. */
  atomic_ptr &operator=(T *_ptr) {
    ptr = _ptr;
    return *this;
  }

private:
#if defined(SIMDUTF_NO_THREADS)
  /** Read the currently held pointer. */
  T *get() const { return ptr; }

  T *ptr;
#else
  /** Read the currently held pointer with an atomic load. */
  T *get() const { return ptr.load(); }

  std::atomic<T *> ptr;
#endif
};
5675
5676
class detect_best_supported_implementation_on_first_use;
5677
5678
} // namespace internal
5679
5680
/**
5681
 * The list of available implementations compiled into simdutf.
5682
 */
5683
extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list &
5684
get_available_implementations();
5685
5686
/**
5687
 * The active implementation.
5688
 *
5689
 * Automatically initialized on first use to the most advanced implementation
5690
 * supported by this hardware.
5691
 */
5692
extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> &
5693
get_active_implementation();
5694
5695
} // namespace simdutf
5696
5697
#endif // SIMDUTF_IMPLEMENTATION_H