Coverage Report

Created: 2026-02-14 06:56

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/include/simdutf/implementation.h
Line
Count
Source
1
#ifndef SIMDUTF_IMPLEMENTATION_H
2
#define SIMDUTF_IMPLEMENTATION_H
3
#if !defined(SIMDUTF_NO_THREADS)
4
  #include <atomic>
5
#endif
6
#include <string>
7
#ifdef SIMDUTF_INTERNAL_TESTS
8
  #include <vector>
9
#endif
10
#include "simdutf/common_defs.h"
11
#include "simdutf/compiler_check.h"
12
#include "simdutf/encoding_types.h"
13
#include "simdutf/error.h"
14
#include "simdutf/internal/isadetection.h"
15
16
#if SIMDUTF_SPAN
17
  #include <concepts>
18
  #include <type_traits>
19
  #include <span>
20
  #include <tuple>
21
#endif
22
#if SIMDUTF_CPLUSPLUS17
23
  #include <string_view>
24
#endif
25
// The following defines are conditionally enabled/disabled during amalgamation.
26
// By default all features are enabled, regular code shouldn't check them. Only
27
// when user code really relies on a selected subset, it's good to verify these
28
// flags, like:
29
//
30
//      #if !SIMDUTF_FEATURE_UTF16
31
//      #   error("Please amalgamate simdutf with UTF-16 support")
32
//      #endif
33
//
34
#define SIMDUTF_FEATURE_DETECT_ENCODING 1
35
#define SIMDUTF_FEATURE_ASCII 1
36
#define SIMDUTF_FEATURE_LATIN1 1
37
#define SIMDUTF_FEATURE_UTF8 1
38
#define SIMDUTF_FEATURE_UTF16 1
39
#define SIMDUTF_FEATURE_UTF32 1
40
#define SIMDUTF_FEATURE_BASE64 1
41
42
#if SIMDUTF_CPLUSPLUS23
43
  #include <simdutf/constexpr_ptr.h>
44
#endif
45
46
#if SIMDUTF_SPAN
47
/// helpers placed in namespace detail are not a part of the public API
48
namespace simdutf {
49
namespace detail {
50
/**
51
 * Matches a byte, in the many ways C++ allows. Note that these
52
 * are all distinct types. T must be exactly one of the listed types:
 * cv-qualified or reference types do not match (see is_byte_like).
53
 */
54
template <typename T>
55
concept byte_like = std::is_same_v<T, std::byte> ||     //
56
                    std::is_same_v<T, char> ||          //
57
                    std::is_same_v<T, signed char> ||   //
58
                    std::is_same_v<T, unsigned char> || //
59
                    std::is_same_v<T, char8_t>;
60
61
/** byte_like applied after stripping cv-qualifiers and references from T. */
template <typename T>
62
concept is_byte_like = byte_like<std::remove_cvref_t<T>>;
63
64
/** Satisfied when T is a pointer type (std::is_pointer_v). */
template <typename T>
65
concept is_pointer = std::is_pointer_v<T>;
66
67
/**
68
 * Matches anything that behaves like std::span and points to character-like
69
 * data such as: std::byte, char, unsigned char, signed char, std::int8_t,
70
 * std::uint8_t.
 *
 * On a const T this requires a noexcept size() convertible to std::size_t,
 * a noexcept data() returning a pointer, and a byte-like pointee.
71
 */
72
template <typename T>
73
concept input_span_of_byte_like = requires(const T &t) {
74
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
75
  { t.data() } noexcept -> is_pointer;
76
  { *t.data() } noexcept -> is_byte_like;
77
};
78
79
/** Satisfied when T, after removing any reference, is not const-qualified. */
template <typename T>
80
concept is_mutable = !std::is_const_v<std::remove_reference_t<T>>;
81
82
/**
83
 * Like input_span_of_byte_like, but for an output span (intended to be
 * written to): checked on a non-const T, and the pointee must additionally
 * be non-const.
84
 */
85
template <typename T>
86
concept output_span_of_byte_like = requires(T &t) {
87
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
88
  { t.data() } noexcept -> is_pointer;
89
  { *t.data() } noexcept -> is_byte_like;
90
  { *t.data() } noexcept -> is_mutable;
91
};
92
93
/**
94
 * A pointer-like object that, when indexed, results in a byte-like result.
95
 * valid examples: char*, const char*, std::array<char,10>
96
 * invalid examples: int*, std::array<int,10>
97
 */
98
template <class InputPtr>
99
concept indexes_into_byte_like = requires(InputPtr p) {
100
  { std::decay_t<decltype(p[0])>{} } -> simdutf::detail::byte_like;
101
};
102
/** A pointer-like object whose indexed element decays to char16_t. */
template <class InputPtr>
103
concept indexes_into_utf16 = requires(InputPtr p) {
104
  { std::decay_t<decltype(p[0])>{} } -> std::same_as<char16_t>;
105
};
106
/** A pointer-like object whose indexed element decays to char32_t. */
template <class InputPtr>
107
concept indexes_into_utf32 = requires(InputPtr p) {
108
  { std::decay_t<decltype(p[0])>{} } -> std::same_as<char32_t>;
109
};
110
111
/** A pointer-like object for which `p[0] = s` is valid when s is a char. */
template <class InputPtr>
112
concept index_assignable_from_char = requires(InputPtr p, char s) {
113
  { p[0] = s };
114
};
115
116
/**
117
 * A pointer-like object that results in a uint32_t when indexed.
118
 * valid examples: uint32_t*
119
 */
120
template <class InputPtr>
121
concept indexes_into_uint32 = requires(InputPtr p) {
122
  { std::decay_t<decltype(p[0])>{} } -> std::same_as<std::uint32_t>;
123
};
124
} // namespace detail
125
} // namespace simdutf
126
#endif // SIMDUTF_SPAN
127
128
// these includes are needed for constexpr support. they are
129
// not part of the public api.
130
#include <simdutf/scalar/swap_bytes.h>
131
#include <simdutf/scalar/ascii.h>
132
#include <simdutf/scalar/atomic_util.h>
133
#include <simdutf/scalar/latin1.h>
134
#include <simdutf/scalar/latin1_to_utf16/latin1_to_utf16.h>
135
#include <simdutf/scalar/latin1_to_utf32/latin1_to_utf32.h>
136
#include <simdutf/scalar/latin1_to_utf8/latin1_to_utf8.h>
137
#include <simdutf/scalar/utf16.h>
138
#include <simdutf/scalar/utf16_to_latin1/utf16_to_latin1.h>
139
#include <simdutf/scalar/utf16_to_latin1/valid_utf16_to_latin1.h>
140
#include <simdutf/scalar/utf16_to_utf32/utf16_to_utf32.h>
141
#include <simdutf/scalar/utf16_to_utf32/valid_utf16_to_utf32.h>
142
#include <simdutf/scalar/utf16_to_utf8/utf16_to_utf8.h>
143
#include <simdutf/scalar/utf16_to_utf8/valid_utf16_to_utf8.h>
144
#include <simdutf/scalar/utf32.h>
145
#include <simdutf/scalar/utf32_to_latin1/utf32_to_latin1.h>
146
#include <simdutf/scalar/utf32_to_latin1/valid_utf32_to_latin1.h>
147
#include <simdutf/scalar/utf32_to_utf16/utf32_to_utf16.h>
148
#include <simdutf/scalar/utf32_to_utf16/valid_utf32_to_utf16.h>
149
#include <simdutf/scalar/utf32_to_utf8/utf32_to_utf8.h>
150
#include <simdutf/scalar/utf32_to_utf8/valid_utf32_to_utf8.h>
151
#include <simdutf/scalar/utf8.h>
152
#include <simdutf/scalar/utf8_to_latin1/utf8_to_latin1.h>
153
#include <simdutf/scalar/utf8_to_latin1/valid_utf8_to_latin1.h>
154
#include <simdutf/scalar/utf8_to_utf16/utf8_to_utf16.h>
155
#include <simdutf/scalar/utf8_to_utf16/valid_utf8_to_utf16.h>
156
#include <simdutf/scalar/utf8_to_utf32/utf8_to_utf32.h>
157
#include <simdutf/scalar/utf8_to_utf32/valid_utf8_to_utf32.h>
158
159
namespace simdutf {
160
161
// NOTE(review): the unit is presumably output characters per line -- confirm
// against the base64 encoders that take a line length.
constexpr size_t default_line_length =
162
    76; ///< default line length for base64 encoding with lines
163
164
#if SIMDUTF_FEATURE_DETECT_ENCODING
165
/**
166
 * Autodetect the encoding of the input, a single encoding is recommended.
167
 * E.g., the function might return simdutf::encoding_type::UTF8,
168
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
169
 * simdutf::encoding_type::UTF32_LE.
170
 *
171
 * @param input the string to analyze.
172
 * @param length the length of the string in bytes.
173
 * @return the detected encoding type
174
 */
175
simdutf_warn_unused simdutf::encoding_type
176
autodetect_encoding(const char *input, size_t length) noexcept;
177
/**
 * Convenience overload for uint8_t buffers: the pointer is reinterpreted as
 * const char * and handed to autodetect_encoding(const char *, size_t).
 */
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
autodetect_encoding(const uint8_t *input, size_t length) noexcept {
  const char *const as_chars = reinterpret_cast<const char *>(input);
  return autodetect_encoding(as_chars, length);
}
181
  #if SIMDUTF_SPAN
182
/**
183
 * Autodetect the encoding of the input, a single encoding is recommended.
184
 * E.g., the function might return simdutf::encoding_type::UTF8,
185
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
186
 * simdutf::encoding_type::UTF32_LE.
187
 *
188
 * @param input the string to analyze. can be a anything span-like that has a
189
 * data() and size() that points to character data: std::string,
190
 * std::string_view, std::vector<char>, std::span<const std::byte> etc.
191
 * @return the detected encoding type
192
 */
193
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
194
autodetect_encoding(
195
    const detail::input_span_of_byte_like auto &input) noexcept {
196
  return autodetect_encoding(reinterpret_cast<const char *>(input.data()),
197
                             input.size());
198
}
199
  #endif // SIMDUTF_SPAN
200
201
/**
202
 * Autodetect the possible encodings of the input in one pass.
203
 * E.g., if the input might be UTF-16LE or UTF-8, this function returns
204
 * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE).
205
 *
206
 * Overridden by each implementation.
207
 *
208
 * @param input the string to analyze.
209
 * @param length the length of the string in bytes.
210
 * @return the possible encodings, OR-ed together as simdutf::encoding_type flags
211
 */
212
simdutf_warn_unused int detect_encodings(const char *input,
213
                                         size_t length) noexcept;
214
/**
 * Convenience overload for uint8_t buffers: the pointer is reinterpreted as
 * const char * and handed to detect_encodings(const char *, size_t).
 */
simdutf_really_inline simdutf_warn_unused int
detect_encodings(const uint8_t *input, size_t length) noexcept {
  const char *const as_chars = reinterpret_cast<const char *>(input);
  return detect_encodings(as_chars, length);
}
218
  #if SIMDUTF_SPAN
219
/**
 * Span-like overload of detect_encodings: forwards input.data() (viewed as
 * const char *) and input.size() to the pointer/length overload.
 */
simdutf_really_inline simdutf_warn_unused int
detect_encodings(const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const as_chars = reinterpret_cast<const char *>(input.data());
  return detect_encodings(as_chars, input.size());
}
224
  #endif // SIMDUTF_SPAN
225
#endif   // SIMDUTF_FEATURE_DETECT_ENCODING
226
227
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
228
/**
229
 * Validate the UTF-8 string. This function may be best when you expect
230
 * the input to be almost always valid. Otherwise, consider using
231
 * validate_utf8_with_errors.
232
 *
233
 * Overridden by each implementation.
234
 *
235
 * @param buf the UTF-8 string to validate.
236
 * @param len the length of the string in bytes.
237
 * @return true if and only if the string is valid UTF-8.
238
 */
239
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
240
  #if SIMDUTF_SPAN
241
/**
 * Span-like overload of validate_utf8. When SIMDUTF_CPLUSPLUS23 is set and
 * the call is constant-evaluated, scalar::utf8::validate is used; otherwise
 * the bytes are handed to validate_utf8(const char *, size_t).
 */
simdutf_constexpr23 simdutf_really_inline simdutf_warn_unused bool
validate_utf8(const detail::input_span_of_byte_like auto &input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf8::validate(
        detail::constexpr_cast_ptr<uint8_t>(input.data()), input.size());
  }
    #endif
  // Runtime path: view the bytes as char for the pointer/length overload.
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  return validate_utf8(bytes, input.size());
}
254
  #endif // SIMDUTF_SPAN
255
#endif   // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
256
257
#if SIMDUTF_FEATURE_UTF8
258
/**
259
 * Validate the UTF-8 string and stop on error.
260
 *
261
 * Overridden by each implementation.
262
 *
263
 * @param buf the UTF-8 string to validate.
264
 * @param len the length of the string in bytes.
265
 * @return a result pair struct (of type simdutf::result containing the two
266
 * fields error and count) with an error code and either position of the error
267
 * (in the input in code units) if any, or the number of code units validated if
268
 * successful.
269
 */
270
simdutf_warn_unused result validate_utf8_with_errors(const char *buf,
271
                                                     size_t len) noexcept;
272
  #if SIMDUTF_SPAN
273
/**
 * Span-like overload of validate_utf8_with_errors. When SIMDUTF_CPLUSPLUS23
 * is set and the call is constant-evaluated,
 * scalar::utf8::validate_with_errors is used; otherwise the bytes are handed
 * to validate_utf8_with_errors(const char *, size_t).
 */
simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused result
validate_utf8_with_errors(
    const detail::input_span_of_byte_like auto &input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf8::validate_with_errors(
        detail::constexpr_cast_ptr<uint8_t>(input.data()), input.size());
  }
    #endif
  // Runtime path: view the bytes as char for the pointer/length overload.
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  return validate_utf8_with_errors(bytes, input.size());
}
287
  #endif // SIMDUTF_SPAN
288
#endif   // SIMDUTF_FEATURE_UTF8
289
290
#if SIMDUTF_FEATURE_ASCII
291
/**
292
 * Validate the ASCII string.
293
 *
294
 * Overridden by each implementation.
295
 *
296
 * @param buf the ASCII string to validate.
297
 * @param len the length of the string in bytes.
298
 * @return true if and only if the string is valid ASCII.
299
 */
300
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept;
301
  #if SIMDUTF_SPAN
302
/**
 * Span-like overload of validate_ascii. When SIMDUTF_CPLUSPLUS23 is set and
 * the call is constant-evaluated, scalar::ascii::validate is used; otherwise
 * the bytes are handed to validate_ascii(const char *, size_t).
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
validate_ascii(const detail::input_span_of_byte_like auto &input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::ascii::validate(
        detail::constexpr_cast_ptr<std::uint8_t>(input.data()), input.size());
  }
    #endif
  // Runtime path: view the bytes as char for the pointer/length overload.
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  return validate_ascii(bytes, input.size());
}
315
  #endif // SIMDUTF_SPAN
316
317
/**
318
 * Validate the ASCII string and stop on error. It might be faster than
319
 * validate_ascii when an error is expected to occur early.
320
 *
321
 * Overridden by each implementation.
322
 *
323
 * @param buf the ASCII string to validate.
324
 * @param len the length of the string in bytes.
325
 * @return a result pair struct (of type simdutf::result containing the two
326
 * fields error and count) with an error code and either position of the error
327
 * (in the input in code units) if any, or the number of code units validated if
328
 * successful.
329
 */
330
simdutf_warn_unused result validate_ascii_with_errors(const char *buf,
331
                                                      size_t len) noexcept;
332
  #if SIMDUTF_SPAN
333
/**
 * Span-like overload of validate_ascii_with_errors. When SIMDUTF_CPLUSPLUS23
 * is set and the call is constant-evaluated,
 * scalar::ascii::validate_with_errors is used; otherwise the bytes are handed
 * to validate_ascii_with_errors(const char *, size_t).
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
validate_ascii_with_errors(
    const detail::input_span_of_byte_like auto &input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::ascii::validate_with_errors(
        detail::constexpr_cast_ptr<std::uint8_t>(input.data()), input.size());
  }
    #endif
  // Runtime path: view the bytes as char for the pointer/length overload.
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  return validate_ascii_with_errors(bytes, input.size());
}
347
  #endif // SIMDUTF_SPAN
348
#endif   // SIMDUTF_FEATURE_ASCII
349
350
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
351
/**
352
 * Validate the ASCII string as a UTF-16 sequence.
353
 * A UTF-16 sequence is considered an ASCII sequence
354
 * if it could be converted to an ASCII string losslessly.
355
 *
356
 * Overridden by each implementation.
357
 *
358
 * @param buf the UTF-16 string to validate.
359
 * @param len the length of the string in number of 2-byte code units (char16_t).
360
 * @return true if and only if the string is valid ASCII.
361
 */
362
simdutf_warn_unused bool validate_utf16_as_ascii(const char16_t *buf,
363
                                                 size_t len) noexcept;
364
  #if SIMDUTF_SPAN
365
/**
 * std::span overload of validate_utf16_as_ascii. When SIMDUTF_CPLUSPLUS23 is
 * set and the call is constant-evaluated, the scalar native-endian
 * implementation is used; otherwise the call is forwarded to
 * validate_utf16_as_ascii(const char16_t *, size_t) with the span's size()
 * (a count of char16_t elements) as the length.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
validate_utf16_as_ascii(std::span<const char16_t> input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16::validate_as_ascii<endianness::NATIVE>(input.data(),
                                                                input.size());
  }
    #endif
  const char16_t *const units = input.data();
  return validate_utf16_as_ascii(units, input.size());
}
377
  #endif // SIMDUTF_SPAN
378
379
/**
380
 * Validate the ASCII string as a UTF-16BE sequence.
381
 * A UTF-16 sequence is considered an ASCII sequence
382
 * if it could be converted to an ASCII string losslessly.
383
 *
384
 * Overridden by each implementation.
385
 *
386
 * @param buf the UTF-16BE string to validate.
387
 * @param len the length of the string in number of 2-byte code units (char16_t).
388
 * @return true if and only if the string is valid ASCII.
389
 */
390
simdutf_warn_unused bool validate_utf16be_as_ascii(const char16_t *buf,
391
                                                   size_t len) noexcept;
392
  #if SIMDUTF_SPAN
393
/**
 * std::span overload of validate_utf16be_as_ascii. When SIMDUTF_CPLUSPLUS23
 * is set and the call is constant-evaluated, the scalar big-endian
 * implementation is used; otherwise the call is forwarded to
 * validate_utf16be_as_ascii(const char16_t *, size_t) with the span's size()
 * (a count of char16_t elements) as the length.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
validate_utf16be_as_ascii(std::span<const char16_t> input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16::validate_as_ascii<endianness::BIG>(input.data(),
                                                             input.size());
  }
    #endif
  const char16_t *const units = input.data();
  return validate_utf16be_as_ascii(units, input.size());
}
405
  #endif // SIMDUTF_SPAN
406
407
/**
408
 * Validate the ASCII string as a UTF-16LE sequence.
409
 * A UTF-16 sequence is considered an ASCII sequence
410
 * if it could be converted to an ASCII string losslessly.
411
 *
412
 * Overridden by each implementation.
413
 *
414
 * @param buf the UTF-16LE string to validate.
415
 * @param len the length of the string in number of 2-byte code units (char16_t).
416
 * @return true if and only if the string is valid ASCII.
417
 */
418
simdutf_warn_unused bool validate_utf16le_as_ascii(const char16_t *buf,
419
                                                   size_t len) noexcept;
420
  #if SIMDUTF_SPAN
421
/**
 * std::span overload of validate_utf16le_as_ascii. When SIMDUTF_CPLUSPLUS23
 * is set and the call is constant-evaluated, the scalar little-endian
 * implementation is used; otherwise the call is forwarded to
 * validate_utf16le_as_ascii(const char16_t *, size_t) with the span's size()
 * (a count of char16_t elements) as the length.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
validate_utf16le_as_ascii(std::span<const char16_t> input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16::validate_as_ascii<endianness::LITTLE>(input.data(),
                                                                input.size());
  }
    #endif
  const char16_t *const units = input.data();
  return validate_utf16le_as_ascii(units, input.size());
}
433
  #endif // SIMDUTF_SPAN
434
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
435
436
#if SIMDUTF_FEATURE_UTF16
437
/**
438
 * Using native endianness; Validate the UTF-16 string.
439
 * This function may be best when you expect the input to be almost always
440
 * valid. Otherwise, consider using validate_utf16_with_errors.
441
 *
442
 * Overridden by each implementation.
443
 *
444
 * This function is not BOM-aware.
445
 *
446
 * @param buf the UTF-16 string to validate.
447
 * @param len the length of the string in number of 2-byte code units
448
 * (char16_t).
449
 * @return true if and only if the string is valid UTF-16.
450
 */
451
simdutf_warn_unused bool validate_utf16(const char16_t *buf,
452
                                        size_t len) noexcept;
453
  #if SIMDUTF_SPAN
454
/**
 * std::span overload of validate_utf16 (native endianness). When
 * SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the scalar
 * implementation is used; otherwise the call is forwarded to
 * validate_utf16(const char16_t *, size_t).
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
validate_utf16(std::span<const char16_t> input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16::validate<endianness::NATIVE>(input.data(),
                                                       input.size());
  }
    #endif
  const char16_t *const units = input.data();
  return validate_utf16(units, input.size());
}
466
  #endif // SIMDUTF_SPAN
467
#endif   // SIMDUTF_FEATURE_UTF16
468
469
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
470
/**
471
 * Validate the UTF-16LE string. This function may be best when you expect
472
 * the input to be almost always valid. Otherwise, consider using
473
 * validate_utf16le_with_errors.
474
 *
475
 * Overridden by each implementation.
476
 *
477
 * This function is not BOM-aware.
478
 *
479
 * @param buf the UTF-16LE string to validate.
480
 * @param len the length of the string in number of 2-byte code units
481
 * (char16_t).
482
 * @return true if and only if the string is valid UTF-16LE.
483
 */
484
simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
485
                                          size_t len) noexcept;
486
  #if SIMDUTF_SPAN
487
/**
 * std::span overload of validate_utf16le. When SIMDUTF_CPLUSPLUS23 is set
 * and the call is constant-evaluated, the scalar little-endian implementation
 * is used; otherwise the call is forwarded to
 * validate_utf16le(const char16_t *, size_t).
 */
simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused bool
validate_utf16le(std::span<const char16_t> input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16::validate<endianness::LITTLE>(input.data(),
                                                       input.size());
  }
    #endif
  const char16_t *const units = input.data();
  return validate_utf16le(units, input.size());
}
499
  #endif // SIMDUTF_SPAN
500
#endif   // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
501
502
#if SIMDUTF_FEATURE_UTF16
503
/**
504
 * Validate the UTF-16BE string. This function may be best when you expect
505
 * the input to be almost always valid. Otherwise, consider using
506
 * validate_utf16be_with_errors.
507
 *
508
 * Overridden by each implementation.
509
 *
510
 * This function is not BOM-aware.
511
 *
512
 * @param buf the UTF-16BE string to validate.
513
 * @param len the length of the string in number of 2-byte code units
514
 * (char16_t).
515
 * @return true if and only if the string is valid UTF-16BE.
516
 */
517
simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
518
                                          size_t len) noexcept;
519
  #if SIMDUTF_SPAN
520
/**
 * std::span overload of validate_utf16be. When SIMDUTF_CPLUSPLUS23 is set
 * and the call is constant-evaluated, the scalar big-endian implementation
 * is used; otherwise the call is forwarded to
 * validate_utf16be(const char16_t *, size_t).
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
validate_utf16be(std::span<const char16_t> input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16::validate<endianness::BIG>(input.data(), input.size());
  }
    #endif
  const char16_t *const units = input.data();
  return validate_utf16be(units, input.size());
}
531
  #endif // SIMDUTF_SPAN
532
533
/**
534
 * Using native endianness; Validate the UTF-16 string and stop on error.
535
 * It might be faster than validate_utf16 when an error is expected to occur
536
 * early.
537
 *
538
 * Overridden by each implementation.
539
 *
540
 * This function is not BOM-aware.
541
 *
542
 * @param buf the UTF-16 string to validate.
543
 * @param len the length of the string in number of 2-byte code units
544
 * (char16_t).
545
 * @return a result pair struct (of type simdutf::result containing the two
546
 * fields error and count) with an error code and either position of the error
547
 * (in the input in code units) if any, or the number of code units validated if
548
 * successful.
549
 */
550
simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf,
551
                                                      size_t len) noexcept;
552
  #if SIMDUTF_SPAN
553
/**
 * std::span overload of validate_utf16_with_errors (native endianness). When
 * SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the scalar
 * implementation is used; otherwise the call is forwarded to
 * validate_utf16_with_errors(const char16_t *, size_t).
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
validate_utf16_with_errors(std::span<const char16_t> input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16::validate_with_errors<endianness::NATIVE>(
        input.data(), input.size());
  }
    #endif
  const char16_t *const units = input.data();
  return validate_utf16_with_errors(units, input.size());
}
565
  #endif // SIMDUTF_SPAN
566
567
/**
568
 * Validate the UTF-16LE string and stop on error. It might be faster than
569
 * validate_utf16le when an error is expected to occur early.
570
 *
571
 * Overridden by each implementation.
572
 *
573
 * This function is not BOM-aware.
574
 *
575
 * @param buf the UTF-16LE string to validate.
576
 * @param len the length of the string in number of 2-byte code units
577
 * (char16_t).
578
 * @return a result pair struct (of type simdutf::result containing the two
579
 * fields error and count) with an error code and either position of the error
580
 * (in the input in code units) if any, or the number of code units validated if
581
 * successful.
582
 */
583
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf,
584
                                                        size_t len) noexcept;
585
  #if SIMDUTF_SPAN
586
/**
 * std::span overload of validate_utf16le_with_errors. When
 * SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the scalar
 * little-endian implementation is used; otherwise the call is forwarded to
 * validate_utf16le_with_errors(const char16_t *, size_t).
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
validate_utf16le_with_errors(std::span<const char16_t> input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16::validate_with_errors<endianness::LITTLE>(
        input.data(), input.size());
  }
    #endif
  const char16_t *const units = input.data();
  return validate_utf16le_with_errors(units, input.size());
}
598
  #endif // SIMDUTF_SPAN
599
600
/**
601
 * Validate the UTF-16BE string and stop on error. It might be faster than
602
 * validate_utf16be when an error is expected to occur early.
603
 *
604
 * Overridden by each implementation.
605
 *
606
 * This function is not BOM-aware.
607
 *
608
 * @param buf the UTF-16BE string to validate.
609
 * @param len the length of the string in number of 2-byte code units
610
 * (char16_t).
611
 * @return a result pair struct (of type simdutf::result containing the two
612
 * fields error and count) with an error code and either position of the error
613
 * (in the input in code units) if any, or the number of code units validated if
614
 * successful.
615
 */
616
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf,
617
                                                        size_t len) noexcept;
618
  #if SIMDUTF_SPAN
619
/**
 * std::span overload of validate_utf16be_with_errors. When
 * SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the scalar
 * big-endian implementation is used; otherwise the call is forwarded to
 * validate_utf16be_with_errors(const char16_t *, size_t).
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
validate_utf16be_with_errors(std::span<const char16_t> input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16::validate_with_errors<endianness::BIG>(input.data(),
                                                                input.size());
  }
    #endif
  const char16_t *const units = input.data();
  return validate_utf16be_with_errors(units, input.size());
}
631
  #endif // SIMDUTF_SPAN
632
633
/**
634
 * Fixes an ill-formed UTF-16LE string by replacing mismatched surrogates with
635
 * the Unicode replacement character U+FFFD. If input and output point to
636
 * different memory areas, the procedure copies the string, and the output
637
 * memory is expected to be at least as big as the input. It is also possible
638
 * to set input equal to output, which makes the replacement an in-place
 * operation.
639
 *
640
 * @param input the UTF-16LE string to correct.
641
 * @param len the length of the string in number of 2-byte code units
642
 * (char16_t).
643
 * @param output the output buffer.
644
 */
645
void to_well_formed_utf16le(const char16_t *input, size_t len,
646
                            char16_t *output) noexcept;
647
  #if SIMDUTF_SPAN
648
/**
 * std::span overload of to_well_formed_utf16le. When SIMDUTF_CPLUSPLUS23 is
 * set and the call is constant-evaluated, the scalar little-endian
 * implementation is used; otherwise the call is forwarded to
 * to_well_formed_utf16le(const char16_t *, size_t, char16_t *).
 *
 * Only output.data() is passed on; output.size() is never consulted here.
 */
simdutf_really_inline simdutf_constexpr23 void
to_well_formed_utf16le(std::span<const char16_t> input,
                       std::span<char16_t> output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    scalar::utf16::to_well_formed_utf16<endianness::LITTLE>(
        input.data(), input.size(), output.data());
    return;
  }
    #endif
  const char16_t *const src = input.data();
  char16_t *const dst = output.data();
  to_well_formed_utf16le(src, input.size(), dst);
}
661
  #endif // SIMDUTF_SPAN
662
663
/**
664
 * Fixes an ill-formed UTF-16BE string by replacing mismatched surrogates with
665
 * the Unicode replacement character U+FFFD. If input and output point to
666
 * different memory areas, the procedure copies the string, and the output
667
 * memory is expected to be at least as big as the input. It is also possible
668
 * to set input equal to output, which makes the replacement an in-place
 * operation.
669
 *
670
 * @param input the UTF-16BE string to correct.
671
 * @param len the length of the string in number of 2-byte code units
672
 * (char16_t).
673
 * @param output the output buffer.
674
 */
675
void to_well_formed_utf16be(const char16_t *input, size_t len,
676
                            char16_t *output) noexcept;
677
  #if SIMDUTF_SPAN
678
/**
 * std::span overload of to_well_formed_utf16be. When SIMDUTF_CPLUSPLUS23 is
 * set and the call is constant-evaluated, the scalar big-endian
 * implementation is used; otherwise the call is forwarded to
 * to_well_formed_utf16be(const char16_t *, size_t, char16_t *).
 *
 * Only output.data() is passed on; output.size() is never consulted here.
 */
simdutf_really_inline simdutf_constexpr23 void
to_well_formed_utf16be(std::span<const char16_t> input,
                       std::span<char16_t> output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    scalar::utf16::to_well_formed_utf16<endianness::BIG>(
        input.data(), input.size(), output.data());
    return;
  }
    #endif
  const char16_t *const src = input.data();
  char16_t *const dst = output.data();
  to_well_formed_utf16be(src, input.size(), dst);
}
691
  #endif // SIMDUTF_SPAN
692
693
/**
694
 * Fixes an ill-formed UTF-16 string by replacing mismatched surrogates with the
695
 * Unicode replacement character U+FFFD. If input and output point to different
695
 * memory areas, the procedure copies the string, and the output memory is
696
 * expected to be at least as big as the input. It is also possible to set
697
 * input equal to output, which makes the replacement an in-place operation.
699
 *
700
 * @param input the UTF-16 string to correct.
701
 * @param len the length of the string in number of 2-byte code units
702
 * (char16_t).
703
 * @param output the output buffer.
704
 */
705
void to_well_formed_utf16(const char16_t *input, size_t len,
706
                          char16_t *output) noexcept;
707
  #if SIMDUTF_SPAN
708
/**
 * std::span overload of to_well_formed_utf16 (native endianness). When
 * SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the scalar
 * implementation is used; otherwise the call is forwarded to
 * to_well_formed_utf16(const char16_t *, size_t, char16_t *).
 *
 * Only output.data() is passed on; output.size() is never consulted here.
 */
simdutf_really_inline simdutf_constexpr23 void
to_well_formed_utf16(std::span<const char16_t> input,
                     std::span<char16_t> output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    scalar::utf16::to_well_formed_utf16<endianness::NATIVE>(
        input.data(), input.size(), output.data());
    return;
  }
    #endif
  const char16_t *const src = input.data();
  char16_t *const dst = output.data();
  to_well_formed_utf16(src, input.size(), dst);
}
721
  #endif // SIMDUTF_SPAN
722
723
#endif // SIMDUTF_FEATURE_UTF16
724
725
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
726
/**
727
 * Validate the UTF-32 string. This function may be best when you expect
728
 * the input to be almost always valid. Otherwise, consider using
729
 * validate_utf32_with_errors.
730
 *
731
 * Overridden by each implementation.
732
 *
733
 * This function is not BOM-aware.
734
 *
735
 * @param buf the UTF-32 string to validate.
736
 * @param len the length of the string in number of 4-byte code units
737
 * (char32_t).
738
 * @return true if and only if the string is valid UTF-32.
739
 */
740
simdutf_warn_unused bool validate_utf32(const char32_t *buf,
741
                                        size_t len) noexcept;
742
  #if SIMDUTF_SPAN
743
/**
 * std::span overload of validate_utf32. When SIMDUTF_CPLUSPLUS23 is set and
 * the call is constant-evaluated, scalar::utf32::validate is used on the data
 * viewed as std::uint32_t; otherwise the call is forwarded to
 * validate_utf32(const char32_t *, size_t).
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
validate_utf32(std::span<const char32_t> input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf32::validate(
        detail::constexpr_cast_ptr<std::uint32_t>(input.data()), input.size());
  }
    #endif
  const char32_t *const units = input.data();
  return validate_utf32(units, input.size());
}
755
  #endif // SIMDUTF_SPAN
756
#endif   // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
757
758
#if SIMDUTF_FEATURE_UTF32
759
/**
760
 * Validate the UTF-32 string and stop on error. It might be faster than
761
 * validate_utf32 when an error is expected to occur early.
762
 *
763
 * Overridden by each implementation.
764
 *
765
 * This function is not BOM-aware.
766
 *
767
 * @param buf the UTF-32 string to validate.
768
 * @param len the length of the string in number of 4-byte code units
769
 * (char32_t).
770
 * @return a result pair struct (of type simdutf::result containing the two
771
 * fields error and count) with an error code and either position of the error
772
 * (in the input in code units) if any, or the number of code units validated if
773
 * successful.
774
 */
775
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf,
776
                                                      size_t len) noexcept;
777
  #if SIMDUTF_SPAN
778
/**
 * std::span overload of validate_utf32_with_errors. When SIMDUTF_CPLUSPLUS23
 * is set and the call is constant-evaluated,
 * scalar::utf32::validate_with_errors is used on the data viewed as
 * std::uint32_t; otherwise the call is forwarded to
 * validate_utf32_with_errors(const char32_t *, size_t).
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
validate_utf32_with_errors(std::span<const char32_t> input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf32::validate_with_errors(
        detail::constexpr_cast_ptr<std::uint32_t>(input.data()), input.size());
  }
    #endif
  const char32_t *const units = input.data();
  return validate_utf32_with_errors(units, input.size());
}
790
  #endif // SIMDUTF_SPAN
791
#endif   // SIMDUTF_FEATURE_UTF32
792
793
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
794
/**
795
 * Convert Latin1 string into UTF-8 string.
796
 *
797
 * This function is suitable to work with inputs from untrusted sources.
798
 *
799
 * @param input         the Latin1 string to convert
800
 * @param length        the length of the string in bytes
801
 * @param utf8_output   the pointer to buffer that can hold conversion result
802
 * @return the number of written char; 0 if conversion is not possible
803
 */
804
simdutf_warn_unused size_t convert_latin1_to_utf8(const char *input,
805
                                                  size_t length,
806
                                                  char *utf8_output) noexcept;
807
  #if SIMDUTF_SPAN
808
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
809
convert_latin1_to_utf8(
810
    const detail::input_span_of_byte_like auto &latin1_input,
811
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
812
    #if SIMDUTF_CPLUSPLUS23
813
  if consteval {
814
    return scalar::latin1_to_utf8::convert(
815
        detail::constexpr_cast_ptr<char>(latin1_input.data()),
816
        latin1_input.size(),
817
        detail::constexpr_cast_writeptr<char>(utf8_output.data()));
818
  } else
819
    #endif
820
  {
821
    return convert_latin1_to_utf8(
822
        reinterpret_cast<const char *>(latin1_input.data()),
823
        latin1_input.size(), reinterpret_cast<char *>(utf8_output.data()));
824
  }
825
}
826
  #endif // SIMDUTF_SPAN
827
828
/**
829
 * Convert Latin1 string into UTF-8 string with output limit.
830
 *
831
 * This function is suitable to work with inputs from untrusted sources.
832
 *
833
 * We write as many characters as possible.
834
 *
835
 * @param input         the Latin1 string to convert
836
 * @param length        the length of the string in bytes
837
 * @param utf8_output   the pointer to buffer that can hold conversion result
838
 * @param utf8_len      the maximum output length
839
 * @return the number of written char; 0 if conversion is not possible
840
 */
841
simdutf_warn_unused size_t
842
convert_latin1_to_utf8_safe(const char *input, size_t length, char *utf8_output,
843
                            size_t utf8_len) noexcept;
844
  #if SIMDUTF_SPAN
845
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
846
convert_latin1_to_utf8_safe(
847
    const detail::input_span_of_byte_like auto &input,
848
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
849
      // implementation note: outputspan is a forwarding ref to avoid copying
850
      // and allow both lvalues and rvalues. std::span can be copied without
851
      // problems, but std::vector should not, and this function should accept
852
      // both. it will allow using an owning rvalue ref (example: passing a
853
      // temporary std::string) as output, but the user will quickly find out
854
      // that he has no way of getting the data out of the object in that case.
855
    #if SIMDUTF_CPLUSPLUS23
856
  if consteval {
857
    return scalar::latin1_to_utf8::convert_safe_constexpr(
858
        input.data(), input.size(), utf8_output.data(), utf8_output.size());
859
  } else
860
    #endif
861
  {
862
    return convert_latin1_to_utf8_safe(
863
        reinterpret_cast<const char *>(input.data()), input.size(),
864
        reinterpret_cast<char *>(utf8_output.data()), utf8_output.size());
865
  }
866
}
867
  #endif // SIMDUTF_SPAN
868
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
869
870
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
871
/**
872
 * Convert Latin1 string into UTF-16LE string.
873
 *
874
 * This function is suitable to work with inputs from untrusted sources.
875
 *
876
 * @param input         the Latin1 string to convert
877
 * @param length        the length of the string in bytes
878
 * @param utf16_output  the pointer to buffer that can hold conversion result
879
 * @return the number of written char16_t; 0 if conversion is not possible
880
 */
881
simdutf_warn_unused size_t convert_latin1_to_utf16le(
882
    const char *input, size_t length, char16_t *utf16_output) noexcept;
883
  #if SIMDUTF_SPAN
884
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
885
convert_latin1_to_utf16le(
886
    const detail::input_span_of_byte_like auto &latin1_input,
887
    std::span<char16_t> utf16_output) noexcept {
888
    #if SIMDUTF_CPLUSPLUS23
889
  if consteval {
890
    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(
891
        latin1_input.data(), latin1_input.size(), utf16_output.data());
892
  } else
893
    #endif
894
  {
895
    return convert_latin1_to_utf16le(
896
        reinterpret_cast<const char *>(latin1_input.data()),
897
        latin1_input.size(), utf16_output.data());
898
  }
899
}
900
  #endif // SIMDUTF_SPAN
901
902
/**
903
 * Convert Latin1 string into UTF-16BE string.
904
 *
905
 * This function is suitable to work with inputs from untrusted sources.
906
 *
907
 * @param input         the Latin1 string to convert
908
 * @param length        the length of the string in bytes
909
 * @param utf16_output  the pointer to buffer that can hold conversion result
910
 * @return the number of written char16_t; 0 if conversion is not possible
911
 */
912
simdutf_warn_unused size_t convert_latin1_to_utf16be(
913
    const char *input, size_t length, char16_t *utf16_output) noexcept;
914
  #if SIMDUTF_SPAN
915
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
916
convert_latin1_to_utf16be(const detail::input_span_of_byte_like auto &input,
917
                          std::span<char16_t> output) noexcept {
918
    #if SIMDUTF_CPLUSPLUS23
919
  if consteval {
920
    return scalar::latin1_to_utf16::convert<endianness::BIG>(
921
        input.data(), input.size(), output.data());
922
  } else
923
    #endif
924
  {
925
    return convert_latin1_to_utf16be(
926
        reinterpret_cast<const char *>(input.data()), input.size(),
927
        output.data());
928
  }
929
}
930
  #endif // SIMDUTF_SPAN
931
/**
932
 * Compute the number of bytes that this UTF-16 string would require in Latin1
933
 * format.
934
 *
935
 * @param length        the length of the string in 2-byte code units (char16_t)
936
 * @return the length of the string in Latin1 code units (char) required to
937
 * encode the UTF-16 string as Latin1
938
 */
939
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
940
latin1_length_from_utf16(size_t length) noexcept {
941
  return length;
942
}
943
944
/**
945
 * Compute the number of code units that this Latin1 string would require in
946
 * UTF-16 format.
947
 *
948
 * @param length        the length of the string in Latin1 code units (char)
949
 * @return the length of the string in 2-byte code units (char16_t) required to
950
 * encode the Latin1 string as UTF-16
951
 */
952
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
953
0
utf16_length_from_latin1(size_t length) noexcept {
954
0
  return length;
955
0
}
956
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
957
958
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
959
/**
960
 * Convert Latin1 string into UTF-32 string.
961
 *
962
 * This function is suitable to work with inputs from untrusted sources.
963
 *
964
 * @param input         the Latin1 string to convert
965
 * @param length        the length of the string in bytes
966
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
967
 * @return the number of written char32_t; 0 if conversion is not possible
968
 */
969
simdutf_warn_unused size_t convert_latin1_to_utf32(
970
    const char *input, size_t length, char32_t *utf32_buffer) noexcept;
971
  #if SIMDUTF_SPAN
972
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
973
convert_latin1_to_utf32(
974
    const detail::input_span_of_byte_like auto &latin1_input,
975
    std::span<char32_t> utf32_output) noexcept {
976
    #if SIMDUTF_CPLUSPLUS23
977
  if consteval {
978
    return scalar::latin1_to_utf32::convert(
979
        latin1_input.data(), latin1_input.size(), utf32_output.data());
980
  } else
981
    #endif
982
  {
983
    return convert_latin1_to_utf32(
984
        reinterpret_cast<const char *>(latin1_input.data()),
985
        latin1_input.size(), utf32_output.data());
986
  }
987
}
988
  #endif // SIMDUTF_SPAN
989
#endif   // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
990
991
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
992
/**
993
 * Convert possibly broken UTF-8 string into latin1 string.
994
 *
995
 * During the conversion also validation of the input string is done.
996
 * This function is suitable to work with inputs from untrusted sources.
997
 *
998
 * @param input         the UTF-8 string to convert
999
 * @param length        the length of the string in bytes
1000
 * @param latin1_output  the pointer to buffer that can hold conversion result
1001
 * @return the number of written char; 0 if the input was not valid UTF-8 string
1002
 * or if it cannot be represented as Latin1
1003
 */
1004
simdutf_warn_unused size_t convert_utf8_to_latin1(const char *input,
1005
                                                  size_t length,
1006
                                                  char *latin1_output) noexcept;
1007
  #if SIMDUTF_SPAN
1008
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1009
convert_utf8_to_latin1(
1010
    const detail::input_span_of_byte_like auto &input,
1011
    detail::output_span_of_byte_like auto &&output) noexcept {
1012
    #if SIMDUTF_CPLUSPLUS23
1013
  if consteval {
1014
    return scalar::utf8_to_latin1::convert(input.data(), input.size(),
1015
                                           output.data());
1016
  } else
1017
    #endif
1018
  {
1019
    return convert_utf8_to_latin1(reinterpret_cast<const char *>(input.data()),
1020
                                  input.size(),
1021
                                  reinterpret_cast<char *>(output.data()));
1022
  }
1023
}
1024
  #endif // SIMDUTF_SPAN
1025
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1026
1027
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1028
/**
1029
 * Using native endianness, convert possibly broken UTF-8 string into a UTF-16
1030
 * string.
1031
 *
1032
 * During the conversion also validation of the input string is done.
1033
 * This function is suitable to work with inputs from untrusted sources.
1034
 *
1035
 * @param input         the UTF-8 string to convert
1036
 * @param length        the length of the string in bytes
1037
 * @param utf16_output  the pointer to buffer that can hold conversion result
1038
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
1039
 * string
1040
 */
1041
simdutf_warn_unused size_t convert_utf8_to_utf16(
1042
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1043
  #if SIMDUTF_SPAN
1044
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1045
convert_utf8_to_utf16(const detail::input_span_of_byte_like auto &input,
1046
                      std::span<char16_t> output) noexcept {
1047
    #if SIMDUTF_CPLUSPLUS23
1048
  if consteval {
1049
    return scalar::utf8_to_utf16::convert<endianness::NATIVE>(
1050
        input.data(), input.size(), output.data());
1051
  } else
1052
    #endif
1053
  {
1054
    return convert_utf8_to_utf16(reinterpret_cast<const char *>(input.data()),
1055
                                 input.size(), output.data());
1056
  }
1057
}
1058
  #endif // SIMDUTF_SPAN
1059
1060
/**
1061
 * Compute the number of bytes that this UTF-16LE string would require in UTF-8
1062
 * format even when the UTF-16LE content contains mismatched surrogates
1063
 * that have to be replaced by the replacement character (0xFFFD).
1064
 *
1065
 * @param input         the UTF-16LE string to convert
1066
 * @param length        the length of the string in 2-byte code units (char16_t)
1067
 * @return a result pair struct (of type simdutf::result containing the two
1068
 * fields error and count) where the count is the number of bytes required to
1069
 * encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS or
1070
 * SURROGATE. The count is correct regardless of the error field.
1071
 * When SURROGATE is returned, it does not indicate an error in the case of this
1072
 * function: it indicates that at least one surrogate has been encountered: the
1073
 * surrogates may be matched or not (thus this function does not validate). If
1074
 * the returned error code is SUCCESS, then the input contains no surrogate, is
1075
 * in the Basic Multilingual Plane, and is necessarily valid.
1076
 */
1077
simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(
1078
    const char16_t *input, size_t length) noexcept;
1079
  #if SIMDUTF_SPAN
1080
simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused result
1081
utf8_length_from_utf16le_with_replacement(
1082
0
    std::span<const char16_t> valid_utf16_input) noexcept {
1083
0
    #if SIMDUTF_CPLUSPLUS23
1084
0
  if consteval {
1085
0
    return scalar::utf16::utf8_length_from_utf16_with_replacement<
1086
0
        endianness::LITTLE>(valid_utf16_input.data(), valid_utf16_input.size());
1087
0
  } else
1088
0
    #endif
1089
0
  {
1090
0
    return utf8_length_from_utf16le_with_replacement(valid_utf16_input.data(),
1091
0
                                                     valid_utf16_input.size());
1092
0
  }
1093
0
}
1094
  #endif // SIMDUTF_SPAN
1095
1096
/**
1097
 * Compute the number of bytes that this UTF-16BE string would require in UTF-8
1098
 * format even when the UTF-16BE content contains mismatched surrogates
1099
 * that have to be replaced by the replacement character (0xFFFD).
1100
 *
1101
 * @param input         the UTF-16BE string to convert
1102
 * @param length        the length of the string in 2-byte code units (char16_t)
1103
 * @return a result pair struct (of type simdutf::result containing the two
1104
 * fields error and count) where the count is the number of bytes required to
1105
 * encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS or
1106
 * SURROGATE. The count is correct regardless of the error field.
1107
 * When SURROGATE is returned, it does not indicate an error in the case of this
1108
 * function: it indicates that at least one surrogate has been encountered: the
1109
 * surrogates may be matched or not (thus this function does not validate). If
1110
 * the returned error code is SUCCESS, then the input contains no surrogate, is
1111
 * in the Basic Multilingual Plane, and is necessarily valid.
1112
 */
1113
simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(
1114
    const char16_t *input, size_t length) noexcept;
1115
  #if SIMDUTF_SPAN
1116
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1117
utf8_length_from_utf16be_with_replacement(
1118
0
    std::span<const char16_t> valid_utf16_input) noexcept {
1119
0
    #if SIMDUTF_CPLUSPLUS23
1120
0
  if consteval {
1121
0
    return scalar::utf16::utf8_length_from_utf16_with_replacement<
1122
0
        endianness::BIG>(valid_utf16_input.data(), valid_utf16_input.size());
1123
0
  } else
1124
0
    #endif
1125
0
  {
1126
0
    return utf8_length_from_utf16be_with_replacement(valid_utf16_input.data(),
1127
0
                                                     valid_utf16_input.size());
1128
0
  }
1129
0
}
1130
  #endif // SIMDUTF_SPAN
1131
1132
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1133
1134
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1135
/**
1136
 * Using native endianness, convert a Latin1 string into a UTF-16 string.
1137
 *
1138
 * @param input         the Latin1 string to convert
1139
 * @param length        the length of the string in bytes
1140
 * @param utf16_output  the pointer to buffer that can hold conversion result
1141
 * @return the number of written char16_t.
1142
 */
1143
simdutf_warn_unused size_t convert_latin1_to_utf16(
1144
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1145
  #if SIMDUTF_SPAN
1146
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1147
convert_latin1_to_utf16(const detail::input_span_of_byte_like auto &input,
1148
                        std::span<char16_t> output) noexcept {
1149
    #if SIMDUTF_CPLUSPLUS23
1150
  if consteval {
1151
    return scalar::latin1_to_utf16::convert<endianness::NATIVE>(
1152
        input.data(), input.size(), output.data());
1153
  } else
1154
    #endif
1155
  {
1156
    return convert_latin1_to_utf16(reinterpret_cast<const char *>(input.data()),
1157
                                   input.size(), output.data());
1158
  }
1159
}
1160
  #endif // SIMDUTF_SPAN
1161
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1162
1163
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1164
/**
1165
 * Convert possibly broken UTF-8 string into UTF-16LE string.
1166
 *
1167
 * During the conversion also validation of the input string is done.
1168
 * This function is suitable to work with inputs from untrusted sources.
1169
 *
1170
 * @param input         the UTF-8 string to convert
1171
 * @param length        the length of the string in bytes
1172
 * @param utf16_output  the pointer to buffer that can hold conversion result
1173
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
1174
 * string
1175
 */
1176
simdutf_warn_unused size_t convert_utf8_to_utf16le(
1177
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1178
  #if SIMDUTF_SPAN
1179
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1180
convert_utf8_to_utf16le(const detail::input_span_of_byte_like auto &utf8_input,
1181
                        std::span<char16_t> utf16_output) noexcept {
1182
    #if SIMDUTF_CPLUSPLUS23
1183
  if consteval {
1184
    return scalar::utf8_to_utf16::convert<endianness::LITTLE>(
1185
        utf8_input.data(), utf8_input.size(), utf16_output.data());
1186
  } else
1187
    #endif
1188
  {
1189
    return convert_utf8_to_utf16le(
1190
        reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1191
        utf16_output.data());
1192
  }
1193
}
1194
  #endif // SIMDUTF_SPAN
1195
1196
/**
1197
 * Convert possibly broken UTF-8 string into UTF-16BE string.
1198
 *
1199
 * During the conversion also validation of the input string is done.
1200
 * This function is suitable to work with inputs from untrusted sources.
1201
 *
1202
 * @param input         the UTF-8 string to convert
1203
 * @param length        the length of the string in bytes
1204
 * @param utf16_output  the pointer to buffer that can hold conversion result
1205
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
1206
 * string
1207
 */
1208
simdutf_warn_unused size_t convert_utf8_to_utf16be(
1209
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1210
  #if SIMDUTF_SPAN
1211
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1212
convert_utf8_to_utf16be(const detail::input_span_of_byte_like auto &utf8_input,
1213
                        std::span<char16_t> utf16_output) noexcept {
1214
1215
    #if SIMDUTF_CPLUSPLUS23
1216
  if consteval {
1217
    return scalar::utf8_to_utf16::convert<endianness::BIG>(
1218
        utf8_input.data(), utf8_input.size(), utf16_output.data());
1219
  } else
1220
    #endif
1221
  {
1222
    return convert_utf8_to_utf16be(
1223
        reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1224
        utf16_output.data());
1225
  }
1226
}
1227
  #endif // SIMDUTF_SPAN
1228
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1229
1230
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1231
/**
1232
 * Convert possibly broken UTF-8 string into latin1 string with errors.
1233
 * If the string cannot be represented as Latin1, an error
1234
 * code is returned.
1235
 *
1236
 * During the conversion also validation of the input string is done.
1237
 * This function is suitable to work with inputs from untrusted sources.
1238
 *
1239
 * @param input         the UTF-8 string to convert
1240
 * @param length        the length of the string in bytes
1241
 * @param latin1_output  the pointer to buffer that can hold conversion result
1242
 * @return a result pair struct (of type simdutf::result containing the two
1243
 * fields error and count) with an error code and either position of the error
1244
 * (in the input in code units) if any, or the number of code units validated if
1245
 * successful.
1246
 */
1247
simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
1248
    const char *input, size_t length, char *latin1_output) noexcept;
1249
  #if SIMDUTF_SPAN
1250
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1251
convert_utf8_to_latin1_with_errors(
1252
    const detail::input_span_of_byte_like auto &utf8_input,
1253
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1254
    #if SIMDUTF_CPLUSPLUS23
1255
  if consteval {
1256
    return scalar::utf8_to_latin1::convert_with_errors(
1257
        utf8_input.data(), utf8_input.size(), latin1_output.data());
1258
  } else
1259
    #endif
1260
  {
1261
    return convert_utf8_to_latin1_with_errors(
1262
        reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1263
        reinterpret_cast<char *>(latin1_output.data()));
1264
  }
1265
}
1266
  #endif // SIMDUTF_SPAN
1267
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1268
1269
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1270
/**
1271
 * Using native endianness, convert possibly broken UTF-8 string into UTF-16
1272
 * string and stop on error.
1273
 *
1274
 * During the conversion also validation of the input string is done.
1275
 * This function is suitable to work with inputs from untrusted sources.
1276
 *
1277
 * @param input         the UTF-8 string to convert
1278
 * @param length        the length of the string in bytes
1279
 * @param utf16_output  the pointer to buffer that can hold conversion result
1280
 * @return a result pair struct (of type simdutf::result containing the two
1281
 * fields error and count) with an error code and either position of the error
1282
 * (in the input in code units) if any, or the number of char16_t written if
1283
 * successful.
1284
 */
1285
simdutf_warn_unused result convert_utf8_to_utf16_with_errors(
1286
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1287
  #if SIMDUTF_SPAN
1288
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1289
convert_utf8_to_utf16_with_errors(
1290
    const detail::input_span_of_byte_like auto &utf8_input,
1291
    std::span<char16_t> utf16_output) noexcept {
1292
    #if SIMDUTF_CPLUSPLUS23
1293
  if consteval {
1294
    return scalar::utf8_to_utf16::convert_with_errors<endianness::NATIVE>(
1295
        utf8_input.data(), utf8_input.size(), utf16_output.data());
1296
  } else
1297
    #endif
1298
  {
1299
    return convert_utf8_to_utf16_with_errors(
1300
        reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1301
        utf16_output.data());
1302
  }
1303
}
1304
  #endif // SIMDUTF_SPAN
1305
1306
/**
1307
 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
1308
 *
1309
 * During the conversion also validation of the input string is done.
1310
 * This function is suitable to work with inputs from untrusted sources.
1311
 *
1312
 * @param input         the UTF-8 string to convert
1313
 * @param length        the length of the string in bytes
1314
 * @param utf16_output  the pointer to buffer that can hold conversion result
1315
 * @return a result pair struct (of type simdutf::result containing the two
1316
 * fields error and count) with an error code and either position of the error
1317
 * (in the input in code units) if any, or the number of char16_t written if
1318
 * successful.
1319
 */
1320
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
1321
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1322
  #if SIMDUTF_SPAN
1323
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1324
convert_utf8_to_utf16le_with_errors(
1325
    const detail::input_span_of_byte_like auto &utf8_input,
1326
    std::span<char16_t> utf16_output) noexcept {
1327
    #if SIMDUTF_CPLUSPLUS23
1328
  if consteval {
1329
    return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(
1330
        utf8_input.data(), utf8_input.size(), utf16_output.data());
1331
  } else
1332
    #endif
1333
  {
1334
    return convert_utf8_to_utf16le_with_errors(
1335
        reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1336
        utf16_output.data());
1337
  }
1338
}
1339
  #endif // SIMDUTF_SPAN
1340
1341
/**
1342
 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
1343
 *
1344
 * During the conversion also validation of the input string is done.
1345
 * This function is suitable to work with inputs from untrusted sources.
1346
 *
1347
 * @param input         the UTF-8 string to convert
1348
 * @param length        the length of the string in bytes
1349
 * @param utf16_output  the pointer to buffer that can hold conversion result
1350
 * @return a result pair struct (of type simdutf::result containing the two
1351
 * fields error and count) with an error code and either position of the error
1352
 * (in the input in code units) if any, or the number of char16_t written if
1353
 * successful.
1354
 */
1355
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
1356
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1357
  #if SIMDUTF_SPAN
1358
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1359
convert_utf8_to_utf16be_with_errors(
1360
    const detail::input_span_of_byte_like auto &utf8_input,
1361
    std::span<char16_t> utf16_output) noexcept {
1362
    #if SIMDUTF_CPLUSPLUS23
1363
  if consteval {
1364
    return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(
1365
        utf8_input.data(), utf8_input.size(), utf16_output.data());
1366
  } else
1367
    #endif
1368
  {
1369
    return convert_utf8_to_utf16be_with_errors(
1370
        reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1371
        utf16_output.data());
1372
  }
1373
}
1374
  #endif // SIMDUTF_SPAN
1375
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1376
1377
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1378
/**
1379
 * Convert possibly broken UTF-8 string into UTF-32 string.
1380
 *
1381
 * During the conversion also validation of the input string is done.
1382
 * This function is suitable to work with inputs from untrusted sources.
1383
 *
1384
 * @param input         the UTF-8 string to convert
1385
 * @param length        the length of the string in bytes
1386
 * @param utf32_output  the pointer to buffer that can hold conversion result
1387
 * @return the number of written char32_t; 0 if the input was not valid UTF-8
1388
 * string
1389
 */
1390
simdutf_warn_unused size_t convert_utf8_to_utf32(
1391
    const char *input, size_t length, char32_t *utf32_output) noexcept;
1392
  #if SIMDUTF_SPAN
1393
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1394
convert_utf8_to_utf32(const detail::input_span_of_byte_like auto &utf8_input,
1395
                      std::span<char32_t> utf32_output) noexcept {
1396
    #if SIMDUTF_CPLUSPLUS23
1397
  if consteval {
1398
    return scalar::utf8_to_utf32::convert(utf8_input.data(), utf8_input.size(),
1399
                                          utf32_output.data());
1400
  } else
1401
    #endif
1402
  {
1403
    return convert_utf8_to_utf32(
1404
        reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1405
        utf32_output.data());
1406
  }
1407
}
1408
  #endif // SIMDUTF_SPAN
1409
1410
/**
1411
 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
1412
 *
1413
 * During the conversion also validation of the input string is done.
1414
 * This function is suitable to work with inputs from untrusted sources.
1415
 *
1416
 * @param input         the UTF-8 string to convert
1417
 * @param length        the length of the string in bytes
1418
 * @param utf32_output  the pointer to buffer that can hold conversion result
1419
 * @return a result pair struct (of type simdutf::result containing the two
1420
 * fields error and count) with an error code and either position of the error
1421
 * (in the input in code units) if any, or the number of char32_t written if
1422
 * successful.
1423
 */
1424
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
1425
    const char *input, size_t length, char32_t *utf32_output) noexcept;
1426
  #if SIMDUTF_SPAN
1427
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1428
convert_utf8_to_utf32_with_errors(
1429
    const detail::input_span_of_byte_like auto &utf8_input,
1430
    std::span<char32_t> utf32_output) noexcept {
1431
    #if SIMDUTF_CPLUSPLUS23
1432
  if consteval {
1433
    return scalar::utf8_to_utf32::convert_with_errors(
1434
        utf8_input.data(), utf8_input.size(), utf32_output.data());
1435
  } else
1436
    #endif
1437
  {
1438
    return convert_utf8_to_utf32_with_errors(
1439
        reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1440
        utf32_output.data());
1441
  }
1442
}
1443
  #endif // SIMDUTF_SPAN
1444
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1445
1446
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1447
/**
1448
 * Convert valid UTF-8 string into latin1 string.
1449
 *
1450
 * This function assumes that the input string is valid UTF-8 and that it can be
1451
 * represented as Latin1. If you violate this assumption, the result is
1452
 * implementation defined and may include system-dependent behavior such as
1453
 * crashes.
1454
 *
1455
 * This function is for expert users only and not part of our public API. Use
1456
 * convert_utf8_to_latin1 instead. The function may be removed from the library
1457
 * in the future.
1458
 *
1459
 * This function is not BOM-aware.
1460
 *
1461
 * @param input         the UTF-8 string to convert
1462
 * @param length        the length of the string in bytes
1463
 * @param latin1_output  the pointer to buffer that can hold conversion result
1464
 * @return the number of written char; 0 if the input was not valid UTF-8 string
1465
 */
1466
simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
1467
    const char *input, size_t length, char *latin1_output) noexcept;
1468
  #if SIMDUTF_SPAN
1469
/**
 * Span overload of convert_valid_utf8_to_latin1.
 *
 * Forwards the data pointers and the input size to the scalar routine (during
 * constant evaluation, when SIMDUTF_CPLUSPLUS23 is enabled) or to the
 * pointer-based overload (at run time). The size of latin1_output is not
 * forwarded, so the caller is responsible for providing a buffer that can hold
 * the conversion result.
 *
 * @param valid_utf8_input  the UTF-8 bytes to convert
 * @param latin1_output     the buffer that receives the converted bytes
 * @return the value returned by the underlying conversion routine
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_valid_utf8_to_latin1(
    const detail::input_span_of_byte_like auto &valid_utf8_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf8_to_latin1::convert_valid(
        valid_utf8_input.data(), valid_utf8_input.size(), latin1_output.data());
  } else
    #endif
  {
    // The pointer overload takes char *, while the output span may hold a
    // byte-like element type other than char (unsigned char, std::byte, ...),
    // so the data pointer has to be cast explicitly, as the other span
    // overloads writing Latin1/UTF-8 output already do.
    return convert_valid_utf8_to_latin1(
        reinterpret_cast<const char *>(valid_utf8_input.data()),
        valid_utf8_input.size(),
        reinterpret_cast<char *>(latin1_output.data()));
  }
}
1485
  #endif // SIMDUTF_SPAN
1486
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1487
1488
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1489
/**
1490
 * Using native endianness, convert valid UTF-8 string into a UTF-16 string.
1491
 *
1492
 * This function assumes that the input string is valid UTF-8.
1493
 *
1494
 * @param input         the UTF-8 string to convert
1495
 * @param length        the length of the string in bytes
1496
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1497
 * @return the number of written char16_t
1498
 */
1499
simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
1500
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1501
  #if SIMDUTF_SPAN
1502
/**
 * Span-based overload of convert_valid_utf8_to_utf16 (native endianness).
 *
 * Constant evaluation is served by the scalar routine; at run time the call
 * is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_valid_utf8_to_utf16(
    const detail::input_span_of_byte_like auto &valid_utf8_input,
    std::span<char16_t> utf16_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf8_to_utf16::convert_valid<endianness::NATIVE>(
        valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data());
  }
    #endif
  // Runtime path: view the byte-like input as plain chars.
  const char *const utf8_bytes =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  return convert_valid_utf8_to_utf16(utf8_bytes, valid_utf8_input.size(),
                                     utf16_output.data());
}
1518
  #endif // SIMDUTF_SPAN
1519
1520
/**
1521
 * Convert valid UTF-8 string into UTF-16LE string.
1522
 *
1523
 * This function assumes that the input string is valid UTF-8.
1524
 *
1525
 * @param input         the UTF-8 string to convert
1526
 * @param length        the length of the string in bytes
1527
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1528
 * @return the number of written char16_t
1529
 */
1530
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
1531
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1532
  #if SIMDUTF_SPAN
1533
/**
 * Span-based overload of convert_valid_utf8_to_utf16le.
 *
 * Constant evaluation is served by the scalar little-endian routine; at run
 * time the call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_valid_utf8_to_utf16le(
    const detail::input_span_of_byte_like auto &valid_utf8_input,
    std::span<char16_t> utf16_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
        valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data());
  }
    #endif
  // Runtime path: view the byte-like input as plain chars.
  const char *const utf8_bytes =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  return convert_valid_utf8_to_utf16le(utf8_bytes, valid_utf8_input.size(),
                                       utf16_output.data());
}
1550
  #endif // SIMDUTF_SPAN
1551
1552
/**
1553
 * Convert valid UTF-8 string into UTF-16BE string.
1554
 *
1555
 * This function assumes that the input string is valid UTF-8.
1556
 *
1557
 * @param input         the UTF-8 string to convert
1558
 * @param length        the length of the string in bytes
1559
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1560
 * @return the number of written char16_t
1561
 */
1562
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
1563
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1564
  #if SIMDUTF_SPAN
1565
/**
 * Span-based overload of convert_valid_utf8_to_utf16be.
 *
 * Constant evaluation is served by the scalar big-endian routine; at run
 * time the call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_valid_utf8_to_utf16be(
    const detail::input_span_of_byte_like auto &valid_utf8_input,
    std::span<char16_t> utf16_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
        valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data());
  }
    #endif
  // Runtime path: view the byte-like input as plain chars.
  const char *const utf8_bytes =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  return convert_valid_utf8_to_utf16be(utf8_bytes, valid_utf8_input.size(),
                                       utf16_output.data());
}
1581
  #endif // SIMDUTF_SPAN
1582
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1583
1584
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1585
/**
1586
 * Convert valid UTF-8 string into UTF-32 string.
1587
 *
1588
 * This function assumes that the input string is valid UTF-8.
1589
 *
1590
 * @param input         the UTF-8 string to convert
1591
 * @param length        the length of the string in bytes
1592
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
1593
 * @return the number of written char32_t
1594
 */
1595
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
1596
    const char *input, size_t length, char32_t *utf32_buffer) noexcept;
1597
  #if SIMDUTF_SPAN
1598
/**
 * Span-based overload of convert_valid_utf8_to_utf32.
 *
 * Constant evaluation is served by the scalar routine; at run time the call
 * is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_valid_utf8_to_utf32(
    const detail::input_span_of_byte_like auto &valid_utf8_input,
    std::span<char32_t> utf32_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf8_to_utf32::convert_valid(
        valid_utf8_input.data(), valid_utf8_input.size(), utf32_output.data());
  }
    #endif
  // Runtime path: view the byte-like input as plain chars.
  const char *const utf8_bytes =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  return convert_valid_utf8_to_utf32(utf8_bytes, valid_utf8_input.size(),
                                     utf32_output.data());
}
1614
  #endif // SIMDUTF_SPAN
1615
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1616
1617
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1618
/**
1619
 * Return the number of bytes that this Latin1 string would require in UTF-8
1620
 * format.
1621
 *
1622
 * @param input         the Latin1 string to convert
1623
 * @param length        the length of the string in bytes
1624
 * @return the number of bytes required to encode the Latin1 string as UTF-8
1625
 */
1626
simdutf_warn_unused size_t utf8_length_from_latin1(const char *input,
1627
                                                   size_t length) noexcept;
1628
  #if SIMDUTF_SPAN
1629
/**
 * Span-based overload of utf8_length_from_latin1.
 *
 * Constant evaluation is served by the scalar routine; at run time the call
 * is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
utf8_length_from_latin1(
    const detail::input_span_of_byte_like auto &latin1_input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::latin1_to_utf8::utf8_length_from_latin1(latin1_input.data(),
                                                           latin1_input.size());
  }
    #endif
  // Runtime path: view the byte-like input as plain chars.
  const char *const latin1_bytes =
      reinterpret_cast<const char *>(latin1_input.data());
  return utf8_length_from_latin1(latin1_bytes, latin1_input.size());
}
1644
  #endif // SIMDUTF_SPAN
1645
1646
/**
1647
 * Compute the number of bytes that this UTF-8 string would require in Latin1
1648
 * format.
1649
 *
1650
 * This function does not validate the input. It is acceptable to pass invalid
1651
 * UTF-8 strings but in such cases the result is implementation defined.
1652
 *
1653
 * This function is not BOM-aware.
1654
 *
1655
 * @param input         the UTF-8 string to convert
1656
 * @param length        the length of the string in bytes
1657
 * @return the number of bytes required to encode the UTF-8 string as Latin1
1658
 */
1659
simdutf_warn_unused size_t latin1_length_from_utf8(const char *input,
1660
                                                   size_t length) noexcept;
1661
  #if SIMDUTF_SPAN
1662
/**
 * Span-based overload of latin1_length_from_utf8.
 *
 * Constant evaluation uses the scalar code-point counter; at run time the
 * call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
latin1_length_from_utf8(
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf8::count_code_points(valid_utf8_input.data(),
                                           valid_utf8_input.size());
  }
    #endif
  // Runtime path: view the byte-like input as plain chars.
  const char *const utf8_bytes =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  return latin1_length_from_utf8(utf8_bytes, valid_utf8_input.size());
}
1677
  #endif // SIMDUTF_SPAN
1678
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1679
1680
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1681
/**
1682
 * Compute the number of 2-byte code units that this UTF-8 string would require
1683
 * in UTF-16LE format.
1684
 *
1685
 * This function does not validate the input. It is acceptable to pass invalid
1686
 * UTF-8 strings but in such cases the result is implementation defined.
1687
 *
1688
 * This function is not BOM-aware.
1689
 *
1690
 * @param input         the UTF-8 string to process
1691
 * @param length        the length of the string in bytes
1692
 * @return the number of char16_t code units required to encode the UTF-8 string
1693
 * as UTF-16LE
1694
 */
1695
simdutf_warn_unused size_t utf16_length_from_utf8(const char *input,
1696
                                                  size_t length) noexcept;
1697
  #if SIMDUTF_SPAN
1698
/**
 * Span-based overload of utf16_length_from_utf8.
 *
 * Constant evaluation is served by the scalar routine; at run time the call
 * is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
utf16_length_from_utf8(
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf8::utf16_length_from_utf8(valid_utf8_input.data(),
                                                valid_utf8_input.size());
  }
    #endif
  // Runtime path: view the byte-like input as plain chars.
  const char *const utf8_bytes =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  return utf16_length_from_utf8(utf8_bytes, valid_utf8_input.size());
}
1713
  #endif // SIMDUTF_SPAN
1714
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1715
1716
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1717
/**
1718
 * Compute the number of 4-byte code units that this UTF-8 string would require
1719
 * in UTF-32 format.
1720
 *
1721
 * This function is equivalent to count_utf8
1722
 *
1723
 * This function does not validate the input. It is acceptable to pass invalid
1724
 * UTF-8 strings but in such cases the result is implementation defined.
1725
 *
1726
 * This function is not BOM-aware.
1727
 *
1728
 * @param input         the UTF-8 string to process
1729
 * @param length        the length of the string in bytes
1730
 * @return the number of char32_t code units required to encode the UTF-8 string
1731
 * as UTF-32
1732
 */
1733
simdutf_warn_unused size_t utf32_length_from_utf8(const char *input,
1734
                                                  size_t length) noexcept;
1735
  #if SIMDUTF_SPAN
1736
/**
 * Span-based overload of utf32_length_from_utf8.
 *
 * Constant evaluation uses the scalar code-point counter; at run time the
 * call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
utf32_length_from_utf8(
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf8::count_code_points(valid_utf8_input.data(),
                                           valid_utf8_input.size());
  }
    #endif
  // Runtime path: view the byte-like input as plain chars.
  const char *const utf8_bytes =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  return utf32_length_from_utf8(utf8_bytes, valid_utf8_input.size());
}
1752
  #endif // SIMDUTF_SPAN
1753
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1754
1755
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1756
/**
1757
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1758
 * string.
1759
 *
1760
 * During the conversion also validation of the input string is done.
1761
 * This function is suitable to work with inputs from untrusted sources.
1762
 *
1763
 * This function is not BOM-aware.
1764
 *
1765
 * @param input         the UTF-16 string to convert
1766
 * @param length        the length of the string in 2-byte code units (char16_t)
1767
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1768
 * @return number of written code units; 0 if input is not a valid UTF-16
1769
 * string
1770
 */
1771
simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *input,
1772
                                                 size_t length,
1773
                                                 char *utf8_buffer) noexcept;
1774
  #if SIMDUTF_SPAN
1775
/**
 * Span-based overload of convert_utf16_to_utf8 (native endianness).
 *
 * Constant evaluation is served by the scalar routine; at run time the call
 * is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf16_to_utf8(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16_to_utf8::convert<endianness::NATIVE>(
        utf16_input.data(), utf16_input.size(), utf8_output.data());
  }
    #endif
  // Runtime path: view the byte-like output as plain chars.
  char *const utf8_bytes = reinterpret_cast<char *>(utf8_output.data());
  return convert_utf16_to_utf8(utf16_input.data(), utf16_input.size(),
                               utf8_bytes);
}
1790
  #endif // SIMDUTF_SPAN
1791
1792
/**
1793
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1794
 * string with output limit.
1795
 *
1796
 * We write as many characters as possible into the output buffer.
1797
 *
1798
 * During the conversion also validation of the input string is done.
1799
 * This function is suitable to work with inputs from untrusted sources.
1800
 *
1801
 * This function is not BOM-aware.
1802
 *
1803
 *
1804
 * @param input         the UTF-16 string to convert
1805
 * @param length        the length of the string in 16-bit code units (char16_t)
1806
 * @param utf8_output   the pointer to buffer that can hold conversion result
1807
 * @param utf8_len      the maximum output length
1808
 * @return the number of written char; 0 if conversion is not possible
1809
 */
1810
simdutf_warn_unused size_t convert_utf16_to_utf8_safe(const char16_t *input,
1811
                                                      size_t length,
1812
                                                      char *utf8_output,
1813
                                                      size_t utf8_len) noexcept;
1814
  #if SIMDUTF_SPAN
1815
/**
 * Span-based overload of convert_utf16_to_utf8_safe; the output limit is the
 * size of utf8_output.
 *
 * Implementation note: the output is taken as a forwarding reference so that
 * nothing is copied and both lvalues and rvalues are accepted. A std::span is
 * cheap to copy, a std::vector is not, and both must be usable here. An
 * owning temporary (for example a temporary std::string) is accepted too, but
 * the caller then has no way of getting the written data back out.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf16_to_utf8_safe(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    const full_result r =
        scalar::utf16_to_utf8::convert_with_errors<endianness::NATIVE, true>(
            utf16_input.data(), utf16_input.size(), utf8_output.data(),
            utf8_output.size());
    // Only success and a full output buffer yield a usable count; any other
    // error is reported as 0.
    const bool usable = r.error == error_code::SUCCESS ||
                        r.error == error_code::OUTPUT_BUFFER_TOO_SMALL;
    if (!usable) {
      return 0;
    }
    return r.output_count;
  }
    #endif
  // Runtime path: view the byte-like output as plain chars.
  char *const utf8_bytes = reinterpret_cast<char *>(utf8_output.data());
  return convert_utf16_to_utf8_safe(utf16_input.data(), utf16_input.size(),
                                    utf8_bytes, utf8_output.size());
}
1844
  #endif // SIMDUTF_SPAN
1845
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1846
1847
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1848
/**
1849
 * Using native endianness, convert possibly broken UTF-16 string into Latin1
1850
 * string.
1851
 *
1852
 * During the conversion also validation of the input string is done.
1853
 * This function is suitable to work with inputs from untrusted sources.
1854
 *
1855
 * This function is not BOM-aware.
1856
 *
1857
 * @param input         the UTF-16 string to convert
1858
 * @param length        the length of the string in 2-byte code units (char16_t)
1859
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1860
 * @return number of written code units; 0 if input is not a valid UTF-16 string
1861
 * or if it cannot be represented as Latin1
1862
 */
1863
simdutf_warn_unused size_t convert_utf16_to_latin1(
1864
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1865
  #if SIMDUTF_SPAN
1866
/**
 * Span-based overload of convert_utf16_to_latin1 (native endianness).
 *
 * Constant evaluation is served by the scalar routine; at run time the call
 * is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf16_to_latin1(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16_to_latin1::convert<endianness::NATIVE>(
        utf16_input.data(), utf16_input.size(), latin1_output.data());
  }
    #endif
  // Runtime path: view the byte-like output as plain chars.
  char *const latin1_bytes = reinterpret_cast<char *>(latin1_output.data());
  return convert_utf16_to_latin1(utf16_input.data(), utf16_input.size(),
                                 latin1_bytes);
}
1882
  #endif // SIMDUTF_SPAN
1883
1884
/**
1885
 * Convert possibly broken UTF-16LE string into Latin1 string.
1886
 * If the string cannot be represented as Latin1, an error
1887
 * is returned.
1888
 *
1889
 * During the conversion also validation of the input string is done.
1890
 * This function is suitable to work with inputs from untrusted sources.
1891
 *
1892
 * This function is not BOM-aware.
1893
 *
1894
 * @param input         the UTF-16LE string to convert
1895
 * @param length        the length of the string in 2-byte code units (char16_t)
1896
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1897
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1898
 * string or if it cannot be represented as Latin1
1899
 */
1900
simdutf_warn_unused size_t convert_utf16le_to_latin1(
1901
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1902
  #if SIMDUTF_SPAN
1903
/**
 * Span-based overload of convert_utf16le_to_latin1.
 *
 * Constant evaluation is served by the scalar little-endian routine; at run
 * time the call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf16le_to_latin1(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(
        utf16_input.data(), utf16_input.size(), latin1_output.data());
  }
    #endif
  // Runtime path: view the byte-like output as plain chars.
  char *const latin1_bytes = reinterpret_cast<char *>(latin1_output.data());
  return convert_utf16le_to_latin1(utf16_input.data(), utf16_input.size(),
                                   latin1_bytes);
}
1919
  #endif // SIMDUTF_SPAN
1920
1921
/**
1922
 * Convert possibly broken UTF-16BE string into Latin1 string.
1923
 *
1924
 * During the conversion also validation of the input string is done.
1925
 * This function is suitable to work with inputs from untrusted sources.
1926
 *
1927
 * This function is not BOM-aware.
1928
 *
1929
 * @param input         the UTF-16BE string to convert
1930
 * @param length        the length of the string in 2-byte code units (char16_t)
1931
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1932
 * @return number of written code units; 0 if input is not a valid UTF-16BE
1933
 * string or if it cannot be represented as Latin1
1934
 */
1935
simdutf_warn_unused size_t convert_utf16be_to_latin1(
1936
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1937
  #if SIMDUTF_SPAN
1938
/**
 * Span-based overload of convert_utf16be_to_latin1.
 *
 * Constant evaluation is served by the scalar big-endian routine; at run
 * time the call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf16be_to_latin1(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16_to_latin1::convert<endianness::BIG>(
        utf16_input.data(), utf16_input.size(), latin1_output.data());
  }
    #endif
  // Runtime path: view the byte-like output as plain chars.
  char *const latin1_bytes = reinterpret_cast<char *>(latin1_output.data());
  return convert_utf16be_to_latin1(utf16_input.data(), utf16_input.size(),
                                   latin1_bytes);
}
1954
  #endif // SIMDUTF_SPAN
1955
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1956
1957
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1958
/**
1959
 * Convert possibly broken UTF-16LE string into UTF-8 string.
1960
 *
1961
 * During the conversion also validation of the input string is done.
1962
 * This function is suitable to work with inputs from untrusted sources.
1963
 *
1964
 * This function is not BOM-aware.
1965
 *
1966
 * @param input         the UTF-16LE string to convert
1967
 * @param length        the length of the string in 2-byte code units (char16_t)
1968
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1969
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1970
 * string
1971
 */
1972
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *input,
1973
                                                   size_t length,
1974
                                                   char *utf8_buffer) noexcept;
1975
  #if SIMDUTF_SPAN
1976
/**
 * Span-based overload of convert_utf16le_to_utf8.
 *
 * Constant evaluation is served by the scalar little-endian routine; at run
 * time the call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf16le_to_utf8(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16_to_utf8::convert<endianness::LITTLE>(
        utf16_input.data(), utf16_input.size(), utf8_output.data());
  }
    #endif
  // Runtime path: view the byte-like output as plain chars.
  char *const utf8_bytes = reinterpret_cast<char *>(utf8_output.data());
  return convert_utf16le_to_utf8(utf16_input.data(), utf16_input.size(),
                                 utf8_bytes);
}
1992
  #endif // SIMDUTF_SPAN
1993
1994
/**
1995
 * Convert possibly broken UTF-16BE string into UTF-8 string.
1996
 *
1997
 * During the conversion also validation of the input string is done.
1998
 * This function is suitable to work with inputs from untrusted sources.
1999
 *
2000
 * This function is not BOM-aware.
2001
 *
2002
 * @param input         the UTF-16BE string to convert
2003
 * @param length        the length of the string in 2-byte code units (char16_t)
2004
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2005
 * @return number of written code units; 0 if input is not a valid UTF-16BE
2006
 * string
2007
 */
2008
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *input,
2009
                                                   size_t length,
2010
                                                   char *utf8_buffer) noexcept;
2011
  #if SIMDUTF_SPAN
2012
/**
 * Span-based overload of convert_utf16be_to_utf8.
 *
 * Constant evaluation is served by the scalar big-endian routine; at run
 * time the call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf16be_to_utf8(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16_to_utf8::convert<endianness::BIG>(
        utf16_input.data(), utf16_input.size(), utf8_output.data());
  }
    #endif
  // Runtime path: view the byte-like output as plain chars.
  char *const utf8_bytes = reinterpret_cast<char *>(utf8_output.data());
  return convert_utf16be_to_utf8(utf16_input.data(), utf16_input.size(),
                                 utf8_bytes);
}
2028
  #endif // SIMDUTF_SPAN
2029
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2030
2031
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2032
/**
2033
 * Using native endianness, convert possibly broken UTF-16 string into Latin1
2034
 * string.
2035
 *
2036
 * During the conversion also validation of the input string is done.
2037
 * This function is suitable to work with inputs from untrusted sources.
2038
 * This function is not BOM-aware.
2039
 *
2040
 * @param input         the UTF-16 string to convert
2041
 * @param length        the length of the string in 2-byte code units (char16_t)
2042
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2043
 * @return a result pair struct (of type simdutf::result containing the two
2044
 * fields error and count) with an error code and either position of the error
2045
 * (in the input in code units) if any, or the number of char written if
2046
 * successful.
2047
 */
2048
simdutf_warn_unused result convert_utf16_to_latin1_with_errors(
2049
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2050
  #if SIMDUTF_SPAN
2051
/**
 * Span-based overload of convert_utf16_to_latin1_with_errors (native
 * endianness).
 *
 * Constant evaluation is served by the scalar routine; at run time the call
 * is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
convert_utf16_to_latin1_with_errors(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16_to_latin1::convert_with_errors<endianness::NATIVE>(
        utf16_input.data(), utf16_input.size(), latin1_output.data());
  }
    #endif
  // Runtime path: view the byte-like output as plain chars.
  char *const latin1_bytes = reinterpret_cast<char *>(latin1_output.data());
  return convert_utf16_to_latin1_with_errors(
      utf16_input.data(), utf16_input.size(), latin1_bytes);
}
2067
  #endif // SIMDUTF_SPAN
2068
2069
/**
2070
 * Convert possibly broken UTF-16LE string into Latin1 string.
2071
 *
2072
 * During the conversion also validation of the input string is done.
2073
 * This function is suitable to work with inputs from untrusted sources.
2074
 * This function is not BOM-aware.
2075
 *
2076
 * @param input         the UTF-16LE string to convert
2077
 * @param length        the length of the string in 2-byte code units (char16_t)
2078
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2079
 * @return a result pair struct (of type simdutf::result containing the two
2080
 * fields error and count) with an error code and either position of the error
2081
 * (in the input in code units) if any, or the number of char written if
2082
 * successful.
2083
 */
2084
simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
2085
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2086
  #if SIMDUTF_SPAN
2087
/**
 * Span-based overload of convert_utf16le_to_latin1_with_errors.
 *
 * Constant evaluation is served by the scalar little-endian routine; at run
 * time the call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
convert_utf16le_to_latin1_with_errors(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
        utf16_input.data(), utf16_input.size(), latin1_output.data());
  }
    #endif
  // Runtime path: view the byte-like output as plain chars.
  char *const latin1_bytes = reinterpret_cast<char *>(latin1_output.data());
  return convert_utf16le_to_latin1_with_errors(
      utf16_input.data(), utf16_input.size(), latin1_bytes);
}
2103
  #endif // SIMDUTF_SPAN
2104
2105
/**
2106
 * Convert possibly broken UTF-16BE string into Latin1 string.
2107
 * If the string cannot be represented as Latin1, an error
2108
 * is returned.
2109
 *
2110
 * During the conversion also validation of the input string is done.
2111
 * This function is suitable to work with inputs from untrusted sources.
2112
 * This function is not BOM-aware.
2113
 *
2114
 * @param input         the UTF-16BE string to convert
2115
 * @param length        the length of the string in 2-byte code units (char16_t)
2116
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2117
 * @return a result pair struct (of type simdutf::result containing the two
2118
 * fields error and count) with an error code and either position of the error
2119
 * (in the input in code units) if any, or the number of char written if
2120
 * successful.
2121
 */
2122
simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
2123
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2124
  #if SIMDUTF_SPAN
2125
/**
 * Span-based overload of convert_utf16be_to_latin1_with_errors.
 *
 * Constant evaluation is served by the scalar big-endian routine; at run
 * time the call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
convert_utf16be_to_latin1_with_errors(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
        utf16_input.data(), utf16_input.size(), latin1_output.data());
  }
    #endif
  // Runtime path: view the byte-like output as plain chars.
  char *const latin1_bytes = reinterpret_cast<char *>(latin1_output.data());
  return convert_utf16be_to_latin1_with_errors(
      utf16_input.data(), utf16_input.size(), latin1_bytes);
}
2141
  #endif // SIMDUTF_SPAN
2142
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2143
2144
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2145
/**
2146
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
2147
 * string and stop on error.
2148
 *
2149
 * During the conversion also validation of the input string is done.
2150
 * This function is suitable to work with inputs from untrusted sources.
2151
 *
2152
 * This function is not BOM-aware.
2153
 *
2154
 * @param input         the UTF-16 string to convert
2155
 * @param length        the length of the string in 2-byte code units (char16_t)
2156
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2157
 * @return a result pair struct (of type simdutf::result containing the two
2158
 * fields error and count) with an error code and either position of the error
2159
 * (in the input in code units) if any, or the number of char written if
2160
 * successful.
2161
 */
2162
simdutf_warn_unused result convert_utf16_to_utf8_with_errors(
2163
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2164
  #if SIMDUTF_SPAN
2165
/// Span overload: hands the span's data pointer and element count to the
/// pointer-based convert_utf16_to_utf8_with_errors. Only `.data()` of the
/// output span is used; its size is not checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
convert_utf16_to_utf8_with_errors(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar native-endian kernel.
  if consteval {
    return scalar::utf16_to_utf8::convert_with_errors<endianness::NATIVE>(
        src, src_len, utf8_output.data());
  }
    #endif
  // Otherwise forward to the pointer overload, viewing the output as char.
  return convert_utf16_to_utf8_with_errors(
      src, src_len, reinterpret_cast<char *>(utf8_output.data()));
}
2181
  #endif // SIMDUTF_SPAN
2182
2183
/**
2184
 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
2185
 *
2186
 * During the conversion also validation of the input string is done.
2187
 * This function is suitable to work with inputs from untrusted sources.
2188
 *
2189
 * This function is not BOM-aware.
2190
 *
2191
 * @param input         the UTF-16LE string to convert
2192
 * @param length        the length of the string in 2-byte code units (char16_t)
2193
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2194
 * @return a result pair struct (of type simdutf::result containing the two
2195
 * fields error and count) with an error code and either position of the error
2196
 * (in the input in code units) if any, or the number of char written if
2197
 * successful.
2198
 */
2199
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
2200
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2201
  #if SIMDUTF_SPAN
2202
/// Span overload: hands the span's data pointer and element count to the
/// pointer-based convert_utf16le_to_utf8_with_errors. Only `.data()` of the
/// output span is used; its size is not checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
convert_utf16le_to_utf8_with_errors(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar little-endian kernel.
  if consteval {
    return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
        src, src_len, utf8_output.data());
  }
    #endif
  // Otherwise forward to the pointer overload, viewing the output as char.
  return convert_utf16le_to_utf8_with_errors(
      src, src_len, reinterpret_cast<char *>(utf8_output.data()));
}
2218
  #endif // SIMDUTF_SPAN
2219
2220
/**
2221
 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
2222
 *
2223
 * During the conversion also validation of the input string is done.
2224
 * This function is suitable to work with inputs from untrusted sources.
2225
 *
2226
 * This function is not BOM-aware.
2227
 *
2228
 * @param input         the UTF-16BE string to convert
2229
 * @param length        the length of the string in 2-byte code units (char16_t)
2230
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2231
 * @return a result pair struct (of type simdutf::result containing the two
2232
 * fields error and count) with an error code and either position of the error
2233
 * (in the input in code units) if any, or the number of char written if
2234
 * successful.
2235
 */
2236
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
2237
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2238
  #if SIMDUTF_SPAN
2239
/// Span overload: hands the span's data pointer and element count to the
/// pointer-based convert_utf16be_to_utf8_with_errors. Only `.data()` of the
/// output span is used; its size is not checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
convert_utf16be_to_utf8_with_errors(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar big-endian kernel.
  if consteval {
    return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
        src, src_len, utf8_output.data());
  }
    #endif
  // Otherwise forward to the pointer overload, viewing the output as char.
  return convert_utf16be_to_utf8_with_errors(
      src, src_len, reinterpret_cast<char *>(utf8_output.data()));
}
2255
  #endif // SIMDUTF_SPAN
2256
2257
/**
2258
 * Convert possibly broken UTF-16LE string into UTF-8 string, replacing
2259
 * unpaired surrogates with the Unicode replacement character U+FFFD.
2260
 *
2261
 * This function always succeeds: unpaired surrogates are replaced with
2262
 * U+FFFD (3 bytes in UTF-8: 0xEF 0xBF 0xBD).
2263
 *
2264
 * This function is not BOM-aware.
2265
 *
2266
 * @param input         the UTF-16LE string to convert
2267
 * @param length        the length of the string in 2-byte code units (char16_t)
2268
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2269
 * @return number of written code units
2270
 */
2271
simdutf_warn_unused size_t convert_utf16le_to_utf8_with_replacement(
2272
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2273
  #if SIMDUTF_SPAN
2274
/// Span overload: hands the span's data pointer and element count to the
/// pointer-based convert_utf16le_to_utf8_with_replacement. Only `.data()` of
/// the output span is used; its size is not checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf16le_to_utf8_with_replacement(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar little-endian kernel.
  if consteval {
    return scalar::utf16_to_utf8::convert_with_replacement<endianness::LITTLE>(
        src, src_len, utf8_output.data());
  }
    #endif
  // Otherwise forward to the pointer overload, viewing the output as char.
  return convert_utf16le_to_utf8_with_replacement(
      src, src_len, reinterpret_cast<char *>(utf8_output.data()));
}
2290
  #endif // SIMDUTF_SPAN
2291
2292
/**
2293
 * Convert possibly broken UTF-16BE string into UTF-8 string, replacing
2294
 * unpaired surrogates with the Unicode replacement character U+FFFD.
2295
 *
2296
 * This function always succeeds: unpaired surrogates are replaced with
2297
 * U+FFFD (3 bytes in UTF-8: 0xEF 0xBF 0xBD).
2298
 *
2299
 * This function is not BOM-aware.
2300
 *
2301
 * @param input         the UTF-16BE string to convert
2302
 * @param length        the length of the string in 2-byte code units (char16_t)
2303
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2304
 * @return number of written code units
2305
 */
2306
simdutf_warn_unused size_t convert_utf16be_to_utf8_with_replacement(
2307
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2308
  #if SIMDUTF_SPAN
2309
/// Span overload: hands the span's data pointer and element count to the
/// pointer-based convert_utf16be_to_utf8_with_replacement. Only `.data()` of
/// the output span is used; its size is not checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf16be_to_utf8_with_replacement(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar big-endian kernel.
  if consteval {
    return scalar::utf16_to_utf8::convert_with_replacement<endianness::BIG>(
        src, src_len, utf8_output.data());
  }
    #endif
  // Otherwise forward to the pointer overload, viewing the output as char.
  return convert_utf16be_to_utf8_with_replacement(
      src, src_len, reinterpret_cast<char *>(utf8_output.data()));
}
2325
  #endif // SIMDUTF_SPAN
2326
2327
/**
2328
 * Convert possibly broken UTF-16 string (native endianness) into UTF-8 string,
2329
 * replacing unpaired surrogates with the Unicode replacement character U+FFFD.
2330
 *
2331
 * This function always succeeds: unpaired surrogates are replaced with
2332
 * U+FFFD (3 bytes in UTF-8: 0xEF 0xBF 0xBD).
2333
 *
2334
 * This function is not BOM-aware.
2335
 *
2336
 * @param input         the UTF-16 string to convert
2337
 * @param length        the length of the string in 2-byte code units (char16_t)
2338
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2339
 * @return number of written code units
2340
 */
2341
simdutf_warn_unused size_t convert_utf16_to_utf8_with_replacement(
2342
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2343
  #if SIMDUTF_SPAN
2344
/// Span overload: hands the span's data pointer and element count to the
/// pointer-based convert_utf16_to_utf8_with_replacement. Only `.data()` of
/// the output span is used; its size is not checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf16_to_utf8_with_replacement(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar native-endian kernel.
  if consteval {
    return scalar::utf16_to_utf8::convert_with_replacement<endianness::NATIVE>(
        src, src_len, utf8_output.data());
  }
    #endif
  // Otherwise forward to the pointer overload, viewing the output as char.
  return convert_utf16_to_utf8_with_replacement(
      src, src_len, reinterpret_cast<char *>(utf8_output.data()));
}
2360
  #endif // SIMDUTF_SPAN
2361
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2362
2363
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2364
/**
2365
 * Using native endianness, convert valid UTF-16 string into UTF-8 string.
2366
 *
2367
 * This function assumes that the input string is valid UTF-16.
2368
 *
2369
 * This function is not BOM-aware.
2370
 *
2371
 * @param input         the UTF-16 string to convert
2372
 * @param length        the length of the string in 2-byte code units (char16_t)
2373
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
2374
 * result
2375
 * @return number of written code units; 0 if conversion is not possible
2376
 */
2377
simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
2378
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2379
  #if SIMDUTF_SPAN
2380
/// Span overload: hands the span's data pointer and element count to the
/// pointer-based convert_valid_utf16_to_utf8. Only `.data()` of the output
/// span is used; its size is not checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_valid_utf16_to_utf8(
    std::span<const char16_t> valid_utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  const char16_t *const src = valid_utf16_input.data();
  const size_t src_len = valid_utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar native-endian kernel.
  if consteval {
    return scalar::utf16_to_utf8::convert_valid<endianness::NATIVE>(
        src, src_len, utf8_output.data());
  }
    #endif
  // Otherwise forward to the pointer overload, viewing the output as char.
  return convert_valid_utf16_to_utf8(
      src, src_len, reinterpret_cast<char *>(utf8_output.data()));
}
2396
  #endif // SIMDUTF_SPAN
2397
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2398
2399
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2400
/**
2401
 * Using native endianness, convert UTF-16 string into Latin1 string.
2402
 *
2403
 * This function assumes that the input string is valid UTF-16 and that it can
2404
 * be represented as Latin1. If you violate this assumption, the result is
2405
 * implementation defined and may include system-dependent behavior such as
2406
 * crashes.
2407
 *
2408
 * This function is for expert users only and not part of our public API. Use
2409
 * convert_utf16_to_latin1 instead. The function may be removed from the library
2410
 * in the future.
2411
 *
2412
 * This function is not BOM-aware.
2413
 *
2414
 * @param input         the UTF-16 string to convert
2415
 * @param length        the length of the string in 2-byte code units (char16_t)
2416
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2417
 * @return number of written code units; 0 if conversion is not possible
2418
 */
2419
simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
2420
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2421
  #if SIMDUTF_SPAN
2422
/// Span overload: hands the span's data pointer and element count to the
/// pointer-based convert_valid_utf16_to_latin1. Only `.data()` of the output
/// span is used; its size is not checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_valid_utf16_to_latin1(
    std::span<const char16_t> valid_utf16_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  const char16_t *const src = valid_utf16_input.data();
  const size_t src_len = valid_utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar native-endian kernel, going
  // through the detail::constexpr_cast_* pointer adapters.
  if consteval {
    return scalar::utf16_to_latin1::convert_valid_impl<endianness::NATIVE>(
        detail::constexpr_cast_ptr<uint16_t>(src), src_len,
        detail::constexpr_cast_writeptr<char>(latin1_output.data()));
  }
    #endif
  // Otherwise forward to the pointer overload, viewing the output as char.
  return convert_valid_utf16_to_latin1(
      src, src_len, reinterpret_cast<char *>(latin1_output.data()));
}
2440
  #endif // SIMDUTF_SPAN
2441
2442
/**
2443
 * Convert valid UTF-16LE string into Latin1 string.
2444
 *
2445
 * This function assumes that the input string is valid UTF-16LE and that it can
2446
 * be represented as Latin1. If you violate this assumption, the result is
2447
 * implementation defined and may include system-dependent behavior such as
2448
 * crashes.
2449
 *
2450
 * This function is for expert users only and not part of our public API. Use
2451
 * convert_utf16le_to_latin1 instead. The function may be removed from the
2452
 * library in the future.
2453
 *
2454
 * This function is not BOM-aware.
2455
 *
2456
 * @param input         the UTF-16LE string to convert
2457
 * @param length        the length of the string in 2-byte code units (char16_t)
2458
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2459
 * @return number of written code units; 0 if conversion is not possible
2460
 */
2461
simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
2462
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2463
  #if SIMDUTF_SPAN
2464
/// Span overload: hands the span's data pointer and element count to the
/// pointer-based convert_valid_utf16le_to_latin1. Only `.data()` of the
/// output span is used; its size is not checked here.
simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t
convert_valid_utf16le_to_latin1(
    std::span<const char16_t> valid_utf16_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  const char16_t *const src = valid_utf16_input.data();
  const size_t src_len = valid_utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar little-endian kernel, going
  // through the detail::constexpr_cast_* pointer adapters.
  if consteval {
    return scalar::utf16_to_latin1::convert_valid_impl<endianness::LITTLE>(
        detail::constexpr_cast_ptr<uint16_t>(src), src_len,
        detail::constexpr_cast_writeptr<char>(latin1_output.data()));
  }
    #endif
  // Otherwise forward to the pointer overload, viewing the output as char.
  return convert_valid_utf16le_to_latin1(
      src, src_len, reinterpret_cast<char *>(latin1_output.data()));
}
2482
  #endif // SIMDUTF_SPAN
2483
2484
/**
2485
 * Convert valid UTF-16BE string into Latin1 string.
2486
 *
2487
 * This function assumes that the input string is valid UTF-16BE and that it can
2488
 * be represented as Latin1. If you violate this assumption, the result is
2489
 * implementation defined and may include system-dependent behavior such as
2490
 * crashes.
2491
 *
2492
 * This function is for expert users only and not part of our public API. Use
2493
 * convert_utf16be_to_latin1 instead. The function may be removed from the
2494
 * library in the future.
2495
 *
2496
 * This function is not BOM-aware.
2497
 *
2498
 * @param input         the UTF-16BE string to convert
2499
 * @param length        the length of the string in 2-byte code units (char16_t)
2500
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2501
 * @return number of written code units; 0 if conversion is not possible
2502
 */
2503
simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
2504
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2505
  #if SIMDUTF_SPAN
2506
/// Span overload: hands the span's data pointer and element count to the
/// pointer-based convert_valid_utf16be_to_latin1. Only `.data()` of the
/// output span is used; its size is not checked here.
simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t
convert_valid_utf16be_to_latin1(
    std::span<const char16_t> valid_utf16_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  const char16_t *const src = valid_utf16_input.data();
  const size_t src_len = valid_utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar big-endian kernel, going
  // through the detail::constexpr_cast_* pointer adapters.
  if consteval {
    return scalar::utf16_to_latin1::convert_valid_impl<endianness::BIG>(
        detail::constexpr_cast_ptr<uint16_t>(src), src_len,
        detail::constexpr_cast_writeptr<char>(latin1_output.data()));
  }
    #endif
  // Otherwise forward to the pointer overload, viewing the output as char.
  return convert_valid_utf16be_to_latin1(
      src, src_len, reinterpret_cast<char *>(latin1_output.data()));
}
2524
  #endif // SIMDUTF_SPAN
2525
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2526
2527
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2528
/**
2529
 * Convert valid UTF-16LE string into UTF-8 string.
2530
 *
2531
 * This function assumes that the input string is valid UTF-16LE.
2532
 *
2533
 * This function is not BOM-aware.
2534
 *
2535
 * @param input         the UTF-16LE string to convert
2536
 * @param length        the length of the string in 2-byte code units (char16_t)
2537
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
2538
 * result
2539
 * @return number of written code units; 0 if conversion is not possible
2540
 */
2541
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
2542
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2543
  #if SIMDUTF_SPAN
2544
/// Span overload: hands the span's data pointer and element count to the
/// pointer-based convert_valid_utf16le_to_utf8. Only `.data()` of the output
/// span is used; its size is not checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_valid_utf16le_to_utf8(
    std::span<const char16_t> valid_utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    // The input is explicitly little-endian, so the scalar kernel must be
    // instantiated with LITTLE (not NATIVE): on a big-endian target NATIVE
    // would decode the code units with the wrong byte order during constant
    // evaluation, disagreeing with the runtime path below.
    return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(
        valid_utf16_input.data(), valid_utf16_input.size(), utf8_output.data());
  } else
    #endif
  {
    // Runtime path: forward to the pointer overload, viewing the output as
    // char.
    return convert_valid_utf16le_to_utf8(
        valid_utf16_input.data(), valid_utf16_input.size(),
        reinterpret_cast<char *>(utf8_output.data()));
  }
}
2560
  #endif // SIMDUTF_SPAN
2561
2562
/**
2563
 * Convert valid UTF-16BE string into UTF-8 string.
2564
 *
2565
 * This function assumes that the input string is valid UTF-16BE.
2566
 *
2567
 * This function is not BOM-aware.
2568
 *
2569
 * @param input         the UTF-16BE string to convert
2570
 * @param length        the length of the string in 2-byte code units (char16_t)
2571
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
2572
 * result
2573
 * @return number of written code units; 0 if conversion is not possible
2574
 */
2575
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
2576
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2577
  #if SIMDUTF_SPAN
2578
/// Span overload: hands the span's data pointer and element count to the
/// pointer-based convert_valid_utf16be_to_utf8. Only `.data()` of the output
/// span is used; its size is not checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_valid_utf16be_to_utf8(
    std::span<const char16_t> valid_utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  const char16_t *const src = valid_utf16_input.data();
  const size_t src_len = valid_utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar big-endian kernel.
  if consteval {
    return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(
        src, src_len, utf8_output.data());
  }
    #endif
  // Otherwise forward to the pointer overload, viewing the output as char.
  return convert_valid_utf16be_to_utf8(
      src, src_len, reinterpret_cast<char *>(utf8_output.data()));
}
2594
  #endif // SIMDUTF_SPAN
2595
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2596
2597
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2598
/**
2599
 * Using native endianness, convert possibly broken UTF-16 string into UTF-32
2600
 * string.
2601
 *
2602
 * During the conversion also validation of the input string is done.
2603
 * This function is suitable to work with inputs from untrusted sources.
2604
 *
2605
 * This function is not BOM-aware.
2606
 *
2607
 * @param input         the UTF-16 string to convert
2608
 * @param length        the length of the string in 2-byte code units (char16_t)
2609
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2610
 * @return number of written code units; 0 if input is not a valid UTF-16
2611
 * string
2612
 */
2613
simdutf_warn_unused size_t convert_utf16_to_utf32(
2614
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2615
  #if SIMDUTF_SPAN
2616
/// Span overload: hands the data pointers and the input element count to the
/// pointer-based convert_utf16_to_utf32. The size of the output span is not
/// checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf16_to_utf32(std::span<const char16_t> utf16_input,
                       std::span<char32_t> utf32_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
  char32_t *const dst = utf32_output.data();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar native-endian kernel.
  if consteval {
    return scalar::utf16_to_utf32::convert<endianness::NATIVE>(src, src_len,
                                                               dst);
  }
    #endif
  return convert_utf16_to_utf32(src, src_len, dst);
}
2631
  #endif // SIMDUTF_SPAN
2632
2633
/**
2634
 * Convert possibly broken UTF-16LE string into UTF-32 string.
2635
 *
2636
 * During the conversion also validation of the input string is done.
2637
 * This function is suitable to work with inputs from untrusted sources.
2638
 *
2639
 * This function is not BOM-aware.
2640
 *
2641
 * @param input         the UTF-16LE string to convert
2642
 * @param length        the length of the string in 2-byte code units (char16_t)
2643
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2644
 * @return number of written code units; 0 if input is not a valid UTF-16LE
2645
 * string
2646
 */
2647
simdutf_warn_unused size_t convert_utf16le_to_utf32(
2648
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2649
  #if SIMDUTF_SPAN
2650
/// Span overload: hands the data pointers and the input element count to the
/// pointer-based convert_utf16le_to_utf32. The size of the output span is not
/// checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf16le_to_utf32(std::span<const char16_t> utf16_input,
                         std::span<char32_t> utf32_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
  char32_t *const dst = utf32_output.data();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar little-endian kernel.
  if consteval {
    return scalar::utf16_to_utf32::convert<endianness::LITTLE>(src, src_len,
                                                               dst);
  }
    #endif
  return convert_utf16le_to_utf32(src, src_len, dst);
}
2664
  #endif // SIMDUTF_SPAN
2665
2666
/**
2667
 * Convert possibly broken UTF-16BE string into UTF-32 string.
2668
 *
2669
 * During the conversion also validation of the input string is done.
2670
 * This function is suitable to work with inputs from untrusted sources.
2671
 *
2672
 * This function is not BOM-aware.
2673
 *
2674
 * @param input         the UTF-16BE string to convert
2675
 * @param length        the length of the string in 2-byte code units (char16_t)
2676
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2677
 * @return number of written code units; 0 if input is not a valid UTF-16BE
2678
 * string
2679
 */
2680
simdutf_warn_unused size_t convert_utf16be_to_utf32(
2681
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2682
  #if SIMDUTF_SPAN
2683
/// Span overload: hands the data pointers and the input element count to the
/// pointer-based convert_utf16be_to_utf32. The size of the output span is not
/// checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf16be_to_utf32(std::span<const char16_t> utf16_input,
                         std::span<char32_t> utf32_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
  char32_t *const dst = utf32_output.data();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar big-endian kernel.
  if consteval {
    return scalar::utf16_to_utf32::convert<endianness::BIG>(src, src_len, dst);
  }
    #endif
  return convert_utf16be_to_utf32(src, src_len, dst);
}
2697
  #endif // SIMDUTF_SPAN
2698
2699
/**
2700
 * Using native endianness, convert possibly broken UTF-16 string into
2701
 * UTF-32 string and stop on error.
2702
 *
2703
 * During the conversion also validation of the input string is done.
2704
 * This function is suitable to work with inputs from untrusted sources.
2705
 *
2706
 * This function is not BOM-aware.
2707
 *
2708
 * @param input         the UTF-16 string to convert
2709
 * @param length        the length of the string in 2-byte code units (char16_t)
2710
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2711
 * @return a result pair struct (of type simdutf::result containing the two
2712
 * fields error and count) with an error code and either position of the error
2713
 * (in the input in code units) if any, or the number of char32_t written if
2714
 * successful.
2715
 */
2716
simdutf_warn_unused result convert_utf16_to_utf32_with_errors(
2717
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2718
  #if SIMDUTF_SPAN
2719
/// Span overload: hands the data pointers and the input element count to the
/// pointer-based convert_utf16_to_utf32_with_errors. The size of the output
/// span is not checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
convert_utf16_to_utf32_with_errors(std::span<const char16_t> utf16_input,
                                   std::span<char32_t> utf32_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
  char32_t *const dst = utf32_output.data();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar native-endian kernel.
  if consteval {
    return scalar::utf16_to_utf32::convert_with_errors<endianness::NATIVE>(
        src, src_len, dst);
  }
    #endif
  return convert_utf16_to_utf32_with_errors(src, src_len, dst);
}
2733
  #endif // SIMDUTF_SPAN
2734
2735
/**
2736
 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
2737
 *
2738
 * During the conversion also validation of the input string is done.
2739
 * This function is suitable to work with inputs from untrusted sources.
2740
 *
2741
 * This function is not BOM-aware.
2742
 *
2743
 * @param input         the UTF-16LE string to convert
2744
 * @param length        the length of the string in 2-byte code units (char16_t)
2745
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2746
 * @return a result pair struct (of type simdutf::result containing the two
2747
 * fields error and count) with an error code and either position of the error
2748
 * (in the input in code units) if any, or the number of char32_t written if
2749
 * successful.
2750
 */
2751
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
2752
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2753
  #if SIMDUTF_SPAN
2754
/// Span overload: hands the data pointers and the input element count to the
/// pointer-based convert_utf16le_to_utf32_with_errors. The size of the output
/// span is not checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
convert_utf16le_to_utf32_with_errors(
    std::span<const char16_t> utf16_input,
    std::span<char32_t> utf32_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
  char32_t *const dst = utf32_output.data();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar little-endian kernel.
  if consteval {
    return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
        src, src_len, dst);
  }
    #endif
  return convert_utf16le_to_utf32_with_errors(src, src_len, dst);
}
2769
  #endif // SIMDUTF_SPAN
2770
2771
/**
2772
 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
2773
 *
2774
 * During the conversion also validation of the input string is done.
2775
 * This function is suitable to work with inputs from untrusted sources.
2776
 *
2777
 * This function is not BOM-aware.
2778
 *
2779
 * @param input         the UTF-16BE string to convert
2780
 * @param length        the length of the string in 2-byte code units (char16_t)
2781
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2782
 * @return a result pair struct (of type simdutf::result containing the two
2783
 * fields error and count) with an error code and either position of the error
2784
 * (in the input in code units) if any, or the number of char32_t written if
2785
 * successful.
2786
 */
2787
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
2788
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2789
  #if SIMDUTF_SPAN
2790
/// Span overload: hands the data pointers and the input element count to the
/// pointer-based convert_utf16be_to_utf32_with_errors. The size of the output
/// span is not checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
convert_utf16be_to_utf32_with_errors(
    std::span<const char16_t> utf16_input,
    std::span<char32_t> utf32_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
  char32_t *const dst = utf32_output.data();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar big-endian kernel.
  if consteval {
    return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
        src, src_len, dst);
  }
    #endif
  return convert_utf16be_to_utf32_with_errors(src, src_len, dst);
}
2805
  #endif // SIMDUTF_SPAN
2806
2807
/**
2808
 * Using native endianness, convert valid UTF-16 string into UTF-32 string.
2809
 *
2810
 * This function assumes that the input string is valid UTF-16 (native
2811
 * endianness).
2812
 *
2813
 * This function is not BOM-aware.
2814
 *
2815
 * @param input         the UTF-16 string to convert
2816
 * @param length        the length of the string in 2-byte code units (char16_t)
2817
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
2818
 * result
2819
 * @return number of written code units; 0 if conversion is not possible
2820
 */
2821
simdutf_warn_unused size_t convert_valid_utf16_to_utf32(
2822
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2823
  #if SIMDUTF_SPAN
2824
/// Span overload: hands the data pointers and the input element count to the
/// pointer-based convert_valid_utf16_to_utf32. The size of the output span is
/// not checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_valid_utf16_to_utf32(std::span<const char16_t> valid_utf16_input,
                             std::span<char32_t> utf32_output) noexcept {
  const char16_t *const src = valid_utf16_input.data();
  const size_t src_len = valid_utf16_input.size();
  char32_t *const dst = utf32_output.data();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar native-endian kernel.
  if consteval {
    return scalar::utf16_to_utf32::convert_valid<endianness::NATIVE>(
        src, src_len, dst);
  }
    #endif
  return convert_valid_utf16_to_utf32(src, src_len, dst);
}
2840
  #endif // SIMDUTF_SPAN
2841
2842
/**
2843
 * Convert valid UTF-16LE string into UTF-32 string.
2844
 *
2845
 * This function assumes that the input string is valid UTF-16LE.
2846
 *
2847
 * This function is not BOM-aware.
2848
 *
2849
 * @param input         the UTF-16LE string to convert
2850
 * @param length        the length of the string in 2-byte code units (char16_t)
2851
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
2852
 * result
2853
 * @return number of written code units; 0 if conversion is not possible
2854
 */
2855
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
2856
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2857
  #if SIMDUTF_SPAN
2858
/// Span overload: hands the data pointers and the input element count to the
/// pointer-based convert_valid_utf16le_to_utf32. The size of the output span
/// is not checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_valid_utf16le_to_utf32(std::span<const char16_t> valid_utf16_input,
                               std::span<char32_t> utf32_output) noexcept {
  const char16_t *const src = valid_utf16_input.data();
  const size_t src_len = valid_utf16_input.size();
  char32_t *const dst = utf32_output.data();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar little-endian kernel.
  if consteval {
    return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(
        src, src_len, dst);
  }
    #endif
  return convert_valid_utf16le_to_utf32(src, src_len, dst);
}
2874
  #endif // SIMDUTF_SPAN
2875
2876
/**
2877
 * Convert valid UTF-16BE string into UTF-32 string.
2878
 *
2879
 * This function assumes that the input string is valid UTF-16BE.
2880
 *
2881
 * This function is not BOM-aware.
2882
 *
2883
 * @param input         the UTF-16BE string to convert
2884
 * @param length        the length of the string in 2-byte code units (char16_t)
2885
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
2886
 * result
2887
 * @return number of written code units; 0 if conversion is not possible
2888
 */
2889
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
2890
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2891
  #if SIMDUTF_SPAN
2892
/// Span overload: hands the data pointers and the input element count to the
/// pointer-based convert_valid_utf16be_to_utf32. The size of the output span
/// is not checked here.
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_valid_utf16be_to_utf32(std::span<const char16_t> valid_utf16_input,
                               std::span<char32_t> utf32_output) noexcept {
  const char16_t *const src = valid_utf16_input.data();
  const size_t src_len = valid_utf16_input.size();
  char32_t *const dst = utf32_output.data();
    #if SIMDUTF_CPLUSPLUS23
  // During constant evaluation use the scalar big-endian kernel.
  if consteval {
    return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(
        src, src_len, dst);
  }
    #endif
  return convert_valid_utf16be_to_utf32(src, src_len, dst);
}
2908
  #endif // SIMDUTF_SPAN
2909
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2910
2911
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2912
/**
2913
 * Using native endianness; Compute the number of bytes that this UTF-16
2914
 * string would require in UTF-8 format.
2915
 *
2916
 * This function does not validate the input. It is acceptable to pass invalid
2917
 * UTF-16 strings but in such cases the result is implementation defined.
2918
 *
2919
 * @param input         the UTF-16 string to convert
2920
 * @param length        the length of the string in 2-byte code units (char16_t)
2921
 * @return the number of bytes required to encode the UTF-16 string as UTF-8
2922
 */
2923
simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input,
2924
                                                  size_t length) noexcept;
2925
  #if SIMDUTF_SPAN
2926
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2927
0
utf8_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2928
0
    #if SIMDUTF_CPLUSPLUS23
2929
0
  if consteval {
2930
0
    return scalar::utf16::utf8_length_from_utf16<endianness::NATIVE>(
2931
0
        valid_utf16_input.data(), valid_utf16_input.size());
2932
0
  } else
2933
0
    #endif
2934
0
  {
2935
0
    return utf8_length_from_utf16(valid_utf16_input.data(),
2936
0
                                  valid_utf16_input.size());
2937
0
  }
2938
0
}
2939
  #endif // SIMDUTF_SPAN
2940
2941
/**
2942
 * Using native endianness; compute the number of bytes that this UTF-16
2943
 * string would require in UTF-8 format even when the UTF-16 content contains
2944
 * mismatched surrogates that have to be replaced by the replacement character
2945
 * (0xFFFD).
2946
 *
2947
 * @param input         the UTF-16 string to convert
2948
 * @param length        the length of the string in 2-byte code units (char16_t)
2949
 * @return a result pair struct (of type simdutf::result containing the two
2950
 * fields error and count) where the count is the number of bytes required to
2951
 * encode the UTF-16 string as UTF-8, and the error code is either SUCCESS or
2952
 * SURROGATE. The count is correct regardless of the error field.
2953
 * When SURROGATE is returned, it does not indicate an error in the case of this
2954
 * function: it indicates that at least one surrogate has been encountered: the
2955
 * surrogates may be matched or not (thus this function does not validate). If
2956
 * the returned error code is SUCCESS, then the input contains no surrogate, is
2957
 * in the Basic Multilingual Plane, and is necessarily valid.
2958
 */
2959
simdutf_warn_unused result utf8_length_from_utf16_with_replacement(
2960
    const char16_t *input, size_t length) noexcept;
2961
  #if SIMDUTF_SPAN
2962
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2963
utf8_length_from_utf16_with_replacement(
2964
0
    std::span<const char16_t> valid_utf16_input) noexcept {
2965
0
    #if SIMDUTF_CPLUSPLUS23
2966
0
  if consteval {
2967
0
    return scalar::utf16::utf8_length_from_utf16_with_replacement<
2968
0
        endianness::NATIVE>(valid_utf16_input.data(), valid_utf16_input.size());
2969
0
  } else
2970
0
    #endif
2971
0
  {
2972
0
    return utf8_length_from_utf16_with_replacement(valid_utf16_input.data(),
2973
0
                                                   valid_utf16_input.size());
2974
0
  }
2975
0
}
2976
  #endif // SIMDUTF_SPAN
2977
2978
/**
2979
 * Compute the number of bytes that this UTF-16LE string would require in UTF-8
2980
 * format.
2981
 *
2982
 * This function does not validate the input. It is acceptable to pass invalid
2983
 * UTF-16 strings but in such cases the result is implementation defined.
2984
 *
2985
 * @param input         the UTF-16LE string to convert
2986
 * @param length        the length of the string in 2-byte code units (char16_t)
2987
 * @return the number of bytes required to encode the UTF-16LE string as UTF-8
2988
 */
2989
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input,
2990
                                                    size_t length) noexcept;
2991
  #if SIMDUTF_SPAN
2992
simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t
2993
0
utf8_length_from_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
2994
0
    #if SIMDUTF_CPLUSPLUS23
2995
0
  if consteval {
2996
0
    return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(
2997
0
        valid_utf16_input.data(), valid_utf16_input.size());
2998
0
  } else
2999
0
    #endif
3000
0
  {
3001
0
    return utf8_length_from_utf16le(valid_utf16_input.data(),
3002
0
                                    valid_utf16_input.size());
3003
0
  }
3004
0
}
3005
  #endif // SIMDUTF_SPAN
3006
3007
/**
3008
 * Compute the number of bytes that this UTF-16BE string would require in UTF-8
3009
 * format.
3010
 *
3011
 * This function does not validate the input. It is acceptable to pass invalid
3012
 * UTF-16 strings but in such cases the result is implementation defined.
3013
 *
3014
 * @param input         the UTF-16BE string to convert
3015
 * @param length        the length of the string in 2-byte code units (char16_t)
3016
 * @return the number of bytes required to encode the UTF-16BE string as UTF-8
3017
 */
3018
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input,
3019
                                                    size_t length) noexcept;
3020
  #if SIMDUTF_SPAN
3021
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3022
0
utf8_length_from_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
3023
0
    #if SIMDUTF_CPLUSPLUS23
3024
0
  if consteval {
3025
0
    return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(
3026
0
        valid_utf16_input.data(), valid_utf16_input.size());
3027
0
  } else
3028
0
    #endif
3029
0
  {
3030
0
    return utf8_length_from_utf16be(valid_utf16_input.data(),
3031
0
                                    valid_utf16_input.size());
3032
0
  }
3033
0
}
3034
  #endif // SIMDUTF_SPAN
3035
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
3036
3037
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3038
/**
3039
 * Convert possibly broken UTF-32 string into UTF-8 string.
3040
 *
3041
 * During the conversion also validation of the input string is done.
3042
 * This function is suitable to work with inputs from untrusted sources.
3043
 *
3044
 * This function is not BOM-aware.
3045
 *
3046
 * @param input         the UTF-32 string to convert
3047
 * @param length        the length of the string in 4-byte code units (char32_t)
3048
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
3049
 * @return number of written code units; 0 if input is not a valid UTF-32 string
3050
 */
3051
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *input,
3052
                                                 size_t length,
3053
                                                 char *utf8_buffer) noexcept;
3054
  #if SIMDUTF_SPAN
3055
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3056
convert_utf32_to_utf8(
3057
    std::span<const char32_t> utf32_input,
3058
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
3059
    #if SIMDUTF_CPLUSPLUS23
3060
  if consteval {
3061
    return scalar::utf32_to_utf8::convert(
3062
        utf32_input.data(), utf32_input.size(), utf8_output.data());
3063
  } else
3064
    #endif
3065
  {
3066
    return convert_utf32_to_utf8(utf32_input.data(), utf32_input.size(),
3067
                                 reinterpret_cast<char *>(utf8_output.data()));
3068
  }
3069
}
3070
  #endif // SIMDUTF_SPAN
3071
3072
/**
3073
 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
3074
 *
3075
 * During the conversion also validation of the input string is done.
3076
 * This function is suitable to work with inputs from untrusted sources.
3077
 *
3078
 * This function is not BOM-aware.
3079
 *
3080
 * @param input         the UTF-32 string to convert
3081
 * @param length        the length of the string in 4-byte code units (char32_t)
3082
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
3083
 * @return a result pair struct (of type simdutf::result containing the two
3084
 * fields error and count) with an error code and either position of the error
3085
 * (in the input in code units) if any, or the number of char written if
3086
 * successful.
3087
 */
3088
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
3089
    const char32_t *input, size_t length, char *utf8_buffer) noexcept;
3090
  #if SIMDUTF_SPAN
3091
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
3092
convert_utf32_to_utf8_with_errors(
3093
    std::span<const char32_t> utf32_input,
3094
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
3095
    #if SIMDUTF_CPLUSPLUS23
3096
  if consteval {
3097
    return scalar::utf32_to_utf8::convert_with_errors(
3098
        utf32_input.data(), utf32_input.size(), utf8_output.data());
3099
  } else
3100
    #endif
3101
  {
3102
    return convert_utf32_to_utf8_with_errors(
3103
        utf32_input.data(), utf32_input.size(),
3104
        reinterpret_cast<char *>(utf8_output.data()));
3105
  }
3106
}
3107
  #endif // SIMDUTF_SPAN
3108
3109
/**
3110
 * Convert valid UTF-32 string into UTF-8 string.
3111
 *
3112
 * This function assumes that the input string is valid UTF-32.
3113
 *
3114
 * This function is not BOM-aware.
3115
 *
3116
 * @param input         the UTF-32 string to convert
3117
 * @param length        the length of the string in 4-byte code units (char32_t)
3118
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
3119
 * result
3120
 * @return number of written code units; 0 if conversion is not possible
3121
 */
3122
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
3123
    const char32_t *input, size_t length, char *utf8_buffer) noexcept;
3124
  #if SIMDUTF_SPAN
3125
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3126
convert_valid_utf32_to_utf8(
3127
    std::span<const char32_t> valid_utf32_input,
3128
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
3129
    #if SIMDUTF_CPLUSPLUS23
3130
  if consteval {
3131
    return scalar::utf32_to_utf8::convert_valid(
3132
        valid_utf32_input.data(), valid_utf32_input.size(), utf8_output.data());
3133
  } else
3134
    #endif
3135
  {
3136
    return convert_valid_utf32_to_utf8(
3137
        valid_utf32_input.data(), valid_utf32_input.size(),
3138
        reinterpret_cast<char *>(utf8_output.data()));
3139
  }
3140
}
3141
  #endif // SIMDUTF_SPAN
3142
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3143
3144
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3145
/**
3146
 * Using native endianness, convert possibly broken UTF-32 string into a UTF-16
3147
 * string.
3148
 *
3149
 * During the conversion also validation of the input string is done.
3150
 * This function is suitable to work with inputs from untrusted sources.
3151
 *
3152
 * This function is not BOM-aware.
3153
 *
3154
 * @param input         the UTF-32 string to convert
3155
 * @param length        the length of the string in 4-byte code units (char32_t)
3156
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3157
 * @return number of written code units; 0 if input is not a valid UTF-32 string
3158
 */
3159
simdutf_warn_unused size_t convert_utf32_to_utf16(
3160
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3161
  #if SIMDUTF_SPAN
3162
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3163
convert_utf32_to_utf16(std::span<const char32_t> utf32_input,
3164
0
                       std::span<char16_t> utf16_output) noexcept {
3165
0
    #if SIMDUTF_CPLUSPLUS23
3166
0
  if consteval {
3167
0
    return scalar::utf32_to_utf16::convert<endianness::NATIVE>(
3168
0
        utf32_input.data(), utf32_input.size(), utf16_output.data());
3169
0
  } else
3170
0
    #endif
3171
0
  {
3172
0
    return convert_utf32_to_utf16(utf32_input.data(), utf32_input.size(),
3173
0
                                  utf16_output.data());
3174
0
  }
3175
0
}
3176
  #endif // SIMDUTF_SPAN
3177
3178
/**
3179
 * Convert possibly broken UTF-32 string into UTF-16LE string.
3180
 *
3181
 * During the conversion also validation of the input string is done.
3182
 * This function is suitable to work with inputs from untrusted sources.
3183
 *
3184
 * This function is not BOM-aware.
3185
 *
3186
 * @param input         the UTF-32 string to convert
3187
 * @param length        the length of the string in 4-byte code units (char32_t)
3188
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3189
 * @return number of written code units; 0 if input is not a valid UTF-32 string
3190
 */
3191
simdutf_warn_unused size_t convert_utf32_to_utf16le(
3192
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3193
  #if SIMDUTF_SPAN
3194
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3195
convert_utf32_to_utf16le(std::span<const char32_t> utf32_input,
3196
0
                         std::span<char16_t> utf16_output) noexcept {
3197
0
    #if SIMDUTF_CPLUSPLUS23
3198
0
  if consteval {
3199
0
    return scalar::utf32_to_utf16::convert<endianness::LITTLE>(
3200
0
        utf32_input.data(), utf32_input.size(), utf16_output.data());
3201
0
  } else
3202
0
    #endif
3203
0
  {
3204
0
    return convert_utf32_to_utf16le(utf32_input.data(), utf32_input.size(),
3205
0
                                    utf16_output.data());
3206
0
  }
3207
0
}
3208
  #endif // SIMDUTF_SPAN
3209
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3210
3211
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
3212
/**
3213
 * Convert possibly broken UTF-32 string into Latin1 string.
3214
 *
3215
 * During the conversion also validation of the input string is done.
3216
 * This function is suitable to work with inputs from untrusted sources.
3217
 *
3218
 * This function is not BOM-aware.
3219
 *
3220
 * @param input         the UTF-32 string to convert
3221
 * @param length        the length of the string in 4-byte code units (char32_t)
3222
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
3223
 * @return number of written code units; 0 if input is not a valid UTF-32 string
3224
 * or if it cannot be represented as Latin1
3225
 */
3226
simdutf_warn_unused size_t convert_utf32_to_latin1(
3227
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
3228
  #if SIMDUTF_SPAN
3229
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3230
convert_utf32_to_latin1(
3231
    std::span<const char32_t> utf32_input,
3232
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
3233
    #if SIMDUTF_CPLUSPLUS23
3234
  if consteval {
3235
    return scalar::utf32_to_latin1::convert(
3236
        utf32_input.data(), utf32_input.size(), latin1_output.data());
3237
  } else
3238
    #endif
3239
  {
3240
    return convert_utf32_to_latin1(
3241
        utf32_input.data(), utf32_input.size(),
3242
        reinterpret_cast<char *>(latin1_output.data()));
3243
  }
3244
}
3245
  #endif // SIMDUTF_SPAN
3246
3247
/**
3248
 * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
3249
 * If the string cannot be represented as Latin1, an error is returned.
3250
 *
3251
 * During the conversion also validation of the input string is done.
3252
 * This function is suitable to work with inputs from untrusted sources.
3253
 *
3254
 * This function is not BOM-aware.
3255
 *
3256
 * @param input         the UTF-32 string to convert
3257
 * @param length        the length of the string in 4-byte code units (char32_t)
3258
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
3259
 * @return a result pair struct (of type simdutf::result containing the two
3260
 * fields error and count) with an error code and either position of the error
3261
 * (in the input in code units) if any, or the number of char written if
3262
 * successful.
3263
 */
3264
simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
3265
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
3266
  #if SIMDUTF_SPAN
3267
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
3268
convert_utf32_to_latin1_with_errors(
3269
    std::span<const char32_t> utf32_input,
3270
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
3271
    #if SIMDUTF_CPLUSPLUS23
3272
  if consteval {
3273
    return scalar::utf32_to_latin1::convert_with_errors(
3274
        utf32_input.data(), utf32_input.size(), latin1_output.data());
3275
  } else
3276
    #endif
3277
  {
3278
    return convert_utf32_to_latin1_with_errors(
3279
        utf32_input.data(), utf32_input.size(),
3280
        reinterpret_cast<char *>(latin1_output.data()));
3281
  }
3282
}
3283
  #endif // SIMDUTF_SPAN
3284
3285
/**
3286
 * Convert valid UTF-32 string into Latin1 string.
3287
 *
3288
 * This function assumes that the input string is valid UTF-32 and that it can
3289
 * be represented as Latin1. If you violate this assumption, the result is
3290
 * implementation defined and may include system-dependent behavior such as
3291
 * crashes.
3292
 *
3293
 * This function is for expert users only and not part of our public API. Use
3294
 * convert_utf32_to_latin1 instead. The function may be removed from the library
3295
 * in the future.
3296
 *
3297
 * This function is not BOM-aware.
3298
 *
3299
 * @param input         the UTF-32 string to convert
3300
 * @param length        the length of the string in 4-byte code units (char32_t)
3301
 * @param latin1_buffer   the pointer to a buffer that can hold the conversion
3302
 * result
3303
 * @return number of written code units; 0 if conversion is not possible
3304
 */
3305
simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
3306
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
3307
  #if SIMDUTF_SPAN
3308
simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t
3309
convert_valid_utf32_to_latin1(
3310
    std::span<const char32_t> valid_utf32_input,
3311
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
3312
    #if SIMDUTF_CPLUSPLUS23
3313
  if consteval { // constant evaluation: use the scalar implementation
3314
    return scalar::utf32_to_latin1::convert_valid(
3315
        detail::constexpr_cast_ptr<uint32_t>(valid_utf32_input.data()),
3316
        valid_utf32_input.size(),
3317
        detail::constexpr_cast_writeptr<char>(latin1_output.data()));
3318
  } else // runtime only: forward to the pointer overload, as sibling overloads do
3319
    #endif
3320
  {
3321
    return convert_valid_utf32_to_latin1(
3322
        valid_utf32_input.data(), valid_utf32_input.size(),
3323
        reinterpret_cast<char *>(latin1_output.data()));
3324
  }
3325
}
3326
  #endif // SIMDUTF_SPAN
3327
3328
/**
3329
 * Compute the number of bytes that this UTF-32 string would require in Latin1
3330
 * format.
3331
 *
3332
 * This function does not validate the input. It is acceptable to pass invalid
3333
 * UTF-32 strings but in such cases the result is implementation defined.
3334
 *
3335
 * This function is not BOM-aware.
3336
 *
3337
 * @param length        the length of the string in 4-byte code units (char32_t)
3338
 * @return the number of bytes required to encode the UTF-32 string as Latin1
3339
 */
3340
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 size_t
3341
latin1_length_from_utf32(size_t length) noexcept {
3342
  return length;
3343
}
3344
3345
/**
3346
 * Compute the number of 4-byte code units (char32_t) that this Latin1 string
3347
 * would require in UTF-32 format.
3348
 *
3349
 * @param length        the length of the string in Latin1 code units (char)
3350
 * @return the length of the string in 4-byte code units (char32_t) required to
3351
 * encode the Latin1 string as UTF-32
3352
 */
3353
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 size_t
3354
0
utf32_length_from_latin1(size_t length) noexcept {
3355
0
  return length;
3356
0
}
3357
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
3358
3359
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3360
/**
3361
 * Convert possibly broken UTF-32 string into UTF-16BE string.
3362
 *
3363
 * During the conversion also validation of the input string is done.
3364
 * This function is suitable to work with inputs from untrusted sources.
3365
 *
3366
 * This function is not BOM-aware.
3367
 *
3368
 * @param input         the UTF-32 string to convert
3369
 * @param length        the length of the string in 4-byte code units (char32_t)
3370
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3371
 * @return number of written code units; 0 if input is not a valid UTF-32 string
3372
 */
3373
simdutf_warn_unused size_t convert_utf32_to_utf16be(
3374
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3375
  #if SIMDUTF_SPAN
3376
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3377
convert_utf32_to_utf16be(std::span<const char32_t> utf32_input,
3378
0
                         std::span<char16_t> utf16_output) noexcept {
3379
0
    #if SIMDUTF_CPLUSPLUS23
3380
0
  if consteval {
3381
0
    return scalar::utf32_to_utf16::convert<endianness::BIG>(
3382
0
        utf32_input.data(), utf32_input.size(), utf16_output.data());
3383
0
  } else
3384
0
    #endif
3385
0
  {
3386
0
    return convert_utf32_to_utf16be(utf32_input.data(), utf32_input.size(),
3387
0
                                    utf16_output.data());
3388
0
  }
3389
0
}
3390
  #endif // SIMDUTF_SPAN
3391
3392
/**
3393
 * Using native endianness, convert possibly broken UTF-32 string into UTF-16
3394
 * string and stop on error.
3395
 *
3396
 * During the conversion also validation of the input string is done.
3397
 * This function is suitable to work with inputs from untrusted sources.
3398
 *
3399
 * This function is not BOM-aware.
3400
 *
3401
 * @param input         the UTF-32 string to convert
3402
 * @param length        the length of the string in 4-byte code units (char32_t)
3403
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3404
 * @return a result pair struct (of type simdutf::result containing the two
3405
 * fields error and count) with an error code and either position of the error
3406
 * (in the input in code units) if any, or the number of char16_t written if
3407
 * successful.
3408
 */
3409
simdutf_warn_unused result convert_utf32_to_utf16_with_errors(
3410
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3411
  #if SIMDUTF_SPAN
3412
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
3413
convert_utf32_to_utf16_with_errors(std::span<const char32_t> utf32_input,
3414
0
                                   std::span<char16_t> utf16_output) noexcept {
3415
0
    #if SIMDUTF_CPLUSPLUS23
3416
0
  if consteval {
3417
0
    return scalar::utf32_to_utf16::convert_with_errors<endianness::NATIVE>(
3418
0
        utf32_input.data(), utf32_input.size(), utf16_output.data());
3419
0
  } else
3420
0
    #endif
3421
0
  {
3422
0
    return convert_utf32_to_utf16_with_errors(
3423
0
        utf32_input.data(), utf32_input.size(), utf16_output.data());
3424
0
  }
3425
0
}
3426
  #endif // SIMDUTF_SPAN
3427
3428
/**
3429
 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
3430
 *
3431
 * During the conversion also validation of the input string is done.
3432
 * This function is suitable to work with inputs from untrusted sources.
3433
 *
3434
 * This function is not BOM-aware.
3435
 *
3436
 * @param input         the UTF-32 string to convert
3437
 * @param length        the length of the string in 4-byte code units (char32_t)
3438
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3439
 * @return a result pair struct (of type simdutf::result containing the two
3440
 * fields error and count) with an error code and either position of the error
3441
 * (in the input in code units) if any, or the number of char16_t written if
3442
 * successful.
3443
 */
3444
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
3445
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3446
  #if SIMDUTF_SPAN
3447
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
3448
convert_utf32_to_utf16le_with_errors(
3449
    std::span<const char32_t> utf32_input,
3450
0
    std::span<char16_t> utf16_output) noexcept {
3451
0
    #if SIMDUTF_CPLUSPLUS23
3452
0
  if consteval {
3453
0
    return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
3454
0
        utf32_input.data(), utf32_input.size(), utf16_output.data());
3455
0
  } else
3456
0
    #endif
3457
0
  {
3458
0
    return convert_utf32_to_utf16le_with_errors(
3459
0
        utf32_input.data(), utf32_input.size(), utf16_output.data());
3460
0
  }
3461
0
}
3462
  #endif // SIMDUTF_SPAN
3463
3464
/**
3465
 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
3466
 *
3467
 * During the conversion also validation of the input string is done.
3468
 * This function is suitable to work with inputs from untrusted sources.
3469
 *
3470
 * This function is not BOM-aware.
3471
 *
3472
 * @param input         the UTF-32 string to convert
3473
 * @param length        the length of the string in 4-byte code units (char32_t)
3474
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3475
 * @return a result pair struct (of type simdutf::result containing the two
3476
 * fields error and count) with an error code and either position of the error
3477
 * (in the input in code units) if any, or the number of char16_t written if
3478
 * successful.
3479
 */
3480
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
3481
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3482
  #if SIMDUTF_SPAN
3483
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
3484
convert_utf32_to_utf16be_with_errors(
3485
    std::span<const char32_t> utf32_input,
3486
0
    std::span<char16_t> utf16_output) noexcept {
3487
0
    #if SIMDUTF_CPLUSPLUS23
3488
0
  if consteval {
3489
0
    return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
3490
0
        utf32_input.data(), utf32_input.size(), utf16_output.data());
3491
0
  } else
3492
0
    #endif
3493
0
  {
3494
0
    return convert_utf32_to_utf16be_with_errors(
3495
0
        utf32_input.data(), utf32_input.size(), utf16_output.data());
3496
0
  }
3497
0
}
3498
  #endif // SIMDUTF_SPAN
3499
3500
/**
3501
 * Using native endianness, convert valid UTF-32 string into a UTF-16 string.
3502
 *
3503
 * This function assumes that the input string is valid UTF-32.
3504
 *
3505
 * This function is not BOM-aware.
3506
 *
3507
 * @param input         the UTF-32 string to convert
3508
 * @param length        the length of the string in 4-byte code units (char32_t)
3509
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
3510
 * result
3511
 * @return number of written code units; 0 if conversion is not possible
3512
 */
3513
simdutf_warn_unused size_t convert_valid_utf32_to_utf16(
3514
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3515
  #if SIMDUTF_SPAN
3516
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3517
convert_valid_utf32_to_utf16(std::span<const char32_t> valid_utf32_input,
3518
0
                             std::span<char16_t> utf16_output) noexcept {
3519
0
3520
0
    #if SIMDUTF_CPLUSPLUS23
3521
0
  if consteval {
3522
0
    return scalar::utf32_to_utf16::convert_valid<endianness::NATIVE>(
3523
0
        valid_utf32_input.data(), valid_utf32_input.size(),
3524
0
        utf16_output.data());
3525
0
  } else
3526
0
    #endif
3527
0
  {
3528
0
    return convert_valid_utf32_to_utf16(valid_utf32_input.data(),
3529
0
                                        valid_utf32_input.size(),
3530
0
                                        utf16_output.data());
3531
0
  }
3532
0
}
3533
  #endif // SIMDUTF_SPAN
3534
3535
/**
3536
 * Convert valid UTF-32 string into UTF-16LE string.
3537
 *
3538
 * This function assumes that the input string is valid UTF-32.
3539
 *
3540
 * This function is not BOM-aware.
3541
 *
3542
 * @param input         the UTF-32 string to convert
3543
 * @param length        the length of the string in 4-byte code units (char32_t)
3544
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
3545
 * result
3546
 * @return number of written code units; 0 if conversion is not possible
3547
 */
3548
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
3549
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3550
  #if SIMDUTF_SPAN
3551
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3552
convert_valid_utf32_to_utf16le(std::span<const char32_t> valid_utf32_input,
3553
0
                               std::span<char16_t> utf16_output) noexcept {
3554
0
    #if SIMDUTF_CPLUSPLUS23
3555
0
  if consteval {
3556
0
    return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(
3557
0
        valid_utf32_input.data(), valid_utf32_input.size(),
3558
0
        utf16_output.data());
3559
0
  } else
3560
0
    #endif
3561
0
  {
3562
0
    return convert_valid_utf32_to_utf16le(valid_utf32_input.data(),
3563
0
                                          valid_utf32_input.size(),
3564
0
                                          utf16_output.data());
3565
0
  }
3566
0
}
3567
  #endif // SIMDUTF_SPAN
3568
3569
/**
3570
 * Convert valid UTF-32 string into UTF-16BE string.
3571
 *
3572
 * This function assumes that the input string is valid UTF-32.
3573
 *
3574
 * This function is not BOM-aware.
3575
 *
3576
 * @param input         the UTF-32 string to convert
3577
 * @param length        the length of the string in 4-byte code units (char32_t)
3578
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
3579
 * result
3580
 * @return number of written code units; 0 if conversion is not possible
3581
 */
3582
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
3583
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3584
  #if SIMDUTF_SPAN
3585
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3586
convert_valid_utf32_to_utf16be(std::span<const char32_t> valid_utf32_input,
3587
0
                               std::span<char16_t> utf16_output) noexcept {
3588
0
    #if SIMDUTF_CPLUSPLUS23
3589
0
  if consteval {
3590
0
    return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(
3591
0
        valid_utf32_input.data(), valid_utf32_input.size(),
3592
0
        utf16_output.data());
3593
0
  } else
3594
0
    #endif
3595
0
  {
3596
0
    return convert_valid_utf32_to_utf16be(valid_utf32_input.data(),
3597
0
                                          valid_utf32_input.size(),
3598
0
                                          utf16_output.data());
3599
0
  }
3600
0
}
3601
  #endif // SIMDUTF_SPAN
3602
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3603
3604
#if SIMDUTF_FEATURE_UTF16
3605
/**
3606
 * Change the endianness of the input. Can be used to go from UTF-16LE to
3607
 * UTF-16BE or from UTF-16BE to UTF-16LE.
3608
 *
3609
 * This function does not validate the input.
3610
 *
3611
 * This function is not BOM-aware.
3612
 *
3613
 * @param input         the UTF-16 string to process
3614
 * @param length        the length of the string in 2-byte code units (char16_t)
3615
 * @param output        the pointer to a buffer that can hold the conversion
3616
 * result
3617
 */
3618
void change_endianness_utf16(const char16_t *input, size_t length,
3619
                             char16_t *output) noexcept;
3620
  #if SIMDUTF_SPAN
3621
simdutf_really_inline simdutf_constexpr23 void
3622
change_endianness_utf16(std::span<const char16_t> utf16_input,
3623
0
                        std::span<char16_t> utf16_output) noexcept {
3624
0
    #if SIMDUTF_CPLUSPLUS23
3625
0
  if consteval {
3626
0
    return scalar::utf16::change_endianness_utf16(
3627
0
        utf16_input.data(), utf16_input.size(), utf16_output.data());
3628
0
  } else
3629
0
    #endif
3630
0
  {
3631
0
    return change_endianness_utf16(utf16_input.data(), utf16_input.size(),
3632
0
                                   utf16_output.data());
3633
0
  }
3634
0
}
3635
  #endif // SIMDUTF_SPAN
3636
#endif   // SIMDUTF_FEATURE_UTF16
3637
3638
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3639
/**
3640
 * Compute the number of bytes that this UTF-32 string would require in UTF-8
3641
 * format.
3642
 *
3643
 * This function does not validate the input. It is acceptable to pass invalid
3644
 * UTF-32 strings but in such cases the result is implementation defined.
3645
 *
3646
 * @param input         the UTF-32 string to convert
3647
 * @param length        the length of the string in 4-byte code units (char32_t)
3648
 * @return the number of bytes required to encode the UTF-32 string as UTF-8
3649
 */
3650
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input,
3651
                                                  size_t length) noexcept;
3652
  #if SIMDUTF_SPAN
3653
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3654
0
utf8_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
3655
0
    #if SIMDUTF_CPLUSPLUS23
3656
0
  if consteval {
3657
0
    return scalar::utf32::utf8_length_from_utf32(valid_utf32_input.data(),
3658
0
                                                 valid_utf32_input.size());
3659
0
  } else
3660
0
    #endif
3661
0
  {
3662
0
    return utf8_length_from_utf32(valid_utf32_input.data(),
3663
0
                                  valid_utf32_input.size());
3664
0
  }
3665
0
}
3666
  #endif // SIMDUTF_SPAN
3667
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3668
3669
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3670
/**
3671
 * Compute the number of two-byte code units that this UTF-32 string would
3672
 * require in UTF-16 format.
3673
 *
3674
 * This function does not validate the input. It is acceptable to pass invalid
3675
 * UTF-32 strings but in such cases the result is implementation defined.
3676
 *
3677
 * @param input         the UTF-32 string to convert
3678
 * @param length        the length of the string in 4-byte code units (char32_t)
3679
 * @return the number of 2-byte code units needed to encode the input as UTF-16
3680
 */
3681
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input,
3682
                                                   size_t length) noexcept;
3683
  #if SIMDUTF_SPAN
3684
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3685
0
utf16_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
3686
0
    #if SIMDUTF_CPLUSPLUS23
3687
0
  if consteval {
3688
0
    return scalar::utf32::utf16_length_from_utf32(valid_utf32_input.data(),
3689
0
                                                  valid_utf32_input.size());
3690
0
  } else
3691
0
    #endif
3692
0
  {
3693
0
    return utf16_length_from_utf32(valid_utf32_input.data(),
3694
0
                                   valid_utf32_input.size());
3695
0
  }
3696
0
}
3697
  #endif // SIMDUTF_SPAN
3698
3699
/**
3700
 * Using native endianness, compute the number of 4-byte code units (char32_t)
3701
 * that this UTF-16 string would require in UTF-32 format.
3702
 *
3703
 * This function is equivalent to count_utf16.
3704
 *
3705
 * This function does not validate the input. It is acceptable to pass invalid
3706
 * UTF-16 strings but in such cases the result is implementation defined.
3707
 *
3708
 * This function is not BOM-aware.
3709
 *
3710
 * @param input         the UTF-16 string to convert
3711
 * @param length        the length of the string in 2-byte code units (char16_t)
3712
 * @return the number of 4-byte code units needed to encode the input as UTF-32
3713
 */
3714
simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input,
3715
                                                   size_t length) noexcept;
3716
  #if SIMDUTF_SPAN
3717
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3718
0
utf32_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
3719
0
    #if SIMDUTF_CPLUSPLUS23
3720
0
  if consteval {
3721
0
    return scalar::utf16::utf32_length_from_utf16<endianness::NATIVE>(
3722
0
        valid_utf16_input.data(), valid_utf16_input.size());
3723
0
  } else
3724
0
    #endif
3725
0
  {
3726
0
    return utf32_length_from_utf16(valid_utf16_input.data(),
3727
0
                                   valid_utf16_input.size());
3728
0
  }
3729
0
}
3730
  #endif // SIMDUTF_SPAN
3731
3732
/**
3733
 * Compute the number of 4-byte code units (char32_t) that this UTF-16LE string
3734
 * would require in UTF-32 format.
3735
 *
3736
 * This function is equivalent to count_utf16le.
3737
 *
3738
 * This function does not validate the input. It is acceptable to pass invalid
3739
 * UTF-16 strings but in such cases the result is implementation defined.
3740
 *
3741
 * This function is not BOM-aware.
3742
 *
3743
 * @param input         the UTF-16LE string to convert
3744
 * @param length        the length of the string in 2-byte code units (char16_t)
3745
 * @return the number of 4-byte code units needed to encode the input as UTF-32
3746
 */
3747
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input,
3748
                                                     size_t length) noexcept;
3749
  #if SIMDUTF_SPAN
3750
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3751
utf32_length_from_utf16le(
3752
0
    std::span<const char16_t> valid_utf16_input) noexcept {
3753
0
    #if SIMDUTF_CPLUSPLUS23
3754
0
  if consteval {
3755
0
    return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(
3756
0
        valid_utf16_input.data(), valid_utf16_input.size());
3757
0
  } else
3758
0
    #endif
3759
0
  {
3760
0
    return utf32_length_from_utf16le(valid_utf16_input.data(),
3761
0
                                     valid_utf16_input.size());
3762
0
  }
3763
0
}
3764
  #endif // SIMDUTF_SPAN
3765
3766
/**
3767
 * Compute the number of 4-byte code units (char32_t) that this UTF-16BE string
3768
 * would require in UTF-32 format.
3769
 *
3770
 * This function is equivalent to count_utf16be.
3771
 *
3772
 * This function does not validate the input. It is acceptable to pass invalid
3773
 * UTF-16 strings but in such cases the result is implementation defined.
3774
 *
3775
 * This function is not BOM-aware.
3776
 *
3777
 * @param input         the UTF-16BE string to convert
3778
 * @param length        the length of the string in 2-byte code units (char16_t)
3779
 * @return the number of 4-byte code units needed to encode the input as UTF-32
3780
 */
3781
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input,
3782
                                                     size_t length) noexcept;
3783
  #if SIMDUTF_SPAN
3784
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3785
utf32_length_from_utf16be(
3786
0
    std::span<const char16_t> valid_utf16_input) noexcept {
3787
0
    #if SIMDUTF_CPLUSPLUS23
3788
0
  if consteval {
3789
0
    return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(
3790
0
        valid_utf16_input.data(), valid_utf16_input.size());
3791
0
  } else
3792
0
    #endif
3793
0
  {
3794
0
    return utf32_length_from_utf16be(valid_utf16_input.data(),
3795
0
                                     valid_utf16_input.size());
3796
0
  }
3797
0
}
3798
  #endif // SIMDUTF_SPAN
3799
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3800
3801
#if SIMDUTF_FEATURE_UTF16
3802
/**
3803
 * Count the number of code points (characters) in the string assuming that
3804
 * it is valid.
3805
 *
3806
 * This function assumes that the input string is valid UTF-16 (native
3807
 * endianness). It is acceptable to pass invalid UTF-16 strings but in such
3808
 * cases the result is implementation defined.
3809
 *
3810
 * This function is not BOM-aware.
3811
 *
3812
 * @param input         the UTF-16 string to process
3813
 * @param length        the length of the string in 2-byte code units (char16_t)
3814
 * @return number of code points
3815
 */
3816
simdutf_warn_unused size_t count_utf16(const char16_t *input,
3817
                                       size_t length) noexcept;
3818
  #if SIMDUTF_SPAN
3819
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3820
0
count_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
3821
0
    #if SIMDUTF_CPLUSPLUS23
3822
0
  if consteval {
3823
0
    return scalar::utf16::count_code_points<endianness::NATIVE>(
3824
0
        valid_utf16_input.data(), valid_utf16_input.size());
3825
0
  } else
3826
0
    #endif
3827
0
  {
3828
0
    return count_utf16(valid_utf16_input.data(), valid_utf16_input.size());
3829
0
  }
3830
0
}
3831
  #endif // SIMDUTF_SPAN
3832
3833
/**
3834
 * Count the number of code points (characters) in the string assuming that
3835
 * it is valid.
3836
 *
3837
 * This function assumes that the input string is valid UTF-16LE.
3838
 * It is acceptable to pass invalid UTF-16 strings but in such cases
3839
 * the result is implementation defined.
3840
 *
3841
 * This function is not BOM-aware.
3842
 *
3843
 * @param input         the UTF-16LE string to process
3844
 * @param length        the length of the string in 2-byte code units (char16_t)
3845
 * @return number of code points
3846
 */
3847
simdutf_warn_unused size_t count_utf16le(const char16_t *input,
3848
                                         size_t length) noexcept;
3849
  #if SIMDUTF_SPAN
3850
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3851
0
count_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
3852
0
    #if SIMDUTF_CPLUSPLUS23
3853
0
  if consteval {
3854
0
    return scalar::utf16::count_code_points<endianness::LITTLE>(
3855
0
        valid_utf16_input.data(), valid_utf16_input.size());
3856
0
  } else
3857
0
    #endif
3858
0
  {
3859
0
    return count_utf16le(valid_utf16_input.data(), valid_utf16_input.size());
3860
0
  }
3861
0
}
3862
  #endif // SIMDUTF_SPAN
3863
3864
/**
3865
 * Count the number of code points (characters) in the string assuming that
3866
 * it is valid.
3867
 *
3868
 * This function assumes that the input string is valid UTF-16BE.
3869
 * It is acceptable to pass invalid UTF-16 strings but in such cases
3870
 * the result is implementation defined.
3871
 *
3872
 * This function is not BOM-aware.
3873
 *
3874
 * @param input         the UTF-16BE string to process
3875
 * @param length        the length of the string in 2-byte code units (char16_t)
3876
 * @return number of code points
3877
 */
3878
simdutf_warn_unused size_t count_utf16be(const char16_t *input,
3879
                                         size_t length) noexcept;
3880
  #if SIMDUTF_SPAN
3881
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3882
0
count_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
3883
0
    #if SIMDUTF_CPLUSPLUS23
3884
0
  if consteval {
3885
0
    return scalar::utf16::count_code_points<endianness::BIG>(
3886
0
        valid_utf16_input.data(), valid_utf16_input.size());
3887
0
  } else
3888
0
    #endif
3889
0
  {
3890
0
    return count_utf16be(valid_utf16_input.data(), valid_utf16_input.size());
3891
0
  }
3892
0
}
3893
  #endif // SIMDUTF_SPAN
3894
#endif   // SIMDUTF_FEATURE_UTF16
3895
3896
#if SIMDUTF_FEATURE_UTF8
3897
/**
3898
 * Count the number of code points (characters) in the string assuming that
3899
 * it is valid.
3900
 *
3901
 * This function assumes that the input string is valid UTF-8.
3902
 * It is acceptable to pass invalid UTF-8 strings but in such cases
3903
 * the result is implementation defined.
3904
 *
3905
 * @param input         the UTF-8 string to process
3906
 * @param length        the length of the string in bytes
3907
 * @return number of code points
3908
 */
3909
simdutf_warn_unused size_t count_utf8(const char *input,
3910
                                      size_t length) noexcept;
3911
  #if SIMDUTF_SPAN
3912
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t count_utf8(
3913
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
3914
    #if SIMDUTF_CPLUSPLUS23
3915
  if consteval {
3916
    return scalar::utf8::count_code_points(valid_utf8_input.data(),
3917
                                           valid_utf8_input.size());
3918
  } else
3919
    #endif
3920
  {
3921
    return count_utf8(reinterpret_cast<const char *>(valid_utf8_input.data()),
3922
                      valid_utf8_input.size());
3923
  }
3924
}
3925
  #endif // SIMDUTF_SPAN
3926
3927
/**
3928
 * Given a valid UTF-8 string having a possibly truncated last character,
3929
 * this function checks the end of string. If the last character is truncated
3930
 * (or partial), then it returns a shorter length (shorter by 1 to 3 bytes) so
3931
 * that the short UTF-8 strings only contain complete characters. If there is no
3932
 * truncated character, the original length is returned.
3933
 *
3934
 * This function assumes that the input string is valid UTF-8, but possibly
3935
 * truncated.
3936
 *
3937
 * @param input         the UTF-8 string to process
3938
 * @param length        the length of the string in bytes
3939
 * @return the length of the string in bytes, possibly shorter by 1 to 3 bytes
3940
 */
3941
simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length);
3942
  #if SIMDUTF_SPAN
3943
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3944
trim_partial_utf8(
3945
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
3946
    #if SIMDUTF_CPLUSPLUS23
3947
  if consteval {
3948
    return scalar::utf8::trim_partial_utf8(valid_utf8_input.data(),
3949
                                           valid_utf8_input.size());
3950
  } else
3951
    #endif
3952
  {
3953
    return trim_partial_utf8(
3954
        reinterpret_cast<const char *>(valid_utf8_input.data()),
3955
        valid_utf8_input.size());
3956
  }
3957
}
3958
  #endif // SIMDUTF_SPAN
3959
#endif   // SIMDUTF_FEATURE_UTF8
3960
3961
#if SIMDUTF_FEATURE_UTF16
3962
/**
3963
 * Given a valid UTF-16BE string having a possibly truncated last character,
3964
 * this function checks the end of string. If the last character is truncated
3965
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
3966
 * the short UTF-16BE strings only contain complete characters. If there is no
3967
 * truncated character, the original length is returned.
3968
 *
3969
 * This function assumes that the input string is valid UTF-16BE, but possibly
3970
 * truncated.
3971
 *
3972
 * @param input         the UTF-16BE string to process
3973
 * @param length        the length of the string in 2-byte code units (char16_t)
3974
 * @return the length in 2-byte code units, possibly shorter by 1 unit
3975
 */
3976
simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input,
3977
                                                size_t length);
3978
  #if SIMDUTF_SPAN
3979
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3980
0
trim_partial_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
3981
0
    #if SIMDUTF_CPLUSPLUS23
3982
0
  if consteval {
3983
0
    return scalar::utf16::trim_partial_utf16<endianness::BIG>(
3984
0
        valid_utf16_input.data(), valid_utf16_input.size());
3985
0
  } else
3986
0
    #endif
3987
0
  {
3988
0
    return trim_partial_utf16be(valid_utf16_input.data(),
3989
0
                                valid_utf16_input.size());
3990
0
  }
3991
0
}
3992
  #endif // SIMDUTF_SPAN
3993
3994
/**
3995
 * Given a valid UTF-16LE string having a possibly truncated last character,
3996
 * this function checks the end of string. If the last character is truncated
3997
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
3998
 * the short UTF-16LE strings only contain complete characters. If there is no
3999
 * truncated character, the original length is returned.
4000
 *
4001
 * This function assumes that the input string is valid UTF-16LE, but possibly
4002
 * truncated.
4003
 *
4004
 * @param input         the UTF-16LE string to process
4005
 * @param length        the length of the string in 2-byte code units (char16_t)
4006
 * @return the length in 2-byte code units, possibly shorter by 1 unit
4007
 */
4008
simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input,
4009
                                                size_t length);
4010
  #if SIMDUTF_SPAN
4011
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4012
0
trim_partial_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
4013
0
    #if SIMDUTF_CPLUSPLUS23
4014
0
  if consteval {
4015
0
    return scalar::utf16::trim_partial_utf16<endianness::LITTLE>(
4016
0
        valid_utf16_input.data(), valid_utf16_input.size());
4017
0
  } else
4018
0
    #endif
4019
0
  {
4020
0
    return trim_partial_utf16le(valid_utf16_input.data(),
4021
0
                                valid_utf16_input.size());
4022
0
  }
4023
0
}
4024
  #endif // SIMDUTF_SPAN
4025
4026
/**
4027
 * Given a valid UTF-16 string having a possibly truncated last character,
4028
 * this function checks the end of string. If the last character is truncated
4029
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
4030
 * the short UTF-16 strings only contain complete characters. If there is no
4031
 * truncated character, the original length is returned.
4032
 *
4033
 * This function assumes that the input string is valid UTF-16, but possibly
4034
 * truncated. We use the native endianness.
4035
 *
4036
 * @param input         the UTF-16 string to process
4037
 * @param length        the length of the string in 2-byte code units (char16_t)
4038
 * @return the length in 2-byte code units, possibly shorter by 1 unit
4039
 */
4040
simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input,
4041
                                              size_t length);
4042
  #if SIMDUTF_SPAN
4043
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4044
0
trim_partial_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
4045
0
    #if SIMDUTF_CPLUSPLUS23
4046
0
  if consteval {
4047
0
    return scalar::utf16::trim_partial_utf16<endianness::NATIVE>(
4048
0
        valid_utf16_input.data(), valid_utf16_input.size());
4049
0
  } else
4050
0
    #endif
4051
0
  {
4052
0
    return trim_partial_utf16(valid_utf16_input.data(),
4053
0
                              valid_utf16_input.size());
4054
0
  }
4055
0
}
4056
  #endif // SIMDUTF_SPAN
4057
#endif   // SIMDUTF_FEATURE_UTF16
4058
4059
#if SIMDUTF_FEATURE_BASE64 || SIMDUTF_FEATURE_UTF16 ||                         \
4060
    SIMDUTF_FEATURE_DETECT_ENCODING
4061
  #ifndef SIMDUTF_NEED_TRAILING_ZEROES
4062
    #define SIMDUTF_NEED_TRAILING_ZEROES 1
4063
  #endif
4064
#endif // SIMDUTF_FEATURE_BASE64 || SIMDUTF_FEATURE_UTF16 ||
4065
       // SIMDUTF_FEATURE_DETECT_ENCODING
4066
4067
#if SIMDUTF_FEATURE_BASE64
4068
// base64_options are used to specify the base64 encoding options.
4069
// ASCII spaces are ' ', '\t', '\n', '\r', '\f'
4070
// garbage characters are characters that are not part of the base64 alphabet
4071
// nor ASCII spaces.
4072
constexpr uint64_t base64_reverse_padding =
4073
    2; /* modifier for base64_default and base64_url */
4074
enum base64_options : uint64_t {
4075
  base64_default = 0, /* standard base64 format (with padding) */
4076
  base64_url = 1,     /* base64url format (no padding) */
4077
  base64_default_no_padding =
4078
      base64_default |
4079
      base64_reverse_padding, /* standard base64 format without padding */
4080
  base64_url_with_padding =
4081
      base64_url | base64_reverse_padding, /* base64url with padding */
4082
  base64_default_accept_garbage =
4083
      4, /* standard base64 format accepting garbage characters, the input stops
4084
            with the first '=' if any */
4085
  base64_url_accept_garbage =
4086
      5, /* base64url format accepting garbage characters, the input stops with
4087
            the first '=' if any */
4088
  base64_default_or_url =
4089
      8, /* standard/base64url hybrid format (only meaningful for decoding!) */
4090
  base64_default_or_url_accept_garbage =
4091
      12, /* standard/base64url hybrid format accepting garbage characters
4092
             (only meaningful for decoding!), the input stops with the first '='
4093
             if any */
4094
};
4095
4096
// last_chunk_handling_options are used to specify the handling of the last
4097
// chunk in base64 decoding.
4098
// https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
4099
enum last_chunk_handling_options : uint64_t {
4100
  loose = 0,  /* standard base64 format, decode partial final chunk */
4101
  strict = 1, /* error when the last chunk is partial, 2 or 3 chars, and
4102
                 unpadded, or non-zero bit padding */
4103
  stop_before_partial =
4104
      2, /* if the last chunk is partial, ignore it (no error) */
4105
  only_full_chunks =
4106
      3 /* only decode full blocks (4 base64 characters, no padding) */
4107
};
4108
4109
inline simdutf_constexpr23 bool
4110
is_partial(last_chunk_handling_options options) {
4111
  return (options == stop_before_partial) || (options == only_full_chunks);
4112
}
4113
4114
namespace detail {
4115
simdutf_warn_unused const char *find(const char *start, const char *end,
4116
                                     char character) noexcept;
4117
simdutf_warn_unused const char16_t *
4118
find(const char16_t *start, const char16_t *end, char16_t character) noexcept;
4119
} // namespace detail
4120
4121
/**
4122
 * Find the first occurrence of a character in a string. If the character is
4123
 * not found, return a pointer to the end of the string.
4124
 * @param start        the start of the string
4125
 * @param end          the end of the string
4126
 * @param character    the character to find
4127
 * @return a pointer to the first occurrence of the character in the string,
4128
 * or a pointer to the end of the string if the character is not found.
4129
 *
4130
 */
4131
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 const char *
4132
find(const char *start, const char *end, char character) noexcept {
4133
  #if SIMDUTF_CPLUSPLUS23
4134
  if consteval {
4135
    for (; start != end; ++start)
4136
      if (*start == character)
4137
        return start;
4138
    return end;
4139
  } else
4140
  #endif
4141
  {
4142
    return detail::find(start, end, character);
4143
  }
4144
}
4145
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 const char16_t *
4146
find(const char16_t *start, const char16_t *end, char16_t character) noexcept {
4147
    // implementation note: this is repeated instead of a template, to ensure
4148
    // the api is still a function and compiles without concepts
4149
  #if SIMDUTF_CPLUSPLUS23
4150
  if consteval {
4151
    for (; start != end; ++start)
4152
      if (*start == character)
4153
        return start;
4154
    return end;
4155
  } else
4156
  #endif
4157
  {
4158
    return detail::find(start, end, character);
4159
  }
4160
}
4161
}
4162
  // We include base64_tables once.
4163
  #include <simdutf/base64_tables.h>
4164
  #include <simdutf/scalar/base64.h>
4165
4166
namespace simdutf {
4167
4168
  #if SIMDUTF_CPLUSPLUS17
4169
0
inline std::string_view to_string(base64_options options) {
4170
0
  switch (options) {
4171
0
  case base64_default:
4172
0
    return "base64_default";
4173
0
  case base64_url:
4174
0
    return "base64_url";
4175
0
  case base64_reverse_padding:
4176
0
    return "base64_reverse_padding";
4177
0
  case base64_url_with_padding:
4178
0
    return "base64_url_with_padding";
4179
0
  case base64_default_accept_garbage:
4180
0
    return "base64_default_accept_garbage";
4181
0
  case base64_url_accept_garbage:
4182
0
    return "base64_url_accept_garbage";
4183
0
  case base64_default_or_url:
4184
0
    return "base64_default_or_url";
4185
0
  case base64_default_or_url_accept_garbage:
4186
0
    return "base64_default_or_url_accept_garbage";
4187
0
  }
4188
0
  return "<unknown>";
4189
0
}
4190
  #endif // SIMDUTF_CPLUSPLUS17
4191
4192
  #if SIMDUTF_CPLUSPLUS17
4193
0
inline std::string_view to_string(last_chunk_handling_options options) {
4194
0
  switch (options) {
4195
0
  case loose:
4196
0
    return "loose";
4197
0
  case strict:
4198
0
    return "strict";
4199
0
  case stop_before_partial:
4200
0
    return "stop_before_partial";
4201
0
  case only_full_chunks:
4202
0
    return "only_full_chunks";
4203
0
  }
4204
0
  return "<unknown>";
4205
0
}
4206
  #endif
4207
4208
/**
4209
 * Provide the maximal binary length in bytes given the base64 input.
4210
 * As long as the input does not contain ignorable characters (e.g., ASCII
4211
 * spaces or linefeed characters), the result is exact. In particular, the
4212
 * function checks for padding characters.
4213
 *
4214
 * The function is fast (constant time). It checks up to two characters at
4215
 * the end of the string. The input is not otherwise validated or read.
4216
 *
4217
 * @param input         the base64 input to process
4218
 * @param length        the length of the base64 input in bytes
4219
 * @return maximum number of binary bytes
4220
 */
4221
simdutf_warn_unused size_t
4222
maximal_binary_length_from_base64(const char *input, size_t length) noexcept;
4223
  #if SIMDUTF_SPAN
4224
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4225
maximal_binary_length_from_base64(
4226
    const detail::input_span_of_byte_like auto &input) noexcept {
4227
    #if SIMDUTF_CPLUSPLUS23
4228
  if consteval {
4229
    return scalar::base64::maximal_binary_length_from_base64(
4230
        detail::constexpr_cast_ptr<uint8_t>(input.data()), input.size());
4231
  } else
4232
    #endif
4233
  {
4234
    return maximal_binary_length_from_base64(
4235
        reinterpret_cast<const char *>(input.data()), input.size());
4236
  }
4237
}
4238
  #endif // SIMDUTF_SPAN
4239
4240
/**
4241
 * Provide the maximal binary length in bytes given the base64 input.
4242
 * As long as the input does not contain ignorable characters (e.g., ASCII
4243
 * spaces or linefeed characters), the result is exact. In particular, the
4244
 * function checks for padding characters.
4245
 *
4246
 * The function is fast (constant time). It checks up to two characters at
4247
 * the end of the string. The input is not otherwise validated or read.
4248
 *
4249
 * @param input         the base64 input to process, in ASCII stored as 16-bit
4250
 * units
4251
 * @param length        the length of the base64 input in 16-bit units
4252
 * @return maximal number of binary bytes
4253
 */
4254
simdutf_warn_unused size_t maximal_binary_length_from_base64(
4255
    const char16_t *input, size_t length) noexcept;
4256
  #if SIMDUTF_SPAN
4257
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4258
0
maximal_binary_length_from_base64(std::span<const char16_t> input) noexcept {
4259
0
    #if SIMDUTF_CPLUSPLUS23
4260
0
  if consteval {
4261
0
    return scalar::base64::maximal_binary_length_from_base64(input.data(),
4262
0
                                                             input.size());
4263
0
  } else
4264
0
    #endif
4265
0
  {
4266
0
    return maximal_binary_length_from_base64(input.data(), input.size());
4267
0
  }
4268
0
}
4269
  #endif // SIMDUTF_SPAN
4270
4271
/**
4272
 * Convert a base64 input to a binary output.
4273
 *
4274
 * This function follows the WHATWG forgiving-base64 format, which means that it
4275
 * will ignore any ASCII spaces in the input. You may provide a padded input
4276
 * (with one or two equal signs at the end) or an unpadded input (without any
4277
 * equal signs at the end).
4278
 *
4279
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
4280
 *
4281
 * This function will fail in case of invalid input. When last_chunk_options =
4282
 * loose, there are two possible reasons for failure: the input contains a
4283
 * number of base64 characters that when divided by 4, leaves a single remainder
4284
 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
4285
 * not a valid base64 character (INVALID_BASE64_CHARACTER).
4286
 *
4287
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
4288
 * input where the invalid character was found. When the error is
4289
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
4290
 *
4291
 * The default option (simdutf::base64_default) expects the characters `+` and
4292
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
4293
 * characters `-` and `_` as part of its alphabet.
4294
 *
4295
 * The padding (`=`) is validated if present. There may be at most two padding
4296
 * characters at the end of the input. If there are any padding characters, the
4297
 * total number of characters (excluding spaces but including padding
4298
 * characters) must be divisible by four.
4299
 *
4300
 * You should call this function with a buffer that is at least
4301
 * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
4302
 * provide that much space, the function may cause a buffer overflow.
4303
 *
4304
 * Advanced users may want to tailor how the last chunk is handled. By default,
4305
 * we use a loose (forgiving) approach but we also support a strict approach
4306
 * as well as a stop_before_partial approach, as per the following proposal:
4307
 *
4308
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
4309
 *
4310
 * @param input         the base64 string to process
4311
 * @param length        the length of the string in bytes
4312
 * @param output        the pointer to a buffer that can hold the conversion
4313
 * result (should be at least maximal_binary_length_from_base64(input, length)
4314
 * bytes long).
4315
 * @param options       the base64 options to use, usually base64_default or
4316
 * base64_url, and base64_default by default.
4317
 * @param last_chunk_options the last chunk handling options,
4318
 * last_chunk_handling_options::loose by default
4319
 * but can also be last_chunk_handling_options::strict or
4320
 * last_chunk_handling_options::stop_before_partial.
4321
 * @return a result pair struct (of type simdutf::result containing the two
4322
 * fields error and count) with an error code and either position of the error
4323
 * (in the input in bytes) if any, or the number of bytes written if successful.
4324
 */
4325
simdutf_warn_unused result base64_to_binary(
4326
    const char *input, size_t length, char *output,
4327
    base64_options options = base64_default,
4328
    last_chunk_handling_options last_chunk_options = loose) noexcept;
4329
  #if SIMDUTF_SPAN
4330
/**
 * Span overload of base64_to_binary for byte-like input.
 *
 * During constant evaluation (C++23 `if consteval`) the call is routed to
 * scalar::base64::base64_to_binary_details_impl. Otherwise the data pointers
 * of both spans are reinterpreted as char pointers and forwarded to the
 * pointer overload of base64_to_binary.
 *
 * Only binary_output.data() is used here: the size of the output span is never
 * read, so this overload performs no capacity check of its own.
 *
 * @param input              the base64 input, as a span of byte-like elements
 * @param binary_output      the span receiving the decoded bytes
 * @param options            forwarded unchanged, base64_default by default
 * @param last_chunk_options forwarded unchanged, loose by default
 * @return the result of the function the call is forwarded to
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
4331
base64_to_binary(
4332
    const detail::input_span_of_byte_like auto &input,
4333
    detail::output_span_of_byte_like auto &&binary_output,
4334
    base64_options options = base64_default,
4335
    last_chunk_handling_options last_chunk_options = loose) noexcept {
4336
    #if SIMDUTF_CPLUSPLUS23
4337
  // Constant-evaluation path: no reinterpret_cast is used on this branch.
  if consteval {
4338
    return scalar::base64::base64_to_binary_details_impl(
4339
        input.data(), input.size(), binary_output.data(), options,
4340
        last_chunk_options);
4341
  } else
4342
    #endif
4343
  {
4344
    return base64_to_binary(reinterpret_cast<const char *>(input.data()),
4345
                            input.size(),
4346
                            reinterpret_cast<char *>(binary_output.data()),
4347
                            options, last_chunk_options);
4348
  }
4349
}
4350
  #endif // SIMDUTF_SPAN
4351
4352
/**
4353
 * Provide the base64 length in bytes given the length of a binary input.
4354
 *
 * This is a thin wrapper that forwards both arguments to
 * scalar::base64::base64_length_from_binary.
 *
4355
 * @param length        the length of the input in bytes
 * @param options       the base64 options to use, is base64_default by default.
4356
 * @return number of base64 bytes
4357
 */
4358
inline simdutf_warn_unused simdutf_constexpr23 size_t base64_length_from_binary(
4359
    size_t length, base64_options options = base64_default) noexcept {
4360
  return scalar::base64::base64_length_from_binary(length, options);
4361
}
4362
4363
/**
4364
 * Provide the base64 length in bytes given the length of a binary input,
4365
 * taking into account line breaks.
4366
 *
 * Note the parameter order: options comes before line_length here, whereas
 * binary_to_base64_with_lines takes line_length before options.
 *
4367
 * @param length        the length of the input in bytes
 * @param options       the base64 options to use, is base64_default by default.
4368
 * @param line_length   the length of lines, must be at least 4 (otherwise it is
4369
 * interpreted as 4), is default_line_length by default.
4370
 * @return number of base64 bytes
4371
 */
4372
inline simdutf_warn_unused simdutf_constexpr23 size_t
4373
base64_length_from_binary_with_lines(
4374
    size_t length, base64_options options = base64_default,
4375
    size_t line_length = default_line_length) noexcept {
4376
  return scalar::base64::base64_length_from_binary_with_lines(length, options,
4377
                                                              line_length);
4378
}
4379
4380
/**
4381
 * Convert a binary input to a base64 output.
4382
 *
4383
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
4384
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
4385
 * output to ensure that the output length is a multiple of four.
4386
 *
4387
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
4388
 * of its alphabet. No padding is added at the end of the output.
4389
 *
4390
 * This function always succeeds.
4391
 *
4392
 * @param input         the binary to process
4393
 * @param length        the length of the input in bytes
4394
 * @param output        the pointer to a buffer that can hold the conversion
4395
 * result (should be at least base64_length_from_binary(length) bytes long)
4396
 * @param options       the base64 options to use, can be base64_default or
4397
 * base64_url, is base64_default by default.
4398
 * @return number of written bytes, will be equal to
4399
 * base64_length_from_binary(length, options)
4400
 */
4401
size_t binary_to_base64(const char *input, size_t length, char *output,
4402
                        base64_options options = base64_default) noexcept;
4403
  #if SIMDUTF_SPAN
4404
/**
 * Span overload of binary_to_base64.
 *
 * During constant evaluation (C++23 `if consteval`) the call is routed to
 * scalar::base64::tail_encode_base64. Otherwise the data pointers of both
 * spans are reinterpreted as char pointers and forwarded to the pointer
 * overload of binary_to_base64.
 *
 * Despite its name, binary_output is the destination of the base64 text: it
 * is passed in the output position of the forwarded call. Its size is never
 * read, so this overload performs no capacity check of its own.
 *
 * @param input          the binary input, as a span of byte-like elements
 * @param binary_output  the span receiving the base64 output
 * @param options        forwarded unchanged, base64_default by default
 * @return the value returned by the function the call is forwarded to
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4405
binary_to_base64(const detail::input_span_of_byte_like auto &input,
4406
                 detail::output_span_of_byte_like auto &&binary_output,
4407
                 base64_options options = base64_default) noexcept {
4408
    #if SIMDUTF_CPLUSPLUS23
4409
  // Constant-evaluation path: note the (output, input, length) argument order.
  if consteval {
4410
    return scalar::base64::tail_encode_base64(
4411
        binary_output.data(), input.data(), input.size(), options);
4412
  } else
4413
    #endif
4414
  {
4415
    return binary_to_base64(
4416
        reinterpret_cast<const char *>(input.data()), input.size(),
4417
        reinterpret_cast<char *>(binary_output.data()), options);
4418
  }
4419
}
4420
  #endif // SIMDUTF_SPAN
4421
4422
/**
4423
 * Convert a binary input to a base64 output with line breaks.
4424
 *
4425
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
4426
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
4427
 * output to ensure that the output length is a multiple of four.
4428
 *
4429
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
4430
 * of its alphabet. No padding is added at the end of the output.
4431
 *
4432
 * This function always succeeds.
4433
 *
4434
 * @param input         the binary to process
4435
 * @param length        the length of the input in bytes
4436
 * @param output        the pointer to a buffer that can hold the conversion
4437
 * result (should be at least base64_length_from_binary_with_lines(length,
4438
 * options, line_length) bytes long)
4439
 * @param line_length   the length of lines, must be at least 4 (otherwise it is
4440
 * interpreted as 4),
4441
 * @param options       the base64 options to use, can be base64_default or
4442
 * base64_url, is base64_default by default.
4443
 * @return number of written bytes, will be equal to
4444
 * base64_length_from_binary_with_lines(length, options, line_length)
4445
 */
4446
size_t
4447
binary_to_base64_with_lines(const char *input, size_t length, char *output,
4448
                            size_t line_length = simdutf::default_line_length,
4449
                            base64_options options = base64_default) noexcept;
4450
  #if SIMDUTF_SPAN
4451
/**
 * Span overload of binary_to_base64_with_lines.
 *
 * During constant evaluation (C++23 `if consteval`) the call is routed to
 * scalar::base64::tail_encode_base64_impl<true>. Otherwise the data pointers
 * of both spans are reinterpreted as char pointers and forwarded to the
 * pointer overload of binary_to_base64_with_lines.
 *
 * Despite its name, binary_output is the destination of the base64 text: it
 * is passed in the output position of the forwarded call. Its size is never
 * read, so this overload performs no capacity check of its own.
 *
 * @param input          the binary input, as a span of byte-like elements
 * @param binary_output  the span receiving the base64 output
 * @param line_length    forwarded unchanged, simdutf::default_line_length by
 * default
 * @param options        forwarded unchanged, base64_default by default
 * @return the value returned by the function the call is forwarded to
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4452
binary_to_base64_with_lines(
4453
    const detail::input_span_of_byte_like auto &input,
4454
    detail::output_span_of_byte_like auto &&binary_output,
4455
    size_t line_length = simdutf::default_line_length,
4456
    base64_options options = base64_default) noexcept {
4457
    #if SIMDUTF_CPLUSPLUS23
4458
  // Constant-evaluation path: options precedes line_length in this call,
  // the reverse of this function's own parameter order.
  if consteval {
4459
    return scalar::base64::tail_encode_base64_impl<true>(
4460
        binary_output.data(), input.data(), input.size(), options, line_length);
4461
  } else
4462
    #endif
4463
  {
4464
    return binary_to_base64_with_lines(
4465
        reinterpret_cast<const char *>(input.data()), input.size(),
4466
        reinterpret_cast<char *>(binary_output.data()), line_length, options);
4467
  }
4468
}
4469
  #endif // SIMDUTF_SPAN
4470
4471
  #if SIMDUTF_ATOMIC_REF
4472
/**
4473
 * Convert a binary input to a base64 output, using atomic accesses.
4474
 * This function comes with a potentially significant performance
4475
 * penalty, but it may be useful in some cases where the input
4476
 * buffers are shared between threads, to avoid undefined
4477
 * behavior in case of data races.
4478
 *
4479
 * The function is for advanced users. Its main use case is
4479
 * to silence sanitizer warnings. We have no documented use case
4481
 * where this function is actually necessary in terms of practical correctness.
4482
 *
4483
 * This function is only available when simdutf is compiled with
4484
 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
4485
 * the availability of this function by checking the macro
4486
 * SIMDUTF_ATOMIC_REF.
4487
 *
4488
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
4489
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
4490
 * output to ensure that the output length is a multiple of four.
4491
 *
4492
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
4493
 * of its alphabet. No padding is added at the end of the output.
4494
 *
4495
 * This function always succeeds.
4496
 *
4497
 * This function is considered experimental. It is not tested by default
4498
 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
4499
 * It is not documented in the public API documentation (README). It is
4500
 * offered on a best effort basis. We rely on the community for further
4501
 * testing and feedback.
4502
 *
4503
 * @brief atomic_binary_to_base64
4504
 * @param input         the binary to process
4505
 * @param length        the length of the input in bytes
4506
 * @param output        the pointer to a buffer that can hold the conversion
4507
 * result (should be at least base64_length_from_binary(length) bytes long)
4508
 * @param options       the base64 options to use, can be base64_default or
4509
 * base64_url, is base64_default by default.
4510
 * @return number of written bytes, will be equal to
4511
 * base64_length_from_binary(length, options)
4512
 */
4513
size_t
4514
atomic_binary_to_base64(const char *input, size_t length, char *output,
4515
                        base64_options options = base64_default) noexcept;
4516
    #if SIMDUTF_SPAN
4517
/**
 * Span overload of atomic_binary_to_base64.
 *
 * The data pointers of both spans are reinterpreted as char pointers and
 * forwarded to the pointer overload of atomic_binary_to_base64. Unlike the
 * non-atomic span overloads, this function is not marked simdutf_constexpr23
 * and has no constant-evaluation branch.
 *
 * Despite its name, binary_output is the destination of the base64 text. Its
 * size is never read, so this overload performs no capacity check of its own.
 *
 * @param input          the binary input, as a span of byte-like elements
 * @param binary_output  the span receiving the base64 output
 * @param options        forwarded unchanged, base64_default by default
 * @return the value returned by the pointer overload
 */
simdutf_really_inline simdutf_warn_unused size_t
4518
atomic_binary_to_base64(const detail::input_span_of_byte_like auto &input,
4519
                        detail::output_span_of_byte_like auto &&binary_output,
4520
                        base64_options options = base64_default) noexcept {
4521
  return atomic_binary_to_base64(
4522
      reinterpret_cast<const char *>(input.data()), input.size(),
4523
      reinterpret_cast<char *>(binary_output.data()), options);
4524
}
4525
    #endif // SIMDUTF_SPAN
4526
  #endif   // SIMDUTF_ATOMIC_REF
4527
4528
/**
4529
 * Convert a base64 input to a binary output.
4530
 *
4531
 * This function follows the WHATWG forgiving-base64 format, which means that it
4532
 * will ignore any ASCII spaces in the input. You may provide a padded input
4533
 * (with one or two equal signs at the end) or an unpadded input (without any
4534
 * equal signs at the end).
4535
 *
4536
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
4537
 *
4538
 * This function will fail in case of invalid input. When last_chunk_options =
4539
 * loose, there are two possible reasons for failure: the input contains a
4540
 * number of base64 characters that when divided by 4, leaves a single remainder
4541
 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
4542
 * not a valid base64 character (INVALID_BASE64_CHARACTER).
4543
 *
4544
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
4545
 * input where the invalid character was found. When the error is
4546
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
4547
 *
4548
 * The default option (simdutf::base64_default) expects the characters `+` and
4549
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
4550
 * characters `-` and `_` as part of its alphabet.
4551
 *
4552
 * The padding (`=`) is validated if present. There may be at most two padding
4553
 * characters at the end of the input. If there are any padding characters, the
4554
 * total number of characters (excluding spaces but including padding
4555
 * characters) must be divisible by four.
4556
 *
4557
 * You should call this function with a buffer that is at least
4558
 * maximal_binary_length_from_base64(input, length) bytes long. If you fail
4559
 * to provide that much space, the function may cause a buffer overflow.
4560
 *
4561
 * Advanced users may want to tailor how the last chunk is handled. By default,
4562
 * we use a loose (forgiving) approach but we also support a strict approach
4563
 * as well as a stop_before_partial approach, as per the following proposal:
4564
 *
4565
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
4566
 *
4567
 * @param input         the base64 string to process, in ASCII stored as 16-bit
4568
 * units
4569
 * @param length        the length of the string in 16-bit units
4570
 * @param output        the pointer to a buffer that can hold the conversion
4571
 * result (should be at least maximal_binary_length_from_base64(input, length)
4572
 * bytes long).
4573
 * @param options       the base64 options to use, can be base64_default or
4574
 * base64_url, is base64_default by default.
4575
 * @param last_chunk_options the last chunk handling options,
4576
 * last_chunk_handling_options::loose by default
4577
 * but can also be last_chunk_handling_options::strict or
4578
 * last_chunk_handling_options::stop_before_partial.
4579
 * @return a result pair struct (of type simdutf::result containing the two
4580
 * fields error and count) with an error code and position of the
4581
 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
4582
 * of bytes written if successful.
4583
 */
4584
simdutf_warn_unused result
4585
base64_to_binary(const char16_t *input, size_t length, char *output,
4586
                 base64_options options = base64_default,
4587
                 last_chunk_handling_options last_chunk_options =
4588
                     last_chunk_handling_options::loose) noexcept;
4589
  #if SIMDUTF_SPAN
4590
/**
 * Span overload of base64_to_binary for char16_t input.
 *
 * During constant evaluation (C++23 `if consteval`) the call is routed to
 * scalar::base64::base64_to_binary_details_impl. Otherwise the input pointer
 * is passed as is, the output pointer is reinterpreted as char *, and the
 * call is forwarded to the char16_t pointer overload of base64_to_binary.
 *
 * Only binary_output.data() is used here: the size of the output span is never
 * read, so this overload performs no capacity check of its own.
 *
 * @param input              the base64 input, as a span of char16_t
 * @param binary_output      the span receiving the decoded bytes
 * @param options            forwarded unchanged, base64_default by default
 * @param last_chunk_options forwarded unchanged, loose by default
 * @return the result of the function the call is forwarded to
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
4591
base64_to_binary(
4592
    std::span<const char16_t> input,
4593
    detail::output_span_of_byte_like auto &&binary_output,
4594
    base64_options options = base64_default,
4595
    last_chunk_handling_options last_chunk_options = loose) noexcept {
4596
    #if SIMDUTF_CPLUSPLUS23
4597
  // Constant-evaluation path: no reinterpret_cast is used on this branch.
  if consteval {
4598
    return scalar::base64::base64_to_binary_details_impl(
4599
        input.data(), input.size(), binary_output.data(), options,
4600
        last_chunk_options);
4601
  } else
4602
    #endif
4603
  {
4604
    return base64_to_binary(input.data(), input.size(),
4605
                            reinterpret_cast<char *>(binary_output.data()),
4606
                            options, last_chunk_options);
4607
  }
4608
}
4609
  #endif // SIMDUTF_SPAN
4610
4611
/**
4612
 * Check if a character is an ignorable base64 character.
4613
 * Checking a large input, character by character, is not computationally
4614
 * efficient.
4615
 *
 * Two overloads are provided, one for char and one for char16_t; both forward
 * to scalar::base64::is_ignorable.
 *
4616
 * @param input         the character to check
4617
 * @param options       the base64 options to use, is base64_default by default.
4618
 * @return true if the character is an ignorable base64 character, false
4619
 * otherwise.
4620
 */
4621
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4622
base64_ignorable(char input, base64_options options = base64_default) noexcept {
4623
  return scalar::base64::is_ignorable(input, options);
4624
}
4625
/// char16_t overload of base64_ignorable; same contract as the char overload.
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4626
base64_ignorable(char16_t input,
4627
                 base64_options options = base64_default) noexcept {
4628
  return scalar::base64::is_ignorable(input, options);
4629
}
4630
4631
/**
4632
 * Check if a character is a valid base64 character.
4633
 * Checking a large input, character by character, is not computationally
4634
 * efficient.
4635
 * Note that padding characters are not considered valid base64 characters in
4636
 * this context, nor are spaces.
4637
 *
 * Two overloads are provided, one for char and one for char16_t; both forward
 * to scalar::base64::is_base64.
 *
4638
 * @param input         the character to check
4639
 * @param options       the base64 options to use, is base64_default by default.
4640
 * @return true if the character is a base64 character, false otherwise.
4641
 */
4642
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4643
0
base64_valid(char input, base64_options options = base64_default) noexcept {
4644
0
  return scalar::base64::is_base64(input, options);
4645
0
}
4646
/// char16_t overload of base64_valid; same contract as the char overload.
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4647
0
base64_valid(char16_t input, base64_options options = base64_default) noexcept {
4648
0
  return scalar::base64::is_base64(input, options);
4649
0
}
4650
4651
/**
4652
 * Check if a character is a valid base64 character or the padding character
4653
 * ('='). Checking a large input, character by character, is not computationally
4654
 * efficient.
4655
 *
 * Two overloads are provided, one for char and one for char16_t; both forward
 * to scalar::base64::is_base64_or_padding.
 *
4656
 * @param input         the character to check
4657
 * @param options       the base64 options to use, is base64_default by default.
4658
 * @return true if the character is a base64 character or the padding
 * character ('='), false otherwise.
4659
 */
4660
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4661
base64_valid_or_padding(char input,
4662
0
                        base64_options options = base64_default) noexcept {
4663
0
  return scalar::base64::is_base64_or_padding(input, options);
4664
0
}
4665
/// char16_t overload of base64_valid_or_padding; same contract as the char
/// overload.
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4666
base64_valid_or_padding(char16_t input,
4667
0
                        base64_options options = base64_default) noexcept {
4668
0
  return scalar::base64::is_base64_or_padding(input, options);
4669
0
}
4670
4671
/**
4672
 * Convert a base64 input to a binary output.
4673
 *
4674
 * This function follows the WHATWG forgiving-base64 format, which means that it
4675
 * will ignore any ASCII spaces in the input. You may provide a padded input
4676
 * (with one or two equal signs at the end) or an unpadded input (without any
4677
 * equal signs at the end).
4678
 *
4679
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
4680
 *
4681
 * This function will fail in case of invalid input. When last_chunk_options =
4682
 * loose, there are three possible reasons for failure: the input contains a
4683
 * number of base64 characters that when divided by 4, leaves a single remainder
4684
 * character (BASE64_INPUT_REMAINDER), the input contains a character that is
4685
 * not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer
4686
 * is too small (OUTPUT_BUFFER_TOO_SMALL).
4687
 *
4688
 * When OUTPUT_BUFFER_TOO_SMALL, we return both the number of bytes written
4689
 * and the number of units processed, see description of the parameters and
4690
 * returned value.
4691
 *
4692
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
4693
 * input where the invalid character was found. When the error is
4694
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
4695
 *
4696
 * The default option (simdutf::base64_default) expects the characters `+` and
4697
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
4698
 * characters `-` and `_` as part of its alphabet.
4699
 *
4700
 * The padding (`=`) is validated if present. There may be at most two padding
4701
 * characters at the end of the input. If there are any padding characters, the
4702
 * total number of characters (excluding spaces but including padding
4703
 * characters) must be divisible by four.
4704
 *
4705
 * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected
4706
 * to discard the output unless the parameter decode_up_to_bad_char is set to
4707
 * true. In that case, the function will decode up to the first invalid
4708
 * character. Extra padding characters ('=') are considered invalid characters.
4709
 *
4710
 * Advanced users may want to tailor how the last chunk is handled. By default,
4711
 * we use a loose (forgiving) approach but we also support a strict approach
4712
 * as well as a stop_before_partial approach, as per the following proposal:
4713
 *
4714
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
4715
 *
4716
 * @param input         the base64 string to process, in ASCII stored as 8-bit
4717
 * or 16-bit units
4718
 * @param length        the length of the string in 8-bit or 16-bit units.
4719
 * @param output        the pointer to a buffer that can hold the conversion
4720
 * result.
4721
 * @param outlen        the number of bytes that can be written in the output
4722
 * buffer. Upon return, it is modified to reflect how many bytes were written.
4723
 * @param options       the base64 options to use, can be base64_default or
4724
 * base64_url, is base64_default by default.
4725
 * @param last_chunk_options the last chunk handling options,
4726
 * last_chunk_handling_options::loose by default
4727
 * but can also be last_chunk_handling_options::strict or
4728
 * last_chunk_handling_options::stop_before_partial.
4729
 * @param decode_up_to_bad_char if true, the function will decode up to the
4730
 * first invalid character. By default (false), it is assumed that the output
4731
 * buffer is to be discarded. When there are multiple errors in the input,
4732
 * using decode_up_to_bad_char might trigger a different error.
4733
 * @return a result pair struct (of type simdutf::result containing the two
4734
 * fields error and count) with an error code and position of the
4735
 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
4736
 * of units processed if successful.
4737
 */
4738
simdutf_warn_unused result
4739
base64_to_binary_safe(const char *input, size_t length, char *output,
4740
                      size_t &outlen, base64_options options = base64_default,
4741
                      last_chunk_handling_options last_chunk_options =
4742
                          last_chunk_handling_options::loose,
4743
                      bool decode_up_to_bad_char = false) noexcept;
4744
// the span overload has moved to the bottom of the file
4745
4746
simdutf_warn_unused result
4747
base64_to_binary_safe(const char16_t *input, size_t length, char *output,
4748
                      size_t &outlen, base64_options options = base64_default,
4749
                      last_chunk_handling_options last_chunk_options =
4750
                          last_chunk_handling_options::loose,
4751
                      bool decode_up_to_bad_char = false) noexcept;
4752
  // span overload moved to bottom of file
4753
4754
  #if SIMDUTF_ATOMIC_REF
4755
/**
4756
 * Convert a base64 input to a binary output with a size limit and using atomic
4757
 * operations.
4758
 *
4759
 * Like `base64_to_binary_safe` but using atomic operations, this function is
4760
 * thread-safe for concurrent memory access, allowing the output
4761
 * buffers to be shared between threads without undefined behavior in case of
4762
 * data races.
4763
 *
4764
 * This function comes with a potentially significant performance penalty, but
4765
 * is useful when thread safety is needed during base64 decoding.
4766
 *
4767
 * This function is only available when simdutf is compiled with
4768
 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
4769
 * the availability of this function by checking the macro
4770
 * SIMDUTF_ATOMIC_REF.
4771
 *
4772
 * This function is considered experimental. It is not tested by default
4773
 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
4774
 * It is not documented in the public API documentation (README). It is
4775
 * offered on a best effort basis. We rely on the community for further
4776
 * testing and feedback.
4777
 *
4778
 * @param input         the base64 input to decode
4779
 * @param length        the length of the input in 8-bit or 16-bit units
4780
 * @param output        the pointer to a buffer that can hold the conversion
4781
 * result
4782
 * @param outlen        the number of bytes that can be written in the output
4783
 * buffer. Upon return, it is modified to reflect how many bytes were written.
4784
 * @param options       the base64 options to use (default, url, etc.)
4785
 * @param last_chunk_options the last chunk handling options (loose, strict,
4786
 * stop_before_partial)
4787
 * @param decode_up_to_bad_char if true, the function will decode up to the
4788
 * first invalid character. By default (false), it is assumed that the output
4789
 * buffer is to be discarded. When there are multiple errors in the input,
4790
 * using decode_up_to_bad_char might trigger a different error.
4791
 * @return a result struct with an error code and count indicating error
4792
 * position or success
4793
 */
4794
simdutf_warn_unused result atomic_base64_to_binary_safe(
4795
    const char *input, size_t length, char *output, size_t &outlen,
4796
    base64_options options = base64_default,
4797
    last_chunk_handling_options last_chunk_options =
4798
        last_chunk_handling_options::loose,
4799
    bool decode_up_to_bad_char = false) noexcept;
4800
simdutf_warn_unused result atomic_base64_to_binary_safe(
4801
    const char16_t *input, size_t length, char *output, size_t &outlen,
4802
    base64_options options = base64_default,
4803
    last_chunk_handling_options last_chunk_options = loose,
4804
    bool decode_up_to_bad_char = false) noexcept;
4805
    #if SIMDUTF_SPAN
4806
/**
4807
 * @brief span overload
 *
 * Forwards to the const char * overload of atomic_base64_to_binary_safe. The
 * output capacity passed to that overload is output.size(); the overload
 * receives it by reference (outlen) and the value it leaves there is returned
 * as the second tuple element.
 *
 * Despite its name, binary_input is the base64 text to decode: it is passed
 * in the input position of the forwarded call.
 *
 * @param binary_input   the base64 input, as a span of byte-like elements
 * @param output         the span receiving the decoded bytes; its size is used
 * as the output capacity
 * @param options        forwarded unchanged, base64_default by default
 * @param last_chunk_options forwarded unchanged,
 * last_chunk_handling_options::loose by default
 * @param decode_up_to_bad_char forwarded unchanged, false by default
4808
 * @return a tuple of result and outlen
4809
 */
4810
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
4811
atomic_base64_to_binary_safe(
4812
    const detail::input_span_of_byte_like auto &binary_input,
4813
    detail::output_span_of_byte_like auto &&output,
4814
    base64_options options = base64_default,
4815
    last_chunk_handling_options last_chunk_options =
4816
        last_chunk_handling_options::loose,
4817
    bool decode_up_to_bad_char = false) noexcept {
4818
  // In: capacity of the output span. Out: updated by the callee (by reference).
  size_t outlen = output.size();
4819
  auto ret = atomic_base64_to_binary_safe(
4820
      reinterpret_cast<const char *>(binary_input.data()), binary_input.size(),
4821
      reinterpret_cast<char *>(output.data()), outlen, options,
4822
      last_chunk_options, decode_up_to_bad_char);
4823
  return {ret, outlen};
4824
}
4825
/**
4826
 * @brief span overload
 *
 * Forwards to the const char16_t * overload of atomic_base64_to_binary_safe.
 * The output capacity passed to that overload is binary_output.size(); the
 * overload receives it by reference (outlen) and the value it leaves there is
 * returned as the second tuple element.
 *
 * @param base64_input   the base64 input, as a span of char16_t
 * @param binary_output  the span receiving the decoded bytes; its size is used
 * as the output capacity
 * @param options        forwarded unchanged, base64_default by default
 * @param last_chunk_options forwarded unchanged, loose by default
 * @param decode_up_to_bad_char forwarded unchanged, false by default
4827
 * @return a tuple of result and outlen
4828
 */
4829
simdutf_warn_unused std::tuple<result, std::size_t>
4830
atomic_base64_to_binary_safe(
4831
    std::span<const char16_t> base64_input,
4832
    detail::output_span_of_byte_like auto &&binary_output,
4833
    base64_options options = base64_default,
4834
    last_chunk_handling_options last_chunk_options = loose,
4835
    bool decode_up_to_bad_char = false) noexcept {
4836
  // In: capacity of the output span. Out: updated by the callee (by reference).
  size_t outlen = binary_output.size();
4837
  auto ret = atomic_base64_to_binary_safe(
4838
      base64_input.data(), base64_input.size(),
4839
      reinterpret_cast<char *>(binary_output.data()), outlen, options,
4840
      last_chunk_options, decode_up_to_bad_char);
4841
  return {ret, outlen};
4842
}
4843
    #endif // SIMDUTF_SPAN
4844
  #endif   // SIMDUTF_ATOMIC_REF
4845
4846
#endif // SIMDUTF_FEATURE_BASE64
4847
4848
/**
4849
 * An implementation of simdutf for a particular CPU architecture.
4850
 *
4851
 * Also used to maintain the currently active implementation. The active
4852
 * implementation is automatically initialized on first use to the most advanced
4853
 * implementation supported by the host.
4854
 */
4855
class implementation {
4856
public:
4857
  /**
4858
   * The name of this implementation.
4859
   *
4860
   *     const implementation *impl = simdutf::active_implementation;
4861
   *     cout << "simdutf is optimized for " << impl->name() << "(" <<
4862
   * impl->description() << ")" << endl;
4863
   *
4864
   * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
4865
   */
4866
  virtual std::string name() const { return std::string(_name); }
4867
4868
  /**
4869
   * The description of this implementation.
4870
   *
4871
   *     const implementation *impl = simdutf::active_implementation;
4872
   *     cout << "simdutf is optimized for " << impl->name() << "(" <<
4873
   * impl->description() << ")" << endl;
4874
   *
4875
   * @return the description of the implementation, as a std::string built
   * from the stored _description member
4876
   */
4877
  virtual std::string description() const { return std::string(_description); }
4878
4879
  /**
4880
   * Check whether the instruction sets this implementation is compiled against
4881
   * match the current CPU. This function may poll the current CPU/system
4882
   * and should therefore not be called too often if performance is a concern.
4883
   *
4884
   *
4885
   * @return true if the implementation can be safely used on the current system
4886
   * (determined at runtime)
4887
   */
4888
  bool supported_by_runtime_system() const;
4889
4890
#if SIMDUTF_FEATURE_DETECT_ENCODING
4891
  /**
4892
   * This function will try to detect the encoding
4893
   * @param input the string to identify
4894
   * @param length the length of the string in bytes.
4895
   * @return the encoding type detected
4896
   */
4897
  virtual encoding_type autodetect_encoding(const char *input,
4898
                                            size_t length) const noexcept;
4899
4900
  /**
4901
   * This function will try to detect the possible encodings in one pass.
   *
   * Pure virtual: each implementation must provide it.
   *
4902
   * @param input the string to identify
4903
   * @param length the length of the string in bytes.
4904
   * @return the detected encodings, as an int.
   * NOTE(review): the return type is int rather than encoding_type, so this
   * is presumably a bitwise combination of encoding_type values - confirm.
4905
   */
4906
  virtual int detect_encodings(const char *input,
4907
                               size_t length) const noexcept = 0;
4908
#endif // SIMDUTF_FEATURE_DETECT_ENCODING
4909
4910
  /**
4911
   * @private For internal implementation use
4912
   *
4913
   * The instruction sets this implementation is compiled against.
4914
   *
4915
   * @return a mask of all required `internal::instruction_set::` values
4916
   */
4917
  virtual uint32_t required_instruction_sets() const {
4918
    return _required_instruction_sets;
4919
  }
4920
4921
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
4922
  /**
4923
   * Validate the UTF-8 string.
4924
   *
4925
   * Overridden by each implementation.
4926
   *
4927
   * @param buf the UTF-8 string to validate.
4928
   * @param len the length of the string in bytes.
4929
   * @return true if and only if the string is valid UTF-8.
4930
   */
4931
  simdutf_warn_unused virtual bool validate_utf8(const char *buf,
4932
                                                 size_t len) const noexcept = 0;
4933
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
4934
4935
#if SIMDUTF_FEATURE_UTF8
4936
  /**
4937
   * Validate the UTF-8 string and stop on errors.
4938
   *
4939
   * Overridden by each implementation.
4940
   *
4941
   * @param buf the UTF-8 string to validate.
4942
   * @param len the length of the string in bytes.
4943
   * @return a result pair struct (of type simdutf::result containing the two
4944
   * fields error and count) with an error code and either position of the error
4945
   * (in the input in code units) if any, or the number of code units validated
4946
   * if successful.
4947
   */
4948
  simdutf_warn_unused virtual result
4949
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0;
4950
#endif // SIMDUTF_FEATURE_UTF8
4951
4952
#if SIMDUTF_FEATURE_ASCII
4953
  /**
4954
   * Validate the ASCII string.
4955
   *
4956
   * Overridden by each implementation.
4957
   *
4958
   * @param buf the ASCII string to validate.
4959
   * @param len the length of the string in bytes.
4960
   * @return true if and only if the string is valid ASCII.
4961
   */
4962
  simdutf_warn_unused virtual bool
4963
  validate_ascii(const char *buf, size_t len) const noexcept = 0;
4964
4965
  /**
4966
   * Validate the ASCII string and stop on error.
4967
   *
4968
   * Overridden by each implementation.
4969
   *
4970
   * @param buf the ASCII string to validate.
4971
   * @param len the length of the string in bytes.
4972
   * @return a result pair struct (of type simdutf::result containing the two
4973
   * fields error and count) with an error code and either position of the error
4974
   * (in the input in code units) if any, or the number of code units validated
4975
   * if successful.
4976
   */
4977
  simdutf_warn_unused virtual result
4978
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0;
4979
4980
#endif // SIMDUTF_FEATURE_ASCII
4981
4982
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
4983
  /**
4984
   * Validate the ASCII string as a UTF-16BE sequence.
4985
   * An UTF-16 sequence is considered an ASCII sequence
4986
   * if it could be converted to an ASCII string losslessly.
4987
   *
4988
   * Overridden by each implementation.
4989
   *
4990
   * @param buf the UTF-16BE string to validate.
4991
   * @param len the length of the string in bytes.
   * NOTE(review): buf is a char16_t pointer and the other UTF-16 validators
   * in this class take len in 2-byte code units; "in bytes" is presumably a
   * documentation slip - confirm against the implementations.
4992
   * @return true if and only if the string is valid ASCII.
4993
   */
4994
  simdutf_warn_unused virtual bool
4995
  validate_utf16be_as_ascii(const char16_t *buf, size_t len) const noexcept = 0;
4996
4997
  /**
4998
   * Validate the ASCII string as a UTF-16LE sequence.
4999
   * An UTF-16 sequence is considered an ASCII sequence
5000
   * if it could be converted to an ASCII string losslessly.
5001
   *
5002
   * Overridden by each implementation.
5003
   *
5004
   * @param buf the UTF-16LE string to validate.
5005
   * @param len the length of the string in bytes.
   * NOTE(review): buf is a char16_t pointer and the other UTF-16 validators
   * in this class take len in 2-byte code units; "in bytes" is presumably a
   * documentation slip - confirm against the implementations.
5006
   * @return true if and only if the string is valid ASCII.
5007
   */
5008
  simdutf_warn_unused virtual bool
5009
  validate_utf16le_as_ascii(const char16_t *buf, size_t len) const noexcept = 0;
5010
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
5011
5012
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
5013
  /**
5014
   * Validate the UTF-16LE string. This function may be best when you expect
5015
   * the input to be almost always valid. Otherwise, consider using
5016
   * validate_utf16le_with_errors.
5017
   *
5018
   * Overridden by each implementation.
5019
   *
5020
   * This function is not BOM-aware.
5021
   *
5022
   * @param buf the UTF-16LE string to validate.
5023
   * @param len the length of the string in number of 2-byte code units
5024
   * (char16_t).
5025
   * @return true if and only if the string is valid UTF-16LE.
5026
   */
5027
  simdutf_warn_unused virtual bool
5028
  validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0;
5029
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
5030
5031
#if SIMDUTF_FEATURE_UTF16
5032
  /**
5033
   * Validate the UTF-16BE string. This function may be best when you expect
5034
   * the input to be almost always valid. Otherwise, consider using
5035
   * validate_utf16be_with_errors.
5036
   *
5037
   * Overridden by each implementation.
5038
   *
5039
   * This function is not BOM-aware.
5040
   *
5041
   * @param buf the UTF-16BE string to validate.
5042
   * @param len the length of the string in number of 2-byte code units
5043
   * (char16_t).
5044
   * @return true if and only if the string is valid UTF-16BE.
5045
   */
5046
  simdutf_warn_unused virtual bool
5047
  validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0;
5048
5049
  /**
5050
   * Validate the UTF-16LE string and stop on error. It might be faster than
5051
   * validate_utf16le when an error is expected to occur early.
5052
   *
5053
   * Overridden by each implementation.
5054
   *
5055
   * This function is not BOM-aware.
5056
   *
5057
   * @param buf the UTF-16LE string to validate.
5058
   * @param len the length of the string in number of 2-byte code units
5059
   * (char16_t).
5060
   * @return a result pair struct (of type simdutf::result containing the two
5061
   * fields error and count) with an error code and either position of the error
5062
   * (in the input in code units) if any, or the number of code units validated
5063
   * if successful.
5064
   */
5065
  simdutf_warn_unused virtual result
5066
  validate_utf16le_with_errors(const char16_t *buf,
5067
                               size_t len) const noexcept = 0;
5068
5069
  /**
5070
   * Validate the UTF-16BE string and stop on error. It might be faster than
5071
   * validate_utf16be when an error is expected to occur early.
5072
   *
5073
   * Overridden by each implementation.
5074
   *
5075
   * This function is not BOM-aware.
5076
   *
5077
   * @param buf the UTF-16BE string to validate.
5078
   * @param len the length of the string in number of 2-byte code units
5079
   * (char16_t).
5080
   * @return a result pair struct (of type simdutf::result containing the two
5081
   * fields error and count) with an error code and either position of the error
5082
   * (in the input in code units) if any, or the number of code units validated
5083
   * if successful.
5084
   */
5085
  simdutf_warn_unused virtual result
5086
  validate_utf16be_with_errors(const char16_t *buf,
5087
                               size_t len) const noexcept = 0;
5088
  /**
5089
   * Copies the UTF-16LE string while replacing mismatched surrogates with the
5090
   * Unicode replacement character U+FFFD. We allow the input and output to be
5091
   * the same buffer so that the correction is done in-place.
5092
   *
5093
   * Overridden by each implementation.
5094
   *
5095
   * @param input the UTF-16LE string to correct.
5096
   * @param len the length of the string in number of 2-byte code units
5097
   * (char16_t).
5098
   * @param output the output buffer.
5099
   */
5100
  virtual void to_well_formed_utf16le(const char16_t *input, size_t len,
5101
                                      char16_t *output) const noexcept = 0;
5102
  /**
5103
   * Copies the UTF-16BE string while replacing mismatched surrogates with the
5104
   * Unicode replacement character U+FFFD. We allow the input and output to be
5105
   * the same buffer so that the correction is done in-place.
5106
   *
5107
   * Overridden by each implementation.
5108
   *
5109
   * @param input the UTF-16BE string to correct.
5110
   * @param len the length of the string in number of 2-byte code units
5111
   * (char16_t).
5112
   * @param output the output buffer.
5113
   */
5114
  virtual void to_well_formed_utf16be(const char16_t *input, size_t len,
5115
                                      char16_t *output) const noexcept = 0;
5116
#endif // SIMDUTF_FEATURE_UTF16
5117
5118
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
5119
  /**
5120
   * Validate the UTF-32 string.
5121
   *
5122
   * Overridden by each implementation.
5123
   *
5124
   * This function is not BOM-aware.
5125
   *
5126
   * @param buf the UTF-32 string to validate.
5127
   * @param len the length of the string in number of 4-byte code units
5128
   * (char32_t).
5129
   * @return true if and only if the string is valid UTF-32.
5130
   */
5131
  simdutf_warn_unused virtual bool
5132
  validate_utf32(const char32_t *buf, size_t len) const noexcept = 0;
5133
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
5134
5135
#if SIMDUTF_FEATURE_UTF32
5136
  /**
5137
   * Validate the UTF-32 string and stop on error.
5138
   *
5139
   * Overridden by each implementation.
5140
   *
5141
   * This function is not BOM-aware.
5142
   *
5143
   * @param buf the UTF-32 string to validate.
5144
   * @param len the length of the string in number of 4-byte code units
5145
   * (char32_t).
5146
   * @return a result pair struct (of type simdutf::result containing the two
5147
   * fields error and count) with an error code and either position of the error
5148
   * (in the input in code units) if any, or the number of code units validated
5149
   * if successful.
5150
   */
5151
  simdutf_warn_unused virtual result
5152
  validate_utf32_with_errors(const char32_t *buf,
5153
                             size_t len) const noexcept = 0;
5154
#endif // SIMDUTF_FEATURE_UTF32
5155
5156
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5157
  /**
5158
   * Convert Latin1 string into UTF-8 string.
5159
   *
5160
   * This function is suitable to work with inputs from untrusted sources.
5161
   *
5162
   * @param input         the Latin1 string to convert
5163
   * @param length        the length of the string in bytes
5164
   * @param utf8_output  the pointer to buffer that can hold conversion result
5165
   * @return the number of written char; 0 if conversion is not possible
5166
   */
5167
  simdutf_warn_unused virtual size_t
5168
  convert_latin1_to_utf8(const char *input, size_t length,
5169
                         char *utf8_output) const noexcept = 0;
5170
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5171
5172
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5173
  /**
5174
   * Convert Latin1 string into UTF-16LE string.
5175
   *
5176
   * This function is suitable to work with inputs from untrusted sources.
5177
   *
5178
   * @param input         the Latin1 string to convert
5179
   * @param length        the length of the string in bytes
5180
   * @param utf16_output  the pointer to buffer that can hold conversion result
5181
   * @return the number of written char16_t; 0 if conversion is not possible
5182
   */
5183
  simdutf_warn_unused virtual size_t
5184
  convert_latin1_to_utf16le(const char *input, size_t length,
5185
                            char16_t *utf16_output) const noexcept = 0;
5186
5187
  /**
5188
   * Convert Latin1 string into UTF-16BE string.
5189
   *
5190
   * This function is suitable to work with inputs from untrusted sources.
5191
   *
5192
   * @param input         the Latin1 string to convert
5193
   * @param length        the length of the string in bytes
5194
   * @param utf16_output  the pointer to buffer that can hold conversion result
5195
   * @return the number of written char16_t; 0 if conversion is not possible
5196
   */
5197
  simdutf_warn_unused virtual size_t
5198
  convert_latin1_to_utf16be(const char *input, size_t length,
5199
                            char16_t *utf16_output) const noexcept = 0;
5200
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5201
5202
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5203
  /**
5204
   * Convert Latin1 string into UTF-32 string.
5205
   *
5206
   * This function is suitable to work with inputs from untrusted sources.
5207
   *
5208
   * @param input         the Latin1 string to convert
5209
   * @param length        the length of the string in bytes
5210
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
5211
   * @return the number of written char32_t; 0 if conversion is not possible
5212
   */
5213
  simdutf_warn_unused virtual size_t
5214
  convert_latin1_to_utf32(const char *input, size_t length,
5215
                          char32_t *utf32_buffer) const noexcept = 0;
5216
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5217
5218
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5219
  /**
5220
   * Convert possibly broken UTF-8 string into latin1 string.
5221
   *
5222
   * During the conversion also validation of the input string is done.
5223
   * This function is suitable to work with inputs from untrusted sources.
5224
   *
5225
   * @param input         the UTF-8 string to convert
5226
   * @param length        the length of the string in bytes
5227
   * @param latin1_output  the pointer to buffer that can hold conversion result
5228
   * @return the number of written char; 0 if the input was not valid UTF-8
5229
   * string or if it cannot be represented as Latin1
5230
   */
5231
  simdutf_warn_unused virtual size_t
5232
  convert_utf8_to_latin1(const char *input, size_t length,
5233
                         char *latin1_output) const noexcept = 0;
5234
5235
  /**
5236
   * Convert possibly broken UTF-8 string into latin1 string with errors.
5237
   * If the string cannot be represented as Latin1, an error
5238
   * code is returned.
5239
   *
5240
   * During the conversion also validation of the input string is done.
5241
   * This function is suitable to work with inputs from untrusted sources.
5242
   *
5243
   * @param input         the UTF-8 string to convert
5244
   * @param length        the length of the string in bytes
5245
   * @param latin1_output  the pointer to buffer that can hold conversion result
5246
   * @return a result pair struct (of type simdutf::result containing the two
5247
   * fields error and count) with an error code and either position of the error
5248
   * (in the input in code units) if any, or the number of char written
5249
   * if successful.
5250
   */
5251
  simdutf_warn_unused virtual result
5252
  convert_utf8_to_latin1_with_errors(const char *input, size_t length,
5253
                                     char *latin1_output) const noexcept = 0;
5254
5255
  /**
5256
   * Convert valid UTF-8 string into latin1 string.
5257
   *
5258
   * This function assumes that the input string is valid UTF-8 and that it can
5259
   * be represented as Latin1. If you violate this assumption, the result is
5260
   * implementation defined and may include system-dependent behavior such as
5261
   * crashes.
5262
   *
5263
   * This function is for expert users only and not part of our public API. Use
5264
   * convert_utf8_to_latin1 instead.
5265
   *
5266
   * This function is not BOM-aware.
5267
   *
5268
   * @param input         the UTF-8 string to convert
5269
   * @param length        the length of the string in bytes
5270
   * @param latin1_output  the pointer to buffer that can hold conversion result
5271
   * @return the number of written char; 0 if the input was not valid UTF-8
5272
   * string
5273
   */
5274
  simdutf_warn_unused virtual size_t
5275
  convert_valid_utf8_to_latin1(const char *input, size_t length,
5276
                               char *latin1_output) const noexcept = 0;
5277
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5278
5279
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5280
  /**
5281
   * Convert possibly broken UTF-8 string into UTF-16LE string.
5282
   *
5283
   * During the conversion also validation of the input string is done.
5284
   * This function is suitable to work with inputs from untrusted sources.
5285
   *
5286
   * @param input         the UTF-8 string to convert
5287
   * @param length        the length of the string in bytes
5288
   * @param utf16_output  the pointer to buffer that can hold conversion result
5289
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
5290
   * string
5291
   */
5292
  simdutf_warn_unused virtual size_t
5293
  convert_utf8_to_utf16le(const char *input, size_t length,
5294
                          char16_t *utf16_output) const noexcept = 0;
5295
5296
  /**
5297
   * Convert possibly broken UTF-8 string into UTF-16BE string.
5298
   *
5299
   * During the conversion also validation of the input string is done.
5300
   * This function is suitable to work with inputs from untrusted sources.
5301
   *
5302
   * @param input         the UTF-8 string to convert
5303
   * @param length        the length of the string in bytes
5304
   * @param utf16_output  the pointer to buffer that can hold conversion result
5305
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
5306
   * string
5307
   */
5308
  simdutf_warn_unused virtual size_t
5309
  convert_utf8_to_utf16be(const char *input, size_t length,
5310
                          char16_t *utf16_output) const noexcept = 0;
5311
5312
  /**
5313
   * Convert possibly broken UTF-8 string into UTF-16LE string and stop on
5314
   * error.
5315
   *
5316
   * During the conversion also validation of the input string is done.
5317
   * This function is suitable to work with inputs from untrusted sources.
5318
   *
5319
   * @param input         the UTF-8 string to convert
5320
   * @param length        the length of the string in bytes
5321
   * @param utf16_output  the pointer to buffer that can hold conversion result
5322
   * @return a result pair struct (of type simdutf::result containing the two
5323
   * fields error and count) with an error code and either position of the error
5324
   * (in the input in code units) if any, or the number of char16_t written
5325
   * if successful.
5326
   */
5327
  simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(
5328
      const char *input, size_t length,
5329
      char16_t *utf16_output) const noexcept = 0;
5330
5331
  /**
5332
   * Convert possibly broken UTF-8 string into UTF-16BE string and stop on
5333
   * error.
5334
   *
5335
   * During the conversion also validation of the input string is done.
5336
   * This function is suitable to work with inputs from untrusted sources.
5337
   *
5338
   * @param input         the UTF-8 string to convert
5339
   * @param length        the length of the string in bytes
5340
   * @param utf16_output  the pointer to buffer that can hold conversion result
5341
   * @return a result pair struct (of type simdutf::result containing the two
5342
   * fields error and count) with an error code and either position of the error
5343
   * (in the input in code units) if any, or the number of char16_t written
5344
   * if successful.
5345
   */
5346
  simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(
5347
      const char *input, size_t length,
5348
      char16_t *utf16_output) const noexcept = 0;
5349
  /**
5350
   * Compute the number of bytes that this UTF-16LE string would require in
5351
   * UTF-8 format even when the UTF-16LE content contains mismatched
5352
   * surrogates that have to be replaced by the replacement character (0xFFFD).
5353
   *
5354
   * @param input         the UTF-16LE string to convert
5355
   * @param length        the length of the string in 2-byte code units
5356
   * (char16_t)
5357
   * @return a result pair struct (of type simdutf::result containing the two
5358
   * fields error and count) where the count is the number of bytes required to
5359
   * encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS
5360
   * or SURROGATE. The count is correct regardless of the error field.
5361
   * When SURROGATE is returned, it does not indicate an error in the case of
5362
   * this function: it indicates that at least one surrogate has been
5363
   * encountered: the surrogates may be matched or not (thus this function does
5364
   * not validate). If the returned error code is SUCCESS, then the input
5365
   * contains no surrogate, is in the Basic Multilingual Plane, and is
5366
   * necessarily valid.
5367
   */
5368
  simdutf_warn_unused virtual result utf8_length_from_utf16le_with_replacement(
5369
      const char16_t *input, size_t length) const noexcept = 0;
5370
5371
  /**
5372
   * Compute the number of bytes that this UTF-16BE string would require in
5373
   * UTF-8 format even when the UTF-16BE content contains mismatched
5374
   * surrogates that have to be replaced by the replacement character (0xFFFD).
5375
   *
5376
   * @param input         the UTF-16BE string to convert
5377
   * @param length        the length of the string in 2-byte code units
5378
   * (char16_t)
5379
   * @return a result pair struct (of type simdutf::result containing the two
5380
   * fields error and count) where the count is the number of bytes required to
5381
   * encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS
5382
   * or SURROGATE. The count is correct regardless of the error field.
5383
   * When SURROGATE is returned, it does not indicate an error in the case of
5384
   * this function: it indicates that at least one surrogate has been
5385
   * encountered: the surrogates may be matched or not (thus this function does
5386
   * not validate). If the returned error code is SUCCESS, then the input
5387
   * contains no surrogate, is in the Basic Multilingual Plane, and is
5388
   * necessarily valid.
5389
   */
5390
  simdutf_warn_unused virtual result utf8_length_from_utf16be_with_replacement(
5391
      const char16_t *input, size_t length) const noexcept = 0;
5392
5393
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5394
5395
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5396
  /**
5397
   * Convert possibly broken UTF-8 string into UTF-32 string.
5398
   *
5399
   * During the conversion also validation of the input string is done.
5400
   * This function is suitable to work with inputs from untrusted sources.
5401
   *
5402
   * @param input         the UTF-8 string to convert
5403
   * @param length        the length of the string in bytes
5404
   * @param utf32_output  the pointer to buffer that can hold conversion result
5405
   * @return the number of written char32_t; 0 if the input was not valid UTF-8
5406
   * string
5407
   */
5408
  simdutf_warn_unused virtual size_t
5409
  convert_utf8_to_utf32(const char *input, size_t length,
5410
                        char32_t *utf32_output) const noexcept = 0;
5411
5412
  /**
5413
   * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
5414
   *
5415
   * During the conversion also validation of the input string is done.
5416
   * This function is suitable to work with inputs from untrusted sources.
5417
   *
5418
   * @param input         the UTF-8 string to convert
5419
   * @param length        the length of the string in bytes
5420
   * @param utf32_output  the pointer to buffer that can hold conversion result
5421
   * @return a result pair struct (of type simdutf::result containing the two
5422
   * fields error and count) with an error code and either position of the error
5423
   * (in the input in code units) if any, or the number of char32_t written if
5424
   * successful.
5425
   */
5426
  simdutf_warn_unused virtual result
5427
  convert_utf8_to_utf32_with_errors(const char *input, size_t length,
5428
                                    char32_t *utf32_output) const noexcept = 0;
5429
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5430
5431
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5432
  /**
5433
   * Convert valid UTF-8 string into UTF-16LE string.
5434
   *
5435
   * This function assumes that the input string is valid UTF-8.
5436
   *
5437
   * @param input         the UTF-8 string to convert
5438
   * @param length        the length of the string in bytes
5439
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
5440
   * @return the number of written char16_t
5441
   */
5442
  simdutf_warn_unused virtual size_t
5443
  convert_valid_utf8_to_utf16le(const char *input, size_t length,
5444
                                char16_t *utf16_buffer) const noexcept = 0;
5445
5446
  /**
5447
   * Convert valid UTF-8 string into UTF-16BE string.
5448
   *
5449
   * This function assumes that the input string is valid UTF-8.
5450
   *
5451
   * @param input         the UTF-8 string to convert
5452
   * @param length        the length of the string in bytes
5453
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
5454
   * @return the number of written char16_t
5455
   */
5456
  simdutf_warn_unused virtual size_t
5457
  convert_valid_utf8_to_utf16be(const char *input, size_t length,
5458
                                char16_t *utf16_buffer) const noexcept = 0;
5459
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5460
5461
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5462
  /**
5463
   * Convert valid UTF-8 string into UTF-32 string.
5464
   *
5465
   * This function assumes that the input string is valid UTF-8.
5466
   *
5467
   * @param input         the UTF-8 string to convert
5468
   * @param length        the length of the string in bytes
5469
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
5470
   * @return the number of written char32_t
5471
   */
5472
  simdutf_warn_unused virtual size_t
5473
  convert_valid_utf8_to_utf32(const char *input, size_t length,
5474
                              char32_t *utf32_buffer) const noexcept = 0;
5475
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5476
5477
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5478
  /**
5479
   * Compute the number of 2-byte code units that this UTF-8 string would
5480
   * require in UTF-16LE format.
5481
   *
5482
   * This function does not validate the input. It is acceptable to pass invalid
5483
   * UTF-8 strings but in such cases the result is implementation defined.
5484
   *
5485
   * @param input         the UTF-8 string to process
5486
   * @param length        the length of the string in bytes
5487
   * @return the number of char16_t code units required to encode the UTF-8
5488
   * string as UTF-16LE
5489
   */
5490
  simdutf_warn_unused virtual size_t
5491
  utf16_length_from_utf8(const char *input, size_t length) const noexcept = 0;
5492
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5493
5494
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5495
  /**
5496
   * Compute the number of 4-byte code units that this UTF-8 string would
5497
   * require in UTF-32 format.
5498
   *
5499
   * This function is equivalent to count_utf8. It is acceptable to pass invalid
5500
   * UTF-8 strings but in such cases the result is implementation defined.
5501
   *
5502
   * This function does not validate the input.
5503
   *
5504
   * @param input         the UTF-8 string to process
5505
   * @param length        the length of the string in bytes
5506
   * @return the number of char32_t code units required to encode the UTF-8
5507
   * string as UTF-32
5508
   */
5509
  simdutf_warn_unused virtual size_t
5510
  utf32_length_from_utf8(const char *input, size_t length) const noexcept = 0;
5511
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5512
5513
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5514
  /**
5515
   * Convert possibly broken UTF-16LE string into Latin1 string.
5516
   *
5517
   * During the conversion also validation of the input string is done.
5518
   * This function is suitable to work with inputs from untrusted sources.
5519
   *
5520
   * This function is not BOM-aware.
5521
   *
5522
   * @param input         the UTF-16LE string to convert
5523
   * @param length        the length of the string in 2-byte code units
5524
   * (char16_t)
5525
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5526
   * result
5527
   * @return number of written code units; 0 if input is not a valid UTF-16LE
5528
   * string or if it cannot be represented as Latin1
5529
   */
5530
  simdutf_warn_unused virtual size_t
5531
  convert_utf16le_to_latin1(const char16_t *input, size_t length,
5532
                            char *latin1_buffer) const noexcept = 0;
5533
5534
  /**
5535
   * Convert possibly broken UTF-16BE string into Latin1 string.
5536
   *
5537
   * During the conversion also validation of the input string is done.
5538
   * This function is suitable to work with inputs from untrusted sources.
5539
   *
5540
   * This function is not BOM-aware.
5541
   *
5542
   * @param input         the UTF-16BE string to convert
5543
   * @param length        the length of the string in 2-byte code units
5544
   * (char16_t)
5545
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5546
   * result
5547
   * @return number of written code units; 0 if input is not a valid UTF-16BE
5548
   * string or if it cannot be represented as Latin1
5549
   */
5550
  simdutf_warn_unused virtual size_t
5551
  convert_utf16be_to_latin1(const char16_t *input, size_t length,
5552
                            char *latin1_buffer) const noexcept = 0;
5553
5554
  /**
5555
   * Convert possibly broken UTF-16LE string into Latin1 string.
5556
   * If the string cannot be represented as Latin1, an error
5557
   * is returned.
5558
   *
5559
   * During the conversion also validation of the input string is done.
5560
   * This function is suitable to work with inputs from untrusted sources.
5561
   * This function is not BOM-aware.
5562
   *
5563
   * @param input         the UTF-16LE string to convert
5564
   * @param length        the length of the string in 2-byte code units
5565
   * (char16_t)
5566
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5567
   * result
5568
   * @return a result pair struct (of type simdutf::result containing the two
5569
   * fields error and count) with an error code and either position of the error
5570
   * (in the input in code units) if any, or the number of char written if
5571
   * successful.
5572
   */
5573
  simdutf_warn_unused virtual result
5574
  convert_utf16le_to_latin1_with_errors(const char16_t *input, size_t length,
5575
                                        char *latin1_buffer) const noexcept = 0;
5576
5577
  /**
5578
   * Convert possibly broken UTF-16BE string into Latin1 string.
5579
   * If the string cannot be represented as Latin1, an error
5580
   * is returned.
5581
   *
5582
   * During the conversion also validation of the input string is done.
5583
   * This function is suitable to work with inputs from untrusted sources.
5584
   * This function is not BOM-aware.
5585
   *
5586
   * @param input         the UTF-16BE string to convert
5587
   * @param length        the length of the string in 2-byte code units
5588
   * (char16_t)
5589
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5590
   * result
5591
   * @return a result pair struct (of type simdutf::result containing the two
5592
   * fields error and count) with an error code and either position of the error
5593
   * (in the input in code units) if any, or the number of char written if
5594
   * successful.
5595
   */
5596
  simdutf_warn_unused virtual result
5597
  convert_utf16be_to_latin1_with_errors(const char16_t *input, size_t length,
5598
                                        char *latin1_buffer) const noexcept = 0;
5599
5600
  /**
5601
   * Convert valid UTF-16LE string into Latin1 string.
5602
   *
5603
   * This function assumes that the input string is valid UTF-16LE and that it
5604
   * can be represented as Latin1. If you violate this assumption, the result is
5605
   * implementation defined and may include system-dependent behavior such as
5606
   * crashes.
5607
   *
5608
   * This function is for expert users only and not part of our public API. Use
5609
   * convert_utf16le_to_latin1 instead.
5610
   *
5611
   * This function is not BOM-aware.
5612
   *
5613
   * @param input         the UTF-16LE string to convert
5614
   * @param length        the length of the string in 2-byte code units
5615
   * (char16_t)
5616
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5617
   * result
5618
   * @return number of written code units; 0 if conversion is not possible
5619
   */
5620
  simdutf_warn_unused virtual size_t
5621
  convert_valid_utf16le_to_latin1(const char16_t *input, size_t length,
5622
                                  char *latin1_buffer) const noexcept = 0;
5623
5624
  /**
5625
   * Convert valid UTF-16BE string into Latin1 string.
5626
   *
5627
   * This function assumes that the input string is valid UTF-16BE and that it
5628
   * can be represented as Latin1. If you violate this assumption, the result is
5629
   * implementation defined and may include system-dependent behavior such as
5630
   * crashes.
5631
   *
5632
   * This function is for expert users only and not part of our public API. Use
5633
   * convert_utf16be_to_latin1 instead.
5634
   *
5635
   * This function is not BOM-aware.
5636
   *
5637
   * @param input         the UTF-16BE string to convert
5638
   * @param length        the length of the string in 2-byte code units
5639
   * (char16_t)
5640
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5641
   * result
5642
   * @return number of written code units; 0 if conversion is not possible
5643
   */
5644
  simdutf_warn_unused virtual size_t
5645
  convert_valid_utf16be_to_latin1(const char16_t *input, size_t length,
5646
                                  char *latin1_buffer) const noexcept = 0;
5647
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5648
5649
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5650
  /**
5651
   * Convert possibly broken UTF-16LE string into UTF-8 string.
5652
   *
5653
   * During the conversion also validation of the input string is done.
5654
   * This function is suitable to work with inputs from untrusted sources.
5655
   *
5656
   * This function is not BOM-aware.
5657
   *
5658
   * @param input         the UTF-16LE string to convert
5659
   * @param length        the length of the string in 2-byte code units
5660
   * (char16_t)
5661
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5662
   * @return number of written code units; 0 if input is not a valid UTF-16LE
5663
   * string
5664
   */
5665
  simdutf_warn_unused virtual size_t
5666
  convert_utf16le_to_utf8(const char16_t *input, size_t length,
5667
                          char *utf8_buffer) const noexcept = 0;
5668
5669
  /**
5670
   * Convert possibly broken UTF-16BE string into UTF-8 string.
5671
   *
5672
   * During the conversion also validation of the input string is done.
5673
   * This function is suitable to work with inputs from untrusted sources.
5674
   *
5675
   * This function is not BOM-aware.
5676
   *
5677
   * @param input         the UTF-16BE string to convert
5678
   * @param length        the length of the string in 2-byte code units
5679
   * (char16_t)
5680
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5681
   * @return number of written code units; 0 if input is not a valid UTF-16BE
5682
   * string
5683
   */
5684
  simdutf_warn_unused virtual size_t
5685
  convert_utf16be_to_utf8(const char16_t *input, size_t length,
5686
                          char *utf8_buffer) const noexcept = 0;
5687
5688
  /**
5689
   * Convert possibly broken UTF-16LE string into UTF-8 string and stop on
5690
   * error.
5691
   *
5692
   * During the conversion also validation of the input string is done.
5693
   * This function is suitable to work with inputs from untrusted sources.
5694
   *
5695
   * This function is not BOM-aware.
5696
   *
5697
   * @param input         the UTF-16LE string to convert
5698
   * @param length        the length of the string in 2-byte code units
5699
   * (char16_t)
5700
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5701
   * @return a result pair struct (of type simdutf::result containing the two
5702
   * fields error and count) with an error code and either position of the error
5703
   * (in the input in code units) if any, or the number of char written if
5704
   * successful.
5705
   */
5706
  simdutf_warn_unused virtual result
5707
  convert_utf16le_to_utf8_with_errors(const char16_t *input, size_t length,
5708
                                      char *utf8_buffer) const noexcept = 0;
5709
5710
  /**
5711
   * Convert possibly broken UTF-16BE string into UTF-8 string and stop on
5712
   * error.
5713
   *
5714
   * During the conversion also validation of the input string is done.
5715
   * This function is suitable to work with inputs from untrusted sources.
5716
   *
5717
   * This function is not BOM-aware.
5718
   *
5719
   * @param input         the UTF-16BE string to convert
5720
   * @param length        the length of the string in 2-byte code units
5721
   * (char16_t)
5722
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5723
   * @return a result pair struct (of type simdutf::result containing the two
5724
   * fields error and count) with an error code and either position of the error
5725
   * (in the input in code units) if any, or the number of char written if
5726
   * successful.
5727
   */
5728
  simdutf_warn_unused virtual result
5729
  convert_utf16be_to_utf8_with_errors(const char16_t *input, size_t length,
5730
                                      char *utf8_buffer) const noexcept = 0;
5731
5732
  /**
5733
   * Convert possibly broken UTF-16LE string into UTF-8 string, replacing
5734
   * unpaired surrogates with the Unicode replacement character U+FFFD.
5735
   *
5736
   * This function always succeeds: unpaired surrogates are replaced with
5737
   * U+FFFD (3 bytes in UTF-8: 0xEF 0xBF 0xBD).
5738
   *
5739
   * This function is not BOM-aware.
5740
   *
5741
   * @param input         the UTF-16LE string to convert
5742
   * @param length        the length of the string in 2-byte code units
5743
   * (char16_t)
5744
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5745
   * @return number of written code units
5746
   */
5747
  simdutf_warn_unused virtual size_t convert_utf16le_to_utf8_with_replacement(
5748
      const char16_t *input, size_t length,
5749
      char *utf8_buffer) const noexcept = 0;
5750
5751
  /**
5752
   * Convert possibly broken UTF-16BE string into UTF-8 string, replacing
5753
   * unpaired surrogates with the Unicode replacement character U+FFFD.
5754
   *
5755
   * This function always succeeds: unpaired surrogates are replaced with
5756
   * U+FFFD (3 bytes in UTF-8: 0xEF 0xBF 0xBD).
5757
   *
5758
   * This function is not BOM-aware.
5759
   *
5760
   * @param input         the UTF-16BE string to convert
5761
   * @param length        the length of the string in 2-byte code units
5762
   * (char16_t)
5763
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5764
   * @return number of written code units
5765
   */
5766
  simdutf_warn_unused virtual size_t convert_utf16be_to_utf8_with_replacement(
5767
      const char16_t *input, size_t length,
5768
      char *utf8_buffer) const noexcept = 0;
5769
5770
  /**
5771
   * Convert valid UTF-16LE string into UTF-8 string.
5772
   *
5773
   * This function assumes that the input string is valid UTF-16LE.
5774
   *
5775
   * This function is not BOM-aware.
5776
   *
5777
   * @param input         the UTF-16LE string to convert
5778
   * @param length        the length of the string in 2-byte code units
5779
   * (char16_t)
5780
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
5781
   * result
5782
   * @return number of written code units; 0 if conversion is not possible
5783
   */
5784
  simdutf_warn_unused virtual size_t
5785
  convert_valid_utf16le_to_utf8(const char16_t *input, size_t length,
5786
                                char *utf8_buffer) const noexcept = 0;
5787
5788
  /**
5789
   * Convert valid UTF-16BE string into UTF-8 string.
5790
   *
5791
   * This function assumes that the input string is valid UTF-16BE.
5792
   *
5793
   * This function is not BOM-aware.
5794
   *
5795
   * @param input         the UTF-16BE string to convert
5796
   * @param length        the length of the string in 2-byte code units
5797
   * (char16_t)
5798
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
5799
   * result
5800
   * @return number of written code units; 0 if conversion is not possible
5801
   */
5802
  simdutf_warn_unused virtual size_t
5803
  convert_valid_utf16be_to_utf8(const char16_t *input, size_t length,
5804
                                char *utf8_buffer) const noexcept = 0;
5805
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5806
5807
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5808
  /**
5809
   * Convert possibly broken UTF-16LE string into UTF-32 string.
5810
   *
5811
   * During the conversion also validation of the input string is done.
5812
   * This function is suitable to work with inputs from untrusted sources.
5813
   *
5814
   * This function is not BOM-aware.
5815
   *
5816
   * @param input         the UTF-16LE string to convert
5817
   * @param length        the length of the string in 2-byte code units
5818
   * (char16_t)
5819
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
5820
   * @return number of written code units; 0 if input is not a valid UTF-16LE
5821
   * string
5822
   */
5823
  simdutf_warn_unused virtual size_t
5824
  convert_utf16le_to_utf32(const char16_t *input, size_t length,
5825
                           char32_t *utf32_buffer) const noexcept = 0;
5826
5827
  /**
5828
   * Convert possibly broken UTF-16BE string into UTF-32 string.
5829
   *
5830
   * During the conversion also validation of the input string is done.
5831
   * This function is suitable to work with inputs from untrusted sources.
5832
   *
5833
   * This function is not BOM-aware.
5834
   *
5835
   * @param input         the UTF-16BE string to convert
5836
   * @param length        the length of the string in 2-byte code units
5837
   * (char16_t)
5838
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
5839
   * @return number of written code units; 0 if input is not a valid UTF-16BE
5840
   * string
5841
   */
5842
  simdutf_warn_unused virtual size_t
5843
  convert_utf16be_to_utf32(const char16_t *input, size_t length,
5844
                           char32_t *utf32_buffer) const noexcept = 0;
5845
5846
  /**
5847
   * Convert possibly broken UTF-16LE string into UTF-32 string and stop on
5848
   * error.
5849
   *
5850
   * During the conversion also validation of the input string is done.
5851
   * This function is suitable to work with inputs from untrusted sources.
5852
   *
5853
   * This function is not BOM-aware.
5854
   *
5855
   * @param input         the UTF-16LE string to convert
5856
   * @param length        the length of the string in 2-byte code units
5857
   * (char16_t)
5858
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
5859
   * @return a result pair struct (of type simdutf::result containing the two
5860
   * fields error and count) with an error code and either position of the error
5861
   * (in the input in code units) if any, or the number of char32_t written if
5862
   * successful.
5863
   */
5864
  simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(
5865
      const char16_t *input, size_t length,
5866
      char32_t *utf32_buffer) const noexcept = 0;
5867
5868
  /**
5869
   * Convert possibly broken UTF-16BE string into UTF-32 string and stop on
5870
   * error.
5871
   *
5872
   * During the conversion also validation of the input string is done.
5873
   * This function is suitable to work with inputs from untrusted sources.
5874
   *
5875
   * This function is not BOM-aware.
5876
   *
5877
   * @param input         the UTF-16BE string to convert
5878
   * @param length        the length of the string in 2-byte code units
5879
   * (char16_t)
5880
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
5881
   * @return a result pair struct (of type simdutf::result containing the two
5882
   * fields error and count) with an error code and either position of the error
5883
   * (in the input in code units) if any, or the number of char32_t written if
5884
   * successful.
5885
   */
5886
  simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(
5887
      const char16_t *input, size_t length,
5888
      char32_t *utf32_buffer) const noexcept = 0;
5889
5890
  /**
5891
   * Convert valid UTF-16LE string into UTF-32 string.
5892
   *
5893
   * This function assumes that the input string is valid UTF-16LE.
5894
   *
5895
   * This function is not BOM-aware.
5896
   *
5897
   * @param input         the UTF-16LE string to convert
5898
   * @param length        the length of the string in 2-byte code units
5899
   * (char16_t)
5900
   * @param utf32_buffer   the pointer to a buffer that can hold the conversion
5901
   * result
5902
   * @return number of written code units; 0 if conversion is not possible
5903
   */
5904
  simdutf_warn_unused virtual size_t
5905
  convert_valid_utf16le_to_utf32(const char16_t *input, size_t length,
5906
                                 char32_t *utf32_buffer) const noexcept = 0;
5907
5908
  /**
5909
   * Convert valid UTF-16BE string into UTF-32 string.
5910
   *
5911
   * This function assumes that the input string is valid UTF-16BE.
5912
   *
5913
   * This function is not BOM-aware.
5914
   *
5915
   * @param input         the UTF-16BE string to convert
5916
   * @param length        the length of the string in 2-byte code units
5917
   * (char16_t)
5918
   * @param utf32_buffer   the pointer to a buffer that can hold the conversion
5919
   * result
5920
   * @return number of written code units; 0 if conversion is not possible
5921
   */
5922
  simdutf_warn_unused virtual size_t
5923
  convert_valid_utf16be_to_utf32(const char16_t *input, size_t length,
5924
                                 char32_t *utf32_buffer) const noexcept = 0;
5925
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5926
5927
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5928
  /**
5929
   * Compute the number of bytes that this UTF-16LE string would require in
5930
   * UTF-8 format.
5931
   *
5932
   * This function does not validate the input. It is acceptable to pass invalid
5933
   * UTF-16 strings but in such cases the result is implementation defined.
5934
   *
5935
   * This function is not BOM-aware.
5936
   *
5937
   * @param input         the UTF-16LE string to convert
5938
   * @param length        the length of the string in 2-byte code units
5939
   * (char16_t)
5940
   * @return the number of bytes required to encode the UTF-16LE string as UTF-8
5941
   */
5942
  simdutf_warn_unused virtual size_t
5943
  utf8_length_from_utf16le(const char16_t *input,
5944
                           size_t length) const noexcept = 0;
5945
5946
  /**
5947
   * Compute the number of bytes that this UTF-16BE string would require in
5948
   * UTF-8 format.
5949
   *
5950
   * This function does not validate the input. It is acceptable to pass invalid
5951
   * UTF-16 strings but in such cases the result is implementation defined.
5952
   *
5953
   * This function is not BOM-aware.
5954
   *
5955
   * @param input         the UTF-16BE string to convert
5956
   * @param length        the length of the string in 2-byte code units
5957
   * (char16_t)
5958
   * @return the number of bytes required to encode the UTF-16BE string as UTF-8
5959
   */
5960
  simdutf_warn_unused virtual size_t
5961
  utf8_length_from_utf16be(const char16_t *input,
5962
                           size_t length) const noexcept = 0;
5963
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5964
5965
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5966
  /**
5967
   * Convert possibly broken UTF-32 string into Latin1 string.
5968
   *
5969
   * During the conversion also validation of the input string is done.
5970
   * This function is suitable to work with inputs from untrusted sources.
5971
   *
5972
   * This function is not BOM-aware.
5973
   *
5974
   * @param input         the UTF-32 string to convert
5975
   * @param length        the length of the string in 4-byte code units
5976
   * (char32_t)
5977
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5978
   * result
5979
   * @return number of written code units; 0 if input is not a valid UTF-32
5980
   * string
5981
   */
5982
  simdutf_warn_unused virtual size_t
5983
  convert_utf32_to_latin1(const char32_t *input, size_t length,
5984
                          char *latin1_buffer) const noexcept = 0;
5985
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5986
5987
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5988
  /**
5989
   * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
5990
   * If the string cannot be represented as Latin1, an error is returned.
5991
   *
5992
   * During the conversion also validation of the input string is done.
5993
   * This function is suitable to work with inputs from untrusted sources.
5994
   *
5995
   * This function is not BOM-aware.
5996
   *
5997
   * @param input         the UTF-32 string to convert
5998
   * @param length        the length of the string in 4-byte code units
5999
   * (char32_t)
6000
   * @param latin1_buffer   the pointer to buffer that can hold conversion
6001
   * result
6002
   * @return a result pair struct (of type simdutf::result containing the two
6003
   * fields error and count) with an error code and either position of the error
6004
   * (in the input in code units) if any, or the number of char written if
6005
   * successful.
6006
   */
6007
  simdutf_warn_unused virtual result
6008
  convert_utf32_to_latin1_with_errors(const char32_t *input, size_t length,
6009
                                      char *latin1_buffer) const noexcept = 0;
6010
6011
  /**
6012
   * Convert valid UTF-32 string into Latin1 string.
6013
   *
6014
   * This function assumes that the input string is valid UTF-32 and can be
6015
   * represented as Latin1. If you violate this assumption, the result is
6016
   * implementation defined and may include system-dependent behavior such as
6017
   * crashes.
6018
   *
6019
   * This function is for expert users only and not part of our public API. Use
6020
   * convert_utf32_to_latin1 instead.
6021
   *
6022
   * This function is not BOM-aware.
6023
   *
6024
   * @param input         the UTF-32 string to convert
6025
   * @param length        the length of the string in 4-byte code units
6026
   * (char32_t)
6027
   * @param latin1_buffer   the pointer to a buffer that can hold the conversion
6028
   * result
6029
   * @return number of written code units; 0 if conversion is not possible
6030
   */
6031
  simdutf_warn_unused virtual size_t
6032
  convert_valid_utf32_to_latin1(const char32_t *input, size_t length,
6033
                                char *latin1_buffer) const noexcept = 0;
6034
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6035
6036
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
6037
  /**
6038
   * Convert possibly broken UTF-32 string into UTF-8 string.
6039
   *
6040
   * During the conversion also validation of the input string is done.
6041
   * This function is suitable to work with inputs from untrusted sources.
6042
   *
6043
   * This function is not BOM-aware.
6044
   *
6045
   * @param input         the UTF-32 string to convert
6046
   * @param length        the length of the string in 4-byte code units
6047
   * (char32_t)
6048
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
6049
   * @return number of written code units; 0 if input is not a valid UTF-32
6050
   * string
6051
   */
6052
  simdutf_warn_unused virtual size_t
6053
  convert_utf32_to_utf8(const char32_t *input, size_t length,
6054
                        char *utf8_buffer) const noexcept = 0;
6055
6056
  /**
6057
   * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
6058
   *
6059
   * During the conversion also validation of the input string is done.
6060
   * This function is suitable to work with inputs from untrusted sources.
6061
   *
6062
   * This function is not BOM-aware.
6063
   *
6064
   * @param input         the UTF-32 string to convert
6065
   * @param length        the length of the string in 4-byte code units
6066
   * (char32_t)
6067
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
6068
   * @return a result pair struct (of type simdutf::result containing the two
6069
   * fields error and count) with an error code and either position of the error
6070
   * (in the input in code units) if any, or the number of char written if
6071
   * successful.
6072
   */
6073
  simdutf_warn_unused virtual result
6074
  convert_utf32_to_utf8_with_errors(const char32_t *input, size_t length,
6075
                                    char *utf8_buffer) const noexcept = 0;
6076
6077
  /**
6078
   * Convert valid UTF-32 string into UTF-8 string.
6079
   *
6080
   * This function assumes that the input string is valid UTF-32.
6081
   *
6082
   * This function is not BOM-aware.
6083
   *
6084
   * @param input         the UTF-32 string to convert
6085
   * @param length        the length of the string in 4-byte code units
6086
   * (char32_t)
6087
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
6088
   * result
6089
   * @return number of written code units; 0 if conversion is not possible
6090
   */
6091
  simdutf_warn_unused virtual size_t
6092
  convert_valid_utf32_to_utf8(const char32_t *input, size_t length,
6093
                              char *utf8_buffer) const noexcept = 0;
6094
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
6095
6096
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
6097
  /**
6098
   * Return the number of 2-byte code units (char16_t) that a Latin1 string
6099
   * would require in UTF-16 format.
6100
   *
6101
   * The default implementation returns length unchanged.
6102
   *
6103
   * @param length        the length of the Latin1 string in bytes
6104
   * @return the number of char16_t code units required to encode the Latin1
6105
   * string as UTF-16
6106
   */
6107
  simdutf_warn_unused virtual size_t
6108
  utf16_length_from_latin1(size_t length) const noexcept {
6109
    return length;
6110
  }
6111
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
6112
6113
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6114
  /**
6115
   * Convert possibly broken UTF-32 string into UTF-16LE string.
6116
   *
6117
   * During the conversion also validation of the input string is done.
6118
   * This function is suitable to work with inputs from untrusted sources.
6119
   *
6120
   * This function is not BOM-aware.
6121
   *
6122
   * @param input         the UTF-32 string to convert
6123
   * @param length        the length of the string in 4-byte code units
6124
   * (char32_t)
6125
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
6126
   * @return number of written code units; 0 if input is not a valid UTF-32
6127
   * string
6128
   */
6129
  simdutf_warn_unused virtual size_t
6130
  convert_utf32_to_utf16le(const char32_t *input, size_t length,
6131
                           char16_t *utf16_buffer) const noexcept = 0;
6132
6133
  /**
6134
   * Convert possibly broken UTF-32 string into UTF-16BE string.
6135
   *
6136
   * During the conversion also validation of the input string is done.
6137
   * This function is suitable to work with inputs from untrusted sources.
6138
   *
6139
   * This function is not BOM-aware.
6140
   *
6141
   * @param input         the UTF-32 string to convert
6142
   * @param length        the length of the string in 4-byte code units
6143
   * (char32_t)
6144
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
6145
   * @return number of written code units; 0 if input is not a valid UTF-32
6146
   * string
6147
   */
6148
  simdutf_warn_unused virtual size_t
6149
  convert_utf32_to_utf16be(const char32_t *input, size_t length,
6150
                           char16_t *utf16_buffer) const noexcept = 0;
6151
6152
  /**
6153
   * Convert possibly broken UTF-32 string into UTF-16LE string and stop on
6154
   * error.
6155
   *
6156
   * During the conversion also validation of the input string is done.
6157
   * This function is suitable to work with inputs from untrusted sources.
6158
   *
6159
   * This function is not BOM-aware.
6160
   *
6161
   * @param input         the UTF-32 string to convert
6162
   * @param length        the length of the string in 4-byte code units
6163
   * (char32_t)
6164
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
6165
   * @return a result pair struct (of type simdutf::result containing the two
6166
   * fields error and count) with an error code and either position of the error
6167
   * (in the input in code units) if any, or the number of char16_t written if
6168
   * successful.
6169
   */
6170
  simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(
6171
      const char32_t *input, size_t length,
6172
      char16_t *utf16_buffer) const noexcept = 0;
6173
6174
  /**
6175
   * Convert possibly broken UTF-32 string into UTF-16BE string and stop on
6176
   * error.
6177
   *
6178
   * During the conversion also validation of the input string is done.
6179
   * This function is suitable to work with inputs from untrusted sources.
6180
   *
6181
   * This function is not BOM-aware.
6182
   *
6183
   * @param input         the UTF-32 string to convert
6184
   * @param length        the length of the string in 4-byte code units
6185
   * (char32_t)
6186
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
6187
   * @return a result pair struct (of type simdutf::result containing the two
6188
   * fields error and count) with an error code and either position of the error
6189
   * (in the input in code units) if any, or the number of char16_t written if
6190
   * successful.
6191
   */
6192
  simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(
6193
      const char32_t *input, size_t length,
6194
      char16_t *utf16_buffer) const noexcept = 0;
6195
6196
  /**
6197
   * Convert valid UTF-32 string into UTF-16LE string.
6198
   *
6199
   * This function assumes that the input string is valid UTF-32.
6200
   *
6201
   * This function is not BOM-aware.
6202
   *
6203
   * @param input         the UTF-32 string to convert
6204
   * @param length        the length of the string in 4-byte code units
6205
   * (char32_t)
6206
   * @param utf16_buffer   the pointer to a buffer that can hold the conversion
6207
   * result
6208
   * @return number of written code units; 0 if conversion is not possible
6209
   */
6210
  simdutf_warn_unused virtual size_t
6211
  convert_valid_utf32_to_utf16le(const char32_t *input, size_t length,
6212
                                 char16_t *utf16_buffer) const noexcept = 0;
6213
6214
  /**
6215
   * Convert valid UTF-32 string into UTF-16BE string.
6216
   *
6217
   * This function assumes that the input string is valid UTF-32.
6218
   *
6219
   * This function is not BOM-aware.
6220
   *
6221
   * @param input         the UTF-32 string to convert
6222
   * @param length        the length of the string in 4-byte code units
6223
   * (char32_t)
6224
   * @param utf16_buffer   the pointer to a buffer that can hold the conversion
6225
   * result
6226
   * @return number of written code units; 0 if conversion is not possible
6227
   */
6228
  simdutf_warn_unused virtual size_t
6229
  convert_valid_utf32_to_utf16be(const char32_t *input, size_t length,
6230
                                 char16_t *utf16_buffer) const noexcept = 0;
6231
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6232
6233
#if SIMDUTF_FEATURE_UTF16
6234
  /**
6235
   * Change the endianness of the input. Can be used to go from UTF-16LE to
6236
   * UTF-16BE or from UTF-16BE to UTF-16LE.
6237
   *
6238
   * This function does not validate the input.
6239
   *
6240
   * This function is not BOM-aware.
6241
   *
6242
   * @param input         the UTF-16 string to process
6243
   * @param length        the length of the string in 2-byte code units
6244
   * (char16_t)
6245
   * @param output        the pointer to a buffer that can hold the conversion
6246
   * result
6247
   */
6248
  virtual void change_endianness_utf16(const char16_t *input, size_t length,
6249
                                       char16_t *output) const noexcept = 0;
6250
#endif // SIMDUTF_FEATURE_UTF16
6251
6252
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
6253
  /**
6254
   * Return the number of bytes that this Latin1 string would require in UTF-8
6255
   * format.
6256
   *
6257
   * @param input         the Latin1 string to convert
6258
   * @param length        the length of the string in bytes
6259
   * @return the number of bytes required to encode the Latin1 string as UTF-8
6260
   */
6261
  simdutf_warn_unused virtual size_t
6262
  utf8_length_from_latin1(const char *input, size_t length) const noexcept = 0;
6263
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
6264
6265
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
6266
  /**
6267
   * Compute the number of bytes that this UTF-32 string would require in UTF-8
6268
   * format.
6269
   *
6270
   * This function does not validate the input. It is acceptable to pass invalid
6271
   * UTF-32 strings but in such cases the result is implementation defined.
6272
   *
6273
   * @param input         the UTF-32 string to convert
6274
   * @param length        the length of the string in 4-byte code units
6275
   * (char32_t)
6276
   * @return the number of bytes required to encode the UTF-32 string as UTF-8
6277
   */
6278
  simdutf_warn_unused virtual size_t
6279
  utf8_length_from_utf32(const char32_t *input,
6280
                         size_t length) const noexcept = 0;
6281
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
6282
6283
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6284
  /**
6285
   * Compute the number of bytes that this UTF-32 string would require in Latin1
6286
   * format. The default implementation returns length unchanged.
6287
   *
6288
   * This function does not validate the input. It is acceptable to pass invalid
6289
   * UTF-32 strings but in such cases the result is implementation defined.
6290
   *
6291
   * @param length        the length of the string in 4-byte code units
6292
   * (char32_t)
6293
   * @return the number of bytes required to encode the UTF-32 string as Latin1
6294
   */
6295
  simdutf_warn_unused virtual size_t
6296
  latin1_length_from_utf32(size_t length) const noexcept {
6297
    return length;
6298
  }
6299
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6300
6301
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
6302
  /**
6303
   * Compute the number of bytes that this UTF-8 string would require in Latin1
6304
   * format.
6305
   *
6306
   * This function does not validate the input. It is acceptable to pass invalid
6307
   * UTF-8 strings but in such cases the result is implementation defined.
6308
   *
6309
   * @param input         the UTF-8 string to convert
6310
   * @param length        the length of the string in bytes
6311
   * @return the number of bytes required to encode the UTF-8 string as Latin1
6312
   */
6313
  simdutf_warn_unused virtual size_t
6314
  latin1_length_from_utf8(const char *input, size_t length) const noexcept = 0;
6315
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
6316
6317
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
6318
  /**
6319
   * Compute the number of bytes that this UTF-16LE/BE string would require in
6320
   * Latin1 format.
6321
   *
6322
   * This function does not validate the input. It is acceptable to pass invalid
6323
   * UTF-16 strings but in such cases the result is implementation defined.
6324
   *
6325
   * This function is not BOM-aware. It takes only the length, not the string;
6326
   * the default implementation returns length unchanged.
6327
   *
6328
   * @param length        the length of the string in 2-byte code units
6329
   * (char16_t)
6330
   * @return the number of bytes required to encode the UTF-16LE/BE string as
6331
   * Latin1
6332
   */
6333
  simdutf_warn_unused virtual size_t
6334
  latin1_length_from_utf16(size_t length) const noexcept {
6335
    return length;
6336
  }
6337
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
6338
6339
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6340
  /**
6341
   * Compute the number of two-byte code units that this UTF-32 string would
6342
   * require in UTF-16 format.
6343
   *
6344
   * This function does not validate the input. It is acceptable to pass invalid
6345
   * UTF-32 strings but in such cases the result is implementation defined.
6346
   *
6347
   * @param input         the UTF-32 string to convert
6348
   * @param length        the length of the string in 4-byte code units
6349
   * (char32_t)
6350
   * @return the number of 2-byte code units needed to encode it as UTF-16
6351
   */
6352
  simdutf_warn_unused virtual size_t
6353
  utf16_length_from_utf32(const char32_t *input,
6354
                          size_t length) const noexcept = 0;
6355
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6356
6357
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6358
  /**
6359
   * Return the number of 4-byte code units that this Latin1 string would
6360
   * require in UTF-32 format.
6361
   *
6362
   * @param length        the length of the Latin1 string in bytes
6363
   * (one byte per character)
6364
   * @return the number of char32_t code units needed for the UTF-32 encoding
6365
   */
6366
  simdutf_warn_unused virtual size_t
6367
  utf32_length_from_latin1(size_t length) const noexcept {
6368
    return length;
6369
  }
6370
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6371
6372
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6373
  /**
6374
   * Compute the number of 4-byte code units that this UTF-16LE string needs in
6375
   * UTF-32 format.
6376
   *
6377
   * This function is equivalent to count_utf16le.
6378
   *
6379
   * This function does not validate the input. It is acceptable to pass invalid
6380
   * UTF-16 strings but in such cases the result is implementation defined.
6381
   *
6382
   * This function is not BOM-aware.
6383
   *
6384
   * @param input         the UTF-16LE string to convert
6385
   * @param length        the length of the string in 2-byte code units
6386
   * (char16_t)
6387
   * @return the number of char32_t code units required to encode the UTF-16LE
6388
   * string as UTF-32
6389
   */
6390
  simdutf_warn_unused virtual size_t
6391
  utf32_length_from_utf16le(const char16_t *input,
6392
                            size_t length) const noexcept = 0;
6393
6394
  /**
6395
   * Compute the number of 4-byte code units that this UTF-16BE string needs in
6396
   * UTF-32 format.
6397
   *
6398
   * This function is equivalent to count_utf16be.
6399
   *
6400
   * This function does not validate the input. It is acceptable to pass invalid
6401
   * UTF-16 strings but in such cases the result is implementation defined.
6402
   *
6403
   * This function is not BOM-aware.
6404
   *
6405
   * @param input         the UTF-16BE string to convert
6406
   * @param length        the length of the string in 2-byte code units
6407
   * (char16_t)
6408
   * @return the number of char32_t code units required to encode the UTF-16BE
6409
   * string as UTF-32
6410
   */
6411
  simdutf_warn_unused virtual size_t
6412
  utf32_length_from_utf16be(const char16_t *input,
6413
                            size_t length) const noexcept = 0;
6414
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6415
6416
#if SIMDUTF_FEATURE_UTF16
6417
  /**
6418
   * Count the number of code points (characters) in the string assuming that
6419
   * it is valid.
6420
   *
6421
   * This function assumes that the input string is valid UTF-16LE.
6422
   * It is acceptable to pass invalid UTF-16 strings but in such cases
6423
   * the result is implementation defined.
6424
   *
6425
   * This function is not BOM-aware.
6426
   *
6427
   * @param input         the UTF-16LE string to process
6428
   * @param length        the length of the string in 2-byte code units
6429
   * (char16_t)
6430
   * @return number of code points
6431
   */
6432
  simdutf_warn_unused virtual size_t
6433
  count_utf16le(const char16_t *input, size_t length) const noexcept = 0;
6434
6435
  /**
6436
   * Count the number of code points (characters) in the string assuming that
6437
   * it is valid.
6438
   *
6439
   * This function assumes that the input string is valid UTF-16BE.
6440
   * It is acceptable to pass invalid UTF-16 strings but in such cases
6441
   * the result is implementation defined.
6442
   *
6443
   * This function is not BOM-aware.
6444
   *
6445
   * @param input         the UTF-16BE string to process
6446
   * @param length        the length of the string in 2-byte code units
6447
   * (char16_t)
6448
   * @return number of code points
6449
   */
6450
  simdutf_warn_unused virtual size_t
6451
  count_utf16be(const char16_t *input, size_t length) const noexcept = 0;
6452
#endif // SIMDUTF_FEATURE_UTF16
6453
6454
#if SIMDUTF_FEATURE_UTF8
6455
  /**
6456
   * Count the number of code points (characters) in the string assuming that
6457
   * it is valid.
6458
   *
6459
   * This function assumes that the input string is valid UTF-8.
6460
   * It is acceptable to pass invalid UTF-8 strings but in such cases
6461
   * the result is implementation defined.
6462
   *
6463
   * @param input         the UTF-8 string to process
6464
   * @param length        the length of the string in bytes
6465
   * @return number of code points
6466
   */
6467
  simdutf_warn_unused virtual size_t
6468
  count_utf8(const char *input, size_t length) const noexcept = 0;
6469
#endif // SIMDUTF_FEATURE_UTF8
6470
6471
#if SIMDUTF_FEATURE_BASE64
6472
  /**
6473
   * Provide the maximal binary length in bytes given the base64 input.
6474
   * As long as the input does not contain ignorable characters (e.g., ASCII
6475
   * spaces or linefeed characters), the result is exact. In particular, the
6476
   * function checks for padding characters.
6477
   *
6478
   * The function is fast (constant time). It checks up to two characters at
6479
   * the end of the string. The input is not otherwise validated or read.
6480
   *
6481
   * @param input         the base64 input to process
6482
   * @param length        the length of the base64 input in bytes
6483
   * @return maximal number of binary bytes
6484
   */
6485
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
6486
      const char *input, size_t length) const noexcept;
6487
6488
  /**
6489
   * Provide the maximal binary length in bytes given the base64 input.
6490
   * As long as the input does not contain ignorable characters (e.g., ASCII
6491
   * spaces or linefeed characters), the result is exact. In particular, the
6492
   * function checks for padding characters.
6493
   *
6494
   * The function is fast (constant time). It checks up to two characters at
6495
   * the end of the string. The input is not otherwise validated or read.
6496
   *
6497
   * @param input         the base64 input to process, in ASCII stored as 16-bit
6498
   * units
6499
   * @param length        the length of the base64 input in 16-bit units
6500
   * @return maximal number of binary bytes
6501
   */
6502
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
6503
      const char16_t *input, size_t length) const noexcept;
6504
6505
  /**
6506
   * Convert a base64 input to a binary output.
6507
   *
6508
   * This function follows the WHATWG forgiving-base64 format, which means that
6509
   * it will ignore any ASCII spaces in the input. You may provide a padded
6510
   * input (with one or two equal signs at the end) or an unpadded input
6511
   * (without any equal signs at the end).
6512
   *
6513
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
6514
   *
6515
   * This function will fail in case of invalid input. When last_chunk_options =
6516
   * loose, there are two possible reasons for failure: the input contains a
6517
   * number of base64 characters that when divided by 4, leaves a single
6518
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
6519
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
6520
   *
6521
   * You should call this function with a buffer that is at least
6522
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
6523
   * provide that much space, the function may cause a buffer overflow.
6524
   *
6525
   * @param input         the base64 string to process
6526
   * @param length        the length of the string in bytes
6527
   * @param output        the pointer to a buffer that can hold the conversion
6528
   * result (should be at least maximal_binary_length_from_base64(input, length)
6529
   * bytes long).
6530
   * @param options       the base64 options to use, can be base64_default or
6531
   * base64_url, is base64_default by default.
6532
   * @return a result pair struct (of type simdutf::result containing the two
6533
   * fields error and count) with an error code and either position of the error
6534
   * (in the input in bytes) if any, or the number of bytes written if
6535
   * successful.
6536
   */
6537
  simdutf_warn_unused virtual result
6538
  base64_to_binary(const char *input, size_t length, char *output,
6539
                   base64_options options = base64_default,
6540
                   last_chunk_handling_options last_chunk_options =
6541
                       last_chunk_handling_options::loose) const noexcept = 0;
6542
6543
  /**
6544
   * Convert a base64 input to a binary output while returning more details
6545
   * than base64_to_binary.
6546
   *
6547
   * This function follows the WHATWG forgiving-base64 format, which means that
6548
   * it will ignore any ASCII spaces in the input. You may provide a padded
6549
   * input (with one or two equal signs at the end) or an unpadded input
6550
   * (without any equal signs at the end).
6551
   *
6552
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
6553
   *
6554
   * This function will fail in case of invalid input. When last_chunk_options =
6555
   * loose, there are two possible reasons for failure: the input contains a
6556
   * number of base64 characters that when divided by 4, leaves a single
6557
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
6558
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
6559
   *
6560
   * You should call this function with a buffer that is at least
6561
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
6562
   * provide that much space, the function may cause a buffer overflow.
6563
   *
6564
   * @param input         the base64 string to process
6565
   * @param length        the length of the string in bytes
6566
   * @param output        the pointer to a buffer that can hold the conversion
6567
   * result (should be at least maximal_binary_length_from_base64(input, length)
6568
   * bytes long).
6569
   * @param options       the base64 options to use, can be base64_default or
6570
   * base64_url, is base64_default by default.
6571
   * @return a full_result struct (of type simdutf::full_result containing the
6572
   * three fields error, input_count and output_count).
6573
   */
6574
  simdutf_warn_unused virtual full_result base64_to_binary_details(
6575
      const char *input, size_t length, char *output,
6576
      base64_options options = base64_default,
6577
      last_chunk_handling_options last_chunk_options =
6578
          last_chunk_handling_options::loose) const noexcept = 0;
6579
6580
  /**
6581
   * Convert a base64 input to a binary output.
6582
   *
6583
   * This function follows the WHATWG forgiving-base64 format, which means that
6584
   * it will ignore any ASCII spaces in the input. You may provide a padded
6585
   * input (with one or two equal signs at the end) or an unpadded input
6586
   * (without any equal signs at the end).
6587
   *
6588
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
6589
   *
6590
   * This function will fail in case of invalid input. When last_chunk_options =
6591
   * loose, there are two possible reasons for failure: the input contains a
6592
   * number of base64 characters that when divided by 4, leaves a single
6593
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
6594
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
6595
   *
6596
   * You should call this function with a buffer that is at least
6597
   * maximal_binary_length_from_base64(input, length) bytes long. If you
6598
   * fail to provide that much space, the function may cause a buffer overflow.
6599
   *
6600
   * @param input         the base64 string to process, in ASCII stored as
6601
   * 16-bit units
6602
   * @param length        the length of the string in 16-bit units
6603
   * @param output        the pointer to a buffer that can hold the conversion
6604
   * result (should be at least maximal_binary_length_from_base64(input, length)
6605
   * bytes long).
6606
   * @param options       the base64 options to use, can be base64_default or
6607
   * base64_url, is base64_default by default.
6608
   * @return a result pair struct (of type simdutf::result containing the two
6609
   * fields error and count) with an error code and position of the
6610
   * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the
6611
   * number of bytes written if successful.
6612
   */
6613
  simdutf_warn_unused virtual result
6614
  base64_to_binary(const char16_t *input, size_t length, char *output,
6615
                   base64_options options = base64_default,
6616
                   last_chunk_handling_options last_chunk_options =
6617
                       last_chunk_handling_options::loose) const noexcept = 0;
6618
6619
  /**
6620
   * Convert a base64 input to a binary output while returning more details
6621
   * than base64_to_binary.
6622
   *
6623
   * This function follows the WHATWG forgiving-base64 format, which means that
6624
   * it will ignore any ASCII spaces in the input. You may provide a padded
6625
   * input (with one or two equal signs at the end) or an unpadded input
6626
   * (without any equal signs at the end).
6627
   *
6628
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
6629
   *
6630
   * This function will fail in case of invalid input. When last_chunk_options =
6631
   * loose, there are two possible reasons for failure: the input contains a
6632
   * number of base64 characters that when divided by 4, leaves a single
6633
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
6634
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
6635
   *
6636
   * You should call this function with a buffer that is at least
6637
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
6638
   * provide that much space, the function may cause a buffer overflow.
6639
   *
6640
   * @param input         the base64 string to process, as 16-bit units
6640
   * @param length        the length of the string in 16-bit units
6642
   * @param output        the pointer to a buffer that can hold the conversion
6643
   * result (should be at least maximal_binary_length_from_base64(input, length)
6644
   * bytes long).
6645
   * @param options       the base64 options to use, can be base64_default or
6646
   * base64_url, is base64_default by default.
6647
   * @return a full_result struct (of type simdutf::full_result containing the
6648
   * three fields error, input_count and output_count).
6649
   */
6650
  simdutf_warn_unused virtual full_result base64_to_binary_details(
6651
      const char16_t *input, size_t length, char *output,
6652
      base64_options options = base64_default,
6653
      last_chunk_handling_options last_chunk_options =
6654
          last_chunk_handling_options::loose) const noexcept = 0;
6655
6656
  /**
6657
   * Provide the base64 length in bytes given the length of a binary input.
6658
   *
6659
   * @param length        the length of the input in bytes
6660
   * @param options       the base64 options to use, can be base64_default or
6661
   * base64_url, is base64_default by default.
6662
   * @return number of base64 bytes
6663
   */
6664
  simdutf_warn_unused size_t base64_length_from_binary(
6665
      size_t length, base64_options options = base64_default) const noexcept;
6666
6667
  /**
6668
   * Convert a binary input to a base64 output.
6669
   *
6670
   * The default option (simdutf::base64_default) uses the characters `+` and
6671
   * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of
6672
   * the output to ensure that the output length is a multiple of four.
6673
   *
6674
   * The URL option (simdutf::base64_url) uses the characters `-` and `_` as
6675
   * part of its alphabet. No padding is added at the end of the output.
6676
   *
6677
   * This function always succeeds.
6678
   *
6679
   * @param input         the binary to process
6680
   * @param length        the length of the input in bytes
6681
   * @param output        the pointer to a buffer that can hold the conversion
6682
   * result (should be at least base64_length_from_binary(length) bytes long)
6683
   * @param options       the base64 options to use, can be base64_default or
6684
   * base64_url, is base64_default by default.
6685
   * @return number of written bytes, will be equal to
6686
   * base64_length_from_binary(length, options)
6687
   */
6688
  virtual size_t
6689
  binary_to_base64(const char *input, size_t length, char *output,
6690
                   base64_options options = base64_default) const noexcept = 0;
6691
6692
  /**
6693
   * Convert a binary input to a base64 output with lines of given length.
6694
   * Lines are separated by a single linefeed character.
6695
   *
6696
   * The default option (simdutf::base64_default) uses the characters `+` and
6697
   * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of
6698
   * the output to ensure that the output length is a multiple of four.
6699
   *
6700
   * The URL option (simdutf::base64_url) uses the characters `-` and `_` as
6701
   * part of its alphabet. No padding is added at the end of the output.
6702
   *
6703
   * This function always succeeds.
6704
   *
6705
   * @param input         the binary to process
6706
   * @param length        the length of the input in bytes
6707
   * @param output        the pointer to a buffer that can hold the conversion
6708
   * result (should be at least base64_length_from_binary_with_lines(length,
6709
   * options, line_length) bytes long)
6710
   * @param line_length   the length of each line, values smaller than 4 are
6711
   * interpreted as 4
6712
   * @param options       the base64 options to use, can be base64_default or
6713
   * base64_url, is base64_default by default.
6714
   * @return number of written bytes, will be equal to
6715
   * base64_length_from_binary_with_lines(length, options, line_length)
6716
   */
6717
  virtual size_t binary_to_base64_with_lines(
6718
      const char *input, size_t length, char *output,
6719
      size_t line_length = simdutf::default_line_length,
6720
      base64_options options = base64_default) const noexcept = 0;
6721
6722
  /**
6723
   * Find the first occurrence of a character in a string. If the character is
6724
   * not found, return a pointer to the end of the string.
6725
   * @param start        the start of the string
6726
   * @param end          the end of the string
6727
   * @param character    the character to find
6728
   * @return a pointer to the first occurrence of the character in the string,
6729
   * or a pointer to the end of the string if the character is not found.
6730
   *
6731
   */
6732
  virtual const char *find(const char *start, const char *end,
6733
                           char character) const noexcept = 0;
6734
  virtual const char16_t *find(const char16_t *start, const char16_t *end,
6735
                               char16_t character) const noexcept = 0;
6736
#endif // SIMDUTF_FEATURE_BASE64
6737
6738
#ifdef SIMDUTF_INTERNAL_TESTS
6739
  // This method is exported only in developer mode, its purpose
6740
  // is to expose some internal test procedures from the given
6741
  // implementation and then use them through our standard test
6742
  // framework.
6743
  //
6744
  // Regular users should not use it, the tests of the public
6745
  // API are enough.
6746
6747
  struct TestProcedure {
6748
    // display name
6749
    std::string name;
6750
6751
    // test routine run against the given implementation (returns nothing)
6752
    void (*procedure)(const implementation &);
6753
  };
6754
6755
  virtual std::vector<TestProcedure> internal_tests() const;
6756
#endif
6757
6758
protected:
6759
  /** @private Construct an implementation with the given name and description.
6760
   * For subclasses. */
6761
  simdutf_really_inline implementation(const char *name,
6762
                                       const char *description,
6763
                                       uint32_t required_instruction_sets)
6764
      : _name(name), _description(description),
6765
        _required_instruction_sets(required_instruction_sets) {}
6766
6767
protected:
6768
  ~implementation() = default;
6769
6770
private:
6771
  /**
6772
   * The name of this implementation.
6773
   */
6774
  const char *_name;
6775
6776
  /**
6777
   * The description of this implementation.
6778
   */
6779
  const char *_description;
6780
6781
  /**
6782
   * Instruction sets required for this implementation.
6783
   */
6784
  const uint32_t _required_instruction_sets;
6785
};
6786
6787
/** @private */
6788
namespace internal {
6789
6790
/**
6791
 * The list of available implementations compiled into simdutf.
6792
 */
6793
class available_implementation_list {
6794
public:
6795
  /** Get the list of available implementations compiled into simdutf */
6796
  simdutf_really_inline available_implementation_list() {}
6797
  /** Number of implementations */
6798
  size_t size() const noexcept;
6799
  /** STL const begin() iterator */
6800
  const implementation *const *begin() const noexcept;
6801
  /** STL const end() iterator */
6802
  const implementation *const *end() const noexcept;
6803
6804
  /**
6805
   * Get the implementation with the given name.
6806
   *
6807
   * Case sensitive.
6808
   *
6809
   *     const implementation *impl =
6810
   *         simdutf::get_available_implementations()["westmere"];
6811
   *     if (!impl || !impl->supported_by_runtime_system()) { exit(1); }
6812
   *     simdutf::get_active_implementation() = impl;
6813
   *
6814
   * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
6815
   * @return the implementation, or nullptr if no implementation has that name.
6816
   */
6817
  const implementation *operator[](const std::string &name) const noexcept {
    // Linear scan over the compiled-in implementations; first exact
    // (case-sensitive) name match wins.
    const implementation *const *const last = end();
    for (const implementation *const *cursor = begin(); cursor != last;
         ++cursor) {
      const implementation *candidate = *cursor;
      if (candidate->name() == name) {
        return candidate;
      }
    }
    // No implementation carries the requested name.
    return nullptr;
  }
6825
6826
  /**
6827
   * Detect the most advanced implementation supported by the current host.
6828
   *
6829
   * This is used to initialize the implementation on startup.
6830
   *
6831
   *     const implementation *impl =
6832
   *         simdutf::get_available_implementations().detect_best_supported();
6833
   *     simdutf::get_active_implementation() = impl;
6834
   *
6835
   * @return the most advanced supported implementation for the current host, or
6836
   * an implementation that returns UNSUPPORTED_ARCHITECTURE if there is no
6837
   * supported implementation. Will never return nullptr.
6838
   */
6839
  const implementation *detect_best_supported() const noexcept;
6840
};
6841
6842
/**
 * Small wrapper around a pointer to T.
 *
 * When SIMDUTF_NO_THREADS is defined the pointer is stored as a plain T *;
 * otherwise it is stored in a std::atomic<T *> and every read goes through
 * load(). The wrapper converts implicitly to (const) T *, can be dereferenced
 * with * and ->, and can be re-pointed by assigning a raw T *.
 */
template <typename T> class atomic_ptr {
public:
  /** Wrap the given raw pointer (implicit conversion is intentional). */
  atomic_ptr(T *_ptr) noexcept : ptr{_ptr} {}

  /** Read-only access to the currently stored pointer. */
  operator const T *() const noexcept { return get(); }
  const T &operator*() const noexcept { return *get(); }
  const T *operator->() const noexcept { return get(); }

  /** Mutable access to the currently stored pointer. */
  operator T *() noexcept { return get(); }
  T &operator*() noexcept { return *get(); }
  T *operator->() noexcept { return get(); }

  /** Replace the stored pointer; returns *this to allow chaining. */
  atomic_ptr &operator=(T *_ptr) noexcept {
    ptr = _ptr;
    return *this;
  }

private:
  /** Single place where the stored pointer is read, for both build modes. */
  T *get() const noexcept {
#if defined(SIMDUTF_NO_THREADS)
    return ptr;
#else
    return ptr.load();
#endif
  }

#if defined(SIMDUTF_NO_THREADS)
  T *ptr;
#else
  std::atomic<T *> ptr;
#endif
};
6881
6882
class detect_best_supported_implementation_on_first_use;
6883
6884
} // namespace internal
6885
6886
/**
6887
 * The list of available implementations compiled into simdutf.
6888
 */
6889
extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list &
6890
get_available_implementations();
6891
6892
/**
6893
 * The active implementation.
6894
 *
6895
 * Automatically initialized on first use to the most advanced implementation
6896
 * supported by this hardware.
6897
 */
6898
extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> &
6899
get_active_implementation();
6900
6901
} // namespace simdutf
6902
6903
#if SIMDUTF_FEATURE_BASE64
6904
  // this header is not part of the public api
6905
  #include <simdutf/base64_implementation.h>
6906
6907
namespace simdutf {
6908
  #if SIMDUTF_SPAN
6909
/**
6910
 * @brief span overload; the output capacity is binary_output.size()
6911
 * @return a tuple of result and outlen
6912
 */
6913
simdutf_really_inline
6914
    simdutf_constexpr23 simdutf_warn_unused std::tuple<result, std::size_t>
6915
    base64_to_binary_safe(
6916
        const detail::input_span_of_byte_like auto &input,
6917
        detail::output_span_of_byte_like auto &&binary_output,
6918
        base64_options options = base64_default,
6919
        last_chunk_handling_options last_chunk_options = loose,
6920
        bool decode_up_to_bad_char = false) noexcept {
6921
  size_t outlen = binary_output.size();
6922
    #if SIMDUTF_CPLUSPLUS23
6923
  if consteval {
6924
    using CInput = std::decay_t<decltype(*input.data())>;
6925
    static_assert(std::is_same_v<CInput, char>,
6926
                  "sorry, the constexpr implementation is for now limited to "
6927
                  "input of type char");
6928
    using COutput = std::decay_t<decltype(*binary_output.data())>;
6929
    static_assert(std::is_same_v<COutput, char>,
6930
                  "sorry, the constexpr implementation is for now limited to "
6931
                  "output of type char");
6932
    auto r = base64_to_binary_safe_impl(
6933
        input.data(), input.size(), binary_output.data(), outlen, options,
6934
        last_chunk_options, decode_up_to_bad_char);
6935
    return {r, outlen};
6936
  } else
6937
    #endif
6938
  {
6939
    auto r = base64_to_binary_safe_impl<char>(
6940
        reinterpret_cast<const char *>(input.data()), input.size(),
6941
        reinterpret_cast<char *>(binary_output.data()), outlen, options,
6942
        last_chunk_options, decode_up_to_bad_char);
6943
    return {r, outlen};
6944
  }
6945
}
6946
6947
    #if SIMDUTF_SPAN
6948
/**
6949
 * @brief span overload for 16-bit input; capacity is binary_output.size()
6949
 * @return a tuple of result and outlen
6951
 */
6952
simdutf_really_inline
6953
    simdutf_warn_unused simdutf_constexpr23 std::tuple<result, std::size_t>
6954
    base64_to_binary_safe(
6955
        std::span<const char16_t> input,
6956
        detail::output_span_of_byte_like auto &&binary_output,
6957
        base64_options options = base64_default,
6958
        last_chunk_handling_options last_chunk_options = loose,
6959
        bool decode_up_to_bad_char = false) noexcept {
6960
  size_t outlen = binary_output.size();
6961
      #if SIMDUTF_CPLUSPLUS23
6962
  if consteval {
6963
    auto r = base64_to_binary_safe_impl(
6964
        input.data(), input.size(), binary_output.data(), outlen, options,
6965
        last_chunk_options, decode_up_to_bad_char);
6966
    return {r, outlen};
6967
  } else
6968
      #endif
6969
  {
6970
    auto r = base64_to_binary_safe(
6971
        input.data(), input.size(),
6972
        reinterpret_cast<char *>(binary_output.data()), outlen, options,
6973
        last_chunk_options, decode_up_to_bad_char);
6974
    return {r, outlen};
6975
  }
6976
}
6977
    #endif // SIMDUTF_SPAN
6978
6979
  #endif // SIMDUTF_SPAN
6980
} // namespace simdutf
6981
6982
#endif // SIMDUTF_FEATURE_BASE64
6983
6984
#endif // SIMDUTF_IMPLEMENTATION_H