Coverage Report

Created: 2026-01-09 06:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/include/simdutf/implementation.h
Line
Count
Source
1
#ifndef SIMDUTF_IMPLEMENTATION_H
2
#define SIMDUTF_IMPLEMENTATION_H
3
#if !defined(SIMDUTF_NO_THREADS)
4
  #include <atomic>
5
#endif
6
#include <string>
7
#ifdef SIMDUTF_INTERNAL_TESTS
8
  #include <vector>
9
#endif
10
#include "simdutf/common_defs.h"
11
#include "simdutf/compiler_check.h"
12
#include "simdutf/encoding_types.h"
13
#include "simdutf/error.h"
14
#include "simdutf/internal/isadetection.h"
15
16
#if SIMDUTF_SPAN
17
  #include <concepts>
18
  #include <type_traits>
19
  #include <span>
20
  #include <tuple>
21
#endif
22
#if SIMDUTF_CPLUSPLUS17
23
  #include <string_view>
24
#endif
25
// The following defines are conditionally enabled/disabled during amalgamation.
26
// By default all features are enabled, regular code shouldn't check them. Only
27
// when user code really relies on a selected subset, it's good to verify these
28
// flags, like:
29
//
30
//      #if !SIMDUTF_FEATURE_UTF16
31
//      #   error("Please amalgamate simdutf with UTF-16 support")
32
//      #endif
33
//
34
#define SIMDUTF_FEATURE_DETECT_ENCODING 1
35
#define SIMDUTF_FEATURE_ASCII 1
36
#define SIMDUTF_FEATURE_LATIN1 1
37
#define SIMDUTF_FEATURE_UTF8 1
38
#define SIMDUTF_FEATURE_UTF16 1
39
#define SIMDUTF_FEATURE_UTF32 1
40
#define SIMDUTF_FEATURE_BASE64 1
41
42
#if SIMDUTF_CPLUSPLUS23
43
  #include <simdutf/constexpr_ptr.h>
44
#endif
45
46
#if SIMDUTF_SPAN
47
/// helpers placed in namespace detail are not a part of the public API
48
namespace simdutf {
49
namespace detail {
50
/**
51
 * matches a byte, in the many ways C++ allows. note that these
52
 * are all distinct types.
53
 */
54
template <typename T>
55
concept byte_like = std::is_same_v<T, std::byte> ||     //
56
                    std::is_same_v<T, char> ||          //
57
                    std::is_same_v<T, signed char> ||   //
58
                    std::is_same_v<T, unsigned char> || //
59
                    std::is_same_v<T, char8_t>;
60
61
template <typename T>
62
concept is_byte_like = byte_like<std::remove_cvref_t<T>>;
63
64
template <typename T>
65
concept is_pointer = std::is_pointer_v<T>;
66
67
/**
68
 * matches anything that behaves like std::span and points to character-like
69
 * data such as: std::byte, char, unsigned char, signed char, std::int8_t,
70
 * std::uint8_t
71
 */
72
template <typename T>
73
concept input_span_of_byte_like = requires(const T &t) {
74
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
75
  { t.data() } noexcept -> is_pointer;
76
  { *t.data() } noexcept -> is_byte_like;
77
};
78
79
template <typename T>
80
concept is_mutable = !std::is_const_v<std::remove_reference_t<T>>;
81
82
/**
83
 * like input_span_of_byte_like, but for an output span (intended to be
 * written to)
84
 */
85
template <typename T>
86
concept output_span_of_byte_like = requires(T &t) {
87
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
88
  { t.data() } noexcept -> is_pointer;
89
  { *t.data() } noexcept -> is_byte_like;
90
  { *t.data() } noexcept -> is_mutable;
91
};
92
93
/**
94
 * a pointer-like object that, when indexed, yields a byte-like result.
95
 * valid examples: char*, const char*, std::array<char,10>
96
 * invalid examples: int*, std::array<int,10>
97
 */
98
template <class InputPtr>
99
concept indexes_into_byte_like = requires(InputPtr p) {
100
  { std::decay_t<decltype(p[0])>{} } -> simdutf::detail::byte_like;
101
};
102
template <class InputPtr>
103
concept indexes_into_utf16 = requires(InputPtr p) {
104
  { std::decay_t<decltype(p[0])>{} } -> std::same_as<char16_t>;
105
};
106
template <class InputPtr>
107
concept indexes_into_utf32 = requires(InputPtr p) {
108
  { std::decay_t<decltype(p[0])>{} } -> std::same_as<char32_t>;
109
};
110
111
template <class InputPtr>
112
concept index_assignable_from_char = requires(InputPtr p, char s) {
113
  { p[0] = s };
114
};
115
116
/**
117
 * a pointer like object that results in a uint32_t when indexed.
118
 * valid examples: uint32_t*
119
 */
120
template <class InputPtr>
121
concept indexes_into_uint32 = requires(InputPtr p) {
122
  { std::decay_t<decltype(p[0])>{} } -> std::same_as<std::uint32_t>;
123
};
124
} // namespace detail
125
} // namespace simdutf
126
#endif // SIMDUTF_SPAN
127
128
// these includes are needed for constexpr support. they are
129
// not part of the public api.
130
#include <simdutf/scalar/swap_bytes.h>
131
#include <simdutf/scalar/ascii.h>
132
#include <simdutf/scalar/atomic_util.h>
133
#include <simdutf/scalar/latin1.h>
134
#include <simdutf/scalar/latin1_to_utf16/latin1_to_utf16.h>
135
#include <simdutf/scalar/latin1_to_utf32/latin1_to_utf32.h>
136
#include <simdutf/scalar/latin1_to_utf8/latin1_to_utf8.h>
137
#include <simdutf/scalar/utf16.h>
138
#include <simdutf/scalar/utf16_to_latin1/utf16_to_latin1.h>
139
#include <simdutf/scalar/utf16_to_latin1/valid_utf16_to_latin1.h>
140
#include <simdutf/scalar/utf16_to_utf32/utf16_to_utf32.h>
141
#include <simdutf/scalar/utf16_to_utf32/valid_utf16_to_utf32.h>
142
#include <simdutf/scalar/utf16_to_utf8/utf16_to_utf8.h>
143
#include <simdutf/scalar/utf16_to_utf8/valid_utf16_to_utf8.h>
144
#include <simdutf/scalar/utf32.h>
145
#include <simdutf/scalar/utf32_to_latin1/utf32_to_latin1.h>
146
#include <simdutf/scalar/utf32_to_latin1/valid_utf32_to_latin1.h>
147
#include <simdutf/scalar/utf32_to_utf16/utf32_to_utf16.h>
148
#include <simdutf/scalar/utf32_to_utf16/valid_utf32_to_utf16.h>
149
#include <simdutf/scalar/utf32_to_utf8/utf32_to_utf8.h>
150
#include <simdutf/scalar/utf32_to_utf8/valid_utf32_to_utf8.h>
151
#include <simdutf/scalar/utf8.h>
152
#include <simdutf/scalar/utf8_to_latin1/utf8_to_latin1.h>
153
#include <simdutf/scalar/utf8_to_latin1/valid_utf8_to_latin1.h>
154
#include <simdutf/scalar/utf8_to_utf16/utf8_to_utf16.h>
155
#include <simdutf/scalar/utf8_to_utf16/valid_utf8_to_utf16.h>
156
#include <simdutf/scalar/utf8_to_utf32/utf8_to_utf32.h>
157
#include <simdutf/scalar/utf8_to_utf32/valid_utf8_to_utf32.h>
158
159
namespace simdutf {
160
161
constexpr size_t default_line_length =
162
    76; ///< default line length for base64 encoding with lines
163
164
#if SIMDUTF_FEATURE_DETECT_ENCODING
165
/**
166
 * Autodetect the encoding of the input, a single encoding is recommended.
167
 * E.g., the function might return simdutf::encoding_type::UTF8,
168
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
169
 * simdutf::encoding_type::UTF32_LE.
170
 *
171
 * @param input the string to analyze.
172
 * @param length the length of the string in bytes.
173
 * @return the detected encoding type
174
 */
175
simdutf_warn_unused simdutf::encoding_type
176
autodetect_encoding(const char *input, size_t length) noexcept;
177
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
178
0
autodetect_encoding(const uint8_t *input, size_t length) noexcept {
179
0
  return autodetect_encoding(reinterpret_cast<const char *>(input), length);
180
0
}
181
  #if SIMDUTF_SPAN
182
/**
183
 * Autodetect the encoding of the input, a single encoding is recommended.
184
 * E.g., the function might return simdutf::encoding_type::UTF8,
185
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
186
 * simdutf::encoding_type::UTF32_LE.
187
 *
188
 * @param input the string to analyze. can be anything span-like that has a
189
 * data() and size() that points to character data: std::string,
190
 * std::string_view, std::vector<char>, std::span<const std::byte> etc.
191
 * @return the detected encoding type
192
 */
193
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
194
autodetect_encoding(
195
    const detail::input_span_of_byte_like auto &input) noexcept {
196
  return autodetect_encoding(reinterpret_cast<const char *>(input.data()),
197
                             input.size());
198
}
199
  #endif // SIMDUTF_SPAN
200
201
/**
202
 * Autodetect the possible encodings of the input in one pass.
203
 * E.g., if the input might be UTF-16LE or UTF-8, this function returns
204
 * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE).
205
 *
206
 * Overridden by each implementation.
207
 *
208
 * @param input the string to analyze.
209
 * @param length the length of the string in bytes.
210
 * @return the possible encodings, OR-ed together as encoding_type flags
211
 */
212
simdutf_warn_unused int detect_encodings(const char *input,
213
                                         size_t length) noexcept;
214
simdutf_really_inline simdutf_warn_unused int
215
0
detect_encodings(const uint8_t *input, size_t length) noexcept {
216
0
  return detect_encodings(reinterpret_cast<const char *>(input), length);
217
0
}
218
  #if SIMDUTF_SPAN
219
simdutf_really_inline simdutf_warn_unused int
220
detect_encodings(const detail::input_span_of_byte_like auto &input) noexcept {
221
  return detect_encodings(reinterpret_cast<const char *>(input.data()),
222
                          input.size());
223
}
224
  #endif // SIMDUTF_SPAN
225
#endif   // SIMDUTF_FEATURE_DETECT_ENCODING
226
227
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
228
/**
229
 * Validate the UTF-8 string. This function may be best when you expect
230
 * the input to be almost always valid. Otherwise, consider using
231
 * validate_utf8_with_errors.
232
 *
233
 * Overridden by each implementation.
234
 *
235
 * @param buf the UTF-8 string to validate.
236
 * @param len the length of the string in bytes.
237
 * @return true if and only if the string is valid UTF-8.
238
 */
239
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
240
  #if SIMDUTF_SPAN
241
simdutf_constexpr23 simdutf_really_inline simdutf_warn_unused bool
242
validate_utf8(const detail::input_span_of_byte_like auto &input) noexcept {
243
    #if SIMDUTF_CPLUSPLUS23
244
  if consteval {
245
    return scalar::utf8::validate(
246
        detail::constexpr_cast_ptr<uint8_t>(input.data()), input.size());
247
  } else
248
    #endif
249
  {
250
    return validate_utf8(reinterpret_cast<const char *>(input.data()),
251
                         input.size());
252
  }
253
}
254
  #endif // SIMDUTF_SPAN
255
#endif   // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
256
257
#if SIMDUTF_FEATURE_UTF8
258
/**
259
 * Validate the UTF-8 string and stop on error.
260
 *
261
 * Overridden by each implementation.
262
 *
263
 * @param buf the UTF-8 string to validate.
264
 * @param len the length of the string in bytes.
265
 * @return a result pair struct (of type simdutf::result containing the two
266
 * fields error and count) with an error code and either position of the error
267
 * (in the input in code units) if any, or the number of code units validated if
268
 * successful.
269
 */
270
simdutf_warn_unused result validate_utf8_with_errors(const char *buf,
271
                                                     size_t len) noexcept;
272
  #if SIMDUTF_SPAN
273
simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused result
274
validate_utf8_with_errors(
275
    const detail::input_span_of_byte_like auto &input) noexcept {
276
    #if SIMDUTF_CPLUSPLUS23
277
  if consteval {
278
    return scalar::utf8::validate_with_errors(
279
        detail::constexpr_cast_ptr<uint8_t>(input.data()), input.size());
280
  } else
281
    #endif
282
  {
283
    return validate_utf8_with_errors(
284
        reinterpret_cast<const char *>(input.data()), input.size());
285
  }
286
}
287
  #endif // SIMDUTF_SPAN
288
#endif   // SIMDUTF_FEATURE_UTF8
289
290
#if SIMDUTF_FEATURE_ASCII
291
/**
292
 * Validate the ASCII string.
293
 *
294
 * Overridden by each implementation.
295
 *
296
 * @param buf the ASCII string to validate.
297
 * @param len the length of the string in bytes.
298
 * @return true if and only if the string is valid ASCII.
299
 */
300
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept;
301
  #if SIMDUTF_SPAN
302
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
303
validate_ascii(const detail::input_span_of_byte_like auto &input) noexcept {
304
    #if SIMDUTF_CPLUSPLUS23
305
  if consteval {
306
    return scalar::ascii::validate(
307
        detail::constexpr_cast_ptr<std::uint8_t>(input.data()), input.size());
308
  } else
309
    #endif
310
  {
311
    return validate_ascii(reinterpret_cast<const char *>(input.data()),
312
                          input.size());
313
  }
314
}
315
  #endif // SIMDUTF_SPAN
316
317
/**
318
 * Validate the ASCII string and stop on error. It might be faster than
319
 * validate_ascii when an error is expected to occur early.
320
 *
321
 * Overridden by each implementation.
322
 *
323
 * @param buf the ASCII string to validate.
324
 * @param len the length of the string in bytes.
325
 * @return a result pair struct (of type simdutf::result containing the two
326
 * fields error and count) with an error code and either position of the error
327
 * (in the input in code units) if any, or the number of code units validated if
328
 * successful.
329
 */
330
simdutf_warn_unused result validate_ascii_with_errors(const char *buf,
331
                                                      size_t len) noexcept;
332
  #if SIMDUTF_SPAN
333
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
334
validate_ascii_with_errors(
335
    const detail::input_span_of_byte_like auto &input) noexcept {
336
    #if SIMDUTF_CPLUSPLUS23
337
  if consteval {
338
    return scalar::ascii::validate_with_errors(
339
        detail::constexpr_cast_ptr<std::uint8_t>(input.data()), input.size());
340
  } else
341
    #endif
342
  {
343
    return validate_ascii_with_errors(
344
        reinterpret_cast<const char *>(input.data()), input.size());
345
  }
346
}
347
  #endif // SIMDUTF_SPAN
348
#endif   // SIMDUTF_FEATURE_ASCII
349
350
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
351
/**
352
 * Validate that the UTF-16 string contains only ASCII characters.
353
 * A UTF-16 sequence is considered an ASCII sequence
354
 * if it could be converted to an ASCII string losslessly.
355
 *
356
 * Overridden by each implementation.
357
 *
358
 * @param buf the UTF-16 string to validate.
359
 * @param len the length of the string in 2-byte code units (char16_t).
360
 * @return true if and only if the string is valid ASCII.
361
 */
362
simdutf_warn_unused bool validate_utf16_as_ascii(const char16_t *buf,
363
                                                 size_t len) noexcept;
364
  #if SIMDUTF_SPAN
365
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
366
0
validate_utf16_as_ascii(std::span<const char16_t> input) noexcept {
367
0
    #if SIMDUTF_CPLUSPLUS23
368
0
  if consteval {
369
0
    return scalar::utf16::validate_as_ascii<endianness::NATIVE>(input.data(),
370
0
                                                                input.size());
371
0
  } else
372
0
    #endif
373
0
  {
374
0
    return validate_utf16_as_ascii(input.data(), input.size());
375
0
  }
376
0
}
377
  #endif // SIMDUTF_SPAN
378
379
/**
380
 * Validate that the UTF-16BE string contains only ASCII characters.
381
 * A UTF-16 sequence is considered an ASCII sequence
382
 * if it could be converted to an ASCII string losslessly.
383
 *
384
 * Overridden by each implementation.
385
 *
386
 * @param buf the UTF-16BE string to validate.
387
 * @param len the length of the string in 2-byte code units (char16_t).
388
 * @return true if and only if the string is valid ASCII.
389
 */
390
simdutf_warn_unused bool validate_utf16be_as_ascii(const char16_t *buf,
391
                                                   size_t len) noexcept;
392
  #if SIMDUTF_SPAN
393
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
394
0
validate_utf16be_as_ascii(std::span<const char16_t> input) noexcept {
395
0
    #if SIMDUTF_CPLUSPLUS23
396
0
  if consteval {
397
0
    return scalar::utf16::validate_as_ascii<endianness::BIG>(input.data(),
398
0
                                                             input.size());
399
0
  } else
400
0
    #endif
401
0
  {
402
0
    return validate_utf16be_as_ascii(input.data(), input.size());
403
0
  }
404
0
}
405
  #endif // SIMDUTF_SPAN
406
407
/**
408
 * Validate that the UTF-16LE string contains only ASCII characters.
409
 * A UTF-16 sequence is considered an ASCII sequence
410
 * if it could be converted to an ASCII string losslessly.
411
 *
412
 * Overridden by each implementation.
413
 *
414
 * @param buf the UTF-16LE string to validate.
415
 * @param len the length of the string in 2-byte code units (char16_t).
416
 * @return true if and only if the string is valid ASCII.
417
 */
418
simdutf_warn_unused bool validate_utf16le_as_ascii(const char16_t *buf,
419
                                                   size_t len) noexcept;
420
  #if SIMDUTF_SPAN
421
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
422
0
validate_utf16le_as_ascii(std::span<const char16_t> input) noexcept {
423
0
    #if SIMDUTF_CPLUSPLUS23
424
0
  if consteval {
425
0
    return scalar::utf16::validate_as_ascii<endianness::LITTLE>(input.data(),
426
0
                                                                input.size());
427
0
  } else
428
0
    #endif
429
0
  {
430
0
    return validate_utf16le_as_ascii(input.data(), input.size());
431
0
  }
432
0
}
433
  #endif // SIMDUTF_SPAN
434
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
435
436
#if SIMDUTF_FEATURE_UTF16
437
/**
438
 * Using native endianness, validate the UTF-16 string.
439
 * This function may be best when you expect the input to be almost always
440
 * valid. Otherwise, consider using validate_utf16_with_errors.
441
 *
442
 * Overridden by each implementation.
443
 *
444
 * This function is not BOM-aware.
445
 *
446
 * @param buf the UTF-16 string to validate.
447
 * @param len the length of the string in number of 2-byte code units
448
 * (char16_t).
449
 * @return true if and only if the string is valid UTF-16.
450
 */
451
simdutf_warn_unused bool validate_utf16(const char16_t *buf,
452
                                        size_t len) noexcept;
453
  #if SIMDUTF_SPAN
454
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
455
0
validate_utf16(std::span<const char16_t> input) noexcept {
456
0
    #if SIMDUTF_CPLUSPLUS23
457
0
  if consteval {
458
0
    return scalar::utf16::validate<endianness::NATIVE>(input.data(),
459
0
                                                       input.size());
460
0
  } else
461
0
    #endif
462
0
  {
463
0
    return validate_utf16(input.data(), input.size());
464
0
  }
465
0
}
466
  #endif // SIMDUTF_SPAN
467
#endif   // SIMDUTF_FEATURE_UTF16
468
469
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
470
/**
471
 * Validate the UTF-16LE string. This function may be best when you expect
472
 * the input to be almost always valid. Otherwise, consider using
473
 * validate_utf16le_with_errors.
474
 *
475
 * Overridden by each implementation.
476
 *
477
 * This function is not BOM-aware.
478
 *
479
 * @param buf the UTF-16LE string to validate.
480
 * @param len the length of the string in number of 2-byte code units
481
 * (char16_t).
482
 * @return true if and only if the string is valid UTF-16LE.
483
 */
484
simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
485
                                          size_t len) noexcept;
486
  #if SIMDUTF_SPAN
487
simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused bool
488
0
validate_utf16le(std::span<const char16_t> input) noexcept {
489
0
    #if SIMDUTF_CPLUSPLUS23
490
0
  if consteval {
491
0
    return scalar::utf16::validate<endianness::LITTLE>(input.data(),
492
0
                                                       input.size());
493
0
  } else
494
0
    #endif
495
0
  {
496
0
    return validate_utf16le(input.data(), input.size());
497
0
  }
498
0
}
499
  #endif // SIMDUTF_SPAN
500
#endif   // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
501
502
#if SIMDUTF_FEATURE_UTF16
503
/**
504
 * Validate the UTF-16BE string. This function may be best when you expect
505
 * the input to be almost always valid. Otherwise, consider using
506
 * validate_utf16be_with_errors.
507
 *
508
 * Overridden by each implementation.
509
 *
510
 * This function is not BOM-aware.
511
 *
512
 * @param buf the UTF-16BE string to validate.
513
 * @param len the length of the string in number of 2-byte code units
514
 * (char16_t).
515
 * @return true if and only if the string is valid UTF-16BE.
516
 */
517
simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
518
                                          size_t len) noexcept;
519
  #if SIMDUTF_SPAN
520
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
521
0
validate_utf16be(std::span<const char16_t> input) noexcept {
522
0
    #if SIMDUTF_CPLUSPLUS23
523
0
  if consteval {
524
0
    return scalar::utf16::validate<endianness::BIG>(input.data(), input.size());
525
0
  } else
526
0
    #endif
527
0
  {
528
0
    return validate_utf16be(input.data(), input.size());
529
0
  }
530
0
}
531
  #endif // SIMDUTF_SPAN
532
533
/**
534
 * Using native endianness, validate the UTF-16 string and stop on error.
535
 * It might be faster than validate_utf16 when an error is expected to occur
536
 * early.
537
 *
538
 * Overridden by each implementation.
539
 *
540
 * This function is not BOM-aware.
541
 *
542
 * @param buf the UTF-16 string to validate.
543
 * @param len the length of the string in number of 2-byte code units
544
 * (char16_t).
545
 * @return a result pair struct (of type simdutf::result containing the two
546
 * fields error and count) with an error code and either position of the error
547
 * (in the input in code units) if any, or the number of code units validated if
548
 * successful.
549
 */
550
simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf,
551
                                                      size_t len) noexcept;
552
  #if SIMDUTF_SPAN
553
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
554
0
validate_utf16_with_errors(std::span<const char16_t> input) noexcept {
555
0
    #if SIMDUTF_CPLUSPLUS23
556
0
  if consteval {
557
0
    return scalar::utf16::validate_with_errors<endianness::NATIVE>(
558
0
        input.data(), input.size());
559
0
  } else
560
0
    #endif
561
0
  {
562
0
    return validate_utf16_with_errors(input.data(), input.size());
563
0
  }
564
0
}
565
  #endif // SIMDUTF_SPAN
566
567
/**
568
 * Validate the UTF-16LE string and stop on error. It might be faster than
569
 * validate_utf16le when an error is expected to occur early.
570
 *
571
 * Overridden by each implementation.
572
 *
573
 * This function is not BOM-aware.
574
 *
575
 * @param buf the UTF-16LE string to validate.
576
 * @param len the length of the string in number of 2-byte code units
577
 * (char16_t).
578
 * @return a result pair struct (of type simdutf::result containing the two
579
 * fields error and count) with an error code and either position of the error
580
 * (in the input in code units) if any, or the number of code units validated if
581
 * successful.
582
 */
583
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf,
584
                                                        size_t len) noexcept;
585
  #if SIMDUTF_SPAN
586
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
587
0
validate_utf16le_with_errors(std::span<const char16_t> input) noexcept {
588
0
    #if SIMDUTF_CPLUSPLUS23
589
0
  if consteval {
590
0
    return scalar::utf16::validate_with_errors<endianness::LITTLE>(
591
0
        input.data(), input.size());
592
0
  } else
593
0
    #endif
594
0
  {
595
0
    return validate_utf16le_with_errors(input.data(), input.size());
596
0
  }
597
0
}
598
  #endif // SIMDUTF_SPAN
599
600
/**
601
 * Validate the UTF-16BE string and stop on error. It might be faster than
602
 * validate_utf16be when an error is expected to occur early.
603
 *
604
 * Overridden by each implementation.
605
 *
606
 * This function is not BOM-aware.
607
 *
608
 * @param buf the UTF-16BE string to validate.
609
 * @param len the length of the string in number of 2-byte code units
610
 * (char16_t).
611
 * @return a result pair struct (of type simdutf::result containing the two
612
 * fields error and count) with an error code and either position of the error
613
 * (in the input in code units) if any, or the number of code units validated if
614
 * successful.
615
 */
616
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf,
617
                                                        size_t len) noexcept;
618
  #if SIMDUTF_SPAN
619
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
620
0
validate_utf16be_with_errors(std::span<const char16_t> input) noexcept {
621
0
    #if SIMDUTF_CPLUSPLUS23
622
0
  if consteval {
623
0
    return scalar::utf16::validate_with_errors<endianness::BIG>(input.data(),
624
0
                                                                input.size());
625
0
  } else
626
0
    #endif
627
0
  {
628
0
    return validate_utf16be_with_errors(input.data(), input.size());
629
0
  }
630
0
}
631
  #endif // SIMDUTF_SPAN
632
633
/**
634
 * Fixes an ill-formed UTF-16LE string by replacing mismatched surrogates with
635
 * the Unicode replacement character U+FFFD. If input and output point to
636
 * different memory areas, the procedure copies the string, and it's expected
637
 * that output memory is at least as big as the input. It's also possible to
638
 * set input equal to output, which makes replacement an in-place operation.
639
 *
640
 * @param input the UTF-16LE string to correct.
641
 * @param len the length of the string in number of 2-byte code units
642
 * (char16_t).
643
 * @param output the output buffer.
644
 */
645
void to_well_formed_utf16le(const char16_t *input, size_t len,
646
                            char16_t *output) noexcept;
647
  #if SIMDUTF_SPAN
648
simdutf_really_inline simdutf_constexpr23 void
649
to_well_formed_utf16le(std::span<const char16_t> input,
650
0
                       std::span<char16_t> output) noexcept {
651
0
    #if SIMDUTF_CPLUSPLUS23
652
0
  if consteval {
653
0
    scalar::utf16::to_well_formed_utf16<endianness::LITTLE>(
654
0
        input.data(), input.size(), output.data());
655
0
  } else
656
0
    #endif
657
0
  {
658
0
    to_well_formed_utf16le(input.data(), input.size(), output.data());
659
0
  }
660
0
}
661
  #endif // SIMDUTF_SPAN
662
663
/**
664
 * Fixes an ill-formed UTF-16BE string by replacing mismatched surrogates with
665
 * the Unicode replacement character U+FFFD. If input and output point to
666
 * different memory areas, the procedure copies the string, and it's expected
667
 * that output memory is at least as big as the input. It's also possible to
668
 * set input equal to output, which makes replacement an in-place operation.
669
 *
670
 * @param input the UTF-16BE string to correct.
671
 * @param len the length of the string in number of 2-byte code units
672
 * (char16_t).
673
 * @param output the output buffer.
674
 */
675
void to_well_formed_utf16be(const char16_t *input, size_t len,
676
                            char16_t *output) noexcept;
677
  #if SIMDUTF_SPAN
678
simdutf_really_inline simdutf_constexpr23 void
679
to_well_formed_utf16be(std::span<const char16_t> input,
680
0
                       std::span<char16_t> output) noexcept {
681
0
    #if SIMDUTF_CPLUSPLUS23
682
0
  if consteval {
683
0
    scalar::utf16::to_well_formed_utf16<endianness::BIG>(
684
0
        input.data(), input.size(), output.data());
685
0
  } else
686
0
    #endif
687
0
  {
688
0
    to_well_formed_utf16be(input.data(), input.size(), output.data());
689
0
  }
690
0
}
691
  #endif // SIMDUTF_SPAN
692
693
/**
694
 * Fixes an ill-formed UTF-16 string by replacing mismatched surrogates with the
695
 * Unicode replacement character U+FFFD. If input and output point to different
695
 * memory areas, the procedure copies the string, and it's expected that output
696
 * memory is at least as big as the input. It's also possible to set input equal
697
 * to output, which makes replacement an in-place operation.
699
 *
700
 * @param input the UTF-16 string to correct.
701
 * @param len the length of the string in number of 2-byte code units
702
 * (char16_t).
703
 * @param output the output buffer.
704
 */
705
void to_well_formed_utf16(const char16_t *input, size_t len,
706
                          char16_t *output) noexcept;
707
  #if SIMDUTF_SPAN
708
simdutf_really_inline simdutf_constexpr23 void
709
to_well_formed_utf16(std::span<const char16_t> input,
710
0
                     std::span<char16_t> output) noexcept {
711
0
    #if SIMDUTF_CPLUSPLUS23
712
0
  if consteval {
713
0
    scalar::utf16::to_well_formed_utf16<endianness::NATIVE>(
714
0
        input.data(), input.size(), output.data());
715
0
  } else
716
0
    #endif
717
0
  {
718
0
    to_well_formed_utf16(input.data(), input.size(), output.data());
719
0
  }
720
0
}
721
  #endif // SIMDUTF_SPAN
722
723
#endif // SIMDUTF_FEATURE_UTF16
724
725
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
726
/**
727
 * Validate the UTF-32 string. This function may be best when you expect
728
 * the input to be almost always valid. Otherwise, consider using
729
 * validate_utf32_with_errors.
730
 *
731
 * Overridden by each implementation.
732
 *
733
 * This function is not BOM-aware.
734
 *
735
 * @param buf the UTF-32 string to validate.
736
 * @param len the length of the string in number of 4-byte code units
737
 * (char32_t).
738
 * @return true if and only if the string is valid UTF-32.
739
 */
740
simdutf_warn_unused bool validate_utf32(const char32_t *buf,
741
                                        size_t len) noexcept;
742
  #if SIMDUTF_SPAN
743
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
744
0
validate_utf32(std::span<const char32_t> input) noexcept {
745
0
    #if SIMDUTF_CPLUSPLUS23
746
0
  if consteval {
747
0
    return scalar::utf32::validate(
748
0
        detail::constexpr_cast_ptr<std::uint32_t>(input.data()), input.size());
749
0
  } else
750
0
    #endif
751
0
  {
752
0
    return validate_utf32(input.data(), input.size());
753
0
  }
754
0
}
755
  #endif // SIMDUTF_SPAN
756
#endif   // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
757
758
#if SIMDUTF_FEATURE_UTF32
759
/**
760
 * Validate the UTF-32 string and stop on error. It might be faster than
761
 * validate_utf32 when an error is expected to occur early.
762
 *
763
 * Overridden by each implementation.
764
 *
765
 * This function is not BOM-aware.
766
 *
767
 * @param buf the UTF-32 string to validate.
768
 * @param len the length of the string in number of 4-byte code units
769
 * (char32_t).
770
 * @return a result pair struct (of type simdutf::result containing the two
771
 * fields error and count) with an error code and either position of the error
772
 * (in the input in code units) if any, or the number of code units validated if
773
 * successful.
774
 */
775
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf,
776
                                                      size_t len) noexcept;
777
  #if SIMDUTF_SPAN
778
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
779
0
validate_utf32_with_errors(std::span<const char32_t> input) noexcept {
780
0
    #if SIMDUTF_CPLUSPLUS23
781
0
  if consteval {
782
0
    return scalar::utf32::validate_with_errors(
783
0
        detail::constexpr_cast_ptr<std::uint32_t>(input.data()), input.size());
784
0
  } else
785
0
    #endif
786
0
  {
787
0
    return validate_utf32_with_errors(input.data(), input.size());
788
0
  }
789
0
}
790
  #endif // SIMDUTF_SPAN
791
#endif   // SIMDUTF_FEATURE_UTF32
792
793
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
794
/**
795
 * Convert Latin1 string into UTF-8 string.
796
 *
797
 * This function is suitable to work with inputs from untrusted sources.
798
 *
799
 * @param input         the Latin1 string to convert
800
 * @param length        the length of the string in bytes
801
 * @param utf8_output   the pointer to buffer that can hold conversion result
802
 * @return the number of written char; 0 if conversion is not possible
803
 */
804
simdutf_warn_unused size_t convert_latin1_to_utf8(const char *input,
805
                                                  size_t length,
806
                                                  char *utf8_output) noexcept;
807
  #if SIMDUTF_SPAN
808
/**
 * Byte-like container overload of convert_latin1_to_utf8.
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::latin1_to_utf8::convert is used with the data pointers adapted by
 * detail::constexpr_cast_ptr / detail::constexpr_cast_writeptr. Otherwise both
 * data pointers are reinterpret_cast to char pointers and forwarded to the
 * pointer-based overload.
 *
 * Only utf8_output.data() is used: the size of the output is not consulted
 * here.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
809
convert_latin1_to_utf8(
810
    const detail::input_span_of_byte_like auto &latin1_input,
811
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
812
    #if SIMDUTF_CPLUSPLUS23
813
  if consteval {
814
    return scalar::latin1_to_utf8::convert(
815
        detail::constexpr_cast_ptr<char>(latin1_input.data()),
816
        latin1_input.size(),
817
        detail::constexpr_cast_writeptr<char>(utf8_output.data()));
818
  } else
819
    #endif
820
  {
821
    return convert_latin1_to_utf8(
822
        reinterpret_cast<const char *>(latin1_input.data()),
823
        latin1_input.size(), reinterpret_cast<char *>(utf8_output.data()));
824
  }
825
}
826
  #endif // SIMDUTF_SPAN
827
828
/**
829
 * Convert Latin1 string into UTF-8 string with output limit.
830
 *
831
 * This function is suitable to work with inputs from untrusted sources.
832
 *
833
 * We write as many characters as possible.
834
 *
835
 * @param input         the Latin1 string to convert
836
 * @param length        the length of the string in bytes
837
 * @param utf8_output   the pointer to buffer that can hold conversion result
838
 * @param utf8_len      the maximum output length
839
 * @return the number of written char; 0 if conversion is not possible
840
 */
841
simdutf_warn_unused size_t
842
convert_latin1_to_utf8_safe(const char *input, size_t length, char *utf8_output,
843
                            size_t utf8_len) noexcept;
844
  #if SIMDUTF_SPAN
845
/**
 * Byte-like container overload of convert_latin1_to_utf8_safe.
 *
 * The output limit is taken from utf8_output.size(). When SIMDUTF_CPLUSPLUS23
 * is enabled and the call is constant-evaluated,
 * scalar::latin1_to_utf8::convert_safe_constexpr is used; otherwise both data
 * pointers are reinterpret_cast to char pointers and forwarded to the
 * pointer-based overload, so that every byte-like element type (not only
 * char) is accepted for the input as well as for the output.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
846
convert_latin1_to_utf8_safe(
847
    const detail::input_span_of_byte_like auto &input,
848
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
849
      // implementation note: utf8_output is a forwarding ref to avoid copying
850
      // and allow both lvalues and rvalues. std::span can be copied without
851
      // problems, but std::vector should not, and this function should accept
852
      // both. it will allow using an owning rvalue ref (example: passing a
853
      // temporary std::string) as output, but the caller will quickly find
854
      // out that the data cannot be retrieved from the object in that case.
855
    #if SIMDUTF_CPLUSPLUS23
856
  if consteval {
857
    return scalar::latin1_to_utf8::convert_safe_constexpr(
858
        input.data(), input.size(), utf8_output.data(), utf8_output.size());
859
  } else
860
    #endif
861
  {
862
    return convert_latin1_to_utf8_safe(
863
        reinterpret_cast<const char *>(input.data()), input.size(),
864
        reinterpret_cast<char *>(utf8_output.data()), utf8_output.size());
865
  }
866
}
867
  #endif // SIMDUTF_SPAN
868
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
869
870
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
871
/**
872
 * Convert Latin1 string into UTF-16LE string.
873
 *
874
 * This function is suitable to work with inputs from untrusted sources.
875
 *
876
 * @param input         the Latin1 string to convert
877
 * @param length        the length of the string in bytes
878
 * @param utf16_output  the pointer to buffer that can hold conversion result
879
 * @return the number of written char16_t; 0 if conversion is not possible
880
 */
881
simdutf_warn_unused size_t convert_latin1_to_utf16le(
882
    const char *input, size_t length, char16_t *utf16_output) noexcept;
883
  #if SIMDUTF_SPAN
884
/**
 * Span overload of convert_latin1_to_utf16le.
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::latin1_to_utf16::convert<endianness::LITTLE> is used; otherwise the
 * input data pointer is reinterpret_cast to const char * and the call is
 * forwarded to the pointer-based overload.
 *
 * Only utf16_output.data() is used: the size of the output span is not
 * consulted here.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
885
convert_latin1_to_utf16le(
886
    const detail::input_span_of_byte_like auto &latin1_input,
887
    std::span<char16_t> utf16_output) noexcept {
888
    #if SIMDUTF_CPLUSPLUS23
889
  if consteval {
890
    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(
891
        latin1_input.data(), latin1_input.size(), utf16_output.data());
892
  } else
893
    #endif
894
  {
895
    return convert_latin1_to_utf16le(
896
        reinterpret_cast<const char *>(latin1_input.data()),
897
        latin1_input.size(), utf16_output.data());
898
  }
899
}
900
  #endif // SIMDUTF_SPAN
901
902
/**
903
 * Convert Latin1 string into UTF-16BE string.
904
 *
905
 * This function is suitable to work with inputs from untrusted sources.
906
 *
907
 * @param input         the Latin1 string to convert
908
 * @param length        the length of the string in bytes
909
 * @param utf16_output  the pointer to buffer that can hold conversion result
910
 * @return the number of written char16_t; 0 if conversion is not possible
911
 */
912
simdutf_warn_unused size_t convert_latin1_to_utf16be(
913
    const char *input, size_t length, char16_t *utf16_output) noexcept;
914
  #if SIMDUTF_SPAN
915
/**
 * Span overload of convert_latin1_to_utf16be.
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::latin1_to_utf16::convert<endianness::BIG> is used; otherwise the
 * input data pointer is reinterpret_cast to const char * and the call is
 * forwarded to the pointer-based overload.
 *
 * Only output.data() is used: the size of the output span is not consulted
 * here.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
916
convert_latin1_to_utf16be(const detail::input_span_of_byte_like auto &input,
917
                          std::span<char16_t> output) noexcept {
918
    #if SIMDUTF_CPLUSPLUS23
919
  if consteval {
920
    return scalar::latin1_to_utf16::convert<endianness::BIG>(
921
        input.data(), input.size(), output.data());
922
  } else
923
    #endif
924
  {
925
    return convert_latin1_to_utf16be(
926
        reinterpret_cast<const char *>(input.data()), input.size(),
927
        output.data());
928
  }
929
}
930
  #endif // SIMDUTF_SPAN
931
/**
932
 * Compute the number of bytes that this UTF-16 string would require in Latin1
933
 * format.
934
 *
935
 * @param length        the length of the string in 2-byte code units (char16_t)
936
 * @return the length of the string in Latin1 code units (char) required to
937
 * encode the UTF-16 string as Latin1
938
 */
939
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
940
latin1_length_from_utf16(size_t length) noexcept {
941
  return length; // the argument is returned unchanged
942
}
943
944
/**
945
 * Compute the number of code units that this Latin1 string would require in
946
 * UTF-16 format.
947
 *
948
 * @param length        the length of the string in Latin1 code units (char)
949
 * @return the length of the string in 2-byte code units (char16_t) required to
950
 * encode the Latin1 string as UTF-16
951
 */
952
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
953
0
utf16_length_from_latin1(size_t length) noexcept {
954
0
  return length; // the argument is returned unchanged
955
0
}
956
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
957
958
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
959
/**
960
 * Convert Latin1 string into UTF-32 string.
961
 *
962
 * This function is suitable to work with inputs from untrusted sources.
963
 *
964
 * @param input         the Latin1 string to convert
965
 * @param length        the length of the string in bytes
966
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
967
 * @return the number of written char32_t; 0 if conversion is not possible
968
 */
969
simdutf_warn_unused size_t convert_latin1_to_utf32(
970
    const char *input, size_t length, char32_t *utf32_buffer) noexcept;
971
  #if SIMDUTF_SPAN
972
/**
 * Span overload of convert_latin1_to_utf32.
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::latin1_to_utf32::convert is used; otherwise the input data pointer
 * is reinterpret_cast to const char * and the call is forwarded to the
 * pointer-based overload.
 *
 * Only utf32_output.data() is used: the size of the output span is not
 * consulted here.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
973
convert_latin1_to_utf32(
974
    const detail::input_span_of_byte_like auto &latin1_input,
975
    std::span<char32_t> utf32_output) noexcept {
976
    #if SIMDUTF_CPLUSPLUS23
977
  if consteval {
978
    return scalar::latin1_to_utf32::convert(
979
        latin1_input.data(), latin1_input.size(), utf32_output.data());
980
  } else
981
    #endif
982
  {
983
    return convert_latin1_to_utf32(
984
        reinterpret_cast<const char *>(latin1_input.data()),
985
        latin1_input.size(), utf32_output.data());
986
  }
987
}
988
  #endif // SIMDUTF_SPAN
989
#endif   // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
990
991
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
992
/**
993
 * Convert possibly broken UTF-8 string into latin1 string.
994
 *
995
 * During the conversion also validation of the input string is done.
996
 * This function is suitable to work with inputs from untrusted sources.
997
 *
998
 * @param input         the UTF-8 string to convert
999
 * @param length        the length of the string in bytes
1000
 * @param latin1_output  the pointer to buffer that can hold conversion result
1001
 * @return the number of written char; 0 if the input was not valid UTF-8 string
1002
 * or if it cannot be represented as Latin1
1003
 */
1004
simdutf_warn_unused size_t convert_utf8_to_latin1(const char *input,
1005
                                                  size_t length,
1006
                                                  char *latin1_output) noexcept;
1007
  #if SIMDUTF_SPAN
1008
/**
 * Byte-like container overload of convert_utf8_to_latin1.
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::utf8_to_latin1::convert is used; otherwise both data pointers are
 * reinterpret_cast to char pointers and forwarded to the pointer-based
 * overload.
 *
 * Only output.data() is used: the size of the output is not consulted here.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1009
convert_utf8_to_latin1(
1010
    const detail::input_span_of_byte_like auto &input,
1011
    detail::output_span_of_byte_like auto &&output) noexcept {
1012
    #if SIMDUTF_CPLUSPLUS23
1013
  if consteval {
1014
    return scalar::utf8_to_latin1::convert(input.data(), input.size(),
1015
                                           output.data());
1016
  } else
1017
    #endif
1018
  {
1019
    return convert_utf8_to_latin1(reinterpret_cast<const char *>(input.data()),
1020
                                  input.size(),
1021
                                  reinterpret_cast<char *>(output.data()));
1022
  }
1023
}
1024
  #endif // SIMDUTF_SPAN
1025
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1026
1027
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1028
/**
1029
 * Using native endianness, convert possibly broken UTF-8 string into a UTF-16
1030
 * string.
1031
 *
1032
 * During the conversion also validation of the input string is done.
1033
 * This function is suitable to work with inputs from untrusted sources.
1034
 *
1035
 * @param input         the UTF-8 string to convert
1036
 * @param length        the length of the string in bytes
1037
 * @param utf16_output  the pointer to buffer that can hold conversion result
1038
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
1039
 * string
1040
 */
1041
simdutf_warn_unused size_t convert_utf8_to_utf16(
1042
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1043
  #if SIMDUTF_SPAN
1044
/**
 * Span overload of convert_utf8_to_utf16 (native endianness).
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::utf8_to_utf16::convert<endianness::NATIVE> is used; otherwise the
 * input data pointer is reinterpret_cast to const char * and the call is
 * forwarded to the pointer-based overload.
 *
 * Only output.data() is used: the size of the output span is not consulted
 * here.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1045
convert_utf8_to_utf16(const detail::input_span_of_byte_like auto &input,
1046
                      std::span<char16_t> output) noexcept {
1047
    #if SIMDUTF_CPLUSPLUS23
1048
  if consteval {
1049
    return scalar::utf8_to_utf16::convert<endianness::NATIVE>(
1050
        input.data(), input.size(), output.data());
1051
  } else
1052
    #endif
1053
  {
1054
    return convert_utf8_to_utf16(reinterpret_cast<const char *>(input.data()),
1055
                                 input.size(), output.data());
1056
  }
1057
}
1058
  #endif // SIMDUTF_SPAN
1059
1060
/**
1061
 * Compute the number of bytes that this UTF-16LE string would require in UTF-8
1062
 * format even when the UTF-16LE content contains mismatched surrogates
1063
 * that have to be replaced by the replacement character (0xFFFD).
1064
 *
1065
 * @param input         the UTF-16LE string to convert
1066
 * @param length        the length of the string in 2-byte code units (char16_t)
1067
 * @return a result pair struct (of type simdutf::result containing the two
1068
 * fields error and count) where the count is the number of bytes required to
1069
 * encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS or
1070
 * SURROGATE. The count is correct regardless of the error field.
1071
 * When SURROGATE is returned, it does not indicate an error in the case of this
1072
 * function: it indicates that at least one surrogate has been encountered: the
1073
 * surrogates may be matched or not (thus this function does not validate). If
1074
 * the returned error code is SUCCESS, then the input contains no surrogate, is
1075
 * in the Basic Multilingual Plane, and is necessarily valid.
1076
 */
1077
simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(
1078
    const char16_t *input, size_t length) noexcept;
1079
  #if SIMDUTF_SPAN
1080
/**
 * std::span overload of utf8_length_from_utf16le_with_replacement.
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::utf16::utf8_length_from_utf16_with_replacement<endianness::LITTLE>
 * is used; otherwise the span's data() and size() are forwarded to the
 * pointer-based overload.
 *
 * NOTE(review): the parameter is named valid_utf16_input, yet the
 * documentation of the pointer-based overload states that the function does
 * not validate and accounts for mismatched surrogates; the name looks like a
 * leftover and should not be read as a precondition.
 */
simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused result
1081
utf8_length_from_utf16le_with_replacement(
1082
0
    std::span<const char16_t> valid_utf16_input) noexcept {
1083
0
    #if SIMDUTF_CPLUSPLUS23
1084
0
  if consteval {
1085
0
    return scalar::utf16::utf8_length_from_utf16_with_replacement<
1086
0
        endianness::LITTLE>(valid_utf16_input.data(), valid_utf16_input.size());
1087
0
  } else
1088
0
    #endif
1089
0
  {
1090
0
    return utf8_length_from_utf16le_with_replacement(valid_utf16_input.data(),
1091
0
                                                     valid_utf16_input.size());
1092
0
  }
1093
0
}
1094
  #endif // SIMDUTF_SPAN
1095
1096
/**
1097
 * Compute the number of bytes that this UTF-16BE string would require in UTF-8
1098
 * format even when the UTF-16BE content contains mismatched surrogates
1099
 * that have to be replaced by the replacement character (0xFFFD).
1100
 *
1101
 * @param input         the UTF-16BE string to convert
1102
 * @param length        the length of the string in 2-byte code units (char16_t)
1103
 * @return a result pair struct (of type simdutf::result containing the two
1104
 * fields error and count) where the count is the number of bytes required to
1105
 * encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS or
1106
 * SURROGATE. The count is correct regardless of the error field.
1107
 * When SURROGATE is returned, it does not indicate an error in the case of this
1108
 * function: it indicates that at least one surrogate has been encountered: the
1109
 * surrogates may be matched or not (thus this function does not validate). If
1110
 * the returned error code is SUCCESS, then the input contains no surrogate, is
1111
 * in the Basic Multilingual Plane, and is necessarily valid.
1112
 */
1113
simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(
1114
    const char16_t *input, size_t length) noexcept;
1115
  #if SIMDUTF_SPAN
1116
/**
 * std::span overload of utf8_length_from_utf16be_with_replacement.
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::utf16::utf8_length_from_utf16_with_replacement<endianness::BIG> is
 * used; otherwise the span's data() and size() are forwarded to the
 * pointer-based overload.
 *
 * NOTE(review): the parameter is named valid_utf16_input, yet the
 * documentation of the pointer-based overload states that the function does
 * not validate and accounts for mismatched surrogates; the name looks like a
 * leftover and should not be read as a precondition.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1117
utf8_length_from_utf16be_with_replacement(
1118
0
    std::span<const char16_t> valid_utf16_input) noexcept {
1119
0
    #if SIMDUTF_CPLUSPLUS23
1120
0
  if consteval {
1121
0
    return scalar::utf16::utf8_length_from_utf16_with_replacement<
1122
0
        endianness::BIG>(valid_utf16_input.data(), valid_utf16_input.size());
1123
0
  } else
1124
0
    #endif
1125
0
  {
1126
0
    return utf8_length_from_utf16be_with_replacement(valid_utf16_input.data(),
1127
0
                                                     valid_utf16_input.size());
1128
0
  }
1129
0
}
1130
  #endif // SIMDUTF_SPAN
1131
1132
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1133
1134
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1135
/**
1136
 * Using native endianness, convert a Latin1 string into a UTF-16 string.
1137
 *
1138
 * @param input         the Latin1 string to convert
1139
 * @param length        the length of the string in bytes
1140
 * @param utf16_output  the pointer to buffer that can hold conversion result
1141
 * @return the number of written char16_t.
1142
 */
1143
simdutf_warn_unused size_t convert_latin1_to_utf16(
1144
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1145
  #if SIMDUTF_SPAN
1146
/**
 * Span overload of convert_latin1_to_utf16 (native endianness).
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::latin1_to_utf16::convert<endianness::NATIVE> is used; otherwise the
 * input data pointer is reinterpret_cast to const char * and the call is
 * forwarded to the pointer-based overload.
 *
 * Only output.data() is used: the size of the output span is not consulted
 * here.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1147
convert_latin1_to_utf16(const detail::input_span_of_byte_like auto &input,
1148
                        std::span<char16_t> output) noexcept {
1149
    #if SIMDUTF_CPLUSPLUS23
1150
  if consteval {
1151
    return scalar::latin1_to_utf16::convert<endianness::NATIVE>(
1152
        input.data(), input.size(), output.data());
1153
  } else
1154
    #endif
1155
  {
1156
    return convert_latin1_to_utf16(reinterpret_cast<const char *>(input.data()),
1157
                                   input.size(), output.data());
1158
  }
1159
}
1160
  #endif // SIMDUTF_SPAN
1161
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1162
1163
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1164
/**
1165
 * Convert possibly broken UTF-8 string into UTF-16LE string.
1166
 *
1167
 * During the conversion also validation of the input string is done.
1168
 * This function is suitable to work with inputs from untrusted sources.
1169
 *
1170
 * @param input         the UTF-8 string to convert
1171
 * @param length        the length of the string in bytes
1172
 * @param utf16_output  the pointer to buffer that can hold conversion result
1173
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
1174
 * string
1175
 */
1176
simdutf_warn_unused size_t convert_utf8_to_utf16le(
1177
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1178
  #if SIMDUTF_SPAN
1179
/**
 * Span overload of convert_utf8_to_utf16le.
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::utf8_to_utf16::convert<endianness::LITTLE> is used; otherwise the
 * input data pointer is reinterpret_cast to const char * and the call is
 * forwarded to the pointer-based overload.
 *
 * Only utf16_output.data() is used: the size of the output span is not
 * consulted here.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1180
convert_utf8_to_utf16le(const detail::input_span_of_byte_like auto &utf8_input,
1181
                        std::span<char16_t> utf16_output) noexcept {
1182
    #if SIMDUTF_CPLUSPLUS23
1183
  if consteval {
1184
    return scalar::utf8_to_utf16::convert<endianness::LITTLE>(
1185
        utf8_input.data(), utf8_input.size(), utf16_output.data());
1186
  } else
1187
    #endif
1188
  {
1189
    return convert_utf8_to_utf16le(
1190
        reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1191
        utf16_output.data());
1192
  }
1193
}
1194
  #endif // SIMDUTF_SPAN
1195
1196
/**
1197
 * Convert possibly broken UTF-8 string into UTF-16BE string.
1198
 *
1199
 * During the conversion also validation of the input string is done.
1200
 * This function is suitable to work with inputs from untrusted sources.
1201
 *
1202
 * @param input         the UTF-8 string to convert
1203
 * @param length        the length of the string in bytes
1204
 * @param utf16_output  the pointer to buffer that can hold conversion result
1205
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
1206
 * string
1207
 */
1208
simdutf_warn_unused size_t convert_utf8_to_utf16be(
1209
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1210
  #if SIMDUTF_SPAN
1211
/**
 * Span overload of convert_utf8_to_utf16be.
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::utf8_to_utf16::convert<endianness::BIG> is used; otherwise the
 * input data pointer is reinterpret_cast to const char * and the call is
 * forwarded to the pointer-based overload.
 *
 * Only utf16_output.data() is used: the size of the output span is not
 * consulted here.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1212
convert_utf8_to_utf16be(const detail::input_span_of_byte_like auto &utf8_input,
1213
                        std::span<char16_t> utf16_output) noexcept {
1214
1215
    #if SIMDUTF_CPLUSPLUS23
1216
  if consteval {
1217
    return scalar::utf8_to_utf16::convert<endianness::BIG>(
1218
        utf8_input.data(), utf8_input.size(), utf16_output.data());
1219
  } else
1220
    #endif
1221
  {
1222
    return convert_utf8_to_utf16be(
1223
        reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1224
        utf16_output.data());
1225
  }
1226
}
1227
  #endif // SIMDUTF_SPAN
1228
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1229
1230
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1231
/**
1232
 * Convert possibly broken UTF-8 string into latin1 string with errors.
1233
 * If the string cannot be represented as Latin1, an error
1234
 * code is returned.
1235
 *
1236
 * During the conversion also validation of the input string is done.
1237
 * This function is suitable to work with inputs from untrusted sources.
1238
 *
1239
 * @param input         the UTF-8 string to convert
1240
 * @param length        the length of the string in bytes
1241
 * @param latin1_output  the pointer to buffer that can hold conversion result
1242
 * @return a result pair struct (of type simdutf::result containing the two
1243
 * fields error and count) with an error code and either position of the error
1244
 * (in the input in code units) if any, or the number of char written if
1245
 * successful.
1246
 */
1247
simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
1248
    const char *input, size_t length, char *latin1_output) noexcept;
1249
  #if SIMDUTF_SPAN
1250
/**
 * Byte-like container overload of convert_utf8_to_latin1_with_errors.
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::utf8_to_latin1::convert_with_errors is used; otherwise both data
 * pointers are reinterpret_cast to char pointers and forwarded to the
 * pointer-based overload.
 *
 * Only latin1_output.data() is used: the size of the output is not consulted
 * here.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1251
convert_utf8_to_latin1_with_errors(
1252
    const detail::input_span_of_byte_like auto &utf8_input,
1253
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1254
    #if SIMDUTF_CPLUSPLUS23
1255
  if consteval {
1256
    return scalar::utf8_to_latin1::convert_with_errors(
1257
        utf8_input.data(), utf8_input.size(), latin1_output.data());
1258
  } else
1259
    #endif
1260
  {
1261
    return convert_utf8_to_latin1_with_errors(
1262
        reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1263
        reinterpret_cast<char *>(latin1_output.data()));
1264
  }
1265
}
1266
  #endif // SIMDUTF_SPAN
1267
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1268
1269
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1270
/**
1271
 * Using native endianness, convert possibly broken UTF-8 string into UTF-16
1272
 * string and stop on error.
1273
 *
1274
 * During the conversion also validation of the input string is done.
1275
 * This function is suitable to work with inputs from untrusted sources.
1276
 *
1277
 * @param input         the UTF-8 string to convert
1278
 * @param length        the length of the string in bytes
1279
 * @param utf16_output  the pointer to buffer that can hold conversion result
1280
 * @return a result pair struct (of type simdutf::result containing the two
1281
 * fields error and count) with an error code and either position of the error
1282
 * (in the input in code units) if any, or the number of char16_t written if
1283
 * successful.
1284
 */
1285
simdutf_warn_unused result convert_utf8_to_utf16_with_errors(
1286
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1287
  #if SIMDUTF_SPAN
1288
/**
 * Span overload of convert_utf8_to_utf16_with_errors (native endianness).
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::utf8_to_utf16::convert_with_errors<endianness::NATIVE> is used;
 * otherwise the input data pointer is reinterpret_cast to const char * and
 * the call is forwarded to the pointer-based overload.
 *
 * Only utf16_output.data() is used: the size of the output span is not
 * consulted here.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1289
convert_utf8_to_utf16_with_errors(
1290
    const detail::input_span_of_byte_like auto &utf8_input,
1291
    std::span<char16_t> utf16_output) noexcept {
1292
    #if SIMDUTF_CPLUSPLUS23
1293
  if consteval {
1294
    return scalar::utf8_to_utf16::convert_with_errors<endianness::NATIVE>(
1295
        utf8_input.data(), utf8_input.size(), utf16_output.data());
1296
  } else
1297
    #endif
1298
  {
1299
    return convert_utf8_to_utf16_with_errors(
1300
        reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1301
        utf16_output.data());
1302
  }
1303
}
1304
  #endif // SIMDUTF_SPAN
1305
1306
/**
1307
 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
1308
 *
1309
 * During the conversion also validation of the input string is done.
1310
 * This function is suitable to work with inputs from untrusted sources.
1311
 *
1312
 * @param input         the UTF-8 string to convert
1313
 * @param length        the length of the string in bytes
1314
 * @param utf16_output  the pointer to buffer that can hold conversion result
1315
 * @return a result pair struct (of type simdutf::result containing the two
1316
 * fields error and count) with an error code and either position of the error
1317
 * (in the input in code units) if any, or the number of char16_t written if
1318
 * successful.
1319
 */
1320
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
1321
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1322
  #if SIMDUTF_SPAN
1323
/**
 * Span overload of convert_utf8_to_utf16le_with_errors.
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE> is used;
 * otherwise the input data pointer is reinterpret_cast to const char * and
 * the call is forwarded to the pointer-based overload.
 *
 * Only utf16_output.data() is used: the size of the output span is not
 * consulted here.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1324
convert_utf8_to_utf16le_with_errors(
1325
    const detail::input_span_of_byte_like auto &utf8_input,
1326
    std::span<char16_t> utf16_output) noexcept {
1327
    #if SIMDUTF_CPLUSPLUS23
1328
  if consteval {
1329
    return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(
1330
        utf8_input.data(), utf8_input.size(), utf16_output.data());
1331
  } else
1332
    #endif
1333
  {
1334
    return convert_utf8_to_utf16le_with_errors(
1335
        reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1336
        utf16_output.data());
1337
  }
1338
}
1339
  #endif // SIMDUTF_SPAN
1340
1341
/**
1342
 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
1343
 *
1344
 * During the conversion also validation of the input string is done.
1345
 * This function is suitable to work with inputs from untrusted sources.
1346
 *
1347
 * @param input         the UTF-8 string to convert
1348
 * @param length        the length of the string in bytes
1349
 * @param utf16_output  the pointer to buffer that can hold conversion result
1350
 * @return a result pair struct (of type simdutf::result containing the two
1351
 * fields error and count) with an error code and either position of the error
1352
 * (in the input in code units) if any, or the number of char16_t written if
1353
 * successful.
1354
 */
1355
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
1356
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1357
  #if SIMDUTF_SPAN
1358
/**
 * Span overload of convert_utf8_to_utf16be_with_errors.
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::utf8_to_utf16::convert_with_errors<endianness::BIG> is used;
 * otherwise the input data pointer is reinterpret_cast to const char * and
 * the call is forwarded to the pointer-based overload.
 *
 * Only utf16_output.data() is used: the size of the output span is not
 * consulted here.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1359
convert_utf8_to_utf16be_with_errors(
1360
    const detail::input_span_of_byte_like auto &utf8_input,
1361
    std::span<char16_t> utf16_output) noexcept {
1362
    #if SIMDUTF_CPLUSPLUS23
1363
  if consteval {
1364
    return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(
1365
        utf8_input.data(), utf8_input.size(), utf16_output.data());
1366
  } else
1367
    #endif
1368
  {
1369
    return convert_utf8_to_utf16be_with_errors(
1370
        reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1371
        utf16_output.data());
1372
  }
1373
}
1374
  #endif // SIMDUTF_SPAN
1375
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1376
1377
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1378
/**
1379
 * Convert possibly broken UTF-8 string into UTF-32 string.
1380
 *
1381
 * During the conversion also validation of the input string is done.
1382
 * This function is suitable to work with inputs from untrusted sources.
1383
 *
1384
 * @param input         the UTF-8 string to convert
1385
 * @param length        the length of the string in bytes
1386
 * @param utf32_output  the pointer to buffer that can hold conversion result
1387
 * @return the number of written char32_t; 0 if the input was not valid UTF-8
1388
 * string
1389
 */
1390
simdutf_warn_unused size_t convert_utf8_to_utf32(
1391
    const char *input, size_t length, char32_t *utf32_output) noexcept;
1392
  #if SIMDUTF_SPAN
1393
/**
 * Span overload of convert_utf8_to_utf32.
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::utf8_to_utf32::convert is used; otherwise the input data pointer is
 * reinterpret_cast to const char * and the call is forwarded to the
 * pointer-based overload.
 *
 * Only utf32_output.data() is used: the size of the output span is not
 * consulted here.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1394
convert_utf8_to_utf32(const detail::input_span_of_byte_like auto &utf8_input,
1395
                      std::span<char32_t> utf32_output) noexcept {
1396
    #if SIMDUTF_CPLUSPLUS23
1397
  if consteval {
1398
    return scalar::utf8_to_utf32::convert(utf8_input.data(), utf8_input.size(),
1399
                                          utf32_output.data());
1400
  } else
1401
    #endif
1402
  {
1403
    return convert_utf8_to_utf32(
1404
        reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1405
        utf32_output.data());
1406
  }
1407
}
1408
  #endif // SIMDUTF_SPAN
1409
1410
/**
1411
 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
1412
 *
1413
 * During the conversion also validation of the input string is done.
1414
 * This function is suitable to work with inputs from untrusted sources.
1415
 *
1416
 * @param input         the UTF-8 string to convert
1417
 * @param length        the length of the string in bytes
1418
 * @param utf32_output  the pointer to buffer that can hold conversion result
1419
 * @return a result pair struct (of type simdutf::result containing the two
1420
 * fields error and count) with an error code and either position of the error
1421
 * (in the input in code units) if any, or the number of char32_t written if
1422
 * successful.
1423
 */
1424
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
1425
    const char *input, size_t length, char32_t *utf32_output) noexcept;
1426
  #if SIMDUTF_SPAN
1427
/**
 * Span overload of convert_utf8_to_utf32_with_errors.
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::utf8_to_utf32::convert_with_errors is used; otherwise the input
 * data pointer is reinterpret_cast to const char * and the call is forwarded
 * to the pointer-based overload.
 *
 * Only utf32_output.data() is used: the size of the output span is not
 * consulted here.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1428
convert_utf8_to_utf32_with_errors(
1429
    const detail::input_span_of_byte_like auto &utf8_input,
1430
    std::span<char32_t> utf32_output) noexcept {
1431
    #if SIMDUTF_CPLUSPLUS23
1432
  if consteval {
1433
    return scalar::utf8_to_utf32::convert_with_errors(
1434
        utf8_input.data(), utf8_input.size(), utf32_output.data());
1435
  } else
1436
    #endif
1437
  {
1438
    return convert_utf8_to_utf32_with_errors(
1439
        reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1440
        utf32_output.data());
1441
  }
1442
}
1443
  #endif // SIMDUTF_SPAN
1444
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1445
1446
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1447
/**
1448
 * Convert valid UTF-8 string into latin1 string.
1449
 *
1450
 * This function assumes that the input string is valid UTF-8 and that it can be
1451
 * represented as Latin1. If you violate this assumption, the result is
1452
 * implementation defined and may include system-dependent behavior such as
1453
 * crashes.
1454
 *
1455
 * This function is for expert users only and not part of our public API. Use
1456
 * convert_utf8_to_latin1 instead. The function may be removed from the library
1457
 * in the future.
1458
 *
1459
 * This function is not BOM-aware.
1460
 *
1461
 * @param input         the UTF-8 string to convert
1462
 * @param length        the length of the string in bytes
1463
 * @param latin1_output  the pointer to buffer that can hold conversion result
1464
 * @return the number of written char; 0 if the input was not valid UTF-8 string
1465
 */
1466
simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
1467
    const char *input, size_t length, char *latin1_output) noexcept;
1468
  #if SIMDUTF_SPAN
1469
/**
 * Byte-like container overload of convert_valid_utf8_to_latin1.
 *
 * When SIMDUTF_CPLUSPLUS23 is enabled and the call is constant-evaluated,
 * scalar::utf8_to_latin1::convert_valid is used; otherwise both data pointers
 * are reinterpret_cast to char pointers and forwarded to the pointer-based
 * overload, so that every byte-like element type (not only char) is accepted
 * for the output as well as for the input.
 *
 * Only latin1_output.data() is used: the size of the output is not consulted
 * here.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1470
convert_valid_utf8_to_latin1(
1471
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1472
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1473
    #if SIMDUTF_CPLUSPLUS23
1474
  if consteval {
1475
    return scalar::utf8_to_latin1::convert_valid(
1476
        valid_utf8_input.data(), valid_utf8_input.size(), latin1_output.data());
1477
  } else
1478
    #endif
1479
  {
1480
    return convert_valid_utf8_to_latin1(
1481
        reinterpret_cast<const char *>(valid_utf8_input.data()),
1482
        valid_utf8_input.size(),
        reinterpret_cast<char *>(latin1_output.data()));
1483
  }
1484
}
1485
  #endif // SIMDUTF_SPAN
1486
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1487
1488
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1489
/**
1490
 * Using native endianness, convert valid UTF-8 string into a UTF-16 string.
1491
 *
1492
 * This function assumes that the input string is valid UTF-8.
1493
 *
1494
 * @param input         the UTF-8 string to convert
1495
 * @param length        the length of the string in bytes
1496
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1497
 * @return the number of written char16_t
1498
 */
1499
simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
1500
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1501
  #if SIMDUTF_SPAN
1502
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1503
convert_valid_utf8_to_utf16(
1504
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1505
    std::span<char16_t> utf16_output) noexcept {
1506
    #if SIMDUTF_CPLUSPLUS23
1507
  if consteval {
1508
    return scalar::utf8_to_utf16::convert_valid<endianness::NATIVE>(
1509
        valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data());
1510
  } else
1511
    #endif
1512
  {
1513
    return convert_valid_utf8_to_utf16(
1514
        reinterpret_cast<const char *>(valid_utf8_input.data()),
1515
        valid_utf8_input.size(), utf16_output.data());
1516
  }
1517
}
1518
  #endif // SIMDUTF_SPAN
1519
1520
/**
1521
 * Convert valid UTF-8 string into UTF-16LE string.
1522
 *
1523
 * This function assumes that the input string is valid UTF-8.
1524
 *
1525
 * @param input         the UTF-8 string to convert
1526
 * @param length        the length of the string in bytes
1527
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1528
 * @return the number of written char16_t
1529
 */
1530
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
1531
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1532
  #if SIMDUTF_SPAN
1533
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1534
convert_valid_utf8_to_utf16le(
1535
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1536
    std::span<char16_t> utf16_output) noexcept {
1537
1538
    #if SIMDUTF_CPLUSPLUS23
1539
  if consteval {
1540
    return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
1541
        valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data());
1542
  } else
1543
    #endif
1544
  {
1545
    return convert_valid_utf8_to_utf16le(
1546
        reinterpret_cast<const char *>(valid_utf8_input.data()),
1547
        valid_utf8_input.size(), utf16_output.data());
1548
  }
1549
}
1550
  #endif // SIMDUTF_SPAN
1551
1552
/**
1553
 * Convert valid UTF-8 string into UTF-16BE string.
1554
 *
1555
 * This function assumes that the input string is valid UTF-8.
1556
 *
1557
 * @param input         the UTF-8 string to convert
1558
 * @param length        the length of the string in bytes
1559
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1560
 * @return the number of written char16_t
1561
 */
1562
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
1563
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1564
  #if SIMDUTF_SPAN
1565
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1566
convert_valid_utf8_to_utf16be(
1567
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1568
    std::span<char16_t> utf16_output) noexcept {
1569
    #if SIMDUTF_CPLUSPLUS23
1570
  if consteval {
1571
    return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
1572
        valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data());
1573
  } else
1574
    #endif
1575
  {
1576
    return convert_valid_utf8_to_utf16be(
1577
        reinterpret_cast<const char *>(valid_utf8_input.data()),
1578
        valid_utf8_input.size(), utf16_output.data());
1579
  }
1580
}
1581
  #endif // SIMDUTF_SPAN
1582
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1583
1584
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1585
/**
1586
 * Convert valid UTF-8 string into UTF-32 string.
1587
 *
1588
 * This function assumes that the input string is valid UTF-8.
1589
 *
1590
 * @param input         the UTF-8 string to convert
1591
 * @param length        the length of the string in bytes
1592
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
1593
 * @return the number of written char32_t
1594
 */
1595
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
1596
    const char *input, size_t length, char32_t *utf32_buffer) noexcept;
1597
  #if SIMDUTF_SPAN
1598
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1599
convert_valid_utf8_to_utf32(
1600
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1601
    std::span<char32_t> utf32_output) noexcept {
1602
    #if SIMDUTF_CPLUSPLUS23
1603
  if consteval {
1604
    return scalar::utf8_to_utf32::convert_valid(
1605
        valid_utf8_input.data(), valid_utf8_input.size(), utf32_output.data());
1606
  } else
1607
    #endif
1608
  {
1609
    return convert_valid_utf8_to_utf32(
1610
        reinterpret_cast<const char *>(valid_utf8_input.data()),
1611
        valid_utf8_input.size(), utf32_output.data());
1612
  }
1613
}
1614
  #endif // SIMDUTF_SPAN
1615
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1616
1617
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1618
/**
1619
 * Return the number of bytes that this Latin1 string would require in UTF-8
1620
 * format.
1621
 *
1622
 * @param input         the Latin1 string to convert
1623
 * @param length        the length of the string bytes
1624
 * @return the number of bytes required to encode the Latin1 string as UTF-8
1625
 */
1626
simdutf_warn_unused size_t utf8_length_from_latin1(const char *input,
1627
                                                   size_t length) noexcept;
1628
  #if SIMDUTF_SPAN
1629
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1630
utf8_length_from_latin1(
1631
    const detail::input_span_of_byte_like auto &latin1_input) noexcept {
1632
    #if SIMDUTF_CPLUSPLUS23
1633
  if consteval {
1634
    return scalar::latin1_to_utf8::utf8_length_from_latin1(latin1_input.data(),
1635
                                                           latin1_input.size());
1636
  } else
1637
    #endif
1638
  {
1639
    return utf8_length_from_latin1(
1640
        reinterpret_cast<const char *>(latin1_input.data()),
1641
        latin1_input.size());
1642
  }
1643
}
1644
  #endif // SIMDUTF_SPAN
1645
1646
/**
1647
 * Compute the number of bytes that this UTF-8 string would require in Latin1
1648
 * format.
1649
 *
1650
 * This function does not validate the input. It is acceptable to pass invalid
1651
 * UTF-8 strings but in such cases the result is implementation defined.
1652
 *
1653
 * This function is not BOM-aware.
1654
 *
1655
 * @param input         the UTF-8 string to convert
1656
 * @param length        the length of the string in byte
1657
 * @return the number of bytes required to encode the UTF-8 string as Latin1
1658
 */
1659
simdutf_warn_unused size_t latin1_length_from_utf8(const char *input,
1660
                                                   size_t length) noexcept;
1661
  #if SIMDUTF_SPAN
1662
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1663
latin1_length_from_utf8(
1664
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
1665
    #if SIMDUTF_CPLUSPLUS23
1666
  if consteval {
1667
    return scalar::utf8::count_code_points(valid_utf8_input.data(),
1668
                                           valid_utf8_input.size());
1669
  } else
1670
    #endif
1671
  {
1672
    return latin1_length_from_utf8(
1673
        reinterpret_cast<const char *>(valid_utf8_input.data()),
1674
        valid_utf8_input.size());
1675
  }
1676
}
1677
  #endif // SIMDUTF_SPAN
1678
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1679
1680
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1681
/**
1682
 * Compute the number of 2-byte code units that this UTF-8 string would require
1683
 * in UTF-16LE format.
1684
 *
1685
 * This function does not validate the input. It is acceptable to pass invalid
1686
 * UTF-8 strings but in such cases the result is implementation defined.
1687
 *
1688
 * This function is not BOM-aware.
1689
 *
1690
 * @param input         the UTF-8 string to process
1691
 * @param length        the length of the string in bytes
1692
 * @return the number of char16_t code units required to encode the UTF-8 string
1693
 * as UTF-16LE
1694
 */
1695
simdutf_warn_unused size_t utf16_length_from_utf8(const char *input,
1696
                                                  size_t length) noexcept;
1697
  #if SIMDUTF_SPAN
1698
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1699
utf16_length_from_utf8(
1700
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
1701
    #if SIMDUTF_CPLUSPLUS23
1702
  if consteval {
1703
    return scalar::utf8::utf16_length_from_utf8(valid_utf8_input.data(),
1704
                                                valid_utf8_input.size());
1705
  } else
1706
    #endif
1707
  {
1708
    return utf16_length_from_utf8(
1709
        reinterpret_cast<const char *>(valid_utf8_input.data()),
1710
        valid_utf8_input.size());
1711
  }
1712
}
1713
  #endif // SIMDUTF_SPAN
1714
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1715
1716
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1717
/**
1718
 * Compute the number of 4-byte code units that this UTF-8 string would require
1719
 * in UTF-32 format.
1720
 *
1721
 * This function is equivalent to count_utf8
1722
 *
1723
 * This function does not validate the input. It is acceptable to pass invalid
1724
 * UTF-8 strings but in such cases the result is implementation defined.
1725
 *
1726
 * This function is not BOM-aware.
1727
 *
1728
 * @param input         the UTF-8 string to process
1729
 * @param length        the length of the string in bytes
1730
 * @return the number of char32_t code units required to encode the UTF-8 string
1731
 * as UTF-32
1732
 */
1733
simdutf_warn_unused size_t utf32_length_from_utf8(const char *input,
1734
                                                  size_t length) noexcept;
1735
  #if SIMDUTF_SPAN
1736
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1737
utf32_length_from_utf8(
1738
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
1739
1740
    #if SIMDUTF_CPLUSPLUS23
1741
  if consteval {
1742
    return scalar::utf8::count_code_points(valid_utf8_input.data(),
1743
                                           valid_utf8_input.size());
1744
  } else
1745
    #endif
1746
  {
1747
    return utf32_length_from_utf8(
1748
        reinterpret_cast<const char *>(valid_utf8_input.data()),
1749
        valid_utf8_input.size());
1750
  }
1751
}
1752
  #endif // SIMDUTF_SPAN
1753
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1754
1755
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1756
/**
1757
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1758
 * string.
1759
 *
1760
 * During the conversion also validation of the input string is done.
1761
 * This function is suitable to work with inputs from untrusted sources.
1762
 *
1763
 * This function is not BOM-aware.
1764
 *
1765
 * @param input         the UTF-16 string to convert
1766
 * @param length        the length of the string in 2-byte code units (char16_t)
1767
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1768
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1769
 * string
1770
 */
1771
simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *input,
1772
                                                 size_t length,
1773
                                                 char *utf8_buffer) noexcept;
1774
  #if SIMDUTF_SPAN
1775
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1776
convert_utf16_to_utf8(
1777
    std::span<const char16_t> utf16_input,
1778
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1779
    #if SIMDUTF_CPLUSPLUS23
1780
  if consteval {
1781
    return scalar::utf16_to_utf8::convert<endianness::NATIVE>(
1782
        utf16_input.data(), utf16_input.size(), utf8_output.data());
1783
  } else
1784
    #endif
1785
  {
1786
    return convert_utf16_to_utf8(utf16_input.data(), utf16_input.size(),
1787
                                 reinterpret_cast<char *>(utf8_output.data()));
1788
  }
1789
}
1790
  #endif // SIMDUTF_SPAN
1791
1792
/**
1793
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1794
 * string with output limit.
1795
 *
1796
 * We write as many characters as possible into the output buffer,
1797
 *
1798
 * During the conversion also validation of the input string is done.
1799
 * This function is suitable to work with inputs from untrusted sources.
1800
 *
1801
 * This function is not BOM-aware.
1802
 *
1803
 *
1804
 * @param input         the UTF-16 string to convert
1805
 * @param length        the length of the string in 16-bit code units (char16_t)
1806
 * @param utf8_output   the pointer to buffer that can hold conversion result
1807
 * @param utf8_len      the maximum output length
1808
 * @return the number of written char; 0 if conversion is not possible
1809
 */
1810
simdutf_warn_unused size_t convert_utf16_to_utf8_safe(const char16_t *input,
1811
                                                      size_t length,
1812
                                                      char *utf8_output,
1813
                                                      size_t utf8_len) noexcept;
1814
  #if SIMDUTF_SPAN
1815
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1816
convert_utf16_to_utf8_safe(
1817
    std::span<const char16_t> utf16_input,
1818
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1819
      // implementation note: outputspan is a forwarding ref to avoid copying
1820
      // and allow both lvalues and rvalues. std::span can be copied without
1821
      // problems, but std::vector should not, and this function should accept
1822
      // both. it will allow using an owning rvalue ref (example: passing a
1823
      // temporary std::string) as output, but the user will quickly find out
1824
      // that he has no way of getting the data out of the object in that case.
1825
    #if SIMDUTF_CPLUSPLUS23
1826
  if consteval {
1827
    const full_result r =
1828
        scalar::utf16_to_utf8::convert_with_errors<endianness::NATIVE, true>(
1829
            utf16_input.data(), utf16_input.size(), utf8_output.data(),
1830
            utf8_output.size());
1831
    if (r.error != error_code::SUCCESS &&
1832
        r.error != error_code::OUTPUT_BUFFER_TOO_SMALL) {
1833
      return 0;
1834
    }
1835
    return r.output_count;
1836
  } else
1837
    #endif
1838
  {
1839
    return convert_utf16_to_utf8_safe(
1840
        utf16_input.data(), utf16_input.size(),
1841
        reinterpret_cast<char *>(utf8_output.data()), utf8_output.size());
1842
  }
1843
}
1844
  #endif // SIMDUTF_SPAN
1845
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1846
1847
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1848
/**
1849
 * Using native endianness, convert possibly broken UTF-16 string into Latin1
1850
 * string.
1851
 *
1852
 * During the conversion also validation of the input string is done.
1853
 * This function is suitable to work with inputs from untrusted sources.
1854
 *
1855
 * This function is not BOM-aware.
1856
 *
1857
 * @param input         the UTF-16 string to convert
1858
 * @param length        the length of the string in 2-byte code units (char16_t)
1859
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1860
 * @return number of written code units; 0 if input is not a valid UTF-16 string
1861
 * or if it cannot be represented as Latin1
1862
 */
1863
simdutf_warn_unused size_t convert_utf16_to_latin1(
1864
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1865
  #if SIMDUTF_SPAN
1866
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1867
convert_utf16_to_latin1(
1868
    std::span<const char16_t> utf16_input,
1869
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1870
    #if SIMDUTF_CPLUSPLUS23
1871
  if consteval {
1872
    return scalar::utf16_to_latin1::convert<endianness::NATIVE>(
1873
        utf16_input.data(), utf16_input.size(), latin1_output.data());
1874
  } else
1875
    #endif
1876
  {
1877
    return convert_utf16_to_latin1(
1878
        utf16_input.data(), utf16_input.size(),
1879
        reinterpret_cast<char *>(latin1_output.data()));
1880
  }
1881
}
1882
  #endif // SIMDUTF_SPAN
1883
1884
/**
1885
 * Convert possibly broken UTF-16LE string into Latin1 string.
1886
 * If the string cannot be represented as Latin1, an error
1887
 * is returned.
1888
 *
1889
 * During the conversion also validation of the input string is done.
1890
 * This function is suitable to work with inputs from untrusted sources.
1891
 *
1892
 * This function is not BOM-aware.
1893
 *
1894
 * @param input         the UTF-16LE string to convert
1895
 * @param length        the length of the string in 2-byte code units (char16_t)
1896
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1897
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1898
 * string or if it cannot be represented as Latin1
1899
 */
1900
simdutf_warn_unused size_t convert_utf16le_to_latin1(
1901
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1902
  #if SIMDUTF_SPAN
1903
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1904
convert_utf16le_to_latin1(
1905
    std::span<const char16_t> utf16_input,
1906
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1907
    #if SIMDUTF_CPLUSPLUS23
1908
  if consteval {
1909
    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(
1910
        utf16_input.data(), utf16_input.size(), latin1_output.data());
1911
  } else
1912
    #endif
1913
  {
1914
    return convert_utf16le_to_latin1(
1915
        utf16_input.data(), utf16_input.size(),
1916
        reinterpret_cast<char *>(latin1_output.data()));
1917
  }
1918
}
1919
  #endif // SIMDUTF_SPAN
1920
1921
/**
1922
 * Convert possibly broken UTF-16BE string into Latin1 string.
1923
 *
1924
 * During the conversion also validation of the input string is done.
1925
 * This function is suitable to work with inputs from untrusted sources.
1926
 *
1927
 * This function is not BOM-aware.
1928
 *
1929
 * @param input         the UTF-16BE string to convert
1930
 * @param length        the length of the string in 2-byte code units (char16_t)
1931
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1932
 * @return number of written code units; 0 if input is not a valid UTF-16BE
1933
 * string or if it cannot be represented as Latin1
1934
 */
1935
simdutf_warn_unused size_t convert_utf16be_to_latin1(
1936
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1937
  #if SIMDUTF_SPAN
1938
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1939
convert_utf16be_to_latin1(
1940
    std::span<const char16_t> utf16_input,
1941
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1942
    #if SIMDUTF_CPLUSPLUS23
1943
  if consteval {
1944
    return scalar::utf16_to_latin1::convert<endianness::BIG>(
1945
        utf16_input.data(), utf16_input.size(), latin1_output.data());
1946
  } else
1947
    #endif
1948
  {
1949
    return convert_utf16be_to_latin1(
1950
        utf16_input.data(), utf16_input.size(),
1951
        reinterpret_cast<char *>(latin1_output.data()));
1952
  }
1953
}
1954
  #endif // SIMDUTF_SPAN
1955
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1956
1957
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1958
/**
1959
 * Convert possibly broken UTF-16LE string into UTF-8 string.
1960
 *
1961
 * During the conversion also validation of the input string is done.
1962
 * This function is suitable to work with inputs from untrusted sources.
1963
 *
1964
 * This function is not BOM-aware.
1965
 *
1966
 * @param input         the UTF-16LE string to convert
1967
 * @param length        the length of the string in 2-byte code units (char16_t)
1968
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1969
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1970
 * string
1971
 */
1972
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *input,
1973
                                                   size_t length,
1974
                                                   char *utf8_buffer) noexcept;
1975
  #if SIMDUTF_SPAN
1976
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1977
convert_utf16le_to_utf8(
1978
    std::span<const char16_t> utf16_input,
1979
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1980
    #if SIMDUTF_CPLUSPLUS23
1981
  if consteval {
1982
    return scalar::utf16_to_utf8::convert<endianness::LITTLE>(
1983
        utf16_input.data(), utf16_input.size(), utf8_output.data());
1984
  } else
1985
    #endif
1986
  {
1987
    return convert_utf16le_to_utf8(
1988
        utf16_input.data(), utf16_input.size(),
1989
        reinterpret_cast<char *>(utf8_output.data()));
1990
  }
1991
}
1992
  #endif // SIMDUTF_SPAN
1993
1994
/**
1995
 * Convert possibly broken UTF-16BE string into UTF-8 string.
1996
 *
1997
 * During the conversion also validation of the input string is done.
1998
 * This function is suitable to work with inputs from untrusted sources.
1999
 *
2000
 * This function is not BOM-aware.
2001
 *
2002
 * @param input         the UTF-16BE string to convert
2003
 * @param length        the length of the string in 2-byte code units (char16_t)
2004
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2005
 * @return number of written code units; 0 if input is not a valid UTF-16LE
2006
 * string
2007
 */
2008
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *input,
2009
                                                   size_t length,
2010
                                                   char *utf8_buffer) noexcept;
2011
  #if SIMDUTF_SPAN
2012
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2013
convert_utf16be_to_utf8(
2014
    std::span<const char16_t> utf16_input,
2015
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2016
    #if SIMDUTF_CPLUSPLUS23
2017
  if consteval {
2018
    return scalar::utf16_to_utf8::convert<endianness::BIG>(
2019
        utf16_input.data(), utf16_input.size(), utf8_output.data());
2020
  } else
2021
    #endif
2022
  {
2023
    return convert_utf16be_to_utf8(
2024
        utf16_input.data(), utf16_input.size(),
2025
        reinterpret_cast<char *>(utf8_output.data()));
2026
  }
2027
}
2028
  #endif // SIMDUTF_SPAN
2029
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2030
2031
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2032
/**
2033
 * Using native endianness, convert possibly broken UTF-16 string into Latin1
2034
 * string.
2035
 *
2036
 * During the conversion also validation of the input string is done.
2037
 * This function is suitable to work with inputs from untrusted sources.
2038
 * This function is not BOM-aware.
2039
 *
2040
 * @param input         the UTF-16 string to convert
2041
 * @param length        the length of the string in 2-byte code units (char16_t)
2042
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2043
 * @return a result pair struct (of type simdutf::result containing the two
2044
 * fields error and count) with an error code and either position of the error
2045
 * (in the input in code units) if any, or the number of char written if
2046
 * successful.
2047
 */
2048
simdutf_warn_unused result convert_utf16_to_latin1_with_errors(
2049
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2050
  #if SIMDUTF_SPAN
2051
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2052
convert_utf16_to_latin1_with_errors(
2053
    std::span<const char16_t> utf16_input,
2054
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2055
    #if SIMDUTF_CPLUSPLUS23
2056
  if consteval {
2057
    return scalar::utf16_to_latin1::convert_with_errors<endianness::NATIVE>(
2058
        utf16_input.data(), utf16_input.size(), latin1_output.data());
2059
  } else
2060
    #endif
2061
  {
2062
    return convert_utf16_to_latin1_with_errors(
2063
        utf16_input.data(), utf16_input.size(),
2064
        reinterpret_cast<char *>(latin1_output.data()));
2065
  }
2066
}
2067
  #endif // SIMDUTF_SPAN
2068
2069
/**
2070
 * Convert possibly broken UTF-16LE string into Latin1 string.
2071
 *
2072
 * During the conversion also validation of the input string is done.
2073
 * This function is suitable to work with inputs from untrusted sources.
2074
 * This function is not BOM-aware.
2075
 *
2076
 * @param input         the UTF-16LE string to convert
2077
 * @param length        the length of the string in 2-byte code units (char16_t)
2078
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2079
 * @return a result pair struct (of type simdutf::result containing the two
2080
 * fields error and count) with an error code and either position of the error
2081
 * (in the input in code units) if any, or the number of char written if
2082
 * successful.
2083
 */
2084
simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
2085
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2086
  #if SIMDUTF_SPAN
2087
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2088
convert_utf16le_to_latin1_with_errors(
2089
    std::span<const char16_t> utf16_input,
2090
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2091
    #if SIMDUTF_CPLUSPLUS23
2092
  if consteval {
2093
    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
2094
        utf16_input.data(), utf16_input.size(), latin1_output.data());
2095
  } else
2096
    #endif
2097
  {
2098
    return convert_utf16le_to_latin1_with_errors(
2099
        utf16_input.data(), utf16_input.size(),
2100
        reinterpret_cast<char *>(latin1_output.data()));
2101
  }
2102
}
2103
  #endif // SIMDUTF_SPAN
2104
2105
/**
2106
 * Convert possibly broken UTF-16BE string into Latin1 string.
2107
 * If the string cannot be represented as Latin1, an error
2108
 * is returned.
2109
 *
2110
 * During the conversion also validation of the input string is done.
2111
 * This function is suitable to work with inputs from untrusted sources.
2112
 * This function is not BOM-aware.
2113
 *
2114
 * @param input         the UTF-16BE string to convert
2115
 * @param length        the length of the string in 2-byte code units (char16_t)
2116
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2117
 * @return a result pair struct (of type simdutf::result containing the two
2118
 * fields error and count) with an error code and either position of the error
2119
 * (in the input in code units) if any, or the number of char written if
2120
 * successful.
2121
 */
2122
simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
2123
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2124
  #if SIMDUTF_SPAN
2125
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2126
convert_utf16be_to_latin1_with_errors(
2127
    std::span<const char16_t> utf16_input,
2128
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2129
    #if SIMDUTF_CPLUSPLUS23
2130
  if consteval {
2131
    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
2132
        utf16_input.data(), utf16_input.size(), latin1_output.data());
2133
  } else
2134
    #endif
2135
  {
2136
    return convert_utf16be_to_latin1_with_errors(
2137
        utf16_input.data(), utf16_input.size(),
2138
        reinterpret_cast<char *>(latin1_output.data()));
2139
  }
2140
}
2141
  #endif // SIMDUTF_SPAN
2142
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2143
2144
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2145
/**
2146
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
2147
 * string and stop on error.
2148
 *
2149
 * During the conversion also validation of the input string is done.
2150
 * This function is suitable to work with inputs from untrusted sources.
2151
 *
2152
 * This function is not BOM-aware.
2153
 *
2154
 * @param input         the UTF-16 string to convert
2155
 * @param length        the length of the string in 2-byte code units (char16_t)
2156
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2157
 * @return a result pair struct (of type simdutf::result containing the two
2158
 * fields error and count) with an error code and either position of the error
2159
 * (in the input in code units) if any, or the number of char written if
2160
 * successful.
2161
 */
2162
simdutf_warn_unused result convert_utf16_to_utf8_with_errors(
2163
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2164
  #if SIMDUTF_SPAN
2165
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2166
convert_utf16_to_utf8_with_errors(
2167
    std::span<const char16_t> utf16_input,
2168
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2169
    #if SIMDUTF_CPLUSPLUS23
2170
  if consteval {
2171
    return scalar::utf16_to_utf8::convert_with_errors<endianness::NATIVE>(
2172
        utf16_input.data(), utf16_input.size(), utf8_output.data());
2173
  } else
2174
    #endif
2175
  {
2176
    return convert_utf16_to_utf8_with_errors(
2177
        utf16_input.data(), utf16_input.size(),
2178
        reinterpret_cast<char *>(utf8_output.data()));
2179
  }
2180
}
2181
  #endif // SIMDUTF_SPAN
2182
2183
/**
2184
 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
2185
 *
2186
 * During the conversion also validation of the input string is done.
2187
 * This function is suitable to work with inputs from untrusted sources.
2188
 *
2189
 * This function is not BOM-aware.
2190
 *
2191
 * @param input         the UTF-16LE string to convert
2192
 * @param length        the length of the string in 2-byte code units (char16_t)
2193
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2194
 * @return a result pair struct (of type simdutf::result containing the two
2195
 * fields error and count) with an error code and either position of the error
2196
 * (in the input in code units) if any, or the number of char written if
2197
 * successful.
2198
 */
2199
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
2200
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2201
  #if SIMDUTF_SPAN
2202
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2203
convert_utf16le_to_utf8_with_errors(
2204
    std::span<const char16_t> utf16_input,
2205
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2206
    #if SIMDUTF_CPLUSPLUS23
2207
  if consteval {
2208
    return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
2209
        utf16_input.data(), utf16_input.size(), utf8_output.data());
2210
  } else
2211
    #endif
2212
  {
2213
    return convert_utf16le_to_utf8_with_errors(
2214
        utf16_input.data(), utf16_input.size(),
2215
        reinterpret_cast<char *>(utf8_output.data()));
2216
  }
2217
}
2218
  #endif // SIMDUTF_SPAN
2219
2220
/**
2221
 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
2222
 *
2223
 * During the conversion also validation of the input string is done.
2224
 * This function is suitable to work with inputs from untrusted sources.
2225
 *
2226
 * This function is not BOM-aware.
2227
 *
2228
 * @param input         the UTF-16BE string to convert
2229
 * @param length        the length of the string in 2-byte code units (char16_t)
2230
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2231
 * @return a result pair struct (of type simdutf::result containing the two
2232
 * fields error and count) with an error code and either position of the error
2233
 * (in the input in code units) if any, or the number of char written if
2234
 * successful.
2235
 */
2236
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
2237
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2238
  #if SIMDUTF_SPAN
2239
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2240
convert_utf16be_to_utf8_with_errors(
2241
    std::span<const char16_t> utf16_input,
2242
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2243
    #if SIMDUTF_CPLUSPLUS23
2244
  if consteval {
2245
    return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
2246
        utf16_input.data(), utf16_input.size(), utf8_output.data());
2247
  } else
2248
    #endif
2249
  {
2250
    return convert_utf16be_to_utf8_with_errors(
2251
        utf16_input.data(), utf16_input.size(),
2252
        reinterpret_cast<char *>(utf8_output.data()));
2253
  }
2254
}
2255
  #endif // SIMDUTF_SPAN
2256
2257
/**
2258
 * Using native endianness, convert valid UTF-16 string into UTF-8 string.
2259
 *
2260
 * This function assumes that the input string is valid UTF-16.
2261
 *
2262
 * This function is not BOM-aware.
2263
 *
2264
 * @param input         the UTF-16 string to convert
2265
 * @param length        the length of the string in 2-byte code units (char16_t)
2266
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
2267
 * result
2268
 * @return number of written code units; 0 if conversion is not possible
2269
 */
2270
simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
2271
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2272
  #if SIMDUTF_SPAN
2273
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2274
convert_valid_utf16_to_utf8(
2275
    std::span<const char16_t> valid_utf16_input,
2276
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2277
    #if SIMDUTF_CPLUSPLUS23
2278
  if consteval {
2279
    return scalar::utf16_to_utf8::convert_valid<endianness::NATIVE>(
2280
        valid_utf16_input.data(), valid_utf16_input.size(), utf8_output.data());
2281
  } else
2282
    #endif
2283
  {
2284
    return convert_valid_utf16_to_utf8(
2285
        valid_utf16_input.data(), valid_utf16_input.size(),
2286
        reinterpret_cast<char *>(utf8_output.data()));
2287
  }
2288
}
2289
  #endif // SIMDUTF_SPAN
2290
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2291
2292
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2293
/**
2294
 * Using native endianness, convert UTF-16 string into Latin1 string.
2295
 *
2296
 * This function assumes that the input string is valid UTF-16 and that it can
2297
 * be represented as Latin1. If you violate this assumption, the result is
2298
 * implementation defined and may include system-dependent behavior such as
2299
 * crashes.
2300
 *
2301
 * This function is for expert users only and not part of our public API. Use
2302
 * convert_utf16_to_latin1 instead. The function may be removed from the library
2303
 * in the future.
2304
 *
2305
 * This function is not BOM-aware.
2306
 *
2307
 * @param input         the UTF-16 string to convert
2308
 * @param length        the length of the string in 2-byte code units (char16_t)
2309
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2310
 * @return number of written code units; 0 if conversion is not possible
2311
 */
2312
simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
2313
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2314
  #if SIMDUTF_SPAN
2315
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2316
convert_valid_utf16_to_latin1(
2317
    std::span<const char16_t> valid_utf16_input,
2318
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2319
    #if SIMDUTF_CPLUSPLUS23
2320
  if consteval {
2321
    return scalar::utf16_to_latin1::convert_valid_impl<endianness::NATIVE>(
2322
        detail::constexpr_cast_ptr<uint16_t>(valid_utf16_input.data()),
2323
        valid_utf16_input.size(),
2324
        detail::constexpr_cast_writeptr<char>(latin1_output.data()));
2325
  } else
2326
    #endif
2327
  {
2328
    return convert_valid_utf16_to_latin1(
2329
        valid_utf16_input.data(), valid_utf16_input.size(),
2330
        reinterpret_cast<char *>(latin1_output.data()));
2331
  }
2332
}
2333
  #endif // SIMDUTF_SPAN
2334
2335
/**
2336
 * Convert valid UTF-16LE string into Latin1 string.
2337
 *
2338
 * This function assumes that the input string is valid UTF-16LE and that it can
2339
 * be represented as Latin1. If you violate this assumption, the result is
2340
 * implementation defined and may include system-dependent behavior such as
2341
 * crashes.
2342
 *
2343
 * This function is for expert users only and not part of our public API. Use
2344
 * convert_utf16le_to_latin1 instead. The function may be removed from the
2345
 * library in the future.
2346
 *
2347
 * This function is not BOM-aware.
2348
 *
2349
 * @param input         the UTF-16LE string to convert
2350
 * @param length        the length of the string in 2-byte code units (char16_t)
2351
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2352
 * @return number of written code units; 0 if conversion is not possible
2353
 */
2354
simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
2355
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2356
  #if SIMDUTF_SPAN
2357
simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t
2358
convert_valid_utf16le_to_latin1(
2359
    std::span<const char16_t> valid_utf16_input,
2360
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2361
    #if SIMDUTF_CPLUSPLUS23
2362
  if consteval {
2363
    return scalar::utf16_to_latin1::convert_valid_impl<endianness::LITTLE>(
2364
        detail::constexpr_cast_ptr<uint16_t>(valid_utf16_input.data()),
2365
        valid_utf16_input.size(),
2366
        detail::constexpr_cast_writeptr<char>(latin1_output.data()));
2367
  } else
2368
    #endif
2369
  {
2370
    return convert_valid_utf16le_to_latin1(
2371
        valid_utf16_input.data(), valid_utf16_input.size(),
2372
        reinterpret_cast<char *>(latin1_output.data()));
2373
  }
2374
}
2375
  #endif // SIMDUTF_SPAN
2376
2377
/**
2378
 * Convert valid UTF-16BE string into Latin1 string.
2379
 *
2380
 * This function assumes that the input string is valid UTF-16BE and that it can
2381
 * be represented as Latin1. If you violate this assumption, the result is
2382
 * implementation defined and may include system-dependent behavior such as
2383
 * crashes.
2384
 *
2385
 * This function is for expert users only and not part of our public API. Use
2386
 * convert_utf16be_to_latin1 instead. The function may be removed from the
2387
 * library in the future.
2388
 *
2389
 * This function is not BOM-aware.
2390
 *
2391
 * @param input         the UTF-16BE string to convert
2392
 * @param length        the length of the string in 2-byte code units (char16_t)
2393
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2394
 * @return number of written code units; 0 if conversion is not possible
2395
 */
2396
simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
2397
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2398
  #if SIMDUTF_SPAN
2399
simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t
2400
convert_valid_utf16be_to_latin1(
2401
    std::span<const char16_t> valid_utf16_input,
2402
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2403
    #if SIMDUTF_CPLUSPLUS23
2404
  if consteval {
2405
    return scalar::utf16_to_latin1::convert_valid_impl<endianness::BIG>(
2406
        detail::constexpr_cast_ptr<uint16_t>(valid_utf16_input.data()),
2407
        valid_utf16_input.size(),
2408
        detail::constexpr_cast_writeptr<char>(latin1_output.data()));
2409
  } else
2410
    #endif
2411
  {
2412
    return convert_valid_utf16be_to_latin1(
2413
        valid_utf16_input.data(), valid_utf16_input.size(),
2414
        reinterpret_cast<char *>(latin1_output.data()));
2415
  }
2416
}
2417
  #endif // SIMDUTF_SPAN
2418
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2419
2420
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2421
/**
2422
 * Convert valid UTF-16LE string into UTF-8 string.
2423
 *
2424
 * This function assumes that the input string is valid UTF-16LE.
2425
 *
2426
 * This function is not BOM-aware.
2427
 *
2428
 * @param input         the UTF-16LE string to convert
2429
 * @param length        the length of the string in 2-byte code units (char16_t)
2430
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
2431
 * result
2432
 * @return number of written code units; 0 if conversion is not possible
2433
 */
2434
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
2435
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2436
  #if SIMDUTF_SPAN
2437
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2438
convert_valid_utf16le_to_utf8(
2439
    std::span<const char16_t> valid_utf16_input,
2440
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2441
    #if SIMDUTF_CPLUSPLUS23
2442
  if consteval { // compile-time path: decode as UTF-16LE, like the runtime call
2443
    return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(
2444
        valid_utf16_input.data(), valid_utf16_input.size(), utf8_output.data());
2445
  } else
2446
    #endif
2447
  { // runtime path: forward to the pointer-based overload declared above
2448
    return convert_valid_utf16le_to_utf8(
2449
        valid_utf16_input.data(), valid_utf16_input.size(),
2450
        reinterpret_cast<char *>(utf8_output.data()));
2451
  }
2452
}
2453
  #endif // SIMDUTF_SPAN
2454
2455
/**
2456
 * Convert valid UTF-16BE string into UTF-8 string.
2457
 *
2458
 * This function assumes that the input string is valid UTF-16BE.
2459
 *
2460
 * This function is not BOM-aware.
2461
 *
2462
 * @param input         the UTF-16BE string to convert
2463
 * @param length        the length of the string in 2-byte code units (char16_t)
2464
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
2465
 * result
2466
 * @return number of written code units; 0 if conversion is not possible
2467
 */
2468
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
2469
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2470
  #if SIMDUTF_SPAN
2471
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2472
convert_valid_utf16be_to_utf8(
2473
    std::span<const char16_t> valid_utf16_input,
2474
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2475
    #if SIMDUTF_CPLUSPLUS23
2476
  if consteval {
2477
    return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(
2478
        valid_utf16_input.data(), valid_utf16_input.size(), utf8_output.data());
2479
  } else
2480
    #endif
2481
  {
2482
    return convert_valid_utf16be_to_utf8(
2483
        valid_utf16_input.data(), valid_utf16_input.size(),
2484
        reinterpret_cast<char *>(utf8_output.data()));
2485
  }
2486
}
2487
  #endif // SIMDUTF_SPAN
2488
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2489
2490
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2491
/**
2492
 * Using native endianness, convert possibly broken UTF-16 string into UTF-32
2493
 * string.
2494
 *
2495
 * During the conversion also validation of the input string is done.
2496
 * This function is suitable to work with inputs from untrusted sources.
2497
 *
2498
 * This function is not BOM-aware.
2499
 *
2500
 * @param input         the UTF-16 string to convert
2501
 * @param length        the length of the string in 2-byte code units (char16_t)
2502
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2503
 * @return number of written code units; 0 if input is not a valid UTF-16
2504
 * string
2505
 */
2506
simdutf_warn_unused size_t convert_utf16_to_utf32(
2507
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2508
  #if SIMDUTF_SPAN
2509
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2510
convert_utf16_to_utf32(std::span<const char16_t> utf16_input,
2511
0
                       std::span<char32_t> utf32_output) noexcept {
2512
0
2513
0
    #if SIMDUTF_CPLUSPLUS23
2514
0
  if consteval {
2515
0
    return scalar::utf16_to_utf32::convert<endianness::NATIVE>(
2516
0
        utf16_input.data(), utf16_input.size(), utf32_output.data());
2517
0
  } else
2518
0
    #endif
2519
0
  {
2520
0
    return convert_utf16_to_utf32(utf16_input.data(), utf16_input.size(),
2521
0
                                  utf32_output.data());
2522
0
  }
2523
0
}
2524
  #endif // SIMDUTF_SPAN
2525
2526
/**
2527
 * Convert possibly broken UTF-16LE string into UTF-32 string.
2528
 *
2529
 * During the conversion also validation of the input string is done.
2530
 * This function is suitable to work with inputs from untrusted sources.
2531
 *
2532
 * This function is not BOM-aware.
2533
 *
2534
 * @param input         the UTF-16LE string to convert
2535
 * @param length        the length of the string in 2-byte code units (char16_t)
2536
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2537
 * @return number of written code units; 0 if input is not a valid UTF-16LE
2538
 * string
2539
 */
2540
simdutf_warn_unused size_t convert_utf16le_to_utf32(
2541
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2542
  #if SIMDUTF_SPAN
2543
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2544
convert_utf16le_to_utf32(std::span<const char16_t> utf16_input,
2545
0
                         std::span<char32_t> utf32_output) noexcept {
2546
0
    #if SIMDUTF_CPLUSPLUS23
2547
0
  if consteval {
2548
0
    return scalar::utf16_to_utf32::convert<endianness::LITTLE>(
2549
0
        utf16_input.data(), utf16_input.size(), utf32_output.data());
2550
0
  } else
2551
0
    #endif
2552
0
  {
2553
0
    return convert_utf16le_to_utf32(utf16_input.data(), utf16_input.size(),
2554
0
                                    utf32_output.data());
2555
0
  }
2556
0
}
2557
  #endif // SIMDUTF_SPAN
2558
2559
/**
2560
 * Convert possibly broken UTF-16BE string into UTF-32 string.
2561
 *
2562
 * During the conversion also validation of the input string is done.
2563
 * This function is suitable to work with inputs from untrusted sources.
2564
 *
2565
 * This function is not BOM-aware.
2566
 *
2567
 * @param input         the UTF-16BE string to convert
2568
 * @param length        the length of the string in 2-byte code units (char16_t)
2569
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2570
 * @return number of written code units; 0 if input is not a valid UTF-16BE
2571
 * string
2572
 */
2573
simdutf_warn_unused size_t convert_utf16be_to_utf32(
2574
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2575
  #if SIMDUTF_SPAN
2576
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2577
convert_utf16be_to_utf32(std::span<const char16_t> utf16_input,
2578
0
                         std::span<char32_t> utf32_output) noexcept {
2579
0
    #if SIMDUTF_CPLUSPLUS23
2580
0
  if consteval {
2581
0
    return scalar::utf16_to_utf32::convert<endianness::BIG>(
2582
0
        utf16_input.data(), utf16_input.size(), utf32_output.data());
2583
0
  } else
2584
0
    #endif
2585
0
  {
2586
0
    return convert_utf16be_to_utf32(utf16_input.data(), utf16_input.size(),
2587
0
                                    utf32_output.data());
2588
0
  }
2589
0
}
2590
  #endif // SIMDUTF_SPAN
2591
2592
/**
2593
 * Using native endianness, convert possibly broken UTF-16 string into
2594
 * UTF-32 string and stop on error.
2595
 *
2596
 * During the conversion also validation of the input string is done.
2597
 * This function is suitable to work with inputs from untrusted sources.
2598
 *
2599
 * This function is not BOM-aware.
2600
 *
2601
 * @param input         the UTF-16 string to convert
2602
 * @param length        the length of the string in 2-byte code units (char16_t)
2603
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2604
 * @return a result pair struct (of type simdutf::result containing the two
2605
 * fields error and count) with an error code and either position of the error
2606
 * (in the input in code units) if any, or the number of char32_t written if
2607
 * successful.
2608
 */
2609
simdutf_warn_unused result convert_utf16_to_utf32_with_errors(
2610
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2611
  #if SIMDUTF_SPAN
2612
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2613
convert_utf16_to_utf32_with_errors(std::span<const char16_t> utf16_input,
2614
0
                                   std::span<char32_t> utf32_output) noexcept {
2615
0
    #if SIMDUTF_CPLUSPLUS23
2616
0
  if consteval {
2617
0
    return scalar::utf16_to_utf32::convert_with_errors<endianness::NATIVE>(
2618
0
        utf16_input.data(), utf16_input.size(), utf32_output.data());
2619
0
  } else
2620
0
    #endif
2621
0
  {
2622
0
    return convert_utf16_to_utf32_with_errors(
2623
0
        utf16_input.data(), utf16_input.size(), utf32_output.data());
2624
0
  }
2625
0
}
2626
  #endif // SIMDUTF_SPAN
2627
2628
/**
2629
 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
2630
 *
2631
 * During the conversion also validation of the input string is done.
2632
 * This function is suitable to work with inputs from untrusted sources.
2633
 *
2634
 * This function is not BOM-aware.
2635
 *
2636
 * @param input         the UTF-16LE string to convert
2637
 * @param length        the length of the string in 2-byte code units (char16_t)
2638
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2639
 * @return a result pair struct (of type simdutf::result containing the two
2640
 * fields error and count) with an error code and either position of the error
2641
 * (in the input in code units) if any, or the number of char32_t written if
2642
 * successful.
2643
 */
2644
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
2645
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2646
  #if SIMDUTF_SPAN
2647
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2648
convert_utf16le_to_utf32_with_errors(
2649
    std::span<const char16_t> utf16_input,
2650
0
    std::span<char32_t> utf32_output) noexcept {
2651
0
    #if SIMDUTF_CPLUSPLUS23
2652
0
  if consteval {
2653
0
    return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
2654
0
        utf16_input.data(), utf16_input.size(), utf32_output.data());
2655
0
  } else
2656
0
    #endif
2657
0
  {
2658
0
    return convert_utf16le_to_utf32_with_errors(
2659
0
        utf16_input.data(), utf16_input.size(), utf32_output.data());
2660
0
  }
2661
0
}
2662
  #endif // SIMDUTF_SPAN
2663
2664
/**
2665
 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
2666
 *
2667
 * During the conversion also validation of the input string is done.
2668
 * This function is suitable to work with inputs from untrusted sources.
2669
 *
2670
 * This function is not BOM-aware.
2671
 *
2672
 * @param input         the UTF-16BE string to convert
2673
 * @param length        the length of the string in 2-byte code units (char16_t)
2674
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2675
 * @return a result pair struct (of type simdutf::result containing the two
2676
 * fields error and count) with an error code and either position of the error
2677
 * (in the input in code units) if any, or the number of char32_t written if
2678
 * successful.
2679
 */
2680
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
2681
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2682
  #if SIMDUTF_SPAN
2683
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2684
convert_utf16be_to_utf32_with_errors(
2685
    std::span<const char16_t> utf16_input,
2686
0
    std::span<char32_t> utf32_output) noexcept {
2687
0
    #if SIMDUTF_CPLUSPLUS23
2688
0
  if consteval {
2689
0
    return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
2690
0
        utf16_input.data(), utf16_input.size(), utf32_output.data());
2691
0
  } else
2692
0
    #endif
2693
0
  {
2694
0
    return convert_utf16be_to_utf32_with_errors(
2695
0
        utf16_input.data(), utf16_input.size(), utf32_output.data());
2696
0
  }
2697
0
}
2698
  #endif // SIMDUTF_SPAN
2699
2700
/**
2701
 * Using native endianness, convert valid UTF-16 string into UTF-32 string.
2702
 *
2703
 * This function assumes that the input string is valid UTF-16 (native
2704
 * endianness).
2705
 *
2706
 * This function is not BOM-aware.
2707
 *
2708
 * @param input         the UTF-16 string to convert
2709
 * @param length        the length of the string in 2-byte code units (char16_t)
2710
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
2711
 * result
2712
 * @return number of written code units; 0 if conversion is not possible
2713
 */
2714
simdutf_warn_unused size_t convert_valid_utf16_to_utf32(
2715
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2716
  #if SIMDUTF_SPAN
2717
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2718
convert_valid_utf16_to_utf32(std::span<const char16_t> valid_utf16_input,
2719
0
                             std::span<char32_t> utf32_output) noexcept {
2720
0
    #if SIMDUTF_CPLUSPLUS23
2721
0
  if consteval {
2722
0
    return scalar::utf16_to_utf32::convert_valid<endianness::NATIVE>(
2723
0
        valid_utf16_input.data(), valid_utf16_input.size(),
2724
0
        utf32_output.data());
2725
0
  } else
2726
0
    #endif
2727
0
  {
2728
0
    return convert_valid_utf16_to_utf32(valid_utf16_input.data(),
2729
0
                                        valid_utf16_input.size(),
2730
0
                                        utf32_output.data());
2731
0
  }
2732
0
}
2733
  #endif // SIMDUTF_SPAN
2734
2735
/**
2736
 * Convert valid UTF-16LE string into UTF-32 string.
2737
 *
2738
 * This function assumes that the input string is valid UTF-16LE.
2739
 *
2740
 * This function is not BOM-aware.
2741
 *
2742
 * @param input         the UTF-16LE string to convert
2743
 * @param length        the length of the string in 2-byte code units (char16_t)
2744
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
2745
 * result
2746
 * @return number of written code units; 0 if conversion is not possible
2747
 */
2748
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
2749
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2750
  #if SIMDUTF_SPAN
2751
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2752
convert_valid_utf16le_to_utf32(std::span<const char16_t> valid_utf16_input,
2753
0
                               std::span<char32_t> utf32_output) noexcept {
2754
0
    #if SIMDUTF_CPLUSPLUS23
2755
0
  if consteval {
2756
0
    return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(
2757
0
        valid_utf16_input.data(), valid_utf16_input.size(),
2758
0
        utf32_output.data());
2759
0
  } else
2760
0
    #endif
2761
0
  {
2762
0
    return convert_valid_utf16le_to_utf32(valid_utf16_input.data(),
2763
0
                                          valid_utf16_input.size(),
2764
0
                                          utf32_output.data());
2765
0
  }
2766
0
}
2767
  #endif // SIMDUTF_SPAN
2768
2769
/**
2770
 * Convert valid UTF-16BE string into UTF-32 string.
2771
 *
2772
 * This function assumes that the input string is valid UTF-16BE.
2773
 *
2774
 * This function is not BOM-aware.
2775
 *
2776
 * @param input         the UTF-16BE string to convert
2777
 * @param length        the length of the string in 2-byte code units (char16_t)
2778
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
2779
 * result
2780
 * @return number of written code units; 0 if conversion is not possible
2781
 */
2782
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
2783
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2784
  #if SIMDUTF_SPAN
2785
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2786
convert_valid_utf16be_to_utf32(std::span<const char16_t> valid_utf16_input,
2787
0
                               std::span<char32_t> utf32_output) noexcept {
2788
0
    #if SIMDUTF_CPLUSPLUS23
2789
0
  if consteval {
2790
0
    return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(
2791
0
        valid_utf16_input.data(), valid_utf16_input.size(),
2792
0
        utf32_output.data());
2793
0
  } else
2794
0
    #endif
2795
0
  {
2796
0
    return convert_valid_utf16be_to_utf32(valid_utf16_input.data(),
2797
0
                                          valid_utf16_input.size(),
2798
0
                                          utf32_output.data());
2799
0
  }
2800
0
}
2801
  #endif // SIMDUTF_SPAN
2802
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2803
2804
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2805
/**
2806
 * Using native endianness, compute the number of bytes that this UTF-16
2807
 * string would require in UTF-8 format.
2808
 *
2809
 * This function does not validate the input. It is acceptable to pass invalid
2810
 * UTF-16 strings but in such cases the result is implementation defined.
2811
 *
2812
 * @param input         the UTF-16 string to convert
2813
 * @param length        the length of the string in 2-byte code units (char16_t)
2814
 * @return the number of bytes required to encode the UTF-16 string as UTF-8
2815
 */
2816
simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input,
2817
                                                  size_t length) noexcept;
2818
  #if SIMDUTF_SPAN
2819
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2820
0
utf8_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2821
0
    #if SIMDUTF_CPLUSPLUS23
2822
0
  if consteval {
2823
0
    return scalar::utf16::utf8_length_from_utf16<endianness::NATIVE>(
2824
0
        valid_utf16_input.data(), valid_utf16_input.size());
2825
0
  } else
2826
0
    #endif
2827
0
  {
2828
0
    return utf8_length_from_utf16(valid_utf16_input.data(),
2829
0
                                  valid_utf16_input.size());
2830
0
  }
2831
0
}
2832
  #endif // SIMDUTF_SPAN
2833
2834
/**
2835
 * Using native endianness; compute the number of bytes that this UTF-16
2836
 * string would require in UTF-8 format even when the UTF-16 content contains
2837
 * mismatched surrogates that have to be replaced by the replacement character
2838
 * (0xFFFD).
2839
 *
2840
 * @param input         the UTF-16 string to convert
2841
 * @param length        the length of the string in 2-byte code units (char16_t)
2842
 * @return a result pair struct (of type simdutf::result containing the two
2843
 * fields error and count) where the count is the number of bytes required to
2844
 * encode the UTF-16 string as UTF-8, and the error code is either SUCCESS or
2845
 * SURROGATE. The count is correct regardless of the error field.
2846
 * When SURROGATE is returned, it does not indicate an error in the case of this
2847
 * function: it indicates that at least one surrogate has been encountered: the
2848
 * surrogates may be matched or not (thus this function does not validate). If
2849
 * the returned error code is SUCCESS, then the input contains no surrogate, is
2850
 * in the Basic Multilingual Plane, and is necessarily valid.
2851
 */
2852
simdutf_warn_unused result utf8_length_from_utf16_with_replacement(
2853
    const char16_t *input, size_t length) noexcept;
2854
  #if SIMDUTF_SPAN
2855
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2856
utf8_length_from_utf16_with_replacement(
2857
0
    std::span<const char16_t> valid_utf16_input) noexcept {
2858
0
    #if SIMDUTF_CPLUSPLUS23
2859
0
  if consteval {
2860
0
    return scalar::utf16::utf8_length_from_utf16_with_replacement<
2861
0
        endianness::NATIVE>(valid_utf16_input.data(), valid_utf16_input.size());
2862
0
  } else
2863
0
    #endif
2864
0
  {
2865
0
    return utf8_length_from_utf16_with_replacement(valid_utf16_input.data(),
2866
0
                                                   valid_utf16_input.size());
2867
0
  }
2868
0
}
2869
  #endif // SIMDUTF_SPAN
2870
2871
/**
2872
 * Compute the number of bytes that this UTF-16LE string would require in UTF-8
2873
 * format.
2874
 *
2875
 * This function does not validate the input. It is acceptable to pass invalid
2876
 * UTF-16 strings but in such cases the result is implementation defined.
2877
 *
2878
 * @param input         the UTF-16LE string to convert
2879
 * @param length        the length of the string in 2-byte code units (char16_t)
2880
 * @return the number of bytes required to encode the UTF-16LE string as UTF-8
2881
 */
2882
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input,
2883
                                                    size_t length) noexcept;
2884
  #if SIMDUTF_SPAN
2885
simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t
2886
0
utf8_length_from_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
2887
0
    #if SIMDUTF_CPLUSPLUS23
2888
0
  if consteval {
2889
0
    return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(
2890
0
        valid_utf16_input.data(), valid_utf16_input.size());
2891
0
  } else
2892
0
    #endif
2893
0
  {
2894
0
    return utf8_length_from_utf16le(valid_utf16_input.data(),
2895
0
                                    valid_utf16_input.size());
2896
0
  }
2897
0
}
2898
  #endif // SIMDUTF_SPAN
2899
2900
/**
2901
 * Compute the number of bytes that this UTF-16BE string would require in UTF-8
2902
 * format.
2903
 *
2904
 * This function does not validate the input. It is acceptable to pass invalid
2905
 * UTF-16 strings but in such cases the result is implementation defined.
2906
 *
2907
 * @param input         the UTF-16BE string to convert
2908
 * @param length        the length of the string in 2-byte code units (char16_t)
2909
 * @return the number of bytes required to encode the UTF-16BE string as UTF-8
2910
 */
2911
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input,
2912
                                                    size_t length) noexcept;
2913
  #if SIMDUTF_SPAN
2914
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2915
0
utf8_length_from_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
2916
0
    #if SIMDUTF_CPLUSPLUS23
2917
0
  if consteval {
2918
0
    return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(
2919
0
        valid_utf16_input.data(), valid_utf16_input.size());
2920
0
  } else
2921
0
    #endif
2922
0
  {
2923
0
    return utf8_length_from_utf16be(valid_utf16_input.data(),
2924
0
                                    valid_utf16_input.size());
2925
0
  }
2926
0
}
2927
  #endif // SIMDUTF_SPAN
2928
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2929
2930
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2931
/**
2932
 * Convert possibly broken UTF-32 string into UTF-8 string.
2933
 *
2934
 * During the conversion also validation of the input string is done.
2935
 * This function is suitable to work with inputs from untrusted sources.
2936
 *
2937
 * This function is not BOM-aware.
2938
 *
2939
 * @param input         the UTF-32 string to convert
2940
 * @param length        the length of the string in 4-byte code units (char32_t)
2941
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2942
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2943
 */
2944
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *input,
2945
                                                 size_t length,
2946
                                                 char *utf8_buffer) noexcept;
2947
  #if SIMDUTF_SPAN
2948
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2949
convert_utf32_to_utf8(
2950
    std::span<const char32_t> utf32_input,
2951
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2952
    #if SIMDUTF_CPLUSPLUS23
2953
  if consteval {
2954
    return scalar::utf32_to_utf8::convert(
2955
        utf32_input.data(), utf32_input.size(), utf8_output.data());
2956
  } else
2957
    #endif
2958
  {
2959
    return convert_utf32_to_utf8(utf32_input.data(), utf32_input.size(),
2960
                                 reinterpret_cast<char *>(utf8_output.data()));
2961
  }
2962
}
2963
  #endif // SIMDUTF_SPAN
2964
2965
/**
2966
 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
2967
 *
2968
 * During the conversion also validation of the input string is done.
2969
 * This function is suitable to work with inputs from untrusted sources.
2970
 *
2971
 * This function is not BOM-aware.
2972
 *
2973
 * @param input         the UTF-32 string to convert
2974
 * @param length        the length of the string in 4-byte code units (char32_t)
2975
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2976
 * @return a result pair struct (of type simdutf::result containing the two
2977
 * fields error and count) with an error code and either position of the error
2978
 * (in the input in code units) if any, or the number of char written if
2979
 * successful.
2980
 */
2981
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
2982
    const char32_t *input, size_t length, char *utf8_buffer) noexcept;
2983
  #if SIMDUTF_SPAN
2984
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2985
convert_utf32_to_utf8_with_errors(
2986
    std::span<const char32_t> utf32_input,
2987
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2988
    #if SIMDUTF_CPLUSPLUS23
2989
  if consteval {
2990
    return scalar::utf32_to_utf8::convert_with_errors(
2991
        utf32_input.data(), utf32_input.size(), utf8_output.data());
2992
  } else
2993
    #endif
2994
  {
2995
    return convert_utf32_to_utf8_with_errors(
2996
        utf32_input.data(), utf32_input.size(),
2997
        reinterpret_cast<char *>(utf8_output.data()));
2998
  }
2999
}
3000
  #endif // SIMDUTF_SPAN
3001
3002
/**
3003
 * Convert valid UTF-32 string into UTF-8 string.
3004
 *
3005
 * This function assumes that the input string is valid UTF-32.
3006
 *
3007
 * This function is not BOM-aware.
3008
 *
3009
 * @param input         the UTF-32 string to convert
3010
 * @param length        the length of the string in 4-byte code units (char32_t)
3011
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
3012
 * result
3013
 * @return number of written code units; 0 if conversion is not possible
3014
 */
3015
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
3016
    const char32_t *input, size_t length, char *utf8_buffer) noexcept;
3017
  #if SIMDUTF_SPAN
3018
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3019
convert_valid_utf32_to_utf8(
3020
    std::span<const char32_t> valid_utf32_input,
3021
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
3022
    #if SIMDUTF_CPLUSPLUS23
3023
  if consteval {
3024
    return scalar::utf32_to_utf8::convert_valid(
3025
        valid_utf32_input.data(), valid_utf32_input.size(), utf8_output.data());
3026
  } else
3027
    #endif
3028
  {
3029
    return convert_valid_utf32_to_utf8(
3030
        valid_utf32_input.data(), valid_utf32_input.size(),
3031
        reinterpret_cast<char *>(utf8_output.data()));
3032
  }
3033
}
3034
  #endif // SIMDUTF_SPAN
3035
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3036
3037
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3038
/**
3039
 * Using native endianness, convert possibly broken UTF-32 string into a UTF-16
3040
 * string.
3041
 *
3042
 * During the conversion also validation of the input string is done.
3043
 * This function is suitable to work with inputs from untrusted sources.
3044
 *
3045
 * This function is not BOM-aware.
3046
 *
3047
 * @param input         the UTF-32 string to convert
3048
 * @param length        the length of the string in 4-byte code units (char32_t)
3049
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3050
 * @return number of written code units; 0 if input is not a valid UTF-32 string
3051
 */
3052
simdutf_warn_unused size_t convert_utf32_to_utf16(
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of convert_utf32_to_utf16 (native endianness).
 *
 * Under C++23 constant evaluation the scalar routine is invoked directly;
 * otherwise the call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf32_to_utf16(std::span<const char32_t> utf32_input,
                       std::span<char16_t> utf16_output) noexcept {
  const char32_t *const src = utf32_input.data();
  const size_t src_len = utf32_input.size();
  char16_t *const dst = utf16_output.data();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf32_to_utf16::convert<endianness::NATIVE>(src, src_len,
                                                               dst);
  } else
    #endif
  {
    return convert_utf32_to_utf16(src, src_len, dst);
  }
}
  #endif // SIMDUTF_SPAN
3070
3071
/**
3072
 * Convert possibly broken UTF-32 string into UTF-16LE string.
3073
 *
3074
 * During the conversion also validation of the input string is done.
3075
 * This function is suitable to work with inputs from untrusted sources.
3076
 *
3077
 * This function is not BOM-aware.
3078
 *
3079
 * @param input         the UTF-32 string to convert
3080
 * @param length        the length of the string in 4-byte code units (char32_t)
3081
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3082
 * @return number of written code units; 0 if input is not a valid UTF-32 string
3083
 */
3084
simdutf_warn_unused size_t convert_utf32_to_utf16le(
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of convert_utf32_to_utf16le.
 *
 * Under C++23 constant evaluation the scalar routine (little-endian
 * instantiation) is invoked directly; otherwise the call is forwarded to the
 * pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf32_to_utf16le(std::span<const char32_t> utf32_input,
                         std::span<char16_t> utf16_output) noexcept {
  const char32_t *const src = utf32_input.data();
  const size_t src_len = utf32_input.size();
  char16_t *const dst = utf16_output.data();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf32_to_utf16::convert<endianness::LITTLE>(src, src_len,
                                                               dst);
  } else
    #endif
  {
    return convert_utf32_to_utf16le(src, src_len, dst);
  }
}
  #endif // SIMDUTF_SPAN
3102
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3103
3104
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
3105
/**
3106
 * Convert possibly broken UTF-32 string into Latin1 string.
3107
 *
3108
 * During the conversion also validation of the input string is done.
3109
 * This function is suitable to work with inputs from untrusted sources.
3110
 *
3111
 * This function is not BOM-aware.
3112
 *
3113
 * @param input         the UTF-32 string to convert
3114
 * @param length        the length of the string in 4-byte code units (char32_t)
3115
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
3116
 * @return number of written code units; 0 if input is not a valid UTF-32 string
3117
 * or if it cannot be represented as Latin1
3118
 */
3119
simdutf_warn_unused size_t convert_utf32_to_latin1(
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of convert_utf32_to_latin1.
 *
 * Under C++23 constant evaluation the scalar routine is invoked directly;
 * otherwise the call is forwarded to the pointer-based overload with the
 * output pointer reinterpreted as char *.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf32_to_latin1(
    std::span<const char32_t> utf32_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  const char32_t *const src = utf32_input.data();
  const size_t src_len = utf32_input.size();
  const auto dst = latin1_output.data();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf32_to_latin1::convert(src, src_len, dst);
  } else
    #endif
  {
    return convert_utf32_to_latin1(src, src_len,
                                   reinterpret_cast<char *>(dst));
  }
}
  #endif // SIMDUTF_SPAN
3139
3140
/**
3141
 * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
3142
 * If the string cannot be represented as Latin1, an error is returned.
3143
 *
3144
 * During the conversion also validation of the input string is done.
3145
 * This function is suitable to work with inputs from untrusted sources.
3146
 *
3147
 * This function is not BOM-aware.
3148
 *
3149
 * @param input         the UTF-32 string to convert
3150
 * @param length        the length of the string in 4-byte code units (char32_t)
3151
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
3152
 * @return a result pair struct (of type simdutf::result containing the two
3153
 * fields error and count) with an error code and either position of the error
3154
 * (in the input in code units) if any, or the number of char written if
3155
 * successful.
3156
 */
3157
simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of convert_utf32_to_latin1_with_errors.
 *
 * Under C++23 constant evaluation the scalar routine is invoked directly;
 * otherwise the call is forwarded to the pointer-based overload with the
 * output pointer reinterpreted as char *.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
convert_utf32_to_latin1_with_errors(
    std::span<const char32_t> utf32_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  const char32_t *const src = utf32_input.data();
  const size_t src_len = utf32_input.size();
  const auto dst = latin1_output.data();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf32_to_latin1::convert_with_errors(src, src_len, dst);
  } else
    #endif
  {
    return convert_utf32_to_latin1_with_errors(src, src_len,
                                               reinterpret_cast<char *>(dst));
  }
}
  #endif // SIMDUTF_SPAN
3177
3178
/**
3179
 * Convert valid UTF-32 string into Latin1 string.
3180
 *
3181
 * This function assumes that the input string is valid UTF-32 and that it can
3182
 * be represented as Latin1. If you violate this assumption, the result is
3183
 * implementation defined and may include system-dependent behavior such as
3184
 * crashes.
3185
 *
3186
 * This function is for expert users only and not part of our public API. Use
3187
 * convert_utf32_to_latin1 instead. The function may be removed from the library
3188
 * in the future.
3189
 *
3190
 * This function is not BOM-aware.
3191
 *
3192
 * @param input         the UTF-32 string to convert
3193
 * @param length        the length of the string in 4-byte code units (char32_t)
3194
 * @param latin1_buffer   the pointer to a buffer that can hold the conversion
3195
 * result
3196
 * @return number of written code units; 0 if conversion is not possible
3197
 */
3198
simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of convert_valid_utf32_to_latin1.
 *
 * Under C++23 constant evaluation the scalar routine is called with the
 * input passed through detail::constexpr_cast_ptr<uint32_t> and the output
 * through detail::constexpr_cast_writeptr<char>; otherwise the call is
 * forwarded to the pointer-based overload with the output pointer
 * reinterpreted as char *.
 */
simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t
convert_valid_utf32_to_latin1(
    std::span<const char32_t> valid_utf32_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf32_to_latin1::convert_valid(
        detail::constexpr_cast_ptr<uint32_t>(valid_utf32_input.data()),
        valid_utf32_input.size(),
        detail::constexpr_cast_writeptr<char>(latin1_output.data()));
  } else
    // The explicit else keeps the runtime call in its own branch, matching
    // the shape used by the other span overloads in this header.
    #endif
  {
    return convert_valid_utf32_to_latin1(
        valid_utf32_input.data(), valid_utf32_input.size(),
        reinterpret_cast<char *>(latin1_output.data()));
  }
}
  #endif // SIMDUTF_SPAN
3220
3221
/**
3222
 * Compute the number of bytes that this UTF-32 string would require in Latin1
3223
 * format.
3224
 *
3225
 * This function does not validate the input. It is acceptable to pass invalid
3226
 * UTF-32 strings but in such cases the result is implementation defined.
3227
 *
3228
 * This function is not BOM-aware.
3229
 *
3230
 * @param length        the length of the string in 4-byte code units (char32_t)
3231
 * @return the number of bytes required to encode the UTF-32 string as Latin1
3232
 */
3233
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 size_t
latin1_length_from_utf32(size_t length) noexcept {
  // The result is the input length, returned unchanged.
  const size_t latin1_bytes{length};
  return latin1_bytes;
}
3237
3238
/**
3239
 * Compute the number of 4-byte code units (char32_t) that this Latin1 string
3240
 * would require in UTF-32 format.
3241
 *
3242
 * @param length        the length of the string in Latin1 code units (char)
3243
 * @return the length of the string in 4-byte code units (char32_t) required to
3244
 * encode the Latin1 string as UTF-32
3245
 */
3246
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 size_t
utf32_length_from_latin1(size_t length) noexcept {
  // The result is the input length, returned unchanged.
  const size_t utf32_units{length};
  return utf32_units;
}
3250
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
3251
3252
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3253
/**
3254
 * Convert possibly broken UTF-32 string into UTF-16BE string.
3255
 *
3256
 * During the conversion also validation of the input string is done.
3257
 * This function is suitable to work with inputs from untrusted sources.
3258
 *
3259
 * This function is not BOM-aware.
3260
 *
3261
 * @param input         the UTF-32 string to convert
3262
 * @param length        the length of the string in 4-byte code units (char32_t)
3263
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3264
 * @return number of written code units; 0 if input is not a valid UTF-32 string
3265
 */
3266
simdutf_warn_unused size_t convert_utf32_to_utf16be(
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of convert_utf32_to_utf16be.
 *
 * Under C++23 constant evaluation the scalar routine (big-endian
 * instantiation) is invoked directly; otherwise the call is forwarded to the
 * pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_utf32_to_utf16be(std::span<const char32_t> utf32_input,
                         std::span<char16_t> utf16_output) noexcept {
  const char32_t *const src = utf32_input.data();
  const size_t src_len = utf32_input.size();
  char16_t *const dst = utf16_output.data();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf32_to_utf16::convert<endianness::BIG>(src, src_len,
                                                            dst);
  } else
    #endif
  {
    return convert_utf32_to_utf16be(src, src_len, dst);
  }
}
  #endif // SIMDUTF_SPAN
3284
3285
/**
3286
 * Using native endianness, convert possibly broken UTF-32 string into UTF-16
3287
 * string and stop on error.
3288
 *
3289
 * During the conversion also validation of the input string is done.
3290
 * This function is suitable to work with inputs from untrusted sources.
3291
 *
3292
 * This function is not BOM-aware.
3293
 *
3294
 * @param input         the UTF-32 string to convert
3295
 * @param length        the length of the string in 4-byte code units (char32_t)
3296
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3297
 * @return a result pair struct (of type simdutf::result containing the two
3298
 * fields error and count) with an error code and either position of the error
3299
 * (in the input in code units) if any, or the number of char16_t written if
3300
 * successful.
3301
 */
3302
simdutf_warn_unused result convert_utf32_to_utf16_with_errors(
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of convert_utf32_to_utf16_with_errors (native
 * endianness).
 *
 * Under C++23 constant evaluation the scalar routine is invoked directly;
 * otherwise the call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
convert_utf32_to_utf16_with_errors(std::span<const char32_t> utf32_input,
                                   std::span<char16_t> utf16_output) noexcept {
  const char32_t *const src = utf32_input.data();
  const size_t src_len = utf32_input.size();
  char16_t *const dst = utf16_output.data();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf32_to_utf16::convert_with_errors<endianness::NATIVE>(
        src, src_len, dst);
  } else
    #endif
  {
    return convert_utf32_to_utf16_with_errors(src, src_len, dst);
  }
}
  #endif // SIMDUTF_SPAN
3320
3321
/**
3322
 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
3323
 *
3324
 * During the conversion also validation of the input string is done.
3325
 * This function is suitable to work with inputs from untrusted sources.
3326
 *
3327
 * This function is not BOM-aware.
3328
 *
3329
 * @param input         the UTF-32 string to convert
3330
 * @param length        the length of the string in 4-byte code units (char32_t)
3331
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3332
 * @return a result pair struct (of type simdutf::result containing the two
3333
 * fields error and count) with an error code and either position of the error
3334
 * (in the input in code units) if any, or the number of char16_t written if
3335
 * successful.
3336
 */
3337
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of convert_utf32_to_utf16le_with_errors.
 *
 * Under C++23 constant evaluation the scalar routine (little-endian
 * instantiation) is invoked directly; otherwise the call is forwarded to the
 * pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
convert_utf32_to_utf16le_with_errors(
    std::span<const char32_t> utf32_input,
    std::span<char16_t> utf16_output) noexcept {
  const char32_t *const src = utf32_input.data();
  const size_t src_len = utf32_input.size();
  char16_t *const dst = utf16_output.data();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
        src, src_len, dst);
  } else
    #endif
  {
    return convert_utf32_to_utf16le_with_errors(src, src_len, dst);
  }
}
  #endif // SIMDUTF_SPAN
3356
3357
/**
3358
 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
3359
 *
3360
 * During the conversion also validation of the input string is done.
3361
 * This function is suitable to work with inputs from untrusted sources.
3362
 *
3363
 * This function is not BOM-aware.
3364
 *
3365
 * @param input         the UTF-32 string to convert
3366
 * @param length        the length of the string in 4-byte code units (char32_t)
3367
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3368
 * @return a result pair struct (of type simdutf::result containing the two
3369
 * fields error and count) with an error code and either position of the error
3370
 * (in the input in code units) if any, or the number of char16_t written if
3371
 * successful.
3372
 */
3373
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of convert_utf32_to_utf16be_with_errors.
 *
 * Under C++23 constant evaluation the scalar routine (big-endian
 * instantiation) is invoked directly; otherwise the call is forwarded to the
 * pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
convert_utf32_to_utf16be_with_errors(
    std::span<const char32_t> utf32_input,
    std::span<char16_t> utf16_output) noexcept {
  const char32_t *const src = utf32_input.data();
  const size_t src_len = utf32_input.size();
  char16_t *const dst = utf16_output.data();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
        src, src_len, dst);
  } else
    #endif
  {
    return convert_utf32_to_utf16be_with_errors(src, src_len, dst);
  }
}
  #endif // SIMDUTF_SPAN
3392
3393
/**
3394
 * Using native endianness, convert valid UTF-32 string into a UTF-16 string.
3395
 *
3396
 * This function assumes that the input string is valid UTF-32.
3397
 *
3398
 * This function is not BOM-aware.
3399
 *
3400
 * @param input         the UTF-32 string to convert
3401
 * @param length        the length of the string in 4-byte code units (char32_t)
3402
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
3403
 * result
3404
 * @return number of written code units; 0 if conversion is not possible
3405
 */
3406
simdutf_warn_unused size_t convert_valid_utf32_to_utf16(
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of convert_valid_utf32_to_utf16 (native endianness).
 *
 * Under C++23 constant evaluation the scalar routine is invoked directly;
 * otherwise the call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_valid_utf32_to_utf16(std::span<const char32_t> valid_utf32_input,
                             std::span<char16_t> utf16_output) noexcept {
  const char32_t *const src = valid_utf32_input.data();
  const size_t src_len = valid_utf32_input.size();
  char16_t *const dst = utf16_output.data();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf32_to_utf16::convert_valid<endianness::NATIVE>(
        src, src_len, dst);
  } else
    #endif
  {
    return convert_valid_utf32_to_utf16(src, src_len, dst);
  }
}
  #endif // SIMDUTF_SPAN
3427
3428
/**
3429
 * Convert valid UTF-32 string into UTF-16LE string.
3430
 *
3431
 * This function assumes that the input string is valid UTF-32.
3432
 *
3433
 * This function is not BOM-aware.
3434
 *
3435
 * @param input         the UTF-32 string to convert
3436
 * @param length        the length of the string in 4-byte code units (char32_t)
3437
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
3438
 * result
3439
 * @return number of written code units; 0 if conversion is not possible
3440
 */
3441
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of convert_valid_utf32_to_utf16le.
 *
 * Under C++23 constant evaluation the scalar routine (little-endian
 * instantiation) is invoked directly; otherwise the call is forwarded to the
 * pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_valid_utf32_to_utf16le(std::span<const char32_t> valid_utf32_input,
                               std::span<char16_t> utf16_output) noexcept {
  const char32_t *const src = valid_utf32_input.data();
  const size_t src_len = valid_utf32_input.size();
  char16_t *const dst = utf16_output.data();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(
        src, src_len, dst);
  } else
    #endif
  {
    return convert_valid_utf32_to_utf16le(src, src_len, dst);
  }
}
  #endif // SIMDUTF_SPAN
3461
3462
/**
3463
 * Convert valid UTF-32 string into UTF-16BE string.
3464
 *
3465
 * This function assumes that the input string is valid UTF-32.
3466
 *
3467
 * This function is not BOM-aware.
3468
 *
3469
 * @param input         the UTF-32 string to convert
3470
 * @param length        the length of the string in 4-byte code units (char32_t)
3471
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
3472
 * result
3473
 * @return number of written code units; 0 if conversion is not possible
3474
 */
3475
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of convert_valid_utf32_to_utf16be.
 *
 * Under C++23 constant evaluation the scalar routine (big-endian
 * instantiation) is invoked directly; otherwise the call is forwarded to the
 * pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
convert_valid_utf32_to_utf16be(std::span<const char32_t> valid_utf32_input,
                               std::span<char16_t> utf16_output) noexcept {
  const char32_t *const src = valid_utf32_input.data();
  const size_t src_len = valid_utf32_input.size();
  char16_t *const dst = utf16_output.data();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(
        src, src_len, dst);
  } else
    #endif
  {
    return convert_valid_utf32_to_utf16be(src, src_len, dst);
  }
}
  #endif // SIMDUTF_SPAN
3495
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3496
3497
#if SIMDUTF_FEATURE_UTF16
3498
/**
3499
 * Change the endianness of the input. Can be used to go from UTF-16LE to
3500
 * UTF-16BE or from UTF-16BE to UTF-16LE.
3501
 *
3502
 * This function does not validate the input.
3503
 *
3504
 * This function is not BOM-aware.
3505
 *
3506
 * @param input         the UTF-16 string to process
3507
 * @param length        the length of the string in 2-byte code units (char16_t)
3508
 * @param output        the pointer to a buffer that can hold the conversion
3509
 * result
3510
 */
3511
void change_endianness_utf16(const char16_t *input, size_t length,
                             char16_t *output) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of change_endianness_utf16.
 *
 * Under C++23 constant evaluation the scalar routine is invoked directly;
 * otherwise the call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_constexpr23 void
change_endianness_utf16(std::span<const char16_t> utf16_input,
                        std::span<char16_t> utf16_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
  char16_t *const dst = utf16_output.data();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    scalar::utf16::change_endianness_utf16(src, src_len, dst);
  } else
    #endif
  {
    change_endianness_utf16(src, src_len, dst);
  }
}
  #endif // SIMDUTF_SPAN
3529
#endif   // SIMDUTF_FEATURE_UTF16
3530
3531
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3532
/**
3533
 * Compute the number of bytes that this UTF-32 string would require in UTF-8
3534
 * format.
3535
 *
3536
 * This function does not validate the input. It is acceptable to pass invalid
3537
 * UTF-32 strings but in such cases the result is implementation defined.
3538
 *
3539
 * @param input         the UTF-32 string to convert
3540
 * @param length        the length of the string in 4-byte code units (char32_t)
3541
 * @return the number of bytes required to encode the UTF-32 string as UTF-8
3542
 */
3543
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input,
                                                  size_t length) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of utf8_length_from_utf32.
 *
 * Under C++23 constant evaluation the scalar routine is invoked directly;
 * otherwise the call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
utf8_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
  const char32_t *const src = valid_utf32_input.data();
  const size_t src_len = valid_utf32_input.size();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf32::utf8_length_from_utf32(src, src_len);
  } else
    #endif
  {
    return utf8_length_from_utf32(src, src_len);
  }
}
  #endif // SIMDUTF_SPAN
3560
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3561
3562
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3563
/**
3564
 * Compute the number of two-byte code units that this UTF-32 string would
3565
 * require in UTF-16 format.
3566
 *
3567
 * This function does not validate the input. It is acceptable to pass invalid
3568
 * UTF-32 strings but in such cases the result is implementation defined.
3569
 *
3570
 * @param input         the UTF-32 string to convert
3571
 * @param length        the length of the string in 4-byte code units (char32_t)
3572
 * @return number of 2-byte code units (char16_t) needed for the UTF-16 form
3573
 */
3574
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input,
                                                   size_t length) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of utf16_length_from_utf32.
 *
 * Under C++23 constant evaluation the scalar routine is invoked directly;
 * otherwise the call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
utf16_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
  const char32_t *const src = valid_utf32_input.data();
  const size_t src_len = valid_utf32_input.size();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf32::utf16_length_from_utf32(src, src_len);
  } else
    #endif
  {
    return utf16_length_from_utf32(src, src_len);
  }
}
  #endif // SIMDUTF_SPAN
3591
3592
/**
3593
 * Using native endianness; compute the number of 4-byte code units (char32_t)
3594
 * that this UTF-16 string would require in UTF-32 format.
3595
 *
3596
 * This function is equivalent to count_utf16.
3597
 *
3598
 * This function does not validate the input. It is acceptable to pass invalid
3599
 * UTF-16 strings but in such cases the result is implementation defined.
3600
 *
3601
 * This function is not BOM-aware.
3602
 *
3603
 * @param input         the UTF-16 string to convert
3604
 * @param length        the length of the string in 2-byte code units (char16_t)
3605
 * @return number of 4-byte code units (char32_t) needed for the UTF-32 form
3606
 */
3607
simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input,
                                                   size_t length) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of utf32_length_from_utf16 (native endianness).
 *
 * Under C++23 constant evaluation the scalar routine is invoked directly;
 * otherwise the call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
utf32_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const src = valid_utf16_input.data();
  const size_t src_len = valid_utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16::utf32_length_from_utf16<endianness::NATIVE>(
        src, src_len);
  } else
    #endif
  {
    return utf32_length_from_utf16(src, src_len);
  }
}
  #endif // SIMDUTF_SPAN
3624
3625
/**
3626
 * Compute the number of 4-byte code units (char32_t) that this UTF-16LE string
3627
 * would require in UTF-32 format.
3628
 *
3629
 * This function is equivalent to count_utf16le.
3630
 *
3631
 * This function does not validate the input. It is acceptable to pass invalid
3632
 * UTF-16 strings but in such cases the result is implementation defined.
3633
 *
3634
 * This function is not BOM-aware.
3635
 *
3636
 * @param input         the UTF-16LE string to convert
3637
 * @param length        the length of the string in 2-byte code units (char16_t)
3638
 * @return number of 4-byte code units (char32_t) needed for the UTF-32 form
3639
 */
3640
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input,
                                                     size_t length) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of utf32_length_from_utf16le.
 *
 * Under C++23 constant evaluation the scalar routine (little-endian
 * instantiation) is invoked directly; otherwise the call is forwarded to the
 * pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
utf32_length_from_utf16le(
    std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const src = valid_utf16_input.data();
  const size_t src_len = valid_utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(
        src, src_len);
  } else
    #endif
  {
    return utf32_length_from_utf16le(src, src_len);
  }
}
  #endif // SIMDUTF_SPAN
3658
3659
/**
3660
 * Compute the number of 4-byte code units (char32_t) that this UTF-16BE string
3661
 * would require in UTF-32 format.
3662
 *
3663
 * This function is equivalent to count_utf16be.
3664
 *
3665
 * This function does not validate the input. It is acceptable to pass invalid
3666
 * UTF-16 strings but in such cases the result is implementation defined.
3667
 *
3668
 * This function is not BOM-aware.
3669
 *
3670
 * @param input         the UTF-16BE string to convert
3671
 * @param length        the length of the string in 2-byte code units (char16_t)
3672
 * @return number of 4-byte code units (char32_t) needed for the UTF-32 form
3673
 */
3674
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input,
                                                     size_t length) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of utf32_length_from_utf16be.
 *
 * Under C++23 constant evaluation the scalar routine (big-endian
 * instantiation) is invoked directly; otherwise the call is forwarded to the
 * pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
utf32_length_from_utf16be(
    std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const src = valid_utf16_input.data();
  const size_t src_len = valid_utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(src,
                                                                   src_len);
  } else
    #endif
  {
    return utf32_length_from_utf16be(src, src_len);
  }
}
  #endif // SIMDUTF_SPAN
3692
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3693
3694
#if SIMDUTF_FEATURE_UTF16
3695
/**
3696
 * Count the number of code points (characters) in the string assuming that
3697
 * it is valid.
3698
 *
3699
 * This function assumes that the input string is valid UTF-16 (native
3700
 * endianness). It is acceptable to pass invalid UTF-16 strings but in such
3701
 * cases the result is implementation defined.
3702
 *
3703
 * This function is not BOM-aware.
3704
 *
3705
 * @param input         the UTF-16 string to process
3706
 * @param length        the length of the string in 2-byte code units (char16_t)
3707
 * @return number of code points
3708
 */
3709
simdutf_warn_unused size_t count_utf16(const char16_t *input,
                                       size_t length) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of count_utf16 (native endianness).
 *
 * Under C++23 constant evaluation the scalar code-point counter is invoked
 * directly; otherwise the call is forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
count_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const src = valid_utf16_input.data();
  const size_t src_len = valid_utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16::count_code_points<endianness::NATIVE>(src, src_len);
  } else
    #endif
  {
    return count_utf16(src, src_len);
  }
}
  #endif // SIMDUTF_SPAN
3725
3726
/**
3727
 * Count the number of code points (characters) in the string assuming that
3728
 * it is valid.
3729
 *
3730
 * This function assumes that the input string is valid UTF-16LE.
3731
 * It is acceptable to pass invalid UTF-16 strings but in such cases
3732
 * the result is implementation defined.
3733
 *
3734
 * This function is not BOM-aware.
3735
 *
3736
 * @param input         the UTF-16LE string to process
3737
 * @param length        the length of the string in 2-byte code units (char16_t)
3738
 * @return number of code points
3739
 */
3740
simdutf_warn_unused size_t count_utf16le(const char16_t *input,
                                         size_t length) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of count_utf16le.
 *
 * Under C++23 constant evaluation the scalar code-point counter
 * (little-endian instantiation) is invoked directly; otherwise the call is
 * forwarded to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
count_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const src = valid_utf16_input.data();
  const size_t src_len = valid_utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16::count_code_points<endianness::LITTLE>(src, src_len);
  } else
    #endif
  {
    return count_utf16le(src, src_len);
  }
}
  #endif // SIMDUTF_SPAN
3756
3757
/**
3758
 * Count the number of code points (characters) in the string assuming that
3759
 * it is valid.
3760
 *
3761
 * This function assumes that the input string is valid UTF-16BE.
3762
 * It is acceptable to pass invalid UTF-16 strings but in such cases
3763
 * the result is implementation defined.
3764
 *
3765
 * This function is not BOM-aware.
3766
 *
3767
 * @param input         the UTF-16BE string to process
3768
 * @param length        the length of the string in 2-byte code units (char16_t)
3769
 * @return number of code points
3770
 */
3771
simdutf_warn_unused size_t count_utf16be(const char16_t *input,
                                         size_t length) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span-based overload of count_utf16be.
 *
 * Under C++23 constant evaluation the scalar code-point counter (big-endian
 * instantiation) is invoked directly; otherwise the call is forwarded to the
 * pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
count_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const src = valid_utf16_input.data();
  const size_t src_len = valid_utf16_input.size();
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    return scalar::utf16::count_code_points<endianness::BIG>(src, src_len);
  } else
    #endif
  {
    return count_utf16be(src, src_len);
  }
}
  #endif // SIMDUTF_SPAN
3787
#endif   // SIMDUTF_FEATURE_UTF16
3788
3789
#if SIMDUTF_FEATURE_UTF8
3790
/**
3791
 * Count the number of code points (characters) in the string assuming that
3792
 * it is valid.
3793
 *
3794
 * This function assumes that the input string is valid UTF-8.
3795
 * It is acceptable to pass invalid UTF-8 strings but in such cases
3796
 * the result is implementation defined.
3797
 *
3798
 * @param input         the UTF-8 string to process
3799
 * @param length        the length of the string in bytes
3800
 * @return number of code points
3801
 */
3802
simdutf_warn_unused size_t count_utf8(const char *input,
3803
                                      size_t length) noexcept;
3804
  #if SIMDUTF_SPAN
3805
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t count_utf8(
3806
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
3807
    #if SIMDUTF_CPLUSPLUS23
3808
  if consteval {
3809
    return scalar::utf8::count_code_points(valid_utf8_input.data(),
3810
                                           valid_utf8_input.size());
3811
  } else
3812
    #endif
3813
  {
3814
    return count_utf8(reinterpret_cast<const char *>(valid_utf8_input.data()),
3815
                      valid_utf8_input.size());
3816
  }
3817
}
3818
  #endif // SIMDUTF_SPAN
3819
3820
/**
3821
 * Given a valid UTF-8 string having a possibly truncated last character,
3822
 * this function checks the end of string. If the last character is truncated
3823
 * (or partial), then it returns a shorter length (shorter by 1 to 3 bytes) so
3824
 * that the short UTF-8 strings only contain complete characters. If there is no
3825
 * truncated character, the original length is returned.
3826
 *
3827
 * This function assumes that the input string is valid UTF-8, but possibly
3828
 * truncated.
3829
 *
3830
 * @param input         the UTF-8 string to process
3831
 * @param length        the length of the string in bytes
3832
 * @return the length of the string in bytes, possibly shorter by 1 to 3 bytes
3833
 */
3834
simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length);
3835
  #if SIMDUTF_SPAN
3836
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3837
trim_partial_utf8(
3838
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
3839
    #if SIMDUTF_CPLUSPLUS23
3840
  if consteval {
3841
    return scalar::utf8::trim_partial_utf8(valid_utf8_input.data(),
3842
                                           valid_utf8_input.size());
3843
  } else
3844
    #endif
3845
  {
3846
    return trim_partial_utf8(
3847
        reinterpret_cast<const char *>(valid_utf8_input.data()),
3848
        valid_utf8_input.size());
3849
  }
3850
}
3851
  #endif // SIMDUTF_SPAN
3852
#endif   // SIMDUTF_FEATURE_UTF8
3853
3854
#if SIMDUTF_FEATURE_UTF16
3855
/**
3856
 * Given a valid UTF-16BE string having a possibly truncated last character,
3857
 * this function checks the end of string. If the last character is truncated
3858
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
3859
 * the short UTF-16BE strings only contain complete characters. If there is no
3860
 * truncated character, the original length is returned.
3861
 *
3862
 * This function assumes that the input string is valid UTF-16BE, but possibly
3863
 * truncated.
3864
 *
3865
 * @param input         the UTF-16BE string to process
3866
 * @param length        the length of the string in 2-byte code units (char16_t)
3867
 * @return the length of the string in code units, possibly shorter by 1 unit
3868
 */
3869
simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input,
3870
                                                size_t length);
3871
  #if SIMDUTF_SPAN
3872
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3873
0
trim_partial_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
3874
0
    #if SIMDUTF_CPLUSPLUS23
3875
0
  if consteval {
3876
0
    return scalar::utf16::trim_partial_utf16<endianness::BIG>(
3877
0
        valid_utf16_input.data(), valid_utf16_input.size());
3878
0
  } else
3879
0
    #endif
3880
0
  {
3881
0
    return trim_partial_utf16be(valid_utf16_input.data(),
3882
0
                                valid_utf16_input.size());
3883
0
  }
3884
0
}
3885
  #endif // SIMDUTF_SPAN
3886
3887
/**
3888
 * Given a valid UTF-16LE string having a possibly truncated last character,
3889
 * this function checks the end of string. If the last character is truncated
3890
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
3891
 * the short UTF-16LE strings only contain complete characters. If there is no
3892
 * truncated character, the original length is returned.
3893
 *
3894
 * This function assumes that the input string is valid UTF-16LE, but possibly
3895
 * truncated.
3896
 *
3897
 * @param input         the UTF-16LE string to process
3898
 * @param length        the length of the string in 2-byte code units (char16_t)
3899
 * @return the length of the string in code units, possibly shorter by 1 unit
3900
 */
3901
simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input,
3902
                                                size_t length);
3903
  #if SIMDUTF_SPAN
3904
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3905
0
trim_partial_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
3906
0
    #if SIMDUTF_CPLUSPLUS23
3907
0
  if consteval {
3908
0
    return scalar::utf16::trim_partial_utf16<endianness::LITTLE>(
3909
0
        valid_utf16_input.data(), valid_utf16_input.size());
3910
0
  } else
3911
0
    #endif
3912
0
  {
3913
0
    return trim_partial_utf16le(valid_utf16_input.data(),
3914
0
                                valid_utf16_input.size());
3915
0
  }
3916
0
}
3917
  #endif // SIMDUTF_SPAN
3918
3919
/**
3920
 * Given a valid UTF-16 string having a possibly truncated last character,
3921
 * this function checks the end of string. If the last character is truncated
3922
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
3923
 * the short UTF-16 strings only contain complete characters. If there is no
3924
 * truncated character, the original length is returned.
3925
 *
3926
 * This function assumes that the input string is valid UTF-16, but possibly
3927
 * truncated. We use the native endianness.
3928
 *
3929
 * @param input         the UTF-16 string to process
3930
 * @param length        the length of the string in 2-byte code units (char16_t)
3931
 * @return the length of the string in code units, possibly shorter by 1 unit
3932
 */
3933
simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input,
3934
                                              size_t length);
3935
  #if SIMDUTF_SPAN
3936
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3937
0
trim_partial_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
3938
0
    #if SIMDUTF_CPLUSPLUS23
3939
0
  if consteval {
3940
0
    return scalar::utf16::trim_partial_utf16<endianness::NATIVE>(
3941
0
        valid_utf16_input.data(), valid_utf16_input.size());
3942
0
  } else
3943
0
    #endif
3944
0
  {
3945
0
    return trim_partial_utf16(valid_utf16_input.data(),
3946
0
                              valid_utf16_input.size());
3947
0
  }
3948
0
}
3949
  #endif // SIMDUTF_SPAN
3950
#endif   // SIMDUTF_FEATURE_UTF16
3951
3952
#if SIMDUTF_FEATURE_BASE64 || SIMDUTF_FEATURE_UTF16 ||                         \
3953
    SIMDUTF_FEATURE_DETECT_ENCODING
3954
  #ifndef SIMDUTF_NEED_TRAILING_ZEROES
3955
    #define SIMDUTF_NEED_TRAILING_ZEROES 1
3956
  #endif
3957
#endif // SIMDUTF_FEATURE_BASE64 || SIMDUTF_FEATURE_UTF16 ||
3958
       // SIMDUTF_FEATURE_DETECT_ENCODING
3959
3960
#if SIMDUTF_FEATURE_BASE64
3961
// base64_options are used to specify the base64 encoding options.
3962
// ASCII spaces are ' ', '\t', '\n', '\r', '\f'
3963
// garbage characters are characters that are not part of the base64 alphabet
3964
// nor ASCII spaces.
3965
constexpr uint64_t base64_reverse_padding =
3966
    2; /* modifier for base64_default and base64_url */
3967
enum base64_options : uint64_t {
3968
  base64_default = 0, /* standard base64 format (with padding) */
3969
  base64_url = 1,     /* base64url format (no padding) */
3970
  base64_default_no_padding =
3971
      base64_default |
3972
      base64_reverse_padding, /* standard base64 format without padding */
3973
  base64_url_with_padding =
3974
      base64_url | base64_reverse_padding, /* base64url with padding */
3975
  base64_default_accept_garbage =
3976
      4, /* standard base64 format accepting garbage characters, the input stops
3977
            with the first '=' if any */
3978
  base64_url_accept_garbage =
3979
      5, /* base64url format accepting garbage characters, the input stops with
3980
            the first '=' if any */
3981
  base64_default_or_url =
3982
      8, /* standard/base64url hybrid format (only meaningful for decoding!) */
3983
  base64_default_or_url_accept_garbage =
3984
      12, /* standard/base64url hybrid format accepting garbage characters
3985
             (only meaningful for decoding!), the input stops with the first '='
3986
             if any */
3987
};
3988
3989
// last_chunk_handling_options are used to specify the handling of the last
3990
// chunk in base64 decoding.
3991
// https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3992
enum last_chunk_handling_options : uint64_t {
3993
  loose = 0,  /* standard base64 format, decode partial final chunk */
3994
  strict = 1, /* error when the last chunk is partial, 2 or 3 chars, and
3995
                 unpadded, or non-zero bit padding */
3996
  stop_before_partial =
3997
      2, /* if the last chunk is partial, ignore it (no error) */
3998
  only_full_chunks =
3999
      3 /* only decode full blocks (4 base64 characters, no padding) */
4000
};
4001
4002
inline simdutf_constexpr23 bool
4003
is_partial(last_chunk_handling_options options) {
4004
  return (options == stop_before_partial) || (options == only_full_chunks);
4005
}
4006
4007
namespace detail {
4008
simdutf_warn_unused const char *find(const char *start, const char *end,
4009
                                     char character) noexcept;
4010
simdutf_warn_unused const char16_t *
4011
find(const char16_t *start, const char16_t *end, char16_t character) noexcept;
4012
} // namespace detail
4013
4014
/**
4015
 * Find the first occurrence of a character in a string. If the character is
4016
 * not found, return a pointer to the end of the string.
4017
 * @param start        the start of the string
4018
 * @param end          the end of the string
4019
 * @param character    the character to find
4020
 * @return a pointer to the first occurrence of the character in the string,
4021
 * or a pointer to the end of the string if the character is not found.
4022
 *
4023
 */
4024
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 const char *
4025
find(const char *start, const char *end, char character) noexcept {
4026
  #if SIMDUTF_CPLUSPLUS23
4027
  if consteval {
4028
    for (; start != end; ++start)
4029
      if (*start == character)
4030
        return start;
4031
    return end;
4032
  } else
4033
  #endif
4034
  {
4035
    return detail::find(start, end, character);
4036
  }
4037
}
4038
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 const char16_t *
4039
find(const char16_t *start, const char16_t *end, char16_t character) noexcept {
4040
    // implementation note: this is repeated instead of a template, to ensure
4041
    // the api is still a function and compiles without concepts
4042
  #if SIMDUTF_CPLUSPLUS23
4043
  if consteval {
4044
    for (; start != end; ++start)
4045
      if (*start == character)
4046
        return start;
4047
    return end;
4048
  } else
4049
  #endif
4050
  {
4051
    return detail::find(start, end, character);
4052
  }
4053
}
4054
}
4055
  // We include base64_tables once.
4056
  #include <simdutf/base64_tables.h>
4057
  #include <simdutf/scalar/base64.h>
4058
4059
namespace simdutf {
4060
4061
  #if SIMDUTF_CPLUSPLUS17
4062
0
/**
 * Return the identifier of a base64_options enumerator as text.
 *
 * Each enumerator of base64_options has its own case; any value that is not
 * a named enumerator yields "<unknown>". The returned view refers to a string
 * literal.
 *
 * @param options       the base64 option to name
 * @return the name of the enumerator, or "<unknown>" if the value does not
 * match any enumerator.
 */
inline std::string_view to_string(base64_options options) {
  switch (options) {
  case base64_default:
    return "base64_default";
  case base64_url:
    return "base64_url";
  // Value 2 is the enumerator base64_default_no_padding
  // (base64_default | base64_reverse_padding). Name the enumerator rather
  // than the base64_reverse_padding modifier constant, which is not a member
  // of the enum.
  case base64_default_no_padding:
    return "base64_default_no_padding";
  case base64_url_with_padding:
    return "base64_url_with_padding";
  case base64_default_accept_garbage:
    return "base64_default_accept_garbage";
  case base64_url_accept_garbage:
    return "base64_url_accept_garbage";
  case base64_default_or_url:
    return "base64_default_or_url";
  case base64_default_or_url_accept_garbage:
    return "base64_default_or_url_accept_garbage";
  }
  return "<unknown>";
}
4083
  #endif // SIMDUTF_CPLUSPLUS17
4084
4085
  #if SIMDUTF_CPLUSPLUS17
4086
0
inline std::string_view to_string(last_chunk_handling_options options) {
4087
0
  switch (options) {
4088
0
  case loose:
4089
0
    return "loose";
4090
0
  case strict:
4091
0
    return "strict";
4092
0
  case stop_before_partial:
4093
0
    return "stop_before_partial";
4094
0
  case only_full_chunks:
4095
0
    return "only_full_chunks";
4096
0
  }
4097
0
  return "<unknown>";
4098
0
}
4099
  #endif
4100
4101
/**
4102
 * Provide the maximal binary length in bytes given the base64 input.
4103
 * As long as the input does not contain ignorable characters (e.g., ASCII
4104
 * spaces or linefeed characters), the result is exact. In particular, the
4105
 * function checks for padding characters.
4106
 *
4107
 * The function is fast (constant time). It checks up to two characters at
4108
 * the end of the string. The input is not otherwise validated or read.
4109
 *
4110
 * @param input         the base64 input to process
4111
 * @param length        the length of the base64 input in bytes
4112
 * @return maximum number of binary bytes
4113
 */
4114
simdutf_warn_unused size_t
4115
maximal_binary_length_from_base64(const char *input, size_t length) noexcept;
4116
  #if SIMDUTF_SPAN
4117
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4118
maximal_binary_length_from_base64(
4119
    const detail::input_span_of_byte_like auto &input) noexcept {
4120
    #if SIMDUTF_CPLUSPLUS23
4121
  if consteval {
4122
    return scalar::base64::maximal_binary_length_from_base64(
4123
        detail::constexpr_cast_ptr<uint8_t>(input.data()), input.size());
4124
  } else
4125
    #endif
4126
  {
4127
    return maximal_binary_length_from_base64(
4128
        reinterpret_cast<const char *>(input.data()), input.size());
4129
  }
4130
}
4131
  #endif // SIMDUTF_SPAN
4132
4133
/**
4134
 * Provide the maximal binary length in bytes given the base64 input.
4135
 * As long as the input does not contain ignorable characters (e.g., ASCII
4136
 * spaces or linefeed characters), the result is exact. In particular, the
4137
 * function checks for padding characters.
4138
 *
4139
 * The function is fast (constant time). It checks up to two characters at
4140
 * the end of the string. The input is not otherwise validated or read.
4141
 *
4142
 * @param input         the base64 input to process, in ASCII stored as 16-bit
4143
 * units
4144
 * @param length        the length of the base64 input in 16-bit units
4145
 * @return maximal number of binary bytes
4146
 */
4147
simdutf_warn_unused size_t maximal_binary_length_from_base64(
4148
    const char16_t *input, size_t length) noexcept;
4149
  #if SIMDUTF_SPAN
4150
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4151
0
maximal_binary_length_from_base64(std::span<const char16_t> input) noexcept {
4152
0
    #if SIMDUTF_CPLUSPLUS23
4153
0
  if consteval {
4154
0
    return scalar::base64::maximal_binary_length_from_base64(input.data(),
4155
0
                                                             input.size());
4156
0
  } else
4157
0
    #endif
4158
0
  {
4159
0
    return maximal_binary_length_from_base64(input.data(), input.size());
4160
0
  }
4161
0
}
4162
  #endif // SIMDUTF_SPAN
4163
4164
/**
4165
 * Convert a base64 input to a binary output.
4166
 *
4167
 * This function follows the WHATWG forgiving-base64 format, which means that it
4168
 * will ignore any ASCII spaces in the input. You may provide a padded input
4169
 * (with one or two equal signs at the end) or an unpadded input (without any
4170
 * equal signs at the end).
4171
 *
4172
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
4173
 *
4174
 * This function will fail in case of invalid input. When last_chunk_options =
4175
 * loose, there are two possible reasons for failure: the input contains a
4176
 * number of base64 characters that when divided by 4, leaves a single remainder
4177
 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
4178
 * not a valid base64 character (INVALID_BASE64_CHARACTER).
4179
 *
4180
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
4181
 * input where the invalid character was found. When the error is
4182
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
4183
 *
4184
 * The default option (simdutf::base64_default) expects the characters `+` and
4185
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
4186
 * characters `-` and `_` as part of its alphabet.
4187
 *
4188
 * The padding (`=`) is validated if present. There may be at most two padding
4189
 * characters at the end of the input. If there are any padding characters, the
4190
 * total number of characters (excluding spaces but including padding
4191
 * characters) must be divisible by four.
4192
 *
4193
 * You should call this function with a buffer that is at least
4194
 * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
4195
 * provide that much space, the function may cause a buffer overflow.
4196
 *
4197
 * Advanced users may want to tailor how the last chunk is handled. By default,
4198
 * we use a loose (forgiving) approach but we also support a strict approach
4199
 * as well as a stop_before_partial approach, as per the following proposal:
4200
 *
4201
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
4202
 *
4203
 * @param input         the base64 string to process
4204
 * @param length        the length of the string in bytes
4205
 * @param output        the pointer to a buffer that can hold the conversion
4206
 * result (should be at least maximal_binary_length_from_base64(input, length)
4207
 * bytes long).
4208
 * @param options       the base64 options to use, usually base64_default or
4209
 * base64_url, and base64_default by default.
4210
 * @param last_chunk_options the last chunk handling options,
4211
 * last_chunk_handling_options::loose by default
4212
 * but can also be last_chunk_handling_options::strict or
4213
 * last_chunk_handling_options::stop_before_partial.
4214
 * @return a result pair struct (of type simdutf::result containing the two
4215
 * fields error and count) with an error code and either position of the error
4216
 * (in the input in bytes) if any, or the number of bytes written if successful.
4217
 */
4218
simdutf_warn_unused result base64_to_binary(
4219
    const char *input, size_t length, char *output,
4220
    base64_options options = base64_default,
4221
    last_chunk_handling_options last_chunk_options = loose) noexcept;
4222
  #if SIMDUTF_SPAN
4223
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
4224
base64_to_binary(
4225
    const detail::input_span_of_byte_like auto &input,
4226
    detail::output_span_of_byte_like auto &&binary_output,
4227
    base64_options options = base64_default,
4228
    last_chunk_handling_options last_chunk_options = loose) noexcept {
4229
    #if SIMDUTF_CPLUSPLUS23
4230
  if consteval {
4231
    return scalar::base64::base64_to_binary_details_impl(
4232
        input.data(), input.size(), binary_output.data(), options,
4233
        last_chunk_options);
4234
  } else
4235
    #endif
4236
  {
4237
    return base64_to_binary(reinterpret_cast<const char *>(input.data()),
4238
                            input.size(),
4239
                            reinterpret_cast<char *>(binary_output.data()),
4240
                            options, last_chunk_options);
4241
  }
4242
}
4243
  #endif // SIMDUTF_SPAN
4244
4245
/**
4246
 * Provide the base64 length in bytes given the length of a binary input.
4247
 *
4248
 * @param length        the length of the input in bytes
4249
 * @return number of base64 bytes
4250
 */
4251
inline simdutf_warn_unused simdutf_constexpr23 size_t base64_length_from_binary(
4252
    size_t length, base64_options options = base64_default) noexcept {
4253
  return scalar::base64::base64_length_from_binary(length, options);
4254
}
4255
4256
/**
4257
 * Provide the base64 length in bytes given the length of a binary input,
4258
 * taking into account line breaks.
4259
 *
4260
 * @param length        the length of the input in bytes
4261
 * @param line_length   the length of lines, must be at least 4 (otherwise it is
4262
 * interpreted as 4),
4263
 * @return number of base64 bytes
4264
 */
4265
inline simdutf_warn_unused simdutf_constexpr23 size_t
4266
base64_length_from_binary_with_lines(
4267
    size_t length, base64_options options = base64_default,
4268
    size_t line_length = default_line_length) noexcept {
4269
  return scalar::base64::base64_length_from_binary_with_lines(length, options,
4270
                                                              line_length);
4271
}
4272
4273
/**
4274
 * Convert a binary input to a base64 output.
4275
 *
4276
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
4277
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
4278
 * output to ensure that the output length is a multiple of four.
4279
 *
4280
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
4281
 * of its alphabet. No padding is added at the end of the output.
4282
 *
4283
 * This function always succeeds.
4284
 *
4285
 * @param input         the binary to process
4286
 * @param length        the length of the input in bytes
4287
 * @param output        the pointer to a buffer that can hold the conversion
4288
 * result (should be at least base64_length_from_binary(length) bytes long)
4289
 * @param options       the base64 options to use, can be base64_default or
4290
 * base64_url, is base64_default by default.
4291
 * @return number of written bytes, will be equal to
4292
 * base64_length_from_binary(length, options)
4293
 */
4294
size_t binary_to_base64(const char *input, size_t length, char *output,
4295
                        base64_options options = base64_default) noexcept;
4296
  #if SIMDUTF_SPAN
4297
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4298
binary_to_base64(const detail::input_span_of_byte_like auto &input,
4299
                 detail::output_span_of_byte_like auto &&binary_output,
4300
                 base64_options options = base64_default) noexcept {
4301
    #if SIMDUTF_CPLUSPLUS23
4302
  if consteval {
4303
    return scalar::base64::tail_encode_base64(
4304
        binary_output.data(), input.data(), input.size(), options);
4305
  } else
4306
    #endif
4307
  {
4308
    return binary_to_base64(
4309
        reinterpret_cast<const char *>(input.data()), input.size(),
4310
        reinterpret_cast<char *>(binary_output.data()), options);
4311
  }
4312
}
4313
  #endif // SIMDUTF_SPAN
4314
4315
/**
4316
 * Convert a binary input to a base64 output with line breaks.
4317
 *
4318
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
4319
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
4320
 * output to ensure that the output length is a multiple of four.
4321
 *
4322
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
4323
 * of its alphabet. No padding is added at the end of the output.
4324
 *
4325
 * This function always succeeds.
4326
 *
4327
 * @param input         the binary to process
4328
 * @param length        the length of the input in bytes
4329
 * @param output        the pointer to a buffer that can hold the conversion
4330
 * result (should be at least base64_length_from_binary_with_lines(length,
4331
 * options, line_length) bytes long)
4332
 * @param line_length   the length of lines, must be at least 4 (otherwise it is
4333
 * interpreted as 4),
4334
 * @param options       the base64 options to use, can be base64_default or
4335
 * base64_url, is base64_default by default.
4336
 * @return number of written bytes, will be equal to
4337
 * base64_length_from_binary_with_lines(length, options, line_length)
4338
 */
4339
size_t
4340
binary_to_base64_with_lines(const char *input, size_t length, char *output,
4341
                            size_t line_length = simdutf::default_line_length,
4342
                            base64_options options = base64_default) noexcept;
4343
  #if SIMDUTF_SPAN
4344
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4345
binary_to_base64_with_lines(
4346
    const detail::input_span_of_byte_like auto &input,
4347
    detail::output_span_of_byte_like auto &&binary_output,
4348
    size_t line_length = simdutf::default_line_length,
4349
    base64_options options = base64_default) noexcept {
4350
    #if SIMDUTF_CPLUSPLUS23
4351
  if consteval {
4352
    return scalar::base64::tail_encode_base64_impl<true>(
4353
        binary_output.data(), input.data(), input.size(), options, line_length);
4354
  } else
4355
    #endif
4356
  {
4357
    return binary_to_base64_with_lines(
4358
        reinterpret_cast<const char *>(input.data()), input.size(),
4359
        reinterpret_cast<char *>(binary_output.data()), line_length, options);
4360
  }
4361
}
4362
  #endif // SIMDUTF_SPAN
4363
4364
  #if SIMDUTF_ATOMIC_REF
4365
/**
4366
 * Convert a binary input to a base64 output, using atomic accesses.
4367
 * This function comes with a potentially significant performance
4368
 * penalty, but it may be useful in some cases where the input
4369
 * buffers are shared between threads, to avoid undefined
4370
 * behavior in case of data races.
4371
 *
4372
 * The function is for advanced users. Its main use case is
4373
 * to silence sanitizer warnings. We have no documented use case
4374
 * where this function is actually necessary in terms of practical correctness.
4375
 *
4376
 * This function is only available when simdutf is compiled with
4377
 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
4378
 * the availability of this function by checking the macro
4379
 * SIMDUTF_ATOMIC_REF.
4380
 *
4381
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
4382
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
4383
 * output to ensure that the output length is a multiple of four.
4384
 *
4385
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
4386
 * of its alphabet. No padding is added at the end of the output.
4387
 *
4388
 * This function always succeeds.
4389
 *
4390
 * This function is considered experimental. It is not tested by default
4391
 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
4392
 * It is not documented in the public API documentation (README). It is
4393
 * offered on a best effort basis. We rely on the community for further
4394
 * testing and feedback.
4395
 *
4396
 * @brief atomic_binary_to_base64
4397
 * @param input         the binary to process
4398
 * @param length        the length of the input in bytes
4399
 * @param output        the pointer to a buffer that can hold the conversion
4400
 * result (should be at least base64_length_from_binary(length) bytes long)
4401
 * @param options       the base64 options to use, can be base64_default or
4402
 * base64_url, is base64_default by default.
4403
 * @return number of written bytes, will be equal to
4404
 * base64_length_from_binary(length, options)
4405
 */
4406
size_t
4407
atomic_binary_to_base64(const char *input, size_t length, char *output,
4408
                        base64_options options = base64_default) noexcept;
4409
    #if SIMDUTF_SPAN
4410
simdutf_really_inline simdutf_warn_unused size_t
4411
atomic_binary_to_base64(const detail::input_span_of_byte_like auto &input,
4412
                        detail::output_span_of_byte_like auto &&binary_output,
4413
                        base64_options options = base64_default) noexcept {
4414
  return atomic_binary_to_base64(
4415
      reinterpret_cast<const char *>(input.data()), input.size(),
4416
      reinterpret_cast<char *>(binary_output.data()), options);
4417
}
4418
    #endif // SIMDUTF_SPAN
4419
  #endif   // SIMDUTF_ATOMIC_REF
4420
4421
/**
4422
 * Convert a base64 input to a binary output.
4423
 *
4424
 * This function follows the WHATWG forgiving-base64 format, which means that it
4425
 * will ignore any ASCII spaces in the input. You may provide a padded input
4426
 * (with one or two equal signs at the end) or an unpadded input (without any
4427
 * equal signs at the end).
4428
 *
4429
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
4430
 *
4431
 * This function will fail in case of invalid input. When last_chunk_options =
4432
 * loose, there are two possible reasons for failure: the input contains a
4433
 * number of base64 characters that when divided by 4, leaves a single remainder
4434
 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
4435
 * not a valid base64 character (INVALID_BASE64_CHARACTER).
4436
 *
4437
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
4438
 * input where the invalid character was found. When the error is
4439
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
4440
 *
4441
 * The default option (simdutf::base64_default) expects the characters `+` and
4442
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
4443
 * characters `-` and `_` as part of its alphabet.
4444
 *
4445
 * The padding (`=`) is validated if present. There may be at most two padding
4446
 * characters at the end of the input. If there are any padding characters, the
4447
 * total number of characters (excluding spaces but including padding
4448
 * characters) must be divisible by four.
4449
 *
4450
 * You should call this function with a buffer that is at least
4451
 * maximal_binary_length_from_base64(input, length) bytes long. If you fail
4452
 * to provide that much space, the function may cause a buffer overflow.
4453
 *
4454
 * Advanced users may want to tailor how the last chunk is handled. By default,
4455
 * we use a loose (forgiving) approach but we also support a strict approach
4456
 * as well as a stop_before_partial approach, as per the following proposal:
4457
 *
4458
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
4459
 *
4460
 * @param input         the base64 string to process, in ASCII stored as 16-bit
4461
 * units
4462
 * @param length        the length of the string in 16-bit units
4463
 * @param output        the pointer to a buffer that can hold the conversion
4464
 * result (should be at least maximal_binary_length_from_base64(input, length)
4465
 * bytes long).
4466
 * @param options       the base64 options to use, can be base64_default or
4467
 * base64_url, is base64_default by default.
4468
 * @param last_chunk_options the last chunk handling options,
4469
 * last_chunk_handling_options::loose by default
4470
 * but can also be last_chunk_handling_options::strict or
4471
 * last_chunk_handling_options::stop_before_partial.
4472
 * @return a result pair struct (of type simdutf::result containing the two
4473
 * fields error and count) with an error code and position of the
4474
 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
4475
 * of bytes written if successful.
4476
 */
4477
// Declaration only; no body is given at this point of the header.
// 16-bit input overload: `length` counts 16-bit units, and `output` is a raw
// pointer with no capacity argument, so the caller is responsible for sizing
// it as required by the comment above.
simdutf_warn_unused result
4478
base64_to_binary(const char16_t *input, size_t length, char *output,
4479
                 base64_options options = base64_default,
4480
                 last_chunk_handling_options last_chunk_options =
4481
                     last_chunk_handling_options::loose) noexcept;
4482
  #if SIMDUTF_SPAN
4483
/**
 * Span overload of base64_to_binary for base64 text stored as 16-bit units.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the work
 * is done by scalar::base64::base64_to_binary_details_impl. In every other
 * case the call is forwarded to the pointer-based base64_to_binary overload,
 * with the output storage viewed as char.
 *
 * Only binary_output.data() is used here: binary_output.size() is never read,
 * so no capacity is passed along on either path.
 *
 * @param input         the base64 text, as a span of char16_t
 * @param binary_output the destination span of byte-like elements
 * @param options       forwarded unchanged to the selected callee
 * @param last_chunk_options forwarded unchanged to the selected callee,
 * loose by default
 * @return the value returned by the selected callee
 */
simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
4484
base64_to_binary(
4485
    std::span<const char16_t> input,
4486
    detail::output_span_of_byte_like auto &&binary_output,
4487
    base64_options options = base64_default,
4488
    last_chunk_handling_options last_chunk_options = loose) noexcept {
4489
    #if SIMDUTF_CPLUSPLUS23
4490
  // Compile-time path: call the scalar routine directly.
  if consteval {
4491
    return scalar::base64::base64_to_binary_details_impl(
4492
        input.data(), input.size(), binary_output.data(), options,
4493
        last_chunk_options);
4494
  } else
4495
    #endif
4496
  {
4497
    // Runtime path: hand over to the pointer-based overload.
    return base64_to_binary(input.data(), input.size(),
4498
                            reinterpret_cast<char *>(binary_output.data()),
4499
                            options, last_chunk_options);
4500
  }
4501
}
4502
  #endif // SIMDUTF_SPAN
4503
4504
/**
4505
 * Check if a character is an ignorable base64 character.
4506
 * Checking a large input, character by character, is not computationally
4507
 * efficient.
4508
 *
4509
 * @param input         the character to check
4510
 * @param options       the base64 options to use, is base64_default by default.
4511
 * @return true if the character is an ignorable base64 character, false
4512
 * otherwise.
4513
 */
4514
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4515
base64_ignorable(char input, base64_options options = base64_default) noexcept {
4516
  // Single-character check: the answer comes from the scalar helper.
  return scalar::base64::is_ignorable(input, options);
4517
}
4518
/**
 * 16-bit overload of base64_ignorable: forwards the char16_t value and the
 * options to scalar::base64::is_ignorable and returns its answer.
 */
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4519
base64_ignorable(char16_t input,
4520
                 base64_options options = base64_default) noexcept {
4521
  return scalar::base64::is_ignorable(input, options);
4522
}
4523
4524
/**
4525
 * Check if a character is a valid base64 character.
4526
 * Checking a large input, character by character, is not computationally
4527
 * efficient.
4528
 * Note that padding characters are not considered valid base64 characters in
4529
 * this context, nor are spaces.
4530
 *
4531
 * @param input         the character to check
4532
 * @param options       the base64 options to use, is base64_default by default.
4533
 * @return true if the character is a base64 character, false otherwise.
4534
 */
4535
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4536
0
base64_valid(char input, base64_options options = base64_default) noexcept {
4537
0
  // Single-character check: the answer comes from the scalar helper.
  return scalar::base64::is_base64(input, options);
4538
0
}
4539
/**
 * 16-bit overload of base64_valid: forwards the char16_t value and the options
 * to scalar::base64::is_base64 and returns its answer.
 */
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4540
0
base64_valid(char16_t input, base64_options options = base64_default) noexcept {
4541
0
  return scalar::base64::is_base64(input, options);
4542
0
}
4543
4544
/**
4545
 * Check if a character is a valid base64 character or the padding character
4546
 * ('='). Checking a large input, character by character, is not computationally
4547
 * efficient.
4548
 *
4549
 * @param input         the character to check
4550
 * @param options       the base64 options to use, is base64_default by default.
4551
 * @return true if the character is a base64 character or '=', false otherwise.
4552
 */
4553
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4554
base64_valid_or_padding(char input,
4555
0
                        base64_options options = base64_default) noexcept {
4556
0
  // Single-character check: the answer comes from the scalar helper.
  return scalar::base64::is_base64_or_padding(input, options);
4557
0
}
4558
/**
 * 16-bit overload of base64_valid_or_padding: forwards the char16_t value and
 * the options to scalar::base64::is_base64_or_padding and returns its answer.
 */
simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4559
base64_valid_or_padding(char16_t input,
4560
0
                        base64_options options = base64_default) noexcept {
4561
0
  return scalar::base64::is_base64_or_padding(input, options);
4562
0
}
4563
4564
/**
4565
 * Convert a base64 input to a binary output.
4566
 *
4567
 * This function follows the WHATWG forgiving-base64 format, which means that it
4568
 * will ignore any ASCII spaces in the input. You may provide a padded input
4569
 * (with one or two equal signs at the end) or an unpadded input (without any
4570
 * equal signs at the end).
4571
 *
4572
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
4573
 *
4574
 * This function will fail in case of invalid input. When last_chunk_options =
4575
 * loose, there are three possible reasons for failure: the input contains a
4576
 * number of base64 characters that when divided by 4, leaves a single remainder
4577
 * character (BASE64_INPUT_REMAINDER), the input contains a character that is
4578
 * not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer
4579
 * is too small (OUTPUT_BUFFER_TOO_SMALL).
4580
 *
4581
 * When OUTPUT_BUFFER_TOO_SMALL, we return both the number of bytes written
4582
 * and the number of units processed, see description of the parameters and
4583
 * returned value.
4584
 *
4585
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
4586
 * input where the invalid character was found. When the error is
4587
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
4588
 *
4589
 * The default option (simdutf::base64_default) expects the characters `+` and
4590
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
4591
 * characters `-` and `_` as part of its alphabet.
4592
 *
4593
 * The padding (`=`) is validated if present. There may be at most two padding
4594
 * characters at the end of the input. If there are any padding characters, the
4595
 * total number of characters (excluding spaces but including padding
4596
 * characters) must be divisible by four.
4597
 *
4598
 * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected
4599
 * to discard the output unless the parameter decode_up_to_bad_char is set to
4600
 * true. In that case, the function will decode up to the first invalid
4601
 * character. Extra padding characters ('=') are considered invalid characters.
4602
 *
4603
 * Advanced users may want to tailor how the last chunk is handled. By default,
4604
 * we use a loose (forgiving) approach but we also support a strict approach
4605
 * as well as a stop_before_partial approach, as per the following proposal:
4606
 *
4607
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
4608
 *
4609
 * @param input         the base64 string to process, in ASCII stored as 8-bit
4610
 * or 16-bit units
4611
 * @param length        the length of the string in 8-bit or 16-bit units.
4612
 * @param output        the pointer to a buffer that can hold the conversion
4613
 * result.
4614
 * @param outlen        the number of bytes that can be written in the output
4615
 * buffer. Upon return, it is modified to reflect how many bytes were written.
4616
 * @param options       the base64 options to use, can be base64_default or
4617
 * base64_url, is base64_default by default.
4618
 * @param last_chunk_options the last chunk handling options,
4619
 * last_chunk_handling_options::loose by default
4620
 * but can also be last_chunk_handling_options::strict or
4621
 * last_chunk_handling_options::stop_before_partial.
4622
 * @param decode_up_to_bad_char if true, the function will decode up to the
4623
 * first invalid character. By default (false), it is assumed that the output
4624
 * buffer is to be discarded. When there are multiple errors in the input,
4625
 * using decode_up_to_bad_char might trigger a different error.
4626
 * @return a result pair struct (of type simdutf::result containing the two
4627
 * fields error and count) with an error code and position of the
4628
 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
4629
 * of units processed if successful.
4630
 */
4631
// 8-bit input variant (declaration only). `outlen` is passed by reference:
// it carries the output capacity in and the number of bytes written out.
simdutf_warn_unused result
4632
base64_to_binary_safe(const char *input, size_t length, char *output,
4633
                      size_t &outlen, base64_options options = base64_default,
4634
                      last_chunk_handling_options last_chunk_options =
4635
                          last_chunk_handling_options::loose,
4636
                      bool decode_up_to_bad_char = false) noexcept;
4637
// The span overload of the 8-bit variant is at the bottom of the file.
4638
4639
// 16-bit input variant (declaration only); the documentation block above
// describes both the 8-bit and the 16-bit form.
simdutf_warn_unused result
4640
base64_to_binary_safe(const char16_t *input, size_t length, char *output,
4641
                      size_t &outlen, base64_options options = base64_default,
4642
                      last_chunk_handling_options last_chunk_options =
4643
                          last_chunk_handling_options::loose,
4644
                      bool decode_up_to_bad_char = false) noexcept;
4645
  // The span overload of the 16-bit variant is at the bottom of the file.
4646
4647
  #if SIMDUTF_ATOMIC_REF
4648
/**
4649
 * Convert a base64 input to a binary output with a size limit and using atomic
4650
 * operations.
4651
 *
4652
 * Like `base64_to_binary_safe` but using atomic operations, this function is
4653
 * thread-safe for concurrent memory access, allowing the output
4654
 * buffers to be shared between threads without undefined behavior in case of
4655
 * data races.
4656
 *
4657
 * This function comes with a potentially significant performance penalty, but
4658
 * is useful when thread safety is needed during base64 decoding.
4659
 *
4660
 * This function is only available when simdutf is compiled with
4661
 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
4662
 * the availability of this function by checking the macro
4663
 * SIMDUTF_ATOMIC_REF.
4664
 *
4665
 * This function is considered experimental. It is not tested by default
4666
 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
4667
 * It is not documented in the public API documentation (README). It is
4668
 * offered on a best effort basis. We rely on the community for further
4669
 * testing and feedback.
4670
 *
4671
 * @param input         the base64 input to decode
4672
 * @param length        the length of the input in bytes
4673
 * @param output        the pointer to buffer that can hold the conversion
4674
 * result
4675
 * @param outlen        the number of bytes that can be written in the output
4676
 * buffer. Upon return, it is modified to reflect how many bytes were written.
4677
 * @param options       the base64 options to use (default, url, etc.)
4678
 * @param last_chunk_options the last chunk handling options (loose, strict,
4679
 * stop_before_partial)
4680
 * @param decode_up_to_bad_char if true, the function will decode up to the
4681
 * first invalid character. By default (false), it is assumed that the output
4682
 * buffer is to be discarded. When there are multiple errors in the input,
4683
 * using decode_up_to_bad_char might trigger a different error.
4684
 * @return a result struct with an error code and count indicating error
4685
 * position or success
4686
 */
4687
// 8-bit input variant (declaration only). `outlen` is passed by reference:
// it carries the output capacity in and the number of bytes written out.
simdutf_warn_unused result atomic_base64_to_binary_safe(
4688
    const char *input, size_t length, char *output, size_t &outlen,
4689
    base64_options options = base64_default,
4690
    last_chunk_handling_options last_chunk_options =
4691
        last_chunk_handling_options::loose,
4692
    bool decode_up_to_bad_char = false) noexcept;
4693
// 16-bit input variant (declaration only).
// NOTE(review): the comment above gives `length` in bytes; for char16_t input
// it presumably counts 16-bit units instead - confirm against the definition.
simdutf_warn_unused result atomic_base64_to_binary_safe(
4694
    const char16_t *input, size_t length, char *output, size_t &outlen,
4695
    base64_options options = base64_default,
4696
    last_chunk_handling_options last_chunk_options = loose,
4697
    bool decode_up_to_bad_char = false) noexcept;
4698
    #if SIMDUTF_SPAN
4699
/**
4700
 * @brief span overload
 *
 * Wraps the pointer-based atomic_base64_to_binary_safe for byte-like spans.
 * The capacity handed over as outlen is output.size(), and both spans are
 * viewed as char storage for the call.
 *
 * @param binary_input  the input to decode, as a span of byte-like elements
 * @param output        the destination span of byte-like elements
 * @param options       forwarded unchanged to the pointer-based overload
 * @param last_chunk_options forwarded unchanged to the pointer-based overload
 * @param decode_up_to_bad_char forwarded unchanged to the pointer-based
 * overload
4701
 * @return a tuple of result and outlen
 * (outlen as left by the pointer-based overload, which takes it by reference)
4702
 */
4703
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
4704
atomic_base64_to_binary_safe(
4705
    const detail::input_span_of_byte_like auto &binary_input,
4706
    detail::output_span_of_byte_like auto &&output,
4707
    base64_options options = base64_default,
4708
    last_chunk_handling_options last_chunk_options =
4709
        last_chunk_handling_options::loose,
4710
    bool decode_up_to_bad_char = false) noexcept {
4711
  // Starts as the capacity of the output span; the callee updates it in place.
  size_t outlen = output.size();
4712
  auto ret = atomic_base64_to_binary_safe(
4713
      reinterpret_cast<const char *>(binary_input.data()), binary_input.size(),
4714
      reinterpret_cast<char *>(output.data()), outlen, options,
4715
      last_chunk_options, decode_up_to_bad_char);
4716
  return {ret, outlen};
4717
}
4718
/**
4719
 * @brief span overload
 *
 * Wraps the pointer-based atomic_base64_to_binary_safe taking char16_t input.
 * The capacity handed over as outlen is binary_output.size(), and the output
 * span is viewed as char storage for the call.
 *
 * @param base64_input  the input to decode, as a span of char16_t
 * @param binary_output the destination span of byte-like elements
 * @param options       forwarded unchanged to the pointer-based overload
 * @param last_chunk_options forwarded unchanged to the pointer-based overload
 * @param decode_up_to_bad_char forwarded unchanged to the pointer-based
 * overload
4720
 * @return a tuple of result and outlen
 * (outlen as left by the pointer-based overload, which takes it by reference)
4721
 */
4722
simdutf_warn_unused std::tuple<result, std::size_t>
4723
atomic_base64_to_binary_safe(
4724
    std::span<const char16_t> base64_input,
4725
    detail::output_span_of_byte_like auto &&binary_output,
4726
    base64_options options = base64_default,
4727
    last_chunk_handling_options last_chunk_options = loose,
4728
    bool decode_up_to_bad_char = false) noexcept {
4729
  // Starts as the capacity of the output span; the callee updates it in place.
  size_t outlen = binary_output.size();
4730
  auto ret = atomic_base64_to_binary_safe(
4731
      base64_input.data(), base64_input.size(),
4732
      reinterpret_cast<char *>(binary_output.data()), outlen, options,
4733
      last_chunk_options, decode_up_to_bad_char);
4734
  return {ret, outlen};
4735
}
4736
    #endif // SIMDUTF_SPAN
4737
  #endif   // SIMDUTF_ATOMIC_REF
4738
4739
#endif // SIMDUTF_FEATURE_BASE64
4740
4741
/**
4742
 * An implementation of simdutf for a particular CPU architecture.
4743
 *
4744
 * Also used to maintain the currently active implementation. The active
4745
 * implementation is automatically initialized on first use to the most advanced
4746
 * implementation supported by the host.
4747
 */
4748
class implementation {
4749
public:
4750
  /**
4751
   * The name of this implementation.
4752
   *
4753
   *     const implementation *impl = simdutf::active_implementation;
4754
   *     cout << "simdutf is optimized for " << impl->name() << "(" <<
4755
   * impl->description() << ")" << endl;
4756
   *
4757
   * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
4758
   */
4759
  virtual std::string name() const { return std::string(_name); }
4760
4761
  /**
4762
   * The description of this implementation.
4763
   *
4764
   *     const implementation *impl = simdutf::active_implementation;
4765
   *     cout << "simdutf is optimized for " << impl->name() << "(" <<
4766
   * impl->description() << ")" << endl;
4767
   *
4768
   * @return the description of the implementation
4769
   */
4770
  virtual std::string description() const { return std::string(_description); }
4771
4772
  /**
4773
   * The instruction sets this implementation is compiled against
4774
   * and the current CPU match. This function may poll the current CPU/system
4775
   * and should therefore not be called too often if performance is a concern.
4776
   *
4777
   *
4778
   * @return true if the implementation can be safely used on the current system
4779
   * (determined at runtime)
4780
   */
4781
  bool supported_by_runtime_system() const;
4782
4783
#if SIMDUTF_FEATURE_DETECT_ENCODING
4784
  /**
4785
   * This function will try to detect the encoding
4786
   * @param input the string to identify
4787
   * @param length the length of the string in bytes.
4788
   * @return the encoding type detected
4789
   */
4790
  virtual encoding_type autodetect_encoding(const char *input,
4791
                                            size_t length) const noexcept;
4792
4793
  /**
4794
   * This function will try to detect the possible encodings in one pass
4795
   * @param input the string to identify
4796
   * @param length the length of the string in bytes.
4797
   * @return an int describing the possible encodings detected
4798
   */
4799
  virtual int detect_encodings(const char *input,
4800
                               size_t length) const noexcept = 0;
4801
#endif // SIMDUTF_FEATURE_DETECT_ENCODING
4802
4803
  /**
4804
   * @private For internal implementation use
4805
   *
4806
   * The instruction sets this implementation is compiled against.
4807
   *
4808
   * @return a mask of all required `internal::instruction_set::` values
4809
   */
4810
  virtual uint32_t required_instruction_sets() const {
4811
    return _required_instruction_sets;
4812
  }
4813
4814
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
4815
  /**
4816
   * Validate the UTF-8 string.
4817
   *
4818
   * Overridden by each implementation.
4819
   *
4820
   * @param buf the UTF-8 string to validate.
4821
   * @param len the length of the string in bytes.
4822
   * @return true if and only if the string is valid UTF-8.
4823
   */
4824
  simdutf_warn_unused virtual bool validate_utf8(const char *buf,
4825
                                                 size_t len) const noexcept = 0;
4826
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
4827
4828
#if SIMDUTF_FEATURE_UTF8
4829
  /**
4830
   * Validate the UTF-8 string and stop on errors.
4831
   *
4832
   * Overridden by each implementation.
4833
   *
4834
   * @param buf the UTF-8 string to validate.
4835
   * @param len the length of the string in bytes.
4836
   * @return a result pair struct (of type simdutf::result containing the two
4837
   * fields error and count) with an error code and either position of the error
4838
   * (in the input in code units) if any, or the number of code units validated
4839
   * if successful.
4840
   */
4841
  simdutf_warn_unused virtual result
4842
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0;
4843
#endif // SIMDUTF_FEATURE_UTF8
4844
4845
#if SIMDUTF_FEATURE_ASCII
4846
  /**
4847
   * Validate the ASCII string.
4848
   *
4849
   * Overridden by each implementation.
4850
   *
4851
   * @param buf the ASCII string to validate.
4852
   * @param len the length of the string in bytes.
4853
   * @return true if and only if the string is valid ASCII.
4854
   */
4855
  simdutf_warn_unused virtual bool
4856
  validate_ascii(const char *buf, size_t len) const noexcept = 0;
4857
4858
  /**
4859
   * Validate the ASCII string and stop on error.
4860
   *
4861
   * Overridden by each implementation.
4862
   *
4863
   * @param buf the ASCII string to validate.
4864
   * @param len the length of the string in bytes.
4865
   * @return a result pair struct (of type simdutf::result containing the two
4866
   * fields error and count) with an error code and either position of the error
4867
   * (in the input in code units) if any, or the number of code units validated
4868
   * if successful.
4869
   */
4870
  simdutf_warn_unused virtual result
4871
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0;
4872
4873
#endif // SIMDUTF_FEATURE_ASCII
4874
4875
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
4876
  /**
4877
   * Validate the ASCII string as a UTF-16BE sequence.
4878
   * A UTF-16 sequence is considered an ASCII sequence
4879
   * if it could be converted to an ASCII string losslessly.
4880
   *
4881
   * Overridden by each implementation.
4882
   *
4883
   * @param buf the UTF-16BE string to validate.
4884
   * @param len the length of the string in 2-byte code units (char16_t).
4885
   * @return true if and only if the string is valid ASCII.
4886
   */
4887
  simdutf_warn_unused virtual bool
4888
  validate_utf16be_as_ascii(const char16_t *buf, size_t len) const noexcept = 0;
4889
4890
  /**
4891
   * Validate the ASCII string as a UTF-16LE sequence.
4892
   * A UTF-16 sequence is considered an ASCII sequence
4893
   * if it could be converted to an ASCII string losslessly.
4894
   *
4895
   * Overridden by each implementation.
4896
   *
4897
   * @param buf the UTF-16LE string to validate.
4898
   * @param len the length of the string in 2-byte code units (char16_t).
4899
   * @return true if and only if the string is valid ASCII.
4900
   */
4901
  simdutf_warn_unused virtual bool
4902
  validate_utf16le_as_ascii(const char16_t *buf, size_t len) const noexcept = 0;
4903
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
4904
4905
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
4906
  /**
4907
   * Validate the UTF-16LE string. This function may be best when you expect
4908
   * the input to be almost always valid. Otherwise, consider using
4909
   * validate_utf16le_with_errors.
4910
   *
4911
   * Overridden by each implementation.
4912
   *
4913
   * This function is not BOM-aware.
4914
   *
4915
   * @param buf the UTF-16LE string to validate.
4916
   * @param len the length of the string in number of 2-byte code units
4917
   * (char16_t).
4918
   * @return true if and only if the string is valid UTF-16LE.
4919
   */
4920
  simdutf_warn_unused virtual bool
4921
  validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0;
4922
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
4923
4924
#if SIMDUTF_FEATURE_UTF16
4925
  /**
4926
   * Validate the UTF-16BE string. This function may be best when you expect
4927
   * the input to be almost always valid. Otherwise, consider using
4928
   * validate_utf16be_with_errors.
4929
   *
4930
   * Overridden by each implementation.
4931
   *
4932
   * This function is not BOM-aware.
4933
   *
4934
   * @param buf the UTF-16BE string to validate.
4935
   * @param len the length of the string in number of 2-byte code units
4936
   * (char16_t).
4937
   * @return true if and only if the string is valid UTF-16BE.
4938
   */
4939
  simdutf_warn_unused virtual bool
4940
  validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0;
4941
4942
  /**
4943
   * Validate the UTF-16LE string and stop on error.  It might be faster than
4944
   * validate_utf16le when an error is expected to occur early.
4945
   *
4946
   * Overridden by each implementation.
4947
   *
4948
   * This function is not BOM-aware.
4949
   *
4950
   * @param buf the UTF-16LE string to validate.
4951
   * @param len the length of the string in number of 2-byte code units
4952
   * (char16_t).
4953
   * @return a result pair struct (of type simdutf::result containing the two
4954
   * fields error and count) with an error code and either position of the error
4955
   * (in the input in code units) if any, or the number of code units validated
4956
   * if successful.
4957
   */
4958
  simdutf_warn_unused virtual result
4959
  validate_utf16le_with_errors(const char16_t *buf,
4960
                               size_t len) const noexcept = 0;
4961
4962
  /**
4963
   * Validate the UTF-16BE string and stop on error. It might be faster than
4964
   * validate_utf16be when an error is expected to occur early.
4965
   *
4966
   * Overridden by each implementation.
4967
   *
4968
   * This function is not BOM-aware.
4969
   *
4970
   * @param buf the UTF-16BE string to validate.
4971
   * @param len the length of the string in number of 2-byte code units
4972
   * (char16_t).
4973
   * @return a result pair struct (of type simdutf::result containing the two
4974
   * fields error and count) with an error code and either position of the error
4975
   * (in the input in code units) if any, or the number of code units validated
4976
   * if successful.
4977
   */
4978
  simdutf_warn_unused virtual result
4979
  validate_utf16be_with_errors(const char16_t *buf,
4980
                               size_t len) const noexcept = 0;
4981
  /**
4982
   * Copies the UTF-16LE string while replacing mismatched surrogates with the
4983
   * Unicode replacement character U+FFFD. We allow the input and output to be
4984
   * the same buffer so that the correction is done in-place.
4985
   *
4986
   * Overridden by each implementation.
4987
   *
4988
   * @param input the UTF-16LE string to correct.
4989
   * @param len the length of the string in number of 2-byte code units
4990
   * (char16_t).
4991
   * @param output the output buffer.
4992
   */
4993
  virtual void to_well_formed_utf16le(const char16_t *input, size_t len,
4994
                                      char16_t *output) const noexcept = 0;
4995
  /**
4996
   * Copies the UTF-16BE string while replacing mismatched surrogates with the
4997
   * Unicode replacement character U+FFFD. We allow the input and output to be
4998
   * the same buffer so that the correction is done in-place.
4999
   *
5000
   * Overridden by each implementation.
5001
   *
5002
   * @param input the UTF-16BE string to correct.
5003
   * @param len the length of the string in number of 2-byte code units
5004
   * (char16_t).
5005
   * @param output the output buffer.
5006
   */
5007
  virtual void to_well_formed_utf16be(const char16_t *input, size_t len,
5008
                                      char16_t *output) const noexcept = 0;
5009
#endif // SIMDUTF_FEATURE_UTF16
5010
5011
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
5012
  /**
5013
   * Validate the UTF-32 string.
5014
   *
5015
   * Overridden by each implementation.
5016
   *
5017
   * This function is not BOM-aware.
5018
   *
5019
   * @param buf the UTF-32 string to validate.
5020
   * @param len the length of the string in number of 4-byte code units
5021
   * (char32_t).
5022
   * @return true if and only if the string is valid UTF-32.
5023
   */
5024
  simdutf_warn_unused virtual bool
5025
  validate_utf32(const char32_t *buf, size_t len) const noexcept = 0;
5026
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
5027
5028
#if SIMDUTF_FEATURE_UTF32
5029
  /**
5030
   * Validate the UTF-32 string and stop on error.
5031
   *
5032
   * Overridden by each implementation.
5033
   *
5034
   * This function is not BOM-aware.
5035
   *
5036
   * @param buf the UTF-32 string to validate.
5037
   * @param len the length of the string in number of 4-byte code units
5038
   * (char32_t).
5039
   * @return a result pair struct (of type simdutf::result containing the two
5040
   * fields error and count) with an error code and either position of the error
5041
   * (in the input in code units) if any, or the number of code units validated
5042
   * if successful.
5043
   */
5044
  simdutf_warn_unused virtual result
5045
  validate_utf32_with_errors(const char32_t *buf,
5046
                             size_t len) const noexcept = 0;
5047
#endif // SIMDUTF_FEATURE_UTF32
5048
5049
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5050
  /**
5051
   * Convert Latin1 string into UTF-8 string.
5052
   *
5053
   * This function is suitable to work with inputs from untrusted sources.
5054
   *
5055
   * @param input         the Latin1 string to convert
5056
   * @param length        the length of the string in bytes
5057
   * @param utf8_output  the pointer to buffer that can hold conversion result
5058
   * @return the number of written char; 0 if conversion is not possible
5059
   */
5060
  simdutf_warn_unused virtual size_t
5061
  convert_latin1_to_utf8(const char *input, size_t length,
5062
                         char *utf8_output) const noexcept = 0;
5063
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5064
5065
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5066
  /**
5067
   * Convert Latin1 string into UTF-16LE string.
5068
   *
5069
   * This function is suitable to work with inputs from untrusted sources.
5070
   *
5071
   * @param input         the Latin1 string to convert
5072
   * @param length        the length of the string in bytes
5073
   * @param utf16_output  the pointer to buffer that can hold conversion result
5074
   * @return the number of written char16_t; 0 if conversion is not possible
5075
   */
5076
  simdutf_warn_unused virtual size_t
5077
  convert_latin1_to_utf16le(const char *input, size_t length,
5078
                            char16_t *utf16_output) const noexcept = 0;
5079
5080
  /**
5081
   * Convert Latin1 string into UTF-16BE string.
5082
   *
5083
   * This function is suitable to work with inputs from untrusted sources.
5084
   *
5085
   * @param input         the Latin1 string to convert
5086
   * @param length        the length of the string in bytes
5087
   * @param utf16_output  the pointer to buffer that can hold conversion result
5088
   * @return the number of written char16_t; 0 if conversion is not possible
5089
   */
5090
  simdutf_warn_unused virtual size_t
5091
  convert_latin1_to_utf16be(const char *input, size_t length,
5092
                            char16_t *utf16_output) const noexcept = 0;
5093
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5094
5095
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5096
  /**
5097
   * Convert Latin1 string into UTF-32 string.
5098
   *
5099
   * This function is suitable to work with inputs from untrusted sources.
5100
   *
5101
   * @param input         the Latin1 string to convert
5102
   * @param length        the length of the string in bytes
5103
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
5104
   * @return the number of written char32_t; 0 if conversion is not possible
5105
   */
5106
  simdutf_warn_unused virtual size_t
5107
  convert_latin1_to_utf32(const char *input, size_t length,
5108
                          char32_t *utf32_buffer) const noexcept = 0;
5109
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5110
5111
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5112
  /**
5113
   * Convert possibly broken UTF-8 string into latin1 string.
5114
   *
5115
   * During the conversion also validation of the input string is done.
5116
   * This function is suitable to work with inputs from untrusted sources.
5117
   *
5118
   * @param input         the UTF-8 string to convert
5119
   * @param length        the length of the string in bytes
5120
   * @param latin1_output  the pointer to buffer that can hold conversion result
5121
   * @return the number of written char; 0 if the input was not valid UTF-8
5122
   * string or if it cannot be represented as Latin1
5123
   */
5124
  simdutf_warn_unused virtual size_t
5125
  convert_utf8_to_latin1(const char *input, size_t length,
5126
                         char *latin1_output) const noexcept = 0;
5127
5128
  /**
5129
   * Convert possibly broken UTF-8 string into latin1 string with errors.
5130
   * If the string cannot be represented as Latin1, an error
5131
   * code is returned.
5132
   *
5133
   * During the conversion also validation of the input string is done.
5134
   * This function is suitable to work with inputs from untrusted sources.
5135
   *
5136
   * @param input         the UTF-8 string to convert
5137
   * @param length        the length of the string in bytes
5138
   * @param latin1_output  the pointer to buffer that can hold conversion result
5139
   * @return a result pair struct (of type simdutf::result containing the two
5140
   * fields error and count) with an error code and either position of the error
5141
   * (in the input in code units) if any, or the number of code units validated
5142
   * if successful.
5143
   */
5144
  simdutf_warn_unused virtual result
5145
  convert_utf8_to_latin1_with_errors(const char *input, size_t length,
5146
                                     char *latin1_output) const noexcept = 0;
5147
5148
  /**
5149
   * Convert valid UTF-8 string into latin1 string.
5150
   *
5151
   * This function assumes that the input string is valid UTF-8 and that it can
5152
   * be represented as Latin1. If you violate this assumption, the result is
5153
   * implementation defined and may include system-dependent behavior such as
5154
   * crashes.
5155
   *
5156
   * This function is for expert users only and not part of our public API. Use
5157
   * convert_utf8_to_latin1 instead.
5158
   *
5159
   * This function is not BOM-aware.
5160
   *
5161
   * @param input         the UTF-8 string to convert
5162
   * @param length        the length of the string in bytes
5163
   * @param latin1_output  the pointer to buffer that can hold conversion result
5164
   * @return the number of written char; 0 if the input was not valid UTF-8
5165
   * string
5166
   */
5167
  simdutf_warn_unused virtual size_t
5168
  convert_valid_utf8_to_latin1(const char *input, size_t length,
5169
                               char *latin1_output) const noexcept = 0;
5170
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5171
5172
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5173
  /**
5174
   * Convert possibly broken UTF-8 string into UTF-16LE string.
5175
   *
5176
   * During the conversion also validation of the input string is done.
5177
   * This function is suitable to work with inputs from untrusted sources.
5178
   *
5179
   * @param input         the UTF-8 string to convert
5180
   * @param length        the length of the string in bytes
5181
   * @param utf16_output  the pointer to buffer that can hold conversion result
5182
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
5183
   * string
5184
   */
5185
  simdutf_warn_unused virtual size_t
5186
  convert_utf8_to_utf16le(const char *input, size_t length,
5187
                          char16_t *utf16_output) const noexcept = 0;
5188
5189
  /**
5190
   * Convert possibly broken UTF-8 string into UTF-16BE string.
5191
   *
5192
   * During the conversion also validation of the input string is done.
5193
   * This function is suitable to work with inputs from untrusted sources.
5194
   *
5195
   * @param input         the UTF-8 string to convert
5196
   * @param length        the length of the string in bytes
5197
   * @param utf16_output  the pointer to buffer that can hold conversion result
5198
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
5199
   * string
5200
   */
5201
  simdutf_warn_unused virtual size_t
5202
  convert_utf8_to_utf16be(const char *input, size_t length,
5203
                          char16_t *utf16_output) const noexcept = 0;
5204
5205
  /**
5206
   * Convert possibly broken UTF-8 string into UTF-16LE string and stop on
5207
   * error.
5208
   *
5209
   * During the conversion also validation of the input string is done.
5210
   * This function is suitable to work with inputs from untrusted sources.
5211
   *
5212
   * @param input         the UTF-8 string to convert
5213
   * @param length        the length of the string in bytes
5214
   * @param utf16_output  the pointer to buffer that can hold conversion result
5215
   * @return a result pair struct (of type simdutf::result containing the two
5216
   * fields error and count) with an error code and either position of the error
5217
   * (in the input in code units) if any, or the number of char16_t written
5218
   * if successful.
5219
   */
5220
  simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(
5221
      const char *input, size_t length,
5222
      char16_t *utf16_output) const noexcept = 0;
5223
5224
  /**
5225
   * Convert possibly broken UTF-8 string into UTF-16BE string and stop on
5226
   * error.
5227
   *
5228
   * During the conversion also validation of the input string is done.
5229
   * This function is suitable to work with inputs from untrusted sources.
5230
   *
5231
   * @param input         the UTF-8 string to convert
5232
   * @param length        the length of the string in bytes
5233
   * @param utf16_output  the pointer to buffer that can hold conversion result
5234
   * @return a result pair struct (of type simdutf::result containing the two
5235
   * fields error and count) with an error code and either position of the error
5236
   * (in the input in code units) if any, or the number of char16_t written
5237
   * if successful.
5238
   */
5239
  simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(
5240
      const char *input, size_t length,
5241
      char16_t *utf16_output) const noexcept = 0;
5242
  /**
5243
   * Compute the number of bytes that this UTF-16LE string would require in
5244
   * UTF-8 format even when the UTF-16LE content contains mismatched
5245
   * surrogates that have to be replaced by the replacement character (0xFFFD).
5246
   *
5247
   * @param input         the UTF-16LE string to convert
5248
   * @param length        the length of the string in 2-byte code units
5249
   * (char16_t)
5250
   * @return a result pair struct (of type simdutf::result containing the two
5251
   * fields error and count) where the count is the number of bytes required to
5252
   * encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS
5253
   * or SURROGATE. The count is correct regardless of the error field.
5254
   * When SURROGATE is returned, it does not indicate an error in the case of
5255
   * this function: it indicates that at least one surrogate has been
5256
   * encountered: the surrogates may be matched or not (thus this function does
5257
   * not validate). If the returned error code is SUCCESS, then the input
5258
   * contains no surrogate, is in the Basic Multilingual Plane, and is
5259
   * necessarily valid.
5260
   */
5261
  virtual simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(
5262
      const char16_t *input, size_t length) const noexcept = 0;
5263
5264
  /**
5265
   * Compute the number of bytes that this UTF-16BE string would require in
5266
   * UTF-8 format even when the UTF-16BE content contains mismatched
5267
   * surrogates that have to be replaced by the replacement character (0xFFFD).
5268
   *
5269
   * @param input         the UTF-16BE string to convert
5270
   * @param length        the length of the string in 2-byte code units
5271
   * (char16_t)
5272
   * @return a result pair struct (of type simdutf::result containing the two
5273
   * fields error and count) where the count is the number of bytes required to
5274
   * encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS
5275
   * or SURROGATE. The count is correct regardless of the error field.
5276
   * When SURROGATE is returned, it does not indicate an error in the case of
5277
   * this function: it indicates that at least one surrogate has been
5278
   * encountered: the surrogates may be matched or not (thus this function does
5279
   * not validate). If the returned error code is SUCCESS, then the input
5280
   * contains no surrogate, is in the Basic Multilingual Plane, and is
5281
   * necessarily valid.
5282
   */
5283
  virtual simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(
5284
      const char16_t *input, size_t length) const noexcept = 0;
5285
5286
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5287
5288
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5289
  /**
5290
   * Convert possibly broken UTF-8 string into UTF-32 string.
5291
   *
5292
   * During the conversion also validation of the input string is done.
5293
   * This function is suitable to work with inputs from untrusted sources.
5294
   *
5295
   * @param input         the UTF-8 string to convert
5296
   * @param length        the length of the string in bytes
5297
   * @param utf32_output  the pointer to buffer that can hold conversion result
5298
   * @return the number of written char32_t; 0 if the input was not valid UTF-8
5299
   * string
5300
   */
5301
  simdutf_warn_unused virtual size_t
5302
  convert_utf8_to_utf32(const char *input, size_t length,
5303
                        char32_t *utf32_output) const noexcept = 0;
5304
5305
  /**
5306
   * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
5307
   *
5308
   * During the conversion also validation of the input string is done.
5309
   * This function is suitable to work with inputs from untrusted sources.
5310
   *
5311
   * @param input         the UTF-8 string to convert
5312
   * @param length        the length of the string in bytes
5313
   * @param utf32_output  the pointer to buffer that can hold conversion result
5314
   * @return a result pair struct (of type simdutf::result containing the two
5315
   * fields error and count) with an error code and either position of the error
5316
   * (in the input in code units) if any, or the number of char32_t written if
5317
   * successful.
5318
   */
5319
  simdutf_warn_unused virtual result
5320
  convert_utf8_to_utf32_with_errors(const char *input, size_t length,
5321
                                    char32_t *utf32_output) const noexcept = 0;
5322
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5323
5324
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5325
  /**
5326
   * Convert valid UTF-8 string into UTF-16LE string.
5327
   *
5328
   * This function assumes that the input string is valid UTF-8.
5329
   *
5330
   * @param input         the UTF-8 string to convert
5331
   * @param length        the length of the string in bytes
5332
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
5333
   * @return the number of written char16_t
5334
   */
5335
  simdutf_warn_unused virtual size_t
5336
  convert_valid_utf8_to_utf16le(const char *input, size_t length,
5337
                                char16_t *utf16_buffer) const noexcept = 0;
5338
5339
  /**
5340
   * Convert valid UTF-8 string into UTF-16BE string.
5341
   *
5342
   * This function assumes that the input string is valid UTF-8.
5343
   *
5344
   * @param input         the UTF-8 string to convert
5345
   * @param length        the length of the string in bytes
5346
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
5347
   * @return the number of written char16_t
5348
   */
5349
  simdutf_warn_unused virtual size_t
5350
  convert_valid_utf8_to_utf16be(const char *input, size_t length,
5351
                                char16_t *utf16_buffer) const noexcept = 0;
5352
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5353
5354
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5355
  /**
5356
   * Convert valid UTF-8 string into UTF-32 string.
5357
   *
5358
   * This function assumes that the input string is valid UTF-8.
5359
   *
5360
   * @param input         the UTF-8 string to convert
5361
   * @param length        the length of the string in bytes
5362
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
5363
   * @return the number of written char32_t
5364
   */
5365
  simdutf_warn_unused virtual size_t
5366
  convert_valid_utf8_to_utf32(const char *input, size_t length,
5367
                              char32_t *utf32_buffer) const noexcept = 0;
5368
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5369
5370
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5371
  /**
5372
   * Compute the number of 2-byte code units that this UTF-8 string would
5373
   * require in UTF-16LE format.
5374
   *
5375
   * This function does not validate the input. It is acceptable to pass invalid
5376
   * UTF-8 strings but in such cases the result is implementation defined.
5377
   *
5378
   * @param input         the UTF-8 string to process
5379
   * @param length        the length of the string in bytes
5380
   * @return the number of char16_t code units required to encode the UTF-8
5381
   * string as UTF-16LE
5382
   */
5383
  simdutf_warn_unused virtual size_t
5384
  utf16_length_from_utf8(const char *input, size_t length) const noexcept = 0;
5385
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5386
5387
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5388
  /**
5389
   * Compute the number of 4-byte code units that this UTF-8 string would
5390
   * require in UTF-32 format.
5391
   *
5392
   * This function is equivalent to count_utf8. It is acceptable to pass invalid
5393
   * UTF-8 strings but in such cases the result is implementation defined.
5394
   *
5395
   * This function does not validate the input.
5396
   *
5397
   * @param input         the UTF-8 string to process
5398
   * @param length        the length of the string in bytes
5399
   * @return the number of char32_t code units required to encode the UTF-8
5400
   * string as UTF-32
5401
   */
5402
  simdutf_warn_unused virtual size_t
5403
  utf32_length_from_utf8(const char *input, size_t length) const noexcept = 0;
5404
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5405
5406
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5407
  /**
5408
   * Convert possibly broken UTF-16LE string into Latin1 string.
5409
   *
5410
   * During the conversion also validation of the input string is done.
5411
   * This function is suitable to work with inputs from untrusted sources.
5412
   *
5413
   * This function is not BOM-aware.
5414
   *
5415
   * @param input         the UTF-16LE string to convert
5416
   * @param length        the length of the string in 2-byte code units
5417
   * (char16_t)
5418
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5419
   * result
5420
   * @return number of written code units; 0 if input is not a valid UTF-16LE
5421
   * string or if it cannot be represented as Latin1
5422
   */
5423
  simdutf_warn_unused virtual size_t
5424
  convert_utf16le_to_latin1(const char16_t *input, size_t length,
5425
                            char *latin1_buffer) const noexcept = 0;
5426
5427
  /**
5428
   * Convert possibly broken UTF-16BE string into Latin1 string.
5429
   *
5430
   * During the conversion also validation of the input string is done.
5431
   * This function is suitable to work with inputs from untrusted sources.
5432
   *
5433
   * This function is not BOM-aware.
5434
   *
5435
   * @param input         the UTF-16BE string to convert
5436
   * @param length        the length of the string in 2-byte code units
5437
   * (char16_t)
5438
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5439
   * result
5440
   * @return number of written code units; 0 if input is not a valid UTF-16BE
5441
   * string or if it cannot be represented as Latin1
5442
   */
5443
  simdutf_warn_unused virtual size_t
5444
  convert_utf16be_to_latin1(const char16_t *input, size_t length,
5445
                            char *latin1_buffer) const noexcept = 0;
5446
5447
  /**
5448
   * Convert possibly broken UTF-16LE string into Latin1 string.
5449
   * If the string cannot be represented as Latin1, an error
5450
   * is returned.
5451
   *
5452
   * During the conversion also validation of the input string is done.
5453
   * This function is suitable to work with inputs from untrusted sources.
5454
   * This function is not BOM-aware.
5455
   *
5456
   * @param input         the UTF-16LE string to convert
5457
   * @param length        the length of the string in 2-byte code units
5458
   * (char16_t)
5459
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5460
   * result
5461
   * @return a result pair struct (of type simdutf::result containing the two
5462
   * fields error and count) with an error code and either position of the error
5463
   * (in the input in code units) if any, or the number of char written if
5464
   * successful.
5465
   */
5466
  simdutf_warn_unused virtual result
5467
  convert_utf16le_to_latin1_with_errors(const char16_t *input, size_t length,
5468
                                        char *latin1_buffer) const noexcept = 0;
5469
5470
  /**
5471
   * Convert possibly broken UTF-16BE string into Latin1 string.
5472
   * If the string cannot be represented as Latin1, an error
5473
   * is returned.
5474
   *
5475
   * During the conversion also validation of the input string is done.
5476
   * This function is suitable to work with inputs from untrusted sources.
5477
   * This function is not BOM-aware.
5478
   *
5479
   * @param input         the UTF-16BE string to convert
5480
   * @param length        the length of the string in 2-byte code units
5481
   * (char16_t)
5482
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5483
   * result
5484
   * @return a result pair struct (of type simdutf::result containing the two
5485
   * fields error and count) with an error code and either position of the error
5486
   * (in the input in code units) if any, or the number of char written if
5487
   * successful.
5488
   */
5489
  simdutf_warn_unused virtual result
5490
  convert_utf16be_to_latin1_with_errors(const char16_t *input, size_t length,
5491
                                        char *latin1_buffer) const noexcept = 0;
5492
5493
  /**
5494
   * Convert valid UTF-16LE string into Latin1 string.
5495
   *
5496
   * This function assumes that the input string is valid UTF-16LE and that it
5497
   * can be represented as Latin1. If you violate this assumption, the result is
5498
   * implementation defined and may include system-dependent behavior such as
5499
   * crashes.
5500
   *
5501
   * This function is for expert users only and not part of our public API. Use
5502
   * convert_utf16le_to_latin1 instead.
5503
   *
5504
   * This function is not BOM-aware.
5505
   *
5506
   * @param input         the UTF-16LE string to convert
5507
   * @param length        the length of the string in 2-byte code units
5508
   * (char16_t)
5509
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5510
   * result
5511
   * @return number of written code units; 0 if conversion is not possible
5512
   */
5513
  simdutf_warn_unused virtual size_t
5514
  convert_valid_utf16le_to_latin1(const char16_t *input, size_t length,
5515
                                  char *latin1_buffer) const noexcept = 0;
5516
5517
  /**
5518
   * Convert valid UTF-16BE string into Latin1 string.
5519
   *
5520
   * This function assumes that the input string is valid UTF-16BE and that it
5521
   * can be represented as Latin1. If you violate this assumption, the result is
5522
   * implementation defined and may include system-dependent behavior such as
5523
   * crashes.
5524
   *
5525
   * This function is for expert users only and not part of our public API. Use
5526
   * convert_utf16be_to_latin1 instead.
5527
   *
5528
   * This function is not BOM-aware.
5529
   *
5530
   * @param input         the UTF-16BE string to convert
5531
   * @param length        the length of the string in 2-byte code units
5532
   * (char16_t)
5533
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5534
   * result
5535
   * @return number of written code units; 0 if conversion is not possible
5536
   */
5537
  simdutf_warn_unused virtual size_t
5538
  convert_valid_utf16be_to_latin1(const char16_t *input, size_t length,
5539
                                  char *latin1_buffer) const noexcept = 0;
5540
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5541
5542
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5543
  /**
5544
   * Convert possibly broken UTF-16LE string into UTF-8 string.
5545
   *
5546
   * During the conversion also validation of the input string is done.
5547
   * This function is suitable to work with inputs from untrusted sources.
5548
   *
5549
   * This function is not BOM-aware.
5550
   *
5551
   * @param input         the UTF-16LE string to convert
5552
   * @param length        the length of the string in 2-byte code units
5553
   * (char16_t)
5554
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5555
   * @return number of written code units; 0 if input is not a valid UTF-16LE
5556
   * string
5557
   */
5558
  simdutf_warn_unused virtual size_t
5559
  convert_utf16le_to_utf8(const char16_t *input, size_t length,
5560
                          char *utf8_buffer) const noexcept = 0;
5561
5562
  /**
5563
   * Convert possibly broken UTF-16BE string into UTF-8 string.
5564
   *
5565
   * During the conversion also validation of the input string is done.
5566
   * This function is suitable to work with inputs from untrusted sources.
5567
   *
5568
   * This function is not BOM-aware.
5569
   *
5570
   * @param input         the UTF-16BE string to convert
5571
   * @param length        the length of the string in 2-byte code units
5572
   * (char16_t)
5573
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5574
   * @return number of written code units; 0 if input is not a valid UTF-16BE
5575
   * string
5576
   */
5577
  simdutf_warn_unused virtual size_t
5578
  convert_utf16be_to_utf8(const char16_t *input, size_t length,
5579
                          char *utf8_buffer) const noexcept = 0;
5580
5581
  /**
5582
   * Convert possibly broken UTF-16LE string into UTF-8 string and stop on
5583
   * error.
5584
   *
5585
   * During the conversion also validation of the input string is done.
5586
   * This function is suitable to work with inputs from untrusted sources.
5587
   *
5588
   * This function is not BOM-aware.
5589
   *
5590
   * @param input         the UTF-16LE string to convert
5591
   * @param length        the length of the string in 2-byte code units
5592
   * (char16_t)
5593
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5594
   * @return a result pair struct (of type simdutf::result containing the two
5595
   * fields error and count) with an error code and either position of the error
5596
   * (in the input in code units) if any, or the number of char written if
5597
   * successful.
5598
   */
5599
  simdutf_warn_unused virtual result
5600
  convert_utf16le_to_utf8_with_errors(const char16_t *input, size_t length,
5601
                                      char *utf8_buffer) const noexcept = 0;
5602
5603
  /**
5604
   * Convert possibly broken UTF-16BE string into UTF-8 string and stop on
5605
   * error.
5606
   *
5607
   * During the conversion also validation of the input string is done.
5608
   * This function is suitable to work with inputs from untrusted sources.
5609
   *
5610
   * This function is not BOM-aware.
5611
   *
5612
   * @param input         the UTF-16BE string to convert
5613
   * @param length        the length of the string in 2-byte code units
5614
   * (char16_t)
5615
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5616
   * @return a result pair struct (of type simdutf::result containing the two
5617
   * fields error and count) with an error code and either position of the error
5618
   * (in the input in code units) if any, or the number of char written if
5619
   * successful.
5620
   */
5621
  simdutf_warn_unused virtual result
5622
  convert_utf16be_to_utf8_with_errors(const char16_t *input, size_t length,
5623
                                      char *utf8_buffer) const noexcept = 0;
5624
5625
  /**
5626
   * Convert valid UTF-16LE string into UTF-8 string.
5627
   *
5628
   * This function assumes that the input string is valid UTF-16LE.
5629
   *
5630
   * This function is not BOM-aware.
5631
   *
5632
   * @param input         the UTF-16LE string to convert
5633
   * @param length        the length of the string in 2-byte code units
5634
   * (char16_t)
5635
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
5636
   * result
5637
   * @return number of written code units; 0 if conversion is not possible
5638
   */
5639
  simdutf_warn_unused virtual size_t
5640
  convert_valid_utf16le_to_utf8(const char16_t *input, size_t length,
5641
                                char *utf8_buffer) const noexcept = 0;
5642
5643
  /**
5644
   * Convert valid UTF-16BE string into UTF-8 string.
5645
   *
5646
   * This function assumes that the input string is valid UTF-16BE.
5647
   *
5648
   * This function is not BOM-aware.
5649
   *
5650
   * @param input         the UTF-16BE string to convert
5651
   * @param length        the length of the string in 2-byte code units
5652
   * (char16_t)
5653
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
5654
   * result
5655
   * @return number of written code units; 0 if conversion is not possible
5656
   */
5657
  simdutf_warn_unused virtual size_t
5658
  convert_valid_utf16be_to_utf8(const char16_t *input, size_t length,
5659
                                char *utf8_buffer) const noexcept = 0;
5660
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5661
5662
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5663
  /**
5664
   * Convert possibly broken UTF-16LE string into UTF-32 string.
5665
   *
5666
   * During the conversion also validation of the input string is done.
5667
   * This function is suitable to work with inputs from untrusted sources.
5668
   *
5669
   * This function is not BOM-aware.
5670
   *
5671
   * @param input         the UTF-16LE string to convert
5672
   * @param length        the length of the string in 2-byte code units
5673
   * (char16_t)
5674
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
5675
   * @return number of written code units; 0 if input is not a valid UTF-16LE
5676
   * string
5677
   */
5678
  simdutf_warn_unused virtual size_t
5679
  convert_utf16le_to_utf32(const char16_t *input, size_t length,
5680
                           char32_t *utf32_buffer) const noexcept = 0;
5681
5682
  /**
5683
   * Convert possibly broken UTF-16BE string into UTF-32 string.
5684
   *
5685
   * During the conversion also validation of the input string is done.
5686
   * This function is suitable to work with inputs from untrusted sources.
5687
   *
5688
   * This function is not BOM-aware.
5689
   *
5690
   * @param input         the UTF-16BE string to convert
5691
   * @param length        the length of the string in 2-byte code units
5692
   * (char16_t)
5693
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
5694
   * @return number of written code units; 0 if input is not a valid UTF-16BE
5695
   * string
5696
   */
5697
  simdutf_warn_unused virtual size_t
5698
  convert_utf16be_to_utf32(const char16_t *input, size_t length,
5699
                           char32_t *utf32_buffer) const noexcept = 0;
5700
5701
  /**
5702
   * Convert possibly broken UTF-16LE string into UTF-32 string and stop on
5703
   * error.
5704
   *
5705
   * During the conversion also validation of the input string is done.
5706
   * This function is suitable to work with inputs from untrusted sources.
5707
   *
5708
   * This function is not BOM-aware.
5709
   *
5710
   * @param input         the UTF-16LE string to convert
5711
   * @param length        the length of the string in 2-byte code units
5712
   * (char16_t)
5713
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
5714
   * @return a result pair struct (of type simdutf::result containing the two
5715
   * fields error and count) with an error code and either position of the error
5716
   * (in the input in code units) if any, or the number of char32_t written if
5717
   * successful.
5718
   */
5719
  simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(
5720
      const char16_t *input, size_t length,
5721
      char32_t *utf32_buffer) const noexcept = 0;
5722
5723
  /**
5724
   * Convert possibly broken UTF-16BE string into UTF-32 string and stop on
5725
   * error.
5726
   *
5727
   * During the conversion also validation of the input string is done.
5728
   * This function is suitable to work with inputs from untrusted sources.
5729
   *
5730
   * This function is not BOM-aware.
5731
   *
5732
   * @param input         the UTF-16BE string to convert
5733
   * @param length        the length of the string in 2-byte code units
5734
   * (char16_t)
5735
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
5736
   * @return a result pair struct (of type simdutf::result containing the two
5737
   * fields error and count) with an error code and either position of the error
5738
   * (in the input in code units) if any, or the number of char32_t written if
5739
   * successful.
5740
   */
5741
  simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(
5742
      const char16_t *input, size_t length,
5743
      char32_t *utf32_buffer) const noexcept = 0;
5744
5745
  /**
5746
   * Convert valid UTF-16LE string into UTF-32 string.
5747
   *
5748
   * This function assumes that the input string is valid UTF-16LE.
5749
   *
5750
   * This function is not BOM-aware.
5751
   *
5752
   * @param input         the UTF-16LE string to convert
5753
   * @param length        the length of the string in 2-byte code units
5754
   * (char16_t)
5755
   * @param utf32_buffer   the pointer to a buffer that can hold the conversion
5756
   * result
5757
   * @return number of written code units; 0 if conversion is not possible
5758
   */
5759
  simdutf_warn_unused virtual size_t
5760
  convert_valid_utf16le_to_utf32(const char16_t *input, size_t length,
5761
                                 char32_t *utf32_buffer) const noexcept = 0;
5762
5763
  /**
5764
   * Convert valid UTF-16BE string into UTF-32 string.
5765
   *
5766
   * This function assumes that the input string is valid UTF-16BE.
5767
   *
5768
   * This function is not BOM-aware.
5769
   *
5770
   * @param input         the UTF-16BE string to convert
5771
   * @param length        the length of the string in 2-byte code units
5772
   * (char16_t)
5773
   * @param utf32_buffer   the pointer to a buffer that can hold the conversion
5774
   * result
5775
   * @return number of written code units; 0 if conversion is not possible
5776
   */
5777
  simdutf_warn_unused virtual size_t
5778
  convert_valid_utf16be_to_utf32(const char16_t *input, size_t length,
5779
                                 char32_t *utf32_buffer) const noexcept = 0;
5780
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5781
5782
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5783
  /**
5784
   * Compute the number of bytes that this UTF-16LE string would require in
5785
   * UTF-8 format.
5786
   *
5787
   * This function does not validate the input. It is acceptable to pass invalid
5788
   * UTF-16 strings but in such cases the result is implementation defined.
5789
   *
5790
   * This function is not BOM-aware.
5791
   *
5792
   * @param input         the UTF-16LE string to convert
5793
   * @param length        the length of the string in 2-byte code units
5794
   * (char16_t)
5795
   * @return the number of bytes required to encode the UTF-16LE string as UTF-8
5796
   */
5797
  simdutf_warn_unused virtual size_t
5798
  utf8_length_from_utf16le(const char16_t *input,
5799
                           size_t length) const noexcept = 0;
5800
5801
  /**
5802
   * Compute the number of bytes that this UTF-16BE string would require in
5803
   * UTF-8 format.
5804
   *
5805
   * This function does not validate the input. It is acceptable to pass invalid
5806
   * UTF-16 strings but in such cases the result is implementation defined.
5807
   *
5808
   * This function is not BOM-aware.
5809
   *
5810
   * @param input         the UTF-16BE string to convert
5811
   * @param length        the length of the string in 2-byte code units
5812
   * (char16_t)
5813
   * @return the number of bytes required to encode the UTF-16BE string as UTF-8
5814
   */
5815
  simdutf_warn_unused virtual size_t
5816
  utf8_length_from_utf16be(const char16_t *input,
5817
                           size_t length) const noexcept = 0;
5818
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5819
5820
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5821
  /**
5822
   * Convert possibly broken UTF-32 string into Latin1 string.
5823
   *
5824
   * During the conversion also validation of the input string is done.
5825
   * This function is suitable to work with inputs from untrusted sources.
5826
   *
5827
   * This function is not BOM-aware.
5828
   *
5829
   * @param input         the UTF-32 string to convert
5830
   * @param length        the length of the string in 4-byte code units
5831
   * (char32_t)
5832
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5833
   * result
5834
   * @return number of written code units; 0 if input is not a valid UTF-32
5835
   * string
5836
   */
5837
  simdutf_warn_unused virtual size_t
5838
  convert_utf32_to_latin1(const char32_t *input, size_t length,
5839
                          char *latin1_buffer) const noexcept = 0;
5840
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5841
5842
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5843
  /**
5844
   * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
5845
   * If the string cannot be represented as Latin1, an error is returned.
5846
   *
5847
   * During the conversion also validation of the input string is done.
5848
   * This function is suitable to work with inputs from untrusted sources.
5849
   *
5850
   * This function is not BOM-aware.
5851
   *
5852
   * @param input         the UTF-32 string to convert
5853
   * @param length        the length of the string in 4-byte code units
5854
   * (char32_t)
5855
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5856
   * result
5857
   * @return a result pair struct (of type simdutf::result containing the two
5858
   * fields error and count) with an error code and either position of the error
5859
   * (in the input in code units) if any, or the number of char written if
5860
   * successful.
5861
   */
5862
  simdutf_warn_unused virtual result
5863
  convert_utf32_to_latin1_with_errors(const char32_t *input, size_t length,
5864
                                      char *latin1_buffer) const noexcept = 0;
5865
5866
  /**
5867
   * Convert valid UTF-32 string into Latin1 string.
5868
   *
5869
   * This function assumes that the input string is valid UTF-32 and can be
5870
   * represented as Latin1. If you violate this assumption, the result is
5871
   * implementation defined and may include system-dependent behavior such as
5872
   * crashes.
5873
   *
5874
   * This function is for expert users only and not part of our public API. Use
5875
   * convert_utf32_to_latin1 instead.
5876
   *
5877
   * This function is not BOM-aware.
5878
   *
5879
   * @param input         the UTF-32 string to convert
5880
   * @param length        the length of the string in 4-byte code units
5881
   * (char32_t)
5882
   * @param latin1_buffer   the pointer to a buffer that can hold the conversion
5883
   * result
5884
   * @return number of written code units; 0 if conversion is not possible
5885
   */
5886
  simdutf_warn_unused virtual size_t
5887
  convert_valid_utf32_to_latin1(const char32_t *input, size_t length,
5888
                                char *latin1_buffer) const noexcept = 0;
5889
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5890
5891
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5892
  /**
5893
   * Convert possibly broken UTF-32 string into UTF-8 string.
5894
   *
5895
   * During the conversion also validation of the input string is done.
5896
   * This function is suitable to work with inputs from untrusted sources.
5897
   *
5898
   * This function is not BOM-aware.
5899
   *
5900
   * @param input         the UTF-32 string to convert
5901
   * @param length        the length of the string in 4-byte code units
5902
   * (char32_t)
5903
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5904
   * @return number of written code units; 0 if input is not a valid UTF-32
5905
   * string
5906
   */
5907
  simdutf_warn_unused virtual size_t
5908
  convert_utf32_to_utf8(const char32_t *input, size_t length,
5909
                        char *utf8_buffer) const noexcept = 0;
5910
5911
  /**
5912
   * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
5913
   *
5914
   * During the conversion also validation of the input string is done.
5915
   * This function is suitable to work with inputs from untrusted sources.
5916
   *
5917
   * This function is not BOM-aware.
5918
   *
5919
   * @param input         the UTF-32 string to convert
5920
   * @param length        the length of the string in 4-byte code units
5921
   * (char32_t)
5922
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5923
   * @return a result pair struct (of type simdutf::result containing the two
5924
   * fields error and count) with an error code and either position of the error
5925
   * (in the input in code units) if any, or the number of char written if
5926
   * successful.
5927
   */
5928
  simdutf_warn_unused virtual result
5929
  convert_utf32_to_utf8_with_errors(const char32_t *input, size_t length,
5930
                                    char *utf8_buffer) const noexcept = 0;
5931
5932
  /**
5933
   * Convert valid UTF-32 string into UTF-8 string.
5934
   *
5935
   * This function assumes that the input string is valid UTF-32.
5936
   *
5937
   * This function is not BOM-aware.
5938
   *
5939
   * @param input         the UTF-32 string to convert
5940
   * @param length        the length of the string in 4-byte code units
5941
   * (char32_t)
5942
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
5943
   * result
5944
   * @return number of written code units; 0 if conversion is not possible
5945
   */
5946
  simdutf_warn_unused virtual size_t
5947
  convert_valid_utf32_to_utf8(const char32_t *input, size_t length,
5948
                              char *utf8_buffer) const noexcept = 0;
5949
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5950
5951
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5952
  /**
5953
   * Return the number of 2-byte code units (char16_t) that a Latin1 string
5954
   * would require in UTF-16 format.
5955
   *
5956
   * The default implementation returns length unchanged.
5957
   *
5958
   * @param length        the length of the Latin1 string in bytes
5959
   * @return the number of 2-byte code units (char16_t) required to encode the
5960
   * Latin1 string as UTF-16
5961
   */
5962
  simdutf_warn_unused virtual size_t
5963
  utf16_length_from_latin1(size_t length) const noexcept {
5964
    return length; // each Latin1 byte maps to exactly one UTF-16 code unit
5965
  }
5966
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5967
5968
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5969
  /**
5970
   * Convert possibly broken UTF-32 string into UTF-16LE string.
5971
   *
5972
   * During the conversion also validation of the input string is done.
5973
   * This function is suitable to work with inputs from untrusted sources.
5974
   *
5975
   * This function is not BOM-aware.
5976
   *
5977
   * @param input         the UTF-32 string to convert
5978
   * @param length        the length of the string in 4-byte code units
5979
   * (char32_t)
5980
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
5981
   * @return number of written code units; 0 if input is not a valid UTF-32
5982
   * string
5983
   */
5984
  simdutf_warn_unused virtual size_t
5985
  convert_utf32_to_utf16le(const char32_t *input, size_t length,
5986
                           char16_t *utf16_buffer) const noexcept = 0;
5987
5988
  /**
5989
   * Convert possibly broken UTF-32 string into UTF-16BE string.
5990
   *
5991
   * During the conversion also validation of the input string is done.
5992
   * This function is suitable to work with inputs from untrusted sources.
5993
   *
5994
   * This function is not BOM-aware.
5995
   *
5996
   * @param input         the UTF-32 string to convert
5997
   * @param length        the length of the string in 4-byte code units
5998
   * (char32_t)
5999
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
6000
   * @return number of written code units; 0 if input is not a valid UTF-32
6001
   * string
6002
   */
6003
  simdutf_warn_unused virtual size_t
6004
  convert_utf32_to_utf16be(const char32_t *input, size_t length,
6005
                           char16_t *utf16_buffer) const noexcept = 0;
6006
6007
  /**
6008
   * Convert possibly broken UTF-32 string into UTF-16LE string and stop on
6009
   * error.
6010
   *
6011
   * During the conversion also validation of the input string is done.
6012
   * This function is suitable to work with inputs from untrusted sources.
6013
   *
6014
   * This function is not BOM-aware.
6015
   *
6016
   * @param input         the UTF-32 string to convert
6017
   * @param length        the length of the string in 4-byte code units
6018
   * (char32_t)
6019
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
6020
   * @return a result pair struct (of type simdutf::result containing the two
6021
   * fields error and count) with an error code and either position of the error
6022
   * (in the input in code units) if any, or the number of char16_t written if
6023
   * successful.
6024
   */
6025
  simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(
6026
      const char32_t *input, size_t length,
6027
      char16_t *utf16_buffer) const noexcept = 0;
6028
6029
  /**
6030
   * Convert possibly broken UTF-32 string into UTF-16BE string and stop on
6031
   * error.
6032
   *
6033
   * During the conversion also validation of the input string is done.
6034
   * This function is suitable to work with inputs from untrusted sources.
6035
   *
6036
   * This function is not BOM-aware.
6037
   *
6038
   * @param input         the UTF-32 string to convert
6039
   * @param length        the length of the string in 4-byte code units
6040
   * (char32_t)
6041
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
6042
   * @return a result pair struct (of type simdutf::result containing the two
6043
   * fields error and count) with an error code and either position of the error
6044
   * (in the input in code units) if any, or the number of char16_t written if
6045
   * successful.
6046
   */
6047
  simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(
6048
      const char32_t *input, size_t length,
6049
      char16_t *utf16_buffer) const noexcept = 0;
6050
6051
  /**
6052
   * Convert valid UTF-32 string into UTF-16LE string.
6053
   *
6054
   * This function assumes that the input string is valid UTF-32.
6055
   *
6056
   * This function is not BOM-aware.
6057
   *
6058
   * @param input         the UTF-32 string to convert
6059
   * @param length        the length of the string in 4-byte code units
6060
   * (char32_t)
6061
   * @param utf16_buffer   the pointer to a buffer that can hold the conversion
6062
   * result
6063
   * @return number of written code units; 0 if conversion is not possible
6064
   */
6065
  simdutf_warn_unused virtual size_t
6066
  convert_valid_utf32_to_utf16le(const char32_t *input, size_t length,
6067
                                 char16_t *utf16_buffer) const noexcept = 0;
6068
6069
  /**
6070
   * Convert valid UTF-32 string into UTF-16BE string.
6071
   *
6072
   * This function assumes that the input string is valid UTF-32.
6073
   *
6074
   * This function is not BOM-aware.
6075
   *
6076
   * @param input         the UTF-32 string to convert
6077
   * @param length        the length of the string in 4-byte code units
6078
   * (char32_t)
6079
   * @param utf16_buffer   the pointer to a buffer that can hold the conversion
6080
   * result
6081
   * @return number of written code units; 0 if conversion is not possible
6082
   */
6083
  simdutf_warn_unused virtual size_t
6084
  convert_valid_utf32_to_utf16be(const char32_t *input, size_t length,
6085
                                 char16_t *utf16_buffer) const noexcept = 0;
6086
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6087
6088
#if SIMDUTF_FEATURE_UTF16
6089
  /**
6090
   * Change the endianness of the input. Can be used to go from UTF-16LE to
6091
   * UTF-16BE or from UTF-16BE to UTF-16LE.
6092
   *
6093
   * This function does not validate the input.
6094
   *
6095
   * This function is not BOM-aware.
6096
   *
6097
   * @param input         the UTF-16 string to process
6098
   * @param length        the length of the string in 2-byte code units
6099
   * (char16_t)
6100
   * @param output        the pointer to a buffer that can hold the conversion
6101
   * result
6102
   */
6103
  virtual void change_endianness_utf16(const char16_t *input, size_t length,
6104
                                       char16_t *output) const noexcept = 0;
6105
#endif // SIMDUTF_FEATURE_UTF16
6106
6107
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
6108
  /**
6109
   * Return the number of bytes that this Latin1 string would require in UTF-8
6110
   * format.
6111
   *
6112
   * @param input         the Latin1 string to convert
6113
   * @param length        the length of the string in bytes
6114
   * @return the number of bytes required to encode the Latin1 string as UTF-8
6115
   */
6116
  simdutf_warn_unused virtual size_t
6117
  utf8_length_from_latin1(const char *input, size_t length) const noexcept = 0;
6118
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
6119
6120
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
6121
  /**
6122
   * Compute the number of bytes that this UTF-32 string would require in UTF-8
6123
   * format.
6124
   *
6125
   * This function does not validate the input. It is acceptable to pass invalid
6126
   * UTF-32 strings but in such cases the result is implementation defined.
6127
   *
6128
   * @param input         the UTF-32 string to convert
6129
   * @param length        the length of the string in 4-byte code units
6130
   * (char32_t)
6131
   * @return the number of bytes required to encode the UTF-32 string as UTF-8
6132
   */
6133
  simdutf_warn_unused virtual size_t
6134
  utf8_length_from_utf32(const char32_t *input,
6135
                         size_t length) const noexcept = 0;
6136
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
6137
6138
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6139
  /**
6140
   * Compute the number of bytes that this UTF-32 string would require in Latin1
6141
   * format.
6142
   *
6143
   * This function does not validate the input. It is acceptable to pass invalid
6144
   * UTF-32 strings but in such cases the result is implementation defined.
6145
   *
6146
   * @param length        the length of the string in 4-byte code units
6147
   * (char32_t)
6148
   * @return the number of bytes required to encode the UTF-32 string as Latin1
6149
   */
6150
  simdutf_warn_unused virtual size_t
6151
  latin1_length_from_utf32(size_t length) const noexcept {
6152
    return length; // default: one Latin1 byte per 4-byte UTF-32 code unit
6153
  }
6154
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6155
6156
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
6157
  /**
6158
   * Compute the number of bytes that this UTF-8 string would require in Latin1
6159
   * format.
6160
   *
6161
   * This function does not validate the input. It is acceptable to pass invalid
6162
   * UTF-8 strings but in such cases the result is implementation defined.
6163
   *
6164
   * @param input         the UTF-8 string to convert
6165
   * @param length        the length of the string in bytes
6166
   * @return the number of bytes required to encode the UTF-8 string as Latin1
6167
   */
6168
  simdutf_warn_unused virtual size_t
6169
  latin1_length_from_utf8(const char *input, size_t length) const noexcept = 0;
6170
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
6171
6172
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
6173
  /**
6174
   * Compute the number of bytes that this UTF-16LE/BE string would require in
6175
   * Latin1 format.
6176
   *
6177
   * This function does not validate the input. It is acceptable to pass invalid
6178
   * UTF-16 strings but in such cases the result is implementation defined.
6179
   *
6180
   * This function is not BOM-aware. Only the length of the string is needed;
6181
   * the default implementation returns it unchanged.
6182
   *
6183
   * @param length        the length of the string in 2-byte code units
6184
   * (char16_t)
6185
   * @return the number of bytes required to encode the UTF-16 string as
6186
   * Latin1
6187
   */
6188
  simdutf_warn_unused virtual size_t
6189
  latin1_length_from_utf16(size_t length) const noexcept {
6190
    return length; // one Latin1 byte per 2-byte code unit
6191
  }
6192
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
6193
6194
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6195
  /**
6196
   * Compute the number of two-byte code units that this UTF-32 string would
6197
   * require in UTF-16 format.
6198
   *
6199
   * This function does not validate the input. It is acceptable to pass invalid
6200
   * UTF-32 strings but in such cases the result is implementation defined.
6201
   *
6202
   * @param input         the UTF-32 string to convert
6203
   * @param length        the length of the string in 4-byte code units
6204
   * (char32_t)
6205
   * @return the number of 2-byte code units (char16_t) required to encode the
   * UTF-32 string as UTF-16
6206
   */
6207
  simdutf_warn_unused virtual size_t
6208
  utf16_length_from_utf32(const char32_t *input,
6209
                          size_t length) const noexcept = 0;
6210
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6211
6212
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6213
  /**
6214
   * Return the number of 4-byte code units (char32_t) that a Latin1 string
6215
   * would require in UTF-32 format.
6216
   *
6217
   * @param length        the length of the Latin1 string in bytes
6218
   * @return the number of 4-byte code units (char32_t) required to encode the
6219
   * Latin1 string as UTF-32
6220
   */
6221
  simdutf_warn_unused virtual size_t
6222
  utf32_length_from_latin1(size_t length) const noexcept {
6223
    return length; // each Latin1 byte maps to exactly one UTF-32 code unit
6224
  }
6225
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6226
6227
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6228
  /**
6229
   * Compute the number of 4-byte code units (char32_t) that this UTF-16LE
6230
   * string would require in UTF-32 format.
6231
   *
6232
   * This function is equivalent to count_utf16le.
6233
   *
6234
   * This function does not validate the input. It is acceptable to pass invalid
6235
   * UTF-16 strings but in such cases the result is implementation defined.
6236
   *
6237
   * This function is not BOM-aware.
6238
   *
6239
   * @param input         the UTF-16LE string to convert
6240
   * @param length        the length of the string in 2-byte code units
6241
   * (char16_t)
6242
   * @return the number of 4-byte code units (char32_t) required to encode the
6243
   * UTF-16LE string as UTF-32
6244
   */
6245
  simdutf_warn_unused virtual size_t
6246
  utf32_length_from_utf16le(const char16_t *input,
6247
                            size_t length) const noexcept = 0;
6248
6249
  /**
6250
   * Compute the number of 4-byte code units (char32_t) that this UTF-16BE
6251
   * string would require in UTF-32 format.
6252
   *
6253
   * This function is equivalent to count_utf16be.
6254
   *
6255
   * This function does not validate the input. It is acceptable to pass invalid
6256
   * UTF-16 strings but in such cases the result is implementation defined.
6257
   *
6258
   * This function is not BOM-aware.
6259
   *
6260
   * @param input         the UTF-16BE string to convert
6261
   * @param length        the length of the string in 2-byte code units
6262
   * (char16_t)
6263
   * @return the number of 4-byte code units (char32_t) required to encode the
6264
   * UTF-16BE string as UTF-32
6265
   */
6266
  simdutf_warn_unused virtual size_t
6267
  utf32_length_from_utf16be(const char16_t *input,
6268
                            size_t length) const noexcept = 0;
6269
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6270
6271
#if SIMDUTF_FEATURE_UTF16
6272
  /**
6273
   * Count the number of code points (characters) in the string assuming that
6274
   * it is valid.
6275
   *
6276
   * This function assumes that the input string is valid UTF-16LE.
6277
   * It is acceptable to pass invalid UTF-16 strings but in such cases
6278
   * the result is implementation defined.
6279
   *
6280
   * This function is not BOM-aware.
6281
   *
6282
   * @param input         the UTF-16LE string to process
6283
   * @param length        the length of the string in 2-byte code units
6284
   * (char16_t)
6285
   * @return number of code points
6286
   */
6287
  simdutf_warn_unused virtual size_t
6288
  count_utf16le(const char16_t *input, size_t length) const noexcept = 0;
6289
6290
  /**
6291
   * Count the number of code points (characters) in the string assuming that
6292
   * it is valid.
6293
   *
6294
   * This function assumes that the input string is valid UTF-16BE.
6295
   * It is acceptable to pass invalid UTF-16 strings but in such cases
6296
   * the result is implementation defined.
6297
   *
6298
   * This function is not BOM-aware.
6299
   *
6300
   * @param input         the UTF-16BE string to process
6301
   * @param length        the length of the string in 2-byte code units
6302
   * (char16_t)
6303
   * @return number of code points
6304
   */
6305
  simdutf_warn_unused virtual size_t
6306
  count_utf16be(const char16_t *input, size_t length) const noexcept = 0;
6307
#endif // SIMDUTF_FEATURE_UTF16
6308
6309
#if SIMDUTF_FEATURE_UTF8
6310
  /**
6311
   * Count the number of code points (characters) in the string assuming that
6312
   * it is valid.
6313
   *
6314
   * This function assumes that the input string is valid UTF-8.
6315
   * It is acceptable to pass invalid UTF-8 strings but in such cases
6316
   * the result is implementation defined.
6317
   *
6318
   * @param input         the UTF-8 string to process
6319
   * @param length        the length of the string in bytes
6320
   * @return number of code points
6321
   */
6322
  simdutf_warn_unused virtual size_t
6323
  count_utf8(const char *input, size_t length) const noexcept = 0;
6324
#endif // SIMDUTF_FEATURE_UTF8
6325
6326
#if SIMDUTF_FEATURE_BASE64
6327
  /**
6328
   * Provide the maximal binary length in bytes given the base64 input.
6329
   * As long as the input does not contain ignorable characters (e.g., ASCII
6330
   * spaces or linefeed characters), the result is exact. In particular, the
6331
   * function checks for padding characters.
6332
   *
6333
   * The function is fast (constant time). It checks up to two characters at
6334
   * the end of the string. The input is not otherwise validated or read.
6335
   *
6336
   * @param input         the base64 input to process
6337
   * @param length        the length of the base64 input in bytes
6338
   * @return maximal number of binary bytes
6339
   */
6340
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
6341
      const char *input, size_t length) const noexcept;
6342
6343
  /**
6344
   * Provide the maximal binary length in bytes given the base64 input.
6345
   * As long as the input does not contain ignorable characters (e.g., ASCII
6346
   * spaces or linefeed characters), the result is exact. In particular, the
6347
   * function checks for padding characters.
6348
   *
6349
   * The function is fast (constant time). It checks up to two characters at
6350
   * the end of the string. The input is not otherwise validated or read.
6351
   *
6352
   * @param input         the base64 input to process, in ASCII stored as 16-bit
6353
   * units
6354
   * @param length        the length of the base64 input in 16-bit units
6355
   * @return maximal number of binary bytes
6356
   */
6357
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
6358
      const char16_t *input, size_t length) const noexcept;
6359
6360
  /**
6361
   * Convert a base64 input to a binary output.
6362
   *
6363
   * This function follows the WHATWG forgiving-base64 format, which means that
6364
   * it will ignore any ASCII spaces in the input. You may provide a padded
6365
   * input (with one or two equal signs at the end) or an unpadded input
6366
   * (without any equal signs at the end).
6367
   *
6368
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
6369
   *
6370
   * This function will fail in case of invalid input. When last_chunk_options =
6371
   * loose, there are two possible reasons for failure: the input contains a
6372
   * number of base64 characters that when divided by 4, leaves a single
6373
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
6374
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
6375
   *
6376
   * You should call this function with a buffer that is at least
6377
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
6378
   * provide that much space, the function may cause a buffer overflow.
6379
   *
6380
   * @param input         the base64 string to process
6381
   * @param length        the length of the string in bytes
6382
   * @param output        the pointer to a buffer that can hold the conversion
6383
   * result (should be at least maximal_binary_length_from_base64(input, length)
6384
   * bytes long).
6385
   * @param options       the base64 options to use, can be base64_default or
6386
   * base64_url, is base64_default by default.
6387
   * @return a result pair struct (of type simdutf::result containing the two
6388
   * fields error and count) with an error code and either position of the error
6389
   * (in the input in bytes) if any, or the number of bytes written if
6390
   * successful.
6391
   */
6392
  simdutf_warn_unused virtual result
6393
  base64_to_binary(const char *input, size_t length, char *output,
6394
                   base64_options options = base64_default,
6395
                   last_chunk_handling_options last_chunk_options =
6396
                       last_chunk_handling_options::loose) const noexcept = 0;
6397
6398
  /**
6399
   * Convert a base64 input to a binary output while returning more details
6400
   * than base64_to_binary.
6401
   *
6402
   * This function follows the WHATWG forgiving-base64 format, which means that
6403
   * it will ignore any ASCII spaces in the input. You may provide a padded
6404
   * input (with one or two equal signs at the end) or an unpadded input
6405
   * (without any equal signs at the end).
6406
   *
6407
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
6408
   *
6409
   * This function will fail in case of invalid input. When last_chunk_options =
6410
   * loose, there are two possible reasons for failure: the input contains a
6411
   * number of base64 characters that when divided by 4, leaves a single
6412
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
6413
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
6414
   *
6415
   * You should call this function with a buffer that is at least
6416
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
6417
   * provide that much space, the function may cause a buffer overflow.
6418
   *
6419
   * @param input         the base64 string to process
6420
   * @param length        the length of the string in bytes
6421
   * @param output        the pointer to a buffer that can hold the conversion
6422
   * result (should be at least maximal_binary_length_from_base64(input, length)
6423
   * bytes long).
6424
   * @param options       the base64 options to use, can be base64_default or
6425
   * base64_url, is base64_default by default.
6426
   * @return a full_result struct (of type simdutf::full_result containing the
6427
   * three fields error, input_count and output_count).
6428
   */
6429
  simdutf_warn_unused virtual full_result base64_to_binary_details(
6430
      const char *input, size_t length, char *output,
6431
      base64_options options = base64_default,
6432
      last_chunk_handling_options last_chunk_options =
6433
          last_chunk_handling_options::loose) const noexcept = 0;
6434
6435
  /**
6436
   * Convert a base64 input to a binary output.
6437
   *
6438
   * This function follows the WHATWG forgiving-base64 format, which means that
6439
   * it will ignore any ASCII spaces in the input. You may provide a padded
6440
   * input (with one or two equal signs at the end) or an unpadded input
6441
   * (without any equal signs at the end).
6442
   *
6443
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
6444
   *
6445
   * This function will fail in case of invalid input. When last_chunk_options =
6446
   * loose, there are two possible reasons for failure: the input contains a
6447
   * number of base64 characters that when divided by 4, leaves a single
6448
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
6449
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
6450
   *
6451
   * You should call this function with a buffer that is at least
6452
   * maximal_binary_length_from_base64(input, length) bytes long. If you
6453
   * fail to provide that much space, the function may cause a buffer overflow.
6454
   *
6455
   * @param input         the base64 string to process, in ASCII stored as
6456
   * 16-bit units
6457
   * @param length        the length of the string in 16-bit units
6458
   * @param output        the pointer to a buffer that can hold the conversion
6459
   * result (should be at least maximal_binary_length_from_base64(input, length)
6460
   * bytes long).
6461
   * @param options       the base64 options to use, can be base64_default or
6462
   * base64_url, is base64_default by default.
6463
   * @return a result pair struct (of type simdutf::result containing the two
6464
   * fields error and count) with an error code and position of the
6465
   * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the
6466
   * number of bytes written if successful.
6467
   */
6468
  simdutf_warn_unused virtual result
6469
  base64_to_binary(const char16_t *input, size_t length, char *output,
6470
                   base64_options options = base64_default,
6471
                   last_chunk_handling_options last_chunk_options =
6472
                       last_chunk_handling_options::loose) const noexcept = 0;
6473
6474
  /**
6475
   * Convert a base64 input to a binary output while returning more details
6476
   * than base64_to_binary.
6477
   *
6478
   * This function follows the WHATWG forgiving-base64 format, which means that
6479
   * it will ignore any ASCII spaces in the input. You may provide a padded
6480
   * input (with one or two equal signs at the end) or an unpadded input
6481
   * (without any equal signs at the end).
6482
   *
6483
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
6484
   *
6485
   * This function will fail in case of invalid input. When last_chunk_options =
6486
   * loose, there are two possible reasons for failure: the input contains a
6487
   * number of base64 characters that when divided by 4, leaves a single
6488
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
6489
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
6490
   *
6491
   * You should call this function with a buffer that is at least
6492
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
6493
   * provide that much space, the function may cause a buffer overflow.
6494
   *
6495
   * @param input         the base64 string to process
6496
   * @param length        the length of the string in 16-bit units
6497
   * @param output        the pointer to a buffer that can hold the conversion
6498
   * result (should be at least maximal_binary_length_from_base64(input, length)
6499
   * bytes long).
6500
   * @param options       the base64 options to use, can be base64_default or
6501
   * base64_url, is base64_default by default.
6502
   * @return a full_result struct (of type simdutf::full_result containing the
6503
   * three fields error, input_count and output_count).
6504
   */
6505
  simdutf_warn_unused virtual full_result base64_to_binary_details(
6506
      const char16_t *input, size_t length, char *output,
6507
      base64_options options = base64_default,
6508
      last_chunk_handling_options last_chunk_options =
6509
          last_chunk_handling_options::loose) const noexcept = 0;
6510
6511
  /**
6512
   * Provide the base64 length in bytes given the length of a binary input.
6513
   *
6514
   * @param length        the length of the input in bytes
6515
   * @param options       the base64 options to use, can be base64_default or
6516
   * base64_url, is base64_default by default.
6517
   * @return number of base64 bytes
6518
   */
6519
  simdutf_warn_unused size_t base64_length_from_binary(
6520
      size_t length, base64_options options = base64_default) const noexcept;
6521
6522
  /**
6523
   * Convert a binary input to a base64 output.
6524
   *
6525
   * The default option (simdutf::base64_default) uses the characters `+` and
6526
   * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of
6527
   * the output to ensure that the output length is a multiple of four.
6528
   *
6529
   * The URL option (simdutf::base64_url) uses the characters `-` and `_` as
6530
   * part of its alphabet. No padding is added at the end of the output.
6531
   *
6532
   * This function always succeeds.
6533
   *
6534
   * @param input         the binary to process
6535
   * @param length        the length of the input in bytes
6536
   * @param output        the pointer to a buffer that can hold the conversion
6537
   * result (should be at least base64_length_from_binary(length) bytes long)
6538
   * @param options       the base64 options to use, can be base64_default or
6539
   * base64_url, is base64_default by default.
6540
   * @return number of written bytes, will be equal to
6541
   * base64_length_from_binary(length, options)
6542
   */
6543
  virtual size_t
6544
  binary_to_base64(const char *input, size_t length, char *output,
6545
                   base64_options options = base64_default) const noexcept = 0;
6546
6547
  /**
6548
   * Convert a binary input to a base64 output with lines of given length.
6549
   * Lines are separated by a single linefeed character.
6550
   *
6551
   * The default option (simdutf::base64_default) uses the characters `+` and
6552
   * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of
6553
   * the output to ensure that the output length is a multiple of four.
6554
   *
6555
   * The URL option (simdutf::base64_url) uses the characters `-` and `_` as
6556
   * part of its alphabet. No padding is added at the end of the output.
6557
   *
6558
   * This function always succeeds.
6559
   *
6560
   * @param input         the binary to process
6561
   * @param length        the length of the input in bytes
6562
   * @param output        the pointer to a buffer that can hold the conversion
6563
   * result (should be at least base64_length_from_binary_with_lines(length,
6564
   * options, line_length) bytes long)
6565
   * @param line_length   the length of each line, values smaller than 4 are
6566
   * interpreted as 4
6567
   * @param options       the base64 options to use, can be base64_default or
6568
   * base64_url, is base64_default by default.
6569
   * @return number of written bytes, will be equal to
6570
   * base64_length_from_binary_with_lines(length, options, line_length)
6571
   */
6572
  virtual size_t binary_to_base64_with_lines(
6573
      const char *input, size_t length, char *output,
6574
      size_t line_length = simdutf::default_line_length,
6575
      base64_options options = base64_default) const noexcept = 0;
6576
6577
  /**
6578
   * Find the first occurrence of a character in a string. If the character is
6579
   * not found, return a pointer to the end of the string.
6580
   * @param start        the start of the string
6581
   * @param end          the end of the string
6582
   * @param character    the character to find
6583
   * @return a pointer to the first occurrence of the character in the string,
6584
   * or a pointer to the end of the string if the character is not found.
6585
   *
6586
   */
6587
  virtual const char *find(const char *start, const char *end,
6588
                           char character) const noexcept = 0;
6589
  virtual const char16_t *find(const char16_t *start, const char16_t *end,
6590
                               char16_t character) const noexcept = 0;
6591
#endif // SIMDUTF_FEATURE_BASE64
6592
6593
#ifdef SIMDUTF_INTERNAL_TESTS
6594
  // This method is exported only in developer mode, its purpose
6595
  // is to expose some internal test procedures from the given
6596
  // implementation and then use them through our standard test
6597
  // framework.
6598
  //
6599
  // Regular users should not use it, the tests of the public
6600
  // API are enough.
6601
6602
  // Pairs a human-readable test name with the function that runs the test.
  struct TestProcedure {
6603
    // display name
6604
    std::string name;
6605
6606
    // Entry point of the test. It receives the implementation under test and
    // returns nothing (the pointer type is void-returning).
    // NOTE(review): an earlier comment said the procedure "should return
    // whether given test pass or not"; how a failure is actually reported is
    // not visible from this declaration -- confirm against the test framework.
6607
    void (*procedure)(const implementation &);
6608
  };
6609
6610
  virtual std::vector<TestProcedure> internal_tests() const;
6611
#endif
6612
6613
protected:
6614
  /** @private Construct an implementation with the given name and description.
6615
   * For subclasses. */
6616
  simdutf_really_inline implementation(const char *name,
6617
                                       const char *description,
6618
                                       uint32_t required_instruction_sets)
6619
      : _name(name), _description(description),
6620
        _required_instruction_sets(required_instruction_sets) {}
6621
6622
protected:
6623
  ~implementation() = default;
6624
6625
private:
6626
  /**
6627
   * The name of this implementation.
6628
   */
6629
  const char *_name;
6630
6631
  /**
6632
   * The description of this implementation.
6633
   */
6634
  const char *_description;
6635
6636
  /**
6637
   * Instruction sets required for this implementation.
6638
   */
6639
  const uint32_t _required_instruction_sets;
6640
};
6641
6642
/** @private */
6643
namespace internal {
6644
6645
/**
6646
 * The list of available implementations compiled into simdutf.
6647
 */
6648
class available_implementation_list {
6649
public:
6650
  /** Get the list of available implementations compiled into simdutf */
6651
  simdutf_really_inline available_implementation_list() {}
6652
  /** Number of implementations */
6653
  size_t size() const noexcept;
6654
  /** STL const begin() iterator */
6655
  const implementation *const *begin() const noexcept;
6656
  /** STL const end() iterator */
6657
  const implementation *const *end() const noexcept;
6658
6659
  /**
6660
   * Get the implementation with the given name.
6661
   *
6662
   * Case sensitive.
6663
   *
6664
   *     const implementation *impl =
6665
   * simdutf::available_implementations["westmere"]; if (!impl) { exit(1); } if
6666
   * (!imp->supported_by_runtime_system()) { exit(1); }
6667
   *     simdutf::active_implementation = impl;
6668
   *
6669
   * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
6670
   * @return the implementation, or nullptr if the parse failed.
6671
   */
6672
  const implementation *operator[](const std::string &name) const noexcept {
6673
    for (const implementation *impl : *this) {
6674
      if (impl->name() == name) {
6675
        return impl;
6676
      }
6677
    }
6678
    return nullptr;
6679
  }
6680
6681
  /**
6682
   * Detect the most advanced implementation supported by the current host.
6683
   *
6684
   * This is used to initialize the implementation on startup.
6685
   *
6686
   *     const implementation *impl =
6687
   * simdutf::available_implementation::detect_best_supported();
6688
   *     simdutf::active_implementation = impl;
6689
   *
6690
   * @return the most advanced supported implementation for the current host, or
6691
   * an implementation that returns UNSUPPORTED_ARCHITECTURE if there is no
6692
   * supported implementation. Will never return nullptr.
6693
   */
6694
  const implementation *detect_best_supported() const noexcept;
6695
};
6696
6697
/**
 * Thin pointer wrapper used for the active-implementation slot.
 *
 * When SIMDUTF_NO_THREADS is defined the pointer is held in a plain `T *`;
 * otherwise it is held in a `std::atomic<T *>`, so reads go through load()
 * and assignment goes through the atomic's assignment operator.
 *
 * The public accessors are written once and share the private get() helper,
 * which is the only place where the two storage modes differ on reads.
 */
template <typename T> class atomic_ptr {
public:
  /** Wrap `_ptr`. Intentionally implicit so a raw pointer converts directly. */
  atomic_ptr(T *_ptr) : ptr{_ptr} {}

  /** Read-only access to the currently stored pointer. */
  operator const T *() const noexcept { return get(); }
  const T &operator*() const noexcept { return *get(); }
  const T *operator->() const noexcept { return get(); }

  /** Mutable access to the currently stored pointer. */
  operator T *() noexcept { return get(); }
  T &operator*() noexcept { return *get(); }
  T *operator->() noexcept { return get(); }

  /** Replace the stored pointer with `_ptr`. */
  atomic_ptr &operator=(T *_ptr) noexcept {
    // Valid for both storage modes: plain assignment for `T *`, and the
    // assignment operator of std::atomic<T *> otherwise.
    ptr = _ptr;
    return *this;
  }

private:
  /** Fetch the current pointer value from whichever storage is in use. */
  T *get() const noexcept {
#if defined(SIMDUTF_NO_THREADS)
    return ptr;
#else
    return ptr.load();
#endif
  }

#if defined(SIMDUTF_NO_THREADS)
  T *ptr;
#else
  std::atomic<T *> ptr;
#endif
};
6736
6737
class detect_best_supported_implementation_on_first_use;
6738
6739
} // namespace internal
6740
6741
/**
6742
 * The list of available implementations compiled into simdutf.
6743
 */
6744
extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list &
6745
get_available_implementations();
6746
6747
/**
6748
 * The active implementation.
6749
 *
6750
 * Automatically initialized on first use to the most advanced implementation
6751
 * supported by this hardware.
6752
 */
6753
extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> &
6754
get_active_implementation();
6755
6756
} // namespace simdutf
6757
6758
#if SIMDUTF_FEATURE_BASE64
6759
  // this header is not part of the public api
6760
  #include <simdutf/base64_implementation.h>
6761
6762
namespace simdutf {
6763
  #if SIMDUTF_SPAN
6764
/**
6765
 * @brief span overload
6766
 * @return a tuple of result and outlen
6767
 */
6768
simdutf_really_inline
6769
    simdutf_constexpr23 simdutf_warn_unused std::tuple<result, std::size_t>
6770
    base64_to_binary_safe(
6771
        const detail::input_span_of_byte_like auto &input,
6772
        detail::output_span_of_byte_like auto &&binary_output,
6773
        base64_options options = base64_default,
6774
        last_chunk_handling_options last_chunk_options = loose,
6775
        bool decode_up_to_bad_char = false) noexcept {
6776
  size_t outlen = binary_output.size();
6777
    #if SIMDUTF_CPLUSPLUS23
6778
  if consteval {
6779
    using CInput = std::decay_t<decltype(*input.data())>;
6780
    static_assert(std::is_same_v<CInput, char>,
6781
                  "sorry, the constexpr implementation is for now limited to "
6782
                  "input of type char");
6783
    using COutput = std::decay_t<decltype(*binary_output.data())>;
6784
    static_assert(std::is_same_v<COutput, char>,
6785
                  "sorry, the constexpr implementation is for now limited to "
6786
                  "output of type char");
6787
    auto r = base64_to_binary_safe_impl(
6788
        input.data(), input.size(), binary_output.data(), outlen, options,
6789
        last_chunk_options, decode_up_to_bad_char);
6790
    return {r, outlen};
6791
  } else
6792
    #endif
6793
  {
6794
    auto r = base64_to_binary_safe_impl<char>(
6795
        reinterpret_cast<const char *>(input.data()), input.size(),
6796
        reinterpret_cast<char *>(binary_output.data()), outlen, options,
6797
        last_chunk_options, decode_up_to_bad_char);
6798
    return {r, outlen};
6799
  }
6800
}
6801
6802
    #if SIMDUTF_SPAN
6803
/**
6804
 * @brief span overload
6805
 * @return a tuple of result and outlen
6806
 */
6807
simdutf_really_inline
6808
    simdutf_warn_unused simdutf_constexpr23 std::tuple<result, std::size_t>
6809
    base64_to_binary_safe(
6810
        std::span<const char16_t> input,
6811
        detail::output_span_of_byte_like auto &&binary_output,
6812
        base64_options options = base64_default,
6813
        last_chunk_handling_options last_chunk_options = loose,
6814
        bool decode_up_to_bad_char = false) noexcept {
6815
  size_t outlen = binary_output.size();
6816
      #if SIMDUTF_CPLUSPLUS23
6817
  if consteval {
6818
    auto r = base64_to_binary_safe_impl(
6819
        input.data(), input.size(), binary_output.data(), outlen, options,
6820
        last_chunk_options, decode_up_to_bad_char);
6821
    return {r, outlen};
6822
  } else
6823
      #endif
6824
  {
6825
    auto r = base64_to_binary_safe(
6826
        input.data(), input.size(),
6827
        reinterpret_cast<char *>(binary_output.data()), outlen, options,
6828
        last_chunk_options, decode_up_to_bad_char);
6829
    return {r, outlen};
6830
  }
6831
}
6832
    #endif // SIMDUTF_SPAN
6833
6834
  #endif // SIMDUTF_SPAN
6835
} // namespace simdutf
6836
6837
#endif // SIMDUTF_FEATURE_BASE64
6838
6839
#endif // SIMDUTF_IMPLEMENTATION_H