Coverage Report

Created: 2025-11-16 06:56

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/include/simdutf/implementation.h
Line
Count
Source
1
#ifndef SIMDUTF_IMPLEMENTATION_H
2
#define SIMDUTF_IMPLEMENTATION_H
3
#if !defined(SIMDUTF_NO_THREADS)
4
  #include <atomic>
5
#endif
6
#include <string>
7
#ifdef SIMDUTF_INTERNAL_TESTS
8
  #include <vector>
9
#endif
10
#include "simdutf/common_defs.h"
11
#include "simdutf/compiler_check.h"
12
#include "simdutf/encoding_types.h"
13
#include "simdutf/error.h"
14
#include "simdutf/internal/isadetection.h"
15
16
#if SIMDUTF_SPAN
17
  #include <concepts>
18
  #include <type_traits>
19
  #include <span>
20
  #include <tuple>
21
#endif
22
#if SIMDUTF_CPLUSPLUS17
23
  #include <string_view>
24
#endif
25
// The following defines are conditionally enabled/disabled during amalgamation.
26
// By default all features are enabled, regular code shouldn't check them. Only
27
// when user code really relies on a selected subset, it's good to verify these
28
// flags, like:
29
//
30
//      #if !SIMDUTF_FEATURE_UTF16
31
//      #   error("Please amalgamate simdutf with UTF-16 support")
32
//      #endif
33
//
34
#define SIMDUTF_FEATURE_DETECT_ENCODING 1
35
#define SIMDUTF_FEATURE_ASCII 1
36
#define SIMDUTF_FEATURE_LATIN1 1
37
#define SIMDUTF_FEATURE_UTF8 1
38
#define SIMDUTF_FEATURE_UTF16 1
39
#define SIMDUTF_FEATURE_UTF32 1
40
#define SIMDUTF_FEATURE_BASE64 1
41
42
namespace simdutf {
43
44
// Compile-time constant; the code that consumes it is not visible in this
// excerpt.
constexpr size_t default_line_length =
45
    76; ///< default line length for base64 encoding with lines
46
47
#if SIMDUTF_SPAN
48
/// helpers placed in namespace detail are not a part of the public API
49
namespace detail {
50
/**
51
 * matches a byte, in the many ways C++ allows. note that these
52
 * are all distinct types.
53
 */
54
template <typename T>
55
concept byte_like = std::is_same_v<T, std::byte> ||   //
56
                    std::is_same_v<T, char> ||        //
57
                    std::is_same_v<T, signed char> || //
58
                    std::is_same_v<T, unsigned char>;
59
60
/// byte_like applied to T after removing references and cv-qualifiers, so
/// that e.g. `const char &` is accepted as well.
template <typename T>
61
concept is_byte_like = byte_like<std::remove_cvref_t<T>>;
62
63
/// satisfied when T is a pointer type (as reported by std::is_pointer_v).
template <typename T>
64
concept is_pointer = std::is_pointer_v<T>;
65
66
/**
67
 * matches anything that behaves like std::span and points to character-like
68
 * data such as: std::byte, char, unsigned char, signed char, std::int8_t,
69
 * std::uint8_t
70
 *
 * checked on a const reference to T: size() must convert to std::size_t,
 * data() must return a pointer, dereferencing that pointer must yield a
 * byte-like type, and all three expressions must be noexcept.
 */
71
template <typename T>
72
concept input_span_of_byte_like = requires(const T &t) {
73
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
74
  { t.data() } noexcept -> is_pointer;
75
  { *t.data() } noexcept -> is_byte_like;
76
};
77
78
/// satisfied when T, with any reference removed, is not const-qualified.
template <typename T>
79
concept is_mutable = !std::is_const_v<std::remove_reference_t<T>>;
80
81
/**
82
 * like input_span_of_byte_like, but for an output span (intended to be
 * written to): checked on a non-const reference to T, and additionally
 * requires that dereferencing data() yields a non-const type.
83
 */
84
template <typename T>
85
concept output_span_of_byte_like = requires(T &t) {
86
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
87
  { t.data() } noexcept -> is_pointer;
88
  { *t.data() } noexcept -> is_byte_like;
89
  { *t.data() } noexcept -> is_mutable;
90
};
91
} // namespace detail
92
#endif
93
94
#if SIMDUTF_FEATURE_DETECT_ENCODING
95
/**
96
 * Autodetect the encoding of the input, a single encoding is recommended.
97
 * E.g., the function might return simdutf::encoding_type::UTF8,
98
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
99
 * simdutf::encoding_type::UTF32_LE.
100
 *
101
 * @param input the string to analyze.
102
 * @param length the length of the string in bytes.
103
 * @return the detected encoding type
104
 */
105
simdutf_warn_unused simdutf::encoding_type
106
autodetect_encoding(const char *input, size_t length) noexcept;
107
/**
 * Convenience overload for uint8_t buffers: reinterprets input as
 * const char * and forwards to the (const char *, size_t) overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
108
0
autodetect_encoding(const uint8_t *input, size_t length) noexcept {
109
0
  return autodetect_encoding(reinterpret_cast<const char *>(input), length);
110
0
}
111
  #if SIMDUTF_SPAN
112
/**
113
 * Autodetect the encoding of the input, a single encoding is recommended.
114
 * E.g., the function might return simdutf::encoding_type::UTF8,
115
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
116
 * simdutf::encoding_type::UTF32_LE.
117
 *
118
 * @param input the string to analyze. can be anything span-like that has a
119
 * data() and size() that points to character data: std::string,
120
 * std::string_view, std::vector<char>, std::span<const std::byte> etc.
121
 * @return the detected encoding type
122
 */
123
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
124
autodetect_encoding(
125
    const detail::input_span_of_byte_like auto &input) noexcept {
126
  // Forward the span's data() (viewed as const char *) and size() to the
  // (const char *, size_t) overload.
  return autodetect_encoding(reinterpret_cast<const char *>(input.data()),
127
                             input.size());
128
}
129
  #endif // SIMDUTF_SPAN
130
131
/**
132
 * Autodetect the possible encodings of the input in one pass.
133
 * E.g., if the input might be UTF-16LE or UTF-8, this function returns
134
 * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE).
135
 *
136
 * Overridden by each implementation.
137
 *
138
 * @param input the string to analyze.
139
 * @param length the length of the string in bytes.
140
 * @return the detected encoding type
141
 */
142
simdutf_warn_unused int detect_encodings(const char *input,
143
                                         size_t length) noexcept;
144
/**
 * Convenience overload for uint8_t buffers: reinterprets input as
 * const char * and forwards to the (const char *, size_t) overload.
 */
simdutf_really_inline simdutf_warn_unused int
145
0
detect_encodings(const uint8_t *input, size_t length) noexcept {
146
0
  return detect_encodings(reinterpret_cast<const char *>(input), length);
147
0
}
148
  #if SIMDUTF_SPAN
149
/**
 * Span overload: accepts any argument satisfying
 * detail::input_span_of_byte_like and forwards its data() (reinterpreted as
 * const char *) and size() to the (const char *, size_t) overload.
 */
simdutf_really_inline simdutf_warn_unused int
150
detect_encodings(const detail::input_span_of_byte_like auto &input) noexcept {
151
  return detect_encodings(reinterpret_cast<const char *>(input.data()),
152
                          input.size());
153
}
154
  #endif // SIMDUTF_SPAN
155
#endif   // SIMDUTF_FEATURE_DETECT_ENCODING
156
157
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
158
/**
159
 * Validate the UTF-8 string. This function may be best when you expect
160
 * the input to be almost always valid. Otherwise, consider using
161
 * validate_utf8_with_errors.
162
 *
163
 * Overridden by each implementation.
164
 *
165
 * @param buf the UTF-8 string to validate.
166
 * @param len the length of the string in bytes.
167
 * @return true if and only if the string is valid UTF-8.
168
 */
169
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
170
  #if SIMDUTF_SPAN
171
/**
 * Span overload: accepts any argument satisfying
 * detail::input_span_of_byte_like and forwards its data() (reinterpreted as
 * const char *) and size() to the (const char *, size_t) overload.
 */
simdutf_really_inline simdutf_warn_unused bool
172
validate_utf8(const detail::input_span_of_byte_like auto &input) noexcept {
173
  return validate_utf8(reinterpret_cast<const char *>(input.data()),
174
                       input.size());
175
}
176
  #endif // SIMDUTF_SPAN
177
#endif   // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
178
179
#if SIMDUTF_FEATURE_UTF8
180
/**
181
 * Validate the UTF-8 string and stop on error.
182
 *
183
 * Overridden by each implementation.
184
 *
185
 * @param buf the UTF-8 string to validate.
186
 * @param len the length of the string in bytes.
187
 * @return a result pair struct (of type simdutf::result containing the two
188
 * fields error and count) with an error code and either position of the error
189
 * (in the input in code units) if any, or the number of code units validated if
190
 * successful.
191
 */
192
simdutf_warn_unused result validate_utf8_with_errors(const char *buf,
193
                                                     size_t len) noexcept;
194
  #if SIMDUTF_SPAN
195
/**
 * Span overload: accepts any argument satisfying
 * detail::input_span_of_byte_like and forwards its data() (reinterpreted as
 * const char *) and size() to the (const char *, size_t) overload, returning
 * that overload's result unchanged.
 */
simdutf_really_inline simdutf_warn_unused result validate_utf8_with_errors(
195
    const detail::input_span_of_byte_like auto &input) noexcept {
196
  return validate_utf8_with_errors(reinterpret_cast<const char *>(input.data()),
197
                                   input.size());
198
}
200
  #endif // SIMDUTF_SPAN
201
#endif   // SIMDUTF_FEATURE_UTF8
202
203
#if SIMDUTF_FEATURE_ASCII
204
/**
205
 * Validate the ASCII string.
206
 *
207
 * Overridden by each implementation.
208
 *
209
 * @param buf the ASCII string to validate.
210
 * @param len the length of the string in bytes.
211
 * @return true if and only if the string is valid ASCII.
212
 */
213
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept;
214
  #if SIMDUTF_SPAN
215
/**
 * Span overload: accepts any argument satisfying
 * detail::input_span_of_byte_like and forwards its data() (reinterpreted as
 * const char *) and size() to the (const char *, size_t) overload.
 */
simdutf_really_inline simdutf_warn_unused bool
215
validate_ascii(const detail::input_span_of_byte_like auto &input) noexcept {
216
  return validate_ascii(reinterpret_cast<const char *>(input.data()),
217
                        input.size());
218
}
220
  #endif // SIMDUTF_SPAN
221
222
/**
223
 * Validate the ASCII string and stop on error. It might be faster than
224
 * validate_utf8 when an error is expected to occur early.
225
 *
226
 * Overridden by each implementation.
227
 *
228
 * @param buf the ASCII string to validate.
229
 * @param len the length of the string in bytes.
230
 * @return a result pair struct (of type simdutf::result containing the two
231
 * fields error and count) with an error code and either position of the error
232
 * (in the input in code units) if any, or the number of code units validated if
233
 * successful.
234
 */
235
simdutf_warn_unused result validate_ascii_with_errors(const char *buf,
236
                                                      size_t len) noexcept;
237
  #if SIMDUTF_SPAN
238
/**
 * Span overload: accepts any argument satisfying
 * detail::input_span_of_byte_like and forwards its data() (reinterpreted as
 * const char *) and size() to the (const char *, size_t) overload, returning
 * that overload's result unchanged.
 */
simdutf_really_inline simdutf_warn_unused result validate_ascii_with_errors(
238
    const detail::input_span_of_byte_like auto &input) noexcept {
239
  return validate_ascii_with_errors(
240
      reinterpret_cast<const char *>(input.data()), input.size());
241
}
243
  #endif // SIMDUTF_SPAN
244
#endif   // SIMDUTF_FEATURE_ASCII
245
246
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
247
/**
248
 * Validate the ASCII string as a UTF-16 sequence.
249
 * An UTF-16 sequence is considered an ASCII sequence
250
 * if it could be converted to an ASCII string losslessly.
251
 *
252
 * Overridden by each implementation.
253
 *
254
 * @param buf the UTF-16 string to validate.
255
 * @param len the length of the string in 2-byte code units (char16_t).
256
 * @return true if and only if the string is valid ASCII.
257
 */
258
simdutf_warn_unused bool validate_utf16_as_ascii(const char16_t *buf,
259
                                                 size_t len) noexcept;
260
  #if SIMDUTF_SPAN
261
/**
 * Span overload: forwards input.data() and input.size() (a count of char16_t
 * elements) to the (const char16_t *, size_t) overload.
 */
simdutf_really_inline simdutf_warn_unused bool
262
0
validate_utf16_as_ascii(std::span<const char16_t> input) noexcept {
263
0
  return validate_utf16_as_ascii(input.data(), input.size());
264
0
}
265
  #endif // SIMDUTF_SPAN
266
267
/**
268
 * Validate the ASCII string as a UTF-16BE sequence.
269
 * An UTF-16 sequence is considered an ASCII sequence
270
 * if it could be converted to an ASCII string losslessly.
271
 *
272
 * Overridden by each implementation.
273
 *
274
 * @param buf the UTF-16BE string to validate.
275
 * @param len the length of the string in 2-byte code units (char16_t).
276
 * @return true if and only if the string is valid ASCII.
277
 */
278
simdutf_warn_unused bool validate_utf16be_as_ascii(const char16_t *buf,
279
                                                   size_t len) noexcept;
280
  #if SIMDUTF_SPAN
281
/**
 * Span overload: forwards input.data() and input.size() (a count of char16_t
 * elements) to the (const char16_t *, size_t) overload.
 */
simdutf_really_inline simdutf_warn_unused bool
282
0
validate_utf16be_as_ascii(std::span<const char16_t> input) noexcept {
283
0
  return validate_utf16be_as_ascii(input.data(), input.size());
284
0
}
285
  #endif // SIMDUTF_SPAN
286
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
287
288
/**
289
 * Validate the ASCII string as a UTF-16LE sequence.
290
 * An UTF-16 sequence is considered an ASCII sequence
291
 * if it could be converted to an ASCII string losslessly.
292
 *
293
 * Overridden by each implementation.
294
 *
295
 * @param buf the UTF-16LE string to validate.
296
 * @param len the length of the string in 2-byte code units (char16_t).
297
 * @return true if and only if the string is valid ASCII.
298
 */
299
simdutf_warn_unused bool validate_utf16le_as_ascii(const char16_t *buf,
300
                                                   size_t len) noexcept;
301
#if SIMDUTF_SPAN
302
/**
 * Span overload: forwards input.data() and input.size() (a count of char16_t
 * elements) to the (const char16_t *, size_t) overload.
 *
 * NOTE(review): unlike validate_utf16_as_ascii and validate_utf16be_as_ascii,
 * this function and its pointer overload are declared after the #endif of
 * the `SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII` guard - confirm that
 * leaving them unguarded is intentional.
 */
simdutf_really_inline simdutf_warn_unused bool
303
0
validate_utf16le_as_ascii(std::span<const char16_t> input) noexcept {
304
0
  return validate_utf16le_as_ascii(input.data(), input.size());
305
0
}
306
#endif // SIMDUTF_SPAN
307
308
#if SIMDUTF_FEATURE_UTF16
309
/**
310
 * Using native endianness; Validate the UTF-16 string.
311
 * This function may be best when you expect the input to be almost always
312
 * valid. Otherwise, consider using validate_utf16_with_errors.
313
 *
314
 * Overridden by each implementation.
315
 *
316
 * This function is not BOM-aware.
317
 *
318
 * @param buf the UTF-16 string to validate.
319
 * @param len the length of the string in number of 2-byte code units
320
 * (char16_t).
321
 * @return true if and only if the string is valid UTF-16.
322
 */
323
simdutf_warn_unused bool validate_utf16(const char16_t *buf,
324
                                        size_t len) noexcept;
325
  #if SIMDUTF_SPAN
326
/**
 * Span overload: forwards input.data() and input.size() (a count of char16_t
 * elements) to the (const char16_t *, size_t) overload.
 */
simdutf_really_inline simdutf_warn_unused bool
327
0
validate_utf16(std::span<const char16_t> input) noexcept {
328
0
  return validate_utf16(input.data(), input.size());
329
0
}
330
  #endif // SIMDUTF_SPAN
331
#endif   // SIMDUTF_FEATURE_UTF16
332
333
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
334
/**
335
 * Validate the UTF-16LE string. This function may be best when you expect
336
 * the input to be almost always valid. Otherwise, consider using
337
 * validate_utf16le_with_errors.
338
 *
339
 * Overridden by each implementation.
340
 *
341
 * This function is not BOM-aware.
342
 *
343
 * @param buf the UTF-16LE string to validate.
344
 * @param len the length of the string in number of 2-byte code units
345
 * (char16_t).
346
 * @return true if and only if the string is valid UTF-16LE.
347
 */
348
simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
349
                                          size_t len) noexcept;
350
  #if SIMDUTF_SPAN
351
/**
 * Span overload: forwards input.data() and input.size() (a count of char16_t
 * elements) to the (const char16_t *, size_t) overload.
 */
simdutf_really_inline simdutf_warn_unused bool
352
0
validate_utf16le(std::span<const char16_t> input) noexcept {
353
0
  return validate_utf16le(input.data(), input.size());
354
0
}
355
  #endif // SIMDUTF_SPAN
356
#endif   // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
357
358
#if SIMDUTF_FEATURE_UTF16
359
/**
360
 * Validate the UTF-16BE string. This function may be best when you expect
361
 * the input to be almost always valid. Otherwise, consider using
362
 * validate_utf16be_with_errors.
363
 *
364
 * Overridden by each implementation.
365
 *
366
 * This function is not BOM-aware.
367
 *
368
 * @param buf the UTF-16BE string to validate.
369
 * @param len the length of the string in number of 2-byte code units
370
 * (char16_t).
371
 * @return true if and only if the string is valid UTF-16BE.
372
 */
373
simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
374
                                          size_t len) noexcept;
375
  #if SIMDUTF_SPAN
376
/**
 * Span overload: forwards input.data() and input.size() (a count of char16_t
 * elements) to the (const char16_t *, size_t) overload.
 */
simdutf_really_inline simdutf_warn_unused bool
377
0
validate_utf16be(std::span<const char16_t> input) noexcept {
378
0
  return validate_utf16be(input.data(), input.size());
379
0
}
380
  #endif // SIMDUTF_SPAN
381
382
/**
383
 * Using native endianness; Validate the UTF-16 string and stop on error.
384
 * It might be faster than validate_utf16 when an error is expected to occur
385
 * early.
386
 *
387
 * Overridden by each implementation.
388
 *
389
 * This function is not BOM-aware.
390
 *
391
 * @param buf the UTF-16 string to validate.
392
 * @param len the length of the string in number of 2-byte code units
393
 * (char16_t).
394
 * @return a result pair struct (of type simdutf::result containing the two
395
 * fields error and count) with an error code and either position of the error
396
 * (in the input in code units) if any, or the number of code units validated if
397
 * successful.
398
 */
399
simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf,
400
                                                      size_t len) noexcept;
401
  #if SIMDUTF_SPAN
402
/**
 * Span overload: forwards input.data() and input.size() (a count of char16_t
 * elements) to the (const char16_t *, size_t) overload and returns its result
 * unchanged.
 */
simdutf_really_inline simdutf_warn_unused result
403
0
validate_utf16_with_errors(std::span<const char16_t> input) noexcept {
404
0
  return validate_utf16_with_errors(input.data(), input.size());
405
0
}
406
  #endif // SIMDUTF_SPAN
407
408
/**
409
 * Validate the UTF-16LE string and stop on error. It might be faster than
410
 * validate_utf16le when an error is expected to occur early.
411
 *
412
 * Overridden by each implementation.
413
 *
414
 * This function is not BOM-aware.
415
 *
416
 * @param buf the UTF-16LE string to validate.
417
 * @param len the length of the string in number of 2-byte code units
418
 * (char16_t).
419
 * @return a result pair struct (of type simdutf::result containing the two
420
 * fields error and count) with an error code and either position of the error
421
 * (in the input in code units) if any, or the number of code units validated if
422
 * successful.
423
 */
424
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf,
425
                                                        size_t len) noexcept;
426
  #if SIMDUTF_SPAN
427
/**
 * Span overload: forwards input.data() and input.size() (a count of char16_t
 * elements) to the (const char16_t *, size_t) overload and returns its result
 * unchanged.
 */
simdutf_really_inline simdutf_warn_unused result
428
0
validate_utf16le_with_errors(std::span<const char16_t> input) noexcept {
429
0
  return validate_utf16le_with_errors(input.data(), input.size());
430
0
}
431
  #endif // SIMDUTF_SPAN
432
433
/**
434
 * Validate the UTF-16BE string and stop on error. It might be faster than
435
 * validate_utf16be when an error is expected to occur early.
436
 *
437
 * Overridden by each implementation.
438
 *
439
 * This function is not BOM-aware.
440
 *
441
 * @param buf the UTF-16BE string to validate.
442
 * @param len the length of the string in number of 2-byte code units
443
 * (char16_t).
444
 * @return a result pair struct (of type simdutf::result containing the two
445
 * fields error and count) with an error code and either position of the error
446
 * (in the input in code units) if any, or the number of code units validated if
447
 * successful.
448
 */
449
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf,
450
                                                        size_t len) noexcept;
451
  #if SIMDUTF_SPAN
452
/**
 * Span overload: forwards input.data() and input.size() (a count of char16_t
 * elements) to the (const char16_t *, size_t) overload and returns its result
 * unchanged.
 */
simdutf_really_inline simdutf_warn_unused result
453
0
validate_utf16be_with_errors(std::span<const char16_t> input) noexcept {
454
0
  return validate_utf16be_with_errors(input.data(), input.size());
455
0
}
456
  #endif // SIMDUTF_SPAN
457
458
/**
459
 * Fixes an ill-formed UTF-16LE string by replacing mismatched surrogates with
460
 * the Unicode replacement character U+FFFD. If input and output points to
461
 * different memory areas, the procedure copies string, and it's expected that
462
 * output memory is at least as big as the input. It's also possible to set
463
 * input equal output, that makes replacements an in-place operation.
464
 *
465
 * @param input the UTF-16LE string to correct.
466
 * @param len the length of the string in number of 2-byte code units
467
 * (char16_t).
468
 * @param output the output buffer.
469
 */
470
void to_well_formed_utf16le(const char16_t *input, size_t len,
471
                            char16_t *output) noexcept;
472
  #if SIMDUTF_SPAN
473
/**
 * Span overload: forwards input.data(), input.size() and output.data() to the
 * pointer-based overload. output.size() is not checked here, so the caller
 * must still provide an output at least as large as the input.
 */
simdutf_really_inline void
474
to_well_formed_utf16le(std::span<const char16_t> input,
475
0
                       std::span<char16_t> output) noexcept {
476
0
  to_well_formed_utf16le(input.data(), input.size(), output.data());
477
0
}
478
  #endif // SIMDUTF_SPAN
479
480
/**
481
 * Fixes an ill-formed UTF-16BE string by replacing mismatched surrogates with
482
 * the Unicode replacement character U+FFFD. If input and output points to
483
 * different memory areas, the procedure copies string, and it's expected that
484
 * output memory is at least as big as the input. It's also possible to set
485
 * input equal output, that makes replacements an in-place operation.
486
 *
487
 * @param input the UTF-16BE string to correct.
488
 * @param len the length of the string in number of 2-byte code units
489
 * (char16_t).
490
 * @param output the output buffer.
491
 */
492
void to_well_formed_utf16be(const char16_t *input, size_t len,
493
                            char16_t *output) noexcept;
494
  #if SIMDUTF_SPAN
495
/**
 * Span overload: forwards input.data(), input.size() and output.data() to the
 * pointer-based overload. output.size() is not checked here, so the caller
 * must still provide an output at least as large as the input.
 */
simdutf_really_inline void
496
to_well_formed_utf16be(std::span<const char16_t> input,
497
0
                       std::span<char16_t> output) noexcept {
498
0
  to_well_formed_utf16be(input.data(), input.size(), output.data());
499
0
}
500
  #endif // SIMDUTF_SPAN
501
502
/**
503
 * Fixes an ill-formed UTF-16 string by replacing mismatched surrogates with the
504
 * Unicode replacement character U+FFFD. If input and output points to different
505
 * memory areas, the procedure copies string, and it's expected that output
506
 * memory is at least as big as the input. It's also possible to set input equal
507
 * output, that makes replacements an in-place operation.
508
 *
509
 * @param input the UTF-16 string to correct.
510
 * @param len the length of the string in number of 2-byte code units
511
 * (char16_t).
512
 * @param output the output buffer.
513
 */
514
void to_well_formed_utf16(const char16_t *input, size_t len,
515
                          char16_t *output) noexcept;
516
  #if SIMDUTF_SPAN
517
/**
 * Span overload: forwards input.data(), input.size() and output.data() to the
 * pointer-based overload. output.size() is not checked here, so the caller
 * must still provide an output at least as large as the input.
 */
simdutf_really_inline void
518
to_well_formed_utf16(std::span<const char16_t> input,
519
0
                     std::span<char16_t> output) noexcept {
520
0
  to_well_formed_utf16(input.data(), input.size(), output.data());
521
0
}
522
  #endif // SIMDUTF_SPAN
523
524
#endif // SIMDUTF_FEATURE_UTF16
525
526
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
527
/**
528
 * Validate the UTF-32 string. This function may be best when you expect
529
 * the input to be almost always valid. Otherwise, consider using
530
 * validate_utf32_with_errors.
531
 *
532
 * Overridden by each implementation.
533
 *
534
 * This function is not BOM-aware.
535
 *
536
 * @param buf the UTF-32 string to validate.
537
 * @param len the length of the string in number of 4-byte code units
538
 * (char32_t).
539
 * @return true if and only if the string is valid UTF-32.
540
 */
541
simdutf_warn_unused bool validate_utf32(const char32_t *buf,
542
                                        size_t len) noexcept;
543
  #if SIMDUTF_SPAN
544
/**
 * Span overload: forwards input.data() and input.size() (a count of char32_t
 * elements) to the (const char32_t *, size_t) overload.
 */
simdutf_really_inline simdutf_warn_unused bool
545
0
validate_utf32(std::span<const char32_t> input) noexcept {
546
0
  return validate_utf32(input.data(), input.size());
547
0
}
548
  #endif // SIMDUTF_SPAN
549
#endif   // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
550
551
#if SIMDUTF_FEATURE_UTF32
552
/**
553
 * Validate the UTF-32 string and stop on error. It might be faster than
554
 * validate_utf32 when an error is expected to occur early.
555
 *
556
 * Overridden by each implementation.
557
 *
558
 * This function is not BOM-aware.
559
 *
560
 * @param buf the UTF-32 string to validate.
561
 * @param len the length of the string in number of 4-byte code units
562
 * (char32_t).
563
 * @return a result pair struct (of type simdutf::result containing the two
564
 * fields error and count) with an error code and either position of the error
565
 * (in the input in code units) if any, or the number of code units validated if
566
 * successful.
567
 */
568
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf,
569
                                                      size_t len) noexcept;
570
  #if SIMDUTF_SPAN
571
/**
 * Span overload: forwards input.data() and input.size() (a count of char32_t
 * elements) to the (const char32_t *, size_t) overload and returns its result
 * unchanged.
 */
simdutf_really_inline simdutf_warn_unused result
572
0
validate_utf32_with_errors(std::span<const char32_t> input) noexcept {
573
0
  return validate_utf32_with_errors(input.data(), input.size());
574
0
}
575
  #endif // SIMDUTF_SPAN
576
#endif   // SIMDUTF_FEATURE_UTF32
577
578
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
579
/**
580
 * Convert Latin1 string into UTF-8 string.
581
 *
582
 * This function is suitable to work with inputs from untrusted sources.
583
 *
584
 * @param input         the Latin1 string to convert
585
 * @param length        the length of the string in bytes
586
 * @param utf8_output   the pointer to buffer that can hold conversion result
587
 * @return the number of written char; 0 if conversion is not possible
588
 */
589
simdutf_warn_unused size_t convert_latin1_to_utf8(const char *input,
590
                                                  size_t length,
591
                                                  char *utf8_output) noexcept;
592
  #if SIMDUTF_SPAN
593
/**
 * Span overload of convert_latin1_to_utf8.
 *
 * Forwards the data()/size() of latin1_input and the data() pointer of
 * utf8_output to the pointer-based overload. Both pointers are reinterpreted
 * as (const) char * so that every byte-like element type admitted by the
 * concepts (char, signed char, unsigned char, std::byte) is usable; without
 * the cast on the output pointer only char-based outputs would compile.
 *
 * utf8_output.size() is not consulted here.
 *
 * @param latin1_input  the Latin1 bytes to convert
 * @param utf8_output   the destination buffer
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf8(
594
    const detail::input_span_of_byte_like auto &latin1_input,
595
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
596
  return convert_latin1_to_utf8(
597
      reinterpret_cast<const char *>(latin1_input.data()), latin1_input.size(),
598
      reinterpret_cast<char *>(utf8_output.data()));
599
}
600
  #endif // SIMDUTF_SPAN
601
602
/**
603
 * Convert Latin1 string into UTF-8 string with output limit.
604
 *
605
 * This function is suitable to work with inputs from untrusted sources.
606
 *
607
 * We write as many characters as possible.
608
 *
609
 * @param input         the Latin1 string to convert
610
 * @param length        the length of the string in bytes
611
 * @param utf8_output   the pointer to buffer that can hold conversion result
612
 * @param utf8_len      the maximum output length
613
 * @return the number of written char; 0 if conversion is not possible
614
 */
615
simdutf_warn_unused size_t
616
convert_latin1_to_utf8_safe(const char *input, size_t length, char *utf8_output,
617
                            size_t utf8_len) noexcept;
618
  #if SIMDUTF_SPAN
619
/**
 * Span overload of convert_latin1_to_utf8_safe.
 *
 * Forwards input.data()/input.size() and utf8_output.data()/
 * utf8_output.size() to the pointer-based overload. Both data pointers are
 * reinterpreted as (const) char * so that every byte-like element type
 * admitted by the concepts (char, signed char, unsigned char, std::byte) is
 * usable; without the cast on the input pointer only char-based inputs would
 * compile.
 *
 * @param input         the Latin1 bytes to convert
 * @param utf8_output   the destination buffer; its size() is passed as the
 *                      maximum output length
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf8_safe(
620
    const detail::input_span_of_byte_like auto &input,
621
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
622
  // implementation note: outputspan is a forwarding ref to avoid copying and
623
  // allow both lvalues and rvalues. std::span can be copied without problems,
624
  // but std::vector should not, and this function should accept both. it will
625
  // allow using an owning rvalue ref (example: passing a temporary std::string)
626
  // as output, but the user will quickly find out that he has no way of getting
627
  // the data out of the object in that case.
628
  return convert_latin1_to_utf8_safe(
629
      reinterpret_cast<const char *>(input.data()), input.size(),
630
      reinterpret_cast<char *>(utf8_output.data()), utf8_output.size());
631
}
632
  #endif // SIMDUTF_SPAN
633
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
634
635
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
636
/**
637
 * Convert Latin1 string into UTF-16LE string.
638
 *
639
 * This function is suitable to work with inputs from untrusted sources.
640
 *
641
 * @param input         the Latin1 string to convert
642
 * @param length        the length of the string in bytes
643
 * @param utf16_output  the pointer to buffer that can hold conversion result
644
 * @return the number of written char16_t; 0 if conversion is not possible
645
 */
646
simdutf_warn_unused size_t convert_latin1_to_utf16le(
647
    const char *input, size_t length, char16_t *utf16_output) noexcept;
648
  #if SIMDUTF_SPAN
649
/**
 * Span overload: forwards latin1_input.data() (reinterpreted as
 * const char *), latin1_input.size() and utf16_output.data() to the
 * pointer-based overload. utf16_output.size() is not checked here.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf16le(
650
    const detail::input_span_of_byte_like auto &latin1_input,
651
    std::span<char16_t> utf16_output) noexcept {
652
  return convert_latin1_to_utf16le(
653
      reinterpret_cast<const char *>(latin1_input.data()), latin1_input.size(),
654
      utf16_output.data());
655
}
656
  #endif // SIMDUTF_SPAN
657
658
/**
659
 * Convert Latin1 string into UTF-16BE string.
660
 *
661
 * This function is suitable to work with inputs from untrusted sources.
662
 *
663
 * @param input         the Latin1 string to convert
664
 * @param length        the length of the string in bytes
665
 * @param utf16_output  the pointer to buffer that can hold conversion result
666
 * @return the number of written char16_t; 0 if conversion is not possible
667
 */
668
simdutf_warn_unused size_t convert_latin1_to_utf16be(
669
    const char *input, size_t length, char16_t *utf16_output) noexcept;
670
  #if SIMDUTF_SPAN
671
/**
 * Span overload: forwards input.data() (reinterpreted as const char *),
 * input.size() and output.data() to the pointer-based overload.
 * output.size() is not checked here.
 */
simdutf_really_inline simdutf_warn_unused size_t
672
convert_latin1_to_utf16be(const detail::input_span_of_byte_like auto &input,
673
                          std::span<char16_t> output) noexcept {
674
  return convert_latin1_to_utf16be(reinterpret_cast<const char *>(input.data()),
675
                                   input.size(), output.data());
676
}
677
  #endif // SIMDUTF_SPAN
678
/**
679
 * Compute the number of bytes that this UTF-16 string would require in Latin1
680
 * format.
681
 *
682
 * @param length        the length of the string in 2-byte code units (char16_t)
683
 * @return the length of the string in Latin1 code units (char) required to
684
 * encode the UTF-16 string as Latin1
685
 */
686
simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
687
688
/**
689
 * Compute the number of code units that this Latin1 string would require in
690
 * UTF-16 format.
691
 *
692
 * @param length        the length of the string in Latin1 code units (char)
693
 * @return the length of the string in 2-byte code units (char16_t) required to
694
 * encode the Latin1 string as UTF-16
695
 */
696
simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept;
697
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
698
699
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
700
/**
701
 * Convert Latin1 string into UTF-32 string.
702
 *
703
 * This function is suitable to work with inputs from untrusted sources.
704
 *
705
 * @param input         the Latin1 string to convert
706
 * @param length        the length of the string in bytes
707
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
708
 * @return the number of written char32_t; 0 if conversion is not possible
709
 */
710
simdutf_warn_unused size_t convert_latin1_to_utf32(
711
    const char *input, size_t length, char32_t *utf32_buffer) noexcept;
712
  #if SIMDUTF_SPAN
713
/**
 * Span overload: forwards latin1_input.data() (reinterpreted as
 * const char *), latin1_input.size() and utf32_output.data() to the
 * pointer-based overload. utf32_output.size() is not checked here.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf32(
714
    const detail::input_span_of_byte_like auto &latin1_input,
715
    std::span<char32_t> utf32_output) noexcept {
716
  return convert_latin1_to_utf32(
717
      reinterpret_cast<const char *>(latin1_input.data()), latin1_input.size(),
718
      utf32_output.data());
719
}
720
  #endif // SIMDUTF_SPAN
721
#endif   // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
722
723
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
724
/**
725
 * Convert possibly broken UTF-8 string into latin1 string.
726
 *
727
 * During the conversion also validation of the input string is done.
728
 * This function is suitable to work with inputs from untrusted sources.
729
 *
730
 * @param input         the UTF-8 string to convert
731
 * @param length        the length of the string in bytes
732
 * @param latin1_output  the pointer to buffer that can hold conversion result
733
 * @return the number of written char; 0 if the input was not valid UTF-8 string
734
 * or if it cannot be represented as Latin1
735
 */
736
simdutf_warn_unused size_t convert_utf8_to_latin1(const char *input,
737
                                                  size_t length,
738
                                                  char *latin1_output) noexcept;
739
  #if SIMDUTF_SPAN
740
/**
 * Span overload: forwards input.data() (reinterpreted as const char *),
 * input.size() and output.data() (reinterpreted as char *) to the
 * pointer-based overload. output.size() is not checked here.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf8_to_latin1(
741
    const detail::input_span_of_byte_like auto &input,
742
    detail::output_span_of_byte_like auto &&output) noexcept {
743
  return convert_utf8_to_latin1(reinterpret_cast<const char *>(input.data()),
744
                                input.size(),
745
                                reinterpret_cast<char *>(output.data()));
746
}
747
  #endif // SIMDUTF_SPAN
748
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
749
750
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
751
/**
752
 * Using native endianness, convert possibly broken UTF-8 string into a UTF-16
753
 * string.
754
 *
755
 * During the conversion also validation of the input string is done.
756
 * This function is suitable to work with inputs from untrusted sources.
757
 *
758
 * @param input         the UTF-8 string to convert
759
 * @param length        the length of the string in bytes
760
 * @param utf16_output  the pointer to buffer that can hold conversion result
761
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
762
 * string
763
 */
764
simdutf_warn_unused size_t convert_utf8_to_utf16(
765
    const char *input, size_t length, char16_t *utf16_output) noexcept;
766
  #if SIMDUTF_SPAN
767
simdutf_really_inline simdutf_warn_unused size_t
768
convert_utf8_to_utf16(const detail::input_span_of_byte_like auto &input,
769
                      std::span<char16_t> output) noexcept {
770
  return convert_utf8_to_utf16(reinterpret_cast<const char *>(input.data()),
771
                               input.size(), output.data());
772
}
773
  #endif // SIMDUTF_SPAN
774
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
775
776
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
777
/**
778
 * Using native endianness, convert a Latin1 string into a UTF-16 string.
779
 *
780
 * @param input         the Latin1 string to convert
781
 * @param length        the length of the string in bytes
782
 * @param utf16_output  the pointer to buffer that can hold conversion result
783
 * @return the number of written char16_t.
784
 */
785
simdutf_warn_unused size_t convert_latin1_to_utf16(
786
    const char *input, size_t length, char16_t *utf16_output) noexcept;
787
  #if SIMDUTF_SPAN
788
simdutf_really_inline simdutf_warn_unused size_t
789
convert_latin1_to_utf16(const detail::input_span_of_byte_like auto &input,
790
                        std::span<char16_t> output) noexcept {
791
  return convert_latin1_to_utf16(reinterpret_cast<const char *>(input.data()),
792
                                 input.size(), output.data());
793
}
794
  #endif // SIMDUTF_SPAN
795
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
796
797
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
798
/**
799
 * Convert possibly broken UTF-8 string into UTF-16LE string.
800
 *
801
 * During the conversion also validation of the input string is done.
802
 * This function is suitable to work with inputs from untrusted sources.
803
 *
804
 * @param input         the UTF-8 string to convert
805
 * @param length        the length of the string in bytes
806
 * @param utf16_output  the pointer to buffer that can hold conversion result
807
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
808
 * string
809
 */
810
simdutf_warn_unused size_t convert_utf8_to_utf16le(
811
    const char *input, size_t length, char16_t *utf16_output) noexcept;
812
  #if SIMDUTF_SPAN
813
simdutf_really_inline simdutf_warn_unused size_t
814
convert_utf8_to_utf16le(const detail::input_span_of_byte_like auto &utf8_input,
815
                        std::span<char16_t> utf16_output) noexcept {
816
  return convert_utf8_to_utf16le(
817
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
818
      utf16_output.data());
819
}
820
  #endif // SIMDUTF_SPAN
821
822
/**
823
 * Convert possibly broken UTF-8 string into UTF-16BE string.
824
 *
825
 * During the conversion also validation of the input string is done.
826
 * This function is suitable to work with inputs from untrusted sources.
827
 *
828
 * @param input         the UTF-8 string to convert
829
 * @param length        the length of the string in bytes
830
 * @param utf16_output  the pointer to buffer that can hold conversion result
831
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
832
 * string
833
 */
834
simdutf_warn_unused size_t convert_utf8_to_utf16be(
835
    const char *input, size_t length, char16_t *utf16_output) noexcept;
836
  #if SIMDUTF_SPAN
837
simdutf_really_inline simdutf_warn_unused size_t
838
convert_utf8_to_utf16be(const detail::input_span_of_byte_like auto &utf8_input,
839
                        std::span<char16_t> utf16_output) noexcept {
840
  return convert_utf8_to_utf16be(
841
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
842
      utf16_output.data());
843
}
844
  #endif // SIMDUTF_SPAN
845
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
846
847
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
848
/**
849
 * Convert possibly broken UTF-8 string into latin1 string with errors.
850
 * If the string cannot be represented as Latin1, an error
851
 * code is returned.
852
 *
853
 * During the conversion also validation of the input string is done.
854
 * This function is suitable to work with inputs from untrusted sources.
855
 *
856
 * @param input         the UTF-8 string to convert
857
 * @param length        the length of the string in bytes
858
 * @param latin1_output  the pointer to buffer that can hold conversion result
859
 * @return a result pair struct (of type simdutf::result containing the two
860
 * fields error and count) with an error code and either position of the error
861
 * (in the input in code units) if any, or the number of code units validated if
862
 * successful.
863
 */
864
simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
865
    const char *input, size_t length, char *latin1_output) noexcept;
866
  #if SIMDUTF_SPAN
867
simdutf_really_inline simdutf_warn_unused result
868
convert_utf8_to_latin1_with_errors(
869
    const detail::input_span_of_byte_like auto &utf8_input,
870
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
871
  return convert_utf8_to_latin1_with_errors(
872
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
873
      reinterpret_cast<char *>(latin1_output.data()));
874
}
875
  #endif // SIMDUTF_SPAN
876
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
877
878
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
879
/**
880
 * Using native endianness, convert possibly broken UTF-8 string into UTF-16
881
 * string and stop on error.
882
 *
883
 * During the conversion also validation of the input string is done.
884
 * This function is suitable to work with inputs from untrusted sources.
885
 *
886
 * @param input         the UTF-8 string to convert
887
 * @param length        the length of the string in bytes
888
 * @param utf16_output  the pointer to buffer that can hold conversion result
889
 * @return a result pair struct (of type simdutf::result containing the two
890
 * fields error and count) with an error code and either position of the error
891
 * (in the input in code units) if any, or the number of char16_t written if
892
 * successful.
893
 */
894
simdutf_warn_unused result convert_utf8_to_utf16_with_errors(
895
    const char *input, size_t length, char16_t *utf16_output) noexcept;
896
  #if SIMDUTF_SPAN
897
simdutf_really_inline simdutf_warn_unused result
898
convert_utf8_to_utf16_with_errors(
899
    const detail::input_span_of_byte_like auto &utf8_input,
900
    std::span<char16_t> utf16_output) noexcept {
901
  return convert_utf8_to_utf16_with_errors(
902
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
903
      utf16_output.data());
904
}
905
  #endif // SIMDUTF_SPAN
906
907
/**
908
 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
909
 *
910
 * During the conversion also validation of the input string is done.
911
 * This function is suitable to work with inputs from untrusted sources.
912
 *
913
 * @param input         the UTF-8 string to convert
914
 * @param length        the length of the string in bytes
915
 * @param utf16_output  the pointer to buffer that can hold conversion result
916
 * @return a result pair struct (of type simdutf::result containing the two
917
 * fields error and count) with an error code and either position of the error
918
 * (in the input in code units) if any, or the number of char16_t written if
919
 * successful.
920
 */
921
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
922
    const char *input, size_t length, char16_t *utf16_output) noexcept;
923
  #if SIMDUTF_SPAN
924
simdutf_really_inline simdutf_warn_unused result
925
convert_utf8_to_utf16le_with_errors(
926
    const detail::input_span_of_byte_like auto &utf8_input,
927
    std::span<char16_t> utf16_output) noexcept {
928
  return convert_utf8_to_utf16le_with_errors(
929
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
930
      utf16_output.data());
931
}
932
  #endif // SIMDUTF_SPAN
933
934
/**
935
 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
936
 *
937
 * During the conversion also validation of the input string is done.
938
 * This function is suitable to work with inputs from untrusted sources.
939
 *
940
 * @param input         the UTF-8 string to convert
941
 * @param length        the length of the string in bytes
942
 * @param utf16_output  the pointer to buffer that can hold conversion result
943
 * @return a result pair struct (of type simdutf::result containing the two
944
 * fields error and count) with an error code and either position of the error
945
 * (in the input in code units) if any, or the number of char16_t written if
946
 * successful.
947
 */
948
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
949
    const char *input, size_t length, char16_t *utf16_output) noexcept;
950
  #if SIMDUTF_SPAN
951
simdutf_really_inline simdutf_warn_unused result
952
convert_utf8_to_utf16be_with_errors(
953
    const detail::input_span_of_byte_like auto &utf8_input,
954
    std::span<char16_t> utf16_output) noexcept {
955
  return convert_utf8_to_utf16be_with_errors(
956
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
957
      utf16_output.data());
958
}
959
  #endif // SIMDUTF_SPAN
960
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
961
962
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
963
/**
964
 * Convert possibly broken UTF-8 string into UTF-32 string.
965
 *
966
 * During the conversion also validation of the input string is done.
967
 * This function is suitable to work with inputs from untrusted sources.
968
 *
969
 * @param input         the UTF-8 string to convert
970
 * @param length        the length of the string in bytes
971
 * @param utf32_output  the pointer to buffer that can hold conversion result
972
 * @return the number of written char32_t; 0 if the input was not valid UTF-8
973
 * string
974
 */
975
simdutf_warn_unused size_t convert_utf8_to_utf32(
976
    const char *input, size_t length, char32_t *utf32_output) noexcept;
977
  #if SIMDUTF_SPAN
978
simdutf_really_inline simdutf_warn_unused size_t
979
convert_utf8_to_utf32(const detail::input_span_of_byte_like auto &utf8_input,
980
                      std::span<char32_t> utf32_output) noexcept {
981
  return convert_utf8_to_utf32(
982
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
983
      utf32_output.data());
984
}
985
  #endif // SIMDUTF_SPAN
986
987
/**
988
 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
989
 *
990
 * During the conversion also validation of the input string is done.
991
 * This function is suitable to work with inputs from untrusted sources.
992
 *
993
 * @param input         the UTF-8 string to convert
994
 * @param length        the length of the string in bytes
995
 * @param utf32_output  the pointer to buffer that can hold conversion result
996
 * @return a result pair struct (of type simdutf::result containing the two
997
 * fields error and count) with an error code and either position of the error
998
 * (in the input in code units) if any, or the number of char32_t written if
999
 * successful.
1000
 */
1001
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
1002
    const char *input, size_t length, char32_t *utf32_output) noexcept;
1003
  #if SIMDUTF_SPAN
1004
simdutf_really_inline simdutf_warn_unused result
1005
convert_utf8_to_utf32_with_errors(
1006
    const detail::input_span_of_byte_like auto &utf8_input,
1007
    std::span<char32_t> utf32_output) noexcept {
1008
  return convert_utf8_to_utf32_with_errors(
1009
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1010
      utf32_output.data());
1011
}
1012
  #endif // SIMDUTF_SPAN
1013
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1014
1015
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1016
/**
1017
 * Convert valid UTF-8 string into latin1 string.
1018
 *
1019
 * This function assumes that the input string is valid UTF-8 and that it can be
1020
 * represented as Latin1. If you violate this assumption, the result is
1021
 * implementation defined and may include system-dependent behavior such as
1022
 * crashes.
1023
 *
1024
 * This function is for expert users only and not part of our public API. Use
1025
 * convert_utf8_to_latin1 instead. The function may be removed from the library
1026
 * in the future.
1027
 *
1028
 * This function is not BOM-aware.
1029
 *
1030
 * @param input         the UTF-8 string to convert
1031
 * @param length        the length of the string in bytes
1032
 * @param latin1_output  the pointer to buffer that can hold conversion result
1033
 * @return the number of written char; 0 if the input was not valid UTF-8 string
1034
 */
1035
simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
1036
    const char *input, size_t length, char *latin1_output) noexcept;
1037
  #if SIMDUTF_SPAN
1038
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
1039
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1040
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1041
  return convert_valid_utf8_to_latin1(
1042
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1043
      valid_utf8_input.size(), reinterpret_cast<char *>(latin1_output.data()));
1044
}
1045
  #endif // SIMDUTF_SPAN
1046
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1047
1048
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1049
/**
1050
 * Using native endianness, convert valid UTF-8 string into a UTF-16 string.
1051
 *
1052
 * This function assumes that the input string is valid UTF-8.
1053
 *
1054
 * @param input         the UTF-8 string to convert
1055
 * @param length        the length of the string in bytes
1056
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1057
 * @return the number of written char16_t
1058
 */
1059
simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
1060
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1061
  #if SIMDUTF_SPAN
1062
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
1063
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1064
    std::span<char16_t> utf16_output) noexcept {
1065
  return convert_valid_utf8_to_utf16(
1066
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1067
      valid_utf8_input.size(), utf16_output.data());
1068
}
1069
  #endif // SIMDUTF_SPAN
1070
1071
/**
1072
 * Convert valid UTF-8 string into UTF-16LE string.
1073
 *
1074
 * This function assumes that the input string is valid UTF-8.
1075
 *
1076
 * @param input         the UTF-8 string to convert
1077
 * @param length        the length of the string in bytes
1078
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1079
 * @return the number of written char16_t
1080
 */
1081
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
1082
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1083
  #if SIMDUTF_SPAN
1084
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
1085
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1086
    std::span<char16_t> utf16_output) noexcept {
1087
  return convert_valid_utf8_to_utf16le(
1088
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1089
      valid_utf8_input.size(), utf16_output.data());
1090
}
1091
  #endif // SIMDUTF_SPAN
1092
1093
/**
1094
 * Convert valid UTF-8 string into UTF-16BE string.
1095
 *
1096
 * This function assumes that the input string is valid UTF-8.
1097
 *
1098
 * @param input         the UTF-8 string to convert
1099
 * @param length        the length of the string in bytes
1100
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1101
 * @return the number of written char16_t
1102
 */
1103
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
1104
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1105
  #if SIMDUTF_SPAN
1106
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
1107
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1108
    std::span<char16_t> utf16_output) noexcept {
1109
  return convert_valid_utf8_to_utf16be(
1110
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1111
      valid_utf8_input.size(), utf16_output.data());
1112
}
1113
  #endif // SIMDUTF_SPAN
1114
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1115
1116
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1117
/**
1118
 * Convert valid UTF-8 string into UTF-32 string.
1119
 *
1120
 * This function assumes that the input string is valid UTF-8.
1121
 *
1122
 * @param input         the UTF-8 string to convert
1123
 * @param length        the length of the string in bytes
1124
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
1125
 * @return the number of written char32_t
1126
 */
1127
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
1128
    const char *input, size_t length, char32_t *utf32_buffer) noexcept;
1129
  #if SIMDUTF_SPAN
1130
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
1131
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1132
    std::span<char32_t> utf32_output) noexcept {
1133
  return convert_valid_utf8_to_utf32(
1134
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1135
      valid_utf8_input.size(), utf32_output.data());
1136
}
1137
  #endif // SIMDUTF_SPAN
1138
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1139
1140
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1141
/**
1142
 * Return the number of bytes that this Latin1 string would require in UTF-8
1143
 * format.
1144
 *
1145
 * @param input         the Latin1 string to convert
1146
 * @param length        the length of the string in bytes
1147
 * @return the number of bytes required to encode the Latin1 string as UTF-8
1148
 */
1149
simdutf_warn_unused size_t utf8_length_from_latin1(const char *input,
1150
                                                   size_t length) noexcept;
1151
  #if SIMDUTF_SPAN
1152
simdutf_really_inline simdutf_warn_unused size_t utf8_length_from_latin1(
1153
    const detail::input_span_of_byte_like auto &latin1_input) noexcept {
1154
  return utf8_length_from_latin1(
1155
      reinterpret_cast<const char *>(latin1_input.data()), latin1_input.size());
1156
}
1157
  #endif // SIMDUTF_SPAN
1158
1159
/**
1160
 * Compute the number of bytes that this UTF-8 string would require in Latin1
1161
 * format.
1162
 *
1163
 * This function does not validate the input. It is acceptable to pass invalid
1164
 * UTF-8 strings but in such cases the result is implementation defined.
1165
 *
1166
 * This function is not BOM-aware.
1167
 *
1168
 * @param input         the UTF-8 string to convert
1169
 * @param length        the length of the string in bytes
1170
 * @return the number of bytes required to encode the UTF-8 string as Latin1
1171
 */
1172
simdutf_warn_unused size_t latin1_length_from_utf8(const char *input,
1173
                                                   size_t length) noexcept;
1174
  #if SIMDUTF_SPAN
1175
simdutf_really_inline simdutf_warn_unused size_t latin1_length_from_utf8(
1176
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
1177
  return latin1_length_from_utf8(
1178
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1179
      valid_utf8_input.size());
1180
}
1181
  #endif // SIMDUTF_SPAN
1182
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1183
1184
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1185
/**
1186
 * Compute the number of 2-byte code units that this UTF-8 string would require
1187
 * in UTF-16LE format.
1188
 *
1189
 * This function does not validate the input. It is acceptable to pass invalid
1190
 * UTF-8 strings but in such cases the result is implementation defined.
1191
 *
1192
 * This function is not BOM-aware.
1193
 *
1194
 * @param input         the UTF-8 string to process
1195
 * @param length        the length of the string in bytes
1196
 * @return the number of char16_t code units required to encode the UTF-8 string
1197
 * as UTF-16LE
1198
 */
1199
simdutf_warn_unused size_t utf16_length_from_utf8(const char *input,
1200
                                                  size_t length) noexcept;
1201
  #if SIMDUTF_SPAN
1202
simdutf_really_inline simdutf_warn_unused size_t utf16_length_from_utf8(
1203
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
1204
  return utf16_length_from_utf8(
1205
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1206
      valid_utf8_input.size());
1207
}
1208
  #endif // SIMDUTF_SPAN
1209
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1210
1211
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1212
/**
1213
 * Compute the number of 4-byte code units that this UTF-8 string would require
1214
 * in UTF-32 format.
1215
 *
1216
 * This function is equivalent to count_utf8
1217
 *
1218
 * This function does not validate the input. It is acceptable to pass invalid
1219
 * UTF-8 strings but in such cases the result is implementation defined.
1220
 *
1221
 * This function is not BOM-aware.
1222
 *
1223
 * @param input         the UTF-8 string to process
1224
 * @param length        the length of the string in bytes
1225
 * @return the number of char32_t code units required to encode the UTF-8 string
1226
 * as UTF-32
1227
 */
1228
simdutf_warn_unused size_t utf32_length_from_utf8(const char *input,
1229
                                                  size_t length) noexcept;
1230
  #if SIMDUTF_SPAN
1231
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf8(
1232
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
1233
  return utf32_length_from_utf8(
1234
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1235
      valid_utf8_input.size());
1236
}
1237
  #endif // SIMDUTF_SPAN
1238
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1239
1240
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1241
/**
1242
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1243
 * string.
1244
 *
1245
 * During the conversion also validation of the input string is done.
1246
 * This function is suitable to work with inputs from untrusted sources.
1247
 *
1248
 * This function is not BOM-aware.
1249
 *
1250
 * @param input         the UTF-16 string to convert
1251
 * @param length        the length of the string in 2-byte code units (char16_t)
1252
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1253
 * @return number of written code units; 0 if input is not a valid UTF-16
1254
 * string
1255
 */
1256
simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *input,
1257
                                                 size_t length,
1258
                                                 char *utf8_buffer) noexcept;
1259
  #if SIMDUTF_SPAN
1260
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_utf8(
1261
    std::span<const char16_t> utf16_input,
1262
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1263
  return convert_utf16_to_utf8(utf16_input.data(), utf16_input.size(),
1264
                               reinterpret_cast<char *>(utf8_output.data()));
1265
}
1266
  #endif // SIMDUTF_SPAN
1267
1268
/**
1269
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1270
 * string with output limit.
1271
 *
1272
 * We write as many characters as possible into the output buffer.
1273
 *
1274
 * During the conversion also validation of the input string is done.
1275
 * This function is suitable to work with inputs from untrusted sources.
1276
 *
1277
 * This function is not BOM-aware.
1278
 *
1279
 *
1280
 * @param input         the UTF-16 string to convert
1281
 * @param length        the length of the string in 16-bit code units (char16_t)
1282
 * @param utf8_output   the pointer to buffer that can hold conversion result
1283
 * @param utf8_len      the maximum output length
1284
 * @return the number of written char; 0 if conversion is not possible
1285
 */
1286
simdutf_warn_unused size_t convert_utf16_to_utf8_safe(const char16_t *input,
1287
                                                      size_t length,
1288
                                                      char *utf8_output,
1289
                                                      size_t utf8_len) noexcept;
1290
  #if SIMDUTF_SPAN
1291
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_utf8_safe(
1292
    std::span<const char16_t> utf16_input,
1293
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1294
  // Implementation note: utf8_output is a forwarding reference so that both
1295
  // lvalues and rvalues are accepted without copying. A std::span is cheap to
1296
  // copy, but a std::vector is not, and this overload must accept both. An
1297
  // owning temporary (e.g. a temporary std::string) is also accepted as the
1298
  // output, but the caller then has no way of reading the converted data back
1299
  // out of that object.
1300
  return convert_utf16_to_utf8_safe(
1301
      utf16_input.data(), utf16_input.size(),
1302
      reinterpret_cast<char *>(utf8_output.data()), utf8_output.size());
1303
}
1304
  #endif // SIMDUTF_SPAN
1305
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1306
1307
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1308
/**
1309
 * Using native endianness, convert possibly broken UTF-16 string into Latin1
1310
 * string.
1311
 *
1312
 * During the conversion also validation of the input string is done.
1313
 * This function is suitable to work with inputs from untrusted sources.
1314
 *
1315
 * This function is not BOM-aware.
1316
 *
1317
 * @param input         the UTF-16 string to convert
1318
 * @param length        the length of the string in 2-byte code units (char16_t)
1319
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1320
 * @return number of written code units; 0 if input is not a valid UTF-16 string
1321
 * or if it cannot be represented as Latin1
1322
 */
1323
simdutf_warn_unused size_t convert_utf16_to_latin1(
1324
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1325
  #if SIMDUTF_SPAN
1326
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_latin1(
1327
    std::span<const char16_t> utf16_input,
1328
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1329
  return convert_utf16_to_latin1(
1330
      utf16_input.data(), utf16_input.size(),
1331
      reinterpret_cast<char *>(latin1_output.data()));
1332
}
1333
  #endif // SIMDUTF_SPAN
1334
1335
/**
1336
 * Convert possibly broken UTF-16LE string into Latin1 string.
1337
 * If the string cannot be represented as Latin1, an error
1338
 * is returned.
1339
 *
1340
 * During the conversion also validation of the input string is done.
1341
 * This function is suitable to work with inputs from untrusted sources.
1342
 *
1343
 * This function is not BOM-aware.
1344
 *
1345
 * @param input         the UTF-16LE string to convert
1346
 * @param length        the length of the string in 2-byte code units (char16_t)
1347
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1348
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1349
 * string or if it cannot be represented as Latin1
1350
 */
1351
simdutf_warn_unused size_t convert_utf16le_to_latin1(
1352
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1353
  #if SIMDUTF_SPAN
1354
simdutf_really_inline simdutf_warn_unused size_t convert_utf16le_to_latin1(
1355
    std::span<const char16_t> utf16_input,
1356
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1357
  return convert_utf16le_to_latin1(
1358
      utf16_input.data(), utf16_input.size(),
1359
      reinterpret_cast<char *>(latin1_output.data()));
1360
}
1361
  #endif // SIMDUTF_SPAN
1362
1363
/**
1364
 * Convert possibly broken UTF-16BE string into Latin1 string.
1365
 *
1366
 * During the conversion also validation of the input string is done.
1367
 * This function is suitable to work with inputs from untrusted sources.
1368
 *
1369
 * This function is not BOM-aware.
1370
 *
1371
 * @param input         the UTF-16BE string to convert
1372
 * @param length        the length of the string in 2-byte code units (char16_t)
1373
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1374
 * @return number of written code units; 0 if input is not a valid UTF-16BE
1375
 * string or if it cannot be represented as Latin1
1376
 */
1377
simdutf_warn_unused size_t convert_utf16be_to_latin1(
1378
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1379
  #if SIMDUTF_SPAN
1380
simdutf_really_inline simdutf_warn_unused size_t convert_utf16be_to_latin1(
1381
    std::span<const char16_t> utf16_input,
1382
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1383
  return convert_utf16be_to_latin1(
1384
      utf16_input.data(), utf16_input.size(),
1385
      reinterpret_cast<char *>(latin1_output.data()));
1386
}
1387
  #endif // SIMDUTF_SPAN
1388
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1389
1390
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1391
/**
1392
 * Convert possibly broken UTF-16LE string into UTF-8 string.
1393
 *
1394
 * During the conversion also validation of the input string is done.
1395
 * This function is suitable to work with inputs from untrusted sources.
1396
 *
1397
 * This function is not BOM-aware.
1398
 *
1399
 * @param input         the UTF-16LE string to convert
1400
 * @param length        the length of the string in 2-byte code units (char16_t)
1401
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1402
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1403
 * string
1404
 */
1405
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *input,
1406
                                                   size_t length,
1407
                                                   char *utf8_buffer) noexcept;
1408
  #if SIMDUTF_SPAN
1409
simdutf_really_inline simdutf_warn_unused size_t convert_utf16le_to_utf8(
1410
    std::span<const char16_t> utf16_input,
1411
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1412
  return convert_utf16le_to_utf8(utf16_input.data(), utf16_input.size(),
1413
                                 reinterpret_cast<char *>(utf8_output.data()));
1414
}
1415
  #endif // SIMDUTF_SPAN
1416
1417
/**
1418
 * Convert possibly broken UTF-16BE string into UTF-8 string.
1419
 *
1420
 * During the conversion also validation of the input string is done.
1421
 * This function is suitable to work with inputs from untrusted sources.
1422
 *
1423
 * This function is not BOM-aware.
1424
 *
1425
 * @param input         the UTF-16BE string to convert
1426
 * @param length        the length of the string in 2-byte code units (char16_t)
1427
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1428
 * @return number of written code units; 0 if input is not a valid UTF-16BE
1429
 * string
1430
 */
1431
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *input,
1432
                                                   size_t length,
1433
                                                   char *utf8_buffer) noexcept;
1434
  #if SIMDUTF_SPAN
1435
simdutf_really_inline simdutf_warn_unused size_t convert_utf16be_to_utf8(
1436
    std::span<const char16_t> utf16_input,
1437
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1438
  return convert_utf16be_to_utf8(utf16_input.data(), utf16_input.size(),
1439
                                 reinterpret_cast<char *>(utf8_output.data()));
1440
}
1441
  #endif // SIMDUTF_SPAN
1442
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1443
1444
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1445
/**
1446
 * Using native endianness, convert possibly broken UTF-16 string into Latin1
1447
 * string.
1448
 *
1449
 * During the conversion also validation of the input string is done.
1450
 * This function is suitable to work with inputs from untrusted sources.
1451
 * This function is not BOM-aware.
1452
 *
1453
 * @param input         the UTF-16 string to convert
1454
 * @param length        the length of the string in 2-byte code units (char16_t)
1455
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1456
 * @return a result pair struct (of type simdutf::result containing the two
1457
 * fields error and count) with an error code and either position of the error
1458
 * (in the input in code units) if any, or the number of char written if
1459
 * successful.
1460
 */
1461
simdutf_warn_unused result convert_utf16_to_latin1_with_errors(
1462
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1463
  #if SIMDUTF_SPAN
1464
simdutf_really_inline simdutf_warn_unused result
1465
convert_utf16_to_latin1_with_errors(
1466
    std::span<const char16_t> utf16_input,
1467
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1468
  return convert_utf16_to_latin1_with_errors(
1469
      utf16_input.data(), utf16_input.size(),
1470
      reinterpret_cast<char *>(latin1_output.data()));
1471
}
1472
  #endif // SIMDUTF_SPAN
1473
1474
/**
1475
 * Convert possibly broken UTF-16LE string into Latin1 string.
1476
 *
1477
 * During the conversion also validation of the input string is done.
1478
 * This function is suitable to work with inputs from untrusted sources.
1479
 * This function is not BOM-aware.
1480
 *
1481
 * @param input         the UTF-16LE string to convert
1482
 * @param length        the length of the string in 2-byte code units (char16_t)
1483
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1484
 * @return a result pair struct (of type simdutf::result containing the two
1485
 * fields error and count) with an error code and either position of the error
1486
 * (in the input in code units) if any, or the number of char written if
1487
 * successful.
1488
 */
1489
simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
1490
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1491
  #if SIMDUTF_SPAN
1492
simdutf_really_inline simdutf_warn_unused result
1493
convert_utf16le_to_latin1_with_errors(
1494
    std::span<const char16_t> utf16_input,
1495
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1496
  return convert_utf16le_to_latin1_with_errors(
1497
      utf16_input.data(), utf16_input.size(),
1498
      reinterpret_cast<char *>(latin1_output.data()));
1499
}
1500
  #endif // SIMDUTF_SPAN
1501
1502
/**
1503
 * Convert possibly broken UTF-16BE string into Latin1 string.
1504
 * If the string cannot be represented as Latin1, an error
1505
 * is returned.
1506
 *
1507
 * During the conversion also validation of the input string is done.
1508
 * This function is suitable to work with inputs from untrusted sources.
1509
 * This function is not BOM-aware.
1510
 *
1511
 * @param input         the UTF-16BE string to convert
1512
 * @param length        the length of the string in 2-byte code units (char16_t)
1513
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1514
 * @return a result pair struct (of type simdutf::result containing the two
1515
 * fields error and count) with an error code and either position of the error
1516
 * (in the input in code units) if any, or the number of char written if
1517
 * successful.
1518
 */
1519
simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
1520
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1521
  #if SIMDUTF_SPAN
1522
simdutf_really_inline simdutf_warn_unused result
1523
convert_utf16be_to_latin1_with_errors(
1524
    std::span<const char16_t> utf16_input,
1525
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1526
  return convert_utf16be_to_latin1_with_errors(
1527
      utf16_input.data(), utf16_input.size(),
1528
      reinterpret_cast<char *>(latin1_output.data()));
1529
}
1530
  #endif // SIMDUTF_SPAN
1531
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1532
1533
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1534
/**
1535
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1536
 * string and stop on error.
1537
 *
1538
 * During the conversion also validation of the input string is done.
1539
 * This function is suitable to work with inputs from untrusted sources.
1540
 *
1541
 * This function is not BOM-aware.
1542
 *
1543
 * @param input         the UTF-16 string to convert
1544
 * @param length        the length of the string in 2-byte code units (char16_t)
1545
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1546
 * @return a result pair struct (of type simdutf::result containing the two
1547
 * fields error and count) with an error code and either position of the error
1548
 * (in the input in code units) if any, or the number of char written if
1549
 * successful.
1550
 */
1551
simdutf_warn_unused result convert_utf16_to_utf8_with_errors(
1552
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1553
  #if SIMDUTF_SPAN
1554
simdutf_really_inline simdutf_warn_unused result
1555
convert_utf16_to_utf8_with_errors(
1556
    std::span<const char16_t> utf16_input,
1557
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1558
  return convert_utf16_to_utf8_with_errors(
1559
      utf16_input.data(), utf16_input.size(),
1560
      reinterpret_cast<char *>(utf8_output.data()));
1561
}
1562
  #endif // SIMDUTF_SPAN
1563
1564
/**
1565
 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
1566
 *
1567
 * During the conversion also validation of the input string is done.
1568
 * This function is suitable to work with inputs from untrusted sources.
1569
 *
1570
 * This function is not BOM-aware.
1571
 *
1572
 * @param input         the UTF-16LE string to convert
1573
 * @param length        the length of the string in 2-byte code units (char16_t)
1574
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1575
 * @return a result pair struct (of type simdutf::result containing the two
1576
 * fields error and count) with an error code and either position of the error
1577
 * (in the input in code units) if any, or the number of char written if
1578
 * successful.
1579
 */
1580
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
1581
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1582
  #if SIMDUTF_SPAN
1583
simdutf_really_inline simdutf_warn_unused result
1584
convert_utf16le_to_utf8_with_errors(
1585
    std::span<const char16_t> utf16_input,
1586
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1587
  return convert_utf16le_to_utf8_with_errors(
1588
      utf16_input.data(), utf16_input.size(),
1589
      reinterpret_cast<char *>(utf8_output.data()));
1590
}
1591
  #endif // SIMDUTF_SPAN
1592
1593
/**
1594
 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
1595
 *
1596
 * During the conversion also validation of the input string is done.
1597
 * This function is suitable to work with inputs from untrusted sources.
1598
 *
1599
 * This function is not BOM-aware.
1600
 *
1601
 * @param input         the UTF-16BE string to convert
1602
 * @param length        the length of the string in 2-byte code units (char16_t)
1603
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1604
 * @return a result pair struct (of type simdutf::result containing the two
1605
 * fields error and count) with an error code and either position of the error
1606
 * (in the input in code units) if any, or the number of char written if
1607
 * successful.
1608
 */
1609
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
1610
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1611
  #if SIMDUTF_SPAN
1612
simdutf_really_inline simdutf_warn_unused result
1613
convert_utf16be_to_utf8_with_errors(
1614
    std::span<const char16_t> utf16_input,
1615
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1616
  return convert_utf16be_to_utf8_with_errors(
1617
      utf16_input.data(), utf16_input.size(),
1618
      reinterpret_cast<char *>(utf8_output.data()));
1619
}
1620
  #endif // SIMDUTF_SPAN
1621
1622
/**
1623
 * Using native endianness, convert valid UTF-16 string into UTF-8 string.
1624
 *
1625
 * This function assumes that the input string is valid UTF-16.
1626
 *
1627
 * This function is not BOM-aware.
1628
 *
1629
 * @param input         the UTF-16 string to convert
1630
 * @param length        the length of the string in 2-byte code units (char16_t)
1631
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
1632
 * result
1633
 * @return number of written code units; 0 if conversion is not possible
1634
 */
1635
simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
1636
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1637
  #if SIMDUTF_SPAN
1638
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
1639
    std::span<const char16_t> valid_utf16_input,
1640
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1641
  return convert_valid_utf16_to_utf8(
1642
      valid_utf16_input.data(), valid_utf16_input.size(),
1643
      reinterpret_cast<char *>(utf8_output.data()));
1644
}
1645
  #endif // SIMDUTF_SPAN
1646
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1647
1648
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1649
/**
1650
 * Using native endianness, convert UTF-16 string into Latin1 string.
1651
 *
1652
 * This function assumes that the input string is valid UTF-16 and that it can
1653
 * be represented as Latin1. If you violate this assumption, the result is
1654
 * implementation defined and may include system-dependent behavior such as
1655
 * crashes.
1656
 *
1657
 * This function is for expert users only and not part of our public API. Use
1658
 * convert_utf16_to_latin1 instead. The function may be removed from the library
1659
 * in the future.
1660
 *
1661
 * This function is not BOM-aware.
1662
 *
1663
 * @param input         the UTF-16 string to convert
1664
 * @param length        the length of the string in 2-byte code units (char16_t)
1665
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1666
 * @return number of written code units; 0 if conversion is not possible
1667
 */
1668
simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
1669
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1670
  #if SIMDUTF_SPAN
1671
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
1672
    std::span<const char16_t> valid_utf16_input,
1673
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1674
  return convert_valid_utf16_to_latin1(
1675
      valid_utf16_input.data(), valid_utf16_input.size(),
1676
      reinterpret_cast<char *>(latin1_output.data()));
1677
}
1678
  #endif // SIMDUTF_SPAN
1679
1680
/**
1681
 * Convert valid UTF-16LE string into Latin1 string.
1682
 *
1683
 * This function assumes that the input string is valid UTF-16LE and that it can
1684
 * be represented as Latin1. If you violate this assumption, the result is
1685
 * implementation defined and may include system-dependent behavior such as
1686
 * crashes.
1687
 *
1688
 * This function is for expert users only and not part of our public API. Use
1689
 * convert_utf16le_to_latin1 instead. The function may be removed from the
1690
 * library in the future.
1691
 *
1692
 * This function is not BOM-aware.
1693
 *
1694
 * @param input         the UTF-16LE string to convert
1695
 * @param length        the length of the string in 2-byte code units (char16_t)
1696
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1697
 * @return number of written code units; 0 if conversion is not possible
1698
 */
1699
simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
1700
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1701
  #if SIMDUTF_SPAN
1702
simdutf_really_inline simdutf_warn_unused size_t
1703
convert_valid_utf16le_to_latin1(
1704
    std::span<const char16_t> valid_utf16_input,
1705
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1706
  return convert_valid_utf16le_to_latin1(
1707
      valid_utf16_input.data(), valid_utf16_input.size(),
1708
      reinterpret_cast<char *>(latin1_output.data()));
1709
}
1710
  #endif // SIMDUTF_SPAN
1711
1712
/**
1713
 * Convert valid UTF-16BE string into Latin1 string.
1714
 *
1715
 * This function assumes that the input string is valid UTF-16BE and that it can
1716
 * be represented as Latin1. If you violate this assumption, the result is
1717
 * implementation defined and may include system-dependent behavior such as
1718
 * crashes.
1719
 *
1720
 * This function is for expert users only and not part of our public API. Use
1721
 * convert_utf16be_to_latin1 instead. The function may be removed from the
1722
 * library in the future.
1723
 *
1724
 * This function is not BOM-aware.
1725
 *
1726
 * @param input         the UTF-16BE string to convert
1727
 * @param length        the length of the string in 2-byte code units (char16_t)
1728
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1729
 * @return number of written code units; 0 if conversion is not possible
1730
 */
1731
simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
1732
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1733
  #if SIMDUTF_SPAN
1734
simdutf_really_inline simdutf_warn_unused size_t
1735
convert_valid_utf16be_to_latin1(
1736
    std::span<const char16_t> valid_utf16_input,
1737
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1738
  return convert_valid_utf16be_to_latin1(
1739
      valid_utf16_input.data(), valid_utf16_input.size(),
1740
      reinterpret_cast<char *>(latin1_output.data()));
1741
}
1742
  #endif // SIMDUTF_SPAN
1743
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1744
1745
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1746
/**
1747
 * Convert valid UTF-16LE string into UTF-8 string.
1748
 *
1749
 * This function assumes that the input string is
1750
 * valid UTF-16LE.
1751
 *
1752
 * This function is not BOM-aware.
1753
 *
1754
 * @param input         the UTF-16LE string to convert
1755
 * @param length        the length of the string in 2-byte code units (char16_t)
1756
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
1757
 * result
1758
 * @return number of written code units; 0 if conversion is not possible
1759
 */
1760
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
1761
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1762
  #if SIMDUTF_SPAN
1763
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
1764
    std::span<const char16_t> valid_utf16_input,
1765
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1766
  return convert_valid_utf16le_to_utf8(
1767
      valid_utf16_input.data(), valid_utf16_input.size(),
1768
      reinterpret_cast<char *>(utf8_output.data()));
1769
}
1770
  #endif // SIMDUTF_SPAN
1771
1772
/**
1773
 * Convert valid UTF-16BE string into UTF-8 string.
1774
 *
1775
 * This function assumes that the input string is valid UTF-16BE.
1776
 *
1777
 * This function is not BOM-aware.
1778
 *
1779
 * @param input         the UTF-16BE string to convert
1780
 * @param length        the length of the string in 2-byte code units (char16_t)
1781
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
1782
 * result
1783
 * @return number of written code units; 0 if conversion is not possible
1784
 */
1785
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
1786
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1787
  #if SIMDUTF_SPAN
1788
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
1789
    std::span<const char16_t> valid_utf16_input,
1790
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1791
  return convert_valid_utf16be_to_utf8(
1792
      valid_utf16_input.data(), valid_utf16_input.size(),
1793
      reinterpret_cast<char *>(utf8_output.data()));
1794
}
1795
  #endif // SIMDUTF_SPAN
1796
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1797
1798
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
1799
/**
1800
 * Using native endianness, convert possibly broken UTF-16 string into UTF-32
1801
 * string.
1802
 *
1803
 * During the conversion also validation of the input string is done.
1804
 * This function is suitable to work with inputs from untrusted sources.
1805
 *
1806
 * This function is not BOM-aware.
1807
 *
1808
 * @param input         the UTF-16 string to convert
1809
 * @param length        the length of the string in 2-byte code units (char16_t)
1810
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1811
 * @return number of written code units; 0 if input is not a valid UTF-16
1812
 * string
1813
 */
1814
simdutf_warn_unused size_t convert_utf16_to_utf32(
1815
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1816
  #if SIMDUTF_SPAN
1817
simdutf_really_inline simdutf_warn_unused size_t
1818
convert_utf16_to_utf32(std::span<const char16_t> utf16_input,
1819
0
                       std::span<char32_t> utf32_output) noexcept {
1820
0
  return convert_utf16_to_utf32(utf16_input.data(), utf16_input.size(),
1821
0
                                utf32_output.data());
1822
0
}
1823
  #endif // SIMDUTF_SPAN
1824
1825
/**
1826
 * Convert possibly broken UTF-16LE string into UTF-32 string.
1827
 *
1828
 * During the conversion also validation of the input string is done.
1829
 * This function is suitable to work with inputs from untrusted sources.
1830
 *
1831
 * This function is not BOM-aware.
1832
 *
1833
 * @param input         the UTF-16LE string to convert
1834
 * @param length        the length of the string in 2-byte code units (char16_t)
1835
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1836
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1837
 * string
1838
 */
1839
simdutf_warn_unused size_t convert_utf16le_to_utf32(
1840
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1841
  #if SIMDUTF_SPAN
1842
simdutf_really_inline simdutf_warn_unused size_t
1843
convert_utf16le_to_utf32(std::span<const char16_t> utf16_input,
1844
0
                         std::span<char32_t> utf32_output) noexcept {
1845
0
  return convert_utf16le_to_utf32(utf16_input.data(), utf16_input.size(),
1846
0
                                  utf32_output.data());
1847
0
}
1848
  #endif // SIMDUTF_SPAN
1849
1850
/**
1851
 * Convert possibly broken UTF-16BE string into UTF-32 string.
1852
 *
1853
 * During the conversion also validation of the input string is done.
1854
 * This function is suitable to work with inputs from untrusted sources.
1855
 *
1856
 * This function is not BOM-aware.
1857
 *
1858
 * @param input         the UTF-16BE string to convert
1859
 * @param length        the length of the string in 2-byte code units (char16_t)
1860
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1861
 * @return number of written code units; 0 if input is not a valid UTF-16BE
1862
 * string
1863
 */
1864
simdutf_warn_unused size_t convert_utf16be_to_utf32(
1865
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1866
  #if SIMDUTF_SPAN
1867
simdutf_really_inline simdutf_warn_unused size_t
1868
convert_utf16be_to_utf32(std::span<const char16_t> utf16_input,
1869
0
                         std::span<char32_t> utf32_output) noexcept {
1870
0
  return convert_utf16be_to_utf32(utf16_input.data(), utf16_input.size(),
1871
0
                                  utf32_output.data());
1872
0
}
1873
  #endif // SIMDUTF_SPAN
1874
1875
/**
1876
 * Using native endianness, convert possibly broken UTF-16 string into
1877
 * UTF-32 string and stop on error.
1878
 *
1879
 * During the conversion also validation of the input string is done.
1880
 * This function is suitable to work with inputs from untrusted sources.
1881
 *
1882
 * This function is not BOM-aware.
1883
 *
1884
 * @param input         the UTF-16 string to convert
1885
 * @param length        the length of the string in 2-byte code units (char16_t)
1886
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1887
 * @return a result pair struct (of type simdutf::result containing the two
1888
 * fields error and count) with an error code and either position of the error
1889
 * (in the input in code units) if any, or the number of char32_t written if
1890
 * successful.
1891
 */
1892
simdutf_warn_unused result convert_utf16_to_utf32_with_errors(
1893
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1894
  #if SIMDUTF_SPAN
1895
simdutf_really_inline simdutf_warn_unused result
1896
convert_utf16_to_utf32_with_errors(std::span<const char16_t> utf16_input,
1897
0
                                   std::span<char32_t> utf32_output) noexcept {
1898
0
  return convert_utf16_to_utf32_with_errors(
1899
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
1900
0
}
1901
  #endif // SIMDUTF_SPAN
1902
1903
/**
1904
 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
1905
 *
1906
 * During the conversion also validation of the input string is done.
1907
 * This function is suitable to work with inputs from untrusted sources.
1908
 *
1909
 * This function is not BOM-aware.
1910
 *
1911
 * @param input         the UTF-16LE string to convert
1912
 * @param length        the length of the string in 2-byte code units (char16_t)
1913
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1914
 * @return a result pair struct (of type simdutf::result containing the two
1915
 * fields error and count) with an error code and either position of the error
1916
 * (in the input in code units) if any, or the number of char32_t written if
1917
 * successful.
1918
 */
1919
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
1920
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1921
  #if SIMDUTF_SPAN
1922
simdutf_really_inline simdutf_warn_unused result
1923
convert_utf16le_to_utf32_with_errors(
1924
    std::span<const char16_t> utf16_input,
1925
0
    std::span<char32_t> utf32_output) noexcept {
1926
0
  return convert_utf16le_to_utf32_with_errors(
1927
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
1928
0
}
1929
  #endif // SIMDUTF_SPAN
1930
1931
/**
1932
 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
1933
 *
1934
 * During the conversion also validation of the input string is done.
1935
 * This function is suitable to work with inputs from untrusted sources.
1936
 *
1937
 * This function is not BOM-aware.
1938
 *
1939
 * @param input         the UTF-16BE string to convert
1940
 * @param length        the length of the string in 2-byte code units (char16_t)
1941
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1942
 * @return a result pair struct (of type simdutf::result containing the two
1943
 * fields error and count) with an error code and either position of the error
1944
 * (in the input in code units) if any, or the number of char32_t written if
1945
 * successful.
1946
 */
1947
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
1948
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1949
  #if SIMDUTF_SPAN
1950
simdutf_really_inline simdutf_warn_unused result
1951
convert_utf16be_to_utf32_with_errors(
1952
    std::span<const char16_t> utf16_input,
1953
0
    std::span<char32_t> utf32_output) noexcept {
1954
0
  return convert_utf16be_to_utf32_with_errors(
1955
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
1956
0
}
1957
  #endif // SIMDUTF_SPAN
1958
1959
/**
1960
 * Using native endianness, convert valid UTF-16 string into UTF-32 string.
1961
 *
1962
 * This function assumes that the input string is valid UTF-16 (native
1963
 * endianness).
1964
 *
1965
 * This function is not BOM-aware.
1966
 *
1967
 * @param input         the UTF-16 string to convert
1968
 * @param length        the length of the string in 2-byte code units (char16_t)
1969
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
1970
 * result
1971
 * @return number of written code units; 0 if conversion is not possible
1972
 */
1973
simdutf_warn_unused size_t convert_valid_utf16_to_utf32(
1974
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1975
  #if SIMDUTF_SPAN
1976
simdutf_really_inline simdutf_warn_unused size_t
1977
convert_valid_utf16_to_utf32(std::span<const char16_t> valid_utf16_input,
1978
0
                             std::span<char32_t> utf32_output) noexcept {
1979
0
  return convert_valid_utf16_to_utf32(
1980
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
1981
0
}
1982
  #endif // SIMDUTF_SPAN
1983
1984
/**
1985
 * Convert valid UTF-16LE string into UTF-32 string.
1986
 *
1987
 * This function assumes that the input string is valid UTF-16LE.
1988
 *
1989
 * This function is not BOM-aware.
1990
 *
1991
 * @param input         the UTF-16LE string to convert
1992
 * @param length        the length of the string in 2-byte code units (char16_t)
1993
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
1994
 * result
1995
 * @return number of written code units; 0 if conversion is not possible
1996
 */
1997
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
1998
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1999
  #if SIMDUTF_SPAN
2000
simdutf_really_inline simdutf_warn_unused size_t
2001
convert_valid_utf16le_to_utf32(std::span<const char16_t> valid_utf16_input,
2002
0
                               std::span<char32_t> utf32_output) noexcept {
2003
0
  return convert_valid_utf16le_to_utf32(
2004
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
2005
0
}
2006
  #endif // SIMDUTF_SPAN
2007
2008
/**
2009
 * Convert valid UTF-16BE string into UTF-32 string.
2010
 *
2011
 * This function assumes that the input string is valid UTF-16BE.
2012
 *
2013
 * This function is not BOM-aware.
2014
 *
2015
 * @param input         the UTF-16BE string to convert
2016
 * @param length        the length of the string in 2-byte code units (char16_t)
2017
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
2018
 * result
2019
 * @return number of written code units; 0 if conversion is not possible
2020
 */
2021
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
2022
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2023
  #if SIMDUTF_SPAN
2024
simdutf_really_inline simdutf_warn_unused size_t
2025
convert_valid_utf16be_to_utf32(std::span<const char16_t> valid_utf16_input,
2026
0
                               std::span<char32_t> utf32_output) noexcept {
2027
0
  return convert_valid_utf16be_to_utf32(
2028
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
2029
0
}
2030
  #endif // SIMDUTF_SPAN
2031
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2032
2033
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2034
/**
2035
 * Compute the number of bytes that this UTF-16LE/BE string would require in
2036
 * Latin1 format.
2037
 *
2038
 * This function does not validate the input. It is acceptable to pass invalid
2039
 * UTF-16 strings but in such cases the result is implementation defined.
2040
 *
2041
 * This function is not BOM-aware.
2042
 *
2043
 * @param length        the length of the string in 2-byte code units (char16_t)
2044
 * @return the number of bytes required to encode the UTF-16 string as Latin1
2045
 */
2046
simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
2047
2048
/**
2049
 * Using native endianness, compute the number of bytes that this UTF-16
2050
 * string would require in UTF-8 format.
2051
 *
2052
 * This function does not validate the input. It is acceptable to pass invalid
2053
 * UTF-16 strings but in such cases the result is implementation defined.
2054
 *
2055
 * @param input         the UTF-16 string to convert
2056
 * @param length        the length of the string in 2-byte code units (char16_t)
2057
 * @return the number of bytes required to encode the UTF-16 string as UTF-8
2058
 */
2059
simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input,
2060
                                                  size_t length) noexcept;
2061
  #if SIMDUTF_SPAN
2062
simdutf_really_inline simdutf_warn_unused size_t
2063
0
utf8_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2064
0
  return utf8_length_from_utf16(valid_utf16_input.data(),
2065
0
                                valid_utf16_input.size());
2066
0
}
2067
  #endif // SIMDUTF_SPAN
2068
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2069
2070
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2071
/**
2072
 * Compute the number of bytes that this UTF-16LE string would require in UTF-8
2073
 * format.
2074
 *
2075
 * This function does not validate the input. It is acceptable to pass invalid
2076
 * UTF-16 strings but in such cases the result is implementation defined.
2077
 *
2078
 * @param input         the UTF-16LE string to convert
2079
 * @param length        the length of the string in 2-byte code units (char16_t)
2080
 * @return the number of bytes required to encode the UTF-16LE string as UTF-8
2081
 */
2082
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input,
2083
                                                    size_t length) noexcept;
2084
  #if SIMDUTF_SPAN
2085
simdutf_really_inline simdutf_warn_unused size_t
2086
0
utf8_length_from_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
2087
0
  return utf8_length_from_utf16le(valid_utf16_input.data(),
2088
0
                                  valid_utf16_input.size());
2089
0
}
2090
  #endif // SIMDUTF_SPAN
2091
2092
/**
2093
 * Compute the number of bytes that this UTF-16BE string would require in UTF-8
2094
 * format.
2095
 *
2096
 * This function does not validate the input. It is acceptable to pass invalid
2097
 * UTF-16 strings but in such cases the result is implementation defined.
2098
 *
2099
 * @param input         the UTF-16BE string to convert
2100
 * @param length        the length of the string in 2-byte code units (char16_t)
2101
 * @return the number of bytes required to encode the UTF-16BE string as UTF-8
2102
 */
2103
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input,
2104
                                                    size_t length) noexcept;
2105
  #if SIMDUTF_SPAN
2106
simdutf_really_inline simdutf_warn_unused size_t
2107
0
utf8_length_from_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
2108
0
  return utf8_length_from_utf16be(valid_utf16_input.data(),
2109
0
                                  valid_utf16_input.size());
2110
0
}
2111
  #endif // SIMDUTF_SPAN
2112
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2113
2114
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2115
/**
2116
 * Convert possibly broken UTF-32 string into UTF-8 string.
2117
 *
2118
 * During the conversion also validation of the input string is done.
2119
 * This function is suitable to work with inputs from untrusted sources.
2120
 *
2121
 * This function is not BOM-aware.
2122
 *
2123
 * @param input         the UTF-32 string to convert
2124
 * @param length        the length of the string in 4-byte code units (char32_t)
2125
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2126
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2127
 */
2128
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *input,
2129
                                                 size_t length,
2130
                                                 char *utf8_buffer) noexcept;
2131
  #if SIMDUTF_SPAN
2132
simdutf_really_inline simdutf_warn_unused size_t convert_utf32_to_utf8(
2133
    std::span<const char32_t> utf32_input,
2134
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2135
  return convert_utf32_to_utf8(utf32_input.data(), utf32_input.size(),
2136
                               reinterpret_cast<char *>(utf8_output.data()));
2137
}
2138
  #endif // SIMDUTF_SPAN
2139
2140
/**
2141
 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
2142
 *
2143
 * During the conversion also validation of the input string is done.
2144
 * This function is suitable to work with inputs from untrusted sources.
2145
 *
2146
 * This function is not BOM-aware.
2147
 *
2148
 * @param input         the UTF-32 string to convert
2149
 * @param length        the length of the string in 4-byte code units (char32_t)
2150
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2151
 * @return a result pair struct (of type simdutf::result containing the two
2152
 * fields error and count) with an error code and either position of the error
2153
 * (in the input in code units) if any, or the number of char written if
2154
 * successful.
2155
 */
2156
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
2157
    const char32_t *input, size_t length, char *utf8_buffer) noexcept;
2158
  #if SIMDUTF_SPAN
2159
simdutf_really_inline simdutf_warn_unused result
2160
convert_utf32_to_utf8_with_errors(
2161
    std::span<const char32_t> utf32_input,
2162
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2163
  return convert_utf32_to_utf8_with_errors(
2164
      utf32_input.data(), utf32_input.size(),
2165
      reinterpret_cast<char *>(utf8_output.data()));
2166
}
2167
  #endif // SIMDUTF_SPAN
2168
2169
/**
2170
 * Convert valid UTF-32 string into UTF-8 string.
2171
 *
2172
 * This function assumes that the input string is valid UTF-32.
2173
 *
2174
 * This function is not BOM-aware.
2175
 *
2176
 * @param input         the UTF-32 string to convert
2177
 * @param length        the length of the string in 4-byte code units (char32_t)
2178
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
2179
 * result
2180
 * @return number of written code units; 0 if conversion is not possible
2181
 */
2182
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
2183
    const char32_t *input, size_t length, char *utf8_buffer) noexcept;
2184
  #if SIMDUTF_SPAN
2185
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
2186
    std::span<const char32_t> valid_utf32_input,
2187
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2188
  return convert_valid_utf32_to_utf8(
2189
      valid_utf32_input.data(), valid_utf32_input.size(),
2190
      reinterpret_cast<char *>(utf8_output.data()));
2191
}
2192
  #endif // SIMDUTF_SPAN
2193
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2194
2195
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2196
/**
2197
 * Using native endianness, convert possibly broken UTF-32 string into a UTF-16
2198
 * string.
2199
 *
2200
 * During the conversion also validation of the input string is done.
2201
 * This function is suitable to work with inputs from untrusted sources.
2202
 *
2203
 * This function is not BOM-aware.
2204
 *
2205
 * @param input         the UTF-32 string to convert
2206
 * @param length        the length of the string in 4-byte code units (char32_t)
2207
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2208
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2209
 */
2210
simdutf_warn_unused size_t convert_utf32_to_utf16(
2211
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2212
  #if SIMDUTF_SPAN
2213
simdutf_really_inline simdutf_warn_unused size_t
2214
convert_utf32_to_utf16(std::span<const char32_t> utf32_input,
2215
0
                       std::span<char16_t> utf16_output) noexcept {
2216
0
  return convert_utf32_to_utf16(utf32_input.data(), utf32_input.size(),
2217
0
                                utf16_output.data());
2218
0
}
2219
  #endif // SIMDUTF_SPAN
2220
2221
/**
2222
 * Convert possibly broken UTF-32 string into UTF-16LE string.
2223
 *
2224
 * During the conversion also validation of the input string is done.
2225
 * This function is suitable to work with inputs from untrusted sources.
2226
 *
2227
 * This function is not BOM-aware.
2228
 *
2229
 * @param input         the UTF-32 string to convert
2230
 * @param length        the length of the string in 4-byte code units (char32_t)
2231
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2232
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2233
 */
2234
simdutf_warn_unused size_t convert_utf32_to_utf16le(
2235
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2236
  #if SIMDUTF_SPAN
2237
simdutf_really_inline simdutf_warn_unused size_t
2238
convert_utf32_to_utf16le(std::span<const char32_t> utf32_input,
2239
0
                         std::span<char16_t> utf16_output) noexcept {
2240
0
  return convert_utf32_to_utf16le(utf32_input.data(), utf32_input.size(),
2241
0
                                  utf16_output.data());
2242
0
}
2243
  #endif // SIMDUTF_SPAN
2244
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2245
2246
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
2247
/**
2248
 * Convert possibly broken UTF-32 string into Latin1 string.
2249
 *
2250
 * During the conversion also validation of the input string is done.
2251
 * This function is suitable to work with inputs from untrusted sources.
2252
 *
2253
 * This function is not BOM-aware.
2254
 *
2255
 * @param input         the UTF-32 string to convert
2256
 * @param length        the length of the string in 4-byte code units (char32_t)
2257
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2258
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2259
 * or if it cannot be represented as Latin1
2260
 */
2261
simdutf_warn_unused size_t convert_utf32_to_latin1(
2262
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
2263
  #if SIMDUTF_SPAN
2264
simdutf_really_inline simdutf_warn_unused size_t convert_utf32_to_latin1(
2265
    std::span<const char32_t> utf32_input,
2266
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2267
  return convert_utf32_to_latin1(
2268
      utf32_input.data(), utf32_input.size(),
2269
      reinterpret_cast<char *>(latin1_output.data()));
2270
}
2271
  #endif // SIMDUTF_SPAN
2272
2273
/**
2274
 * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
2275
 * If the string cannot be represented as Latin1, an error is returned.
2276
 *
2277
 * During the conversion also validation of the input string is done.
2278
 * This function is suitable to work with inputs from untrusted sources.
2279
 *
2280
 * This function is not BOM-aware.
2281
 *
2282
 * @param input         the UTF-32 string to convert
2283
 * @param length        the length of the string in 4-byte code units (char32_t)
2284
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2285
 * @return a result pair struct (of type simdutf::result containing the two
2286
 * fields error and count) with an error code and either position of the error
2287
 * (in the input in code units) if any, or the number of char written if
2288
 * successful.
2289
 */
2290
simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
2291
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
2292
  #if SIMDUTF_SPAN
2293
simdutf_really_inline simdutf_warn_unused result
2294
convert_utf32_to_latin1_with_errors(
2295
    std::span<const char32_t> utf32_input,
2296
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2297
  return convert_utf32_to_latin1_with_errors(
2298
      utf32_input.data(), utf32_input.size(),
2299
      reinterpret_cast<char *>(latin1_output.data()));
2300
}
2301
  #endif // SIMDUTF_SPAN
2302
2303
/**
2304
 * Convert valid UTF-32 string into Latin1 string.
2305
 *
2306
 * This function assumes that the input string is valid UTF-32 and that it can
2307
 * be represented as Latin1. If you violate this assumption, the result is
2308
 * implementation defined and may include system-dependent behavior such as
2309
 * crashes.
2310
 *
2311
 * This function is for expert users only and not part of our public API. Use
2312
 * convert_utf32_to_latin1 instead. The function may be removed from the library
2313
 * in the future.
2314
 *
2315
 * This function is not BOM-aware.
2316
 *
2317
 * @param input         the UTF-32 string to convert
2318
 * @param length        the length of the string in 4-byte code units (char32_t)
2319
 * @param latin1_buffer   the pointer to a buffer that can hold the conversion
2320
 * result
2321
 * @return number of written code units; 0 if conversion is not possible
2322
 */
2323
simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
2324
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
2325
  #if SIMDUTF_SPAN
2326
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
2327
    std::span<const char32_t> valid_utf32_input,
2328
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2329
  return convert_valid_utf32_to_latin1(
2330
      valid_utf32_input.data(), valid_utf32_input.size(),
2331
      reinterpret_cast<char *>(latin1_output.data()));
2332
}
2333
  #endif // SIMDUTF_SPAN
2334
2335
/**
2336
 * Compute the number of bytes that this UTF-32 string would require in Latin1
2337
 * format.
2338
 *
2339
 * This function does not validate the input. It is acceptable to pass invalid
2340
 * UTF-32 strings but in such cases the result is implementation defined.
2341
 *
2342
 * This function is not BOM-aware.
2343
 *
2344
 * @param length        the length of the string in 4-byte code units (char32_t)
2345
 * @return the number of bytes required to encode the UTF-32 string as Latin1
2346
 */
2347
simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) noexcept;
2348
2349
/**
2350
 * Compute the number of 4-byte code units (char32_t) that this Latin1 string
2351
 * would require in UTF-32 format.
2352
 *
2353
 * @param length        the length of the string in Latin1 code units (char)
2354
 * @return the length of the string in 4-byte code units (char32_t) required to
2355
 * encode the Latin1 string as UTF-32
2356
 */
2357
simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) noexcept;
2358
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
2359
2360
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2361
/**
2362
 * Convert possibly broken UTF-32 string into UTF-16BE string.
2363
 *
2364
 * During the conversion also validation of the input string is done.
2365
 * This function is suitable to work with inputs from untrusted sources.
2366
 *
2367
 * This function is not BOM-aware.
2368
 *
2369
 * @param input         the UTF-32 string to convert
2370
 * @param length        the length of the string in 4-byte code units (char32_t)
2371
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2372
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2373
 */
2374
simdutf_warn_unused size_t convert_utf32_to_utf16be(
2375
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2376
  #if SIMDUTF_SPAN
2377
simdutf_really_inline simdutf_warn_unused size_t
2378
convert_utf32_to_utf16be(std::span<const char32_t> utf32_input,
2379
0
                         std::span<char16_t> utf16_output) noexcept {
2380
0
  return convert_utf32_to_utf16be(utf32_input.data(), utf32_input.size(),
2381
0
                                  utf16_output.data());
2382
0
}
2383
  #endif // SIMDUTF_SPAN
2384
2385
/**
2386
 * Using native endianness, convert possibly broken UTF-32 string into UTF-16
2387
 * string and stop on error.
2388
 *
2389
 * During the conversion also validation of the input string is done.
2390
 * This function is suitable to work with inputs from untrusted sources.
2391
 *
2392
 * This function is not BOM-aware.
2393
 *
2394
 * @param input         the UTF-32 string to convert
2395
 * @param length        the length of the string in 4-byte code units (char32_t)
2396
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2397
 * @return a result pair struct (of type simdutf::result containing the two
2398
 * fields error and count) with an error code and either position of the error
2399
 * (in the input in code units) if any, or the number of char16_t written if
2400
 * successful.
2401
 */
2402
simdutf_warn_unused result convert_utf32_to_utf16_with_errors(
2403
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2404
  #if SIMDUTF_SPAN
2405
simdutf_really_inline simdutf_warn_unused result
2406
convert_utf32_to_utf16_with_errors(std::span<const char32_t> utf32_input,
2407
0
                                   std::span<char16_t> utf16_output) noexcept {
2408
0
  return convert_utf32_to_utf16_with_errors(
2409
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
2410
0
}
2411
  #endif // SIMDUTF_SPAN
2412
2413
/**
2414
 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
2415
 *
2416
 * During the conversion also validation of the input string is done.
2417
 * This function is suitable to work with inputs from untrusted sources.
2418
 *
2419
 * This function is not BOM-aware.
2420
 *
2421
 * @param input         the UTF-32 string to convert
2422
 * @param length        the length of the string in 4-byte code units (char32_t)
2423
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2424
 * @return a result pair struct (of type simdutf::result containing the two
2425
 * fields error and count) with an error code and either position of the error
2426
 * (in the input in code units) if any, or the number of char16_t written if
2427
 * successful.
2428
 */
2429
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
2430
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2431
  #if SIMDUTF_SPAN
2432
simdutf_really_inline simdutf_warn_unused result
2433
convert_utf32_to_utf16le_with_errors(
2434
    std::span<const char32_t> utf32_input,
2435
0
    std::span<char16_t> utf16_output) noexcept {
2436
0
  return convert_utf32_to_utf16le_with_errors(
2437
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
2438
0
}
2439
  #endif // SIMDUTF_SPAN
2440
2441
/**
2442
 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
2443
 *
2444
 * During the conversion also validation of the input string is done.
2445
 * This function is suitable to work with inputs from untrusted sources.
2446
 *
2447
 * This function is not BOM-aware.
2448
 *
2449
 * @param input         the UTF-32 string to convert
2450
 * @param length        the length of the string in 4-byte code units (char32_t)
2451
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2452
 * @return a result pair struct (of type simdutf::result containing the two
2453
 * fields error and count) with an error code and either position of the error
2454
 * (in the input in code units) if any, or the number of char16_t written if
2455
 * successful.
2456
 */
2457
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
2458
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2459
  #if SIMDUTF_SPAN
2460
simdutf_really_inline simdutf_warn_unused result
2461
convert_utf32_to_utf16be_with_errors(
2462
    std::span<const char32_t> utf32_input,
2463
0
    std::span<char16_t> utf16_output) noexcept {
2464
0
  return convert_utf32_to_utf16be_with_errors(
2465
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
2466
0
}
2467
  #endif // SIMDUTF_SPAN
2468
2469
/**
2470
 * Using native endianness, convert valid UTF-32 string into a UTF-16 string.
2471
 *
2472
 * This function assumes that the input string is valid UTF-32.
2473
 *
2474
 * This function is not BOM-aware.
2475
 *
2476
 * @param input         the UTF-32 string to convert
2477
 * @param length        the length of the string in 4-byte code units (char32_t)
2478
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
2479
 * result
2480
 * @return number of written code units; 0 if conversion is not possible
2481
 */
2482
simdutf_warn_unused size_t convert_valid_utf32_to_utf16(
2483
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2484
  #if SIMDUTF_SPAN
2485
simdutf_really_inline simdutf_warn_unused size_t
2486
convert_valid_utf32_to_utf16(std::span<const char32_t> valid_utf32_input,
2487
0
                             std::span<char16_t> utf16_output) noexcept {
2488
0
  return convert_valid_utf32_to_utf16(
2489
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
2490
0
}
2491
  #endif // SIMDUTF_SPAN
2492
2493
/**
2494
 * Convert valid UTF-32 string into UTF-16LE string.
2495
 *
2496
 * This function assumes that the input string is valid UTF-32.
2497
 *
2498
 * This function is not BOM-aware.
2499
 *
2500
 * @param input         the UTF-32 string to convert
2501
 * @param length        the length of the string in 4-byte code units (char32_t)
2502
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
2503
 * result
2504
 * @return number of written code units; 0 if conversion is not possible
2505
 */
2506
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
2507
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2508
  #if SIMDUTF_SPAN
2509
simdutf_really_inline simdutf_warn_unused size_t
2510
convert_valid_utf32_to_utf16le(std::span<const char32_t> valid_utf32_input,
2511
0
                               std::span<char16_t> utf16_output) noexcept {
2512
0
  return convert_valid_utf32_to_utf16le(
2513
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
2514
0
}
2515
  #endif // SIMDUTF_SPAN
2516
2517
/**
2518
 * Convert valid UTF-32 string into UTF-16BE string.
2519
 *
2520
 * This function assumes that the input string is valid UTF-32.
2521
 *
2522
 * This function is not BOM-aware.
2523
 *
2524
 * @param input         the UTF-32 string to convert
2525
 * @param length        the length of the string in 4-byte code units (char32_t)
2526
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
2527
 * result
2528
 * @return number of written code units; 0 if conversion is not possible
2529
 */
2530
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
2531
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2532
  #if SIMDUTF_SPAN
2533
simdutf_really_inline simdutf_warn_unused size_t
2534
convert_valid_utf32_to_utf16be(std::span<const char32_t> valid_utf32_input,
2535
0
                               std::span<char16_t> utf16_output) noexcept {
2536
0
  return convert_valid_utf32_to_utf16be(
2537
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
2538
0
}
2539
  #endif // SIMDUTF_SPAN
2540
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2541
2542
#if SIMDUTF_FEATURE_UTF16
2543
/**
2544
 * Change the endianness of the input. Can be used to go from UTF-16LE to
2545
 * UTF-16BE or from UTF-16BE to UTF-16LE.
2546
 *
2547
 * This function does not validate the input.
2548
 *
2549
 * This function is not BOM-aware.
2550
 *
2551
 * @param input         the UTF-16 string to process
2552
 * @param length        the length of the string in 2-byte code units (char16_t)
2553
 * @param output        the pointer to a buffer that can hold the conversion
2554
 * result
2555
 */
2556
void change_endianness_utf16(const char16_t *input, size_t length,
2557
                             char16_t *output) noexcept;
2558
  #if SIMDUTF_SPAN
2559
simdutf_really_inline void
2560
change_endianness_utf16(std::span<const char16_t> utf16_input,
2561
0
                        std::span<char16_t> utf16_output) noexcept {
2562
0
  return change_endianness_utf16(utf16_input.data(), utf16_input.size(),
2563
0
                                 utf16_output.data());
2564
0
}
2565
  #endif // SIMDUTF_SPAN
2566
#endif   // SIMDUTF_FEATURE_UTF16
2567
2568
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2569
/**
2570
 * Compute the number of bytes that this UTF-32 string would require in UTF-8
2571
 * format.
2572
 *
2573
 * This function does not validate the input. It is acceptable to pass invalid
2574
 * UTF-32 strings but in such cases the result is implementation defined.
2575
 *
2576
 * @param input         the UTF-32 string to convert
2577
 * @param length        the length of the string in 4-byte code units (char32_t)
2578
 * @return the number of bytes required to encode the UTF-32 string as UTF-8
2579
 */
2580
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input,
2581
                                                  size_t length) noexcept;
2582
  #if SIMDUTF_SPAN
2583
simdutf_really_inline simdutf_warn_unused size_t
2584
0
utf8_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
2585
0
  return utf8_length_from_utf32(valid_utf32_input.data(),
2586
0
                                valid_utf32_input.size());
2587
0
}
2588
  #endif // SIMDUTF_SPAN
2589
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2590
2591
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2592
/**
2593
 * Compute the number of two-byte code units that this UTF-32 string would
2594
 * require in UTF-16 format.
2595
 *
2596
 * This function does not validate the input. It is acceptable to pass invalid
2597
 * UTF-32 strings but in such cases the result is implementation defined.
2598
 *
2599
 * @param input         the UTF-32 string to convert
2600
 * @param length        the length of the string in 4-byte code units (char32_t)
2601
 * @return the number of 2-byte code units needed to encode the input as UTF-16
2602
 */
2603
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input,
2604
                                                   size_t length) noexcept;
2605
  #if SIMDUTF_SPAN
2606
simdutf_really_inline simdutf_warn_unused size_t
2607
0
utf16_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
2608
0
  return utf16_length_from_utf32(valid_utf32_input.data(),
2609
0
                                 valid_utf32_input.size());
2610
0
}
2611
  #endif // SIMDUTF_SPAN
2612
2613
/**
2614
 * Using native endianness, compute the number of 4-byte code units (char32_t)
2615
 * that this UTF-16 string would require in UTF-32 format.
2616
 *
2617
 * This function is equivalent to count_utf16.
2618
 *
2619
 * This function does not validate the input. It is acceptable to pass invalid
2620
 * UTF-16 strings but in such cases the result is implementation defined.
2621
 *
2622
 * This function is not BOM-aware.
2623
 *
2624
 * @param input         the UTF-16 string to convert
2625
 * @param length        the length of the string in 2-byte code units (char16_t)
2626
 * @return the number of 4-byte code units needed to encode the input as UTF-32
2627
 */
2628
simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input,
2629
                                                   size_t length) noexcept;
2630
  #if SIMDUTF_SPAN
2631
simdutf_really_inline simdutf_warn_unused size_t
2632
0
utf32_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2633
0
  return utf32_length_from_utf16(valid_utf16_input.data(),
2634
0
                                 valid_utf16_input.size());
2635
0
}
2636
  #endif // SIMDUTF_SPAN
2637
2638
/**
2639
 * Compute the number of 4-byte code units (char32_t) that this UTF-16LE string
2640
 * would require in UTF-32 format.
2641
 *
2642
 * This function is equivalent to count_utf16le.
2643
 *
2644
 * This function does not validate the input. It is acceptable to pass invalid
2645
 * UTF-16 strings but in such cases the result is implementation defined.
2646
 *
2647
 * This function is not BOM-aware.
2648
 *
2649
 * @param input         the UTF-16LE string to convert
2650
 * @param length        the length of the string in 2-byte code units (char16_t)
2651
 * @return the number of 4-byte code units needed to encode the input as UTF-32
2652
 */
2653
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input,
2654
                                                     size_t length) noexcept;
2655
  #if SIMDUTF_SPAN
2656
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf16le(
2657
0
    std::span<const char16_t> valid_utf16_input) noexcept {
2658
0
  return utf32_length_from_utf16le(valid_utf16_input.data(),
2659
0
                                   valid_utf16_input.size());
2660
0
}
2661
  #endif // SIMDUTF_SPAN
2662
2663
/**
2664
 * Compute the number of 4-byte code units (char32_t) that this UTF-16BE string
2665
 * would require in UTF-32 format.
2666
 *
2667
 * This function is equivalent to count_utf16be.
2668
 *
2669
 * This function does not validate the input. It is acceptable to pass invalid
2670
 * UTF-16 strings but in such cases the result is implementation defined.
2671
 *
2672
 * This function is not BOM-aware.
2673
 *
2674
 * @param input         the UTF-16BE string to convert
2675
 * @param length        the length of the string in 2-byte code units (char16_t)
2676
 * @return the number of 4-byte code units needed to encode the input as UTF-32
2677
 */
2678
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input,
2679
                                                     size_t length) noexcept;
2680
  #if SIMDUTF_SPAN
2681
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf16be(
2682
0
    std::span<const char16_t> valid_utf16_input) noexcept {
2683
0
  return utf32_length_from_utf16be(valid_utf16_input.data(),
2684
0
                                   valid_utf16_input.size());
2685
0
}
2686
  #endif // SIMDUTF_SPAN
2687
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2688
2689
#if SIMDUTF_FEATURE_UTF16
2690
/**
2691
 * Count the number of code points (characters) in the string assuming that
2692
 * it is valid.
2693
 *
2694
 * This function assumes that the input string is valid UTF-16 (native
2695
 * endianness). It is acceptable to pass invalid UTF-16 strings but in such
2696
 * cases the result is implementation defined.
2697
 *
2698
 * This function is not BOM-aware.
2699
 *
2700
 * @param input         the UTF-16 string to process
2701
 * @param length        the length of the string in 2-byte code units (char16_t)
2702
 * @return number of code points
2703
 */
2704
simdutf_warn_unused size_t count_utf16(const char16_t *input,
2705
                                       size_t length) noexcept;
2706
  #if SIMDUTF_SPAN
2707
simdutf_really_inline simdutf_warn_unused size_t
2708
0
count_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2709
0
  return count_utf16(valid_utf16_input.data(), valid_utf16_input.size());
2710
0
}
2711
  #endif // SIMDUTF_SPAN
2712
2713
/**
2714
 * Count the number of code points (characters) in the string assuming that
2715
 * it is valid.
2716
 *
2717
 * This function assumes that the input string is valid UTF-16LE.
2718
 * It is acceptable to pass invalid UTF-16 strings but in such cases
2719
 * the result is implementation defined.
2720
 *
2721
 * This function is not BOM-aware.
2722
 *
2723
 * @param input         the UTF-16LE string to process
2724
 * @param length        the length of the string in 2-byte code units (char16_t)
2725
 * @return number of code points
2726
 */
2727
simdutf_warn_unused size_t count_utf16le(const char16_t *input,
2728
                                         size_t length) noexcept;
2729
  #if SIMDUTF_SPAN
2730
simdutf_really_inline simdutf_warn_unused size_t
2731
0
count_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
2732
0
  return count_utf16le(valid_utf16_input.data(), valid_utf16_input.size());
2733
0
}
2734
  #endif // SIMDUTF_SPAN
2735
2736
/**
2737
 * Count the number of code points (characters) in the string assuming that
2738
 * it is valid.
2739
 *
2740
 * This function assumes that the input string is valid UTF-16BE.
2741
 * It is acceptable to pass invalid UTF-16 strings but in such cases
2742
 * the result is implementation defined.
2743
 *
2744
 * This function is not BOM-aware.
2745
 *
2746
 * @param input         the UTF-16BE string to process
2747
 * @param length        the length of the string in 2-byte code units (char16_t)
2748
 * @return number of code points
2749
 */
2750
simdutf_warn_unused size_t count_utf16be(const char16_t *input,
2751
                                         size_t length) noexcept;
2752
  #if SIMDUTF_SPAN
2753
simdutf_really_inline simdutf_warn_unused size_t
2754
0
count_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
2755
0
  return count_utf16be(valid_utf16_input.data(), valid_utf16_input.size());
2756
0
}
2757
  #endif // SIMDUTF_SPAN
2758
#endif   // SIMDUTF_FEATURE_UTF16
2759
2760
#if SIMDUTF_FEATURE_UTF8
2761
/**
2762
 * Count the number of code points (characters) in the string assuming that
2763
 * it is valid.
2764
 *
2765
 * This function assumes that the input string is valid UTF-8.
2766
 * It is acceptable to pass invalid UTF-8 strings but in such cases
2767
 * the result is implementation defined.
2768
 *
2769
 * @param input         the UTF-8 string to process
2770
 * @param length        the length of the string in bytes
2771
 * @return number of code points
2772
 */
2773
simdutf_warn_unused size_t count_utf8(const char *input,
2774
                                      size_t length) noexcept;
2775
  #if SIMDUTF_SPAN
2776
simdutf_really_inline simdutf_warn_unused size_t count_utf8(
2777
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
2778
  return count_utf8(reinterpret_cast<const char *>(valid_utf8_input.data()),
2779
                    valid_utf8_input.size());
2780
}
2781
  #endif // SIMDUTF_SPAN
2782
2783
/**
2784
 * Given a valid UTF-8 string having a possibly truncated last character,
2785
 * this function checks the end of string. If the last character is truncated
2786
 * (or partial), then it returns a shorter length (shorter by 1 to 3 bytes) so
2787
 * that the short UTF-8 strings only contain complete characters. If there is no
2788
 * truncated character, the original length is returned.
2789
 *
2790
 * This function assumes that the input string is valid UTF-8, but possibly
2791
 * truncated.
2792
 *
2793
 * @param input         the UTF-8 string to process
2794
 * @param length        the length of the string in bytes
2795
 * @return the length of the string in bytes, possibly shorter by 1 to 3 bytes
2796
 */
2797
simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length);
2798
  #if SIMDUTF_SPAN
2799
simdutf_really_inline simdutf_warn_unused size_t trim_partial_utf8(
2800
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
2801
  return trim_partial_utf8(
2802
      reinterpret_cast<const char *>(valid_utf8_input.data()),
2803
      valid_utf8_input.size());
2804
}
2805
  #endif // SIMDUTF_SPAN
2806
#endif   // SIMDUTF_FEATURE_UTF8
2807
2808
#if SIMDUTF_FEATURE_UTF16
2809
/**
2810
 * Given a valid UTF-16BE string having a possibly truncated last character,
2811
 * this function checks the end of string. If the last character is truncated
2812
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
2813
 * the short UTF-16BE strings only contain complete characters. If there is no
2814
 * truncated character, the original length is returned.
2815
 *
2816
 * This function assumes that the input string is valid UTF-16BE, but possibly
2817
 * truncated.
2818
 *
2819
 * @param input         the UTF-16BE string to process
2820
 * @param length        the length of the string in 16-bit code units
2821
 * @return the length of the string in units, possibly shorter by 1 unit
2822
 */
2823
simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input,
2824
                                                size_t length);
2825
  #if SIMDUTF_SPAN
2826
simdutf_really_inline simdutf_warn_unused size_t
2827
0
trim_partial_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
2828
0
  return trim_partial_utf16be(valid_utf16_input.data(),
2829
0
                              valid_utf16_input.size());
2830
0
}
2831
  #endif // SIMDUTF_SPAN
2832
2833
/**
2834
 * Given a valid UTF-16LE string having a possibly truncated last character,
2835
 * this function checks the end of string. If the last character is truncated
2836
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
2837
 * the short UTF-16LE strings only contain complete characters. If there is no
2838
 * truncated character, the original length is returned.
2839
 *
2840
 * This function assumes that the input string is valid UTF-16LE, but possibly
2841
 * truncated.
2842
 *
2843
 * @param input         the UTF-16LE string to process
2844
 * @param length        the length of the string in 16-bit code units
2845
 * @return the length of the string in units, possibly shorter by 1 unit
2846
 */
2847
simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input,
2848
                                                size_t length);
2849
  #if SIMDUTF_SPAN
2850
simdutf_really_inline simdutf_warn_unused size_t
2851
0
trim_partial_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
2852
0
  return trim_partial_utf16le(valid_utf16_input.data(),
2853
0
                              valid_utf16_input.size());
2854
0
}
2855
  #endif // SIMDUTF_SPAN
2856
2857
/**
2858
 * Given a valid UTF-16 string having a possibly truncated last character,
2859
 * this function checks the end of string. If the last character is truncated
2860
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
2861
 * the short UTF-16 strings only contain complete characters. If there is no
2862
 * truncated character, the original length is returned.
2863
 *
2864
 * This function assumes that the input string is valid UTF-16, but possibly
2865
 * truncated. We use the native endianness.
2866
 *
2867
 * @param input         the UTF-16 string to process
2868
 * @param length        the length of the string in 16-bit code units
2869
 * @return the length of the string in units, possibly shorter by 1 unit
2870
 */
2871
simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input,
2872
                                              size_t length);
2873
  #if SIMDUTF_SPAN
2874
simdutf_really_inline simdutf_warn_unused size_t
2875
0
trim_partial_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2876
0
  return trim_partial_utf16(valid_utf16_input.data(), valid_utf16_input.size());
2877
0
}
2878
  #endif // SIMDUTF_SPAN
2879
#endif   // SIMDUTF_FEATURE_UTF16
2880
2881
#if SIMDUTF_FEATURE_BASE64
2882
  #ifndef SIMDUTF_NEED_TRAILING_ZEROES
2883
    #define SIMDUTF_NEED_TRAILING_ZEROES 1
2884
  #endif
2885
// base64_options are used to specify the base64 encoding options.
2886
// ASCII spaces are ' ', '\t', '\n', '\r', '\f'
2887
// garbage characters are characters that are not part of the base64 alphabet
2888
// nor ASCII spaces.
2889
constexpr uint64_t base64_reverse_padding =
2890
    2; /* modifier for base64_default and base64_url */
2891
enum base64_options : uint64_t {
2892
  base64_default = 0, /* standard base64 format (with padding) */
2893
  base64_url = 1,     /* base64url format (no padding) */
2894
  base64_default_no_padding =
2895
      base64_default |
2896
      base64_reverse_padding, /* standard base64 format without padding */
2897
  base64_url_with_padding =
2898
      base64_url | base64_reverse_padding, /* base64url with padding */
2899
  base64_default_accept_garbage =
2900
      4, /* standard base64 format accepting garbage characters, the input stops
2901
            with the first '=' if any */
2902
  base64_url_accept_garbage =
2903
      5, /* base64url format accepting garbage characters, the input stops with
2904
            the first '=' if any */
2905
  base64_default_or_url =
2906
      8, /* standard/base64url hybrid format (only meaningful for decoding!) */
2907
  base64_default_or_url_accept_garbage =
2908
      12, /* standard/base64url hybrid format accepting garbage characters
2909
             (only meaningful for decoding!), the input stops with the first '='
2910
             if any */
2911
};
2912
2913
  #if SIMDUTF_CPLUSPLUS17
2914
0
inline std::string_view to_string(base64_options options) { // name of the option value; values with no case label yield "<unknown>"
2915
0
  switch (options) {
2916
0
  case base64_default:
2917
0
    return "base64_default";
2918
0
  case base64_url:
2919
0
    return "base64_url";
2920
0
  case base64_reverse_padding: // NOTE(review): modifier constant (value 2), same value as enumerator base64_default_no_padding
2921
0
    return "base64_reverse_padding"; // NOTE(review): base64_default_no_padding is thus reported under the modifier's name; confirm intended
2922
0
  case base64_url_with_padding:
2923
0
    return "base64_url_with_padding";
2924
0
  case base64_default_accept_garbage:
2925
0
    return "base64_default_accept_garbage";
2926
0
  case base64_url_accept_garbage:
2927
0
    return "base64_url_accept_garbage";
2928
0
  case base64_default_or_url:
2929
0
    return "base64_default_or_url";
2930
0
  case base64_default_or_url_accept_garbage:
2931
0
    return "base64_default_or_url_accept_garbage";
2932
0
  }
2933
0
  return "<unknown>"; // reached only when no case label above matched
2934
0
}
2935
  #endif // SIMDUTF_CPLUSPLUS17
2936
2937
// last_chunk_handling_options are used to specify the handling of the last
2938
// chunk in base64 decoding.
2939
// https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
2940
enum last_chunk_handling_options : uint64_t {
2941
  loose = 0,  /* standard base64 format, decode partial final chunk */
2942
  strict = 1, /* error when the last chunk is partial, 2 or 3 chars, and
2943
                 unpadded, or non-zero bit padding */
2944
  stop_before_partial =
2945
      2, /* if the last chunk is partial, ignore it (no error) */
2946
  only_full_chunks =
2947
      3 /* only decode full blocks (4 base64 characters, no padding) */
2948
};
2949
2950
inline bool is_partial(last_chunk_handling_options options) {
2951
  return (options == stop_before_partial) || (options == only_full_chunks);
2952
}
2953
2954
  #if SIMDUTF_CPLUSPLUS17
2955
0
inline std::string_view to_string(last_chunk_handling_options options) {
2956
0
  switch (options) {
2957
0
  case loose:
2958
0
    return "loose";
2959
0
  case strict:
2960
0
    return "strict";
2961
0
  case stop_before_partial:
2962
0
    return "stop_before_partial";
2963
0
  case only_full_chunks:
2964
0
    return "only_full_chunks";
2965
0
  }
2966
0
  return "<unknown>";
2967
0
}
2968
  #endif
2969
2970
/**
2971
 * Provide the maximal binary length in bytes given the base64 input.
2972
 * In general, if the input contains ASCII spaces, the result will be less than
2973
 * the maximum length.
2974
 *
2975
 * @param input         the base64 input to process
2976
 * @param length        the length of the base64 input in bytes
2977
 * @return maximum number of binary bytes
2978
 */
2979
simdutf_warn_unused size_t
2980
maximal_binary_length_from_base64(const char *input, size_t length) noexcept;
2981
  #if SIMDUTF_SPAN
2982
simdutf_really_inline simdutf_warn_unused size_t
2983
maximal_binary_length_from_base64(
2984
    const detail::input_span_of_byte_like auto &input) noexcept {
2985
  return maximal_binary_length_from_base64(
2986
      reinterpret_cast<const char *>(input.data()), input.size());
2987
}
2988
  #endif // SIMDUTF_SPAN
2989
2990
/**
2991
 * Provide the maximal binary length in bytes given the base64 input.
2992
 * In general, if the input contains ASCII spaces, the result will be less than
2993
 * the maximum length.
2994
 *
2995
 * @param input         the base64 input to process, in ASCII stored as 16-bit
2996
 * units
2997
 * @param length        the length of the base64 input in 16-bit units
2998
 * @return maximal number of binary bytes
2999
 */
3000
simdutf_warn_unused size_t maximal_binary_length_from_base64(
3001
    const char16_t *input, size_t length) noexcept;
3002
  #if SIMDUTF_SPAN
3003
simdutf_really_inline simdutf_warn_unused size_t
3004
0
maximal_binary_length_from_base64(std::span<const char16_t> input) noexcept {
3005
0
  return maximal_binary_length_from_base64(input.data(), input.size());
3006
0
}
3007
  #endif // SIMDUTF_SPAN
3008
3009
/**
3010
 * Convert a base64 input to a binary output.
3011
 *
3012
 * This function follows the WHATWG forgiving-base64 format, which means that it
3013
 * will ignore any ASCII spaces in the input. You may provide a padded input
3014
 * (with one or two equal signs at the end) or an unpadded input (without any
3015
 * equal signs at the end).
3016
 *
3017
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
3018
 *
3019
 * This function will fail in case of invalid input. When last_chunk_options =
3020
 * loose, there are two possible reasons for failure: the input contains a
3021
 * number of base64 characters that when divided by 4, leaves a single remainder
3022
 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
3023
 * not a valid base64 character (INVALID_BASE64_CHARACTER).
3024
 *
3025
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
3026
 * input where the invalid character was found. When the error is
3027
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
3028
 *
3029
 * The default option (simdutf::base64_default) expects the characters `+` and
3030
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
3031
 * characters `-` and `_` as part of its alphabet.
3032
 *
3033
 * The padding (`=`) is validated if present. There may be at most two padding
3034
 * characters at the end of the input. If there are any padding characters, the
3035
 * total number of characters (excluding spaces but including padding
3036
 * characters) must be divisible by four.
3037
 *
3038
 * You should call this function with a buffer that is at least
3039
 * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
3040
 * provide that much space, the function may cause a buffer overflow.
3041
 *
3042
 * Advanced users may want to tailor how the last chunk is handled. By default,
3043
 * we use a loose (forgiving) approach but we also support a strict approach
3044
 * as well as a stop_before_partial approach, as per the following proposal:
3045
 *
3046
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3047
 *
3048
 * @param input         the base64 string to process
3049
 * @param length        the length of the string in bytes
3050
 * @param output        the pointer to a buffer that can hold the conversion
3051
 * result (should be at least maximal_binary_length_from_base64(input, length)
3052
 * bytes long).
3053
 * @param options       the base64 options to use, usually base64_default or
3054
 * base64_url, and base64_default by default.
3055
 * @param last_chunk_options the last chunk handling options,
3056
 * last_chunk_handling_options::loose by default
3057
 * but can also be last_chunk_handling_options::strict or
3058
 * last_chunk_handling_options::stop_before_partial.
3059
 * @return a result pair struct (of type simdutf::result containing the two
3060
 * fields error and count) with an error code and either position of the error
3061
 * (in the input in bytes) if any, or the number of bytes written if successful.
3062
 */
3063
simdutf_warn_unused result base64_to_binary(
3064
    const char *input, size_t length, char *output,
3065
    base64_options options = base64_default,
3066
    last_chunk_handling_options last_chunk_options = loose) noexcept;
3067
  #if SIMDUTF_SPAN
3068
simdutf_really_inline simdutf_warn_unused result base64_to_binary(
3069
    const detail::input_span_of_byte_like auto &input,
3070
    detail::output_span_of_byte_like auto &&binary_output,
3071
    base64_options options = base64_default,
3072
    last_chunk_handling_options last_chunk_options = loose) noexcept {
3073
  return base64_to_binary(reinterpret_cast<const char *>(input.data()),
3074
                          input.size(),
3075
                          reinterpret_cast<char *>(binary_output.data()),
3076
                          options, last_chunk_options);
3077
}
3078
  #endif // SIMDUTF_SPAN
3079
3080
/**
3081
 * Provide the base64 length in bytes given the length of a binary input.
3082
 *
3083
 * @param length        the length of the input in bytes
3084
 * @return number of base64 bytes
3085
 */
3086
simdutf_warn_unused size_t base64_length_from_binary(
3087
    size_t length, base64_options options = base64_default) noexcept;
3088
3089
/**
3090
 * Provide the base64 length in bytes given the length of a binary input,
3091
 * taking into account line breaks.
3092
 *
3093
 * @param length        the length of the input in bytes
3094
 * @param line_length   the length of lines, must be at least 4 (otherwise it is
3095
 * interpreted as 4),
3096
 * @return number of base64 bytes
3097
 */
3098
simdutf_warn_unused size_t base64_length_from_binary_with_lines(
3099
    size_t length, base64_options options = base64_default,
3100
    size_t line_length = default_line_length) noexcept;
3101
3102
/**
3103
 * Convert a binary input to a base64 output.
3104
 *
3105
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
3106
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
3107
 * output to ensure that the output length is a multiple of four.
3108
 *
3109
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
3110
 * of its alphabet. No padding is added at the end of the output.
3111
 *
3112
 * This function always succeeds.
3113
 *
3114
 * @param input         the binary to process
3115
 * @param length        the length of the input in bytes
3116
 * @param output        the pointer to a buffer that can hold the conversion
3117
 * result (should be at least base64_length_from_binary(length) bytes long)
3118
 * @param options       the base64 options to use, can be base64_default or
3119
 * base64_url, is base64_default by default.
3120
 * @return number of written bytes, will be equal to
3121
 * base64_length_from_binary(length, options)
3122
 */
3123
size_t binary_to_base64(const char *input, size_t length, char *output,
3124
                        base64_options options = base64_default) noexcept;
3125
  #if SIMDUTF_SPAN
3126
simdutf_really_inline simdutf_warn_unused size_t
3127
binary_to_base64(const detail::input_span_of_byte_like auto &input,
3128
                 detail::output_span_of_byte_like auto &&binary_output,
3129
                 base64_options options = base64_default) noexcept {
3130
  return binary_to_base64(
3131
      reinterpret_cast<const char *>(input.data()), input.size(),
3132
      reinterpret_cast<char *>(binary_output.data()), options);
3133
}
3134
  #endif // SIMDUTF_SPAN
3135
3136
/**
3137
 * Convert a binary input to a base64 output with line breaks.
3138
 *
3139
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
3140
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
3141
 * output to ensure that the output length is a multiple of four.
3142
 *
3143
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
3144
 * of its alphabet. No padding is added at the end of the output.
3145
 *
3146
 * This function always succeeds.
3147
 *
3148
 * @param input         the binary to process
3149
 * @param length        the length of the input in bytes
3150
 * @param output        the pointer to a buffer that can hold the conversion
3151
 * result (should be at least base64_length_from_binary_with_lines(length,
3152
 * options, line_length) bytes long)
3153
 * @param line_length   the length of lines, must be at least 4 (otherwise it is
3154
 * interpreted as 4),
3155
 * @param options       the base64 options to use, can be base64_default or
3156
 * base64_url, is base64_default by default.
3157
 * @return number of written bytes, will be equal to
3158
 * base64_length_from_binary_with_lines(length, options, line_length)
3159
 */
3160
size_t
3161
binary_to_base64_with_lines(const char *input, size_t length, char *output,
3162
                            size_t line_length = simdutf::default_line_length,
3163
                            base64_options options = base64_default) noexcept;
3164
  #if SIMDUTF_SPAN
3165
simdutf_really_inline simdutf_warn_unused size_t binary_to_base64_with_lines(
3166
    const detail::input_span_of_byte_like auto &input,
3167
    detail::output_span_of_byte_like auto &&binary_output,
3168
    size_t line_length = simdutf::default_line_length,
3169
    base64_options options = base64_default) noexcept {
3170
  return binary_to_base64_with_lines(
3171
      reinterpret_cast<const char *>(input.data()), input.size(),
3172
      reinterpret_cast<char *>(binary_output.data()), line_length, options);
3173
}
3174
  #endif // SIMDUTF_SPAN
3175
3176
  #if SIMDUTF_ATOMIC_REF
3177
/**
3178
 * Convert a binary input to a base64 output, using atomic accesses.
3179
 * This function comes with a potentially significant performance
3180
 * penalty, but it may be useful in some cases where the input
3181
 * buffers are shared between threads, to avoid undefined
3182
 * behavior in case of data races.
3183
 *
3184
 * The function is for advanced users. Its main use case is
3185
 * to silence sanitizer warnings. We have no documented use case
3186
 * where this function is actually necessary in terms of practical correctness.
3187
 *
3188
 * This function is only available when simdutf is compiled with
3189
 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
3190
 * the availability of this function by checking the macro
3191
 * SIMDUTF_ATOMIC_REF.
3192
 *
3193
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
3194
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
3195
 * output to ensure that the output length is a multiple of four.
3196
 *
3197
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
3198
 * of its alphabet. No padding is added at the end of the output.
3199
 *
3200
 * This function always succeeds.
3201
 *
3202
 * This function is considered experimental. It is not tested by default
3203
 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
3204
 * It is not documented in the public API documentation (README). It is
3205
 * offered on a best effort basis. We rely on the community for further
3206
 * testing and feedback.
3207
 *
3208
 * @brief atomic_binary_to_base64
3209
 * @param input         the binary to process
3210
 * @param length        the length of the input in bytes
3211
 * @param output        the pointer to a buffer that can hold the conversion
3212
 * result (should be at least base64_length_from_binary(length) bytes long)
3213
 * @param options       the base64 options to use, can be base64_default or
3214
 * base64_url, is base64_default by default.
3215
 * @return number of written bytes, will be equal to
3216
 * base64_length_from_binary(length, options)
3217
 */
3218
size_t
3219
atomic_binary_to_base64(const char *input, size_t length, char *output,
3220
                        base64_options options = base64_default) noexcept;
3221
    #if SIMDUTF_SPAN
3222
simdutf_really_inline simdutf_warn_unused size_t
3223
atomic_binary_to_base64(const detail::input_span_of_byte_like auto &input,
3224
                        detail::output_span_of_byte_like auto &&binary_output,
3225
                        base64_options options = base64_default) noexcept {
3226
  return atomic_binary_to_base64(
3227
      reinterpret_cast<const char *>(input.data()), input.size(),
3228
      reinterpret_cast<char *>(binary_output.data()), options);
3229
}
3230
    #endif // SIMDUTF_SPAN
3231
  #endif   // SIMDUTF_ATOMIC_REF
3232
3233
/**
3234
 * Convert a base64 input to a binary output.
3235
 *
3236
 * This function follows the WHATWG forgiving-base64 format, which means that it
3237
 * will ignore any ASCII spaces in the input. You may provide a padded input
3238
 * (with one or two equal signs at the end) or an unpadded input (without any
3239
 * equal signs at the end).
3240
 *
3241
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
3242
 *
3243
 * This function will fail in case of invalid input. When last_chunk_options =
3244
 * loose, there are two possible reasons for failure: the input contains a
3245
 * number of base64 characters that when divided by 4, leaves a single remainder
3246
 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
3247
 * not a valid base64 character (INVALID_BASE64_CHARACTER).
3248
 *
3249
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
3250
 * input where the invalid character was found. When the error is
3251
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
3252
 *
3253
 * The default option (simdutf::base64_default) expects the characters `+` and
3254
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
3255
 * characters `-` and `_` as part of its alphabet.
3256
 *
3257
 * The padding (`=`) is validated if present. There may be at most two padding
3258
 * characters at the end of the input. If there are any padding characters, the
3259
 * total number of characters (excluding spaces but including padding
3260
 * characters) must be divisible by four.
3261
 *
3262
 * You should call this function with a buffer that is at least
3263
 * maximal_binary_length_from_base64(input, length) bytes long. If you fail
3264
 * to provide that much space, the function may cause a buffer overflow.
3265
 *
3266
 * Advanced users may want to tailor how the last chunk is handled. By default,
3267
 * we use a loose (forgiving) approach but we also support a strict approach
3268
 * as well as a stop_before_partial approach, as per the following proposal:
3269
 *
3270
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3271
 *
3272
 * @param input         the base64 string to process, in ASCII stored as 16-bit
3273
 * units
3274
 * @param length        the length of the string in 16-bit units
3275
 * @param output        the pointer to a buffer that can hold the conversion
3276
 * result (should be at least maximal_binary_length_from_base64(input, length)
3277
 * bytes long).
3278
 * @param options       the base64 options to use, can be base64_default or
3279
 * base64_url, is base64_default by default.
3280
 * @param last_chunk_options the last chunk handling options,
3281
 * last_chunk_handling_options::loose by default
3282
 * but can also be last_chunk_handling_options::strict or
3283
 * last_chunk_handling_options::stop_before_partial.
3284
 * @return a result pair struct (of type simdutf::result containing the two
3285
 * fields error and count) with an error code and position of the
3286
 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
3287
 * of bytes written if successful.
3288
 */
3289
simdutf_warn_unused result
3290
base64_to_binary(const char16_t *input, size_t length, char *output,
3291
                 base64_options options = base64_default,
3292
                 last_chunk_handling_options last_chunk_options =
3293
                     last_chunk_handling_options::loose) noexcept;
3294
  #if SIMDUTF_SPAN
3295
simdutf_really_inline simdutf_warn_unused result base64_to_binary(
3296
    std::span<const char16_t> input,
3297
    detail::output_span_of_byte_like auto &&binary_output,
3298
    base64_options options = base64_default,
3299
    last_chunk_handling_options last_chunk_options = loose) noexcept {
3300
  return base64_to_binary(input.data(), input.size(),
3301
                          reinterpret_cast<char *>(binary_output.data()),
3302
                          options, last_chunk_options);
3303
}
3304
  #endif // SIMDUTF_SPAN
3305
3306
/**
3307
 * Check if a character is an ignorable base64 character.
3308
 * Checking a large input, character by character, is not computationally
3309
 * efficient.
3310
 *
3311
 * @param input         the character to check
3312
 * @param options       the base64 options to use, is base64_default by default.
3313
 * @return true if the character is an ignorable base64 character, false
3314
 * otherwise.
3315
 */
3316
simdutf_warn_unused bool
3317
base64_ignorable(char input, base64_options options = base64_default) noexcept;
3318
simdutf_warn_unused bool
3319
base64_ignorable(char16_t input,
3320
                 base64_options options = base64_default) noexcept;
3321
3322
/**
3323
 * Check if a character is a valid base64 character.
3324
 * Checking a large input, character by character, is not computationally
3325
 * efficient.
3326
 * Note that padding characters are not considered valid base64 characters in
3327
 * this context, nor are spaces.
3328
 *
3329
 * @param input         the character to check
3330
 * @param options       the base64 options to use, is base64_default by default.
3331
 * @return true if the character is a base64 character, false otherwise.
3332
 */
3333
simdutf_warn_unused bool
3334
base64_valid(char input, base64_options options = base64_default) noexcept;
3335
simdutf_warn_unused bool
3336
base64_valid(char16_t input, base64_options options = base64_default) noexcept;
3337
3338
/**
3339
 * Check if a character is a valid base64 character or the padding character
3340
 * ('='). Checking a large input, character by character, is not computationally
3341
 * efficient.
3342
 *
3343
 * @param input         the character to check
3344
 * @param options       the base64 options to use, is base64_default by default.
3345
 * @return true if the character is a base64 character or '=', false otherwise.
3346
 */
3347
simdutf_warn_unused bool
3348
base64_valid_or_padding(char input,
3349
                        base64_options options = base64_default) noexcept;
3350
simdutf_warn_unused bool
3351
base64_valid_or_padding(char16_t input,
3352
                        base64_options options = base64_default) noexcept;
3353
3354
/**
3355
 * Convert a base64 input to a binary output.
3356
 *
3357
 * This function follows the WHATWG forgiving-base64 format, which means that it
3358
 * will ignore any ASCII spaces in the input. You may provide a padded input
3359
 * (with one or two equal signs at the end) or an unpadded input (without any
3360
 * equal signs at the end).
3361
 *
3362
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
3363
 *
3364
 * This function will fail in case of invalid input. When last_chunk_options =
3365
 * loose, there are three possible reasons for failure: the input contains a
3366
 * number of base64 characters that when divided by 4, leaves a single remainder
3367
 * character (BASE64_INPUT_REMAINDER), the input contains a character that is
3368
 * not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer
3369
 * is too small (OUTPUT_BUFFER_TOO_SMALL).
3370
 *
3371
 * When OUTPUT_BUFFER_TOO_SMALL, we return both the number of bytes written
3372
 * and the number of units processed, see description of the parameters and
3373
 * returned value.
3374
 *
3375
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
3376
 * input where the invalid character was found. When the error is
3377
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
3378
 *
3379
 * The default option (simdutf::base64_default) expects the characters `+` and
3380
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
3381
 * characters `-` and `_` as part of its alphabet.
3382
 *
3383
 * The padding (`=`) is validated if present. There may be at most two padding
3384
 * characters at the end of the input. If there are any padding characters, the
3385
 * total number of characters (excluding spaces but including padding
3386
 * characters) must be divisible by four.
3387
 *
3388
 * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected
3389
 * to discard the output unless the parameter decode_up_to_bad_char is set to
3390
 * true. In that case, the function will decode up to the first invalid
3391
 * character. Extra padding characters ('=') are considered invalid characters.
3392
 *
3393
 * Advanced users may want to tailor how the last chunk is handled. By default,
3394
 * we use a loose (forgiving) approach but we also support a strict approach
3395
 * as well as a stop_before_partial approach, as per the following proposal:
3396
 *
3397
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3398
 *
3399
 * @param input         the base64 string to process, in ASCII stored as 8-bit
3400
 * or 16-bit units
3401
 * @param length        the length of the string in 8-bit or 16-bit units.
3402
 * @param output        the pointer to a buffer that can hold the conversion
3403
 * result.
3404
 * @param outlen        the number of bytes that can be written in the output
3405
 * buffer. Upon return, it is modified to reflect how many bytes were written.
3406
 * @param options       the base64 options to use, can be base64_default or
3407
 * base64_url, is base64_default by default.
3408
 * @param last_chunk_options the last chunk handling options,
3409
 * last_chunk_handling_options::loose by default
3410
 * but can also be last_chunk_handling_options::strict or
3411
 * last_chunk_handling_options::stop_before_partial.
3412
 * @param decode_up_to_bad_char if true, the function will decode up to the
3413
 * first invalid character. By default (false), it is assumed that the output
3414
 * buffer is to be discarded. When there are multiple errors in the input,
3415
 * using decode_up_to_bad_char might trigger a different error.
3416
 * @return a result pair struct (of type simdutf::result containing the two
3417
 * fields error and count) with an error code and position of the
3418
 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
3419
 * of units processed if successful.
3420
 */
3421
simdutf_warn_unused result
3422
base64_to_binary_safe(const char *input, size_t length, char *output,
3423
                      size_t &outlen, base64_options options = base64_default,
3424
                      last_chunk_handling_options last_chunk_options =
3425
                          last_chunk_handling_options::loose,
3426
                      bool decode_up_to_bad_char = false) noexcept;
3427
  #if SIMDUTF_SPAN
3428
/**
3429
 * @brief span overload
3430
 * @return a tuple of result and outlen
3431
 */
3432
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
3433
base64_to_binary_safe(const detail::input_span_of_byte_like auto &input,
3434
                      detail::output_span_of_byte_like auto &&binary_output,
3435
                      base64_options options = base64_default,
3436
                      last_chunk_handling_options last_chunk_options = loose,
3437
                      bool decode_up_to_bad_char = false) noexcept {
3438
  size_t outlen = binary_output.size();
3439
  auto r = base64_to_binary_safe(
3440
      reinterpret_cast<const char *>(input.data()), input.size(),
3441
      reinterpret_cast<char *>(binary_output.data()), outlen, options,
3442
      last_chunk_options, decode_up_to_bad_char);
3443
  return {r, outlen};
3444
}
3445
  #endif // SIMDUTF_SPAN
3446
3447
/**
 * Overload of base64_to_binary_safe taking the base64 input as char16_t
 * elements. The parameter list and the default arguments otherwise match the
 * const char * overload declared earlier in this header; refer to its
 * documentation for the meaning of each parameter.
 *
 * NOTE(review): length is presumably counted in char16_t units rather than
 * bytes for this overload; confirm against the implementation.
 */
simdutf_warn_unused result base64_to_binary_safe(
    const char16_t *input, size_t length, char *output, size_t &outlen,
    base64_options options = base64_default,
    last_chunk_handling_options last_chunk_options =
        last_chunk_handling_options::loose,
    bool decode_up_to_bad_char = false) noexcept;
3453
  #if SIMDUTF_SPAN
3454
/**
3455
 * @brief span overload
3456
 * @return a tuple of result and outlen
3457
 */
3458
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
3459
base64_to_binary_safe(std::span<const char16_t> input,
3460
                      detail::output_span_of_byte_like auto &&binary_output,
3461
                      base64_options options = base64_default,
3462
                      last_chunk_handling_options last_chunk_options = loose,
3463
                      bool decode_up_to_bad_char = false) noexcept {
3464
  size_t outlen = binary_output.size();
3465
  auto r = base64_to_binary_safe(input.data(), input.size(),
3466
                                 reinterpret_cast<char *>(binary_output.data()),
3467
                                 outlen, options, last_chunk_options,
3468
                                 decode_up_to_bad_char);
3469
  return {r, outlen};
3470
}
3471
  #endif // SIMDUTF_SPAN
3472
3473
  #if SIMDUTF_ATOMIC_REF
3474
/**
3475
 * Convert a base64 input to a binary output with a size limit and using atomic
3476
 * operations.
3477
 *
3478
 * Like `base64_to_binary_safe` but using atomic operations, this function is
3479
 * thread-safe for concurrent memory access, allowing the output
3480
 * buffers to be shared between threads without undefined behavior in case of
3481
 * data races.
3482
 *
3483
 * This function comes with a potentially significant performance penalty, but
3484
 * is useful when thread safety is needed during base64 decoding.
3485
 *
3486
 * This function is only available when simdutf is compiled with
3487
 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
3488
 * the availability of this function by checking the macro
3489
 * SIMDUTF_ATOMIC_REF.
3490
 *
3491
 * This function is considered experimental. It is not tested by default
3492
 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
3493
 * It is not documented in the public API documentation (README). It is
3494
 * offered on a best effort basis. We rely on the community for further
3495
 * testing and feedback.
3496
 *
3497
 * @param input         the base64 input to decode
3498
 * @param length        the length of the input in bytes
3499
 * @param output        the pointer to buffer that can hold the conversion
3500
 * result
3501
 * @param outlen        the number of bytes that can be written in the output
3502
 * buffer. Upon return, it is modified to reflect how many bytes were written.
3503
 * @param options       the base64 options to use (default, url, etc.)
3504
 * @param last_chunk_options the last chunk handling options (loose, strict,
3505
 * stop_before_partial)
3506
 * @param decode_up_to_bad_char if true, the function will decode up to the
3507
 * first invalid character. By default (false), it is assumed that the output
3508
 * buffer is to be discarded. When there are multiple errors in the input,
3509
 * using decode_up_to_bad_char might trigger a different error.
3510
 * @return a result struct with an error code and count indicating error
3511
 * position or success
3512
 */
3513
simdutf_warn_unused result atomic_base64_to_binary_safe(
3514
    const char *input, size_t length, char *output, size_t &outlen,
3515
    base64_options options = base64_default,
3516
    last_chunk_handling_options last_chunk_options =
3517
        last_chunk_handling_options::loose,
3518
    bool decode_up_to_bad_char = false) noexcept;
3519
/**
 * Overload of atomic_base64_to_binary_safe taking the base64 input as
 * char16_t elements. The remaining parameters match the const char * overload
 * declared just above; refer to its documentation for their meaning.
 *
 * NOTE(review): length is presumably counted in char16_t units rather than
 * bytes for this overload; confirm against the implementation.
 */
simdutf_warn_unused result
atomic_base64_to_binary_safe(const char16_t *input, size_t length,
                             char *output, size_t &outlen,
                             base64_options options = base64_default,
                             last_chunk_handling_options last_chunk_options =
                                 loose,
                             bool decode_up_to_bad_char = false) noexcept;
3524
    #if SIMDUTF_SPAN
3525
/**
3526
 * @brief span overload
3527
 * @return a tuple of result and outlen
3528
 */
3529
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
3530
atomic_base64_to_binary_safe(
3531
    const detail::input_span_of_byte_like auto &binary_input,
3532
    detail::output_span_of_byte_like auto &&output,
3533
    base64_options options = base64_default,
3534
    last_chunk_handling_options last_chunk_options =
3535
        last_chunk_handling_options::loose,
3536
    bool decode_up_to_bad_char = false) noexcept {
3537
  size_t outlen = output.size();
3538
  auto ret = atomic_base64_to_binary_safe(
3539
      reinterpret_cast<const char *>(binary_input.data()), binary_input.size(),
3540
      reinterpret_cast<char *>(output.data()), outlen, options,
3541
      last_chunk_options, decode_up_to_bad_char);
3542
  return {ret, outlen};
3543
}
3544
/**
3545
 * @brief span overload
3546
 * @return a tuple of result and outlen
3547
 */
3548
simdutf_warn_unused std::tuple<result, std::size_t>
3549
atomic_base64_to_binary_safe(
3550
    std::span<const char16_t> base64_input,
3551
    detail::output_span_of_byte_like auto &&binary_output,
3552
    base64_options options = base64_default,
3553
    last_chunk_handling_options last_chunk_options = loose,
3554
    bool decode_up_to_bad_char = false) noexcept {
3555
  size_t outlen = binary_output.size();
3556
  auto ret = atomic_base64_to_binary_safe(
3557
      base64_input.data(), base64_input.size(),
3558
      reinterpret_cast<char *>(binary_output.data()), outlen, options,
3559
      last_chunk_options, decode_up_to_bad_char);
3560
  return {ret, outlen};
3561
}
3562
    #endif // SIMDUTF_SPAN
3563
  #endif   // SIMDUTF_ATOMIC_REF
3564
3565
/**
3566
 * Find the first occurrence of a character in a string. If the character is
3567
 * not found, return a pointer to the end of the string.
3568
 * @param start        the start of the string
3569
 * @param end          the end of the string
3570
 * @param character    the character to find
3571
 * @return a pointer to the first occurrence of the character in the string,
3572
 * or a pointer to the end of the string if the character is not found.
3573
 *
3574
 */
3575
simdutf_warn_unused const char *find(const char *start, const char *end,
3576
                                     char character) noexcept;
3577
/**
 * char16_t counterpart of find(const char *, const char *, char): same
 * parameter roles, operating on char16_t elements.
 *
 * NOTE(review): presumably returns end when the character is absent, like the
 * char overload; confirm against the implementation.
 */
simdutf_warn_unused const char16_t *find(const char16_t *start,
                                         const char16_t *end,
                                         char16_t character) noexcept;
3579
#endif // SIMDUTF_FEATURE_BASE64
3580
3581
/**
3582
 * An implementation of simdutf for a particular CPU architecture.
3583
 *
3584
 * Also used to maintain the currently active implementation. The active
3585
 * implementation is automatically initialized on first use to the most advanced
3586
 * implementation supported by the host.
3587
 */
3588
class implementation {
3589
public:
3590
  /**
3591
   * The name of this implementation.
3592
   *
3593
   *     const implementation *impl = simdutf::active_implementation;
3594
   *     cout << "simdutf is optimized for " << impl->name() << "(" <<
3595
   * impl->description() << ")" << endl;
3596
   *
3597
   * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
3598
   */
3599
  virtual std::string name() const { return std::string(_name); }
3600
3601
  /**
3602
   * The description of this implementation.
3603
   *
3604
   *     const implementation *impl = simdutf::active_implementation;
3605
   *     cout << "simdutf is optimized for " << impl->name() << "(" <<
3606
   * impl->description() << ")" << endl;
3607
   *
3608
   * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
3609
   */
3610
  virtual std::string description() const { return std::string(_description); }
3611
3612
  /**
3613
   * The instruction sets this implementation is compiled against
3614
   * and the current CPU match. This function may poll the current CPU/system
3615
   * and should therefore not be called too often if performance is a concern.
3616
   *
3617
   *
3618
   * @return true if the implementation can be safely used on the current system
3619
   * (determined at runtime)
3620
   */
3621
  bool supported_by_runtime_system() const;
3622
3623
#if SIMDUTF_FEATURE_DETECT_ENCODING
3624
  /**
3625
   * This function will try to detect the encoding
3626
   * @param input the string to identify
3627
   * @param length the length of the string in bytes.
3628
   * @return the encoding type detected
3629
   */
3630
  virtual encoding_type autodetect_encoding(const char *input,
3631
                                            size_t length) const noexcept;
3632
3633
  /**
3634
   * This function will try to detect the possible encodings in one pass
3635
   * @param input the string to identify
3636
   * @param length the length of the string in bytes.
3637
   * @return the encoding type detected
3638
   */
3639
  virtual int detect_encodings(const char *input,
3640
                               size_t length) const noexcept = 0;
3641
#endif // SIMDUTF_FEATURE_DETECT_ENCODING
3642
3643
  /**
3644
   * @private For internal implementation use
3645
   *
3646
   * The instruction sets this implementation is compiled against.
3647
   *
3648
   * @return a mask of all required `internal::instruction_set::` values
3649
   */
3650
  virtual uint32_t required_instruction_sets() const {
3651
    return _required_instruction_sets;
3652
  }
3653
3654
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
3655
  /**
3656
   * Validate the UTF-8 string.
3657
   *
3658
   * Overridden by each implementation.
3659
   *
3660
   * @param buf the UTF-8 string to validate.
3661
   * @param len the length of the string in bytes.
3662
   * @return true if and only if the string is valid UTF-8.
3663
   */
3664
  simdutf_warn_unused virtual bool validate_utf8(const char *buf,
3665
                                                 size_t len) const noexcept = 0;
3666
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
3667
3668
#if SIMDUTF_FEATURE_UTF8
3669
  /**
3670
   * Validate the UTF-8 string and stop on errors.
3671
   *
3672
   * Overridden by each implementation.
3673
   *
3674
   * @param buf the UTF-8 string to validate.
3675
   * @param len the length of the string in bytes.
3676
   * @return a result pair struct (of type simdutf::result containing the two
3677
   * fields error and count) with an error code and either position of the error
3678
   * (in the input in code units) if any, or the number of code units validated
3679
   * if successful.
3680
   */
3681
  simdutf_warn_unused virtual result
3682
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0;
3683
#endif // SIMDUTF_FEATURE_UTF8
3684
3685
#if SIMDUTF_FEATURE_ASCII
3686
  /**
3687
   * Validate the ASCII string.
3688
   *
3689
   * Overridden by each implementation.
3690
   *
3691
   * @param buf the ASCII string to validate.
3692
   * @param len the length of the string in bytes.
3693
   * @return true if and only if the string is valid ASCII.
3694
   */
3695
  simdutf_warn_unused virtual bool
3696
  validate_ascii(const char *buf, size_t len) const noexcept = 0;
3697
3698
  /**
3699
   * Validate the ASCII string and stop on error.
3700
   *
3701
   * Overridden by each implementation.
3702
   *
3703
   * @param buf the ASCII string to validate.
3704
   * @param len the length of the string in bytes.
3705
   * @return a result pair struct (of type simdutf::result containing the two
3706
   * fields error and count) with an error code and either position of the error
3707
   * (in the input in code units) if any, or the number of code units validated
3708
   * if successful.
3709
   */
3710
  simdutf_warn_unused virtual result
3711
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0;
3712
3713
#endif // SIMDUTF_FEATURE_ASCII
3714
3715
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
3716
  /**
3717
   * Validate the ASCII string as a UTF-16BE sequence.
3718
   * An UTF-16 sequence is considered an ASCII sequence
3719
   * if it could be converted to an ASCII string losslessly.
3720
   *
3721
   * Overridden by each implementation.
3722
   *
3723
   * @param buf the UTF-16BE string to validate.
3724
   * @param len the length of the string in bytes.
3725
   * @return true if and only if the string is valid ASCII.
3726
   */
3727
  simdutf_warn_unused virtual bool
3728
  validate_utf16be_as_ascii(const char16_t *buf, size_t len) const noexcept = 0;
3729
3730
  /**
3731
   * Validate the ASCII string as a UTF-16LE sequence.
3732
   * An UTF-16 sequence is considered an ASCII sequence
3733
   * if it could be converted to an ASCII string losslessly.
3734
   *
3735
   * Overridden by each implementation.
3736
   *
3737
   * @param buf the UTF-16LE string to validate.
3738
   * @param len the length of the string in bytes.
3739
   * @return true if and only if the string is valid ASCII.
3740
   */
3741
  simdutf_warn_unused virtual bool
3742
  validate_utf16le_as_ascii(const char16_t *buf, size_t len) const noexcept = 0;
3743
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
3744
3745
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
3746
  /**
3747
   * Validate the UTF-16LE string.This function may be best when you expect
3748
   * the input to be almost always valid. Otherwise, consider using
3749
   * validate_utf16le_with_errors.
3750
   *
3751
   * Overridden by each implementation.
3752
   *
3753
   * This function is not BOM-aware.
3754
   *
3755
   * @param buf the UTF-16LE string to validate.
3756
   * @param len the length of the string in number of 2-byte code units
3757
   * (char16_t).
3758
   * @return true if and only if the string is valid UTF-16LE.
3759
   */
3760
  simdutf_warn_unused virtual bool
3761
  validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0;
3762
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
3763
3764
#if SIMDUTF_FEATURE_UTF16
3765
  /**
3766
   * Validate the UTF-16BE string. This function may be best when you expect
3767
   * the input to be almost always valid. Otherwise, consider using
3768
   * validate_utf16be_with_errors.
3769
   *
3770
   * Overridden by each implementation.
3771
   *
3772
   * This function is not BOM-aware.
3773
   *
3774
   * @param buf the UTF-16BE string to validate.
3775
   * @param len the length of the string in number of 2-byte code units
3776
   * (char16_t).
3777
   * @return true if and only if the string is valid UTF-16BE.
3778
   */
3779
  simdutf_warn_unused virtual bool
3780
  validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0;
3781
3782
  /**
3783
   * Validate the UTF-16LE string and stop on error.  It might be faster than
3784
   * validate_utf16le when an error is expected to occur early.
3785
   *
3786
   * Overridden by each implementation.
3787
   *
3788
   * This function is not BOM-aware.
3789
   *
3790
   * @param buf the UTF-16LE string to validate.
3791
   * @param len the length of the string in number of 2-byte code units
3792
   * (char16_t).
3793
   * @return a result pair struct (of type simdutf::result containing the two
3794
   * fields error and count) with an error code and either position of the error
3795
   * (in the input in code units) if any, or the number of code units validated
3796
   * if successful.
3797
   */
3798
  simdutf_warn_unused virtual result
3799
  validate_utf16le_with_errors(const char16_t *buf,
3800
                               size_t len) const noexcept = 0;
3801
3802
  /**
3803
   * Validate the UTF-16BE string and stop on error. It might be faster than
3804
   * validate_utf16be when an error is expected to occur early.
3805
   *
3806
   * Overridden by each implementation.
3807
   *
3808
   * This function is not BOM-aware.
3809
   *
3810
   * @param buf the UTF-16BE string to validate.
3811
   * @param len the length of the string in number of 2-byte code units
3812
   * (char16_t).
3813
   * @return a result pair struct (of type simdutf::result containing the two
3814
   * fields error and count) with an error code and either position of the error
3815
   * (in the input in code units) if any, or the number of code units validated
3816
   * if successful.
3817
   */
3818
  simdutf_warn_unused virtual result
3819
  validate_utf16be_with_errors(const char16_t *buf,
3820
                               size_t len) const noexcept = 0;
3821
  /**
3822
   * Copies the UTF-16LE string while replacing mismatched surrogates with the
3823
   * Unicode replacement character U+FFFD. We allow the input and output to be
3824
   * the same buffer so that the correction is done in-place.
3825
   *
3826
   * Overridden by each implementation.
3827
   *
3828
   * @param input the UTF-16LE string to correct.
3829
   * @param len the length of the string in number of 2-byte code units
3830
   * (char16_t).
3831
   * @param output the output buffer.
3832
   */
3833
  virtual void to_well_formed_utf16le(const char16_t *input, size_t len,
3834
                                      char16_t *output) const noexcept = 0;
3835
  /**
3836
   * Copies the UTF-16BE string while replacing mismatched surrogates with the
3837
   * Unicode replacement character U+FFFD. We allow the input and output to be
3838
   * the same buffer so that the correction is done in-place.
3839
   *
3840
   * Overridden by each implementation.
3841
   *
3842
   * @param input the UTF-16BE string to correct.
3843
   * @param len the length of the string in number of 2-byte code units
3844
   * (char16_t).
3845
   * @param output the output buffer.
3846
   */
3847
  virtual void to_well_formed_utf16be(const char16_t *input, size_t len,
3848
                                      char16_t *output) const noexcept = 0;
3849
#endif // SIMDUTF_FEATURE_UTF16
3850
3851
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
3852
  /**
3853
   * Validate the UTF-32 string.
3854
   *
3855
   * Overridden by each implementation.
3856
   *
3857
   * This function is not BOM-aware.
3858
   *
3859
   * @param buf the UTF-32 string to validate.
3860
   * @param len the length of the string in number of 4-byte code units
3861
   * (char32_t).
3862
   * @return true if and only if the string is valid UTF-32.
3863
   */
3864
  simdutf_warn_unused virtual bool
3865
  validate_utf32(const char32_t *buf, size_t len) const noexcept = 0;
3866
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
3867
3868
#if SIMDUTF_FEATURE_UTF32
3869
  /**
3870
   * Validate the UTF-32 string and stop on error.
3871
   *
3872
   * Overridden by each implementation.
3873
   *
3874
   * This function is not BOM-aware.
3875
   *
3876
   * @param buf the UTF-32 string to validate.
3877
   * @param len the length of the string in number of 4-byte code units
3878
   * (char32_t).
3879
   * @return a result pair struct (of type simdutf::result containing the two
3880
   * fields error and count) with an error code and either position of the error
3881
   * (in the input in code units) if any, or the number of code units validated
3882
   * if successful.
3883
   */
3884
  simdutf_warn_unused virtual result
3885
  validate_utf32_with_errors(const char32_t *buf,
3886
                             size_t len) const noexcept = 0;
3887
#endif // SIMDUTF_FEATURE_UTF32
3888
3889
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
3890
  /**
3891
   * Convert Latin1 string into UTF-8 string.
3892
   *
3893
   * This function is suitable to work with inputs from untrusted sources.
3894
   *
3895
   * @param input         the Latin1 string to convert
3896
   * @param length        the length of the string in bytes
3897
   * @param utf8_output  the pointer to buffer that can hold conversion result
3898
   * @return the number of written char; 0 if conversion is not possible
3899
   */
3900
  simdutf_warn_unused virtual size_t
3901
  convert_latin1_to_utf8(const char *input, size_t length,
3902
                         char *utf8_output) const noexcept = 0;
3903
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
3904
3905
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
3906
  /**
3907
   * Convert possibly Latin1 string into UTF-16LE string.
3908
   *
3909
   * This function is suitable to work with inputs from untrusted sources.
3910
   *
3911
   * @param input         the Latin1  string to convert
3912
   * @param length        the length of the string in bytes
3913
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
3914
   * @return the number of written char16_t; 0 if conversion is not possible
3915
   */
3916
  simdutf_warn_unused virtual size_t
3917
  convert_latin1_to_utf16le(const char *input, size_t length,
3918
                            char16_t *utf16_output) const noexcept = 0;
3919
3920
  /**
3921
   * Convert Latin1 string into UTF-16BE string.
3922
   *
3923
   * This function is suitable to work with inputs from untrusted sources.
3924
   *
3925
   * @param input         the Latin1 string to convert
3926
   * @param length        the length of the string in bytes
3927
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
3928
   * @return the number of written char16_t; 0 if conversion is not possible
3929
   */
3930
  simdutf_warn_unused virtual size_t
3931
  convert_latin1_to_utf16be(const char *input, size_t length,
3932
                            char16_t *utf16_output) const noexcept = 0;
3933
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
3934
3935
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
3936
  /**
3937
   * Convert Latin1 string into UTF-32 string.
3938
   *
3939
   * This function is suitable to work with inputs from untrusted sources.
3940
   *
3941
   * @param input         the Latin1 string to convert
3942
   * @param length        the length of the string in bytes
3943
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
3944
   * @return the number of written char32_t; 0 if conversion is not possible
3945
   */
3946
  simdutf_warn_unused virtual size_t
3947
  convert_latin1_to_utf32(const char *input, size_t length,
3948
                          char32_t *utf32_buffer) const noexcept = 0;
3949
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
3950
3951
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
3952
  /**
3953
   * Convert possibly broken UTF-8 string into latin1 string.
3954
   *
3955
   * During the conversion also validation of the input string is done.
3956
   * This function is suitable to work with inputs from untrusted sources.
3957
   *
3958
   * @param input         the UTF-8 string to convert
3959
   * @param length        the length of the string in bytes
3960
   * @param latin1_output  the pointer to buffer that can hold conversion result
3961
   * @return the number of written char; 0 if the input was not valid UTF-8
3962
   * string or if it cannot be represented as Latin1
3963
   */
3964
  simdutf_warn_unused virtual size_t
3965
  convert_utf8_to_latin1(const char *input, size_t length,
3966
                         char *latin1_output) const noexcept = 0;
3967
3968
  /**
3969
   * Convert possibly broken UTF-8 string into latin1 string with errors.
3970
   * If the string cannot be represented as Latin1, an error
3971
   * code is returned.
3972
   *
3973
   * During the conversion also validation of the input string is done.
3974
   * This function is suitable to work with inputs from untrusted sources.
3975
   *
3976
   * @param input         the UTF-8 string to convert
3977
   * @param length        the length of the string in bytes
3978
   * @param latin1_output  the pointer to buffer that can hold conversion result
3979
   * @return a result pair struct (of type simdutf::result containing the two
3980
   * fields error and count) with an error code and either position of the error
3981
   * (in the input in code units) if any, or the number of code units validated
3982
   * if successful.
3983
   */
3984
  simdutf_warn_unused virtual result
3985
  convert_utf8_to_latin1_with_errors(const char *input, size_t length,
3986
                                     char *latin1_output) const noexcept = 0;
3987
3988
  /**
3989
   * Convert valid UTF-8 string into latin1 string.
3990
   *
3991
   * This function assumes that the input string is valid UTF-8 and that it can
3992
   * be represented as Latin1. If you violate this assumption, the result is
3993
   * implementation defined and may include system-dependent behavior such as
3994
   * crashes.
3995
   *
3996
   * This function is for expert users only and not part of our public API. Use
3997
   * convert_utf8_to_latin1 instead.
3998
   *
3999
   * This function is not BOM-aware.
4000
   *
4001
   * @param input         the UTF-8 string to convert
4002
   * @param length        the length of the string in bytes
4003
   * @param latin1_output  the pointer to buffer that can hold conversion result
4004
   * @return the number of written char; 0 if the input was not valid UTF-8
4005
   * string
4006
   */
4007
  simdutf_warn_unused virtual size_t
4008
  convert_valid_utf8_to_latin1(const char *input, size_t length,
4009
                               char *latin1_output) const noexcept = 0;
4010
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4011
4012
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4013
  /**
4014
   * Convert possibly broken UTF-8 string into UTF-16LE string.
4015
   *
4016
   * During the conversion also validation of the input string is done.
4017
   * This function is suitable to work with inputs from untrusted sources.
4018
   *
4019
   * @param input         the UTF-8 string to convert
4020
   * @param length        the length of the string in bytes
4021
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
4022
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
4023
   * string
4024
   */
4025
  simdutf_warn_unused virtual size_t
4026
  convert_utf8_to_utf16le(const char *input, size_t length,
4027
                          char16_t *utf16_output) const noexcept = 0;
4028
4029
  /**
4030
   * Convert possibly broken UTF-8 string into UTF-16BE string.
4031
   *
4032
   * During the conversion also validation of the input string is done.
4033
   * This function is suitable to work with inputs from untrusted sources.
4034
   *
4035
   * @param input         the UTF-8 string to convert
4036
   * @param length        the length of the string in bytes
4037
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
4038
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
4039
   * string
4040
   */
4041
  simdutf_warn_unused virtual size_t
4042
  convert_utf8_to_utf16be(const char *input, size_t length,
4043
                          char16_t *utf16_output) const noexcept = 0;
4044
4045
  /**
4046
   * Convert possibly broken UTF-8 string into UTF-16LE string and stop on
4047
   * error.
4048
   *
4049
   * During the conversion also validation of the input string is done.
4050
   * This function is suitable to work with inputs from untrusted sources.
4051
   *
4052
   * @param input         the UTF-8 string to convert
4053
   * @param length        the length of the string in bytes
4054
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
4055
   * @return a result pair struct (of type simdutf::result containing the two
4056
   * fields error and count) with an error code and either position of the error
4057
   * (in the input in code units) if any, or the number of code units validated
4058
   * if successful.
4059
   */
4060
  simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(
4061
      const char *input, size_t length,
4062
      char16_t *utf16_output) const noexcept = 0;
4063
4064
  /**
4065
   * Convert possibly broken UTF-8 string into UTF-16BE string and stop on
4066
   * error.
4067
   *
4068
   * During the conversion also validation of the input string is done.
4069
   * This function is suitable to work with inputs from untrusted sources.
4070
   *
4071
   * @param input         the UTF-8 string to convert
4072
   * @param length        the length of the string in bytes
4073
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
4074
   * @return a result pair struct (of type simdutf::result containing the two
4075
   * fields error and count) with an error code and either position of the error
4076
   * (in the input in code units) if any, or the number of code units validated
4077
   * if successful.
4078
   */
4079
  simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(
4080
      const char *input, size_t length,
4081
      char16_t *utf16_output) const noexcept = 0;
4082
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4083
4084
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4085
  /**
4086
   * Convert possibly broken UTF-8 string into UTF-32 string.
4087
   *
4088
   * During the conversion also validation of the input string is done.
4089
   * This function is suitable to work with inputs from untrusted sources.
4090
   *
4091
   * @param input         the UTF-8 string to convert
4092
   * @param length        the length of the string in bytes
4093
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
4094
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
4095
   * string
4096
   */
4097
  simdutf_warn_unused virtual size_t
4098
  convert_utf8_to_utf32(const char *input, size_t length,
4099
                        char32_t *utf32_output) const noexcept = 0;
4100
4101
  /**
4102
   * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
4103
   *
4104
   * During the conversion also validation of the input string is done.
4105
   * This function is suitable to work with inputs from untrusted sources.
4106
   *
4107
   * @param input         the UTF-8 string to convert
4108
   * @param length        the length of the string in bytes
4109
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
4110
   * @return a result pair struct (of type simdutf::result containing the two
4111
   * fields error and count) with an error code and either position of the error
4112
   * (in the input in code units) if any, or the number of char32_t written if
4113
   * successful.
4114
   */
4115
  simdutf_warn_unused virtual result
4116
  convert_utf8_to_utf32_with_errors(const char *input, size_t length,
4117
                                    char32_t *utf32_output) const noexcept = 0;
4118
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4119
4120
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4121
  /**
4122
   * Convert valid UTF-8 string into UTF-16LE string.
4123
   *
4124
   * This function assumes that the input string is valid UTF-8.
4125
   *
4126
   * @param input         the UTF-8 string to convert
4127
   * @param length        the length of the string in bytes
4128
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
4129
   * @return the number of written char16_t
4130
   */
4131
  simdutf_warn_unused virtual size_t
4132
  convert_valid_utf8_to_utf16le(const char *input, size_t length,
4133
                                char16_t *utf16_buffer) const noexcept = 0;
4134
4135
  /**
4136
   * Convert valid UTF-8 string into UTF-16BE string.
4137
   *
4138
   * This function assumes that the input string is valid UTF-8.
4139
   *
4140
   * @param input         the UTF-8 string to convert
4141
   * @param length        the length of the string in bytes
4142
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
4143
   * @return the number of written char16_t
4144
   */
4145
  simdutf_warn_unused virtual size_t
4146
  convert_valid_utf8_to_utf16be(const char *input, size_t length,
4147
                                char16_t *utf16_buffer) const noexcept = 0;
4148
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4149
4150
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4151
  /**
4152
   * Convert valid UTF-8 string into UTF-32 string.
4153
   *
4154
   * This function assumes that the input string is valid UTF-8.
4155
   *
4156
   * @param input         the UTF-8 string to convert
4157
   * @param length        the length of the string in bytes
4158
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
4159
   * @return the number of written char32_t
4160
   */
4161
  simdutf_warn_unused virtual size_t
4162
  convert_valid_utf8_to_utf32(const char *input, size_t length,
4163
                              char32_t *utf32_buffer) const noexcept = 0;
4164
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4165
4166
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4167
  /**
4168
   * Compute the number of 2-byte code units that this UTF-8 string would
4169
   * require in UTF-16LE format.
4170
   *
4171
   * This function does not validate the input. It is acceptable to pass invalid
4172
   * UTF-8 strings but in such cases the result is implementation defined.
4173
   *
4174
   * @param input         the UTF-8 string to process
4175
   * @param length        the length of the string in bytes
4176
   * @return the number of char16_t code units required to encode the UTF-8
4177
   * string as UTF-16LE
4178
   */
4179
  simdutf_warn_unused virtual size_t
4180
  utf16_length_from_utf8(const char *input, size_t length) const noexcept = 0;
4181
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4182
4183
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4184
  /**
4185
   * Compute the number of 4-byte code units that this UTF-8 string would
4186
   * require in UTF-32 format.
4187
   *
4188
   * This function is equivalent to count_utf8. It is acceptable to pass invalid
4189
   * UTF-8 strings but in such cases the result is implementation defined.
4190
   *
4191
   * This function does not validate the input.
4192
   *
4193
   * @param input         the UTF-8 string to process
4194
   * @param length        the length of the string in bytes
4195
   * @return the number of char32_t code units required to encode the UTF-8
4196
   * string as UTF-32
4197
   */
4198
  simdutf_warn_unused virtual size_t
4199
  utf32_length_from_utf8(const char *input, size_t length) const noexcept = 0;
4200
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4201
4202
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4203
  /**
4204
   * Convert possibly broken UTF-16LE string into Latin1 string.
4205
   *
4206
   * During the conversion also validation of the input string is done.
4207
   * This function is suitable to work with inputs from untrusted sources.
4208
   *
4209
   * This function is not BOM-aware.
4210
   *
4211
   * @param input         the UTF-16LE string to convert
4212
   * @param length        the length of the string in 2-byte code units
4213
   * (char16_t)
4214
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4215
   * result
4216
   * @return number of written code units; 0 if input is not a valid UTF-16LE
4217
   * string or if it cannot be represented as Latin1
4218
   */
4219
  simdutf_warn_unused virtual size_t
4220
  convert_utf16le_to_latin1(const char16_t *input, size_t length,
4221
                            char *latin1_buffer) const noexcept = 0;
4222
4223
  /**
4224
   * Convert possibly broken UTF-16BE string into Latin1 string.
4225
   *
4226
   * During the conversion also validation of the input string is done.
4227
   * This function is suitable to work with inputs from untrusted sources.
4228
   *
4229
   * This function is not BOM-aware.
4230
   *
4231
   * @param input         the UTF-16BE string to convert
4232
   * @param length        the length of the string in 2-byte code units
4233
   * (char16_t)
4234
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4235
   * result
4236
   * @return number of written code units; 0 if input is not a valid UTF-16BE
4237
   * string or if it cannot be represented as Latin1
4238
   */
4239
  simdutf_warn_unused virtual size_t
4240
  convert_utf16be_to_latin1(const char16_t *input, size_t length,
4241
                            char *latin1_buffer) const noexcept = 0;
4242
4243
  /**
4244
   * Convert possibly broken UTF-16LE string into Latin1 string.
4245
   * If the string cannot be represented as Latin1, an error
4246
   * is returned.
4247
   *
4248
   * During the conversion also validation of the input string is done.
4249
   * This function is suitable to work with inputs from untrusted sources.
4250
   * This function is not BOM-aware.
4251
   *
4252
   * @param input         the UTF-16LE string to convert
4253
   * @param length        the length of the string in 2-byte code units
4254
   * (char16_t)
4255
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4256
   * result
4257
   * @return a result pair struct (of type simdutf::result containing the two
4258
   * fields error and count) with an error code and either position of the error
4259
   * (in the input in code units) if any, or the number of char written if
4260
   * successful.
4261
   */
4262
  simdutf_warn_unused virtual result
4263
  convert_utf16le_to_latin1_with_errors(const char16_t *input, size_t length,
4264
                                        char *latin1_buffer) const noexcept = 0;
4265
4266
  /**
4267
   * Convert possibly broken UTF-16BE string into Latin1 string.
4268
   * If the string cannot be represented as Latin1, an error
4269
   * is returned.
4270
   *
4271
   * During the conversion also validation of the input string is done.
4272
   * This function is suitable to work with inputs from untrusted sources.
4273
   * This function is not BOM-aware.
4274
   *
4275
   * @param input         the UTF-16BE string to convert
4276
   * @param length        the length of the string in 2-byte code units
4277
   * (char16_t)
4278
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4279
   * result
4280
   * @return a result pair struct (of type simdutf::result containing the two
4281
   * fields error and count) with an error code and either position of the error
4282
   * (in the input in code units) if any, or the number of char written if
4283
   * successful.
4284
   */
4285
  simdutf_warn_unused virtual result
4286
  convert_utf16be_to_latin1_with_errors(const char16_t *input, size_t length,
4287
                                        char *latin1_buffer) const noexcept = 0;
4288
4289
  /**
4290
   * Convert valid UTF-16LE string into Latin1 string.
4291
   *
4292
   * This function assumes that the input string is valid UTF-16LE and that it
4293
   * can be represented as Latin1. If you violate this assumption, the result is
4294
   * implementation defined and may include system-dependent behavior such as
4295
   * crashes.
4296
   *
4297
   * This function is for expert users only and not part of our public API. Use
4298
   * convert_utf16le_to_latin1 instead.
4299
   *
4300
   * This function is not BOM-aware.
4301
   *
4302
   * @param input         the UTF-16LE string to convert
4303
   * @param length        the length of the string in 2-byte code units
4304
   * (char16_t)
4305
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4306
   * result
4307
   * @return number of written code units; 0 if conversion is not possible
4308
   */
4309
  simdutf_warn_unused virtual size_t
4310
  convert_valid_utf16le_to_latin1(const char16_t *input, size_t length,
4311
                                  char *latin1_buffer) const noexcept = 0;
4312
4313
  /**
4314
   * Convert valid UTF-16BE string into Latin1 string.
4315
   *
4316
   * This function assumes that the input string is valid UTF-16BE and that it
4317
   * can be represented as Latin1. If you violate this assumption, the result is
4318
   * implementation defined and may include system-dependent behavior such as
4319
   * crashes.
4320
   *
4321
   * This function is for expert users only and not part of our public API. Use
4322
   * convert_utf16be_to_latin1 instead.
4323
   *
4324
   * This function is not BOM-aware.
4325
   *
4326
   * @param input         the UTF-16BE string to convert
4327
   * @param length        the length of the string in 2-byte code units
4328
   * (char16_t)
4329
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4330
   * result
4331
   * @return number of written code units; 0 if conversion is not possible
4332
   */
4333
  simdutf_warn_unused virtual size_t
4334
  convert_valid_utf16be_to_latin1(const char16_t *input, size_t length,
4335
                                  char *latin1_buffer) const noexcept = 0;
4336
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4337
4338
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4339
  /**
4340
   * Convert possibly broken UTF-16LE string into UTF-8 string.
4341
   *
4342
   * During the conversion also validation of the input string is done.
4343
   * This function is suitable to work with inputs from untrusted sources.
4344
   *
4345
   * This function is not BOM-aware.
4346
   *
4347
   * @param input         the UTF-16LE string to convert
4348
   * @param length        the length of the string in 2-byte code units
4349
   * (char16_t)
4350
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4351
   * @return number of written code units; 0 if input is not a valid UTF-16LE
4352
   * string
4353
   */
4354
  simdutf_warn_unused virtual size_t
4355
  convert_utf16le_to_utf8(const char16_t *input, size_t length,
4356
                          char *utf8_buffer) const noexcept = 0;
4357
4358
  /**
4359
   * Convert possibly broken UTF-16BE string into UTF-8 string.
4360
   *
4361
   * During the conversion also validation of the input string is done.
4362
   * This function is suitable to work with inputs from untrusted sources.
4363
   *
4364
   * This function is not BOM-aware.
4365
   *
4366
   * @param input         the UTF-16BE string to convert
4367
   * @param length        the length of the string in 2-byte code units
4368
   * (char16_t)
4369
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4370
   * @return number of written code units; 0 if input is not a valid UTF-16BE
4371
   * string
4372
   */
4373
  simdutf_warn_unused virtual size_t
4374
  convert_utf16be_to_utf8(const char16_t *input, size_t length,
4375
                          char *utf8_buffer) const noexcept = 0;
4376
4377
  /**
4378
   * Convert possibly broken UTF-16LE string into UTF-8 string and stop on
4379
   * error.
4380
   *
4381
   * During the conversion also validation of the input string is done.
4382
   * This function is suitable to work with inputs from untrusted sources.
4383
   *
4384
   * This function is not BOM-aware.
4385
   *
4386
   * @param input         the UTF-16LE string to convert
4387
   * @param length        the length of the string in 2-byte code units
4388
   * (char16_t)
4389
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4390
   * @return a result pair struct (of type simdutf::result containing the two
4391
   * fields error and count) with an error code and either position of the error
4392
   * (in the input in code units) if any, or the number of char written if
4393
   * successful.
4394
   */
4395
  simdutf_warn_unused virtual result
4396
  convert_utf16le_to_utf8_with_errors(const char16_t *input, size_t length,
4397
                                      char *utf8_buffer) const noexcept = 0;
4398
4399
  /**
4400
   * Convert possibly broken UTF-16BE string into UTF-8 string and stop on
4401
   * error.
4402
   *
4403
   * During the conversion also validation of the input string is done.
4404
   * This function is suitable to work with inputs from untrusted sources.
4405
   *
4406
   * This function is not BOM-aware.
4407
   *
4408
   * @param input         the UTF-16BE string to convert
4409
   * @param length        the length of the string in 2-byte code units
4410
   * (char16_t)
4411
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4412
   * @return a result pair struct (of type simdutf::result containing the two
4413
   * fields error and count) with an error code and either position of the error
4414
   * (in the input in code units) if any, or the number of char written if
4415
   * successful.
4416
   */
4417
  simdutf_warn_unused virtual result
4418
  convert_utf16be_to_utf8_with_errors(const char16_t *input, size_t length,
4419
                                      char *utf8_buffer) const noexcept = 0;
4420
4421
  /**
4422
   * Convert valid UTF-16LE string into UTF-8 string.
4423
   *
4424
   * This function assumes that the input string is valid UTF-16LE.
4425
   *
4426
   * This function is not BOM-aware.
4427
   *
4428
   * @param input         the UTF-16LE string to convert
4429
   * @param length        the length of the string in 2-byte code units
4430
   * (char16_t)
4431
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
4432
   * result
4433
   * @return number of written code units; 0 if conversion is not possible
4434
   */
4435
  simdutf_warn_unused virtual size_t
4436
  convert_valid_utf16le_to_utf8(const char16_t *input, size_t length,
4437
                                char *utf8_buffer) const noexcept = 0;
4438
4439
  /**
4440
   * Convert valid UTF-16BE string into UTF-8 string.
4441
   *
4442
   * This function assumes that the input string is valid UTF-16BE.
4443
   *
4444
   * This function is not BOM-aware.
4445
   *
4446
   * @param input         the UTF-16BE string to convert
4447
   * @param length        the length of the string in 2-byte code units
4448
   * (char16_t)
4449
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
4450
   * result
4451
   * @return number of written code units; 0 if conversion is not possible
4452
   */
4453
  simdutf_warn_unused virtual size_t
4454
  convert_valid_utf16be_to_utf8(const char16_t *input, size_t length,
4455
                                char *utf8_buffer) const noexcept = 0;
4456
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4457
4458
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4459
  /**
4460
   * Convert possibly broken UTF-16LE string into UTF-32 string.
4461
   *
4462
   * During the conversion also validation of the input string is done.
4463
   * This function is suitable to work with inputs from untrusted sources.
4464
   *
4465
   * This function is not BOM-aware.
4466
   *
4467
   * @param input         the UTF-16LE string to convert
4468
   * @param length        the length of the string in 2-byte code units
4469
   * (char16_t)
4470
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4471
   * @return number of written code units; 0 if input is not a valid UTF-16LE
4472
   * string
4473
   */
4474
  simdutf_warn_unused virtual size_t
4475
  convert_utf16le_to_utf32(const char16_t *input, size_t length,
4476
                           char32_t *utf32_buffer) const noexcept = 0;
4477
4478
  /**
4479
   * Convert possibly broken UTF-16BE string into UTF-32 string.
4480
   *
4481
   * During the conversion also validation of the input string is done.
4482
   * This function is suitable to work with inputs from untrusted sources.
4483
   *
4484
   * This function is not BOM-aware.
4485
   *
4486
   * @param input         the UTF-16BE string to convert
4487
   * @param length        the length of the string in 2-byte code units
4488
   * (char16_t)
4489
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4490
   * @return number of written code units; 0 if input is not a valid UTF-16BE
4491
   * string
4492
   */
4493
  simdutf_warn_unused virtual size_t
4494
  convert_utf16be_to_utf32(const char16_t *input, size_t length,
4495
                           char32_t *utf32_buffer) const noexcept = 0;
4496
4497
  /**
4498
   * Convert possibly broken UTF-16LE string into UTF-32 string and stop on
4499
   * error.
4500
   *
4501
   * During the conversion also validation of the input string is done.
4502
   * This function is suitable to work with inputs from untrusted sources.
4503
   *
4504
   * This function is not BOM-aware.
4505
   *
4506
   * @param input         the UTF-16LE string to convert
4507
   * @param length        the length of the string in 2-byte code units
4508
   * (char16_t)
4509
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4510
   * @return a result pair struct (of type simdutf::result containing the two
4511
   * fields error and count) with an error code and either position of the error
4512
   * (in the input in code units) if any, or the number of char32_t written if
4513
   * successful.
4514
   */
4515
  simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(
4516
      const char16_t *input, size_t length,
4517
      char32_t *utf32_buffer) const noexcept = 0;
4518
4519
  /**
4520
   * Convert possibly broken UTF-16BE string into UTF-32 string and stop on
4521
   * error.
4522
   *
4523
   * During the conversion also validation of the input string is done.
4524
   * This function is suitable to work with inputs from untrusted sources.
4525
   *
4526
   * This function is not BOM-aware.
4527
   *
4528
   * @param input         the UTF-16BE string to convert
4529
   * @param length        the length of the string in 2-byte code units
4530
   * (char16_t)
4531
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4532
   * @return a result pair struct (of type simdutf::result containing the two
4533
   * fields error and count) with an error code and either position of the error
4534
   * (in the input in code units) if any, or the number of char32_t written if
4535
   * successful.
4536
   */
4537
  simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(
4538
      const char16_t *input, size_t length,
4539
      char32_t *utf32_buffer) const noexcept = 0;
4540
4541
  /**
4542
   * Convert valid UTF-16LE string into UTF-32 string.
4543
   *
4544
   * This function assumes that the input string is valid UTF-16LE.
4545
   *
4546
   * This function is not BOM-aware.
4547
   *
4548
   * @param input         the UTF-16LE string to convert
4549
   * @param length        the length of the string in 2-byte code units
4550
   * (char16_t)
4551
   * @param utf32_buffer   the pointer to a buffer that can hold the conversion
4552
   * result
4553
   * @return number of written code units; 0 if conversion is not possible
4554
   */
4555
  simdutf_warn_unused virtual size_t
4556
  convert_valid_utf16le_to_utf32(const char16_t *input, size_t length,
4557
                                 char32_t *utf32_buffer) const noexcept = 0;
4558
4559
  /**
4560
   * Convert valid UTF-16BE string into UTF-32 string.
4561
   *
4562
   * This function assumes that the input string is valid UTF-16BE.
4563
   *
4564
   * This function is not BOM-aware.
4565
   *
4566
   * @param input         the UTF-16BE string to convert
4567
   * @param length        the length of the string in 2-byte code units
4568
   * (char16_t)
4569
   * @param utf32_buffer   the pointer to a buffer that can hold the conversion
4570
   * result
4571
   * @return number of written code units; 0 if conversion is not possible
4572
   */
4573
  simdutf_warn_unused virtual size_t
4574
  convert_valid_utf16be_to_utf32(const char16_t *input, size_t length,
4575
                                 char32_t *utf32_buffer) const noexcept = 0;
4576
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4577
4578
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4579
  /**
4580
   * Compute the number of bytes that this UTF-16LE string would require in
4581
   * UTF-8 format.
4582
   *
4583
   * This function does not validate the input. It is acceptable to pass invalid
4584
   * UTF-16 strings but in such cases the result is implementation defined.
4585
   *
4586
   * This function is not BOM-aware.
4587
   *
4588
   * @param input         the UTF-16LE string to convert
4589
   * @param length        the length of the string in 2-byte code units
4590
   * (char16_t)
4591
   * @return the number of bytes required to encode the UTF-16LE string as UTF-8
4592
   */
4593
  simdutf_warn_unused virtual size_t
4594
  utf8_length_from_utf16le(const char16_t *input,
4595
                           size_t length) const noexcept = 0;
4596
4597
  /**
4598
   * Compute the number of bytes that this UTF-16BE string would require in
4599
   * UTF-8 format.
4600
   *
4601
   * This function does not validate the input. It is acceptable to pass invalid
4602
   * UTF-16 strings but in such cases the result is implementation defined.
4603
   *
4604
   * This function is not BOM-aware.
4605
   *
4606
   * @param input         the UTF-16BE string to convert
4607
   * @param length        the length of the string in 2-byte code units
4608
   * (char16_t)
4609
   * @return the number of bytes required to encode the UTF-16BE string as UTF-8
4610
   */
4611
  simdutf_warn_unused virtual size_t
4612
  utf8_length_from_utf16be(const char16_t *input,
4613
                           size_t length) const noexcept = 0;
4614
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4615
4616
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4617
  /**
4618
   * Convert possibly broken UTF-32 string into Latin1 string.
4619
   *
4620
   * During the conversion also validation of the input string is done.
4621
   * This function is suitable to work with inputs from untrusted sources.
4622
   *
4623
   * This function is not BOM-aware.
4624
   *
4625
   * @param input         the UTF-32 string to convert
4626
   * @param length        the length of the string in 4-byte code units
4627
   * (char32_t)
4628
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4629
   * result
4630
   * @return number of written code units; 0 if input is not a valid UTF-32
4631
   * string or if it cannot be represented as Latin1
4632
   */
4633
  simdutf_warn_unused virtual size_t
4634
  convert_utf32_to_latin1(const char32_t *input, size_t length,
4635
                          char *latin1_buffer) const noexcept = 0;
4636
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4637
4638
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4639
  /**
4640
   * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
4641
   * If the string cannot be represented as Latin1, an error is returned.
4642
   *
4643
   * During the conversion also validation of the input string is done.
4644
   * This function is suitable to work with inputs from untrusted sources.
4645
   *
4646
   * This function is not BOM-aware.
4647
   *
4648
   * @param input         the UTF-32 string to convert
4649
   * @param length        the length of the string in 4-byte code units
4650
   * (char32_t)
4651
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4652
   * result
4653
   * @return a result pair struct (of type simdutf::result containing the two
4654
   * fields error and count) with an error code and either position of the error
4655
   * (in the input in code units) if any, or the number of char written if
4656
   * successful.
4657
   */
4658
  simdutf_warn_unused virtual result
4659
  convert_utf32_to_latin1_with_errors(const char32_t *input, size_t length,
4660
                                      char *latin1_buffer) const noexcept = 0;
4661
4662
  /**
4663
   * Convert valid UTF-32 string into Latin1 string.
4664
   *
4665
   * This function assumes that the input string is valid UTF-32 and can be
4666
   * represented as Latin1. If you violate this assumption, the result is
4667
   * implementation defined and may include system-dependent behavior such as
4668
   * crashes.
4669
   *
4670
   * This function is for expert users only and not part of our public API. Use
4671
   * convert_utf32_to_latin1 instead.
4672
   *
4673
   * This function is not BOM-aware.
4674
   *
4675
   * @param input         the UTF-32 string to convert
4676
   * @param length        the length of the string in 4-byte code units
4677
   * (char32_t)
4678
   * @param latin1_buffer   the pointer to a buffer that can hold the conversion
4679
   * result
4680
   * @return number of written code units; 0 if conversion is not possible
4681
   */
4682
  simdutf_warn_unused virtual size_t
4683
  convert_valid_utf32_to_latin1(const char32_t *input, size_t length,
4684
                                char *latin1_buffer) const noexcept = 0;
4685
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4686
4687
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4688
  /**
4689
   * Convert possibly broken UTF-32 string into UTF-8 string.
4690
   *
4691
   * During the conversion also validation of the input string is done.
4692
   * This function is suitable to work with inputs from untrusted sources.
4693
   *
4694
   * This function is not BOM-aware.
4695
   *
4696
   * @param input         the UTF-32 string to convert
4697
   * @param length        the length of the string in 4-byte code units
4698
   * (char32_t)
4699
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4700
   * @return number of written code units; 0 if input is not a valid UTF-32
4701
   * string
4702
   */
4703
  simdutf_warn_unused virtual size_t
4704
  convert_utf32_to_utf8(const char32_t *input, size_t length,
4705
                        char *utf8_buffer) const noexcept = 0;
4706
4707
  /**
4708
   * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
4709
   *
4710
   * During the conversion also validation of the input string is done.
4711
   * This function is suitable to work with inputs from untrusted sources.
4712
   *
4713
   * This function is not BOM-aware.
4714
   *
4715
   * @param input         the UTF-32 string to convert
4716
   * @param length        the length of the string in 4-byte code units
4717
   * (char32_t)
4718
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4719
   * @return a result pair struct (of type simdutf::result containing the two
4720
   * fields error and count) with an error code and either position of the error
4721
   * (in the input in code units) if any, or the number of char written if
4722
   * successful.
4723
   */
4724
  simdutf_warn_unused virtual result
4725
  convert_utf32_to_utf8_with_errors(const char32_t *input, size_t length,
4726
                                    char *utf8_buffer) const noexcept = 0;
4727
4728
  /**
4729
   * Convert valid UTF-32 string into UTF-8 string.
4730
   *
4731
   * This function assumes that the input string is valid UTF-32.
4732
   *
4733
   * This function is not BOM-aware.
4734
   *
4735
   * @param input         the UTF-32 string to convert
4736
   * @param length        the length of the string in 4-byte code units
4737
   * (char32_t)
4738
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
4739
   * result
4740
   * @return number of written code units; 0 if conversion is not possible
4741
   */
4742
  simdutf_warn_unused virtual size_t
4743
  convert_valid_utf32_to_utf8(const char32_t *input, size_t length,
4744
                              char *utf8_buffer) const noexcept = 0;
4745
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4746
4747
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4748
  /**
4749
   * Compute the number of 2-byte code units (char16_t) that a Latin1 string
4750
   * would require in UTF-16 format.
4751
   *
4752
   * The default implementation returns length unchanged.
4753
   *
4754
   * @param length        the length of the Latin1 string in bytes (char)
4755
   * @return the number of char16_t code units required to encode the Latin1
4756
   * string as UTF-16
4757
   */
4758
  simdutf_warn_unused virtual size_t
4759
  utf16_length_from_latin1(size_t length) const noexcept {
4760
    return length;
4761
  }
4762
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4763
4764
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4765
  /**
4766
   * Convert possibly broken UTF-32 string into UTF-16LE string.
4767
   *
4768
   * During the conversion also validation of the input string is done.
4769
   * This function is suitable to work with inputs from untrusted sources.
4770
   *
4771
   * This function is not BOM-aware.
4772
   *
4773
   * @param input         the UTF-32 string to convert
4774
   * @param length        the length of the string in 4-byte code units
4775
   * (char32_t)
4776
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4777
   * @return number of written code units; 0 if input is not a valid UTF-32
4778
   * string
4779
   */
4780
  simdutf_warn_unused virtual size_t
4781
  convert_utf32_to_utf16le(const char32_t *input, size_t length,
4782
                           char16_t *utf16_buffer) const noexcept = 0;
4783
4784
  /**
4785
   * Convert possibly broken UTF-32 string into UTF-16BE string.
4786
   *
4787
   * During the conversion also validation of the input string is done.
4788
   * This function is suitable to work with inputs from untrusted sources.
4789
   *
4790
   * This function is not BOM-aware.
4791
   *
4792
   * @param input         the UTF-32 string to convert
4793
   * @param length        the length of the string in 4-byte code units
4794
   * (char32_t)
4795
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4796
   * @return number of written code units; 0 if input is not a valid UTF-32
4797
   * string
4798
   */
4799
  simdutf_warn_unused virtual size_t
4800
  convert_utf32_to_utf16be(const char32_t *input, size_t length,
4801
                           char16_t *utf16_buffer) const noexcept = 0;
4802
4803
  /**
4804
   * Convert possibly broken UTF-32 string into UTF-16LE string and stop on
4805
   * error.
4806
   *
4807
   * During the conversion also validation of the input string is done.
4808
   * This function is suitable to work with inputs from untrusted sources.
4809
   *
4810
   * This function is not BOM-aware.
4811
   *
4812
   * @param input         the UTF-32 string to convert
4813
   * @param length        the length of the string in 4-byte code units
4814
   * (char32_t)
4815
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4816
   * @return a result pair struct (of type simdutf::result containing the two
4817
   * fields error and count) with an error code and either position of the error
4818
   * (in the input in code units) if any, or the number of char16_t written if
4819
   * successful.
4820
   */
4821
  simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(
4822
      const char32_t *input, size_t length,
4823
      char16_t *utf16_buffer) const noexcept = 0;
4824
4825
  /**
4826
   * Convert possibly broken UTF-32 string into UTF-16BE string and stop on
4827
   * error.
4828
   *
4829
   * During the conversion also validation of the input string is done.
4830
   * This function is suitable to work with inputs from untrusted sources.
4831
   *
4832
   * This function is not BOM-aware.
4833
   *
4834
   * @param input         the UTF-32 string to convert
4835
   * @param length        the length of the string in 4-byte code units
4836
   * (char32_t)
4837
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4838
   * @return a result pair struct (of type simdutf::result containing the two
4839
   * fields error and count) with an error code and either position of the error
4840
   * (in the input in code units) if any, or the number of char16_t written if
4841
   * successful.
4842
   */
4843
  simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(
4844
      const char32_t *input, size_t length,
4845
      char16_t *utf16_buffer) const noexcept = 0;
4846
4847
  /**
4848
   * Convert valid UTF-32 string into UTF-16LE string.
4849
   *
4850
   * This function assumes that the input string is valid UTF-32.
4851
   *
4852
   * This function is not BOM-aware.
4853
   *
4854
   * @param input         the UTF-32 string to convert
4855
   * @param length        the length of the string in 4-byte code units
4856
   * (char32_t)
4857
   * @param utf16_buffer   the pointer to a buffer that can hold the conversion
4858
   * result
4859
   * @return number of written code units; 0 if conversion is not possible
4860
   */
4861
  simdutf_warn_unused virtual size_t
4862
  convert_valid_utf32_to_utf16le(const char32_t *input, size_t length,
4863
                                 char16_t *utf16_buffer) const noexcept = 0;
4864
4865
  /**
4866
   * Convert valid UTF-32 string into UTF-16BE string.
4867
   *
4868
   * This function assumes that the input string is valid UTF-32.
4869
   *
4870
   * This function is not BOM-aware.
4871
   *
4872
   * @param input         the UTF-32 string to convert
4873
   * @param length        the length of the string in 4-byte code units
4874
   * (char32_t)
4875
   * @param utf16_buffer   the pointer to a buffer that can hold the conversion
4876
   * result
4877
   * @return number of written code units; 0 if conversion is not possible
4878
   */
4879
  simdutf_warn_unused virtual size_t
4880
  convert_valid_utf32_to_utf16be(const char32_t *input, size_t length,
4881
                                 char16_t *utf16_buffer) const noexcept = 0;
4882
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4883
4884
#if SIMDUTF_FEATURE_UTF16
4885
  /**
4886
   * Change the endianness of the input. Can be used to go from UTF-16LE to
4887
   * UTF-16BE or from UTF-16BE to UTF-16LE.
4888
   *
4889
   * This function does not validate the input.
4890
   *
4891
   * This function is not BOM-aware.
4892
   *
4893
   * @param input         the UTF-16 string to process
4894
   * @param length        the length of the string in 2-byte code units
4895
   * (char16_t)
4896
   * @param output        the pointer to a buffer that can hold the conversion
4897
   * result
4898
   */
4899
  virtual void change_endianness_utf16(const char16_t *input, size_t length,
4900
                                       char16_t *output) const noexcept = 0;
4901
#endif // SIMDUTF_FEATURE_UTF16
4902
4903
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4904
  /**
4905
   * Return the number of bytes that this Latin1 string would require in UTF-8
4906
   * format.
4907
   *
4908
   * @param input         the Latin1 string to convert
4909
   * @param length        the length of the string in bytes
4910
   * @return the number of bytes required to encode the Latin1 string as UTF-8
4911
   */
4912
  simdutf_warn_unused virtual size_t
4913
  utf8_length_from_latin1(const char *input, size_t length) const noexcept = 0;
4914
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4915
4916
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4917
  /**
4918
   * Compute the number of bytes that this UTF-32 string would require in UTF-8
4919
   * format.
4920
   *
4921
   * This function does not validate the input. It is acceptable to pass invalid
4922
   * UTF-32 strings but in such cases the result is implementation defined.
4923
   *
4924
   * @param input         the UTF-32 string to convert
4925
   * @param length        the length of the string in 4-byte code units
4926
   * (char32_t)
4927
   * @return the number of bytes required to encode the UTF-32 string as UTF-8
4928
   */
4929
  simdutf_warn_unused virtual size_t
4930
  utf8_length_from_utf32(const char32_t *input,
4931
                         size_t length) const noexcept = 0;
4932
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4933
4934
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4935
  /**
4936
   * Compute the number of bytes that this UTF-32 string would require in Latin1
4937
   * format.
4938
   *
4939
   * This function does not validate the input. It is acceptable to pass invalid
4940
   * UTF-32 strings but in such cases the result is implementation defined.
4941
   *
4942
   * @param length        the length of the string in 4-byte code units
4943
   * (char32_t)
4944
   * @return the number of bytes required to encode the UTF-32 string as Latin1
4945
   */
4946
  simdutf_warn_unused virtual size_t
4947
  latin1_length_from_utf32(size_t length) const noexcept {
4948
    return length; // one Latin1 byte per UTF-32 code unit: count is unchanged
4949
  }
4950
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4951
4952
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4953
  /**
4954
   * Compute the number of bytes that this UTF-8 string would require in Latin1
4955
   * format.
4956
   *
4957
   * This function does not validate the input. It is acceptable to pass invalid
4958
   * UTF-8 strings but in such cases the result is implementation defined.
4959
   *
4960
   * @param input         the UTF-8 string to convert
4961
   * @param length        the length of the string in bytes
4962
   * @return the number of bytes required to encode the UTF-8 string as Latin1
4963
   */
4964
  simdutf_warn_unused virtual size_t
4965
  latin1_length_from_utf8(const char *input, size_t length) const noexcept = 0;
4966
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4967
4968
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4969
  /**
4970
   * Compute the number of bytes that this UTF-16LE/BE string would require in
4971
   * Latin1 format.
4972
   *
4973
   * This function does not validate the input. It is acceptable to pass invalid
4974
   * UTF-16 strings but in such cases the result is implementation defined.
4975
   *
4976
   * This function is not BOM-aware. Only the length is required; the string
4977
   * contents are not inspected.
4978
   *
4979
   * @param length        the length of the UTF-16 string in 2-byte code units
4980
   * (char16_t)
4981
   * @return the number of bytes required to encode the UTF-16 string as
4982
   * Latin1
4983
   */
4984
  simdutf_warn_unused virtual size_t
4985
  latin1_length_from_utf16(size_t length) const noexcept {
4986
    return length;
4987
  }
4988
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4989
4990
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4991
  /**
4992
   * Compute the number of two-byte code units that this UTF-32 string would
4993
   * require in UTF-16 format.
4994
   *
4995
   * This function does not validate the input. It is acceptable to pass invalid
4996
   * UTF-32 strings but in such cases the result is implementation defined.
4997
   *
4998
   * @param input         the UTF-32 string to convert
4999
   * @param length        the length of the string in 4-byte code units
5000
   * (char32_t)
5001
   * @return the number of 2-byte code units required to encode the UTF-32 string as UTF-16
5002
   */
5003
  simdutf_warn_unused virtual size_t
5004
  utf16_length_from_utf32(const char32_t *input,
5005
                          size_t length) const noexcept = 0;
5006
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5007
5008
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5009
  /**
5010
   * Compute the number of 4-byte code units (char32_t) that a Latin1 string of
5011
   * the given length would require in UTF-32 format.
5012
   *
5013
   * @param length        the length of the Latin1 string in bytes
5014
   * @return the number of 4-byte code units (char32_t) required to encode the
5015
   * Latin1 string as UTF-32
5016
   */
5017
  simdutf_warn_unused virtual size_t
5018
  utf32_length_from_latin1(size_t length) const noexcept {
5019
    return length;
5020
  }
5021
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5022
5023
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5024
  /**
5025
   * Compute the number of 4-byte code units that this UTF-16LE string would require in
5026
   * UTF-32 format.
5027
   *
5028
   * This function is equivalent to count_utf16le.
5029
   *
5030
   * This function does not validate the input. It is acceptable to pass invalid
5031
   * UTF-16 strings but in such cases the result is implementation defined.
5032
   *
5033
   * This function is not BOM-aware.
5034
   *
5035
   * @param input         the UTF-16LE string to convert
5036
   * @param length        the length of the string in 2-byte code units
5037
   * (char16_t)
5038
   * @return the number of 4-byte code units required to encode the UTF-16LE string as
5039
   * UTF-32
5040
   */
5041
  simdutf_warn_unused virtual size_t
5042
  utf32_length_from_utf16le(const char16_t *input,
5043
                            size_t length) const noexcept = 0;
5044
5045
  /**
5046
   * Compute the number of 4-byte code units that this UTF-16BE string would require in
5047
   * UTF-32 format.
5048
   *
5049
   * This function is equivalent to count_utf16be.
5050
   *
5051
   * This function does not validate the input. It is acceptable to pass invalid
5052
   * UTF-16 strings but in such cases the result is implementation defined.
5053
   *
5054
   * This function is not BOM-aware.
5055
   *
5056
   * @param input         the UTF-16BE string to convert
5057
   * @param length        the length of the string in 2-byte code units
5058
   * (char16_t)
5059
   * @return the number of 4-byte code units required to encode the UTF-16BE string as
5060
   * UTF-32
5061
   */
5062
  simdutf_warn_unused virtual size_t
5063
  utf32_length_from_utf16be(const char16_t *input,
5064
                            size_t length) const noexcept = 0;
5065
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5066
5067
#if SIMDUTF_FEATURE_UTF16
5068
  /**
5069
   * Count the number of code points (characters) in the string assuming that
5070
   * it is valid.
5071
   *
5072
   * This function assumes that the input string is valid UTF-16LE.
5073
   * It is acceptable to pass invalid UTF-16 strings but in such cases
5074
   * the result is implementation defined.
5075
   *
5076
   * This function is not BOM-aware.
5077
   *
5078
   * @param input         the UTF-16LE string to process
5079
   * @param length        the length of the string in 2-byte code units
5080
   * (char16_t)
5081
   * @return number of code points
5082
   */
5083
  simdutf_warn_unused virtual size_t
5084
  count_utf16le(const char16_t *input, size_t length) const noexcept = 0;
5085
5086
  /**
5087
   * Count the number of code points (characters) in the string assuming that
5088
   * it is valid.
5089
   *
5090
   * This function assumes that the input string is valid UTF-16BE.
5091
   * It is acceptable to pass invalid UTF-16 strings but in such cases
5092
   * the result is implementation defined.
5093
   *
5094
   * This function is not BOM-aware.
5095
   *
5096
   * @param input         the UTF-16BE string to process
5097
   * @param length        the length of the string in 2-byte code units
5098
   * (char16_t)
5099
   * @return number of code points
5100
   */
5101
  simdutf_warn_unused virtual size_t
5102
  count_utf16be(const char16_t *input, size_t length) const noexcept = 0;
5103
#endif // SIMDUTF_FEATURE_UTF16
5104
5105
#if SIMDUTF_FEATURE_UTF8
5106
  /**
5107
   * Count the number of code points (characters) in the string assuming that
5108
   * it is valid.
5109
   *
5110
   * This function assumes that the input string is valid UTF-8.
5111
   * It is acceptable to pass invalid UTF-8 strings but in such cases
5112
   * the result is implementation defined.
5113
   *
5114
   * @param input         the UTF-8 string to process
5115
   * @param length        the length of the string in bytes
5116
   * @return number of code points
5117
   */
5118
  simdutf_warn_unused virtual size_t
5119
  count_utf8(const char *input, size_t length) const noexcept = 0;
5120
#endif // SIMDUTF_FEATURE_UTF8
5121
5122
#if SIMDUTF_FEATURE_BASE64
5123
  /**
5124
   * Provide the maximal binary length in bytes given the base64 input.
5125
   * In general, if the input contains ASCII spaces, the result will be less
5126
   * than the maximum length. It is acceptable to pass invalid base64 strings
5127
   * but in such cases the result is implementation defined.
5128
   *
5129
   * @param input         the base64 input to process
5130
   * @param length        the length of the base64 input in bytes
5131
   * @return maximal number of binary bytes
5132
   */
5133
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
5134
      const char *input, size_t length) const noexcept;
5135
5136
  /**
5137
   * Provide the maximal binary length in bytes given the base64 input.
5138
   * In general, if the input contains ASCII spaces, the result will be less
5139
   * than the maximum length. It is acceptable to pass invalid base64 strings
5140
   * but in such cases the result is implementation defined.
5141
   *
5142
   * @param input         the base64 input to process, in ASCII stored as 16-bit
5143
   * units
5144
   * @param length        the length of the base64 input in 16-bit units
5145
   * @return maximal number of binary bytes
5146
   */
5147
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
5148
      const char16_t *input, size_t length) const noexcept;
5149
5150
  /**
5151
   * Convert a base64 input to a binary output.
5152
   *
5153
   * This function follows the WHATWG forgiving-base64 format, which means that
5154
   * it will ignore any ASCII spaces in the input. You may provide a padded
5155
   * input (with one or two equal signs at the end) or an unpadded input
5156
   * (without any equal signs at the end).
5157
   *
5158
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5159
   *
5160
   * This function will fail in case of invalid input. When last_chunk_options =
5161
   * loose, there are two possible reasons for failure: the input contains a
5162
   * number of base64 characters that when divided by 4, leaves a single
5163
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5164
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5165
   *
5166
   * You should call this function with a buffer that is at least
5167
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
5168
   * provide that much space, the function may cause a buffer overflow.
5169
   *
5170
   * @param input         the base64 string to process
5171
   * @param length        the length of the string in bytes
5172
   * @param output        the pointer to a buffer that can hold the conversion
5173
   * result (should be at least maximal_binary_length_from_base64(input, length)
5174
   * bytes long).
5175
   * @param options       the base64 options to use, can be base64_default or
5176
   * base64_url, is base64_default by default.
5177
   * @return a result pair struct (of type simdutf::result containing the two
5178
   * fields error and count) with an error code and either position of the error
5179
   * (in the input in bytes) if any, or the number of bytes written if
5180
   * successful.
5181
   */
5182
  simdutf_warn_unused virtual result
5183
  base64_to_binary(const char *input, size_t length, char *output,
5184
                   base64_options options = base64_default,
5185
                   last_chunk_handling_options last_chunk_options =
5186
                       last_chunk_handling_options::loose) const noexcept = 0;
5187
5188
  /**
5189
   * Convert a base64 input to a binary output while returning more details
5190
   * than base64_to_binary.
5191
   *
5192
   * This function follows the WHATWG forgiving-base64 format, which means that
5193
   * it will ignore any ASCII spaces in the input. You may provide a padded
5194
   * input (with one or two equal signs at the end) or an unpadded input
5195
   * (without any equal signs at the end).
5196
   *
5197
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5198
   *
5199
   * This function will fail in case of invalid input. When last_chunk_options =
5200
   * loose, there are two possible reasons for failure: the input contains a
5201
   * number of base64 characters that when divided by 4, leaves a single
5202
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5203
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5204
   *
5205
   * You should call this function with a buffer that is at least
5206
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
5207
   * provide that much space, the function may cause a buffer overflow.
5208
   *
5209
   * @param input         the base64 string to process
5210
   * @param length        the length of the string in bytes
5211
   * @param output        the pointer to a buffer that can hold the conversion
5212
   * result (should be at least maximal_binary_length_from_base64(input, length)
5213
   * bytes long).
5214
   * @param options       the base64 options to use, can be base64_default or
5215
   * base64_url, is base64_default by default.
5216
   * @return a full_result struct (of type simdutf::full_result containing the
5217
   * three fields error, input_count and output_count).
5218
   */
5219
  simdutf_warn_unused virtual full_result base64_to_binary_details(
5220
      const char *input, size_t length, char *output,
5221
      base64_options options = base64_default,
5222
      last_chunk_handling_options last_chunk_options =
5223
          last_chunk_handling_options::loose) const noexcept = 0;
5224
  /**
5225
   * Convert a base64 input to a binary output.
5226
   *
5227
   * This function follows the WHATWG forgiving-base64 format, which means that
5228
   * it will ignore any ASCII spaces in the input. You may provide a padded
5229
   * input (with one or two equal signs at the end) or an unpadded input
5230
   * (without any equal signs at the end).
5231
   *
5232
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5233
   *
5234
   * This function will fail in case of invalid input. When last_chunk_options =
5235
   * loose, there are two possible reasons for failure: the input contains a
5236
   * number of base64 characters that when divided by 4, leaves a single
5237
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5238
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5239
   *
5240
   * You should call this function with a buffer that is at least
5241
   * maximal_binary_length_from_base64(input, length) bytes long. If you
5242
   * fail to provide that much space, the function may cause a buffer overflow.
5243
   *
5244
   * @param input         the base64 string to process, in ASCII stored as
5245
   * 16-bit units
5246
   * @param length        the length of the string in 16-bit units
5247
   * @param output        the pointer to a buffer that can hold the conversion
5248
   * result (should be at least maximal_binary_length_from_base64(input, length)
5249
   * bytes long).
5250
   * @param options       the base64 options to use, can be base64_default or
5251
   * base64_url, is base64_default by default.
5252
   * @return a result pair struct (of type simdutf::result containing the two
5253
   * fields error and count) with an error code and position of the
5254
   * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the
5255
   * number of bytes written if successful.
5256
   */
5257
  simdutf_warn_unused virtual result
5258
  base64_to_binary(const char16_t *input, size_t length, char *output,
5259
                   base64_options options = base64_default,
5260
                   last_chunk_handling_options last_chunk_options =
5261
                       last_chunk_handling_options::loose) const noexcept = 0;
5262
5263
  /**
5264
   * Convert a base64 input to a binary output while returning more details
5265
   * than base64_to_binary.
5266
   *
5267
   * This function follows the WHATWG forgiving-base64 format, which means that
5268
   * it will ignore any ASCII spaces in the input. You may provide a padded
5269
   * input (with one or two equal signs at the end) or an unpadded input
5270
   * (without any equal signs at the end).
5271
   *
5272
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5273
   *
5274
   * This function will fail in case of invalid input. When last_chunk_options =
5275
   * loose, there are two possible reasons for failure: the input contains a
5276
   * number of base64 characters that when divided by 4, leaves a single
5277
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5278
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5279
   *
5280
   * You should call this function with a buffer that is at least
5281
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
5282
   * provide that much space, the function may cause a buffer overflow.
5283
   *
5284
   * @param input         the base64 string to process, in ASCII stored as 16-bit units
5285
   * @param length        the length of the string in 16-bit units
5286
   * @param output        the pointer to a buffer that can hold the conversion
5287
   * result (should be at least maximal_binary_length_from_base64(input, length)
5288
   * bytes long).
5289
   * @param options       the base64 options to use, can be base64_default or
5290
   * base64_url, is base64_default by default.
5291
   * @return a full_result struct (of type simdutf::full_result containing the
5292
   * three fields error, input_count and output_count).
5293
   */
5294
  simdutf_warn_unused virtual full_result base64_to_binary_details(
5295
      const char16_t *input, size_t length, char *output,
5296
      base64_options options = base64_default,
5297
      last_chunk_handling_options last_chunk_options =
5298
          last_chunk_handling_options::loose) const noexcept = 0;
5299
5300
  /**
5301
   * Provide the base64 length in bytes given the length of a binary input.
5302
   *
5303
   * @param length        the length of the input in bytes
5304
   * @param options       the base64 options to use, can be base64_default or
5305
   * base64_url, is base64_default by default.
5306
   * @return number of base64 bytes
5307
   */
5308
  simdutf_warn_unused size_t base64_length_from_binary(
5309
      size_t length, base64_options options = base64_default) const noexcept;
5310
5311
  /**
5312
   * Convert a binary input to a base64 output.
5313
   *
5314
   * The default option (simdutf::base64_default) uses the characters `+` and
5315
   * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of
5316
   * the output to ensure that the output length is a multiple of four.
5317
   *
5318
   * The URL option (simdutf::base64_url) uses the characters `-` and `_` as
5319
   * part of its alphabet. No padding is added at the end of the output.
5320
   *
5321
   * This function always succeeds.
5322
   *
5323
   * @param input         the binary to process
5324
   * @param length        the length of the input in bytes
5325
   * @param output        the pointer to a buffer that can hold the conversion
5326
   * result (should be at least base64_length_from_binary(length) bytes long)
5327
   * @param options       the base64 options to use, can be base64_default or
5328
   * base64_url, is base64_default by default.
5329
   * @return number of written bytes, will be equal to
5330
   * base64_length_from_binary(length, options)
5331
   */
5332
  virtual size_t
5333
  binary_to_base64(const char *input, size_t length, char *output,
5334
                   base64_options options = base64_default) const noexcept = 0;
5335
5336
  /**
5337
   * Convert a binary input to a base64 output with lines of given length.
5338
   * Lines are separated by a single linefeed character.
5339
   *
5340
   * The default option (simdutf::base64_default) uses the characters `+` and
5341
   * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of
5342
   * the output to ensure that the output length is a multiple of four.
5343
   *
5344
   * The URL option (simdutf::base64_url) uses the characters `-` and `_` as
5345
   * part of its alphabet. No padding is added at the end of the output.
5346
   *
5347
   * This function always succeeds.
5348
   *
5349
   * @param input         the binary to process
5350
   * @param length        the length of the input in bytes
5351
   * @param output        the pointer to a buffer that can hold the conversion
5352
   * result (should be at least base64_length_from_binary_with_lines(length,
5353
   * options, line_length) bytes long)
5354
   * @param line_length   the length of each line, values smaller than 4 are
5355
   * interpreted as 4
5356
   * @param options       the base64 options to use, can be base64_default or
5357
   * base64_url, is base64_default by default.
5358
   * @return number of written bytes, will be equal to
5359
   * base64_length_from_binary_with_lines(length, options, line_length)
5360
   */
5361
  virtual size_t binary_to_base64_with_lines(
5362
      const char *input, size_t length, char *output,
5363
      size_t line_length = simdutf::default_line_length,
5364
      base64_options options = base64_default) const noexcept = 0;
5365
  /**
5366
   * Find the first occurrence of a character in a string. If the character is
5367
   * not found, return a pointer to the end of the string.
5368
   * @param start        the start of the string
5369
   * @param end          the end of the string
5370
   * @param character    the character to find
5371
   * @return a pointer to the first occurrence of the character in the string,
5372
   * or a pointer to the end of the string if the character is not found.
5373
   *
5374
   */
5375
  virtual const char *find(const char *start, const char *end,
5376
                           char character) const noexcept = 0;
5377
  virtual const char16_t *find(const char16_t *start, const char16_t *end,
5378
                               char16_t character) const noexcept = 0;
5379
#endif // SIMDUTF_FEATURE_BASE64
5380
5381
#ifdef SIMDUTF_INTERNAL_TESTS
5382
  // This method is exported only in developer mode, its purpose
5383
  // is to expose some internal test procedures from the given
5384
  // implementation and then use them through our standard test
5385
  // framework.
5386
  //
5387
  // Regular users should not use it, the tests of the public
5388
  // API are enough.
5389
5390
  struct TestProcedure {
5391
    // display name
5392
    std::string name;
5393
5394
    // procedure that runs the test; it returns void, so pass/fail is not reported through a return value
5395
    void (*procedure)(const implementation &);
5396
  };
5397
5398
  virtual std::vector<TestProcedure> internal_tests() const;
5399
#endif
5400
5401
protected:
5402
  /** @private Construct an implementation with the given name and description.
5403
   * For subclasses.
   *
   * The name and description pointers are stored as-is (no copy is made), so
   * the strings they point to need to remain valid for as long as they are
   * read through this object.
   *
   * @param name                       the name of this implementation
   * @param description                the description of this implementation
   * @param required_instruction_sets  the instruction sets required for this
   * implementation
   */
5404
  simdutf_really_inline implementation(const char *name,
5405
                                       const char *description,
5406
                                       uint32_t required_instruction_sets)
5407
      : _name(name), _description(description),
5408
        _required_instruction_sets(required_instruction_sets) {}
5409
5410
protected:
5411
  ~implementation() = default;
5412
5413
private:
5414
  /**
5415
   * The name of this implementation.
5416
   */
5417
  const char *_name;
5418
5419
  /**
5420
   * The description of this implementation.
5421
   */
5422
  const char *_description;
5423
5424
  /**
5425
   * Instruction sets required for this implementation.
5426
   */
5427
  const uint32_t _required_instruction_sets;
5428
};
5429
5430
/** @private */
5431
namespace internal {
5432
5433
/**
5434
 * The list of available implementations compiled into simdutf.
5435
 */
5436
class available_implementation_list {
5437
public:
5438
  /** Get the list of available implementations compiled into simdutf */
5439
  simdutf_really_inline available_implementation_list() {}
5440
  /** Number of implementations */
5441
  size_t size() const noexcept;
5442
  /** STL const begin() iterator */
5443
  const implementation *const *begin() const noexcept;
5444
  /** STL const end() iterator */
5445
  const implementation *const *end() const noexcept;
5446
5447
  /**
5448
   * Get the implementation with the given name.
5449
   *
5450
   * Case sensitive.
5451
   *
5452
   *     const implementation *impl =
5453
   * simdutf::available_implementations["westmere"]; if (!impl) { exit(1); } if
5454
   * (!imp->supported_by_runtime_system()) { exit(1); }
5455
   *     simdutf::active_implementation = impl;
5456
   *
5457
   * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
5458
   * @return the implementation, or nullptr if the parse failed.
5459
   */
5460
  const implementation *operator[](const std::string &name) const noexcept {
5461
    for (const implementation *impl : *this) {
5462
      if (impl->name() == name) {
5463
        return impl;
5464
      }
5465
    }
5466
    return nullptr;
5467
  }
5468
5469
  /**
5470
   * Detect the most advanced implementation supported by the current host.
5471
   *
5472
   * This is used to initialize the implementation on startup.
5473
   *
5474
   *     const implementation *impl =
5475
   * simdutf::available_implementation::detect_best_supported();
5476
   *     simdutf::active_implementation = impl;
5477
   *
5478
   * @return the most advanced supported implementation for the current host, or
5479
   * an implementation that returns UNSUPPORTED_ARCHITECTURE if there is no
5480
   * supported implementation. Will never return nullptr.
5481
   */
5482
  const implementation *detect_best_supported() const noexcept;
5483
};
5484
5485
/**
 * Small pointer wrapper that can be used like a raw `T *`.
 *
 * When SIMDUTF_NO_THREADS is defined the pointer is stored as a plain `T *`.
 * Otherwise it is stored in a `std::atomic<T *>` and every read and write goes
 * through load()/store() with the default memory ordering.
 *
 * The public interface is the same in both configurations; only the private
 * get()/set() helpers and the storage differ. Dereferencing a wrapper that
 * holds a null pointer is undefined, exactly as for a raw pointer.
 */
template <typename T> class atomic_ptr {
public:
  /** Wrap `_ptr`. Not explicit, so a raw `T *` converts implicitly. */
  atomic_ptr(T *_ptr) noexcept : ptr{_ptr} {}

  /** Read access through a const wrapper. */
  operator const T *() const noexcept { return get(); }
  const T &operator*() const noexcept { return *get(); }
  const T *operator->() const noexcept { return get(); }

  /** Access through a non-const wrapper. */
  operator T *() noexcept { return get(); }
  T &operator*() noexcept { return *get(); }
  T *operator->() noexcept { return get(); }

  /** Replace the stored pointer and return this wrapper. */
  atomic_ptr &operator=(T *_ptr) noexcept {
    set(_ptr);
    return *this;
  }

private:
#if defined(SIMDUTF_NO_THREADS)
  T *get() const noexcept { return ptr; }
  void set(T *_ptr) noexcept { ptr = _ptr; }
  T *ptr;
#else
  // Explicit load()/store() keep every accessor visibly atomic; they use the
  // same default ordering as the implicit conversion and assignment would.
  T *get() const noexcept { return ptr.load(); }
  void set(T *_ptr) noexcept { ptr.store(_ptr); }
  std::atomic<T *> ptr;
#endif
};
5524
5525
class detect_best_supported_implementation_on_first_use;
5526
5527
} // namespace internal
5528
5529
/**
5530
 * The list of available implementations compiled into simdutf.
5531
 */
5532
extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list &
5533
get_available_implementations();
5534
5535
/**
5536
 * The active implementation.
5537
 *
5538
 * Automatically initialized on first use to the most advanced implementation
5539
 * supported by this hardware.
5540
 */
5541
extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> &
5542
get_active_implementation();
5543
5544
} // namespace simdutf
5545
5546
#endif // SIMDUTF_IMPLEMENTATION_H