Coverage Report

Created: 2025-08-26 06:46

/work/include/simdutf/implementation.h
Line
Count
Source (jump to first uncovered line)
1
#ifndef SIMDUTF_IMPLEMENTATION_H
2
#define SIMDUTF_IMPLEMENTATION_H
3
#if !defined(SIMDUTF_NO_THREADS)
4
  #include <atomic>
5
#endif
6
#include <string>
7
#ifdef SIMDUTF_INTERNAL_TESTS
8
  #include <vector>
9
#endif
10
#include "simdutf/common_defs.h"
11
#include "simdutf/compiler_check.h"
12
#include "simdutf/encoding_types.h"
13
#include "simdutf/error.h"
14
#include "simdutf/internal/isadetection.h"
15
16
#if SIMDUTF_SPAN
17
  #include <concepts>
18
  #include <type_traits>
19
  #include <span>
20
  #include <tuple>
21
#endif
22
#if SIMDUTF_CPLUSPLUS17
23
  #include <string_view>
24
#endif
25
// The following defines are conditionally enabled/disabled during amalgamation.
26
// By default all features are enabled, regular code shouldn't check them. Only
27
// when user code really relies on a selected subset, it's good to verify these
28
// flags, like:
29
//
30
//      #if !SIMDUTF_FEATURE_UTF16
31
//      #   error("Please amalgamate simdutf with UTF-16 support")
32
//      #endif
33
//
34
#define SIMDUTF_FEATURE_DETECT_ENCODING 1
35
#define SIMDUTF_FEATURE_ASCII 1
36
#define SIMDUTF_FEATURE_LATIN1 1
37
#define SIMDUTF_FEATURE_UTF8 1
38
#define SIMDUTF_FEATURE_UTF16 1
39
#define SIMDUTF_FEATURE_UTF32 1
40
#define SIMDUTF_FEATURE_BASE64 1
41
42
namespace simdutf {
43
44
#if SIMDUTF_SPAN
45
/// helpers placed in namespace detail are not a part of the public API
46
namespace detail {
47
/**
48
 * matches a byte, in the many ways C++ allows. note that these
49
 * are all distinct types.
50
 */
51
template <typename T>
52
concept byte_like = std::is_same_v<T, std::byte> ||   //
53
                    std::is_same_v<T, char> ||        //
54
                    std::is_same_v<T, signed char> || //
55
                    std::is_same_v<T, unsigned char>;
56
57
template <typename T>
58
concept is_byte_like = byte_like<std::remove_cvref_t<T>>;
59
60
template <typename T>
61
concept is_pointer = std::is_pointer_v<T>;
62
63
/**
64
 * matches anything that behaves like std::span and points to character-like
65
 * data such as: std::byte, char, unsigned char, signed char, std::int8_t,
66
 * std::uint8_t
67
 */
68
template <typename T>
69
concept input_span_of_byte_like = requires(const T &t) {
70
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
71
  { t.data() } noexcept -> is_pointer;
72
  { *t.data() } noexcept -> is_byte_like;
73
};
74
75
template <typename T>
76
concept is_mutable = !std::is_const_v<std::remove_reference_t<T>>;
77
78
/**
79
 * like span_of_byte_like, but for an output span (intended to be written to)
80
 */
81
template <typename T>
82
concept output_span_of_byte_like = requires(T &t) {
83
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
84
  { t.data() } noexcept -> is_pointer;
85
  { *t.data() } noexcept -> is_byte_like;
86
  { *t.data() } noexcept -> is_mutable;
87
};
88
} // namespace detail
89
#endif
90
91
#if SIMDUTF_FEATURE_DETECT_ENCODING
92
/**
93
 * Autodetect the encoding of the input, a single encoding is recommended.
94
 * E.g., the function might return simdutf::encoding_type::UTF8,
95
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
96
 * simdutf::encoding_type::UTF32_LE.
97
 *
98
 * @param input the string to analyze.
99
 * @param length the length of the string in bytes.
100
 * @return the detected encoding type
101
 */
102
simdutf_warn_unused simdutf::encoding_type
103
autodetect_encoding(const char *input, size_t length) noexcept;
104
/// Convenience overload for uint8_t buffers: views the bytes as char and
/// forwards to the const char * overload.
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
autodetect_encoding(const uint8_t *input, size_t length) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input);
  return autodetect_encoding(bytes, length);
}
108
  #if SIMDUTF_SPAN
109
/**
110
 * Autodetect the encoding of the input, a single encoding is recommended.
111
 * E.g., the function might return simdutf::encoding_type::UTF8,
112
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
113
 * simdutf::encoding_type::UTF32_LE.
114
 *
115
 * @param input the string to analyze. can be anything span-like that has a
116
 * data() and size() that points to character data: std::string,
117
 * std::string_view, std::vector<char>, std::span<const std::byte> etc.
118
 * @return the detected encoding type
119
 */
120
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
autodetect_encoding(
    const detail::input_span_of_byte_like auto &input) noexcept {
  // Whatever the byte-like element type is, hand it on as plain char.
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  return autodetect_encoding(bytes, input.size());
}
126
  #endif // SIMDUTF_SPAN
127
128
/**
129
 * Autodetect the possible encodings of the input in one pass.
130
 * E.g., if the input might be UTF-16LE or UTF-8, this function returns
131
 * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE).
132
 *
133
 * Overridden by each implementation.
134
 *
135
 * @param input the string to analyze.
136
 * @param length the length of the string in bytes.
137
 * @return a bitwise OR of the possible simdutf::encoding_type values
138
 */
139
simdutf_warn_unused int detect_encodings(const char *input,
140
                                         size_t length) noexcept;
141
/// Convenience overload for uint8_t buffers: views the bytes as char and
/// forwards to the const char * overload.
simdutf_really_inline simdutf_warn_unused int
detect_encodings(const uint8_t *input, size_t length) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input);
  return detect_encodings(bytes, length);
}
145
  #if SIMDUTF_SPAN
146
/// Span-like convenience overload: forwards data()/size() of any byte-like
/// container to the const char * overload.
simdutf_really_inline simdutf_warn_unused int
detect_encodings(const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  return detect_encodings(bytes, input.size());
}
151
  #endif // SIMDUTF_SPAN
152
#endif   // SIMDUTF_FEATURE_DETECT_ENCODING
153
154
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
155
/**
156
 * Validate the UTF-8 string. This function may be best when you expect
157
 * the input to be almost always valid. Otherwise, consider using
158
 * validate_utf8_with_errors.
159
 *
160
 * Overridden by each implementation.
161
 *
162
 * @param buf the UTF-8 string to validate.
163
 * @param len the length of the string in bytes.
164
 * @return true if and only if the string is valid UTF-8.
165
 */
166
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
167
  #if SIMDUTF_SPAN
168
/// Span-like convenience overload: forwards data()/size() of any byte-like
/// container to the const char * overload.
simdutf_really_inline simdutf_warn_unused bool
validate_utf8(const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  return validate_utf8(bytes, input.size());
}
173
  #endif // SIMDUTF_SPAN
174
#endif   // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
175
176
#if SIMDUTF_FEATURE_UTF8
177
/**
178
 * Validate the UTF-8 string and stop on error.
179
 *
180
 * Overridden by each implementation.
181
 *
182
 * @param buf the UTF-8 string to validate.
183
 * @param len the length of the string in bytes.
184
 * @return a result pair struct (of type simdutf::result containing the two
185
 * fields error and count) with an error code and either position of the error
186
 * (in the input in code units) if any, or the number of code units validated if
187
 * successful.
188
 */
189
simdutf_warn_unused result validate_utf8_with_errors(const char *buf,
190
                                                     size_t len) noexcept;
191
  #if SIMDUTF_SPAN
192
/// Span-like convenience overload: forwards data()/size() of any byte-like
/// container to the const char * overload.
simdutf_really_inline simdutf_warn_unused result validate_utf8_with_errors(
    const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  return validate_utf8_with_errors(bytes, input.size());
}
197
  #endif // SIMDUTF_SPAN
198
#endif   // SIMDUTF_FEATURE_UTF8
199
200
#if SIMDUTF_FEATURE_ASCII
201
/**
202
 * Validate the ASCII string.
203
 *
204
 * Overridden by each implementation.
205
 *
206
 * @param buf the ASCII string to validate.
207
 * @param len the length of the string in bytes.
208
 * @return true if and only if the string is valid ASCII.
209
 */
210
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept;
211
  #if SIMDUTF_SPAN
212
/// Span-like convenience overload: forwards data()/size() of any byte-like
/// container to the const char * overload.
simdutf_really_inline simdutf_warn_unused bool
validate_ascii(const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  return validate_ascii(bytes, input.size());
}
217
  #endif // SIMDUTF_SPAN
218
219
/**
220
 * Validate the ASCII string and stop on error. It might be faster than
221
 * validate_ascii when an error is expected to occur early.
222
 *
223
 * Overridden by each implementation.
224
 *
225
 * @param buf the ASCII string to validate.
226
 * @param len the length of the string in bytes.
227
 * @return a result pair struct (of type simdutf::result containing the two
228
 * fields error and count) with an error code and either position of the error
229
 * (in the input in code units) if any, or the number of code units validated if
230
 * successful.
231
 */
232
simdutf_warn_unused result validate_ascii_with_errors(const char *buf,
233
                                                      size_t len) noexcept;
234
  #if SIMDUTF_SPAN
235
/// Span-like convenience overload: forwards data()/size() of any byte-like
/// container to the const char * overload.
simdutf_really_inline simdutf_warn_unused result validate_ascii_with_errors(
    const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  return validate_ascii_with_errors(bytes, input.size());
}
240
  #endif // SIMDUTF_SPAN
241
#endif   // SIMDUTF_FEATURE_ASCII
242
243
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
244
/**
245
 * Validate the ASCII string as a UTF-16 sequence.
246
 * An UTF-16 sequence is considered an ASCII sequence
247
 * if it could be converted to an ASCII string losslessly.
248
 *
249
 * Overridden by each implementation.
250
 *
251
 * @param buf the UTF-16 string to validate.
252
 * @param len the length of the string in number of 2-byte code units (char16_t).
253
 * @return true if and only if the string is valid ASCII.
254
 */
255
simdutf_warn_unused bool validate_utf16_as_ascii(const char16_t *buf,
256
                                                 size_t len) noexcept;
257
  #if SIMDUTF_SPAN
258
/// std::span convenience overload; the length passed on is the number of
/// char16_t elements in the span.
simdutf_really_inline simdutf_warn_unused bool
validate_utf16_as_ascii(std::span<const char16_t> input) noexcept {
  const char16_t *const buf = input.data();
  const size_t len = input.size();
  return validate_utf16_as_ascii(buf, len);
}
262
  #endif // SIMDUTF_SPAN
263
264
/**
265
 * Validate the ASCII string as a UTF-16BE sequence.
266
 * An UTF-16 sequence is considered an ASCII sequence
267
 * if it could be converted to an ASCII string losslessly.
268
 *
269
 * Overridden by each implementation.
270
 *
271
 * @param buf the UTF-16BE string to validate.
272
 * @param len the length of the string in number of 2-byte code units (char16_t).
273
 * @return true if and only if the string is valid ASCII.
274
 */
275
simdutf_warn_unused bool validate_utf16be_as_ascii(const char16_t *buf,
276
                                                   size_t len) noexcept;
277
  #if SIMDUTF_SPAN
278
/// std::span convenience overload; the length passed on is the number of
/// char16_t elements in the span.
simdutf_really_inline simdutf_warn_unused bool
validate_utf16be_as_ascii(std::span<const char16_t> input) noexcept {
  const char16_t *const buf = input.data();
  const size_t len = input.size();
  return validate_utf16be_as_ascii(buf, len);
}
282
  #endif // SIMDUTF_SPAN
283
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
284
285
/**
286
 * Validate the ASCII string as a UTF-16LE sequence.
287
 * An UTF-16 sequence is considered an ASCII sequence
288
 * if it could be converted to an ASCII string losslessly.
289
 *
290
 * Overridden by each implementation.
291
 *
292
 * @param buf the UTF-16LE string to validate.
293
 * @param len the length of the string in number of 2-byte code units (char16_t).
294
 * @return true if and only if the string is valid ASCII.
295
 */
296
simdutf_warn_unused bool validate_utf16le_as_ascii(const char16_t *buf,
297
                                                   size_t len) noexcept;
298
#if SIMDUTF_SPAN
299
/// std::span convenience overload; the length passed on is the number of
/// char16_t elements in the span.
/// NOTE(review): unlike the native-endian and BE variants, this function is
/// declared after the `#endif` of SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
/// -- confirm whether leaving it outside that feature guard is intended.
simdutf_really_inline simdutf_warn_unused bool
validate_utf16le_as_ascii(std::span<const char16_t> input) noexcept {
  const char16_t *const buf = input.data();
  const size_t len = input.size();
  return validate_utf16le_as_ascii(buf, len);
}
303
#endif // SIMDUTF_SPAN
304
305
#if SIMDUTF_FEATURE_UTF16
306
/**
307
 * Using native endianness; Validate the UTF-16 string.
308
 * This function may be best when you expect the input to be almost always
309
 * valid. Otherwise, consider using validate_utf16_with_errors.
310
 *
311
 * Overridden by each implementation.
312
 *
313
 * This function is not BOM-aware.
314
 *
315
 * @param buf the UTF-16 string to validate.
316
 * @param len the length of the string in number of 2-byte code units
317
 * (char16_t).
318
 * @return true if and only if the string is valid UTF-16.
319
 */
320
simdutf_warn_unused bool validate_utf16(const char16_t *buf,
321
                                        size_t len) noexcept;
322
  #if SIMDUTF_SPAN
323
/// std::span convenience overload; the length passed on is the number of
/// char16_t elements in the span.
simdutf_really_inline simdutf_warn_unused bool
validate_utf16(std::span<const char16_t> input) noexcept {
  const char16_t *const buf = input.data();
  const size_t len = input.size();
  return validate_utf16(buf, len);
}
327
  #endif // SIMDUTF_SPAN
328
#endif   // SIMDUTF_FEATURE_UTF16
329
330
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
331
/**
332
 * Validate the UTF-16LE string. This function may be best when you expect
333
 * the input to be almost always valid. Otherwise, consider using
334
 * validate_utf16le_with_errors.
335
 *
336
 * Overridden by each implementation.
337
 *
338
 * This function is not BOM-aware.
339
 *
340
 * @param buf the UTF-16LE string to validate.
341
 * @param len the length of the string in number of 2-byte code units
342
 * (char16_t).
343
 * @return true if and only if the string is valid UTF-16LE.
344
 */
345
simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
346
                                          size_t len) noexcept;
347
  #if SIMDUTF_SPAN
348
/// std::span convenience overload; the length passed on is the number of
/// char16_t elements in the span.
simdutf_really_inline simdutf_warn_unused bool
validate_utf16le(std::span<const char16_t> input) noexcept {
  const char16_t *const buf = input.data();
  const size_t len = input.size();
  return validate_utf16le(buf, len);
}
352
  #endif // SIMDUTF_SPAN
353
#endif   // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
354
355
#if SIMDUTF_FEATURE_UTF16
356
/**
357
 * Validate the UTF-16BE string. This function may be best when you expect
358
 * the input to be almost always valid. Otherwise, consider using
359
 * validate_utf16be_with_errors.
360
 *
361
 * Overridden by each implementation.
362
 *
363
 * This function is not BOM-aware.
364
 *
365
 * @param buf the UTF-16BE string to validate.
366
 * @param len the length of the string in number of 2-byte code units
367
 * (char16_t).
368
 * @return true if and only if the string is valid UTF-16BE.
369
 */
370
simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
371
                                          size_t len) noexcept;
372
  #if SIMDUTF_SPAN
373
/// std::span convenience overload; the length passed on is the number of
/// char16_t elements in the span.
simdutf_really_inline simdutf_warn_unused bool
validate_utf16be(std::span<const char16_t> input) noexcept {
  const char16_t *const buf = input.data();
  const size_t len = input.size();
  return validate_utf16be(buf, len);
}
377
  #endif // SIMDUTF_SPAN
378
379
/**
380
 * Using native endianness; Validate the UTF-16 string and stop on error.
381
 * It might be faster than validate_utf16 when an error is expected to occur
382
 * early.
383
 *
384
 * Overridden by each implementation.
385
 *
386
 * This function is not BOM-aware.
387
 *
388
 * @param buf the UTF-16 string to validate.
389
 * @param len the length of the string in number of 2-byte code units
390
 * (char16_t).
391
 * @return a result pair struct (of type simdutf::result containing the two
392
 * fields error and count) with an error code and either position of the error
393
 * (in the input in code units) if any, or the number of code units validated if
394
 * successful.
395
 */
396
simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf,
397
                                                      size_t len) noexcept;
398
  #if SIMDUTF_SPAN
399
/// std::span convenience overload; the length passed on is the number of
/// char16_t elements in the span.
simdutf_really_inline simdutf_warn_unused result
validate_utf16_with_errors(std::span<const char16_t> input) noexcept {
  const char16_t *const buf = input.data();
  const size_t len = input.size();
  return validate_utf16_with_errors(buf, len);
}
403
  #endif // SIMDUTF_SPAN
404
405
/**
406
 * Validate the UTF-16LE string and stop on error. It might be faster than
407
 * validate_utf16le when an error is expected to occur early.
408
 *
409
 * Overridden by each implementation.
410
 *
411
 * This function is not BOM-aware.
412
 *
413
 * @param buf the UTF-16LE string to validate.
414
 * @param len the length of the string in number of 2-byte code units
415
 * (char16_t).
416
 * @return a result pair struct (of type simdutf::result containing the two
417
 * fields error and count) with an error code and either position of the error
418
 * (in the input in code units) if any, or the number of code units validated if
419
 * successful.
420
 */
421
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf,
422
                                                        size_t len) noexcept;
423
  #if SIMDUTF_SPAN
424
/// std::span convenience overload; the length passed on is the number of
/// char16_t elements in the span.
simdutf_really_inline simdutf_warn_unused result
validate_utf16le_with_errors(std::span<const char16_t> input) noexcept {
  const char16_t *const buf = input.data();
  const size_t len = input.size();
  return validate_utf16le_with_errors(buf, len);
}
428
  #endif // SIMDUTF_SPAN
429
430
/**
431
 * Validate the UTF-16BE string and stop on error. It might be faster than
432
 * validate_utf16be when an error is expected to occur early.
433
 *
434
 * Overridden by each implementation.
435
 *
436
 * This function is not BOM-aware.
437
 *
438
 * @param buf the UTF-16BE string to validate.
439
 * @param len the length of the string in number of 2-byte code units
440
 * (char16_t).
441
 * @return a result pair struct (of type simdutf::result containing the two
442
 * fields error and count) with an error code and either position of the error
443
 * (in the input in code units) if any, or the number of code units validated if
444
 * successful.
445
 */
446
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf,
447
                                                        size_t len) noexcept;
448
  #if SIMDUTF_SPAN
449
/// std::span convenience overload; the length passed on is the number of
/// char16_t elements in the span.
simdutf_really_inline simdutf_warn_unused result
validate_utf16be_with_errors(std::span<const char16_t> input) noexcept {
  const char16_t *const buf = input.data();
  const size_t len = input.size();
  return validate_utf16be_with_errors(buf, len);
}
453
  #endif // SIMDUTF_SPAN
454
455
/**
456
 * Fixes an ill-formed UTF-16LE string by replacing mismatched surrogates with
457
 * the Unicode replacement character U+FFFD. If input and output points to
458
 * different memory areas, the procedure copies string, and it's expected that
459
 * output memory is at least as big as the input. It's also possible to set
460
 * input equal output, that makes replacements an in-place operation.
461
 *
462
 * @param input the UTF-16LE string to correct.
463
 * @param len the length of the string in number of 2-byte code units
464
 * (char16_t).
465
 * @param output the output buffer.
466
 */
467
void to_well_formed_utf16le(const char16_t *input, size_t len,
468
                            char16_t *output) noexcept;
469
  #if SIMDUTF_SPAN
470
/// std::span convenience overload. Only output.data() is forwarded;
/// output.size() is not consulted here, so the output span is expected to hold
/// at least input.size() code units.
simdutf_really_inline void
to_well_formed_utf16le(std::span<const char16_t> input,
                       std::span<char16_t> output) noexcept {
  const char16_t *const src = input.data();
  char16_t *const dst = output.data();
  to_well_formed_utf16le(src, input.size(), dst);
}
475
  #endif // SIMDUTF_SPAN
476
477
/**
478
 * Fixes an ill-formed UTF-16BE string by replacing mismatched surrogates with
479
 * the Unicode replacement character U+FFFD. If input and output points to
480
 * different memory areas, the procedure copies string, and it's expected that
481
 * output memory is at least as big as the input. It's also possible to set
482
 * input equal output, that makes replacements an in-place operation.
483
 *
484
 * @param input the UTF-16BE string to correct.
485
 * @param len the length of the string in number of 2-byte code units
486
 * (char16_t).
487
 * @param output the output buffer.
488
 */
489
void to_well_formed_utf16be(const char16_t *input, size_t len,
490
                            char16_t *output) noexcept;
491
  #if SIMDUTF_SPAN
492
/// std::span convenience overload. Only output.data() is forwarded;
/// output.size() is not consulted here, so the output span is expected to hold
/// at least input.size() code units.
simdutf_really_inline void
to_well_formed_utf16be(std::span<const char16_t> input,
                       std::span<char16_t> output) noexcept {
  const char16_t *const src = input.data();
  char16_t *const dst = output.data();
  to_well_formed_utf16be(src, input.size(), dst);
}
497
  #endif // SIMDUTF_SPAN
498
499
/**
500
 * Fixes an ill-formed UTF-16 string by replacing mismatched surrogates with the
501
 * Unicode replacement character U+FFFD. If input and output points to different
502
 * memory areas, the procedure copies string, and it's expected that output
503
 * memory is at least as big as the input. It's also possible to set input equal
504
 * output, that makes replacements an in-place operation.
505
 *
506
 * @param input the UTF-16 string to correct.
507
 * @param len the length of the string in number of 2-byte code units
508
 * (char16_t).
509
 * @param output the output buffer.
510
 */
511
void to_well_formed_utf16(const char16_t *input, size_t len,
512
                          char16_t *output) noexcept;
513
  #if SIMDUTF_SPAN
514
/// std::span convenience overload. Only output.data() is forwarded;
/// output.size() is not consulted here, so the output span is expected to hold
/// at least input.size() code units.
simdutf_really_inline void
to_well_formed_utf16(std::span<const char16_t> input,
                     std::span<char16_t> output) noexcept {
  const char16_t *const src = input.data();
  char16_t *const dst = output.data();
  to_well_formed_utf16(src, input.size(), dst);
}
519
  #endif // SIMDUTF_SPAN
520
521
#endif // SIMDUTF_FEATURE_UTF16
522
523
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
524
/**
525
 * Validate the UTF-32 string. This function may be best when you expect
526
 * the input to be almost always valid. Otherwise, consider using
527
 * validate_utf32_with_errors.
528
 *
529
 * Overridden by each implementation.
530
 *
531
 * This function is not BOM-aware.
532
 *
533
 * @param buf the UTF-32 string to validate.
534
 * @param len the length of the string in number of 4-byte code units
535
 * (char32_t).
536
 * @return true if and only if the string is valid UTF-32.
537
 */
538
simdutf_warn_unused bool validate_utf32(const char32_t *buf,
539
                                        size_t len) noexcept;
540
  #if SIMDUTF_SPAN
541
/// std::span convenience overload; the length passed on is the number of
/// char32_t elements in the span.
simdutf_really_inline simdutf_warn_unused bool
validate_utf32(std::span<const char32_t> input) noexcept {
  const char32_t *const buf = input.data();
  const size_t len = input.size();
  return validate_utf32(buf, len);
}
545
  #endif // SIMDUTF_SPAN
546
#endif   // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
547
548
#if SIMDUTF_FEATURE_UTF32
549
/**
550
 * Validate the UTF-32 string and stop on error. It might be faster than
551
 * validate_utf32 when an error is expected to occur early.
552
 *
553
 * Overridden by each implementation.
554
 *
555
 * This function is not BOM-aware.
556
 *
557
 * @param buf the UTF-32 string to validate.
558
 * @param len the length of the string in number of 4-byte code units
559
 * (char32_t).
560
 * @return a result pair struct (of type simdutf::result containing the two
561
 * fields error and count) with an error code and either position of the error
562
 * (in the input in code units) if any, or the number of code units validated if
563
 * successful.
564
 */
565
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf,
566
                                                      size_t len) noexcept;
567
  #if SIMDUTF_SPAN
568
/// std::span convenience overload; the length passed on is the number of
/// char32_t elements in the span.
simdutf_really_inline simdutf_warn_unused result
validate_utf32_with_errors(std::span<const char32_t> input) noexcept {
  const char32_t *const buf = input.data();
  const size_t len = input.size();
  return validate_utf32_with_errors(buf, len);
}
572
  #endif // SIMDUTF_SPAN
573
#endif   // SIMDUTF_FEATURE_UTF32
574
575
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
576
/**
577
 * Convert Latin1 string into UTF-8 string.
578
 *
579
 * This function is suitable to work with inputs from untrusted sources.
580
 *
581
 * @param input         the Latin1 string to convert
582
 * @param length        the length of the string in bytes
583
 * @param utf8_output   the pointer to buffer that can hold conversion result
584
 * @return the number of written char; 0 if conversion is not possible
585
 */
586
simdutf_warn_unused size_t convert_latin1_to_utf8(const char *input,
587
                                                  size_t length,
588
                                                  char *utf8_output) noexcept;
589
  #if SIMDUTF_SPAN
590
/// Span-like convenience overload: accepts any byte-like input container and
/// any mutable byte-like output container, and forwards to the pointer
/// overload. Only utf8_output.data() is used; utf8_output.size() is not
/// consulted (see convert_latin1_to_utf8_safe for a bounded variant).
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf8(
    const detail::input_span_of_byte_like auto &latin1_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  // Both pointers must be cast: the concepts admit std::byte / signed char /
  // unsigned char elements, none of which convert implicitly to (const) char *.
  return convert_latin1_to_utf8(
      reinterpret_cast<const char *>(latin1_input.data()), latin1_input.size(),
      reinterpret_cast<char *>(utf8_output.data()));
}
597
  #endif // SIMDUTF_SPAN
598
599
/**
600
 * Convert Latin1 string into UTF-8 string with output limit.
601
 *
602
 * This function is suitable to work with inputs from untrusted sources.
603
 *
604
 * We write as many characters as possible.
605
 *
606
 * @param input         the Latin1 string to convert
607
 * @param length        the length of the string in bytes
608
 * @param utf8_output   the pointer to buffer that can hold conversion result
609
 * @param utf8_len      the maximum output length
610
 * @return the number of written char; 0 if conversion is not possible
611
 */
612
simdutf_warn_unused size_t
613
convert_latin1_to_utf8_safe(const char *input, size_t length, char *utf8_output,
614
                            size_t utf8_len) noexcept;
615
  #if SIMDUTF_SPAN
616
/// Span-like convenience overload: accepts any byte-like input container and
/// any mutable byte-like output container; utf8_output.size() is forwarded as
/// the maximum output length.
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf8_safe(
    const detail::input_span_of_byte_like auto &input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  // implementation note: outputspan is a forwarding ref to avoid copying and
  // allow both lvalues and rvalues. std::span can be copied without problems,
  // but std::vector should not, and this function should accept both. it will
  // allow using an owning rvalue ref (example: passing a temporary std::string)
  // as output, but the user will quickly find out that he has no way of getting
  // the data out of the object in that case.
  //
  // The input pointer must be cast as well: the input concept admits
  // std::byte / signed char / unsigned char elements, which do not convert
  // implicitly to const char *.
  return convert_latin1_to_utf8_safe(
      reinterpret_cast<const char *>(input.data()), input.size(),
      reinterpret_cast<char *>(utf8_output.data()), utf8_output.size());
}
629
  #endif // SIMDUTF_SPAN
630
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
631
632
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
633
/**
634
 * Convert Latin1 string into UTF-16LE string.
635
 *
636
 * This function is suitable to work with inputs from untrusted sources.
637
 *
638
 * @param input         the Latin1 string to convert
639
 * @param length        the length of the string in bytes
640
 * @param utf16_output  the pointer to buffer that can hold conversion result
641
 * @return the number of written char16_t; 0 if conversion is not possible
642
 */
643
simdutf_warn_unused size_t convert_latin1_to_utf16le(
644
    const char *input, size_t length, char16_t *utf16_output) noexcept;
645
  #if SIMDUTF_SPAN
646
/// Span-like convenience overload: forwards any byte-like input container to
/// the pointer overload. Only utf16_output.data() is used; its size() is not
/// consulted here.
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf16le(
    const detail::input_span_of_byte_like auto &latin1_input,
    std::span<char16_t> utf16_output) noexcept {
  const char *const src = reinterpret_cast<const char *>(latin1_input.data());
  char16_t *const dst = utf16_output.data();
  return convert_latin1_to_utf16le(src, latin1_input.size(), dst);
}
653
  #endif // SIMDUTF_SPAN
654
655
/**
656
 * Convert Latin1 string into UTF-16BE string.
657
 *
658
 * This function is suitable to work with inputs from untrusted sources.
659
 *
660
 * @param input         the Latin1 string to convert
661
 * @param length        the length of the string in bytes
662
 * @param utf16_output  the pointer to buffer that can hold conversion result
663
 * @return the number of written char16_t; 0 if conversion is not possible
664
 */
665
simdutf_warn_unused size_t convert_latin1_to_utf16be(
666
    const char *input, size_t length, char16_t *utf16_output) noexcept;
667
  #if SIMDUTF_SPAN
668
/// Span-like convenience overload: forwards any byte-like input container to
/// the pointer overload. Only output.data() is used; its size() is not
/// consulted here.
simdutf_really_inline simdutf_warn_unused size_t
convert_latin1_to_utf16be(const detail::input_span_of_byte_like auto &input,
                          std::span<char16_t> output) noexcept {
  const char *const src = reinterpret_cast<const char *>(input.data());
  char16_t *const dst = output.data();
  return convert_latin1_to_utf16be(src, input.size(), dst);
}
674
  #endif // SIMDUTF_SPAN
675
/**
676
 * Compute the number of bytes that this UTF-16 string would require in Latin1
677
 * format.
678
 *
679
 * @param length        the length of the string in 2-byte code units (char16_t)
680
 * @return the length of the string in Latin1 code units (char) required to
681
 * encode the UTF-16 string as Latin1
682
 */
683
simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
684
685
/**
686
 * Compute the number of code units that this Latin1 string would require in
687
 * UTF-16 format.
688
 *
689
 * @param length        the length of the string in Latin1 code units (char)
690
 * @return the length of the string in 2-byte code units (char16_t) required to
691
 * encode the Latin1 string as UTF-16
692
 */
693
simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept;
694
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
695
696
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
697
/**
698
 * Convert Latin1 string into UTF-32 string.
699
 *
700
 * This function is suitable to work with inputs from untrusted sources.
701
 *
702
 * @param input         the Latin1 string to convert
703
 * @param length        the length of the string in bytes
704
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
705
 * @return the number of written char32_t; 0 if conversion is not possible
706
 */
707
simdutf_warn_unused size_t convert_latin1_to_utf32(
708
    const char *input, size_t length, char32_t *utf32_buffer) noexcept;
709
  #if SIMDUTF_SPAN
710
/// Span-like convenience overload: forwards any byte-like input container to
/// the pointer overload. Only utf32_output.data() is used; its size() is not
/// consulted here.
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf32(
    const detail::input_span_of_byte_like auto &latin1_input,
    std::span<char32_t> utf32_output) noexcept {
  const char *const src = reinterpret_cast<const char *>(latin1_input.data());
  char32_t *const dst = utf32_output.data();
  return convert_latin1_to_utf32(src, latin1_input.size(), dst);
}
717
  #endif // SIMDUTF_SPAN
718
#endif   // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
719
720
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
721
/**
722
 * Convert possibly broken UTF-8 string into latin1 string.
723
 *
724
 * During the conversion also validation of the input string is done.
725
 * This function is suitable to work with inputs from untrusted sources.
726
 *
727
 * @param input         the UTF-8 string to convert
728
 * @param length        the length of the string in bytes
729
 * @param latin1_output  the pointer to buffer that can hold conversion result
730
 * @return the number of written char; 0 if the input was not valid UTF-8 string
731
 * or if it cannot be represented as Latin1
732
 */
733
simdutf_warn_unused size_t convert_utf8_to_latin1(const char *input,
734
                                                  size_t length,
735
                                                  char *latin1_output) noexcept;
736
  #if SIMDUTF_SPAN
737
/// Span-like convenience overload: accepts any byte-like input container and
/// any mutable byte-like output container, and forwards to the pointer
/// overload. Only output.data() is used; its size() is not consulted here.
simdutf_really_inline simdutf_warn_unused size_t convert_utf8_to_latin1(
    const detail::input_span_of_byte_like auto &input,
    detail::output_span_of_byte_like auto &&output) noexcept {
  const char *const src = reinterpret_cast<const char *>(input.data());
  char *const dst = reinterpret_cast<char *>(output.data());
  return convert_utf8_to_latin1(src, input.size(), dst);
}
744
  #endif // SIMDUTF_SPAN
745
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
746
747
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
748
/**
749
 * Using native endianness, convert possibly broken UTF-8 string into a UTF-16
750
 * string.
751
 *
752
 * During the conversion also validation of the input string is done.
753
 * This function is suitable to work with inputs from untrusted sources.
754
 *
755
 * @param input         the UTF-8 string to convert
756
 * @param length        the length of the string in bytes
757
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
758
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
759
 * string
760
 */
761
simdutf_warn_unused size_t convert_utf8_to_utf16(
762
    const char *input, size_t length, char16_t *utf16_output) noexcept;
763
  #if SIMDUTF_SPAN
764
simdutf_really_inline simdutf_warn_unused size_t
765
convert_utf8_to_utf16(const detail::input_span_of_byte_like auto &input,
766
                      std::span<char16_t> output) noexcept {
767
  return convert_utf8_to_utf16(reinterpret_cast<const char *>(input.data()),
768
                               input.size(), output.data());
769
}
770
  #endif // SIMDUTF_SPAN
771
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
772
773
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
774
/**
775
 * Using native endianness, convert a Latin1 string into a UTF-16 string.
776
 *
777
 * @param input         the Latin1 string to convert
778
 * @param length        the length of the string in bytes
779
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
780
 * @return the number of written char16_t.
781
 */
782
simdutf_warn_unused size_t convert_latin1_to_utf16(
783
    const char *input, size_t length, char16_t *utf16_output) noexcept;
784
  #if SIMDUTF_SPAN
785
simdutf_really_inline simdutf_warn_unused size_t
786
convert_latin1_to_utf16(const detail::input_span_of_byte_like auto &input,
787
                        std::span<char16_t> output) noexcept {
788
  return convert_latin1_to_utf16(reinterpret_cast<const char *>(input.data()),
789
                                 input.size(), output.data());
790
}
791
  #endif // SIMDUTF_SPAN
792
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
793
794
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
795
/**
796
 * Convert possibly broken UTF-8 string into UTF-16LE string.
797
 *
798
 * During the conversion also validation of the input string is done.
799
 * This function is suitable to work with inputs from untrusted sources.
800
 *
801
 * @param input         the UTF-8 string to convert
802
 * @param length        the length of the string in bytes
803
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
804
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
805
 * string
806
 */
807
simdutf_warn_unused size_t convert_utf8_to_utf16le(
808
    const char *input, size_t length, char16_t *utf16_output) noexcept;
809
  #if SIMDUTF_SPAN
810
simdutf_really_inline simdutf_warn_unused size_t
811
convert_utf8_to_utf16le(const detail::input_span_of_byte_like auto &utf8_input,
812
                        std::span<char16_t> utf16_output) noexcept {
813
  return convert_utf8_to_utf16le(
814
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
815
      utf16_output.data());
816
}
817
  #endif // SIMDUTF_SPAN
818
819
/**
820
 * Convert possibly broken UTF-8 string into UTF-16BE string.
821
 *
822
 * During the conversion also validation of the input string is done.
823
 * This function is suitable to work with inputs from untrusted sources.
824
 *
825
 * @param input         the UTF-8 string to convert
826
 * @param length        the length of the string in bytes
827
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
828
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
829
 * string
830
 */
831
simdutf_warn_unused size_t convert_utf8_to_utf16be(
832
    const char *input, size_t length, char16_t *utf16_output) noexcept;
833
  #if SIMDUTF_SPAN
834
simdutf_really_inline simdutf_warn_unused size_t
835
convert_utf8_to_utf16be(const detail::input_span_of_byte_like auto &utf8_input,
836
                        std::span<char16_t> utf16_output) noexcept {
837
  return convert_utf8_to_utf16be(
838
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
839
      utf16_output.data());
840
}
841
  #endif // SIMDUTF_SPAN
842
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
843
844
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
845
/**
846
 * Convert possibly broken UTF-8 string into latin1 string with errors.
847
 * If the string cannot be represented as Latin1, an error
848
 * code is returned.
849
 *
850
 * During the conversion also validation of the input string is done.
851
 * This function is suitable to work with inputs from untrusted sources.
852
 *
853
 * @param input         the UTF-8 string to convert
854
 * @param length        the length of the string in bytes
855
 * @param latin1_output  the pointer to buffer that can hold conversion result
856
 * @return a result pair struct (of type simdutf::result containing the two
857
 * fields error and count) with an error code and either position of the error
858
 * (in the input in code units) if any, or the number of code units validated if
859
 * successful.
860
 */
861
simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
862
    const char *input, size_t length, char *latin1_output) noexcept;
863
  #if SIMDUTF_SPAN
864
simdutf_really_inline simdutf_warn_unused result
865
convert_utf8_to_latin1_with_errors(
866
    const detail::input_span_of_byte_like auto &utf8_input,
867
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
868
  return convert_utf8_to_latin1_with_errors(
869
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
870
      reinterpret_cast<char *>(latin1_output.data()));
871
}
872
  #endif // SIMDUTF_SPAN
873
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
874
875
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
876
/**
877
 * Using native endianness, convert possibly broken UTF-8 string into UTF-16
878
 * string and stop on error.
879
 *
880
 * During the conversion also validation of the input string is done.
881
 * This function is suitable to work with inputs from untrusted sources.
882
 *
883
 * @param input         the UTF-8 string to convert
884
 * @param length        the length of the string in bytes
885
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
886
 * @return a result pair struct (of type simdutf::result containing the two
887
 * fields error and count) with an error code and either position of the error
888
 * (in the input in code units) if any, or the number of char16_t written if
889
 * successful.
890
 */
891
simdutf_warn_unused result convert_utf8_to_utf16_with_errors(
892
    const char *input, size_t length, char16_t *utf16_output) noexcept;
893
  #if SIMDUTF_SPAN
894
simdutf_really_inline simdutf_warn_unused result
895
convert_utf8_to_utf16_with_errors(
896
    const detail::input_span_of_byte_like auto &utf8_input,
897
    std::span<char16_t> utf16_output) noexcept {
898
  return convert_utf8_to_utf16_with_errors(
899
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
900
      utf16_output.data());
901
}
902
  #endif // SIMDUTF_SPAN
903
904
/**
905
 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
906
 *
907
 * During the conversion also validation of the input string is done.
908
 * This function is suitable to work with inputs from untrusted sources.
909
 *
910
 * @param input         the UTF-8 string to convert
911
 * @param length        the length of the string in bytes
912
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
913
 * @return a result pair struct (of type simdutf::result containing the two
914
 * fields error and count) with an error code and either position of the error
915
 * (in the input in code units) if any, or the number of char16_t written if
916
 * successful.
917
 */
918
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
919
    const char *input, size_t length, char16_t *utf16_output) noexcept;
920
  #if SIMDUTF_SPAN
921
simdutf_really_inline simdutf_warn_unused result
922
convert_utf8_to_utf16le_with_errors(
923
    const detail::input_span_of_byte_like auto &utf8_input,
924
    std::span<char16_t> utf16_output) noexcept {
925
  return convert_utf8_to_utf16le_with_errors(
926
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
927
      utf16_output.data());
928
}
929
  #endif // SIMDUTF_SPAN
930
931
/**
932
 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
933
 *
934
 * During the conversion also validation of the input string is done.
935
 * This function is suitable to work with inputs from untrusted sources.
936
 *
937
 * @param input         the UTF-8 string to convert
938
 * @param length        the length of the string in bytes
939
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
940
 * @return a result pair struct (of type simdutf::result containing the two
941
 * fields error and count) with an error code and either position of the error
942
 * (in the input in code units) if any, or the number of char16_t written if
943
 * successful.
944
 */
945
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
946
    const char *input, size_t length, char16_t *utf16_output) noexcept;
947
  #if SIMDUTF_SPAN
948
simdutf_really_inline simdutf_warn_unused result
949
convert_utf8_to_utf16be_with_errors(
950
    const detail::input_span_of_byte_like auto &utf8_input,
951
    std::span<char16_t> utf16_output) noexcept {
952
  return convert_utf8_to_utf16be_with_errors(
953
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
954
      utf16_output.data());
955
}
956
  #endif // SIMDUTF_SPAN
957
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
958
959
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
960
/**
961
 * Convert possibly broken UTF-8 string into UTF-32 string.
962
 *
963
 * During the conversion also validation of the input string is done.
964
 * This function is suitable to work with inputs from untrusted sources.
965
 *
966
 * @param input         the UTF-8 string to convert
967
 * @param length        the length of the string in bytes
968
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
969
 * @return the number of written char32_t; 0 if the input was not valid UTF-8
970
 * string
971
 */
972
simdutf_warn_unused size_t convert_utf8_to_utf32(
973
    const char *input, size_t length, char32_t *utf32_output) noexcept;
974
  #if SIMDUTF_SPAN
975
simdutf_really_inline simdutf_warn_unused size_t
976
convert_utf8_to_utf32(const detail::input_span_of_byte_like auto &utf8_input,
977
                      std::span<char32_t> utf32_output) noexcept {
978
  return convert_utf8_to_utf32(
979
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
980
      utf32_output.data());
981
}
982
  #endif // SIMDUTF_SPAN
983
984
/**
985
 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
986
 *
987
 * During the conversion also validation of the input string is done.
988
 * This function is suitable to work with inputs from untrusted sources.
989
 *
990
 * @param input         the UTF-8 string to convert
991
 * @param length        the length of the string in bytes
992
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
993
 * @return a result pair struct (of type simdutf::result containing the two
994
 * fields error and count) with an error code and either position of the error
995
 * (in the input in code units) if any, or the number of char32_t written if
996
 * successful.
997
 */
998
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
999
    const char *input, size_t length, char32_t *utf32_output) noexcept;
1000
  #if SIMDUTF_SPAN
1001
simdutf_really_inline simdutf_warn_unused result
1002
convert_utf8_to_utf32_with_errors(
1003
    const detail::input_span_of_byte_like auto &utf8_input,
1004
    std::span<char32_t> utf32_output) noexcept {
1005
  return convert_utf8_to_utf32_with_errors(
1006
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1007
      utf32_output.data());
1008
}
1009
  #endif // SIMDUTF_SPAN
1010
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1011
1012
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1013
/**
1014
 * Convert valid UTF-8 string into latin1 string.
1015
 *
1016
 * This function assumes that the input string is valid UTF-8 and that it can be
1017
 * represented as Latin1. If you violate this assumption, the result is
1018
 * implementation defined and may include system-dependent behavior such as
1019
 * crashes.
1020
 *
1021
 * This function is for expert users only and not part of our public API. Use
1022
 * convert_utf8_to_latin1 instead. The function may be removed from the library
1023
 * in the future.
1024
 *
1025
 * This function is not BOM-aware.
1026
 *
1027
 * @param input         the UTF-8 string to convert
1028
 * @param length        the length of the string in bytes
1029
 * @param latin1_output  the pointer to buffer that can hold conversion result
1030
 * @return the number of written char; 0 if the input was not valid UTF-8 string
1031
 */
1032
simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
1033
    const char *input, size_t length, char *latin1_output) noexcept;
1034
  #if SIMDUTF_SPAN
1035
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
1036
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1037
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1038
  return convert_valid_utf8_to_latin1(
1039
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1040
      valid_utf8_input.size(), latin1_output.data());
1041
}
1042
  #endif // SIMDUTF_SPAN
1043
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1044
1045
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1046
/**
1047
 * Using native endianness, convert valid UTF-8 string into a UTF-16 string.
1048
 *
1049
 * This function assumes that the input string is valid UTF-8.
1050
 *
1051
 * @param input         the UTF-8 string to convert
1052
 * @param length        the length of the string in bytes
1053
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1054
 * @return the number of written char16_t
1055
 */
1056
simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
1057
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1058
  #if SIMDUTF_SPAN
1059
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
1060
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1061
    std::span<char16_t> utf16_output) noexcept {
1062
  return convert_valid_utf8_to_utf16(
1063
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1064
      valid_utf8_input.size(), utf16_output.data());
1065
}
1066
  #endif // SIMDUTF_SPAN
1067
1068
/**
1069
 * Convert valid UTF-8 string into UTF-16LE string.
1070
 *
1071
 * This function assumes that the input string is valid UTF-8.
1072
 *
1073
 * @param input         the UTF-8 string to convert
1074
 * @param length        the length of the string in bytes
1075
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1076
 * @return the number of written char16_t
1077
 */
1078
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
1079
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1080
  #if SIMDUTF_SPAN
1081
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
1082
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1083
    std::span<char16_t> utf16_output) noexcept {
1084
  return convert_valid_utf8_to_utf16le(
1085
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1086
      valid_utf8_input.size(), utf16_output.data());
1087
}
1088
  #endif // SIMDUTF_SPAN
1089
1090
/**
1091
 * Convert valid UTF-8 string into UTF-16BE string.
1092
 *
1093
 * This function assumes that the input string is valid UTF-8.
1094
 *
1095
 * @param input         the UTF-8 string to convert
1096
 * @param length        the length of the string in bytes
1097
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1098
 * @return the number of written char16_t
1099
 */
1100
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
1101
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1102
  #if SIMDUTF_SPAN
1103
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
1104
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1105
    std::span<char16_t> utf16_output) noexcept {
1106
  return convert_valid_utf8_to_utf16be(
1107
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1108
      valid_utf8_input.size(), utf16_output.data());
1109
}
1110
  #endif // SIMDUTF_SPAN
1111
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1112
1113
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1114
/**
1115
 * Convert valid UTF-8 string into UTF-32 string.
1116
 *
1117
 * This function assumes that the input string is valid UTF-8.
1118
 *
1119
 * @param input         the UTF-8 string to convert
1120
 * @param length        the length of the string in bytes
1121
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
1122
 * @return the number of written char32_t
1123
 */
1124
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
1125
    const char *input, size_t length, char32_t *utf32_buffer) noexcept;
1126
  #if SIMDUTF_SPAN
1127
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
1128
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1129
    std::span<char32_t> utf32_output) noexcept {
1130
  return convert_valid_utf8_to_utf32(
1131
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1132
      valid_utf8_input.size(), utf32_output.data());
1133
}
1134
  #endif // SIMDUTF_SPAN
1135
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1136
1137
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1138
/**
1139
 * Return the number of bytes that this Latin1 string would require in UTF-8
1140
 * format.
1141
 *
1142
 * @param input         the Latin1 string to convert
1143
 * @param length        the length of the string bytes
1144
 * @return the number of bytes required to encode the Latin1 string as UTF-8
1145
 */
1146
simdutf_warn_unused size_t utf8_length_from_latin1(const char *input,
1147
                                                   size_t length) noexcept;
1148
  #if SIMDUTF_SPAN
1149
simdutf_really_inline simdutf_warn_unused size_t utf8_length_from_latin1(
1150
    const detail::input_span_of_byte_like auto &latin1_input) noexcept {
1151
  return utf8_length_from_latin1(
1152
      reinterpret_cast<const char *>(latin1_input.data()), latin1_input.size());
1153
}
1154
  #endif // SIMDUTF_SPAN
1155
1156
/**
1157
 * Compute the number of bytes that this UTF-8 string would require in Latin1
1158
 * format.
1159
 *
1160
 * This function does not validate the input. It is acceptable to pass invalid
1161
 * UTF-8 strings but in such cases the result is implementation defined.
1162
 *
1163
 * This function is not BOM-aware.
1164
 *
1165
 * @param input         the UTF-8 string to convert
1166
 * @param length        the length of the string in byte
1167
 * @return the number of bytes required to encode the UTF-8 string as Latin1
1168
 */
1169
simdutf_warn_unused size_t latin1_length_from_utf8(const char *input,
1170
                                                   size_t length) noexcept;
1171
  #if SIMDUTF_SPAN
1172
simdutf_really_inline simdutf_warn_unused size_t latin1_length_from_utf8(
1173
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
1174
  return latin1_length_from_utf8(
1175
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1176
      valid_utf8_input.size());
1177
}
1178
  #endif // SIMDUTF_SPAN
1179
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1180
1181
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1182
/**
1183
 * Compute the number of 2-byte code units that this UTF-8 string would require
1184
 * in UTF-16LE format.
1185
 *
1186
 * This function does not validate the input. It is acceptable to pass invalid
1187
 * UTF-8 strings but in such cases the result is implementation defined.
1188
 *
1189
 * This function is not BOM-aware.
1190
 *
1191
 * @param input         the UTF-8 string to process
1192
 * @param length        the length of the string in bytes
1193
 * @return the number of char16_t code units required to encode the UTF-8 string
1194
 * as UTF-16LE
1195
 */
1196
simdutf_warn_unused size_t utf16_length_from_utf8(const char *input,
1197
                                                  size_t length) noexcept;
1198
  #if SIMDUTF_SPAN
1199
simdutf_really_inline simdutf_warn_unused size_t utf16_length_from_utf8(
1200
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
1201
  return utf16_length_from_utf8(
1202
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1203
      valid_utf8_input.size());
1204
}
1205
  #endif // SIMDUTF_SPAN
1206
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1207
1208
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1209
/**
1210
 * Compute the number of 4-byte code units that this UTF-8 string would require
1211
 * in UTF-32 format.
1212
 *
1213
 * This function is equivalent to count_utf8
1214
 *
1215
 * This function does not validate the input. It is acceptable to pass invalid
1216
 * UTF-8 strings but in such cases the result is implementation defined.
1217
 *
1218
 * This function is not BOM-aware.
1219
 *
1220
 * @param input         the UTF-8 string to process
1221
 * @param length        the length of the string in bytes
1222
 * @return the number of char32_t code units required to encode the UTF-8 string
1223
 * as UTF-32
1224
 */
1225
simdutf_warn_unused size_t utf32_length_from_utf8(const char *input,
1226
                                                  size_t length) noexcept;
1227
  #if SIMDUTF_SPAN
1228
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf8(
1229
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
1230
  return utf32_length_from_utf8(
1231
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1232
      valid_utf8_input.size());
1233
}
1234
  #endif // SIMDUTF_SPAN
1235
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1236
1237
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1238
/**
1239
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1240
 * string.
1241
 *
1242
 * During the conversion also validation of the input string is done.
1243
 * This function is suitable to work with inputs from untrusted sources.
1244
 *
1245
 * This function is not BOM-aware.
1246
 *
1247
 * @param input         the UTF-16 string to convert
1248
 * @param length        the length of the string in 2-byte code units (char16_t)
1249
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1250
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1251
 * string
1252
 */
1253
simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *input,
1254
                                                 size_t length,
1255
                                                 char *utf8_buffer) noexcept;
1256
  #if SIMDUTF_SPAN
1257
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_utf8(
1258
    std::span<const char16_t> utf16_input,
1259
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1260
  return convert_utf16_to_utf8(utf16_input.data(), utf16_input.size(),
1261
                               reinterpret_cast<char *>(utf8_output.data()));
1262
}
1263
  #endif // SIMDUTF_SPAN
1264
1265
/**
1266
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1267
 * string with output limit.
1268
 *
1269
 * We write as many characters as possible into the output buffer,
1270
 *
1271
 * During the conversion also validation of the input string is done.
1272
 * This function is suitable to work with inputs from untrusted sources.
1273
 *
1274
 * This function is not BOM-aware.
1275
 *
1276
 *
1277
 * @param input         the UTF-16 string to convert
1278
 * @param length        the length of the string in 16-bit code units (char16_t)
1279
 * @param utf8_output   the pointer to buffer that can hold conversion result
1280
 * @param utf8_len      the maximum output length
1281
 * @return the number of written char; 0 if conversion is not possible
1282
 */
1283
simdutf_warn_unused size_t convert_utf16_to_utf8_safe(const char16_t *input,
1284
                                                      size_t length,
1285
                                                      char *utf8_output,
1286
                                                      size_t utf8_len) noexcept;
1287
  #if SIMDUTF_SPAN
1288
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_utf8_safe(
1289
    std::span<const char16_t> utf16_input,
1290
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1291
  // implementation note: outputspan is a forwarding ref to avoid copying and
1292
  // allow both lvalues and rvalues. std::span can be copied without problems,
1293
  // but std::vector should not, and this function should accept both. it will
1294
  // allow using an owning rvalue ref (example: passing a temporary std::string)
1295
  // as output, but the user will quickly find out that he has no way of getting
1296
  // the data out of the object in that case.
1297
  return convert_utf16_to_utf8_safe(
1298
      utf16_input.data(), utf16_input.size(),
1299
      reinterpret_cast<char *>(utf8_output.data()), utf8_output.size());
1300
}
1301
  #endif // SIMDUTF_SPAN
1302
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1303
1304
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1305
/**
1306
 * Using native endianness, convert possibly broken UTF-16 string into Latin1
1307
 * string.
1308
 *
1309
 * During the conversion also validation of the input string is done.
1310
 * This function is suitable to work with inputs from untrusted sources.
1311
 *
1312
 * This function is not BOM-aware.
1313
 *
1314
 * @param input         the UTF-16 string to convert
1315
 * @param length        the length of the string in 2-byte code units (char16_t)
1316
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1317
 * @return number of written code units; 0 if input is not a valid UTF-16 string
1318
 * or if it cannot be represented as Latin1
1319
 */
1320
simdutf_warn_unused size_t convert_utf16_to_latin1(
1321
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1322
  #if SIMDUTF_SPAN
1323
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_latin1(
1324
    std::span<const char16_t> utf16_input,
1325
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1326
  return convert_utf16_to_latin1(
1327
      utf16_input.data(), utf16_input.size(),
1328
      reinterpret_cast<char *>(latin1_output.data()));
1329
}
1330
  #endif // SIMDUTF_SPAN
1331
1332
/**
1333
 * Convert possibly broken UTF-16LE string into Latin1 string.
1334
 * If the string cannot be represented as Latin1, an error
1335
 * is returned.
1336
 *
1337
 * During the conversion also validation of the input string is done.
1338
 * This function is suitable to work with inputs from untrusted sources.
1339
 *
1340
 * This function is not BOM-aware.
1341
 *
1342
 * @param input         the UTF-16LE string to convert
1343
 * @param length        the length of the string in 2-byte code units (char16_t)
1344
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1345
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1346
 * string or if it cannot be represented as Latin1
1347
 */
1348
simdutf_warn_unused size_t convert_utf16le_to_latin1(
1349
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1350
  #if SIMDUTF_SPAN
1351
simdutf_really_inline simdutf_warn_unused size_t convert_utf16le_to_latin1(
1352
    std::span<const char16_t> utf16_input,
1353
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1354
  return convert_utf16le_to_latin1(
1355
      utf16_input.data(), utf16_input.size(),
1356
      reinterpret_cast<char *>(latin1_output.data()));
1357
}
1358
  #endif // SIMDUTF_SPAN
1359
1360
/**
1361
 * Convert possibly broken UTF-16BE string into Latin1 string.
1362
 *
1363
 * During the conversion also validation of the input string is done.
1364
 * This function is suitable to work with inputs from untrusted sources.
1365
 *
1366
 * This function is not BOM-aware.
1367
 *
1368
 * @param input         the UTF-16BE string to convert
1369
 * @param length        the length of the string in 2-byte code units (char16_t)
1370
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1371
 * @return number of written code units; 0 if input is not a valid UTF-16BE
1372
 * string or if it cannot be represented as Latin1
1373
 */
1374
simdutf_warn_unused size_t convert_utf16be_to_latin1(
1375
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1376
  #if SIMDUTF_SPAN
1377
simdutf_really_inline simdutf_warn_unused size_t convert_utf16be_to_latin1(
1378
    std::span<const char16_t> utf16_input,
1379
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1380
  return convert_utf16be_to_latin1(
1381
      utf16_input.data(), utf16_input.size(),
1382
      reinterpret_cast<char *>(latin1_output.data()));
1383
}
1384
  #endif // SIMDUTF_SPAN
1385
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1386
1387
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1388
/**
1389
 * Convert possibly broken UTF-16LE string into UTF-8 string.
1390
 *
1391
 * During the conversion also validation of the input string is done.
1392
 * This function is suitable to work with inputs from untrusted sources.
1393
 *
1394
 * This function is not BOM-aware.
1395
 *
1396
 * @param input         the UTF-16LE string to convert
1397
 * @param length        the length of the string in 2-byte code units (char16_t)
1398
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1399
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1400
 * string
1401
 */
1402
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *input,
1403
                                                   size_t length,
1404
                                                   char *utf8_buffer) noexcept;
1405
  #if SIMDUTF_SPAN
1406
simdutf_really_inline simdutf_warn_unused size_t convert_utf16le_to_utf8(
1407
    std::span<const char16_t> utf16_input,
1408
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1409
  return convert_utf16le_to_utf8(utf16_input.data(), utf16_input.size(),
1410
                                 reinterpret_cast<char *>(utf8_output.data()));
1411
}
1412
  #endif // SIMDUTF_SPAN
1413
1414
/**
1415
 * Convert possibly broken UTF-16BE string into UTF-8 string.
1416
 *
1417
 * During the conversion also validation of the input string is done.
1418
 * This function is suitable to work with inputs from untrusted sources.
1419
 *
1420
 * This function is not BOM-aware.
1421
 *
1422
 * @param input         the UTF-16BE string to convert
1423
 * @param length        the length of the string in 2-byte code units (char16_t)
1424
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1425
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1426
 * string
1427
 */
1428
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *input,
1429
                                                   size_t length,
1430
                                                   char *utf8_buffer) noexcept;
1431
  #if SIMDUTF_SPAN
1432
simdutf_really_inline simdutf_warn_unused size_t convert_utf16be_to_utf8(
1433
    std::span<const char16_t> utf16_input,
1434
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1435
  return convert_utf16be_to_utf8(utf16_input.data(), utf16_input.size(),
1436
                                 reinterpret_cast<char *>(utf8_output.data()));
1437
}
1438
  #endif // SIMDUTF_SPAN
1439
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1440
1441
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1442
/**
1443
 * Using native endianness, convert possibly broken UTF-16 string into Latin1
1444
 * string.
1445
 *
1446
 * During the conversion also validation of the input string is done.
1447
 * This function is suitable to work with inputs from untrusted sources.
1448
 * This function is not BOM-aware.
1449
 *
1450
 * @param input         the UTF-16 string to convert
1451
 * @param length        the length of the string in 2-byte code units (char16_t)
1452
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1453
 * @return a result pair struct (of type simdutf::result containing the two
1454
 * fields error and count) with an error code and either position of the error
1455
 * (in the input in code units) if any, or the number of char written if
1456
 * successful.
1457
 */
1458
simdutf_warn_unused result convert_utf16_to_latin1_with_errors(
1459
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1460
  #if SIMDUTF_SPAN
1461
simdutf_really_inline simdutf_warn_unused result
1462
convert_utf16_to_latin1_with_errors(
1463
    std::span<const char16_t> utf16_input,
1464
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1465
  return convert_utf16_to_latin1_with_errors(
1466
      utf16_input.data(), utf16_input.size(),
1467
      reinterpret_cast<char *>(latin1_output.data()));
1468
}
1469
  #endif // SIMDUTF_SPAN
1470
1471
/**
1472
 * Convert possibly broken UTF-16LE string into Latin1 string.
1473
 *
1474
 * During the conversion also validation of the input string is done.
1475
 * This function is suitable to work with inputs from untrusted sources.
1476
 * This function is not BOM-aware.
1477
 *
1478
 * @param input         the UTF-16LE string to convert
1479
 * @param length        the length of the string in 2-byte code units (char16_t)
1480
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1481
 * @return a result pair struct (of type simdutf::result containing the two
1482
 * fields error and count) with an error code and either position of the error
1483
 * (in the input in code units) if any, or the number of char written if
1484
 * successful.
1485
 */
1486
simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
1487
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1488
  #if SIMDUTF_SPAN
1489
simdutf_really_inline simdutf_warn_unused result
1490
convert_utf16le_to_latin1_with_errors(
1491
    std::span<const char16_t> utf16_input,
1492
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1493
  return convert_utf16le_to_latin1_with_errors(
1494
      utf16_input.data(), utf16_input.size(),
1495
      reinterpret_cast<char *>(latin1_output.data()));
1496
}
1497
  #endif // SIMDUTF_SPAN
1498
1499
/**
1500
 * Convert possibly broken UTF-16BE string into Latin1 string.
1501
 * If the string cannot be represented as Latin1, an error
1502
 * is returned.
1503
 *
1504
 * During the conversion also validation of the input string is done.
1505
 * This function is suitable to work with inputs from untrusted sources.
1506
 * This function is not BOM-aware.
1507
 *
1508
 * @param input         the UTF-16BE string to convert
1509
 * @param length        the length of the string in 2-byte code units (char16_t)
1510
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1511
 * @return a result pair struct (of type simdutf::result containing the two
1512
 * fields error and count) with an error code and either position of the error
1513
 * (in the input in code units) if any, or the number of char written if
1514
 * successful.
1515
 */
1516
simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
1517
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1518
  #if SIMDUTF_SPAN
1519
simdutf_really_inline simdutf_warn_unused result
1520
convert_utf16be_to_latin1_with_errors(
1521
    std::span<const char16_t> utf16_input,
1522
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1523
  return convert_utf16be_to_latin1_with_errors(
1524
      utf16_input.data(), utf16_input.size(),
1525
      reinterpret_cast<char *>(latin1_output.data()));
1526
}
1527
  #endif // SIMDUTF_SPAN
1528
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1529
1530
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1531
/**
1532
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1533
 * string and stop on error.
1534
 *
1535
 * During the conversion also validation of the input string is done.
1536
 * This function is suitable to work with inputs from untrusted sources.
1537
 *
1538
 * This function is not BOM-aware.
1539
 *
1540
 * @param input         the UTF-16 string to convert
1541
 * @param length        the length of the string in 2-byte code units (char16_t)
1542
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1543
 * @return a result pair struct (of type simdutf::result containing the two
1544
 * fields error and count) with an error code and either position of the error
1545
 * (in the input in code units) if any, or the number of char written if
1546
 * successful.
1547
 */
1548
simdutf_warn_unused result convert_utf16_to_utf8_with_errors(
1549
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1550
  #if SIMDUTF_SPAN
1551
simdutf_really_inline simdutf_warn_unused result
1552
convert_utf16_to_utf8_with_errors(
1553
    std::span<const char16_t> utf16_input,
1554
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1555
  return convert_utf16_to_utf8_with_errors(
1556
      utf16_input.data(), utf16_input.size(),
1557
      reinterpret_cast<char *>(utf8_output.data()));
1558
}
1559
  #endif // SIMDUTF_SPAN
1560
1561
/**
1562
 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
1563
 *
1564
 * During the conversion also validation of the input string is done.
1565
 * This function is suitable to work with inputs from untrusted sources.
1566
 *
1567
 * This function is not BOM-aware.
1568
 *
1569
 * @param input         the UTF-16LE string to convert
1570
 * @param length        the length of the string in 2-byte code units (char16_t)
1571
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1572
 * @return a result pair struct (of type simdutf::result containing the two
1573
 * fields error and count) with an error code and either position of the error
1574
 * (in the input in code units) if any, or the number of char written if
1575
 * successful.
1576
 */
1577
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
1578
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1579
  #if SIMDUTF_SPAN
1580
simdutf_really_inline simdutf_warn_unused result
1581
convert_utf16le_to_utf8_with_errors(
1582
    std::span<const char16_t> utf16_input,
1583
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1584
  return convert_utf16le_to_utf8_with_errors(
1585
      utf16_input.data(), utf16_input.size(),
1586
      reinterpret_cast<char *>(utf8_output.data()));
1587
}
1588
  #endif // SIMDUTF_SPAN
1589
1590
/**
1591
 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
1592
 *
1593
 * During the conversion also validation of the input string is done.
1594
 * This function is suitable to work with inputs from untrusted sources.
1595
 *
1596
 * This function is not BOM-aware.
1597
 *
1598
 * @param input         the UTF-16BE string to convert
1599
 * @param length        the length of the string in 2-byte code units (char16_t)
1600
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1601
 * @return a result pair struct (of type simdutf::result containing the two
1602
 * fields error and count) with an error code and either position of the error
1603
 * (in the input in code units) if any, or the number of char written if
1604
 * successful.
1605
 */
1606
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
1607
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1608
  #if SIMDUTF_SPAN
1609
simdutf_really_inline simdutf_warn_unused result
1610
convert_utf16be_to_utf8_with_errors(
1611
    std::span<const char16_t> utf16_input,
1612
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1613
  return convert_utf16be_to_utf8_with_errors(
1614
      utf16_input.data(), utf16_input.size(),
1615
      reinterpret_cast<char *>(utf8_output.data()));
1616
}
1617
  #endif // SIMDUTF_SPAN
1618
1619
/**
1620
 * Using native endianness, convert valid UTF-16 string into UTF-8 string.
1621
 *
1622
 * This function assumes that the input string is valid UTF-16.
1623
 *
1624
 * This function is not BOM-aware.
1625
 *
1626
 * @param input         the UTF-16 string to convert
1627
 * @param length        the length of the string in 2-byte code units (char16_t)
1628
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
1629
 * result
1630
 * @return number of written code units; 0 if conversion is not possible
1631
 */
1632
simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
1633
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1634
  #if SIMDUTF_SPAN
1635
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
1636
    std::span<const char16_t> valid_utf16_input,
1637
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1638
  return convert_valid_utf16_to_utf8(
1639
      valid_utf16_input.data(), valid_utf16_input.size(),
1640
      reinterpret_cast<char *>(utf8_output.data()));
1641
}
1642
  #endif // SIMDUTF_SPAN
1643
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1644
1645
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1646
/**
1647
 * Using native endianness, convert UTF-16 string into Latin1 string.
1648
 *
1649
 * This function assumes that the input string is valid UTF-16 and that it can
1650
 * be represented as Latin1. If you violate this assumption, the result is
1651
 * implementation defined and may include system-dependent behavior such as
1652
 * crashes.
1653
 *
1654
 * This function is for expert users only and not part of our public API. Use
1655
 * convert_utf16_to_latin1 instead. The function may be removed from the library
1656
 * in the future.
1657
 *
1658
 * This function is not BOM-aware.
1659
 *
1660
 * @param input         the UTF-16 string to convert
1661
 * @param length        the length of the string in 2-byte code units (char16_t)
1662
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1663
 * @return number of written code units; 0 if conversion is not possible
1664
 */
1665
simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
1666
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1667
  #if SIMDUTF_SPAN
1668
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
1669
    std::span<const char16_t> valid_utf16_input,
1670
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1671
  return convert_valid_utf16_to_latin1(
1672
      valid_utf16_input.data(), valid_utf16_input.size(),
1673
      reinterpret_cast<char *>(latin1_output.data()));
1674
}
1675
  #endif // SIMDUTF_SPAN
1676
1677
/**
1678
 * Convert valid UTF-16LE string into Latin1 string.
1679
 *
1680
 * This function assumes that the input string is valid UTF-16LE and that it can
1681
 * be represented as Latin1. If you violate this assumption, the result is
1682
 * implementation defined and may include system-dependent behavior such as
1683
 * crashes.
1684
 *
1685
 * This function is for expert users only and not part of our public API. Use
1686
 * convert_utf16le_to_latin1 instead. The function may be removed from the
1687
 * library in the future.
1688
 *
1689
 * This function is not BOM-aware.
1690
 *
1691
 * @param input         the UTF-16LE string to convert
1692
 * @param length        the length of the string in 2-byte code units (char16_t)
1693
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1694
 * @return number of written code units; 0 if conversion is not possible
1695
 */
1696
simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
1697
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1698
  #if SIMDUTF_SPAN
1699
simdutf_really_inline simdutf_warn_unused size_t
1700
convert_valid_utf16le_to_latin1(
1701
    std::span<const char16_t> valid_utf16_input,
1702
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1703
  return convert_valid_utf16le_to_latin1(
1704
      valid_utf16_input.data(), valid_utf16_input.size(),
1705
      reinterpret_cast<char *>(latin1_output.data()));
1706
}
1707
  #endif // SIMDUTF_SPAN
1708
1709
/**
1710
 * Convert valid UTF-16BE string into Latin1 string.
1711
 *
1712
 * This function assumes that the input string is valid UTF-16BE and that it can
1713
 * be represented as Latin1. If you violate this assumption, the result is
1714
 * implementation defined and may include system-dependent behavior such as
1715
 * crashes.
1716
 *
1717
 * This function is for expert users only and not part of our public API. Use
1718
 * convert_utf16be_to_latin1 instead. The function may be removed from the
1719
 * library in the future.
1720
 *
1721
 * This function is not BOM-aware.
1722
 *
1723
 * @param input         the UTF-16BE string to convert
1724
 * @param length        the length of the string in 2-byte code units (char16_t)
1725
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1726
 * @return number of written code units; 0 if conversion is not possible
1727
 */
1728
simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
1729
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1730
  #if SIMDUTF_SPAN
1731
simdutf_really_inline simdutf_warn_unused size_t
1732
convert_valid_utf16be_to_latin1(
1733
    std::span<const char16_t> valid_utf16_input,
1734
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1735
  return convert_valid_utf16be_to_latin1(
1736
      valid_utf16_input.data(), valid_utf16_input.size(),
1737
      reinterpret_cast<char *>(latin1_output.data()));
1738
}
1739
  #endif // SIMDUTF_SPAN
1740
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1741
1742
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1743
/**
1744
 * Convert valid UTF-16LE string into UTF-8 string.
1745
 *
1746
 * This function assumes that the input string is
1747
 * valid UTF-16LE.
1748
 *
1749
 * This function is not BOM-aware.
1750
 *
1751
 * @param input         the UTF-16LE string to convert
1752
 * @param length        the length of the string in 2-byte code units (char16_t)
1753
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
1754
 * result
1755
 * @return number of written code units; 0 if conversion is not possible
1756
 */
1757
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
1758
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1759
  #if SIMDUTF_SPAN
1760
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
1761
    std::span<const char16_t> valid_utf16_input,
1762
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1763
  return convert_valid_utf16le_to_utf8(
1764
      valid_utf16_input.data(), valid_utf16_input.size(),
1765
      reinterpret_cast<char *>(utf8_output.data()));
1766
}
1767
  #endif // SIMDUTF_SPAN
1768
1769
/**
1770
 * Convert valid UTF-16BE string into UTF-8 string.
1771
 *
1772
 * This function assumes that the input string is valid UTF-16BE.
1773
 *
1774
 * This function is not BOM-aware.
1775
 *
1776
 * @param input         the UTF-16BE string to convert
1777
 * @param length        the length of the string in 2-byte code units (char16_t)
1778
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
1779
 * result
1780
 * @return number of written code units; 0 if conversion is not possible
1781
 */
1782
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
1783
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1784
  #if SIMDUTF_SPAN
1785
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
1786
    std::span<const char16_t> valid_utf16_input,
1787
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1788
  return convert_valid_utf16be_to_utf8(
1789
      valid_utf16_input.data(), valid_utf16_input.size(),
1790
      reinterpret_cast<char *>(utf8_output.data()));
1791
}
1792
  #endif // SIMDUTF_SPAN
1793
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1794
1795
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
1796
/**
1797
 * Using native endianness, convert possibly broken UTF-16 string into UTF-32
1798
 * string.
1799
 *
1800
 * During the conversion also validation of the input string is done.
1801
 * This function is suitable to work with inputs from untrusted sources.
1802
 *
1803
 * This function is not BOM-aware.
1804
 *
1805
 * @param input         the UTF-16 string to convert
1806
 * @param length        the length of the string in 2-byte code units (char16_t)
1807
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1808
 * @return number of written code units; 0 if input is not a valid UTF-16
1809
 * string
1810
 */
1811
simdutf_warn_unused size_t convert_utf16_to_utf32(
1812
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1813
  #if SIMDUTF_SPAN
1814
simdutf_really_inline simdutf_warn_unused size_t
1815
convert_utf16_to_utf32(std::span<const char16_t> utf16_input,
1816
0
                       std::span<char32_t> utf32_output) noexcept {
1817
0
  return convert_utf16_to_utf32(utf16_input.data(), utf16_input.size(),
1818
0
                                utf32_output.data());
1819
0
}
1820
  #endif // SIMDUTF_SPAN
1821
1822
/**
1823
 * Convert possibly broken UTF-16LE string into UTF-32 string.
1824
 *
1825
 * During the conversion also validation of the input string is done.
1826
 * This function is suitable to work with inputs from untrusted sources.
1827
 *
1828
 * This function is not BOM-aware.
1829
 *
1830
 * @param input         the UTF-16LE string to convert
1831
 * @param length        the length of the string in 2-byte code units (char16_t)
1832
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1833
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1834
 * string
1835
 */
1836
simdutf_warn_unused size_t convert_utf16le_to_utf32(
1837
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1838
  #if SIMDUTF_SPAN
1839
simdutf_really_inline simdutf_warn_unused size_t
1840
convert_utf16le_to_utf32(std::span<const char16_t> utf16_input,
1841
0
                         std::span<char32_t> utf32_output) noexcept {
1842
0
  return convert_utf16le_to_utf32(utf16_input.data(), utf16_input.size(),
1843
0
                                  utf32_output.data());
1844
0
}
1845
  #endif // SIMDUTF_SPAN
1846
1847
/**
1848
 * Convert possibly broken UTF-16BE string into UTF-32 string.
1849
 *
1850
 * During the conversion also validation of the input string is done.
1851
 * This function is suitable to work with inputs from untrusted sources.
1852
 *
1853
 * This function is not BOM-aware.
1854
 *
1855
 * @param input         the UTF-16BE string to convert
1856
 * @param length        the length of the string in 2-byte code units (char16_t)
1857
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1858
 * @return number of written code units; 0 if input is not a valid UTF-16BE
1859
 * string
1860
 */
1861
simdutf_warn_unused size_t convert_utf16be_to_utf32(
1862
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1863
  #if SIMDUTF_SPAN
1864
simdutf_really_inline simdutf_warn_unused size_t
1865
convert_utf16be_to_utf32(std::span<const char16_t> utf16_input,
1866
0
                         std::span<char32_t> utf32_output) noexcept {
1867
0
  return convert_utf16be_to_utf32(utf16_input.data(), utf16_input.size(),
1868
0
                                  utf32_output.data());
1869
0
}
1870
  #endif // SIMDUTF_SPAN
1871
1872
/**
1873
 * Using native endianness, convert possibly broken UTF-16 string into
1874
 * UTF-32 string and stop on error.
1875
 *
1876
 * During the conversion also validation of the input string is done.
1877
 * This function is suitable to work with inputs from untrusted sources.
1878
 *
1879
 * This function is not BOM-aware.
1880
 *
1881
 * @param input         the UTF-16 string to convert
1882
 * @param length        the length of the string in 2-byte code units (char16_t)
1883
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1884
 * @return a result pair struct (of type simdutf::result containing the two
1885
 * fields error and count) with an error code and either position of the error
1886
 * (in the input in code units) if any, or the number of char32_t written if
1887
 * successful.
1888
 */
1889
simdutf_warn_unused result convert_utf16_to_utf32_with_errors(
1890
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1891
  #if SIMDUTF_SPAN
1892
simdutf_really_inline simdutf_warn_unused result
1893
convert_utf16_to_utf32_with_errors(std::span<const char16_t> utf16_input,
1894
0
                                   std::span<char32_t> utf32_output) noexcept {
1895
0
  return convert_utf16_to_utf32_with_errors(
1896
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
1897
0
}
1898
  #endif // SIMDUTF_SPAN
1899
1900
/**
1901
 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
1902
 *
1903
 * During the conversion also validation of the input string is done.
1904
 * This function is suitable to work with inputs from untrusted sources.
1905
 *
1906
 * This function is not BOM-aware.
1907
 *
1908
 * @param input         the UTF-16LE string to convert
1909
 * @param length        the length of the string in 2-byte code units (char16_t)
1910
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1911
 * @return a result pair struct (of type simdutf::result containing the two
1912
 * fields error and count) with an error code and either position of the error
1913
 * (in the input in code units) if any, or the number of char32_t written if
1914
 * successful.
1915
 */
1916
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
1917
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1918
  #if SIMDUTF_SPAN
1919
simdutf_really_inline simdutf_warn_unused result
1920
convert_utf16le_to_utf32_with_errors(
1921
    std::span<const char16_t> utf16_input,
1922
0
    std::span<char32_t> utf32_output) noexcept {
1923
0
  return convert_utf16le_to_utf32_with_errors(
1924
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
1925
0
}
1926
  #endif // SIMDUTF_SPAN
1927
1928
/**
1929
 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
1930
 *
1931
 * During the conversion also validation of the input string is done.
1932
 * This function is suitable to work with inputs from untrusted sources.
1933
 *
1934
 * This function is not BOM-aware.
1935
 *
1936
 * @param input         the UTF-16BE string to convert
1937
 * @param length        the length of the string in 2-byte code units (char16_t)
1938
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1939
 * @return a result pair struct (of type simdutf::result containing the two
1940
 * fields error and count) with an error code and either position of the error
1941
 * (in the input in code units) if any, or the number of char32_t written if
1942
 * successful.
1943
 */
1944
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
1945
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1946
  #if SIMDUTF_SPAN
1947
simdutf_really_inline simdutf_warn_unused result
1948
convert_utf16be_to_utf32_with_errors(
1949
    std::span<const char16_t> utf16_input,
1950
0
    std::span<char32_t> utf32_output) noexcept {
1951
0
  return convert_utf16be_to_utf32_with_errors(
1952
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
1953
0
}
1954
  #endif // SIMDUTF_SPAN
1955
1956
/**
1957
 * Using native endianness, convert valid UTF-16 string into UTF-32 string.
1958
 *
1959
 * This function assumes that the input string is valid UTF-16 (native
1960
 * endianness).
1961
 *
1962
 * This function is not BOM-aware.
1963
 *
1964
 * @param input         the UTF-16 string to convert
1965
 * @param length        the length of the string in 2-byte code units (char16_t)
1966
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
1967
 * result
1968
 * @return number of written code units; 0 if conversion is not possible
1969
 */
1970
simdutf_warn_unused size_t convert_valid_utf16_to_utf32(
1971
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1972
  #if SIMDUTF_SPAN
1973
simdutf_really_inline simdutf_warn_unused size_t
1974
convert_valid_utf16_to_utf32(std::span<const char16_t> valid_utf16_input,
1975
0
                             std::span<char32_t> utf32_output) noexcept {
1976
0
  return convert_valid_utf16_to_utf32(
1977
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
1978
0
}
1979
  #endif // SIMDUTF_SPAN
1980
1981
/**
1982
 * Convert valid UTF-16LE string into UTF-32 string.
1983
 *
1984
 * This function assumes that the input string is valid UTF-16LE.
1985
 *
1986
 * This function is not BOM-aware.
1987
 *
1988
 * @param input         the UTF-16LE string to convert
1989
 * @param length        the length of the string in 2-byte code units (char16_t)
1990
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
1991
 * result
1992
 * @return number of written code units; 0 if conversion is not possible
1993
 */
1994
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
1995
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1996
  #if SIMDUTF_SPAN
1997
simdutf_really_inline simdutf_warn_unused size_t
1998
convert_valid_utf16le_to_utf32(std::span<const char16_t> valid_utf16_input,
1999
0
                               std::span<char32_t> utf32_output) noexcept {
2000
0
  return convert_valid_utf16le_to_utf32(
2001
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
2002
0
}
2003
  #endif // SIMDUTF_SPAN
2004
2005
/**
2006
 * Convert valid UTF-16BE string into UTF-32 string.
2007
 *
2008
 * This function assumes that the input string is valid UTF-16BE.
2009
 *
2010
 * This function is not BOM-aware.
2011
 *
2012
 * @param input         the UTF-16BE string to convert
2013
 * @param length        the length of the string in 2-byte code units (char16_t)
2014
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
2015
 * result
2016
 * @return number of written code units; 0 if conversion is not possible
2017
 */
2018
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
2019
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2020
  #if SIMDUTF_SPAN
2021
simdutf_really_inline simdutf_warn_unused size_t
2022
convert_valid_utf16be_to_utf32(std::span<const char16_t> valid_utf16_input,
2023
0
                               std::span<char32_t> utf32_output) noexcept {
2024
0
  return convert_valid_utf16be_to_utf32(
2025
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
2026
0
}
2027
  #endif // SIMDUTF_SPAN
2028
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2029
2030
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2031
/**
2032
 * Compute the number of bytes that this UTF-16LE/BE string would require in
2033
 * Latin1 format.
2034
 *
2035
 * This function does not validate the input. It is acceptable to pass invalid
2036
 * UTF-16 strings but in such cases the result is implementation defined.
2037
 *
2038
 * This function is not BOM-aware.
2039
 *
2040
 * @param length        the length of the string in 2-byte code units (char16_t)
2041
 * @return the number of bytes required to encode the UTF-16 string as Latin1
2042
 */
2043
simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
2044
2045
/**
2046
 * Using native endianness, compute the number of bytes that this UTF-16
2047
 * string would require in UTF-8 format.
2048
 *
2049
 * This function does not validate the input. It is acceptable to pass invalid
2050
 * UTF-16 strings but in such cases the result is implementation defined.
2051
 *
2052
 * @param input         the UTF-16 string to convert
2053
 * @param length        the length of the string in 2-byte code units (char16_t)
2054
 * @return the number of bytes required to encode the UTF-16 string as UTF-8
2055
 */
2056
simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input,
2057
                                                  size_t length) noexcept;
2058
  #if SIMDUTF_SPAN
2059
simdutf_really_inline simdutf_warn_unused size_t
2060
0
utf8_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2061
0
  return utf8_length_from_utf16(valid_utf16_input.data(),
2062
0
                                valid_utf16_input.size());
2063
0
}
2064
  #endif // SIMDUTF_SPAN
2065
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2066
2067
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2068
/**
2069
 * Compute the number of bytes that this UTF-16LE string would require in UTF-8
2070
 * format.
2071
 *
2072
 * This function does not validate the input. It is acceptable to pass invalid
2073
 * UTF-16 strings but in such cases the result is implementation defined.
2074
 *
2075
 * @param input         the UTF-16LE string to convert
2076
 * @param length        the length of the string in 2-byte code units (char16_t)
2077
 * @return the number of bytes required to encode the UTF-16LE string as UTF-8
2078
 */
2079
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input,
2080
                                                    size_t length) noexcept;
2081
  #if SIMDUTF_SPAN
2082
simdutf_really_inline simdutf_warn_unused size_t
2083
0
utf8_length_from_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
2084
0
  return utf8_length_from_utf16le(valid_utf16_input.data(),
2085
0
                                  valid_utf16_input.size());
2086
0
}
2087
  #endif // SIMDUTF_SPAN
2088
2089
/**
2090
 * Compute the number of bytes that this UTF-16BE string would require in UTF-8
2091
 * format.
2092
 *
2093
 * This function does not validate the input. It is acceptable to pass invalid
2094
 * UTF-16 strings but in such cases the result is implementation defined.
2095
 *
2096
 * @param input         the UTF-16BE string to convert
2097
 * @param length        the length of the string in 2-byte code units (char16_t)
2098
 * @return the number of bytes required to encode the UTF-16BE string as UTF-8
2099
 */
2100
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input,
2101
                                                    size_t length) noexcept;
2102
  #if SIMDUTF_SPAN
2103
simdutf_really_inline simdutf_warn_unused size_t
2104
0
utf8_length_from_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
2105
0
  return utf8_length_from_utf16be(valid_utf16_input.data(),
2106
0
                                  valid_utf16_input.size());
2107
0
}
2108
  #endif // SIMDUTF_SPAN
2109
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2110
2111
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2112
/**
2113
 * Convert possibly broken UTF-32 string into UTF-8 string.
2114
 *
2115
 * During the conversion also validation of the input string is done.
2116
 * This function is suitable to work with inputs from untrusted sources.
2117
 *
2118
 * This function is not BOM-aware.
2119
 *
2120
 * @param input         the UTF-32 string to convert
2121
 * @param length        the length of the string in 4-byte code units (char32_t)
2122
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2123
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2124
 */
2125
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *input,
2126
                                                 size_t length,
2127
                                                 char *utf8_buffer) noexcept;
2128
  #if SIMDUTF_SPAN
2129
simdutf_really_inline simdutf_warn_unused size_t convert_utf32_to_utf8(
2130
    std::span<const char32_t> utf32_input,
2131
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2132
  return convert_utf32_to_utf8(utf32_input.data(), utf32_input.size(),
2133
                               reinterpret_cast<char *>(utf8_output.data()));
2134
}
2135
  #endif // SIMDUTF_SPAN
2136
2137
/**
2138
 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
2139
 *
2140
 * During the conversion also validation of the input string is done.
2141
 * This function is suitable to work with inputs from untrusted sources.
2142
 *
2143
 * This function is not BOM-aware.
2144
 *
2145
 * @param input         the UTF-32 string to convert
2146
 * @param length        the length of the string in 4-byte code units (char32_t)
2147
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2148
 * @return a result pair struct (of type simdutf::result containing the two
2149
 * fields error and count) with an error code and either position of the error
2150
 * (in the input in code units) if any, or the number of char written if
2151
 * successful.
2152
 */
2153
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
2154
    const char32_t *input, size_t length, char *utf8_buffer) noexcept;
2155
  #if SIMDUTF_SPAN
2156
simdutf_really_inline simdutf_warn_unused result
2157
convert_utf32_to_utf8_with_errors(
2158
    std::span<const char32_t> utf32_input,
2159
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2160
  return convert_utf32_to_utf8_with_errors(
2161
      utf32_input.data(), utf32_input.size(),
2162
      reinterpret_cast<char *>(utf8_output.data()));
2163
}
2164
  #endif // SIMDUTF_SPAN
2165
2166
/**
2167
 * Convert valid UTF-32 string into UTF-8 string.
2168
 *
2169
 * This function assumes that the input string is valid UTF-32.
2170
 *
2171
 * This function is not BOM-aware.
2172
 *
2173
 * @param input         the UTF-32 string to convert
2174
 * @param length        the length of the string in 4-byte code units (char32_t)
2175
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
2176
 * result
2177
 * @return number of written code units; 0 if conversion is not possible
2178
 */
2179
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
2180
    const char32_t *input, size_t length, char *utf8_buffer) noexcept;
2181
  #if SIMDUTF_SPAN
2182
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
2183
    std::span<const char32_t> valid_utf32_input,
2184
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2185
  return convert_valid_utf32_to_utf8(
2186
      valid_utf32_input.data(), valid_utf32_input.size(),
2187
      reinterpret_cast<char *>(utf8_output.data()));
2188
}
2189
  #endif // SIMDUTF_SPAN
2190
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2191
2192
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2193
/**
2194
 * Using native endianness, convert possibly broken UTF-32 string into a UTF-16
2195
 * string.
2196
 *
2197
 * During the conversion also validation of the input string is done.
2198
 * This function is suitable to work with inputs from untrusted sources.
2199
 *
2200
 * This function is not BOM-aware.
2201
 *
2202
 * @param input         the UTF-32 string to convert
2203
 * @param length        the length of the string in 4-byte code units (char32_t)
2204
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2205
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2206
 */
2207
simdutf_warn_unused size_t convert_utf32_to_utf16(
2208
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2209
  #if SIMDUTF_SPAN
2210
simdutf_really_inline simdutf_warn_unused size_t
2211
convert_utf32_to_utf16(std::span<const char32_t> utf32_input,
2212
0
                       std::span<char16_t> utf16_output) noexcept {
2213
0
  return convert_utf32_to_utf16(utf32_input.data(), utf32_input.size(),
2214
0
                                utf16_output.data());
2215
0
}
2216
  #endif // SIMDUTF_SPAN
2217
2218
/**
2219
 * Convert possibly broken UTF-32 string into UTF-16LE string.
2220
 *
2221
 * During the conversion also validation of the input string is done.
2222
 * This function is suitable to work with inputs from untrusted sources.
2223
 *
2224
 * This function is not BOM-aware.
2225
 *
2226
 * @param input         the UTF-32 string to convert
2227
 * @param length        the length of the string in 4-byte code units (char32_t)
2228
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2229
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2230
 */
2231
simdutf_warn_unused size_t convert_utf32_to_utf16le(
2232
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2233
  #if SIMDUTF_SPAN
2234
simdutf_really_inline simdutf_warn_unused size_t
2235
convert_utf32_to_utf16le(std::span<const char32_t> utf32_input,
2236
0
                         std::span<char16_t> utf16_output) noexcept {
2237
0
  return convert_utf32_to_utf16le(utf32_input.data(), utf32_input.size(),
2238
0
                                  utf16_output.data());
2239
0
}
2240
  #endif // SIMDUTF_SPAN
2241
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2242
2243
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
2244
/**
2245
 * Convert possibly broken UTF-32 string into Latin1 string.
2246
 *
2247
 * During the conversion also validation of the input string is done.
2248
 * This function is suitable to work with inputs from untrusted sources.
2249
 *
2250
 * This function is not BOM-aware.
2251
 *
2252
 * @param input         the UTF-32 string to convert
2253
 * @param length        the length of the string in 4-byte code units (char32_t)
2254
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2255
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2256
 * or if it cannot be represented as Latin1
2257
 */
2258
simdutf_warn_unused size_t convert_utf32_to_latin1(
2259
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
2260
  #if SIMDUTF_SPAN
2261
simdutf_really_inline simdutf_warn_unused size_t convert_utf32_to_latin1(
2262
    std::span<const char32_t> utf32_input,
2263
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2264
  return convert_utf32_to_latin1(
2265
      utf32_input.data(), utf32_input.size(),
2266
      reinterpret_cast<char *>(latin1_output.data()));
2267
}
2268
  #endif // SIMDUTF_SPAN
2269
2270
/**
2271
 * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
2272
 * If the string cannot be represented as Latin1, an error is returned.
2273
 *
2274
 * During the conversion also validation of the input string is done.
2275
 * This function is suitable to work with inputs from untrusted sources.
2276
 *
2277
 * This function is not BOM-aware.
2278
 *
2279
 * @param input         the UTF-32 string to convert
2280
 * @param length        the length of the string in 4-byte code units (char32_t)
2281
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2282
 * @return a result pair struct (of type simdutf::result containing the two
2283
 * fields error and count) with an error code and either position of the error
2284
 * (in the input in code units) if any, or the number of char written if
2285
 * successful.
2286
 */
2287
simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
2288
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
2289
  #if SIMDUTF_SPAN
2290
simdutf_really_inline simdutf_warn_unused result
2291
convert_utf32_to_latin1_with_errors(
2292
    std::span<const char32_t> utf32_input,
2293
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2294
  return convert_utf32_to_latin1_with_errors(
2295
      utf32_input.data(), utf32_input.size(),
2296
      reinterpret_cast<char *>(latin1_output.data()));
2297
}
2298
  #endif // SIMDUTF_SPAN
2299
2300
/**
2301
 * Convert valid UTF-32 string into Latin1 string.
2302
 *
2303
 * This function assumes that the input string is valid UTF-32 and that it can
2304
 * be represented as Latin1. If you violate this assumption, the result is
2305
 * implementation defined and may include system-dependent behavior such as
2306
 * crashes.
2307
 *
2308
 * This function is for expert users only and not part of our public API. Use
2309
 * convert_utf32_to_latin1 instead. The function may be removed from the library
2310
 * in the future.
2311
 *
2312
 * This function is not BOM-aware.
2313
 *
2314
 * @param input         the UTF-32 string to convert
2315
 * @param length        the length of the string in 4-byte code units (char32_t)
2316
 * @param latin1_buffer   the pointer to a buffer that can hold the conversion
2317
 * result
2318
 * @return number of written code units; 0 if conversion is not possible
2319
 */
2320
simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
2321
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
2322
  #if SIMDUTF_SPAN
2323
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
2324
    std::span<const char32_t> valid_utf32_input,
2325
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2326
  return convert_valid_utf32_to_latin1(
2327
      valid_utf32_input.data(), valid_utf32_input.size(),
2328
      reinterpret_cast<char *>(latin1_output.data()));
2329
}
2330
  #endif // SIMDUTF_SPAN
2331
2332
/**
2333
 * Compute the number of bytes that this UTF-32 string would require in Latin1
2334
 * format.
2335
 *
2336
 * This function does not validate the input. It is acceptable to pass invalid
2337
 * UTF-32 strings but in such cases the result is implementation defined.
2338
 *
2339
 * This function is not BOM-aware.
2340
 *
2341
 * @param length        the length of the string in 4-byte code units (char32_t)
2342
 * @return the number of bytes required to encode the UTF-32 string as Latin1
2343
 */
2344
simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) noexcept;
2345
2346
/**
2347
 * Compute the number of 4-byte code units (char32_t) that this Latin1 string
2348
 * would require in UTF-32 format.
2349
 *
2350
 * @param length        the length of the string in Latin1 code units (char)
2351
 * @return the length of the string in 4-byte code units (char32_t) required to
2352
 * encode the Latin1 string as UTF-32
2353
 */
2354
simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) noexcept;
2355
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
2356
2357
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2358
/**
2359
 * Convert possibly broken UTF-32 string into UTF-16BE string.
2360
 *
2361
 * During the conversion also validation of the input string is done.
2362
 * This function is suitable to work with inputs from untrusted sources.
2363
 *
2364
 * This function is not BOM-aware.
2365
 *
2366
 * @param input         the UTF-32 string to convert
2367
 * @param length        the length of the string in 4-byte code units (char32_t)
2368
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2369
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2370
 */
2371
simdutf_warn_unused size_t convert_utf32_to_utf16be(
2372
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2373
  #if SIMDUTF_SPAN
2374
simdutf_really_inline simdutf_warn_unused size_t
2375
convert_utf32_to_utf16be(std::span<const char32_t> utf32_input,
2376
0
                         std::span<char16_t> utf16_output) noexcept {
2377
0
  return convert_utf32_to_utf16be(utf32_input.data(), utf32_input.size(),
2378
0
                                  utf16_output.data());
2379
0
}
2380
  #endif // SIMDUTF_SPAN
2381
2382
/**
2383
 * Using native endianness, convert possibly broken UTF-32 string into UTF-16
2384
 * string and stop on error.
2385
 *
2386
 * During the conversion also validation of the input string is done.
2387
 * This function is suitable to work with inputs from untrusted sources.
2388
 *
2389
 * This function is not BOM-aware.
2390
 *
2391
 * @param input         the UTF-32 string to convert
2392
 * @param length        the length of the string in 4-byte code units (char32_t)
2393
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2394
 * @return a result pair struct (of type simdutf::result containing the two
2395
 * fields error and count) with an error code and either position of the error
2396
 * (in the input in code units) if any, or the number of char16_t written if
2397
 * successful.
2398
 */
2399
simdutf_warn_unused result convert_utf32_to_utf16_with_errors(
2400
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2401
  #if SIMDUTF_SPAN
2402
simdutf_really_inline simdutf_warn_unused result
2403
convert_utf32_to_utf16_with_errors(std::span<const char32_t> utf32_input,
2404
0
                                   std::span<char16_t> utf16_output) noexcept {
2405
0
  return convert_utf32_to_utf16_with_errors(
2406
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
2407
0
}
2408
  #endif // SIMDUTF_SPAN
2409
2410
/**
2411
 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
2412
 *
2413
 * During the conversion also validation of the input string is done.
2414
 * This function is suitable to work with inputs from untrusted sources.
2415
 *
2416
 * This function is not BOM-aware.
2417
 *
2418
 * @param input         the UTF-32 string to convert
2419
 * @param length        the length of the string in 4-byte code units (char32_t)
2420
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2421
 * @return a result pair struct (of type simdutf::result containing the two
2422
 * fields error and count) with an error code and either position of the error
2423
 * (in the input in code units) if any, or the number of char16_t written if
2424
 * successful.
2425
 */
2426
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
2427
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2428
  #if SIMDUTF_SPAN
2429
simdutf_really_inline simdutf_warn_unused result
2430
convert_utf32_to_utf16le_with_errors(
2431
    std::span<const char32_t> utf32_input,
2432
0
    std::span<char16_t> utf16_output) noexcept {
2433
0
  return convert_utf32_to_utf16le_with_errors(
2434
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
2435
0
}
2436
  #endif // SIMDUTF_SPAN
2437
2438
/**
2439
 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
2440
 *
2441
 * During the conversion also validation of the input string is done.
2442
 * This function is suitable to work with inputs from untrusted sources.
2443
 *
2444
 * This function is not BOM-aware.
2445
 *
2446
 * @param input         the UTF-32 string to convert
2447
 * @param length        the length of the string in 4-byte code units (char32_t)
2448
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2449
 * @return a result pair struct (of type simdutf::result containing the two
2450
 * fields error and count) with an error code and either position of the error
2451
 * (in the input in code units) if any, or the number of char16_t written if
2452
 * successful.
2453
 */
2454
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
2455
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2456
  #if SIMDUTF_SPAN
2457
simdutf_really_inline simdutf_warn_unused result
2458
convert_utf32_to_utf16be_with_errors(
2459
    std::span<const char32_t> utf32_input,
2460
0
    std::span<char16_t> utf16_output) noexcept {
2461
0
  return convert_utf32_to_utf16be_with_errors(
2462
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
2463
0
}
2464
  #endif // SIMDUTF_SPAN
2465
2466
/**
2467
 * Using native endianness, convert valid UTF-32 string into a UTF-16 string.
2468
 *
2469
 * This function assumes that the input string is valid UTF-32.
2470
 *
2471
 * This function is not BOM-aware.
2472
 *
2473
 * @param input         the UTF-32 string to convert
2474
 * @param length        the length of the string in 4-byte code units (char32_t)
2475
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
2476
 * result
2477
 * @return number of written code units; 0 if conversion is not possible
2478
 */
2479
simdutf_warn_unused size_t convert_valid_utf32_to_utf16(
2480
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2481
  #if SIMDUTF_SPAN
2482
simdutf_really_inline simdutf_warn_unused size_t
2483
convert_valid_utf32_to_utf16(std::span<const char32_t> valid_utf32_input,
2484
0
                             std::span<char16_t> utf16_output) noexcept {
2485
0
  return convert_valid_utf32_to_utf16(
2486
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
2487
0
}
2488
  #endif // SIMDUTF_SPAN
2489
2490
/**
2491
 * Convert valid UTF-32 string into UTF-16LE string.
2492
 *
2493
 * This function assumes that the input string is valid UTF-32.
2494
 *
2495
 * This function is not BOM-aware.
2496
 *
2497
 * @param input         the UTF-32 string to convert
2498
 * @param length        the length of the string in 4-byte code units (char32_t)
2499
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
2500
 * result
2501
 * @return number of written code units; 0 if conversion is not possible
2502
 */
2503
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
2504
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2505
  #if SIMDUTF_SPAN
2506
simdutf_really_inline simdutf_warn_unused size_t
2507
convert_valid_utf32_to_utf16le(std::span<const char32_t> valid_utf32_input,
2508
0
                               std::span<char16_t> utf16_output) noexcept {
2509
0
  return convert_valid_utf32_to_utf16le(
2510
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
2511
0
}
2512
  #endif // SIMDUTF_SPAN
2513
2514
/**
2515
 * Convert valid UTF-32 string into UTF-16BE string.
2516
 *
2517
 * This function assumes that the input string is valid UTF-32.
2518
 *
2519
 * This function is not BOM-aware.
2520
 *
2521
 * @param input         the UTF-32 string to convert
2522
 * @param length        the length of the string in 4-byte code units (char32_t)
2523
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
2524
 * result
2525
 * @return number of written code units; 0 if conversion is not possible
2526
 */
2527
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
2528
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2529
  #if SIMDUTF_SPAN
2530
simdutf_really_inline simdutf_warn_unused size_t
2531
convert_valid_utf32_to_utf16be(std::span<const char32_t> valid_utf32_input,
2532
0
                               std::span<char16_t> utf16_output) noexcept {
2533
0
  return convert_valid_utf32_to_utf16be(
2534
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
2535
0
}
2536
  #endif // SIMDUTF_SPAN
2537
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2538
2539
#if SIMDUTF_FEATURE_UTF16
2540
/**
2541
 * Change the endianness of the input. Can be used to go from UTF-16LE to
2542
 * UTF-16BE or from UTF-16BE to UTF-16LE.
2543
 *
2544
 * This function does not validate the input.
2545
 *
2546
 * This function is not BOM-aware.
2547
 *
2548
 * @param input         the UTF-16 string to process
2549
 * @param length        the length of the string in 2-byte code units (char16_t)
2550
 * @param output        the pointer to a buffer that can hold the conversion
2551
 * result
2552
 */
2553
void change_endianness_utf16(const char16_t *input, size_t length,
2554
                             char16_t *output) noexcept;
2555
  #if SIMDUTF_SPAN
2556
simdutf_really_inline void
2557
change_endianness_utf16(std::span<const char16_t> utf16_input,
2558
0
                        std::span<char16_t> utf16_output) noexcept {
2559
0
  return change_endianness_utf16(utf16_input.data(), utf16_input.size(),
2560
0
                                 utf16_output.data());
2561
0
}
2562
  #endif // SIMDUTF_SPAN
2563
#endif   // SIMDUTF_FEATURE_UTF16
2564
2565
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2566
/**
2567
 * Compute the number of bytes that this UTF-32 string would require in UTF-8
2568
 * format.
2569
 *
2570
 * This function does not validate the input. It is acceptable to pass invalid
2571
 * UTF-32 strings but in such cases the result is implementation defined.
2572
 *
2573
 * @param input         the UTF-32 string to convert
2574
 * @param length        the length of the string in 4-byte code units (char32_t)
2575
 * @return the number of bytes required to encode the UTF-32 string as UTF-8
2576
 */
2577
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input,
2578
                                                  size_t length) noexcept;
2579
  #if SIMDUTF_SPAN
2580
simdutf_really_inline simdutf_warn_unused size_t
2581
0
utf8_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
2582
0
  return utf8_length_from_utf32(valid_utf32_input.data(),
2583
0
                                valid_utf32_input.size());
2584
0
}
2585
  #endif // SIMDUTF_SPAN
2586
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2587
2588
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2589
/**
2590
 * Compute the number of two-byte code units that this UTF-32 string would
2591
 * require in UTF-16 format.
2592
 *
2593
 * This function does not validate the input. It is acceptable to pass invalid
2594
 * UTF-32 strings but in such cases the result is implementation defined.
2595
 *
2596
 * @param input         the UTF-32 string to convert
2597
 * @param length        the length of the string in 4-byte code units (char32_t)
2598
 * @return number of char16_t code units needed to encode the string as UTF-16
2599
 */
2600
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input,
2601
                                                   size_t length) noexcept;
2602
  #if SIMDUTF_SPAN
2603
simdutf_really_inline simdutf_warn_unused size_t
2604
0
utf16_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
2605
0
  return utf16_length_from_utf32(valid_utf32_input.data(),
2606
0
                                 valid_utf32_input.size());
2607
0
}
2608
  #endif // SIMDUTF_SPAN
2609
2610
/**
2611
 * Using native endianness, compute the number of 4-byte code units (char32_t)
2612
 * that this UTF-16 string would require in UTF-32 format.
2613
 *
2614
 * This function is equivalent to count_utf16.
2615
 *
2616
 * This function does not validate the input. It is acceptable to pass invalid
2617
 * UTF-16 strings but in such cases the result is implementation defined.
2618
 *
2619
 * This function is not BOM-aware.
2620
 *
2621
 * @param input         the UTF-16 string to convert
2622
 * @param length        the length of the string in 2-byte code units (char16_t)
2623
 * @return number of char32_t code units needed to encode the string as UTF-32
2624
 */
2625
simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input,
2626
                                                   size_t length) noexcept;
2627
  #if SIMDUTF_SPAN
2628
simdutf_really_inline simdutf_warn_unused size_t
2629
0
utf32_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2630
0
  return utf32_length_from_utf16(valid_utf16_input.data(),
2631
0
                                 valid_utf16_input.size());
2632
0
}
2633
  #endif // SIMDUTF_SPAN
2634
2635
/**
2636
 * Compute the number of 4-byte code units (char32_t) that this UTF-16LE string
2637
 * would require in UTF-32 format.
2638
 *
2639
 * This function is equivalent to count_utf16le.
2640
 *
2641
 * This function does not validate the input. It is acceptable to pass invalid
2642
 * UTF-16 strings but in such cases the result is implementation defined.
2643
 *
2644
 * This function is not BOM-aware.
2645
 *
2646
 * @param input         the UTF-16LE string to convert
2647
 * @param length        the length of the string in 2-byte code units (char16_t)
2648
 * @return number of char32_t code units needed to encode the string as UTF-32
2649
 */
2650
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input,
2651
                                                     size_t length) noexcept;
2652
  #if SIMDUTF_SPAN
2653
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf16le(
2654
0
    std::span<const char16_t> valid_utf16_input) noexcept {
2655
0
  return utf32_length_from_utf16le(valid_utf16_input.data(),
2656
0
                                   valid_utf16_input.size());
2657
0
}
2658
  #endif // SIMDUTF_SPAN
2659
2660
/**
2661
 * Compute the number of 4-byte code units (char32_t) that this UTF-16BE string
2662
 * would require in UTF-32 format.
2663
 *
2664
 * This function is equivalent to count_utf16be.
2665
 *
2666
 * This function does not validate the input. It is acceptable to pass invalid
2667
 * UTF-16 strings but in such cases the result is implementation defined.
2668
 *
2669
 * This function is not BOM-aware.
2670
 *
2671
 * @param input         the UTF-16BE string to convert
2672
 * @param length        the length of the string in 2-byte code units (char16_t)
2673
 * @return number of char32_t code units needed to encode the string as UTF-32
2674
 */
2675
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input,
2676
                                                     size_t length) noexcept;
2677
  #if SIMDUTF_SPAN
2678
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf16be(
2679
0
    std::span<const char16_t> valid_utf16_input) noexcept {
2680
0
  return utf32_length_from_utf16be(valid_utf16_input.data(),
2681
0
                                   valid_utf16_input.size());
2682
0
}
2683
  #endif // SIMDUTF_SPAN
2684
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2685
2686
#if SIMDUTF_FEATURE_UTF16
2687
/**
2688
 * Count the number of code points (characters) in the string assuming that
2689
 * it is valid.
2690
 *
2691
 * This function assumes that the input string is valid UTF-16 (native
2692
 * endianness). It is acceptable to pass invalid UTF-16 strings but in such
2693
 * cases the result is implementation defined.
2694
 *
2695
 * This function is not BOM-aware.
2696
 *
2697
 * @param input         the UTF-16 string to process
2698
 * @param length        the length of the string in 2-byte code units (char16_t)
2699
 * @return number of code points
2700
 */
2701
simdutf_warn_unused size_t count_utf16(const char16_t *input,
2702
                                       size_t length) noexcept;
2703
  #if SIMDUTF_SPAN
2704
simdutf_really_inline simdutf_warn_unused size_t
2705
0
count_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2706
0
  return count_utf16(valid_utf16_input.data(), valid_utf16_input.size());
2707
0
}
2708
  #endif // SIMDUTF_SPAN
2709
2710
/**
2711
 * Count the number of code points (characters) in the string assuming that
2712
 * it is valid.
2713
 *
2714
 * This function assumes that the input string is valid UTF-16LE.
2715
 * It is acceptable to pass invalid UTF-16 strings but in such cases
2716
 * the result is implementation defined.
2717
 *
2718
 * This function is not BOM-aware.
2719
 *
2720
 * @param input         the UTF-16LE string to process
2721
 * @param length        the length of the string in 2-byte code units (char16_t)
2722
 * @return number of code points
2723
 */
2724
simdutf_warn_unused size_t count_utf16le(const char16_t *input,
2725
                                         size_t length) noexcept;
2726
  #if SIMDUTF_SPAN
2727
simdutf_really_inline simdutf_warn_unused size_t
2728
0
count_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
2729
0
  return count_utf16le(valid_utf16_input.data(), valid_utf16_input.size());
2730
0
}
2731
  #endif // SIMDUTF_SPAN
2732
2733
/**
2734
 * Count the number of code points (characters) in the string assuming that
2735
 * it is valid.
2736
 *
2737
 * This function assumes that the input string is valid UTF-16BE.
2738
 * It is acceptable to pass invalid UTF-16 strings but in such cases
2739
 * the result is implementation defined.
2740
 *
2741
 * This function is not BOM-aware.
2742
 *
2743
 * @param input         the UTF-16BE string to process
2744
 * @param length        the length of the string in 2-byte code units (char16_t)
2745
 * @return number of code points
2746
 */
2747
simdutf_warn_unused size_t count_utf16be(const char16_t *input,
2748
                                         size_t length) noexcept;
2749
  #if SIMDUTF_SPAN
2750
simdutf_really_inline simdutf_warn_unused size_t
2751
0
count_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
2752
0
  return count_utf16be(valid_utf16_input.data(), valid_utf16_input.size());
2753
0
}
2754
  #endif // SIMDUTF_SPAN
2755
#endif   // SIMDUTF_FEATURE_UTF16
2756
2757
#if SIMDUTF_FEATURE_UTF8
2758
/**
2759
 * Count the number of code points (characters) in the string assuming that
2760
 * it is valid.
2761
 *
2762
 * This function assumes that the input string is valid UTF-8.
2763
 * It is acceptable to pass invalid UTF-8 strings but in such cases
2764
 * the result is implementation defined.
2765
 *
2766
 * @param input         the UTF-8 string to process
2767
 * @param length        the length of the string in bytes
2768
 * @return number of code points
2769
 */
2770
simdutf_warn_unused size_t count_utf8(const char *input,
2771
                                      size_t length) noexcept;
2772
  #if SIMDUTF_SPAN
2773
simdutf_really_inline simdutf_warn_unused size_t count_utf8(
2774
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
2775
  return count_utf8(reinterpret_cast<const char *>(valid_utf8_input.data()),
2776
                    valid_utf8_input.size());
2777
}
2778
  #endif // SIMDUTF_SPAN
2779
2780
/**
2781
 * Given a valid UTF-8 string having a possibly truncated last character,
2782
 * this function checks the end of string. If the last character is truncated
2783
 * (or partial), then it returns a shorter length (shorter by 1 to 3 bytes) so
2784
 * that the short UTF-8 strings only contain complete characters. If there is no
2785
 * truncated character, the original length is returned.
2786
 *
2787
 * This function assumes that the input string is valid UTF-8, but possibly
2788
 * truncated.
2789
 *
2790
 * @param input         the UTF-8 string to process
2791
 * @param length        the length of the string in bytes
2792
 * @return the length of the string in bytes, possibly shorter by 1 to 3 bytes
2793
 */
2794
simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length);
2795
  #if SIMDUTF_SPAN
2796
simdutf_really_inline simdutf_warn_unused size_t trim_partial_utf8(
2797
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
2798
  return trim_partial_utf8(
2799
      reinterpret_cast<const char *>(valid_utf8_input.data()),
2800
      valid_utf8_input.size());
2801
}
2802
  #endif // SIMDUTF_SPAN
2803
#endif   // SIMDUTF_FEATURE_UTF8
2804
2805
#if SIMDUTF_FEATURE_UTF16
2806
/**
2807
 * Given a valid UTF-16BE string having a possibly truncated last character,
2808
 * this function checks the end of string. If the last character is truncated
2809
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
2810
 * the short UTF-16BE strings only contain complete characters. If there is no
2811
 * truncated character, the original length is returned.
2812
 *
2813
 * This function assumes that the input string is valid UTF-16BE, but possibly
2814
 * truncated.
2815
 *
2816
 * @param input         the UTF-16BE string to process
2817
 * @param length        the length of the string in 16-bit units
2818
 * @return the length of the string in units, possibly shorter by 1 unit
2819
 */
2820
simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input,
2821
                                                size_t length);
2822
  #if SIMDUTF_SPAN
2823
/**
 * Span overload of trim_partial_utf16be.
 *
 * Forwards the span's data pointer and element count to the pointer/length
 * overload.
 *
 * @param valid_utf16_input  a span of char16_t units
 * @return whatever the pointer/length overload of trim_partial_utf16be returns
 */
simdutf_really_inline simdutf_warn_unused size_t
trim_partial_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return trim_partial_utf16be(units, unit_count);
}
2828
  #endif // SIMDUTF_SPAN
2829
2830
/**
2831
 * Given a valid UTF-16LE string having a possibly truncated last character,
2832
 * this function checks the end of string. If the last character is truncated
2833
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
2834
 * the short UTF-16LE strings only contain complete characters. If there is no
2835
 * truncated character, the original length is returned.
2836
 *
2837
 * This function assumes that the input string is valid UTF-16LE, but possibly
2838
 * truncated.
2839
 *
2840
 * @param input         the UTF-16LE string to process
2841
 * @param length        the length of the string in 16-bit units
2842
 * @return the length of the string in units, possibly shorter by 1 unit
2843
 */
2844
simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input,
2845
                                                size_t length);
2846
  #if SIMDUTF_SPAN
2847
/**
 * Span overload of trim_partial_utf16le.
 *
 * Forwards the span's data pointer and element count to the pointer/length
 * overload.
 *
 * @param valid_utf16_input  a span of char16_t units
 * @return whatever the pointer/length overload of trim_partial_utf16le returns
 */
simdutf_really_inline simdutf_warn_unused size_t
trim_partial_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return trim_partial_utf16le(units, unit_count);
}
2852
  #endif // SIMDUTF_SPAN
2853
2854
/**
2855
 * Given a valid UTF-16 string having a possibly truncated last character,
2856
 * this function checks the end of string. If the last character is truncated
2857
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
2858
 * the short UTF-16 strings only contain complete characters. If there is no
2859
 * truncated character, the original length is returned.
2860
 *
2861
 * This function assumes that the input string is valid UTF-16, but possibly
2862
 * truncated. We use the native endianness.
2863
 *
2864
 * @param input         the UTF-16 string to process
2865
 * @param length        the length of the string in 16-bit units
2866
 * @return the length of the string in units, possibly shorter by 1 unit
2867
 */
2868
simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input,
2869
                                              size_t length);
2870
  #if SIMDUTF_SPAN
2871
/**
 * Span overload of trim_partial_utf16.
 *
 * Forwards the span's data pointer and element count to the pointer/length
 * overload.
 *
 * @param valid_utf16_input  a span of char16_t units
 * @return whatever the pointer/length overload of trim_partial_utf16 returns
 */
simdutf_really_inline simdutf_warn_unused size_t
trim_partial_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return trim_partial_utf16(units, unit_count);
}
2875
  #endif // SIMDUTF_SPAN
2876
#endif   // SIMDUTF_FEATURE_UTF16
2877
2878
#if SIMDUTF_FEATURE_BASE64
2879
  #ifndef SIMDUTF_NEED_TRAILING_ZEROES
2880
    #define SIMDUTF_NEED_TRAILING_ZEROES 1
2881
  #endif
2882
// base64_options are used to specify the base64 encoding options.
2883
// ASCII spaces are ' ', '\t', '\n', '\r', '\f'
2884
// garbage characters are characters that are not part of the base64 alphabet
2885
// nor ASCII spaces.
2886
/* Modifier (value 2) for base64_default and base64_url; it is OR-ed into the
   two "reverse padding" enumerators below. */
constexpr uint64_t base64_reverse_padding = 2;

enum base64_options : uint64_t {
  /* standard base64 format (with padding) */
  base64_default = 0,

  /* base64url format (no padding) */
  base64_url = 1,

  /* standard base64 format without padding */
  base64_default_no_padding = base64_default | base64_reverse_padding,

  /* base64url with padding */
  base64_url_with_padding = base64_url | base64_reverse_padding,

  /* standard base64 format accepting garbage characters, the input stops
     with the first '=' if any */
  base64_default_accept_garbage = 4,

  /* base64url format accepting garbage characters, the input stops with
     the first '=' if any */
  base64_url_accept_garbage = 5,

  /* standard/base64url hybrid format (only meaningful for decoding!) */
  base64_default_or_url = 8,

  /* standard/base64url hybrid format accepting garbage characters
     (only meaningful for decoding!), the input stops with the first '='
     if any */
  base64_default_or_url_accept_garbage = 12,
};
2909
2910
  #if SIMDUTF_CPLUSPLUS17
2911
0
/**
 * Return the name of a base64_options value as a string view.
 *
 * Every enumerator of base64_options maps to its own identifier. In
 * particular, the value 2 is reported as "base64_default_no_padding" (the
 * enumerator carrying that value) rather than under the name of the
 * base64_reverse_padding modifier constant, which is not an enumerator.
 *
 * @param options  the option value to name
 * @return the enumerator's identifier, or "<unknown>" when the value does not
 * match any enumerator
 */
inline std::string_view to_string(base64_options options) {
  switch (options) {
  case base64_default:
    return "base64_default";
  case base64_url:
    return "base64_url";
  case base64_default_no_padding:
    return "base64_default_no_padding";
  case base64_url_with_padding:
    return "base64_url_with_padding";
  case base64_default_accept_garbage:
    return "base64_default_accept_garbage";
  case base64_url_accept_garbage:
    return "base64_url_accept_garbage";
  case base64_default_or_url:
    return "base64_default_or_url";
  case base64_default_or_url_accept_garbage:
    return "base64_default_or_url_accept_garbage";
  }
  // Reached only for values that are not enumerators (e.g. produced by a cast).
  return "<unknown>";
}
2932
  #endif // SIMDUTF_CPLUSPLUS17
2933
2934
// last_chunk_handling_options are used to specify the handling of the last
2935
// chunk in base64 decoding.
2936
// https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
2937
enum last_chunk_handling_options : uint64_t {
  /* standard base64 format, decode partial final chunk */
  loose = 0,

  /* error when the last chunk is partial, 2 or 3 chars, and
     unpadded, or non-zero bit padding */
  strict = 1,

  /* if the last chunk is partial, ignore it (no error) */
  stop_before_partial = 2,

  /* only decode full blocks (4 base64 characters, no padding) */
  only_full_chunks = 3
};
2946
2947
/**
 * Tell whether the given last-chunk mode is one of the two modes
 * stop_before_partial or only_full_chunks.
 *
 * @param options  the last chunk handling mode to classify
 * @return true for stop_before_partial and only_full_chunks, false for every
 * other value
 */
inline bool is_partial(last_chunk_handling_options options) {
  switch (options) {
  case stop_before_partial:
  case only_full_chunks:
    return true;
  default:
    return false;
  }
}
2950
2951
  #if SIMDUTF_CPLUSPLUS17
2952
0
/**
 * Return the name of a last_chunk_handling_options value as a string view.
 *
 * @param options  the mode to name
 * @return the enumerator's identifier, or "<unknown>" when the value does not
 * match any enumerator
 */
inline std::string_view to_string(last_chunk_handling_options options) {
  if (options == loose) {
    return "loose";
  }
  if (options == strict) {
    return "strict";
  }
  if (options == stop_before_partial) {
    return "stop_before_partial";
  }
  if (options == only_full_chunks) {
    return "only_full_chunks";
  }
  return "<unknown>";
}
2965
  #endif
2966
2967
/**
2968
 * Provide the maximal binary length in bytes given the base64 input.
2969
 * In general, if the input contains ASCII spaces, the result will be less than
2970
 * the maximum length.
2971
 *
2972
 * @param input         the base64 input to process
2973
 * @param length        the length of the base64 input in bytes
2974
 * @return maximum number of binary bytes
2975
 */
2976
simdutf_warn_unused size_t
2977
maximal_binary_length_from_base64(const char *input, size_t length) noexcept;
2978
  #if SIMDUTF_SPAN
2979
/**
 * Span overload of maximal_binary_length_from_base64 for byte-like input.
 *
 * Views the span's storage as `const char *` and forwards it, together with
 * the span's size, to the pointer/length overload.
 *
 * @param input  a span of byte-like elements holding the base64 input
 * @return whatever the `const char *` overload returns
 */
simdutf_really_inline simdutf_warn_unused size_t
maximal_binary_length_from_base64(
    const detail::input_span_of_byte_like auto &input) noexcept {
  const auto *const base64_chars = reinterpret_cast<const char *>(input.data());
  const size_t char_count = input.size();
  return maximal_binary_length_from_base64(base64_chars, char_count);
}
2985
  #endif // SIMDUTF_SPAN
2986
2987
/**
2988
 * Provide the maximal binary length in bytes given the base64 input.
2989
 * In general, if the input contains ASCII spaces, the result will be less than
2990
 * the maximum length.
2991
 *
2992
 * @param input         the base64 input to process, in ASCII stored as 16-bit
2993
 * units
2994
 * @param length        the length of the base64 input in 16-bit units
2995
 * @return maximal number of binary bytes
2996
 */
2997
simdutf_warn_unused size_t maximal_binary_length_from_base64(
2998
    const char16_t *input, size_t length) noexcept;
2999
  #if SIMDUTF_SPAN
3000
/**
 * Span overload of maximal_binary_length_from_base64 for 16-bit input.
 *
 * Forwards the span's data pointer and element count to the
 * `const char16_t *` overload.
 *
 * @param input  a span of char16_t units holding the base64 input
 * @return whatever the `const char16_t *` overload returns
 */
simdutf_really_inline simdutf_warn_unused size_t
maximal_binary_length_from_base64(std::span<const char16_t> input) noexcept {
  const char16_t *const base64_units = input.data();
  const size_t unit_count = input.size();
  return maximal_binary_length_from_base64(base64_units, unit_count);
}
3004
  #endif // SIMDUTF_SPAN
3005
3006
/**
3007
 * Convert a base64 input to a binary output.
3008
 *
3009
 * This function follows the WHATWG forgiving-base64 format, which means that it
3010
 * will ignore any ASCII spaces in the input. You may provide a padded input
3011
 * (with one or two equal signs at the end) or an unpadded input (without any
3012
 * equal signs at the end).
3013
 *
3014
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
3015
 *
3016
 * This function will fail in case of invalid input. When last_chunk_options =
3017
 * loose, there are two possible reasons for failure: the input contains a
3018
 * number of base64 characters that when divided by 4, leaves a single remainder
3019
 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
3020
 * not a valid base64 character (INVALID_BASE64_CHARACTER).
3021
 *
3022
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
3023
 * input where the invalid character was found. When the error is
3024
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
3025
 *
3026
 * The default option (simdutf::base64_default) expects the characters `+` and
3027
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
3028
 * characters `-` and `_` as part of its alphabet.
3029
 *
3030
 * The padding (`=`) is validated if present. There may be at most two padding
3031
 * characters at the end of the input. If there are any padding characters, the
3032
 * total number of characters (excluding spaces but including padding
3033
 * characters) must be divisible by four.
3034
 *
3035
 * You should call this function with a buffer that is at least
3036
 * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
3037
 * provide that much space, the function may cause a buffer overflow.
3038
 *
3039
 * Advanced users may want to tailor how the last chunk is handled. By default,
3040
 * we use a loose (forgiving) approach but we also support a strict approach
3041
 * as well as a stop_before_partial approach, as per the following proposal:
3042
 *
3043
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3044
 *
3045
 * @param input         the base64 string to process
3046
 * @param length        the length of the string in bytes
3047
 * @param output        the pointer to a buffer that can hold the conversion
3048
 * result (should be at least maximal_binary_length_from_base64(input, length)
3049
 * bytes long).
3050
 * @param options       the base64 options to use, usually base64_default or
3051
 * base64_url, and base64_default by default.
3052
 * @param last_chunk_options the last chunk handling options,
3053
 * last_chunk_handling_options::loose by default
3054
 * but can also be last_chunk_handling_options::strict or
3055
 * last_chunk_handling_options::stop_before_partial.
3056
 * @return a result pair struct (of type simdutf::result containing the two
3057
 * fields error and count) with an error code and either position of the error
3058
 * (in the input in bytes) if any, or the number of bytes written if successful.
3059
 */
3060
simdutf_warn_unused result base64_to_binary(
3061
    const char *input, size_t length, char *output,
3062
    base64_options options = base64_default,
3063
    last_chunk_handling_options last_chunk_options = loose) noexcept;
3064
  #if SIMDUTF_SPAN
3065
/**
 * Span overload of base64_to_binary for byte-like input.
 *
 * Views the input and output storage as `char` pointers and forwards them to
 * the pointer overload. Only the data pointer of binary_output is passed on;
 * its size is not consulted here.
 *
 * @param input               a span of byte-like elements holding the base64
 * text
 * @param binary_output       a span of byte-like elements receiving the
 * decoded bytes
 * @param options             the base64 options, base64_default by default
 * @param last_chunk_options  the last chunk handling options, loose by default
 * @return the result produced by the pointer overload
 */
simdutf_really_inline simdutf_warn_unused result base64_to_binary(
    const detail::input_span_of_byte_like auto &input,
    detail::output_span_of_byte_like auto &&binary_output,
    base64_options options = base64_default,
    last_chunk_handling_options last_chunk_options = loose) noexcept {
  const auto *const src = reinterpret_cast<const char *>(input.data());
  const size_t src_len = input.size();
  auto *const dst = reinterpret_cast<char *>(binary_output.data());
  return base64_to_binary(src, src_len, dst, options, last_chunk_options);
}
3075
  #endif // SIMDUTF_SPAN
3076
3077
/**
3078
 * Provide the base64 length in bytes given the length of a binary input.
3079
 *
3080
 * @param length        the length of the input in bytes
3081
 * @return number of base64 bytes
3082
 */
3083
simdutf_warn_unused size_t base64_length_from_binary(
3084
    size_t length, base64_options options = base64_default) noexcept;
3085
3086
/**
3087
 * Convert a binary input to a base64 output.
3088
 *
3089
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
3090
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
3091
 * output to ensure that the output length is a multiple of four.
3092
 *
3093
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
3094
 * of its alphabet. No padding is added at the end of the output.
3095
 *
3096
 * This function always succeeds.
3097
 *
3098
 * @param input         the binary to process
3099
 * @param length        the length of the input in bytes
3100
 * @param output        the pointer to a buffer that can hold the conversion
3101
 * result (should be at least base64_length_from_binary(length) bytes long)
3102
 * @param options       the base64 options to use, can be base64_default or
3103
 * base64_url, is base64_default by default.
3104
 * @return number of written bytes, will be equal to
3105
 * base64_length_from_binary(length, options)
3106
 */
3107
size_t binary_to_base64(const char *input, size_t length, char *output,
3108
                        base64_options options = base64_default) noexcept;
3109
  #if SIMDUTF_SPAN
3110
/**
 * Span overload of binary_to_base64.
 *
 * Views the input and output storage as `char` pointers and forwards them to
 * the pointer overload. Only the data pointer of binary_output is passed on;
 * its size is not consulted here.
 *
 * @param input          a span of byte-like elements holding the binary input
 * @param binary_output  a span of byte-like elements receiving the output of
 * the pointer overload
 * @param options        the base64 options, base64_default by default
 * @return the value returned by the pointer overload
 */
simdutf_really_inline simdutf_warn_unused size_t
binary_to_base64(const detail::input_span_of_byte_like auto &input,
                 detail::output_span_of_byte_like auto &&binary_output,
                 base64_options options = base64_default) noexcept {
  const auto *const src = reinterpret_cast<const char *>(input.data());
  const size_t src_len = input.size();
  auto *const dst = reinterpret_cast<char *>(binary_output.data());
  return binary_to_base64(src, src_len, dst, options);
}
3118
  #endif // SIMDUTF_SPAN
3119
3120
  #if SIMDUTF_ATOMIC_REF
3121
/**
3122
 * Convert a binary input to a base64 output, using atomic accesses.
3123
 * This function comes with a potentially significant performance
3124
 * penalty, but it may be useful in some cases where the input
3125
 * buffers are shared between threads, to avoid undefined
3126
 * behavior in case of data races.
3127
 *
3128
 * The function is for advanced users. Its main use case is
3129
 * to silence sanitizer warnings. We have no documented use case
3130
 * where this function is actually necessary in terms of practical correctness.
3131
 *
3132
 * This function is only available when simdutf is compiled with
3133
 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
3134
 * the availability of this function by checking the macro
3135
 * SIMDUTF_ATOMIC_REF.
3136
 *
3137
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
3138
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
3139
 * output to ensure that the output length is a multiple of four.
3140
 *
3141
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
3142
 * of its alphabet. No padding is added at the end of the output.
3143
 *
3144
 * This function always succeeds.
3145
 *
3146
 * This function is considered experimental. It is not tested by default
3147
 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
3148
 * It is not documented in the public API documentation (README). It is
3149
 * offered on a best effort basis. We rely on the community for further
3150
 * testing and feedback.
3151
 *
3152
 * @brief atomic_binary_to_base64
3153
 * @param input         the binary to process
3154
 * @param length        the length of the input in bytes
3155
 * @param output        the pointer to a buffer that can hold the conversion
3156
 * result (should be at least base64_length_from_binary(length) bytes long)
3157
 * @param options       the base64 options to use, can be base64_default or
3158
 * base64_url, is base64_default by default.
3159
 * @return number of written bytes, will be equal to
3160
 * base64_length_from_binary(length, options)
3161
 */
3162
size_t
3163
atomic_binary_to_base64(const char *input, size_t length, char *output,
3164
                        base64_options options = base64_default) noexcept;
3165
    #if SIMDUTF_SPAN
3166
/**
 * Span overload of atomic_binary_to_base64.
 *
 * Views the input and output storage as `char` pointers and forwards them to
 * the pointer overload. Only the data pointer of binary_output is passed on;
 * its size is not consulted here.
 *
 * @param input          a span of byte-like elements holding the binary input
 * @param binary_output  a span of byte-like elements receiving the output of
 * the pointer overload
 * @param options        the base64 options, base64_default by default
 * @return the value returned by the pointer overload
 */
simdutf_really_inline simdutf_warn_unused size_t
atomic_binary_to_base64(const detail::input_span_of_byte_like auto &input,
                        detail::output_span_of_byte_like auto &&binary_output,
                        base64_options options = base64_default) noexcept {
  const auto *const src = reinterpret_cast<const char *>(input.data());
  const size_t src_len = input.size();
  auto *const dst = reinterpret_cast<char *>(binary_output.data());
  return atomic_binary_to_base64(src, src_len, dst, options);
}
3174
    #endif // SIMDUTF_SPAN
3175
  #endif   // SIMDUTF_ATOMIC_REF
3176
3177
/**
3178
 * Convert a base64 input to a binary output.
3179
 *
3180
 * This function follows the WHATWG forgiving-base64 format, which means that it
3181
 * will ignore any ASCII spaces in the input. You may provide a padded input
3182
 * (with one or two equal signs at the end) or an unpadded input (without any
3183
 * equal signs at the end).
3184
 *
3185
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
3186
 *
3187
 * This function will fail in case of invalid input. When last_chunk_options =
3188
 * loose, there are two possible reasons for failure: the input contains a
3189
 * number of base64 characters that when divided by 4, leaves a single remainder
3190
 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
3191
 * not a valid base64 character (INVALID_BASE64_CHARACTER).
3192
 *
3193
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
3194
 * input where the invalid character was found. When the error is
3195
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
3196
 *
3197
 * The default option (simdutf::base64_default) expects the characters `+` and
3198
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
3199
 * characters `-` and `_` as part of its alphabet.
3200
 *
3201
 * The padding (`=`) is validated if present. There may be at most two padding
3202
 * characters at the end of the input. If there are any padding characters, the
3203
 * total number of characters (excluding spaces but including padding
3204
 * characters) must be divisible by four.
3205
 *
3206
 * You should call this function with a buffer that is at least
3207
 * maximal_binary_length_from_base64(input, length) bytes long. If you fail
3208
 * to provide that much space, the function may cause a buffer overflow.
3209
 *
3210
 * Advanced users may want to tailor how the last chunk is handled. By default,
3211
 * we use a loose (forgiving) approach but we also support a strict approach
3212
 * as well as a stop_before_partial approach, as per the following proposal:
3213
 *
3214
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3215
 *
3216
 * @param input         the base64 string to process, in ASCII stored as 16-bit
3217
 * units
3218
 * @param length        the length of the string in 16-bit units
3219
 * @param output        the pointer to a buffer that can hold the conversion
3220
 * result (should be at least maximal_binary_length_from_base64(input, length)
3221
 * bytes long).
3222
 * @param options       the base64 options to use, can be base64_default or
3223
 * base64_url, is base64_default by default.
3224
 * @param last_chunk_options the last chunk handling options,
3225
 * last_chunk_handling_options::loose by default
3226
 * but can also be last_chunk_handling_options::strict or
3227
 * last_chunk_handling_options::stop_before_partial.
3228
 * @return a result pair struct (of type simdutf::result containing the two
3229
 * fields error and count) with an error code and position of the
3230
 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
3231
 * of bytes written if successful.
3232
 */
3233
simdutf_warn_unused result
3234
base64_to_binary(const char16_t *input, size_t length, char *output,
3235
                 base64_options options = base64_default,
3236
                 last_chunk_handling_options last_chunk_options =
3237
                     last_chunk_handling_options::loose) noexcept;
3238
  #if SIMDUTF_SPAN
3239
/**
 * Span overload of base64_to_binary for 16-bit input.
 *
 * Forwards the input span's data pointer and element count, plus the output
 * storage viewed as `char *`, to the `const char16_t *` overload. Only the
 * data pointer of binary_output is passed on; its size is not consulted here.
 *
 * @param input               a span of char16_t units holding the base64 text
 * @param binary_output       a span of byte-like elements receiving the
 * decoded bytes
 * @param options             the base64 options, base64_default by default
 * @param last_chunk_options  the last chunk handling options, loose by default
 * @return the result produced by the pointer overload
 */
simdutf_really_inline simdutf_warn_unused result base64_to_binary(
    std::span<const char16_t> input,
    detail::output_span_of_byte_like auto &&binary_output,
    base64_options options = base64_default,
    last_chunk_handling_options last_chunk_options = loose) noexcept {
  const char16_t *const src = input.data();
  const size_t src_len = input.size();
  auto *const dst = reinterpret_cast<char *>(binary_output.data());
  return base64_to_binary(src, src_len, dst, options, last_chunk_options);
}
3248
  #endif // SIMDUTF_SPAN
3249
3250
/**
3251
 * Check if a character is an ignorable base64 character.
3252
 * Checking a large input, character by character, is not computationally
3253
 * efficient.
3254
 *
3255
 * @param input         the character to check
3256
 * @param options       the base64 options to use, is base64_default by default.
3257
 * @return true if the character is an ignorable base64 character, false
3258
 * otherwise.
3259
 */
3260
simdutf_warn_unused bool
3261
base64_ignorable(char input, base64_options options = base64_default) noexcept;
3262
simdutf_warn_unused bool
3263
base64_ignorable(char16_t input,
3264
                 base64_options options = base64_default) noexcept;
3265
3266
/**
3267
 * Check if a character is a valid base64 character.
3268
 * Checking a large input, character by character, is not computationally
3269
 * efficient.
3270
 * Note that padding characters are not considered valid base64 characters in
3271
 * this context, nor are spaces.
3272
 *
3273
 * @param input         the character to check
3274
 * @param options       the base64 options to use, is base64_default by default.
3275
 * @return true if the character is a base64 character, false otherwise.
3276
 */
3277
simdutf_warn_unused bool
3278
base64_valid(char input, base64_options options = base64_default) noexcept;
3279
simdutf_warn_unused bool
3280
base64_valid(char16_t input, base64_options options = base64_default) noexcept;
3281
3282
/**
3283
 * Check if a character is a valid base64 character or the padding character
3284
 * ('='). Checking a large input, character by character, is not computationally
3285
 * efficient.
3286
 *
3287
 * @param input         the character to check
3288
 * @param options       the base64 options to use, is base64_default by default.
3289
 * @return true if the character is a base64 character or '=', false otherwise.
3290
 */
3291
simdutf_warn_unused bool
3292
base64_valid_or_padding(char input,
3293
                        base64_options options = base64_default) noexcept;
3294
simdutf_warn_unused bool
3295
base64_valid_or_padding(char16_t input,
3296
                        base64_options options = base64_default) noexcept;
3297
3298
/**
3299
 * Convert a base64 input to a binary output.
3300
 *
3301
 * This function follows the WHATWG forgiving-base64 format, which means that it
3302
 * will ignore any ASCII spaces in the input. You may provide a padded input
3303
 * (with one or two equal signs at the end) or an unpadded input (without any
3304
 * equal signs at the end).
3305
 *
3306
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
3307
 *
3308
 * This function will fail in case of invalid input. When last_chunk_options =
3309
 * loose, there are three possible reasons for failure: the input contains a
3310
 * number of base64 characters that when divided by 4, leaves a single remainder
3311
 * character (BASE64_INPUT_REMAINDER), the input contains a character that is
3312
 * not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer
3313
 * is too small (OUTPUT_BUFFER_TOO_SMALL).
3314
 *
3315
 * When OUTPUT_BUFFER_TOO_SMALL, we return both the number of bytes written
3316
 * and the number of units processed, see description of the parameters and
3317
 * returned value.
3318
 *
3319
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
3320
 * input where the invalid character was found. When the error is
3321
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
3322
 *
3323
 * The default option (simdutf::base64_default) expects the characters `+` and
3324
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
3325
 * characters `-` and `_` as part of its alphabet.
3326
 *
3327
 * The padding (`=`) is validated if present. There may be at most two padding
3328
 * characters at the end of the input. If there are any padding characters, the
3329
 * total number of characters (excluding spaces but including padding
3330
 * characters) must be divisible by four.
3331
 *
3332
 * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected
3333
 * to discard the output unless the parameter decode_up_to_bad_char is set to
3334
 * true. In that case, the function will decode up to the first invalid
3335
 * character. Extra padding characters ('=') are considered invalid characters.
3336
 *
3337
 * Advanced users may want to tailor how the last chunk is handled. By default,
3338
 * we use a loose (forgiving) approach but we also support a strict approach
3339
 * as well as a stop_before_partial approach, as per the following proposal:
3340
 *
3341
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3342
 *
3343
 * @param input         the base64 string to process, in ASCII stored as 8-bit
3344
 * or 16-bit units
3345
 * @param length        the length of the string in 8-bit or 16-bit units.
3346
 * @param output        the pointer to a buffer that can hold the conversion
3347
 * result.
3348
 * @param outlen        the number of bytes that can be written in the output
3349
 * buffer. Upon return, it is modified to reflect how many bytes were written.
3350
 * @param options       the base64 options to use, can be base64_default or
3351
 * base64_url, is base64_default by default.
3352
 * @param last_chunk_options the last chunk handling options,
3353
 * last_chunk_handling_options::loose by default
3354
 * but can also be last_chunk_handling_options::strict or
3355
 * last_chunk_handling_options::stop_before_partial.
3356
 * @param decode_up_to_bad_char if true, the function will decode up to the
3357
 * first invalid character. By default (false), it is assumed that the output
3358
 * buffer is to be discarded. When there are multiple errors in the input,
3359
 * using decode_up_to_bad_char might trigger a different error.
3360
 * @return a result pair struct (of type simdutf::result containing the two
3361
 * fields error and count) with an error code and position of the
3362
 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
3363
 * of units processed if successful.
3364
 */
3365
simdutf_warn_unused result
3366
base64_to_binary_safe(const char *input, size_t length, char *output,
3367
                      size_t &outlen, base64_options options = base64_default,
3368
                      last_chunk_handling_options last_chunk_options =
3369
                          last_chunk_handling_options::loose,
3370
                      bool decode_up_to_bad_char = false) noexcept;
3371
  #if SIMDUTF_SPAN
3372
/**
3373
 * @brief span overload
3374
 * @return a tuple of result and outlen
3375
 */
3376
/*
 * Span overload for byte-like input: the size of binary_output is passed to
 * the pointer overload as its in/out `outlen` argument, and the value left in
 * it after the call is returned as the second tuple element, next to the
 * result.
 */
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
base64_to_binary_safe(const detail::input_span_of_byte_like auto &input,
                      detail::output_span_of_byte_like auto &&binary_output,
                      base64_options options = base64_default,
                      last_chunk_handling_options last_chunk_options = loose,
                      bool decode_up_to_bad_char = false) noexcept {
  const auto *const src = reinterpret_cast<const char *>(input.data());
  auto *const dst = reinterpret_cast<char *>(binary_output.data());
  // Starts as the output capacity; updated in place by the callee.
  size_t out_bytes = binary_output.size();
  auto status =
      base64_to_binary_safe(src, input.size(), dst, out_bytes, options,
                            last_chunk_options, decode_up_to_bad_char);
  return {status, out_bytes};
}
3389
  #endif // SIMDUTF_SPAN
3390
3391
simdutf_warn_unused result
3392
base64_to_binary_safe(const char16_t *input, size_t length, char *output,
3393
                      size_t &outlen, base64_options options = base64_default,
3394
                      last_chunk_handling_options last_chunk_options =
3395
                          last_chunk_handling_options::loose,
3396
                      bool decode_up_to_bad_char = false) noexcept;
3397
  #if SIMDUTF_SPAN
3398
/**
3399
 * @brief span overload
3400
 * @return a tuple of result and outlen
3401
 */
3402
/*
 * Span overload for 16-bit input: the size of binary_output is passed to the
 * pointer overload as its in/out `outlen` argument, and the value left in it
 * after the call is returned as the second tuple element, next to the result.
 */
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
base64_to_binary_safe(std::span<const char16_t> input,
                      detail::output_span_of_byte_like auto &&binary_output,
                      base64_options options = base64_default,
                      last_chunk_handling_options last_chunk_options = loose,
                      bool decode_up_to_bad_char = false) noexcept {
  auto *const dst = reinterpret_cast<char *>(binary_output.data());
  // Starts as the output capacity; updated in place by the callee.
  size_t out_bytes = binary_output.size();
  auto status =
      base64_to_binary_safe(input.data(), input.size(), dst, out_bytes, options,
                            last_chunk_options, decode_up_to_bad_char);
  return {status, out_bytes};
}
3415
  #endif // SIMDUTF_SPAN
3416
3417
  #if SIMDUTF_ATOMIC_REF
3418
/**
3419
 * Convert a base64 input to a binary output with a size limit and using atomic
3420
 * operations.
3421
 *
3422
 * Like `base64_to_binary_safe` but using atomic operations, this function is
3423
 * thread-safe for concurrent memory access, allowing the output
3424
 * buffers to be shared between threads without undefined behavior in case of
3425
 * data races.
3426
 *
3427
 * This function comes with a potentially significant performance penalty, but
3428
 * is useful when thread safety is needed during base64 decoding.
3429
 *
3430
 * This function is only available when simdutf is compiled with
3431
 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
3432
 * the availability of this function by checking the macro
3433
 * SIMDUTF_ATOMIC_REF.
3434
 *
3435
 * This function is considered experimental. It is not tested by default
3436
 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
3437
 * It is not documented in the public API documentation (README). It is
3438
 * offered on a best effort basis. We rely on the community for further
3439
 * testing and feedback.
3440
 *
3441
 * @param input         the base64 input to decode
3442
 * @param length        the length of the input in bytes
3443
 * @param output        the pointer to buffer that can hold the conversion
3444
 * result
3445
 * @param outlen        the number of bytes that can be written in the output
3446
 * buffer. Upon return, it is modified to reflect how many bytes were written.
3447
 * @param options       the base64 options to use (default, url, etc.)
3448
 * @param last_chunk_options the last chunk handling options (loose, strict,
3449
 * stop_before_partial)
3450
 * @param decode_up_to_bad_char if true, the function will decode up to the
3451
 * first invalid character. By default (false), it is assumed that the output
3452
 * buffer is to be discarded. When there are multiple errors in the input,
3453
 * using decode_up_to_bad_char might trigger a different error.
3454
 * @return a result struct with an error code and count indicating error
3455
 * position or success
3456
 */
3457
simdutf_warn_unused result atomic_base64_to_binary_safe(
3458
    const char *input, size_t length, char *output, size_t &outlen,
3459
    base64_options options = base64_default,
3460
    last_chunk_handling_options last_chunk_options =
3461
        last_chunk_handling_options::loose,
3462
    bool decode_up_to_bad_char = false) noexcept;
3463
// Overload taking the base64 input as char16_t; all other parameter types
// match the char overload declared just above.
// NOTE(review): `length` is presumably counted in char16_t units here --
// confirm against the implementation.
simdutf_warn_unused result atomic_base64_to_binary_safe(
3464
    const char16_t *input, size_t length, char *output, size_t &outlen,
3465
    base64_options options = base64_default,
3466
    last_chunk_handling_options last_chunk_options = loose,
3467
    bool decode_up_to_bad_char = false) noexcept;
3468
    #if SIMDUTF_SPAN
3469
/**
3470
 * @brief span overload
3471
 * @return a tuple of result and outlen
3472
 */
3473
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
3474
atomic_base64_to_binary_safe(
3475
    const detail::input_span_of_byte_like auto &binary_input,
3476
    detail::output_span_of_byte_like auto &&output,
3477
    base64_options options = base64_default,
3478
    last_chunk_handling_options last_chunk_options =
3479
        last_chunk_handling_options::loose,
3480
    bool decode_up_to_bad_char = false) noexcept {
3481
  // Start with the number of elements in the output span; outlen is passed
  // by reference to the pointer-based overload.
  size_t outlen = output.size();
3482
  // Despite its name, `binary_input` is forwarded as the `input` argument
  // (the base64 text to decode); both spans are viewed as char buffers.
  auto ret = atomic_base64_to_binary_safe(
3483
      reinterpret_cast<const char *>(binary_input.data()), binary_input.size(),
3484
      reinterpret_cast<char *>(output.data()), outlen, options,
3485
      last_chunk_options, decode_up_to_bad_char);
3486
  // Return the result together with outlen as left by the call above.
  return {ret, outlen};
3487
}
3488
/**
3489
 * @brief span overload
3490
 * @return a tuple of result and outlen
3491
 */
3492
simdutf_warn_unused std::tuple<result, std::size_t>
3493
atomic_base64_to_binary_safe(
3494
    std::span<const char16_t> base64_input,
3495
    detail::output_span_of_byte_like auto &&binary_output,
3496
    base64_options options = base64_default,
3497
    last_chunk_handling_options last_chunk_options = loose,
3498
    bool decode_up_to_bad_char = false) noexcept {
3499
  size_t outlen = binary_output.size();
3500
  auto ret = atomic_base64_to_binary_safe(
3501
      base64_input.data(), base64_input.size(),
3502
      reinterpret_cast<char *>(binary_output.data()), outlen, options,
3503
      last_chunk_options, decode_up_to_bad_char);
3504
  return {ret, outlen};
3505
}
3506
    #endif // SIMDUTF_SPAN
3507
  #endif   // SIMDUTF_ATOMIC_REF
3508
3509
/**
3510
 * Find the first occurrence of a character in a string. If the character is
3511
 * not found, return a pointer to the end of the string.
3512
 * @param start        the start of the string
3513
 * @param end          the end of the string
3514
 * @param character    the character to find
3515
 * @return a pointer to the first occurrence of the character in the string,
3516
 * or a pointer to the end of the string if the character is not found.
3517
 *
3518
 */
3519
simdutf_warn_unused const char *find(const char *start, const char *end,
3520
                                     char character) noexcept;
3521
// Overload of find for ranges of char16_t; same parameter roles as the char
// version declared just above.
simdutf_warn_unused const char16_t *
3522
find(const char16_t *start, const char16_t *end, char16_t character) noexcept;
3523
#endif // SIMDUTF_FEATURE_BASE64
3524
3525
/**
3526
 * An implementation of simdutf for a particular CPU architecture.
3527
 *
3528
 * Also used to maintain the currently active implementation. The active
3529
 * implementation is automatically initialized on first use to the most advanced
3530
 * implementation supported by the host.
3531
 */
3532
class implementation {
3533
public:
3534
  /**
3535
   * The name of this implementation.
3536
   *
3537
   *     const implementation *impl = simdutf::active_implementation;
3538
   *     cout << "simdutf is optimized for " << impl->name() << "(" <<
3539
   * impl->description() << ")" << endl;
3540
   *
3541
   * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
3542
   */
3543
  virtual std::string name() const { return std::string(_name); }
3544
3545
  /**
3546
   * The description of this implementation.
3547
   *
3548
   *     const implementation *impl = simdutf::active_implementation;
3549
   *     cout << "simdutf is optimized for " << impl->name() << "(" <<
3550
   * impl->description() << ")" << endl;
3551
   *
3552
   * @return the description of the implementation
3553
   */
3554
  virtual std::string description() const { return std::string(_description); }
3555
3556
  /**
3557
   * The instruction sets this implementation is compiled against
3558
   * and the current CPU match. This function may poll the current CPU/system
3559
   * and should therefore not be called too often if performance is a concern.
3560
   *
3561
   *
3562
   * @return true if the implementation can be safely used on the current system
3563
   * (determined at runtime)
3564
   */
3565
  bool supported_by_runtime_system() const;
3566
3567
#if SIMDUTF_FEATURE_DETECT_ENCODING
3568
  /**
3569
   * This function will try to detect the encoding
3570
   * @param input the string to identify
3571
   * @param length the length of the string in bytes.
3572
   * @return the encoding type detected
3573
   */
3574
  virtual encoding_type autodetect_encoding(const char *input,
3575
                                            size_t length) const noexcept;
3576
3577
  /**
3578
   * This function will try to detect the possible encodings in one pass
3579
   * @param input the string to identify
3580
   * @param length the length of the string in bytes.
3581
   * @return an int describing the possible encodings detected
3582
   */
3583
  virtual int detect_encodings(const char *input,
3584
                               size_t length) const noexcept = 0;
3585
#endif // SIMDUTF_FEATURE_DETECT_ENCODING
3586
3587
  /**
3588
   * @private For internal implementation use
3589
   *
3590
   * The instruction sets this implementation is compiled against.
3591
   *
3592
   * @return a mask of all required `internal::instruction_set::` values
3593
   */
3594
  virtual uint32_t required_instruction_sets() const {
3595
    return _required_instruction_sets;
3596
  }
3597
3598
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
3599
  /**
3600
   * Validate the UTF-8 string.
3601
   *
3602
   * Overridden by each implementation.
3603
   *
3604
   * @param buf the UTF-8 string to validate.
3605
   * @param len the length of the string in bytes.
3606
   * @return true if and only if the string is valid UTF-8.
3607
   */
3608
  simdutf_warn_unused virtual bool validate_utf8(const char *buf,
3609
                                                 size_t len) const noexcept = 0;
3610
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
3611
3612
#if SIMDUTF_FEATURE_UTF8
3613
  /**
3614
   * Validate the UTF-8 string and stop on errors.
3615
   *
3616
   * Overridden by each implementation.
3617
   *
3618
   * @param buf the UTF-8 string to validate.
3619
   * @param len the length of the string in bytes.
3620
   * @return a result pair struct (of type simdutf::result containing the two
3621
   * fields error and count) with an error code and either position of the error
3622
   * (in the input in code units) if any, or the number of code units validated
3623
   * if successful.
3624
   */
3625
  simdutf_warn_unused virtual result
3626
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0;
3627
#endif // SIMDUTF_FEATURE_UTF8
3628
3629
#if SIMDUTF_FEATURE_ASCII
3630
  /**
3631
   * Validate the ASCII string.
3632
   *
3633
   * Overridden by each implementation.
3634
   *
3635
   * @param buf the ASCII string to validate.
3636
   * @param len the length of the string in bytes.
3637
   * @return true if and only if the string is valid ASCII.
3638
   */
3639
  simdutf_warn_unused virtual bool
3640
  validate_ascii(const char *buf, size_t len) const noexcept = 0;
3641
3642
  /**
3643
   * Validate the ASCII string and stop on error.
3644
   *
3645
   * Overridden by each implementation.
3646
   *
3647
   * @param buf the ASCII string to validate.
3648
   * @param len the length of the string in bytes.
3649
   * @return a result pair struct (of type simdutf::result containing the two
3650
   * fields error and count) with an error code and either position of the error
3651
   * (in the input in code units) if any, or the number of code units validated
3652
   * if successful.
3653
   */
3654
  simdutf_warn_unused virtual result
3655
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0;
3656
3657
#endif // SIMDUTF_FEATURE_ASCII
3658
3659
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
3660
  /**
3661
   * Validate the ASCII string as a UTF-16BE sequence.
3662
   * A UTF-16 sequence is considered an ASCII sequence
3663
   * if it could be converted to an ASCII string losslessly.
3664
   *
3665
   * Overridden by each implementation.
3666
   *
3667
   * @param buf the UTF-16BE string to validate.
3668
   * @param len the length of the string in bytes.
3669
   * @return true if and only if the string is valid ASCII.
3670
   */
3671
  simdutf_warn_unused virtual bool
3672
  validate_utf16be_as_ascii(const char16_t *buf, size_t len) const noexcept = 0;
3673
3674
  /**
3675
   * Validate the ASCII string as a UTF-16LE sequence.
3676
   * A UTF-16 sequence is considered an ASCII sequence
3677
   * if it could be converted to an ASCII string losslessly.
3678
   *
3679
   * Overridden by each implementation.
3680
   *
3681
   * @param buf the UTF-16LE string to validate.
3682
   * @param len the length of the string in bytes.
3683
   * @return true if and only if the string is valid ASCII.
3684
   */
3685
  simdutf_warn_unused virtual bool
3686
  validate_utf16le_as_ascii(const char16_t *buf, size_t len) const noexcept = 0;
3687
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
3688
3689
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
3690
  /**
3691
   * Validate the UTF-16LE string. This function may be best when you expect
3692
   * the input to be almost always valid. Otherwise, consider using
3693
   * validate_utf16le_with_errors.
3694
   *
3695
   * Overridden by each implementation.
3696
   *
3697
   * This function is not BOM-aware.
3698
   *
3699
   * @param buf the UTF-16LE string to validate.
3700
   * @param len the length of the string in number of 2-byte code units
3701
   * (char16_t).
3702
   * @return true if and only if the string is valid UTF-16LE.
3703
   */
3704
  simdutf_warn_unused virtual bool
3705
  validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0;
3706
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
3707
3708
#if SIMDUTF_FEATURE_UTF16
3709
  /**
3710
   * Validate the UTF-16BE string. This function may be best when you expect
3711
   * the input to be almost always valid. Otherwise, consider using
3712
   * validate_utf16be_with_errors.
3713
   *
3714
   * Overridden by each implementation.
3715
   *
3716
   * This function is not BOM-aware.
3717
   *
3718
   * @param buf the UTF-16BE string to validate.
3719
   * @param len the length of the string in number of 2-byte code units
3720
   * (char16_t).
3721
   * @return true if and only if the string is valid UTF-16BE.
3722
   */
3723
  simdutf_warn_unused virtual bool
3724
  validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0;
3725
3726
  /**
3727
   * Validate the UTF-16LE string and stop on error.  It might be faster than
3728
   * validate_utf16le when an error is expected to occur early.
3729
   *
3730
   * Overridden by each implementation.
3731
   *
3732
   * This function is not BOM-aware.
3733
   *
3734
   * @param buf the UTF-16LE string to validate.
3735
   * @param len the length of the string in number of 2-byte code units
3736
   * (char16_t).
3737
   * @return a result pair struct (of type simdutf::result containing the two
3738
   * fields error and count) with an error code and either position of the error
3739
   * (in the input in code units) if any, or the number of code units validated
3740
   * if successful.
3741
   */
3742
  simdutf_warn_unused virtual result
3743
  validate_utf16le_with_errors(const char16_t *buf,
3744
                               size_t len) const noexcept = 0;
3745
3746
  /**
3747
   * Validate the UTF-16BE string and stop on error. It might be faster than
3748
   * validate_utf16be when an error is expected to occur early.
3749
   *
3750
   * Overridden by each implementation.
3751
   *
3752
   * This function is not BOM-aware.
3753
   *
3754
   * @param buf the UTF-16BE string to validate.
3755
   * @param len the length of the string in number of 2-byte code units
3756
   * (char16_t).
3757
   * @return a result pair struct (of type simdutf::result containing the two
3758
   * fields error and count) with an error code and either position of the error
3759
   * (in the input in code units) if any, or the number of code units validated
3760
   * if successful.
3761
   */
3762
  simdutf_warn_unused virtual result
3763
  validate_utf16be_with_errors(const char16_t *buf,
3764
                               size_t len) const noexcept = 0;
3765
  /**
3766
   * Copies the UTF-16LE string while replacing mismatched surrogates with the
3767
   * Unicode replacement character U+FFFD. We allow the input and output to be
3768
   * the same buffer so that the correction is done in-place.
3769
   *
3770
   * Overridden by each implementation.
3771
   *
3772
   * @param input the UTF-16LE string to correct.
3773
   * @param len the length of the string in number of 2-byte code units
3774
   * (char16_t).
3775
   * @param output the output buffer.
3776
   */
3777
  virtual void to_well_formed_utf16le(const char16_t *input, size_t len,
3778
                                      char16_t *output) const noexcept = 0;
3779
  /**
3780
   * Copies the UTF-16BE string while replacing mismatched surrogates with the
3781
   * Unicode replacement character U+FFFD. We allow the input and output to be
3782
   * the same buffer so that the correction is done in-place.
3783
   *
3784
   * Overridden by each implementation.
3785
   *
3786
   * @param input the UTF-16BE string to correct.
3787
   * @param len the length of the string in number of 2-byte code units
3788
   * (char16_t).
3789
   * @param output the output buffer.
3790
   */
3791
  virtual void to_well_formed_utf16be(const char16_t *input, size_t len,
3792
                                      char16_t *output) const noexcept = 0;
3793
#endif // SIMDUTF_FEATURE_UTF16
3794
3795
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
3796
  /**
3797
   * Validate the UTF-32 string.
3798
   *
3799
   * Overridden by each implementation.
3800
   *
3801
   * This function is not BOM-aware.
3802
   *
3803
   * @param buf the UTF-32 string to validate.
3804
   * @param len the length of the string in number of 4-byte code units
3805
   * (char32_t).
3806
   * @return true if and only if the string is valid UTF-32.
3807
   */
3808
  simdutf_warn_unused virtual bool
3809
  validate_utf32(const char32_t *buf, size_t len) const noexcept = 0;
3810
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
3811
3812
#if SIMDUTF_FEATURE_UTF32
3813
  /**
3814
   * Validate the UTF-32 string and stop on error.
3815
   *
3816
   * Overridden by each implementation.
3817
   *
3818
   * This function is not BOM-aware.
3819
   *
3820
   * @param buf the UTF-32 string to validate.
3821
   * @param len the length of the string in number of 4-byte code units
3822
   * (char32_t).
3823
   * @return a result pair struct (of type simdutf::result containing the two
3824
   * fields error and count) with an error code and either position of the error
3825
   * (in the input in code units) if any, or the number of code units validated
3826
   * if successful.
3827
   */
3828
  simdutf_warn_unused virtual result
3829
  validate_utf32_with_errors(const char32_t *buf,
3830
                             size_t len) const noexcept = 0;
3831
#endif // SIMDUTF_FEATURE_UTF32
3832
3833
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
3834
  /**
3835
   * Convert Latin1 string into UTF-8 string.
3836
   *
3837
   * This function is suitable to work with inputs from untrusted sources.
3838
   *
3839
   * @param input         the Latin1 string to convert
3840
   * @param length        the length of the string in bytes
3841
   * @param utf8_output  the pointer to buffer that can hold conversion result
3842
   * @return the number of written char; 0 if conversion is not possible
3843
   */
3844
  simdutf_warn_unused virtual size_t
3845
  convert_latin1_to_utf8(const char *input, size_t length,
3846
                         char *utf8_output) const noexcept = 0;
3847
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
3848
3849
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
3850
  /**
3851
   * Convert Latin1 string into UTF-16LE string.
3852
   *
3853
   * This function is suitable to work with inputs from untrusted sources.
3854
   *
3855
   * @param input         the Latin1 string to convert
3856
   * @param length        the length of the string in bytes
3857
   * @param utf16_output  the pointer to buffer that can hold conversion result
3858
   * @return the number of written char16_t; 0 if conversion is not possible
3859
   */
3860
  simdutf_warn_unused virtual size_t
3861
  convert_latin1_to_utf16le(const char *input, size_t length,
3862
                            char16_t *utf16_output) const noexcept = 0;
3863
3864
  /**
3865
   * Convert Latin1 string into UTF-16BE string.
3866
   *
3867
   * This function is suitable to work with inputs from untrusted sources.
3868
   *
3869
   * @param input         the Latin1 string to convert
3870
   * @param length        the length of the string in bytes
3871
   * @param utf16_output  the pointer to buffer that can hold conversion result
3872
   * @return the number of written char16_t; 0 if conversion is not possible
3873
   */
3874
  simdutf_warn_unused virtual size_t
3875
  convert_latin1_to_utf16be(const char *input, size_t length,
3876
                            char16_t *utf16_output) const noexcept = 0;
3877
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
3878
3879
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
3880
  /**
3881
   * Convert Latin1 string into UTF-32 string.
3882
   *
3883
   * This function is suitable to work with inputs from untrusted sources.
3884
   *
3885
   * @param input         the Latin1 string to convert
3886
   * @param length        the length of the string in bytes
3887
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
3888
   * @return the number of written char32_t; 0 if conversion is not possible
3889
   */
3890
  simdutf_warn_unused virtual size_t
3891
  convert_latin1_to_utf32(const char *input, size_t length,
3892
                          char32_t *utf32_buffer) const noexcept = 0;
3893
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
3894
3895
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
3896
  /**
3897
   * Convert possibly broken UTF-8 string into latin1 string.
3898
   *
3899
   * During the conversion also validation of the input string is done.
3900
   * This function is suitable to work with inputs from untrusted sources.
3901
   *
3902
   * @param input         the UTF-8 string to convert
3903
   * @param length        the length of the string in bytes
3904
   * @param latin1_output  the pointer to buffer that can hold conversion result
3905
   * @return the number of written char; 0 if the input was not valid UTF-8
3906
   * string or if it cannot be represented as Latin1
3907
   */
3908
  simdutf_warn_unused virtual size_t
3909
  convert_utf8_to_latin1(const char *input, size_t length,
3910
                         char *latin1_output) const noexcept = 0;
3911
3912
  /**
3913
   * Convert possibly broken UTF-8 string into latin1 string with errors.
3914
   * If the string cannot be represented as Latin1, an error
3915
   * code is returned.
3916
   *
3917
   * During the conversion also validation of the input string is done.
3918
   * This function is suitable to work with inputs from untrusted sources.
3919
   *
3920
   * @param input         the UTF-8 string to convert
3921
   * @param length        the length of the string in bytes
3922
   * @param latin1_output  the pointer to buffer that can hold conversion result
3923
   * @return a result pair struct (of type simdutf::result containing the two
3924
   * fields error and count) with an error code and either position of the error
3925
   * (in the input in code units) if any, or the number of code units validated
3926
   * if successful.
3927
   */
3928
  simdutf_warn_unused virtual result
3929
  convert_utf8_to_latin1_with_errors(const char *input, size_t length,
3930
                                     char *latin1_output) const noexcept = 0;
3931
3932
  /**
3933
   * Convert valid UTF-8 string into latin1 string.
3934
   *
3935
   * This function assumes that the input string is valid UTF-8 and that it can
3936
   * be represented as Latin1. If you violate this assumption, the result is
3937
   * implementation defined and may include system-dependent behavior such as
3938
   * crashes.
3939
   *
3940
   * This function is for expert users only and not part of our public API. Use
3941
   * convert_utf8_to_latin1 instead.
3942
   *
3943
   * This function is not BOM-aware.
3944
   *
3945
   * @param input         the UTF-8 string to convert
3946
   * @param length        the length of the string in bytes
3947
   * @param latin1_output  the pointer to buffer that can hold conversion result
3948
   * @return the number of written char; 0 if the input was not valid UTF-8
3949
   * string
3950
   */
3951
  simdutf_warn_unused virtual size_t
3952
  convert_valid_utf8_to_latin1(const char *input, size_t length,
3953
                               char *latin1_output) const noexcept = 0;
3954
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
3955
3956
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
3957
  /**
3958
   * Convert possibly broken UTF-8 string into UTF-16LE string.
3959
   *
3960
   * During the conversion also validation of the input string is done.
3961
   * This function is suitable to work with inputs from untrusted sources.
3962
   *
3963
   * @param input         the UTF-8 string to convert
3964
   * @param length        the length of the string in bytes
3965
   * @param utf16_output  the pointer to buffer that can hold conversion result
3966
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
3967
   * string
3968
   */
3969
  simdutf_warn_unused virtual size_t
3970
  convert_utf8_to_utf16le(const char *input, size_t length,
3971
                          char16_t *utf16_output) const noexcept = 0;
3972
3973
  /**
3974
   * Convert possibly broken UTF-8 string into UTF-16BE string.
3975
   *
3976
   * During the conversion also validation of the input string is done.
3977
   * This function is suitable to work with inputs from untrusted sources.
3978
   *
3979
   * @param input         the UTF-8 string to convert
3980
   * @param length        the length of the string in bytes
3981
   * @param utf16_output  the pointer to buffer that can hold conversion result
3982
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
3983
   * string
3984
   */
3985
  simdutf_warn_unused virtual size_t
3986
  convert_utf8_to_utf16be(const char *input, size_t length,
3987
                          char16_t *utf16_output) const noexcept = 0;
3988
3989
  /**
3990
   * Convert possibly broken UTF-8 string into UTF-16LE string and stop on
3991
   * error.
3992
   *
3993
   * During the conversion also validation of the input string is done.
3994
   * This function is suitable to work with inputs from untrusted sources.
3995
   *
3996
   * @param input         the UTF-8 string to convert
3997
   * @param length        the length of the string in bytes
3998
   * @param utf16_output  the pointer to buffer that can hold conversion result
3999
   * @return a result pair struct (of type simdutf::result containing the two
4000
   * fields error and count) with an error code and either position of the error
4001
   * (in the input in code units) if any, or the number of code units validated
4002
   * if successful.
4003
   */
4004
  simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(
4005
      const char *input, size_t length,
4006
      char16_t *utf16_output) const noexcept = 0;
4007
4008
  /**
4009
   * Convert possibly broken UTF-8 string into UTF-16BE string and stop on
4010
   * error.
4011
   *
4012
   * During the conversion also validation of the input string is done.
4013
   * This function is suitable to work with inputs from untrusted sources.
4014
   *
4015
   * @param input         the UTF-8 string to convert
4016
   * @param length        the length of the string in bytes
4017
   * @param utf16_output  the pointer to buffer that can hold conversion result
4018
   * @return a result pair struct (of type simdutf::result containing the two
4019
   * fields error and count) with an error code and either position of the error
4020
   * (in the input in code units) if any, or the number of code units validated
4021
   * if successful.
4022
   */
4023
  simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(
4024
      const char *input, size_t length,
4025
      char16_t *utf16_output) const noexcept = 0;
4026
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4027
4028
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4029
  /**
4030
   * Convert possibly broken UTF-8 string into UTF-32 string.
4031
   *
4032
   * During the conversion also validation of the input string is done.
4033
   * This function is suitable to work with inputs from untrusted sources.
4034
   *
4035
   * @param input         the UTF-8 string to convert
4036
   * @param length        the length of the string in bytes
4037
   * @param utf32_output  the pointer to buffer that can hold conversion result
4038
   * @return the number of written char32_t; 0 if the input was not valid UTF-8
4039
   * string
4040
   */
4041
  simdutf_warn_unused virtual size_t
4042
  convert_utf8_to_utf32(const char *input, size_t length,
4043
                        char32_t *utf32_output) const noexcept = 0;
4044
4045
  /**
4046
   * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
4047
   *
4048
   * During the conversion also validation of the input string is done.
4049
   * This function is suitable to work with inputs from untrusted sources.
4050
   *
4051
   * @param input         the UTF-8 string to convert
4052
   * @param length        the length of the string in bytes
4053
   * @param utf32_output  the pointer to buffer that can hold conversion result
4054
   * @return a result pair struct (of type simdutf::result containing the two
4055
   * fields error and count) with an error code and either position of the error
4056
   * (in the input in code units) if any, or the number of char32_t written if
4057
   * successful.
4058
   */
4059
  simdutf_warn_unused virtual result
4060
  convert_utf8_to_utf32_with_errors(const char *input, size_t length,
4061
                                    char32_t *utf32_output) const noexcept = 0;
4062
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4063
4064
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4065
  /**
4066
   * Convert valid UTF-8 string into UTF-16LE string.
4067
   *
4068
   * This function assumes that the input string is valid UTF-8.
4069
   *
4070
   * @param input         the UTF-8 string to convert
4071
   * @param length        the length of the string in bytes
4072
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
4073
   * @return the number of written char16_t
4074
   */
4075
  simdutf_warn_unused virtual size_t
4076
  convert_valid_utf8_to_utf16le(const char *input, size_t length,
4077
                                char16_t *utf16_buffer) const noexcept = 0;
4078
4079
  /**
4080
   * Convert valid UTF-8 string into UTF-16BE string.
4081
   *
4082
   * This function assumes that the input string is valid UTF-8.
4083
   *
4084
   * @param input         the UTF-8 string to convert
4085
   * @param length        the length of the string in bytes
4086
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
4087
   * @return the number of written char16_t
4088
   */
4089
  simdutf_warn_unused virtual size_t
4090
  convert_valid_utf8_to_utf16be(const char *input, size_t length,
4091
                                char16_t *utf16_buffer) const noexcept = 0;
4092
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4093
4094
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4095
  /**
4096
   * Convert valid UTF-8 string into UTF-32 string.
4097
   *
4098
   * This function assumes that the input string is valid UTF-8.
4099
   *
4100
   * @param input         the UTF-8 string to convert
4101
   * @param length        the length of the string in bytes
4102
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
4103
   * @return the number of written char32_t
4104
   */
4105
  simdutf_warn_unused virtual size_t
4106
  convert_valid_utf8_to_utf32(const char *input, size_t length,
4107
                              char32_t *utf32_buffer) const noexcept = 0;
4108
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4109
4110
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4111
  /**
4112
   * Compute the number of 2-byte code units that this UTF-8 string would
4113
   * require in UTF-16LE format.
4114
   *
4115
   * This function does not validate the input. It is acceptable to pass invalid
4116
   * UTF-8 strings but in such cases the result is implementation defined.
4117
   *
4118
   * @param input         the UTF-8 string to process
4119
   * @param length        the length of the string in bytes
4120
   * @return the number of char16_t code units required to encode the UTF-8
4121
   * string as UTF-16LE
4122
   */
4123
  simdutf_warn_unused virtual size_t
4124
  utf16_length_from_utf8(const char *input, size_t length) const noexcept = 0;
4125
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4126
4127
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4128
  /**
4129
   * Compute the number of 4-byte code units that this UTF-8 string would
4130
   * require in UTF-32 format.
4131
   *
4132
   * This function is equivalent to count_utf8. It is acceptable to pass invalid
4133
   * UTF-8 strings but in such cases the result is implementation defined.
4134
   *
4135
   * This function does not validate the input.
4136
   *
4137
   * @param input         the UTF-8 string to process
4138
   * @param length        the length of the string in bytes
4139
   * @return the number of char32_t code units required to encode the UTF-8
4140
   * string as UTF-32
4141
   */
4142
  simdutf_warn_unused virtual size_t
4143
  utf32_length_from_utf8(const char *input, size_t length) const noexcept = 0;
4144
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4145
4146
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4147
  /**
4148
   * Convert possibly broken UTF-16LE string into Latin1 string.
4149
   *
4150
   * During the conversion also validation of the input string is done.
4151
   * This function is suitable to work with inputs from untrusted sources.
4152
   *
4153
   * This function is not BOM-aware.
4154
   *
4155
   * @param input         the UTF-16LE string to convert
4156
   * @param length        the length of the string in 2-byte code units
4157
   * (char16_t)
4158
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4159
   * result
4160
   * @return number of written code units; 0 if input is not a valid UTF-16LE
4161
   * string or if it cannot be represented as Latin1
4162
   */
4163
  simdutf_warn_unused virtual size_t
4164
  convert_utf16le_to_latin1(const char16_t *input, size_t length,
4165
                            char *latin1_buffer) const noexcept = 0;
4166
4167
  /**
4168
   * Convert possibly broken UTF-16BE string into Latin1 string.
4169
   *
4170
   * During the conversion also validation of the input string is done.
4171
   * This function is suitable to work with inputs from untrusted sources.
4172
   *
4173
   * This function is not BOM-aware.
4174
   *
4175
   * @param input         the UTF-16BE string to convert
4176
   * @param length        the length of the string in 2-byte code units
4177
   * (char16_t)
4178
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4179
   * result
4180
   * @return number of written code units; 0 if input is not a valid UTF-16BE
4181
   * string or if it cannot be represented as Latin1
4182
   */
4183
  simdutf_warn_unused virtual size_t
4184
  convert_utf16be_to_latin1(const char16_t *input, size_t length,
4185
                            char *latin1_buffer) const noexcept = 0;
4186
4187
  /**
4188
   * Convert possibly broken UTF-16LE string into Latin1 string.
4189
   * If the string cannot be represented as Latin1, an error
4190
   * is returned.
4191
   *
4192
   * During the conversion also validation of the input string is done.
4193
   * This function is suitable to work with inputs from untrusted sources.
4194
   * This function is not BOM-aware.
4195
   *
4196
   * @param input         the UTF-16LE string to convert
4197
   * @param length        the length of the string in 2-byte code units
4198
   * (char16_t)
4199
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4200
   * result
4201
   * @return a result pair struct (of type simdutf::result containing the two
4202
   * fields error and count) with an error code and either position of the error
4203
   * (in the input in code units) if any, or the number of char written if
4204
   * successful.
4205
   */
4206
  simdutf_warn_unused virtual result
4207
  convert_utf16le_to_latin1_with_errors(const char16_t *input, size_t length,
4208
                                        char *latin1_buffer) const noexcept = 0;
4209
4210
  /**
4211
   * Convert possibly broken UTF-16BE string into Latin1 string.
4212
   * If the string cannot be represented as Latin1, an error
4213
   * is returned.
4214
   *
4215
   * During the conversion also validation of the input string is done.
4216
   * This function is suitable to work with inputs from untrusted sources.
4217
   * This function is not BOM-aware.
4218
   *
4219
   * @param input         the UTF-16BE string to convert
4220
   * @param length        the length of the string in 2-byte code units
4221
   * (char16_t)
4222
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4223
   * result
4224
   * @return a result pair struct (of type simdutf::result containing the two
4225
   * fields error and count) with an error code and either position of the error
4226
   * (in the input in code units) if any, or the number of char written if
4227
   * successful.
4228
   */
4229
  simdutf_warn_unused virtual result
4230
  convert_utf16be_to_latin1_with_errors(const char16_t *input, size_t length,
4231
                                        char *latin1_buffer) const noexcept = 0;
4232
4233
  /**
4234
   * Convert valid UTF-16LE string into Latin1 string.
4235
   *
4236
   * This function assumes that the input string is valid UTF-16LE and that it
4237
   * can be represented as Latin1. If you violate this assumption, the result is
4238
   * implementation defined and may include system-dependent behavior such as
4239
   * crashes.
4240
   *
4241
   * This function is for expert users only and not part of our public API. Use
4242
   * convert_utf16le_to_latin1 instead.
4243
   *
4244
   * This function is not BOM-aware.
4245
   *
4246
   * @param input         the UTF-16LE string to convert
4247
   * @param length        the length of the string in 2-byte code units
4248
   * (char16_t)
4249
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4250
   * result
4251
   * @return number of written code units; 0 if conversion is not possible
4252
   */
4253
  simdutf_warn_unused virtual size_t
4254
  convert_valid_utf16le_to_latin1(const char16_t *input, size_t length,
4255
                                  char *latin1_buffer) const noexcept = 0;
4256
4257
  /**
4258
   * Convert valid UTF-16BE string into Latin1 string.
4259
   *
4260
   * This function assumes that the input string is valid UTF-16BE and that it
4261
   * can be represented as Latin1. If you violate this assumption, the result is
4262
   * implementation defined and may include system-dependent behavior such as
4263
   * crashes.
4264
   *
4265
   * This function is for expert users only and not part of our public API. Use
4266
   * convert_utf16be_to_latin1 instead.
4267
   *
4268
   * This function is not BOM-aware.
4269
   *
4270
   * @param input         the UTF-16BE string to convert
4271
   * @param length        the length of the string in 2-byte code units
4272
   * (char16_t)
4273
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4274
   * result
4275
   * @return number of written code units; 0 if conversion is not possible
4276
   */
4277
  simdutf_warn_unused virtual size_t
4278
  convert_valid_utf16be_to_latin1(const char16_t *input, size_t length,
4279
                                  char *latin1_buffer) const noexcept = 0;
4280
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4281
4282
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4283
  /**
4284
   * Convert possibly broken UTF-16LE string into UTF-8 string.
4285
   *
4286
   * During the conversion also validation of the input string is done.
4287
   * This function is suitable to work with inputs from untrusted sources.
4288
   *
4289
   * This function is not BOM-aware.
4290
   *
4291
   * @param input         the UTF-16LE string to convert
4292
   * @param length        the length of the string in 2-byte code units
4293
   * (char16_t)
4294
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4295
   * @return number of written code units; 0 if input is not a valid UTF-16LE
4296
   * string
4297
   */
4298
  simdutf_warn_unused virtual size_t
4299
  convert_utf16le_to_utf8(const char16_t *input, size_t length,
4300
                          char *utf8_buffer) const noexcept = 0;
4301
4302
  /**
4303
   * Convert possibly broken UTF-16BE string into UTF-8 string.
4304
   *
4305
   * During the conversion also validation of the input string is done.
4306
   * This function is suitable to work with inputs from untrusted sources.
4307
   *
4308
   * This function is not BOM-aware.
4309
   *
4310
   * @param input         the UTF-16BE string to convert
4311
   * @param length        the length of the string in 2-byte code units
4312
   * (char16_t)
4313
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4314
   * @return number of written code units; 0 if input is not a valid UTF-16BE
4315
   * string
4316
   */
4317
  simdutf_warn_unused virtual size_t
4318
  convert_utf16be_to_utf8(const char16_t *input, size_t length,
4319
                          char *utf8_buffer) const noexcept = 0;
4320
4321
  /**
4322
   * Convert possibly broken UTF-16LE string into UTF-8 string and stop on
4323
   * error.
4324
   *
4325
   * During the conversion also validation of the input string is done.
4326
   * This function is suitable to work with inputs from untrusted sources.
4327
   *
4328
   * This function is not BOM-aware.
4329
   *
4330
   * @param input         the UTF-16LE string to convert
4331
   * @param length        the length of the string in 2-byte code units
4332
   * (char16_t)
4333
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4334
   * @return a result pair struct (of type simdutf::result containing the two
4335
   * fields error and count) with an error code and either position of the error
4336
   * (in the input in code units) if any, or the number of char written if
4337
   * successful.
4338
   */
4339
  simdutf_warn_unused virtual result
4340
  convert_utf16le_to_utf8_with_errors(const char16_t *input, size_t length,
4341
                                      char *utf8_buffer) const noexcept = 0;
4342
4343
  /**
4344
   * Convert possibly broken UTF-16BE string into UTF-8 string and stop on
4345
   * error.
4346
   *
4347
   * During the conversion also validation of the input string is done.
4348
   * This function is suitable to work with inputs from untrusted sources.
4349
   *
4350
   * This function is not BOM-aware.
4351
   *
4352
   * @param input         the UTF-16BE string to convert
4353
   * @param length        the length of the string in 2-byte code units
4354
   * (char16_t)
4355
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4356
   * @return a result pair struct (of type simdutf::result containing the two
4357
   * fields error and count) with an error code and either position of the error
4358
   * (in the input in code units) if any, or the number of char written if
4359
   * successful.
4360
   */
4361
  simdutf_warn_unused virtual result
4362
  convert_utf16be_to_utf8_with_errors(const char16_t *input, size_t length,
4363
                                      char *utf8_buffer) const noexcept = 0;
4364
4365
  /**
4366
   * Convert valid UTF-16LE string into UTF-8 string.
4367
   *
4368
   * This function assumes that the input string is valid UTF-16LE.
4369
   *
4370
   * This function is not BOM-aware.
4371
   *
4372
   * @param input         the UTF-16LE string to convert
4373
   * @param length        the length of the string in 2-byte code units
4374
   * (char16_t)
4375
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
4376
   * result
4377
   * @return number of written code units; 0 if conversion is not possible
4378
   */
4379
  simdutf_warn_unused virtual size_t
4380
  convert_valid_utf16le_to_utf8(const char16_t *input, size_t length,
4381
                                char *utf8_buffer) const noexcept = 0;
4382
4383
  /**
4384
   * Convert valid UTF-16BE string into UTF-8 string.
4385
   *
4386
   * This function assumes that the input string is valid UTF-16BE.
4387
   *
4388
   * This function is not BOM-aware.
4389
   *
4390
   * @param input         the UTF-16BE string to convert
4391
   * @param length        the length of the string in 2-byte code units
4392
   * (char16_t)
4393
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
4394
   * result
4395
   * @return number of written code units; 0 if conversion is not possible
4396
   */
4397
  simdutf_warn_unused virtual size_t
4398
  convert_valid_utf16be_to_utf8(const char16_t *input, size_t length,
4399
                                char *utf8_buffer) const noexcept = 0;
4400
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4401
4402
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4403
  /**
4404
   * Convert possibly broken UTF-16LE string into UTF-32 string.
4405
   *
4406
   * During the conversion also validation of the input string is done.
4407
   * This function is suitable to work with inputs from untrusted sources.
4408
   *
4409
   * This function is not BOM-aware.
4410
   *
4411
   * @param input         the UTF-16LE string to convert
4412
   * @param length        the length of the string in 2-byte code units
4413
   * (char16_t)
4414
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4415
   * @return number of written code units; 0 if input is not a valid UTF-16LE
4416
   * string
4417
   */
4418
  simdutf_warn_unused virtual size_t
4419
  convert_utf16le_to_utf32(const char16_t *input, size_t length,
4420
                           char32_t *utf32_buffer) const noexcept = 0;
4421
4422
  /**
4423
   * Convert possibly broken UTF-16BE string into UTF-32 string.
4424
   *
4425
   * During the conversion also validation of the input string is done.
4426
   * This function is suitable to work with inputs from untrusted sources.
4427
   *
4428
   * This function is not BOM-aware.
4429
   *
4430
   * @param input         the UTF-16BE string to convert
4431
   * @param length        the length of the string in 2-byte code units
4432
   * (char16_t)
4433
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4434
   * @return number of written code units; 0 if input is not a valid UTF-16BE
4435
   * string
4436
   */
4437
  simdutf_warn_unused virtual size_t
4438
  convert_utf16be_to_utf32(const char16_t *input, size_t length,
4439
                           char32_t *utf32_buffer) const noexcept = 0;
4440
4441
  /**
4442
   * Convert possibly broken UTF-16LE string into UTF-32 string and stop on
4443
   * error.
4444
   *
4445
   * During the conversion also validation of the input string is done.
4446
   * This function is suitable to work with inputs from untrusted sources.
4447
   *
4448
   * This function is not BOM-aware.
4449
   *
4450
   * @param input         the UTF-16LE string to convert
4451
   * @param length        the length of the string in 2-byte code units
4452
   * (char16_t)
4453
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4454
   * @return a result pair struct (of type simdutf::result containing the two
4455
   * fields error and count) with an error code and either position of the error
4456
   * (in the input in code units) if any, or the number of char32_t written if
4457
   * successful.
4458
   */
4459
  simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(
4460
      const char16_t *input, size_t length,
4461
      char32_t *utf32_buffer) const noexcept = 0;
4462
4463
  /**
4464
   * Convert possibly broken UTF-16BE string into UTF-32 string and stop on
4465
   * error.
4466
   *
4467
   * During the conversion also validation of the input string is done.
4468
   * This function is suitable to work with inputs from untrusted sources.
4469
   *
4470
   * This function is not BOM-aware.
4471
   *
4472
   * @param input         the UTF-16BE string to convert
4473
   * @param length        the length of the string in 2-byte code units
4474
   * (char16_t)
4475
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4476
   * @return a result pair struct (of type simdutf::result containing the two
4477
   * fields error and count) with an error code and either position of the error
4478
   * (in the input in code units) if any, or the number of char32_t written if
4479
   * successful.
4480
   */
4481
  simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(
4482
      const char16_t *input, size_t length,
4483
      char32_t *utf32_buffer) const noexcept = 0;
4484
4485
  /**
4486
   * Convert valid UTF-16LE string into UTF-32 string.
4487
   *
4488
   * This function assumes that the input string is valid UTF-16LE.
4489
   *
4490
   * This function is not BOM-aware.
4491
   *
4492
   * @param input         the UTF-16LE string to convert
4493
   * @param length        the length of the string in 2-byte code units
4494
   * (char16_t)
4495
   * @param utf32_buffer   the pointer to a buffer that can hold the conversion
4496
   * result
4497
   * @return number of written code units; 0 if conversion is not possible
4498
   */
4499
  simdutf_warn_unused virtual size_t
4500
  convert_valid_utf16le_to_utf32(const char16_t *input, size_t length,
4501
                                 char32_t *utf32_buffer) const noexcept = 0;
4502
4503
  /**
4504
   * Convert valid UTF-16BE string into UTF-32 string.
4505
   *
4506
   * This function assumes that the input string is valid UTF-16BE.
4507
   *
4508
   * This function is not BOM-aware.
4509
   *
4510
   * @param input         the UTF-16BE string to convert
4511
   * @param length        the length of the string in 2-byte code units
4512
   * (char16_t)
4513
   * @param utf32_buffer   the pointer to a buffer that can hold the conversion
4514
   * result
4515
   * @return number of written code units; 0 if conversion is not possible
4516
   */
4517
  simdutf_warn_unused virtual size_t
4518
  convert_valid_utf16be_to_utf32(const char16_t *input, size_t length,
4519
                                 char32_t *utf32_buffer) const noexcept = 0;
4520
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4521
4522
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4523
  /**
4524
   * Compute the number of bytes that this UTF-16LE string would require in
4525
   * UTF-8 format.
4526
   *
4527
   * This function does not validate the input. It is acceptable to pass invalid
4528
   * UTF-16 strings but in such cases the result is implementation defined.
4529
   *
4530
   * This function is not BOM-aware.
4531
   *
4532
   * @param input         the UTF-16LE string to convert
4533
   * @param length        the length of the string in 2-byte code units
4534
   * (char16_t)
4535
   * @return the number of bytes required to encode the UTF-16LE string as UTF-8
4536
   */
4537
  simdutf_warn_unused virtual size_t
4538
  utf8_length_from_utf16le(const char16_t *input,
4539
                           size_t length) const noexcept = 0;
4540
4541
  /**
4542
   * Compute the number of bytes that this UTF-16BE string would require in
4543
   * UTF-8 format.
4544
   *
4545
   * This function does not validate the input. It is acceptable to pass invalid
4546
   * UTF-16 strings but in such cases the result is implementation defined.
4547
   *
4548
   * This function is not BOM-aware.
4549
   *
4550
   * @param input         the UTF-16BE string to convert
4551
   * @param length        the length of the string in 2-byte code units
4552
   * (char16_t)
4553
   * @return the number of bytes required to encode the UTF-16BE string as UTF-8
4554
   */
4555
  simdutf_warn_unused virtual size_t
4556
  utf8_length_from_utf16be(const char16_t *input,
4557
                           size_t length) const noexcept = 0;
4558
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4559
4560
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4561
  /**
4562
   * Convert possibly broken UTF-32 string into Latin1 string.
4563
   *
4564
   * During the conversion also validation of the input string is done.
4565
   * This function is suitable to work with inputs from untrusted sources.
4566
   *
4567
   * This function is not BOM-aware.
4568
   *
4569
   * @param input         the UTF-32 string to convert
4570
   * @param length        the length of the string in 4-byte code units
4571
   * (char32_t)
4572
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4573
   * result
4574
   * @return number of written code units; 0 if input is not a valid UTF-32
4575
   * string
4576
   */
4577
  simdutf_warn_unused virtual size_t
4578
  convert_utf32_to_latin1(const char32_t *input, size_t length,
4579
                          char *latin1_buffer) const noexcept = 0;
4580
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4581
4582
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4583
  /**
4584
   * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
4585
   * If the string cannot be represented as Latin1, an error is returned.
4586
   *
4587
   * During the conversion also validation of the input string is done.
4588
   * This function is suitable to work with inputs from untrusted sources.
4589
   *
4590
   * This function is not BOM-aware.
4591
   *
4592
   * @param input         the UTF-32 string to convert
4593
   * @param length        the length of the string in 4-byte code units
4594
   * (char32_t)
4595
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4596
   * result
4597
   * @return a result pair struct (of type simdutf::result containing the two
4598
   * fields error and count) with an error code and either position of the error
4599
   * (in the input in code units) if any, or the number of char written if
4600
   * successful.
4601
   */
4602
  simdutf_warn_unused virtual result
4603
  convert_utf32_to_latin1_with_errors(const char32_t *input, size_t length,
4604
                                      char *latin1_buffer) const noexcept = 0;
4605
4606
  /**
4607
   * Convert valid UTF-32 string into Latin1 string.
4608
   *
4609
   * This function assumes that the input string is valid UTF-32 and can be
4610
   * represented as Latin1. If you violate this assumption, the result is
4611
   * implementation defined and may include system-dependent behavior such as
4612
   * crashes.
4613
   *
4614
   * This function is for expert users only and not part of our public API. Use
4615
   * convert_utf32_to_latin1 instead.
4616
   *
4617
   * This function is not BOM-aware.
4618
   *
4619
   * @param input         the UTF-32 string to convert
4620
   * @param length        the length of the string in 4-byte code units
4621
   * (char32_t)
4622
   * @param latin1_buffer   the pointer to a buffer that can hold the conversion
4623
   * result
4624
   * @return number of written code units; 0 if conversion is not possible
4625
   */
4626
  simdutf_warn_unused virtual size_t
4627
  convert_valid_utf32_to_latin1(const char32_t *input, size_t length,
4628
                                char *latin1_buffer) const noexcept = 0;
4629
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4630
4631
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4632
  /**
4633
   * Convert possibly broken UTF-32 string into UTF-8 string.
4634
   *
4635
   * During the conversion also validation of the input string is done.
4636
   * This function is suitable to work with inputs from untrusted sources.
4637
   *
4638
   * This function is not BOM-aware.
4639
   *
4640
   * @param input         the UTF-32 string to convert
4641
   * @param length        the length of the string in 4-byte code units
4642
   * (char32_t)
4643
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4644
   * @return number of written code units; 0 if input is not a valid UTF-32
4645
   * string
4646
   */
4647
  simdutf_warn_unused virtual size_t
4648
  convert_utf32_to_utf8(const char32_t *input, size_t length,
4649
                        char *utf8_buffer) const noexcept = 0;
4650
4651
  /**
4652
   * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
4653
   *
4654
   * During the conversion also validation of the input string is done.
4655
   * This function is suitable to work with inputs from untrusted sources.
4656
   *
4657
   * This function is not BOM-aware.
4658
   *
4659
   * @param input         the UTF-32 string to convert
4660
   * @param length        the length of the string in 4-byte code units
4661
   * (char32_t)
4662
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4663
   * @return a result pair struct (of type simdutf::result containing the two
4664
   * fields error and count) with an error code and either position of the error
4665
   * (in the input in code units) if any, or the number of char written if
4666
   * successful.
4667
   */
4668
  simdutf_warn_unused virtual result
4669
  convert_utf32_to_utf8_with_errors(const char32_t *input, size_t length,
4670
                                    char *utf8_buffer) const noexcept = 0;
4671
4672
  /**
4673
   * Convert valid UTF-32 string into UTF-8 string.
4674
   *
4675
   * This function assumes that the input string is valid UTF-32.
4676
   *
4677
   * This function is not BOM-aware.
4678
   *
4679
   * @param input         the UTF-32 string to convert
4680
   * @param length        the length of the string in 4-byte code units
4681
   * (char32_t)
4682
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
4683
   * result
4684
   * @return number of written code units; 0 if conversion is not possible
4685
   */
4686
  simdutf_warn_unused virtual size_t
4687
  convert_valid_utf32_to_utf8(const char32_t *input, size_t length,
4688
                              char *utf8_buffer) const noexcept = 0;
4689
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4690
4691
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4692
  /**
4693
   * Return the number of 2-byte code units (char16_t) that a Latin1 string
4694
   * would require in UTF-16 format.
4695
   *
4696
   * The result equals the input length, so no input pointer is taken.
4697
   *
4698
   * @param length        the length of the Latin1 string in bytes
4699
   * @return the number of char16_t code units required to encode the Latin1
4700
   * string as UTF-16
4701
   */
4702
  simdutf_warn_unused virtual size_t
4703
  utf16_length_from_latin1(size_t length) const noexcept {
4704
    return length;
4705
  }
4706
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4707
4708
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4709
  /**
4710
   * Convert possibly broken UTF-32 string into UTF-16LE string.
4711
   *
4712
   * During the conversion also validation of the input string is done.
4713
   * This function is suitable to work with inputs from untrusted sources.
4714
   *
4715
   * This function is not BOM-aware.
4716
   *
4717
   * @param input         the UTF-32 string to convert
4718
   * @param length        the length of the string in 4-byte code units
4719
   * (char32_t)
4720
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4721
   * @return number of written code units; 0 if input is not a valid UTF-32
4722
   * string
4723
   */
4724
  simdutf_warn_unused virtual size_t
4725
  convert_utf32_to_utf16le(const char32_t *input, size_t length,
4726
                           char16_t *utf16_buffer) const noexcept = 0;
4727
4728
  /**
4729
   * Convert possibly broken UTF-32 string into UTF-16BE string.
4730
   *
4731
   * During the conversion also validation of the input string is done.
4732
   * This function is suitable to work with inputs from untrusted sources.
4733
   *
4734
   * This function is not BOM-aware.
4735
   *
4736
   * @param input         the UTF-32 string to convert
4737
   * @param length        the length of the string in 4-byte code units
4738
   * (char32_t)
4739
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4740
   * @return number of written code units; 0 if input is not a valid UTF-32
4741
   * string
4742
   */
4743
  simdutf_warn_unused virtual size_t
4744
  convert_utf32_to_utf16be(const char32_t *input, size_t length,
4745
                           char16_t *utf16_buffer) const noexcept = 0;
4746
4747
  /**
4748
   * Convert possibly broken UTF-32 string into UTF-16LE string and stop on
4749
   * error.
4750
   *
4751
   * During the conversion also validation of the input string is done.
4752
   * This function is suitable to work with inputs from untrusted sources.
4753
   *
4754
   * This function is not BOM-aware.
4755
   *
4756
   * @param input         the UTF-32 string to convert
4757
   * @param length        the length of the string in 4-byte code units
4758
   * (char32_t)
4759
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4760
   * @return a result pair struct (of type simdutf::result containing the two
4761
   * fields error and count) with an error code and either position of the error
4762
   * (in the input in code units) if any, or the number of char16_t written if
4763
   * successful.
4764
   */
4765
  simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(
4766
      const char32_t *input, size_t length,
4767
      char16_t *utf16_buffer) const noexcept = 0;
4768
4769
  /**
4770
   * Convert possibly broken UTF-32 string into UTF-16BE string and stop on
4771
   * error.
4772
   *
4773
   * During the conversion also validation of the input string is done.
4774
   * This function is suitable to work with inputs from untrusted sources.
4775
   *
4776
   * This function is not BOM-aware.
4777
   *
4778
   * @param input         the UTF-32 string to convert
4779
   * @param length        the length of the string in 4-byte code units
4780
   * (char32_t)
4781
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4782
   * @return a result pair struct (of type simdutf::result containing the two
4783
   * fields error and count) with an error code and either position of the error
4784
   * (in the input in code units) if any, or the number of char16_t written if
4785
   * successful.
4786
   */
4787
  simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(
4788
      const char32_t *input, size_t length,
4789
      char16_t *utf16_buffer) const noexcept = 0;
4790
4791
  /**
4792
   * Convert valid UTF-32 string into UTF-16LE string.
4793
   *
4794
   * This function assumes that the input string is valid UTF-32.
4795
   *
4796
   * This function is not BOM-aware.
4797
   *
4798
   * @param input         the UTF-32 string to convert
4799
   * @param length        the length of the string in 4-byte code units
4800
   * (char32_t)
4801
   * @param utf16_buffer   the pointer to a buffer that can hold the conversion
4802
   * result
4803
   * @return number of written code units; 0 if conversion is not possible
4804
   */
4805
  simdutf_warn_unused virtual size_t
4806
  convert_valid_utf32_to_utf16le(const char32_t *input, size_t length,
4807
                                 char16_t *utf16_buffer) const noexcept = 0;
4808
4809
  /**
4810
   * Convert valid UTF-32 string into UTF-16BE string.
4811
   *
4812
   * This function assumes that the input string is valid UTF-32.
4813
   *
4814
   * This function is not BOM-aware.
4815
   *
4816
   * @param input         the UTF-32 string to convert
4817
   * @param length        the length of the string in 4-byte code units
4818
   * (char32_t)
4819
   * @param utf16_buffer   the pointer to a buffer that can hold the conversion
4820
   * result
4821
   * @return number of written code units; 0 if conversion is not possible
4822
   */
4823
  simdutf_warn_unused virtual size_t
4824
  convert_valid_utf32_to_utf16be(const char32_t *input, size_t length,
4825
                                 char16_t *utf16_buffer) const noexcept = 0;
4826
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4827
4828
#if SIMDUTF_FEATURE_UTF16
4829
  /**
4830
   * Change the endianness of the input. Can be used to go from UTF-16LE to
4831
   * UTF-16BE or from UTF-16BE to UTF-16LE.
4832
   *
4833
   * This function does not validate the input.
4834
   *
4835
   * This function is not BOM-aware.
4836
   *
4837
   * @param input         the UTF-16 string to process
4838
   * @param length        the length of the string in 2-byte code units
4839
   * (char16_t)
4840
   * @param output        the pointer to a buffer that can hold the conversion
4841
   * result
4842
   */
4843
  virtual void change_endianness_utf16(const char16_t *input, size_t length,
4844
                                       char16_t *output) const noexcept = 0;
4845
#endif // SIMDUTF_FEATURE_UTF16
4846
4847
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4848
  /**
4849
   * Return the number of bytes that this Latin1 string would require in UTF-8
4850
   * format.
4851
   *
4852
   * @param input         the Latin1 string to convert
4853
   * @param length        the length of the string in bytes
4854
   * @return the number of bytes required to encode the Latin1 string as UTF-8
4855
   */
4856
  simdutf_warn_unused virtual size_t
4857
  utf8_length_from_latin1(const char *input, size_t length) const noexcept = 0;
4858
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4859
4860
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4861
  /**
4862
   * Compute the number of bytes that this UTF-32 string would require in UTF-8
4863
   * format.
4864
   *
4865
   * This function does not validate the input. It is acceptable to pass invalid
4866
   * UTF-32 strings but in such cases the result is implementation defined.
4867
   *
4868
   * @param input         the UTF-32 string to convert
4869
   * @param length        the length of the string in 4-byte code units
4870
   * (char32_t)
4871
   * @return the number of bytes required to encode the UTF-32 string as UTF-8
4872
   */
4873
  simdutf_warn_unused virtual size_t
4874
  utf8_length_from_utf32(const char32_t *input,
4875
                         size_t length) const noexcept = 0;
4876
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4877
4878
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4879
  /**
4880
   * Compute the number of bytes that this UTF-32 string would require in Latin1
4881
   * format.
4882
   *
4883
   * This function does not validate the input. It is acceptable to pass invalid
4884
   * UTF-32 strings but in such cases the result is implementation defined.
4885
   *
4886
   * @param length        the length of the string in 4-byte code units
4887
   * (char32_t)
4888
   * @return the number of bytes required to encode the UTF-32 string as Latin1
4889
   */
4890
  simdutf_warn_unused virtual size_t
4891
  latin1_length_from_utf32(size_t length) const noexcept {
4892
    return length; // default: the input length is returned unchanged
4893
  }
4894
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4895
4896
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4897
  /**
4898
   * Compute the number of bytes that this UTF-8 string would require in Latin1
4899
   * format.
4900
   *
4901
   * This function does not validate the input. It is acceptable to pass invalid
4902
   * UTF-8 strings but in such cases the result is implementation defined.
4903
   *
4904
   * @param input         the UTF-8 string to convert
4905
   * @param length        the length of the string in bytes
4906
   * @return the number of bytes required to encode the UTF-8 string as Latin1
4907
   */
4908
  simdutf_warn_unused virtual size_t
4909
  latin1_length_from_utf8(const char *input, size_t length) const noexcept = 0;
4910
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4911
4912
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4913
  /**
4914
   * Compute the number of bytes that this UTF-16LE/BE string would require in
4915
   * Latin1 format.
4916
   *
4917
   * This function does not validate the input. It is acceptable to pass invalid
4918
   * UTF-16 strings but in such cases the result is implementation defined.
4919
   *
4920
   * This function is not BOM-aware.
4921
   *
4922
   * Only the length is taken as input; the string contents are not needed.
4923
   * @param length        the length of the string in 2-byte code units
4924
   * (char16_t)
4925
   * @return the number of bytes required to encode the UTF-16 string as
4926
   * Latin1
4927
   */
4928
  simdutf_warn_unused virtual size_t
4929
  latin1_length_from_utf16(size_t length) const noexcept {
4930
    return length; // default: the input length is returned unchanged
4931
  }
4932
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4933
4934
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4935
  /**
4936
   * Compute the number of two-byte code units that this UTF-32 string would
4937
   * require in UTF-16 format.
4938
   *
4939
   * This function does not validate the input. It is acceptable to pass invalid
4940
   * UTF-32 strings but in such cases the result is implementation defined.
4941
   *
4942
   * @param input         the UTF-32 string to convert
4943
   * @param length        the length of the string in 4-byte code units
4944
   * (char32_t)
4945
   * @return the number of char16_t units needed to encode the string as UTF-16
4946
   */
4947
  simdutf_warn_unused virtual size_t
4948
  utf16_length_from_utf32(const char32_t *input,
4949
                          size_t length) const noexcept = 0;
4950
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4951
4952
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4953
  /**
4954
   * Return the number of 4-byte code units (char32_t) that this Latin1 string
4955
   * would require in UTF-32 format.
4956
   *
4957
   * @param length        the length of the Latin1 string in bytes
4958
   * @return the number of 4-byte code units (char32_t) required to encode the
4959
   * Latin1 string as UTF-32
4960
   */
4961
  simdutf_warn_unused virtual size_t
4962
  utf32_length_from_latin1(size_t length) const noexcept {
4963
    return length; // default: the input length is returned unchanged
4964
  }
4965
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4966
4967
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4968
  /**
4969
   * Compute the number of 4-byte code units (char32_t) that this UTF-16LE
4970
   * string would require in UTF-32 format.
4971
   *
4972
   * This function is equivalent to count_utf16le.
4973
   *
4974
   * This function does not validate the input. It is acceptable to pass invalid
4975
   * UTF-16 strings but in such cases the result is implementation defined.
4976
   *
4977
   * This function is not BOM-aware.
4978
   *
4979
   * @param input         the UTF-16LE string to convert
4980
   * @param length        the length of the string in 2-byte code units
4981
   * (char16_t)
4982
   * @return the number of 4-byte code units (char32_t) required to encode the
4983
   * UTF-16LE string as UTF-32
4984
   */
4985
  simdutf_warn_unused virtual size_t
4986
  utf32_length_from_utf16le(const char16_t *input,
4987
                            size_t length) const noexcept = 0;
4988
4989
  /**
4990
   * Compute the number of 4-byte code units (char32_t) that this UTF-16BE
4991
   * string would require in UTF-32 format.
4992
   *
4993
   * This function is equivalent to count_utf16be.
4994
   *
4995
   * This function does not validate the input. It is acceptable to pass invalid
4996
   * UTF-16 strings but in such cases the result is implementation defined.
4997
   *
4998
   * This function is not BOM-aware.
4999
   *
5000
   * @param input         the UTF-16BE string to convert
5001
   * @param length        the length of the string in 2-byte code units
5002
   * (char16_t)
5003
   * @return the number of 4-byte code units (char32_t) required to encode the
5004
   * UTF-16BE string as UTF-32
5005
   */
5006
  simdutf_warn_unused virtual size_t
5007
  utf32_length_from_utf16be(const char16_t *input,
5008
                            size_t length) const noexcept = 0;
5009
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5010
5011
#if SIMDUTF_FEATURE_UTF16
5012
  /**
5013
   * Count the number of code points (characters) in the string assuming that
5014
   * it is valid.
5015
   *
5016
   * This function assumes that the input string is valid UTF-16LE.
5017
   * It is acceptable to pass invalid UTF-16 strings but in such cases
5018
   * the result is implementation defined.
5019
   *
5020
   * This function is not BOM-aware.
5021
   *
5022
   * @param input         the UTF-16LE string to process
5023
   * @param length        the length of the string in 2-byte code units
5024
   * (char16_t)
5025
   * @return number of code points
5026
   */
5027
  simdutf_warn_unused virtual size_t
5028
  count_utf16le(const char16_t *input, size_t length) const noexcept = 0;
5029
5030
  /**
5031
   * Count the number of code points (characters) in the string assuming that
5032
   * it is valid.
5033
   *
5034
   * This function assumes that the input string is valid UTF-16BE.
5035
   * It is acceptable to pass invalid UTF-16 strings but in such cases
5036
   * the result is implementation defined.
5037
   *
5038
   * This function is not BOM-aware.
5039
   *
5040
   * @param input         the UTF-16BE string to process
5041
   * @param length        the length of the string in 2-byte code units
5042
   * (char16_t)
5043
   * @return number of code points
5044
   */
5045
  simdutf_warn_unused virtual size_t
5046
  count_utf16be(const char16_t *input, size_t length) const noexcept = 0;
5047
#endif // SIMDUTF_FEATURE_UTF16
5048
5049
#if SIMDUTF_FEATURE_UTF8
5050
  /**
5051
   * Count the number of code points (characters) in the string assuming that
5052
   * it is valid.
5053
   *
5054
   * This function assumes that the input string is valid UTF-8.
5055
   * It is acceptable to pass invalid UTF-8 strings but in such cases
5056
   * the result is implementation defined.
5057
   *
5058
   * @param input         the UTF-8 string to process
5059
   * @param length        the length of the string in bytes
5060
   * @return number of code points
5061
   */
5062
  simdutf_warn_unused virtual size_t
5063
  count_utf8(const char *input, size_t length) const noexcept = 0;
5064
#endif // SIMDUTF_FEATURE_UTF8
5065
5066
#if SIMDUTF_FEATURE_BASE64
5067
  /**
5068
   * Provide the maximal binary length in bytes given the base64 input.
5069
   * In general, if the input contains ASCII spaces, the result will be less
5070
   * than the maximum length. It is acceptable to pass invalid base64 strings
5071
   * but in such cases the result is implementation defined.
5072
   *
5073
   * @param input         the base64 input to process
5074
   * @param length        the length of the base64 input in bytes
5075
   * @return maximal number of binary bytes
5076
   */
5077
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
5078
      const char *input, size_t length) const noexcept;
5079
5080
  /**
5081
   * Provide the maximal binary length in bytes given the base64 input.
5082
   * In general, if the input contains ASCII spaces, the result will be less
5083
   * than the maximum length. It is acceptable to pass invalid base64 strings
5084
   * but in such cases the result is implementation defined.
5085
   *
5086
   * @param input         the base64 input to process, in ASCII stored as 16-bit
5087
   * units
5088
   * @param length        the length of the base64 input in 16-bit units
5089
   * @return maximal number of binary bytes
5090
   */
5091
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
5092
      const char16_t *input, size_t length) const noexcept;
5093
5094
  /**
5095
   * Convert a base64 input to a binary output.
5096
   *
5097
   * This function follows the WHATWG forgiving-base64 format, which means that
5098
   * it will ignore any ASCII spaces in the input. You may provide a padded
5099
   * input (with one or two equal signs at the end) or an unpadded input
5100
   * (without any equal signs at the end).
5101
   *
5102
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5103
   *
5104
   * This function will fail in case of invalid input. When last_chunk_options =
5105
   * loose, there are two possible reasons for failure: the input contains a
5106
   * number of base64 characters that when divided by 4, leaves a single
5107
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5108
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5109
   *
5110
   * You should call this function with a buffer that is at least
5111
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
5112
   * provide that much space, the function may cause a buffer overflow.
5113
   *
5114
   * @param input         the base64 string to process
5115
   * @param length        the length of the string in bytes
5116
   * @param output        the pointer to a buffer that can hold the conversion
5117
   * result (should be at least maximal_binary_length_from_base64(input, length)
5118
   * bytes long).
5119
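   * @param options       the base64 options to use, can be base64_default or
5120
   * base64_url, is base64_default by default.
   * @param last_chunk_options how the last chunk of the input is handled, is
   * last_chunk_handling_options::loose by default.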
5121
   * @return a result pair struct (of type simdutf::result containing the two
5122
   * fields error and count) with an error code and either position of the error
5123
   * (in the input in bytes) if any, or the number of bytes written if
5124
   * successful.
5125
   */
5126
  simdutf_warn_unused virtual result
5127
  base64_to_binary(const char *input, size_t length, char *output,
5128
                   base64_options options = base64_default,
5129
                   last_chunk_handling_options last_chunk_options =
5130
                       last_chunk_handling_options::loose) const noexcept = 0;
5131
5132
  /**
5133
   * Convert a base64 input to a binary output while returning more details
5134
   * than base64_to_binary.
5135
   *
5136
   * This function follows the WHATWG forgiving-base64 format, which means that
5137
   * it will ignore any ASCII spaces in the input. You may provide a padded
5138
   * input (with one or two equal signs at the end) or an unpadded input
5139
   * (without any equal signs at the end).
5140
   *
5141
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5142
   *
5143
   * This function will fail in case of invalid input. When last_chunk_options =
5144
   * loose, there are two possible reasons for failure: the input contains a
5145
   * number of base64 characters that when divided by 4, leaves a single
5146
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5147
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5148
   *
5149
   * You should call this function with a buffer that is at least
5150
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
5151
   * provide that much space, the function may cause a buffer overflow.
5152
   *
5153
   * @param input         the base64 string to process
5154
   * @param length        the length of the string in bytes
5155
   * @param output        the pointer to a buffer that can hold the conversion
5156
   * result (should be at least maximal_binary_length_from_base64(input, length)
5157
   * bytes long).
5158
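   * @param options       the base64 options to use, can be base64_default or
5159
   * base64_url, is base64_default by default.
   * @param last_chunk_options how the last chunk of the input is handled, is
   * last_chunk_handling_options::loose by default.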
5160
   * @return a full_result struct (of type simdutf::full_result containing the
5161
   * three fields error, input_count and output_count).
5162
   */
5163
  simdutf_warn_unused virtual full_result base64_to_binary_details(
5164
      const char *input, size_t length, char *output,
5165
      base64_options options = base64_default,
5166
      last_chunk_handling_options last_chunk_options =
5167
          last_chunk_handling_options::loose) const noexcept = 0;
5168
  /**
5169
   * Convert a base64 input to a binary output.
5170
   *
5171
   * This function follows the WHATWG forgiving-base64 format, which means that
5172
   * it will ignore any ASCII spaces in the input. You may provide a padded
5173
   * input (with one or two equal signs at the end) or an unpadded input
5174
   * (without any equal signs at the end).
5175
   *
5176
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5177
   *
5178
   * This function will fail in case of invalid input. When last_chunk_options =
5179
   * loose, there are two possible reasons for failure: the input contains a
5180
   * number of base64 characters that when divided by 4, leaves a single
5181
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5182
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5183
   *
5184
   * You should call this function with a buffer that is at least
5185
   * maximal_binary_length_from_base64(input, length) bytes long. If you
5186
   * fail to provide that much space, the function may cause a buffer overflow.
5187
   *
5188
   * @param input         the base64 string to process, in ASCII stored as
5189
   * 16-bit units
5190
   * @param length        the length of the string in 16-bit units
5191
   * @param output        the pointer to a buffer that can hold the conversion
5192
   * result (should be at least maximal_binary_length_from_base64(input, length)
5193
   * bytes long).
5194
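   * @param options       the base64 options to use, can be base64_default or
5195
   * base64_url, is base64_default by default.
   * @param last_chunk_options how the last chunk of the input is handled, is
   * last_chunk_handling_options::loose by default.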
5196
   * @return a result pair struct (of type simdutf::result containing the two
5197
   * fields error and count) with an error code and position of the
5198
   * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the
5199
   * number of bytes written if successful.
5200
   */
5201
  simdutf_warn_unused virtual result
5202
  base64_to_binary(const char16_t *input, size_t length, char *output,
5203
                   base64_options options = base64_default,
5204
                   last_chunk_handling_options last_chunk_options =
5205
                       last_chunk_handling_options::loose) const noexcept = 0;
5206
5207
  /**
5208
   * Convert a base64 input to a binary output while returning more details
5209
   * than base64_to_binary.
5210
   *
5211
   * This function follows the WHATWG forgiving-base64 format, which means that
5212
   * it will ignore any ASCII spaces in the input. You may provide a padded
5213
   * input (with one or two equal signs at the end) or an unpadded input
5214
   * (without any equal signs at the end).
5215
   *
5216
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5217
   *
5218
   * This function will fail in case of invalid input. When last_chunk_options =
5219
   * loose, there are two possible reasons for failure: the input contains a
5220
   * number of base64 characters that when divided by 4, leaves a single
5221
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5222
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5223
   *
5224
   * You should call this function with a buffer that is at least
5225
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
5226
   * provide that much space, the function may cause a buffer overflow.
5227
   *
5228
   * @param input         the base64 string to process, as 16-bit units
5229
   * @param length        the length of the string in 16-bit units
5230
   * @param output        the pointer to a buffer that can hold the conversion
5231
   * result (should be at least maximal_binary_length_from_base64(input, length)
5232
   * bytes long).
5233
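   * @param options       the base64 options to use, can be base64_default or
5234
   * base64_url, is base64_default by default.
   * @param last_chunk_options how the last chunk of the input is handled, is
   * last_chunk_handling_options::loose by default.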
5235
   * @return a full_result struct (of type simdutf::full_result containing the
5236
   * three fields error, input_count and output_count).
5237
   */
5238
  simdutf_warn_unused virtual full_result base64_to_binary_details(
5239
      const char16_t *input, size_t length, char *output,
5240
      base64_options options = base64_default,
5241
      last_chunk_handling_options last_chunk_options =
5242
          last_chunk_handling_options::loose) const noexcept = 0;
5243
  /**
5244
   * Provide the base64 length in bytes given the length of a binary input.
5245
   *
5246
   * @param length        the length of the input in bytes
5247
   * @param options       the base64 options to use, can be base64_default or
5248
   * base64_url, is base64_default by default.
5249
   * @return number of base64 bytes
5250
   */
5251
  simdutf_warn_unused size_t base64_length_from_binary(
5252
      size_t length, base64_options options = base64_default) const noexcept;
5253
5254
  /**
5255
   * Convert a binary input to a base64 output.
5256
   *
5257
   * The default option (simdutf::base64_default) uses the characters `+` and
5258
   * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of
5259
   * the output to ensure that the output length is a multiple of four.
5260
   *
5261
   * The URL option (simdutf::base64_url) uses the characters `-` and `_` as
5262
   * part of its alphabet. No padding is added at the end of the output.
5263
   *
5264
   * This function always succeeds.
5265
   *
5266
   * @param input         the binary to process
5267
   * @param length        the length of the input in bytes
5268
   * @param output        the pointer to a buffer that can hold the conversion
5269
   * result (should be at least base64_length_from_binary(length) bytes long)
5270
   * @param options       the base64 options to use, can be base64_default or
5271
   * base64_url, is base64_default by default.
5272
   * @return number of written bytes, will be equal to
5273
   * base64_length_from_binary(length, options)
5274
   */
5275
  virtual size_t
5276
  binary_to_base64(const char *input, size_t length, char *output,
5277
                   base64_options options = base64_default) const noexcept = 0;
5278
  /**
5279
   * Find the first occurrence of a character in a string. If the character is
5280
   * not found, return a pointer to the end of the string.
5281
   * @param start        the start of the string
5282
   * @param end          the end of the string
5283
   * @param character    the character to find
5284
   * @return a pointer to the first occurrence of the character in the string,
5285
   * or a pointer to the end of the string if the character is not found.
5286
   *
5287
   */
5288
  virtual const char *find(const char *start, const char *end,
5289
                           char character) const noexcept = 0;
5290
  virtual const char16_t *find(const char16_t *start, const char16_t *end,
5291
                               char16_t character) const noexcept = 0;
5292
#endif // SIMDUTF_FEATURE_BASE64
5293
5294
#ifdef SIMDUTF_INTERNAL_TESTS
5295
  // This method is exported only in developer mode, its purpose
5296
  // is to expose some internal test procedures from the given
5297
  // implementation and then use them through our standard test
5298
  // framework.
5299
  //
5300
  // Regular users should not use it, the tests of the public
5301
  // API are enough.
5302
5303
  // A named internal test routine, as returned by internal_tests().
  struct TestProcedure {
5304
    // display name
5305
    std::string name;
5306
5307
    // test entry point; it receives the implementation to exercise and returns
    // void, so the pass/fail outcome is not conveyed through a return value
5308
    void (*procedure)(const implementation &);
5309
  };
5310
5311
  virtual std::vector<TestProcedure> internal_tests() const;
5312
#endif
5313
5314
protected:
5315
  /** @private Construct an implementation with the given name and description.
5316
   * For subclasses.
   *
   * The name and description pointers are stored as-is; the strings they
   * point to are not copied.
   *
   * @param name                      stored in _name
   * @param description               stored in _description
   * @param required_instruction_sets stored in _required_instruction_sets
   */
5317
  simdutf_really_inline implementation(const char *name,
5318
                                       const char *description,
5319
                                       uint32_t required_instruction_sets)
5320
      : _name(name), _description(description),
5321
        _required_instruction_sets(required_instruction_sets) {}
5322
5323
protected:
5324
  // Declared in the protected section, so code outside this class and its
  // subclasses cannot destroy an implementation object directly.
  ~implementation() = default;
5325
5326
private:
5327
  /**
5328
   * The name of this implementation.
5329
   */
5330
  const char *_name;
5331
5332
  /**
5333
   * The description of this implementation.
5334
   */
5335
  const char *_description;
5336
5337
  /**
5338
   * Instruction sets required for this implementation.
5339
   */
5340
  const uint32_t _required_instruction_sets;
5341
};
5342
5343
/** @private */
5344
namespace internal {
5345
5346
/**
5347
 * The list of available implementations compiled into simdutf.
5348
 */
5349
class available_implementation_list {
5350
public:
5351
  /** Get the list of available implementations compiled into simdutf */
5352
  simdutf_really_inline available_implementation_list() {}
5353
  /** Number of implementations */
5354
  size_t size() const noexcept;
5355
  /** STL const begin() iterator */
5356
  const implementation *const *begin() const noexcept;
5357
  /** STL const end() iterator */
5358
  const implementation *const *end() const noexcept;
5359
5360
  /**
5361
   * Get the implementation with the given name.
5362
   *
5363
   * Case sensitive.
5364
   *
5365
   *     const implementation *impl =
5366
   *         simdutf::get_available_implementations()["westmere"];
5367
   *     if (!impl || !impl->supported_by_runtime_system()) { exit(1); }
5368
   *     simdutf::get_active_implementation() = impl;
5369
   *
5370
   * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
5371
   * @return the implementation, or nullptr if no implementation has that name.
5372
   */
5373
  const implementation *operator[](const std::string &name) const noexcept {
5374
    // linear scan over begin()..end(); the first exact name match wins
    for (const implementation *impl : *this) {
5375
      if (impl->name() == name) {
5376
        return impl;
5377
      }
5378
    }
5379
    return nullptr;
5380
  }
5381
5382
  /**
5383
   * Detect the most advanced implementation supported by the current host.
5384
   *
5385
   * This is used to initialize the implementation on startup.
5386
   *
5387
   *     const implementation *impl =
5388
   *         simdutf::get_available_implementations().detect_best_supported();
5389
   *     simdutf::get_active_implementation() = impl;
5390
   *
5391
   * @return the most advanced supported implementation for the current host, or
5392
   * an implementation that returns UNSUPPORTED_ARCHITECTURE if there is no
5393
   * supported implementation. Will never return nullptr.
5394
   */
5395
  const implementation *detect_best_supported() const noexcept;
5396
};
5397
5398
/**
 * Thin wrapper around a T* that can be read, dereferenced and reassigned
 * like a raw pointer.
 *
 * When SIMDUTF_NO_THREADS is defined the pointer is stored as a plain T*;
 * otherwise it is stored in a std::atomic<T *>, so reads go through an atomic
 * load and assignment through an atomic store.
 */
template <typename T> class atomic_ptr {
5399
public:
5400
  /** Wrap the given pointer. Not explicit, so a T* converts implicitly. */
  atomic_ptr(T *_ptr) : ptr{_ptr} {}
5401
5402
#if defined(SIMDUTF_NO_THREADS)
5403
  // Plain-pointer variant: every accessor reads or writes ptr directly.
  operator const T *() const { return ptr; }
5404
  const T &operator*() const { return *ptr; }
5405
  const T *operator->() const { return ptr; }
5406
5407
  operator T *() { return ptr; }
5408
  T &operator*() { return *ptr; }
5409
  T *operator->() { return ptr; }
5410
  /** Replace the stored pointer. */
  atomic_ptr &operator=(T *_ptr) {
5411
    ptr = _ptr;
5412
    return *this;
5413
  }
5414
5415
#else
5416
  // std::atomic variant: accessors load the current pointer value.
  operator const T *() const { return ptr.load(); }
5417
  // *ptr first converts the std::atomic<T *> to T* (a load), then dereferences
  const T &operator*() const { return *ptr; }
5418
  const T *operator->() const { return ptr.load(); }
5419
5420
  operator T *() { return ptr.load(); }
5421
  // *ptr first converts the std::atomic<T *> to T* (a load), then dereferences
  T &operator*() { return *ptr; }
5422
  T *operator->() { return ptr.load(); }
5423
  /** Replace the stored pointer (std::atomic assignment, i.e. a store). */
  atomic_ptr &operator=(T *_ptr) {
5424
    ptr = _ptr;
5425
    return *this;
5426
  }
5427
5428
#endif
5429
5430
private:
5431
#if defined(SIMDUTF_NO_THREADS)
5432
  T *ptr;
5433
#else
5434
  std::atomic<T *> ptr;
5435
#endif
5436
};
5437
5438
class detect_best_supported_implementation_on_first_use;
5439
5440
} // namespace internal
5441
5442
/**
5443
 * The list of available implementations compiled into simdutf.
5444
 */
5445
extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list &
5446
get_available_implementations();
5447
5448
/**
5449
 * The active implementation.
5450
 *
5451
 * Automatically initialized on first use to the most advanced implementation
5452
 * supported by this hardware.
5453
 */
5454
extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> &
5455
get_active_implementation();
5456
5457
} // namespace simdutf
5458
5459
#endif // SIMDUTF_IMPLEMENTATION_H