Coverage Report

Created: 2025-07-11 06:12

/work/include/simdutf/implementation.h
Line
Count
Source (jump to first uncovered line)
1
#ifndef SIMDUTF_IMPLEMENTATION_H
2
#define SIMDUTF_IMPLEMENTATION_H
3
#if !defined(SIMDUTF_NO_THREADS)
4
  #include <atomic>
5
#endif
6
#include <string>
7
#ifdef SIMDUTF_INTERNAL_TESTS
8
  #include <vector>
9
#endif
10
#include "simdutf/common_defs.h"
11
#include "simdutf/compiler_check.h"
12
#include "simdutf/encoding_types.h"
13
#include "simdutf/error.h"
14
#include "simdutf/internal/isadetection.h"
15
16
#if SIMDUTF_SPAN
17
  #include <concepts>
18
  #include <type_traits>
19
  #include <span>
20
  #include <tuple>
21
#endif
22
#if SIMDUTF_CPLUSPLUS17
23
  #include <string_view>
24
#endif
25
// The following defines are conditionally enabled/disabled during amalgamation.
26
// By default all features are enabled, regular code shouldn't check them. Only
27
// when user code really relies on a selected subset, it's good to verify these
28
// flags, like:
29
//
30
//      #if !SIMDUTF_FEATURE_UTF16
31
//      #   error("Please amalgamate simdutf with UTF-16 support")
32
//      #endif
33
//
34
#define SIMDUTF_FEATURE_DETECT_ENCODING 1
35
#define SIMDUTF_FEATURE_ASCII 1
36
#define SIMDUTF_FEATURE_LATIN1 1
37
#define SIMDUTF_FEATURE_UTF8 1
38
#define SIMDUTF_FEATURE_UTF16 1
39
#define SIMDUTF_FEATURE_UTF32 1
40
#define SIMDUTF_FEATURE_BASE64 1
41
42
namespace simdutf {
43
44
#if SIMDUTF_SPAN
45
/// helpers placed in namespace detail are not a part of the public API
46
namespace detail {
47
/**
48
 * matches a byte, in the many ways C++ allows. note that these
49
 * are all distinct types. T must be exactly one of the four listed types:
 * cv-qualifiers and references are not stripped here (see is_byte_like).
50
 */
51
template <typename T>
52
concept byte_like = std::is_same_v<T, std::byte> ||   //
53
                    std::is_same_v<T, char> ||        //
54
                    std::is_same_v<T, signed char> || //
55
                    std::is_same_v<T, unsigned char>;
56
57
/// byte_like applied after removing cv-qualifiers and references from T, so
/// that e.g. `const char &` is accepted.
template <typename T>
58
concept is_byte_like = byte_like<std::remove_cvref_t<T>>;
59
60
/// satisfied when T is a pointer type (std::is_pointer_v).
template <typename T>
61
concept is_pointer = std::is_pointer_v<T>;
62
63
/**
64
 * matches anything that behaves like std::span and points to character-like
65
 * data such as: std::byte, char, unsigned char, signed char, std::int8_t,
66
 * std::uint8_t (the latter two match where they are aliases of signed char /
 * unsigned char).
 *
 * Checked on a `const T &`: size() must be noexcept and convertible to
 * std::size_t, data() must be noexcept and yield a pointer, and dereferencing
 * that pointer must be noexcept and yield a byte-like type.
67
 */
68
template <typename T>
69
concept input_span_of_byte_like = requires(const T &t) {
70
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
71
  { t.data() } noexcept -> is_pointer;
72
  { *t.data() } noexcept -> is_byte_like;
73
};
74
75
/// satisfied when T, after removing any reference, is not const-qualified.
template <typename T>
76
concept is_mutable = !std::is_const_v<std::remove_reference_t<T>>;
77
78
/**
79
 * like input_span_of_byte_like, but for an output span (intended to be written
 * to): the requirements are checked on a non-const `T &`, and the element
 * obtained by dereferencing data() must additionally not be const.
80
 */
81
template <typename T>
82
concept output_span_of_byte_like = requires(T &t) {
83
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
84
  { t.data() } noexcept -> is_pointer;
85
  { *t.data() } noexcept -> is_byte_like;
86
  { *t.data() } noexcept -> is_mutable;
87
};
88
} // namespace detail
89
#endif
90
91
#if SIMDUTF_FEATURE_DETECT_ENCODING
92
/**
93
 * Autodetect the encoding of the input, a single encoding is recommended.
94
 * E.g., the function might return simdutf::encoding_type::UTF8,
95
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
96
 * simdutf::encoding_type::UTF32_LE.
97
 *
98
 * @param input the string to analyze.
99
 * @param length the length of the string in bytes.
100
 * @return the detected encoding type
101
 */
102
simdutf_warn_unused simdutf::encoding_type
103
autodetect_encoding(const char *input, size_t length) noexcept;
104
/**
 * Convenience overload of autodetect_encoding for buffers expressed as
 * uint8_t: the pointer is reinterpreted as const char * and the call is
 * forwarded, with the same length, to the const char * overload.
 *
 * @param input the string to analyze.
 * @param length the length of the string in bytes.
 * @return the detected encoding type
 */
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
autodetect_encoding(const uint8_t *input, size_t length) noexcept {
  const char *const as_chars = reinterpret_cast<const char *>(input);
  return autodetect_encoding(as_chars, length);
}
108
  #if SIMDUTF_SPAN
109
/**
110
 * Autodetect the encoding of the input, a single encoding is recommended.
111
 * E.g., the function might return simdutf::encoding_type::UTF8,
112
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
113
 * simdutf::encoding_type::UTF32_LE.
114
 *
115
 * @param input the string to analyze. can be a anything span-like that has a
116
 * data() and size() that points to character data: std::string,
117
 * std::string_view, std::vector<char>, std::span<const std::byte> etc.
118
 * @return the detected encoding type
119
 */
120
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
121
autodetect_encoding(
122
    const detail::input_span_of_byte_like auto &input) noexcept {
123
  return autodetect_encoding(reinterpret_cast<const char *>(input.data()),
124
                             input.size());
125
}
126
  #endif // SIMDUTF_SPAN
127
128
/**
129
 * Autodetect the possible encodings of the input in one pass.
130
 * E.g., if the input might be UTF-16LE or UTF-8, this function returns
131
 * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE).
132
 *
133
 * Overridden by each implementation.
134
 *
135
 * @param input the string to analyze.
136
 * @param length the length of the string in bytes.
137
 * @return the detected encoding type
138
 */
139
simdutf_warn_unused int detect_encodings(const char *input,
140
                                         size_t length) noexcept;
141
/**
 * Convenience overload of detect_encodings for buffers expressed as uint8_t:
 * the pointer is reinterpreted as const char * and the call is forwarded,
 * with the same length, to the const char * overload.
 *
 * @param input the string to analyze.
 * @param length the length of the string in bytes.
 * @return the value returned by the const char * overload
 */
simdutf_really_inline simdutf_warn_unused int
detect_encodings(const uint8_t *input, size_t length) noexcept {
  const char *const as_chars = reinterpret_cast<const char *>(input);
  return detect_encodings(as_chars, length);
}
145
  #if SIMDUTF_SPAN
146
/**
 * Span-like overload of detect_encodings: forwards input.data() (viewed as
 * const char *) and input.size() to the pointer-based overload.
 *
 * @param input anything span-like over byte-like elements.
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused int
detect_encodings(const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  const size_t byte_count = input.size();
  return detect_encodings(bytes, byte_count);
}
151
  #endif // SIMDUTF_SPAN
152
#endif   // SIMDUTF_FEATURE_DETECT_ENCODING
153
154
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
155
/**
156
 * Validate the UTF-8 string. This function may be best when you expect
157
 * the input to be almost always valid. Otherwise, consider using
158
 * validate_utf8_with_errors.
159
 *
160
 * Overridden by each implementation.
161
 *
162
 * @param buf the UTF-8 string to validate.
163
 * @param len the length of the string in bytes.
164
 * @return true if and only if the string is valid UTF-8.
165
 */
166
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
167
  #if SIMDUTF_SPAN
168
/**
 * Span-like overload of validate_utf8: forwards input.data() (viewed as
 * const char *) and input.size() to the pointer-based overload.
 *
 * @param input anything span-like over byte-like elements.
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused bool
validate_utf8(const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const utf8_bytes = reinterpret_cast<const char *>(input.data());
  const size_t byte_count = input.size();
  return validate_utf8(utf8_bytes, byte_count);
}
173
  #endif // SIMDUTF_SPAN
174
#endif   // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
175
176
#if SIMDUTF_FEATURE_UTF8
177
/**
178
 * Validate the UTF-8 string and stop on error.
179
 *
180
 * Overridden by each implementation.
181
 *
182
 * @param buf the UTF-8 string to validate.
183
 * @param len the length of the string in bytes.
184
 * @return a result pair struct (of type simdutf::result containing the two
185
 * fields error and count) with an error code and either position of the error
186
 * (in the input in code units) if any, or the number of code units validated if
187
 * successful.
188
 */
189
simdutf_warn_unused result validate_utf8_with_errors(const char *buf,
190
                                                     size_t len) noexcept;
191
  #if SIMDUTF_SPAN
192
/**
 * Span-like overload of validate_utf8_with_errors: forwards input.data()
 * (viewed as const char *) and input.size() to the pointer-based overload.
 *
 * @param input anything span-like over byte-like elements.
 * @return the result returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused result validate_utf8_with_errors(
    const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const utf8_bytes = reinterpret_cast<const char *>(input.data());
  const size_t byte_count = input.size();
  return validate_utf8_with_errors(utf8_bytes, byte_count);
}
197
  #endif // SIMDUTF_SPAN
198
#endif   // SIMDUTF_FEATURE_UTF8
199
200
#if SIMDUTF_FEATURE_ASCII
201
/**
202
 * Validate the ASCII string.
203
 *
204
 * Overridden by each implementation.
205
 *
206
 * @param buf the ASCII string to validate.
207
 * @param len the length of the string in bytes.
208
 * @return true if and only if the string is valid ASCII.
209
 */
210
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept;
211
  #if SIMDUTF_SPAN
212
/**
 * Span-like overload of validate_ascii: forwards input.data() (viewed as
 * const char *) and input.size() to the pointer-based overload.
 *
 * @param input anything span-like over byte-like elements.
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused bool
validate_ascii(const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const ascii_bytes = reinterpret_cast<const char *>(input.data());
  const size_t byte_count = input.size();
  return validate_ascii(ascii_bytes, byte_count);
}
217
  #endif // SIMDUTF_SPAN
218
219
/**
220
 * Validate the ASCII string and stop on error. It might be faster than
221
 * validate_ascii when an error is expected to occur early.
222
 *
223
 * Overridden by each implementation.
224
 *
225
 * @param buf the ASCII string to validate.
226
 * @param len the length of the string in bytes.
227
 * @return a result pair struct (of type simdutf::result containing the two
228
 * fields error and count) with an error code and either position of the error
229
 * (in the input in code units) if any, or the number of code units validated if
230
 * successful.
231
 */
232
simdutf_warn_unused result validate_ascii_with_errors(const char *buf,
233
                                                      size_t len) noexcept;
234
  #if SIMDUTF_SPAN
235
/**
 * Span-like overload of validate_ascii_with_errors: forwards input.data()
 * (viewed as const char *) and input.size() to the pointer-based overload.
 *
 * @param input anything span-like over byte-like elements.
 * @return the result returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused result validate_ascii_with_errors(
    const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const ascii_bytes = reinterpret_cast<const char *>(input.data());
  const size_t byte_count = input.size();
  return validate_ascii_with_errors(ascii_bytes, byte_count);
}
240
  #endif // SIMDUTF_SPAN
241
#endif   // SIMDUTF_FEATURE_ASCII
242
243
#if SIMDUTF_FEATURE_UTF16
244
/**
245
 * Using native endianness; Validate the UTF-16 string.
246
 * This function may be best when you expect the input to be almost always
247
 * valid. Otherwise, consider using validate_utf16_with_errors.
248
 *
249
 * Overridden by each implementation.
250
 *
251
 * This function is not BOM-aware.
252
 *
253
 * @param buf the UTF-16 string to validate.
254
 * @param len the length of the string in number of 2-byte code units
255
 * (char16_t).
256
 * @return true if and only if the string is valid UTF-16.
257
 */
258
simdutf_warn_unused bool validate_utf16(const char16_t *buf,
259
                                        size_t len) noexcept;
260
  #if SIMDUTF_SPAN
261
/**
 * std::span overload of validate_utf16: forwards input.data() and
 * input.size() (a count of char16_t code units) to the pointer-based
 * overload.
 *
 * @param input the UTF-16 string to validate.
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused bool
validate_utf16(std::span<const char16_t> input) noexcept {
  const char16_t *const code_units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16(code_units, unit_count);
}
265
  #endif // SIMDUTF_SPAN
266
#endif   // SIMDUTF_FEATURE_UTF16
267
268
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
269
/**
270
 * Validate the UTF-16LE string. This function may be best when you expect
271
 * the input to be almost always valid. Otherwise, consider using
272
 * validate_utf16le_with_errors.
273
 *
274
 * Overridden by each implementation.
275
 *
276
 * This function is not BOM-aware.
277
 *
278
 * @param buf the UTF-16LE string to validate.
279
 * @param len the length of the string in number of 2-byte code units
280
 * (char16_t).
281
 * @return true if and only if the string is valid UTF-16LE.
282
 */
283
simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
284
                                          size_t len) noexcept;
285
  #if SIMDUTF_SPAN
286
/**
 * std::span overload of validate_utf16le: forwards input.data() and
 * input.size() (a count of char16_t code units) to the pointer-based
 * overload.
 *
 * @param input the UTF-16LE string to validate.
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused bool
validate_utf16le(std::span<const char16_t> input) noexcept {
  const char16_t *const code_units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16le(code_units, unit_count);
}
290
  #endif // SIMDUTF_SPAN
291
#endif   // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
292
293
#if SIMDUTF_FEATURE_UTF16
294
/**
295
 * Validate the UTF-16BE string. This function may be best when you expect
296
 * the input to be almost always valid. Otherwise, consider using
297
 * validate_utf16be_with_errors.
298
 *
299
 * Overridden by each implementation.
300
 *
301
 * This function is not BOM-aware.
302
 *
303
 * @param buf the UTF-16BE string to validate.
304
 * @param len the length of the string in number of 2-byte code units
305
 * (char16_t).
306
 * @return true if and only if the string is valid UTF-16BE.
307
 */
308
simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
309
                                          size_t len) noexcept;
310
  #if SIMDUTF_SPAN
311
/**
 * std::span overload of validate_utf16be: forwards input.data() and
 * input.size() (a count of char16_t code units) to the pointer-based
 * overload.
 *
 * @param input the UTF-16BE string to validate.
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused bool
validate_utf16be(std::span<const char16_t> input) noexcept {
  const char16_t *const code_units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16be(code_units, unit_count);
}
315
  #endif // SIMDUTF_SPAN
316
317
/**
318
 * Using native endianness; Validate the UTF-16 string and stop on error.
319
 * It might be faster than validate_utf16 when an error is expected to occur
320
 * early.
321
 *
322
 * Overridden by each implementation.
323
 *
324
 * This function is not BOM-aware.
325
 *
326
 * @param buf the UTF-16 string to validate.
327
 * @param len the length of the string in number of 2-byte code units
328
 * (char16_t).
329
 * @return a result pair struct (of type simdutf::result containing the two
330
 * fields error and count) with an error code and either position of the error
331
 * (in the input in code units) if any, or the number of code units validated if
332
 * successful.
333
 */
334
simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf,
335
                                                      size_t len) noexcept;
336
  #if SIMDUTF_SPAN
337
/**
 * std::span overload of validate_utf16_with_errors: forwards input.data()
 * and input.size() (a count of char16_t code units) to the pointer-based
 * overload.
 *
 * @param input the UTF-16 string to validate.
 * @return the result returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused result
validate_utf16_with_errors(std::span<const char16_t> input) noexcept {
  const char16_t *const code_units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16_with_errors(code_units, unit_count);
}
341
  #endif // SIMDUTF_SPAN
342
343
/**
344
 * Validate the UTF-16LE string and stop on error. It might be faster than
345
 * validate_utf16le when an error is expected to occur early.
346
 *
347
 * Overridden by each implementation.
348
 *
349
 * This function is not BOM-aware.
350
 *
351
 * @param buf the UTF-16LE string to validate.
352
 * @param len the length of the string in number of 2-byte code units
353
 * (char16_t).
354
 * @return a result pair struct (of type simdutf::result containing the two
355
 * fields error and count) with an error code and either position of the error
356
 * (in the input in code units) if any, or the number of code units validated if
357
 * successful.
358
 */
359
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf,
360
                                                        size_t len) noexcept;
361
  #if SIMDUTF_SPAN
362
/**
 * std::span overload of validate_utf16le_with_errors: forwards input.data()
 * and input.size() (a count of char16_t code units) to the pointer-based
 * overload.
 *
 * @param input the UTF-16LE string to validate.
 * @return the result returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused result
validate_utf16le_with_errors(std::span<const char16_t> input) noexcept {
  const char16_t *const code_units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16le_with_errors(code_units, unit_count);
}
366
  #endif // SIMDUTF_SPAN
367
368
/**
369
 * Validate the UTF-16BE string and stop on error. It might be faster than
370
 * validate_utf16be when an error is expected to occur early.
371
 *
372
 * Overridden by each implementation.
373
 *
374
 * This function is not BOM-aware.
375
 *
376
 * @param buf the UTF-16BE string to validate.
377
 * @param len the length of the string in number of 2-byte code units
378
 * (char16_t).
379
 * @return a result pair struct (of type simdutf::result containing the two
380
 * fields error and count) with an error code and either position of the error
381
 * (in the input in code units) if any, or the number of code units validated if
382
 * successful.
383
 */
384
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf,
385
                                                        size_t len) noexcept;
386
  #if SIMDUTF_SPAN
387
/**
 * std::span overload of validate_utf16be_with_errors: forwards input.data()
 * and input.size() (a count of char16_t code units) to the pointer-based
 * overload.
 *
 * @param input the UTF-16BE string to validate.
 * @return the result returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused result
validate_utf16be_with_errors(std::span<const char16_t> input) noexcept {
  const char16_t *const code_units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16be_with_errors(code_units, unit_count);
}
391
  #endif // SIMDUTF_SPAN
392
393
/**
394
 * Fixes an ill-formed UTF-16LE string by replacing mismatched surrogates with
395
 * the Unicode replacement character U+FFFD. If input and output point to
396
 * different memory areas, the procedure copies the string, and the output
397
 * memory is expected to be at least as big as the input. It is also possible
398
 * to set input equal to output, which makes this an in-place operation.
399
 *
400
 * @param input the UTF-16LE string to correct.
401
 * @param len the length of the string in number of 2-byte code units
402
 * (char16_t).
403
 * @param output the output buffer.
404
 */
405
void to_well_formed_utf16le(const char16_t *input, size_t len,
406
                            char16_t *output) noexcept;
407
  #if SIMDUTF_SPAN
408
/**
 * std::span overload of to_well_formed_utf16le: forwards input.data(),
 * input.size() and output.data() to the pointer-based overload.
 *
 * Only the pointer of `output` is used; output.size() is not checked here,
 * so the output is expected to be at least as big as the input.
 *
 * @param input the UTF-16LE string to correct.
 * @param output the output buffer.
 */
simdutf_really_inline void
to_well_formed_utf16le(std::span<const char16_t> input,
                       std::span<char16_t> output) noexcept {
  const char16_t *const source = input.data();
  char16_t *const destination = output.data();
  to_well_formed_utf16le(source, input.size(), destination);
}
413
  #endif // SIMDUTF_SPAN
414
415
/**
416
 * Fixes an ill-formed UTF-16BE string by replacing mismatched surrogates with
417
 * the Unicode replacement character U+FFFD. If input and output point to
418
 * different memory areas, the procedure copies the string, and the output
419
 * memory is expected to be at least as big as the input. It is also possible
420
 * to set input equal to output, which makes this an in-place operation.
421
 *
422
 * @param input the UTF-16BE string to correct.
423
 * @param len the length of the string in number of 2-byte code units
424
 * (char16_t).
425
 * @param output the output buffer.
426
 */
427
void to_well_formed_utf16be(const char16_t *input, size_t len,
428
                            char16_t *output) noexcept;
429
  #if SIMDUTF_SPAN
430
/**
 * std::span overload of to_well_formed_utf16be: forwards input.data(),
 * input.size() and output.data() to the pointer-based overload.
 *
 * Only the pointer of `output` is used; output.size() is not checked here,
 * so the output is expected to be at least as big as the input.
 *
 * @param input the UTF-16BE string to correct.
 * @param output the output buffer.
 */
simdutf_really_inline void
to_well_formed_utf16be(std::span<const char16_t> input,
                       std::span<char16_t> output) noexcept {
  const char16_t *const source = input.data();
  char16_t *const destination = output.data();
  to_well_formed_utf16be(source, input.size(), destination);
}
435
  #endif // SIMDUTF_SPAN
436
437
/**
438
 * Fixes an ill-formed UTF-16 string by replacing mismatched surrogates with the
439
 * Unicode replacement character U+FFFD. If input and output point to different
440
 * memory areas, the procedure copies the string, and the output memory is
441
 * expected to be at least as big as the input. It is also possible to set input
442
 * equal to output, which makes this an in-place operation.
443
 *
444
 * @param input the UTF-16 string to correct.
445
 * @param len the length of the string in number of 2-byte code units
446
 * (char16_t).
447
 * @param output the output buffer.
448
 */
449
void to_well_formed_utf16(const char16_t *input, size_t len,
450
                          char16_t *output) noexcept;
451
  #if SIMDUTF_SPAN
452
/**
 * std::span overload of to_well_formed_utf16: forwards input.data(),
 * input.size() and output.data() to the pointer-based overload.
 *
 * Only the pointer of `output` is used; output.size() is not checked here,
 * so the output is expected to be at least as big as the input.
 *
 * @param input the UTF-16 string to correct.
 * @param output the output buffer.
 */
simdutf_really_inline void
to_well_formed_utf16(std::span<const char16_t> input,
                     std::span<char16_t> output) noexcept {
  const char16_t *const source = input.data();
  char16_t *const destination = output.data();
  to_well_formed_utf16(source, input.size(), destination);
}
457
  #endif // SIMDUTF_SPAN
458
459
#endif // SIMDUTF_FEATURE_UTF16
460
461
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
462
/**
463
 * Validate the UTF-32 string. This function may be best when you expect
464
 * the input to be almost always valid. Otherwise, consider using
465
 * validate_utf32_with_errors.
466
 *
467
 * Overridden by each implementation.
468
 *
469
 * This function is not BOM-aware.
470
 *
471
 * @param buf the UTF-32 string to validate.
472
 * @param len the length of the string in number of 4-byte code units
473
 * (char32_t).
474
 * @return true if and only if the string is valid UTF-32.
475
 */
476
simdutf_warn_unused bool validate_utf32(const char32_t *buf,
477
                                        size_t len) noexcept;
478
  #if SIMDUTF_SPAN
479
/**
 * std::span overload of validate_utf32: forwards input.data() and
 * input.size() (a count of char32_t code units) to the pointer-based
 * overload.
 *
 * @param input the UTF-32 string to validate.
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused bool
validate_utf32(std::span<const char32_t> input) noexcept {
  const char32_t *const code_units = input.data();
  const size_t unit_count = input.size();
  return validate_utf32(code_units, unit_count);
}
483
  #endif // SIMDUTF_SPAN
484
#endif   // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
485
486
#if SIMDUTF_FEATURE_UTF32
487
/**
488
 * Validate the UTF-32 string and stop on error. It might be faster than
489
 * validate_utf32 when an error is expected to occur early.
490
 *
491
 * Overridden by each implementation.
492
 *
493
 * This function is not BOM-aware.
494
 *
495
 * @param buf the UTF-32 string to validate.
496
 * @param len the length of the string in number of 4-byte code units
497
 * (char32_t).
498
 * @return a result pair struct (of type simdutf::result containing the two
499
 * fields error and count) with an error code and either position of the error
500
 * (in the input in code units) if any, or the number of code units validated if
501
 * successful.
502
 */
503
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf,
504
                                                      size_t len) noexcept;
505
  #if SIMDUTF_SPAN
506
/**
 * std::span overload of validate_utf32_with_errors: forwards input.data()
 * and input.size() (a count of char32_t code units) to the pointer-based
 * overload.
 *
 * @param input the UTF-32 string to validate.
 * @return the result returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused result
validate_utf32_with_errors(std::span<const char32_t> input) noexcept {
  const char32_t *const code_units = input.data();
  const size_t unit_count = input.size();
  return validate_utf32_with_errors(code_units, unit_count);
}
510
  #endif // SIMDUTF_SPAN
511
#endif   // SIMDUTF_FEATURE_UTF32
512
513
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
514
/**
515
 * Convert Latin1 string into UTF-8 string.
516
 *
517
 * This function is suitable to work with inputs from untrusted sources.
518
 *
519
 * @param input         the Latin1 string to convert
520
 * @param length        the length of the string in bytes
521
 * @param utf8_output   the pointer to buffer that can hold conversion result
522
 * @return the number of written char; 0 if conversion is not possible
523
 */
524
simdutf_warn_unused size_t convert_latin1_to_utf8(const char *input,
525
                                                  size_t length,
526
                                                  char *utf8_output) noexcept;
527
  #if SIMDUTF_SPAN
528
/**
 * Span-like overload of convert_latin1_to_utf8: forwards the input bytes,
 * the input size and the output pointer to the pointer-based overload.
 *
 * utf8_output.size() is not consulted by this overload, so the output must
 * be able to hold the whole conversion result (use
 * convert_latin1_to_utf8_safe for a size-limited conversion).
 *
 * @param latin1_input  the Latin1 string to convert (anything span-like over
 * byte-like elements)
 * @param utf8_output   the writable byte-like buffer receiving the result
 * @return the number of written char; 0 if conversion is not possible
 */
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf8(
    const detail::input_span_of_byte_like auto &latin1_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  // The output concept admits every byte-like element type (std::byte,
  // signed char, unsigned char, char) while the pointer-based overload takes
  // char *. Cast explicitly so that non-char output buffers compile as well;
  // the concept already rejects const elements, so constness is never cast
  // away here.
  return convert_latin1_to_utf8(
      reinterpret_cast<const char *>(latin1_input.data()), latin1_input.size(),
      reinterpret_cast<char *>(utf8_output.data()));
}
535
  #endif // SIMDUTF_SPAN
536
537
/**
538
 * Convert Latin1 string into UTF-8 string with output limit.
539
 *
540
 * This function is suitable to work with inputs from untrusted sources.
541
 *
542
 * We write as many characters as possible.
543
 *
544
 * @param input         the Latin1 string to convert
545
 * @param length        the length of the string in bytes
546
 * @param utf8_output   the pointer to buffer that can hold conversion result
547
 * @param utf8_len      the maximum output length
548
 * @return the number of written char; 0 if conversion is not possible
549
 */
550
simdutf_warn_unused size_t
551
convert_latin1_to_utf8_safe(const char *input, size_t length, char *utf8_output,
552
                            size_t utf8_len) noexcept;
553
  #if SIMDUTF_SPAN
554
/**
 * Span-like overload of convert_latin1_to_utf8_safe: forwards the input
 * bytes and size, plus the output pointer and utf8_output.size() as the
 * maximum output length, to the pointer-based overload.
 *
 * @param input         the Latin1 string to convert (anything span-like over
 * byte-like elements)
 * @param utf8_output   the writable byte-like buffer receiving the result;
 * its size() is the output limit
 * @return the number of written char; 0 if conversion is not possible
 */
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf8_safe(
    const detail::input_span_of_byte_like auto &input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  // implementation note: utf8_output is a forwarding reference so that both
  // lvalues and rvalues are accepted without copying. std::span is cheap to
  // copy, but std::vector is not, and this function should accept both. An
  // owning temporary (e.g. a temporary std::string) is therefore accepted as
  // output too, although the caller then has no way of reading the result
  // back.
  //
  // Both pointers are cast explicitly: the concepts admit every byte-like
  // element type, whereas the pointer-based overload takes (const) char *.
  // Without the cast on the input, only char-based inputs would compile.
  return convert_latin1_to_utf8_safe(
      reinterpret_cast<const char *>(input.data()), input.size(),
      reinterpret_cast<char *>(utf8_output.data()), utf8_output.size());
}
567
  #endif // SIMDUTF_SPAN
568
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
569
570
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
571
/**
572
 * Convert Latin1 string into UTF-16LE string.
573
 *
574
 * This function is suitable to work with inputs from untrusted sources.
575
 *
576
 * @param input         the Latin1 string to convert
577
 * @param length        the length of the string in bytes
578
 * @param utf16_output  the pointer to buffer that can hold conversion result
579
 * @return the number of written char16_t; 0 if conversion is not possible
580
 */
581
simdutf_warn_unused size_t convert_latin1_to_utf16le(
582
    const char *input, size_t length, char16_t *utf16_output) noexcept;
583
  #if SIMDUTF_SPAN
584
/**
 * Span-like overload of convert_latin1_to_utf16le: forwards the input bytes
 * (viewed as const char *), the input size and utf16_output.data() to the
 * pointer-based overload. utf16_output.size() is not consulted, so the output
 * must be able to hold the conversion result.
 *
 * @param latin1_input  the Latin1 string to convert
 * @param utf16_output  the buffer receiving the result
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf16le(
    const detail::input_span_of_byte_like auto &latin1_input,
    std::span<char16_t> utf16_output) noexcept {
  const char *const latin1_bytes =
      reinterpret_cast<const char *>(latin1_input.data());
  char16_t *const destination = utf16_output.data();
  return convert_latin1_to_utf16le(latin1_bytes, latin1_input.size(),
                                   destination);
}
591
  #endif // SIMDUTF_SPAN
592
593
/**
594
 * Convert Latin1 string into UTF-16BE string.
595
 *
596
 * This function is suitable to work with inputs from untrusted sources.
597
 *
598
 * @param input         the Latin1 string to convert
599
 * @param length        the length of the string in bytes
600
 * @param utf16_output  the pointer to buffer that can hold conversion result
601
 * @return the number of written char16_t; 0 if conversion is not possible
602
 */
603
simdutf_warn_unused size_t convert_latin1_to_utf16be(
604
    const char *input, size_t length, char16_t *utf16_output) noexcept;
605
  #if SIMDUTF_SPAN
606
/**
 * Span-like overload of convert_latin1_to_utf16be: forwards the input bytes
 * (viewed as const char *), the input size and output.data() to the
 * pointer-based overload. output.size() is not consulted, so the output must
 * be able to hold the conversion result.
 *
 * @param input   the Latin1 string to convert
 * @param output  the buffer receiving the result
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused size_t
convert_latin1_to_utf16be(const detail::input_span_of_byte_like auto &input,
                          std::span<char16_t> output) noexcept {
  const char *const latin1_bytes = reinterpret_cast<const char *>(input.data());
  char16_t *const destination = output.data();
  return convert_latin1_to_utf16be(latin1_bytes, input.size(), destination);
}
612
  #endif // SIMDUTF_SPAN
613
/**
614
 * Compute the number of bytes that this UTF-16 string would require in Latin1
615
 * format.
616
 *
617
 * @param length        the length of the string in 2-byte code units (char16_t)
618
 * @return the length of the string in Latin1 code units (char) required to
619
 * encode the UTF-16 string as Latin1
620
 */
621
simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
622
623
/**
624
 * Compute the number of code units that this Latin1 string would require in
625
 * UTF-16 format.
626
 *
627
 * @param length        the length of the string in Latin1 code units (char)
628
 * @return the length of the string in 2-byte code units (char16_t) required to
629
 * encode the Latin1 string as UTF-16
630
 */
631
simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept;
632
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
633
634
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
635
/**
636
 * Convert Latin1 string into UTF-32 string.
637
 *
638
 * This function is suitable to work with inputs from untrusted sources.
639
 *
640
 * @param input         the Latin1 string to convert
641
 * @param length        the length of the string in bytes
642
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
643
 * @return the number of written char32_t; 0 if conversion is not possible
644
 */
645
simdutf_warn_unused size_t convert_latin1_to_utf32(
646
    const char *input, size_t length, char32_t *utf32_buffer) noexcept;
647
  #if SIMDUTF_SPAN
648
/**
 * Span-like overload of convert_latin1_to_utf32: forwards the input bytes
 * (viewed as const char *), the input size and utf32_output.data() to the
 * pointer-based overload. utf32_output.size() is not consulted, so the output
 * must be able to hold the conversion result.
 *
 * @param latin1_input  the Latin1 string to convert
 * @param utf32_output  the buffer receiving the result
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf32(
    const detail::input_span_of_byte_like auto &latin1_input,
    std::span<char32_t> utf32_output) noexcept {
  const char *const latin1_bytes =
      reinterpret_cast<const char *>(latin1_input.data());
  char32_t *const destination = utf32_output.data();
  return convert_latin1_to_utf32(latin1_bytes, latin1_input.size(),
                                 destination);
}
655
  #endif // SIMDUTF_SPAN
656
#endif   // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
657
658
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
659
/**
660
 * Convert possibly broken UTF-8 string into latin1 string.
661
 *
662
 * During the conversion also validation of the input string is done.
663
 * This function is suitable to work with inputs from untrusted sources.
664
 *
665
 * @param input         the UTF-8 string to convert
666
 * @param length        the length of the string in bytes
667
 * @param latin1_output  the pointer to buffer that can hold conversion result
668
 * @return the number of written char; 0 if the input was not valid UTF-8 string
669
 * or if it cannot be represented as Latin1
670
 */
671
simdutf_warn_unused size_t convert_utf8_to_latin1(const char *input,
672
                                                  size_t length,
673
                                                  char *latin1_output) noexcept;
674
  #if SIMDUTF_SPAN
675
/**
 * Span-like overload of convert_utf8_to_latin1: forwards the input bytes
 * (viewed as const char *), the input size and the output pointer (viewed as
 * char *) to the pointer-based overload. output.size() is not consulted, so
 * the output must be able to hold the conversion result.
 *
 * @param input   the UTF-8 string to convert
 * @param output  the writable byte-like buffer receiving the result
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf8_to_latin1(
    const detail::input_span_of_byte_like auto &input,
    detail::output_span_of_byte_like auto &&output) noexcept {
  const char *const utf8_bytes = reinterpret_cast<const char *>(input.data());
  char *const latin1_bytes = reinterpret_cast<char *>(output.data());
  return convert_utf8_to_latin1(utf8_bytes, input.size(), latin1_bytes);
}
682
  #endif // SIMDUTF_SPAN
683
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
684
685
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
686
/**
687
 * Using native endianness, convert possibly broken UTF-8 string into a UTF-16
688
 * string.
689
 *
690
 * During the conversion also validation of the input string is done.
691
 * This function is suitable to work with inputs from untrusted sources.
692
 *
693
 * @param input         the UTF-8 string to convert
694
 * @param length        the length of the string in bytes
695
 * @param utf16_output  the pointer to buffer that can hold conversion result
696
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
697
 * string
698
 */
699
simdutf_warn_unused size_t convert_utf8_to_utf16(
700
    const char *input, size_t length, char16_t *utf16_output) noexcept;
701
  #if SIMDUTF_SPAN
702
/**
 * Span-like overload of convert_utf8_to_utf16: forwards the input bytes
 * (viewed as const char *), the input size and output.data() to the
 * pointer-based overload. output.size() is not consulted, so the output must
 * be able to hold the conversion result.
 *
 * @param input   the UTF-8 string to convert
 * @param output  the buffer receiving the result
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused size_t
convert_utf8_to_utf16(const detail::input_span_of_byte_like auto &input,
                      std::span<char16_t> output) noexcept {
  const char *const utf8_bytes = reinterpret_cast<const char *>(input.data());
  char16_t *const destination = output.data();
  return convert_utf8_to_utf16(utf8_bytes, input.size(), destination);
}
708
  #endif // SIMDUTF_SPAN
709
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
710
711
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
712
/**
713
 * Using native endianness, convert a Latin1 string into a UTF-16 string.
714
 *
715
 * @param input         the Latin1 string to convert
716
 * @param length        the length of the string in bytes
717
 * @param utf16_output  the pointer to buffer that can hold conversion result
718
 * @return the number of written char16_t.
719
 */
720
simdutf_warn_unused size_t convert_latin1_to_utf16(
721
    const char *input, size_t length, char16_t *utf16_output) noexcept;
722
  #if SIMDUTF_SPAN
723
/**
 * Span-like overload of convert_latin1_to_utf16: forwards the input bytes
 * (viewed as const char *), the input size and output.data() to the
 * pointer-based overload. output.size() is not consulted, so the output must
 * be able to hold the conversion result.
 *
 * @param input   the Latin1 string to convert
 * @param output  the buffer receiving the result
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused size_t
convert_latin1_to_utf16(const detail::input_span_of_byte_like auto &input,
                        std::span<char16_t> output) noexcept {
  const char *const latin1_bytes = reinterpret_cast<const char *>(input.data());
  char16_t *const destination = output.data();
  return convert_latin1_to_utf16(latin1_bytes, input.size(), destination);
}
729
  #endif // SIMDUTF_SPAN
730
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
731
732
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
733
/**
734
 * Convert possibly broken UTF-8 string into UTF-16LE string.
735
 *
736
 * During the conversion also validation of the input string is done.
737
 * This function is suitable to work with inputs from untrusted sources.
738
 *
739
 * @param input         the UTF-8 string to convert
740
 * @param length        the length of the string in bytes
741
 * @param utf16_output  the pointer to buffer that can hold conversion result
742
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
743
 * string
744
 */
745
simdutf_warn_unused size_t convert_utf8_to_utf16le(
746
    const char *input, size_t length, char16_t *utf16_output) noexcept;
747
  #if SIMDUTF_SPAN
748
/**
 * Span overload of convert_utf8_to_utf16le: hands the input's bytes (viewed
 * as const char *) and its size, plus the output span's data pointer, to the
 * pointer-based overload. The size of `utf16_output` is not forwarded.
 */
simdutf_really_inline simdutf_warn_unused size_t
convert_utf8_to_utf16le(const detail::input_span_of_byte_like auto &utf8_input,
                        std::span<char16_t> utf16_output) noexcept {
  const char *const src = reinterpret_cast<const char *>(utf8_input.data());
  const size_t src_len = utf8_input.size();
  char16_t *const dst = utf16_output.data();
  return convert_utf8_to_utf16le(src, src_len, dst);
}
755
  #endif // SIMDUTF_SPAN
756
757
/**
758
 * Convert possibly broken UTF-8 string into UTF-16BE string.
759
 *
760
 * During the conversion also validation of the input string is done.
761
 * This function is suitable to work with inputs from untrusted sources.
762
 *
763
 * @param input         the UTF-8 string to convert
764
 * @param length        the length of the string in bytes
765
 * @param utf16_output  the pointer to buffer that can hold conversion result
766
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
767
 * string
768
 */
769
simdutf_warn_unused size_t convert_utf8_to_utf16be(
770
    const char *input, size_t length, char16_t *utf16_output) noexcept;
771
  #if SIMDUTF_SPAN
772
/**
 * Span overload of convert_utf8_to_utf16be: hands the input's bytes (viewed
 * as const char *) and its size, plus the output span's data pointer, to the
 * pointer-based overload. The size of `utf16_output` is not forwarded.
 */
simdutf_really_inline simdutf_warn_unused size_t
convert_utf8_to_utf16be(const detail::input_span_of_byte_like auto &utf8_input,
                        std::span<char16_t> utf16_output) noexcept {
  const char *const src = reinterpret_cast<const char *>(utf8_input.data());
  const size_t src_len = utf8_input.size();
  char16_t *const dst = utf16_output.data();
  return convert_utf8_to_utf16be(src, src_len, dst);
}
779
  #endif // SIMDUTF_SPAN
780
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
781
782
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
783
/**
784
 * Convert possibly broken UTF-8 string into latin1 string with errors.
785
 * If the string cannot be represented as Latin1, an error
786
 * code is returned.
787
 *
788
 * During the conversion also validation of the input string is done.
789
 * This function is suitable to work with inputs from untrusted sources.
790
 *
791
 * @param input         the UTF-8 string to convert
792
 * @param length        the length of the string in bytes
793
 * @param latin1_output  the pointer to buffer that can hold conversion result
794
 * @return a result pair struct (of type simdutf::result containing the two
795
 * fields error and count) with an error code and either position of the error
796
 * (in the input in code units) if any, or the number of code units validated if
797
 * successful.
798
 */
799
simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
800
    const char *input, size_t length, char *latin1_output) noexcept;
801
  #if SIMDUTF_SPAN
802
/**
 * Span overload of convert_utf8_to_latin1_with_errors: both buffers are
 * reinterpreted as char pointers and passed, with the input size, to the
 * pointer-based overload. The size of `latin1_output` is not forwarded.
 */
simdutf_really_inline simdutf_warn_unused result
convert_utf8_to_latin1_with_errors(
    const detail::input_span_of_byte_like auto &utf8_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  const char *const src = reinterpret_cast<const char *>(utf8_input.data());
  const size_t src_len = utf8_input.size();
  char *const dst = reinterpret_cast<char *>(latin1_output.data());
  return convert_utf8_to_latin1_with_errors(src, src_len, dst);
}
810
  #endif // SIMDUTF_SPAN
811
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
812
813
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
814
/**
815
 * Using native endianness, convert possibly broken UTF-8 string into UTF-16
816
 * string and stop on error.
817
 *
818
 * During the conversion also validation of the input string is done.
819
 * This function is suitable to work with inputs from untrusted sources.
820
 *
821
 * @param input         the UTF-8 string to convert
822
 * @param length        the length of the string in bytes
823
 * @param utf16_output  the pointer to buffer that can hold conversion result
824
 * @return a result pair struct (of type simdutf::result containing the two
825
 * fields error and count) with an error code and either position of the error
826
 * (in the input in code units) if any, or the number of char16_t written if
827
 * successful.
828
 */
829
simdutf_warn_unused result convert_utf8_to_utf16_with_errors(
830
    const char *input, size_t length, char16_t *utf16_output) noexcept;
831
  #if SIMDUTF_SPAN
832
/**
 * Span overload of convert_utf8_to_utf16_with_errors: hands the input's bytes
 * (viewed as const char *) and its size, plus the output span's data pointer,
 * to the pointer-based overload. The size of `utf16_output` is not forwarded.
 */
simdutf_really_inline simdutf_warn_unused result
convert_utf8_to_utf16_with_errors(
    const detail::input_span_of_byte_like auto &utf8_input,
    std::span<char16_t> utf16_output) noexcept {
  const char *const src = reinterpret_cast<const char *>(utf8_input.data());
  const size_t src_len = utf8_input.size();
  char16_t *const dst = utf16_output.data();
  return convert_utf8_to_utf16_with_errors(src, src_len, dst);
}
840
  #endif // SIMDUTF_SPAN
841
842
/**
843
 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
844
 *
845
 * During the conversion also validation of the input string is done.
846
 * This function is suitable to work with inputs from untrusted sources.
847
 *
848
 * @param input         the UTF-8 string to convert
849
 * @param length        the length of the string in bytes
850
 * @param utf16_output  the pointer to buffer that can hold conversion result
851
 * @return a result pair struct (of type simdutf::result containing the two
852
 * fields error and count) with an error code and either position of the error
853
 * (in the input in code units) if any, or the number of char16_t written if
854
 * successful.
855
 */
856
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
857
    const char *input, size_t length, char16_t *utf16_output) noexcept;
858
  #if SIMDUTF_SPAN
859
/**
 * Span overload of convert_utf8_to_utf16le_with_errors: hands the input's
 * bytes (viewed as const char *) and its size, plus the output span's data
 * pointer, to the pointer-based overload. The size of `utf16_output` is not
 * forwarded.
 */
simdutf_really_inline simdutf_warn_unused result
convert_utf8_to_utf16le_with_errors(
    const detail::input_span_of_byte_like auto &utf8_input,
    std::span<char16_t> utf16_output) noexcept {
  const char *const src = reinterpret_cast<const char *>(utf8_input.data());
  const size_t src_len = utf8_input.size();
  char16_t *const dst = utf16_output.data();
  return convert_utf8_to_utf16le_with_errors(src, src_len, dst);
}
867
  #endif // SIMDUTF_SPAN
868
869
/**
870
 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
871
 *
872
 * During the conversion also validation of the input string is done.
873
 * This function is suitable to work with inputs from untrusted sources.
874
 *
875
 * @param input         the UTF-8 string to convert
876
 * @param length        the length of the string in bytes
877
 * @param utf16_output  the pointer to buffer that can hold conversion result
878
 * @return a result pair struct (of type simdutf::result containing the two
879
 * fields error and count) with an error code and either position of the error
880
 * (in the input in code units) if any, or the number of char16_t written if
881
 * successful.
882
 */
883
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
884
    const char *input, size_t length, char16_t *utf16_output) noexcept;
885
  #if SIMDUTF_SPAN
886
/**
 * Span overload of convert_utf8_to_utf16be_with_errors: hands the input's
 * bytes (viewed as const char *) and its size, plus the output span's data
 * pointer, to the pointer-based overload. The size of `utf16_output` is not
 * forwarded.
 */
simdutf_really_inline simdutf_warn_unused result
convert_utf8_to_utf16be_with_errors(
    const detail::input_span_of_byte_like auto &utf8_input,
    std::span<char16_t> utf16_output) noexcept {
  const char *const src = reinterpret_cast<const char *>(utf8_input.data());
  const size_t src_len = utf8_input.size();
  char16_t *const dst = utf16_output.data();
  return convert_utf8_to_utf16be_with_errors(src, src_len, dst);
}
894
  #endif // SIMDUTF_SPAN
895
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
896
897
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
898
/**
899
 * Convert possibly broken UTF-8 string into UTF-32 string.
900
 *
901
 * During the conversion also validation of the input string is done.
902
 * This function is suitable to work with inputs from untrusted sources.
903
 *
904
 * @param input         the UTF-8 string to convert
905
 * @param length        the length of the string in bytes
906
 * @param utf32_output  the pointer to buffer that can hold conversion result
907
 * @return the number of written char32_t; 0 if the input was not valid UTF-8
908
 * string
909
 */
910
simdutf_warn_unused size_t convert_utf8_to_utf32(
911
    const char *input, size_t length, char32_t *utf32_output) noexcept;
912
  #if SIMDUTF_SPAN
913
/**
 * Span overload of convert_utf8_to_utf32: hands the input's bytes (viewed as
 * const char *) and its size, plus the output span's data pointer, to the
 * pointer-based overload. The size of `utf32_output` is not forwarded.
 */
simdutf_really_inline simdutf_warn_unused size_t
convert_utf8_to_utf32(const detail::input_span_of_byte_like auto &utf8_input,
                      std::span<char32_t> utf32_output) noexcept {
  const char *const src = reinterpret_cast<const char *>(utf8_input.data());
  const size_t src_len = utf8_input.size();
  char32_t *const dst = utf32_output.data();
  return convert_utf8_to_utf32(src, src_len, dst);
}
920
  #endif // SIMDUTF_SPAN
921
922
/**
923
 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
924
 *
925
 * During the conversion also validation of the input string is done.
926
 * This function is suitable to work with inputs from untrusted sources.
927
 *
928
 * @param input         the UTF-8 string to convert
929
 * @param length        the length of the string in bytes
930
 * @param utf32_output  the pointer to buffer that can hold conversion result
931
 * @return a result pair struct (of type simdutf::result containing the two
932
 * fields error and count) with an error code and either position of the error
933
 * (in the input in code units) if any, or the number of char32_t written if
934
 * successful.
935
 */
936
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
937
    const char *input, size_t length, char32_t *utf32_output) noexcept;
938
  #if SIMDUTF_SPAN
939
/**
 * Span overload of convert_utf8_to_utf32_with_errors: hands the input's bytes
 * (viewed as const char *) and its size, plus the output span's data pointer,
 * to the pointer-based overload. The size of `utf32_output` is not forwarded.
 */
simdutf_really_inline simdutf_warn_unused result
convert_utf8_to_utf32_with_errors(
    const detail::input_span_of_byte_like auto &utf8_input,
    std::span<char32_t> utf32_output) noexcept {
  const char *const src = reinterpret_cast<const char *>(utf8_input.data());
  const size_t src_len = utf8_input.size();
  char32_t *const dst = utf32_output.data();
  return convert_utf8_to_utf32_with_errors(src, src_len, dst);
}
947
  #endif // SIMDUTF_SPAN
948
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
949
950
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
951
/**
952
 * Convert valid UTF-8 string into latin1 string.
953
 *
954
 * This function assumes that the input string is valid UTF-8 and that it can be
955
 * represented as Latin1. If you violate this assumption, the result is
956
 * implementation defined and may include system-dependent behavior such as
957
 * crashes.
958
 *
959
 * This function is for expert users only and not part of our public API. Use
960
 * convert_utf8_to_latin1 instead. The function may be removed from the library
961
 * in the future.
962
 *
963
 * This function is not BOM-aware.
964
 *
965
 * @param input         the UTF-8 string to convert
966
 * @param length        the length of the string in bytes
967
 * @param latin1_output  the pointer to buffer that can hold conversion result
968
 * @return the number of written char; 0 if the input was not valid UTF-8 string
969
 */
970
simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
971
    const char *input, size_t length, char *latin1_output) noexcept;
972
  #if SIMDUTF_SPAN
973
/**
 * Span overload of convert_valid_utf8_to_latin1: both buffers are
 * reinterpreted as char pointers and passed, with the input size, to the
 * pointer-based overload. The size of `latin1_output` is not forwarded.
 *
 * The output pointer is cast explicitly, as the sibling span overloads do:
 * without the cast, an output container whose data() is not already a
 * char * (presumably any non-char byte-like element type accepted by
 * output_span_of_byte_like) would not convert to the char * parameter and
 * the call would fail to compile.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
    const detail::input_span_of_byte_like auto &valid_utf8_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  return convert_valid_utf8_to_latin1(
      reinterpret_cast<const char *>(valid_utf8_input.data()),
      valid_utf8_input.size(),
      reinterpret_cast<char *>(latin1_output.data()));
}
980
  #endif // SIMDUTF_SPAN
981
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
982
983
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
984
/**
985
 * Using native endianness, convert valid UTF-8 string into a UTF-16 string.
986
 *
987
 * This function assumes that the input string is valid UTF-8.
988
 *
989
 * @param input         the UTF-8 string to convert
990
 * @param length        the length of the string in bytes
991
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
992
 * @return the number of written char16_t
993
 */
994
simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
995
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
996
  #if SIMDUTF_SPAN
997
/**
 * Span overload of convert_valid_utf8_to_utf16: hands the input's bytes
 * (viewed as const char *) and its size, plus the output span's data pointer,
 * to the pointer-based overload. The size of `utf16_output` is not forwarded.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
    const detail::input_span_of_byte_like auto &valid_utf8_input,
    std::span<char16_t> utf16_output) noexcept {
  const char *const src =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t src_len = valid_utf8_input.size();
  char16_t *const dst = utf16_output.data();
  return convert_valid_utf8_to_utf16(src, src_len, dst);
}
1004
  #endif // SIMDUTF_SPAN
1005
1006
/**
1007
 * Convert valid UTF-8 string into UTF-16LE string.
1008
 *
1009
 * This function assumes that the input string is valid UTF-8.
1010
 *
1011
 * @param input         the UTF-8 string to convert
1012
 * @param length        the length of the string in bytes
1013
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1014
 * @return the number of written char16_t
1015
 */
1016
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
1017
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1018
  #if SIMDUTF_SPAN
1019
/**
 * Span overload of convert_valid_utf8_to_utf16le: hands the input's bytes
 * (viewed as const char *) and its size, plus the output span's data pointer,
 * to the pointer-based overload. The size of `utf16_output` is not forwarded.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
    const detail::input_span_of_byte_like auto &valid_utf8_input,
    std::span<char16_t> utf16_output) noexcept {
  const char *const src =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t src_len = valid_utf8_input.size();
  char16_t *const dst = utf16_output.data();
  return convert_valid_utf8_to_utf16le(src, src_len, dst);
}
1026
  #endif // SIMDUTF_SPAN
1027
1028
/**
1029
 * Convert valid UTF-8 string into UTF-16BE string.
1030
 *
1031
 * This function assumes that the input string is valid UTF-8.
1032
 *
1033
 * @param input         the UTF-8 string to convert
1034
 * @param length        the length of the string in bytes
1035
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1036
 * @return the number of written char16_t
1037
 */
1038
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
1039
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1040
  #if SIMDUTF_SPAN
1041
/**
 * Span overload of convert_valid_utf8_to_utf16be: hands the input's bytes
 * (viewed as const char *) and its size, plus the output span's data pointer,
 * to the pointer-based overload. The size of `utf16_output` is not forwarded.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
    const detail::input_span_of_byte_like auto &valid_utf8_input,
    std::span<char16_t> utf16_output) noexcept {
  const char *const src =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t src_len = valid_utf8_input.size();
  char16_t *const dst = utf16_output.data();
  return convert_valid_utf8_to_utf16be(src, src_len, dst);
}
1048
  #endif // SIMDUTF_SPAN
1049
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1050
1051
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1052
/**
1053
 * Convert valid UTF-8 string into UTF-32 string.
1054
 *
1055
 * This function assumes that the input string is valid UTF-8.
1056
 *
1057
 * @param input         the UTF-8 string to convert
1058
 * @param length        the length of the string in bytes
1059
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
1060
 * @return the number of written char32_t
1061
 */
1062
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
1063
    const char *input, size_t length, char32_t *utf32_buffer) noexcept;
1064
  #if SIMDUTF_SPAN
1065
/**
 * Span overload of convert_valid_utf8_to_utf32: hands the input's bytes
 * (viewed as const char *) and its size, plus the output span's data pointer,
 * to the pointer-based overload. The size of `utf32_output` is not forwarded.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
    const detail::input_span_of_byte_like auto &valid_utf8_input,
    std::span<char32_t> utf32_output) noexcept {
  const char *const src =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t src_len = valid_utf8_input.size();
  char32_t *const dst = utf32_output.data();
  return convert_valid_utf8_to_utf32(src, src_len, dst);
}
1072
  #endif // SIMDUTF_SPAN
1073
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1074
1075
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1076
/**
1077
 * Return the number of bytes that this Latin1 string would require in UTF-8
1078
 * format.
1079
 *
1080
 * @param input         the Latin1 string to convert
1081
 * @param length        the length of the string in bytes
1082
 * @return the number of bytes required to encode the Latin1 string as UTF-8
1083
 */
1084
simdutf_warn_unused size_t utf8_length_from_latin1(const char *input,
1085
                                                   size_t length) noexcept;
1086
  #if SIMDUTF_SPAN
1087
/**
 * Span overload of utf8_length_from_latin1: passes the input's bytes (viewed
 * as const char *) and its size to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused size_t utf8_length_from_latin1(
    const detail::input_span_of_byte_like auto &latin1_input) noexcept {
  const char *const src = reinterpret_cast<const char *>(latin1_input.data());
  const size_t src_len = latin1_input.size();
  return utf8_length_from_latin1(src, src_len);
}
1092
  #endif // SIMDUTF_SPAN
1093
1094
/**
1095
 * Compute the number of bytes that this UTF-8 string would require in Latin1
1096
 * format.
1097
 *
1098
 * This function does not validate the input. It is acceptable to pass invalid
1099
 * UTF-8 strings but in such cases the result is implementation defined.
1100
 *
1101
 * This function is not BOM-aware.
1102
 *
1103
 * @param input         the UTF-8 string to convert
1104
 * @param length        the length of the string in bytes
1105
 * @return the number of bytes required to encode the UTF-8 string as Latin1
1106
 */
1107
simdutf_warn_unused size_t latin1_length_from_utf8(const char *input,
1108
                                                   size_t length) noexcept;
1109
  #if SIMDUTF_SPAN
1110
/**
 * Span overload of latin1_length_from_utf8: passes the input's bytes (viewed
 * as const char *) and its size to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused size_t latin1_length_from_utf8(
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
  const char *const src =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t src_len = valid_utf8_input.size();
  return latin1_length_from_utf8(src, src_len);
}
1116
  #endif // SIMDUTF_SPAN
1117
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1118
1119
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1120
/**
1121
 * Compute the number of 2-byte code units that this UTF-8 string would require
1122
 * in UTF-16LE format.
1123
 *
1124
 * This function does not validate the input. It is acceptable to pass invalid
1125
 * UTF-8 strings but in such cases the result is implementation defined.
1126
 *
1127
 * This function is not BOM-aware.
1128
 *
1129
 * @param input         the UTF-8 string to process
1130
 * @param length        the length of the string in bytes
1131
 * @return the number of char16_t code units required to encode the UTF-8 string
1132
 * as UTF-16LE
1133
 */
1134
simdutf_warn_unused size_t utf16_length_from_utf8(const char *input,
1135
                                                  size_t length) noexcept;
1136
  #if SIMDUTF_SPAN
1137
/**
 * Span overload of utf16_length_from_utf8: passes the input's bytes (viewed
 * as const char *) and its size to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused size_t utf16_length_from_utf8(
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
  const char *const src =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t src_len = valid_utf8_input.size();
  return utf16_length_from_utf8(src, src_len);
}
1143
  #endif // SIMDUTF_SPAN
1144
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1145
1146
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1147
/**
1148
 * Compute the number of 4-byte code units that this UTF-8 string would require
1149
 * in UTF-32 format.
1150
 *
1151
 * This function is equivalent to count_utf8
1152
 *
1153
 * This function does not validate the input. It is acceptable to pass invalid
1154
 * UTF-8 strings but in such cases the result is implementation defined.
1155
 *
1156
 * This function is not BOM-aware.
1157
 *
1158
 * @param input         the UTF-8 string to process
1159
 * @param length        the length of the string in bytes
1160
 * @return the number of char32_t code units required to encode the UTF-8 string
1161
 * as UTF-32
1162
 */
1163
simdutf_warn_unused size_t utf32_length_from_utf8(const char *input,
1164
                                                  size_t length) noexcept;
1165
  #if SIMDUTF_SPAN
1166
/**
 * Span overload of utf32_length_from_utf8: passes the input's bytes (viewed
 * as const char *) and its size to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf8(
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
  const char *const src =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t src_len = valid_utf8_input.size();
  return utf32_length_from_utf8(src, src_len);
}
1172
  #endif // SIMDUTF_SPAN
1173
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1174
1175
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1176
/**
1177
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1178
 * string.
1179
 *
1180
 * During the conversion also validation of the input string is done.
1181
 * This function is suitable to work with inputs from untrusted sources.
1182
 *
1183
 * This function is not BOM-aware.
1184
 *
1185
 * @param input         the UTF-16 string to convert
1186
 * @param length        the length of the string in 2-byte code units (char16_t)
1187
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1188
 * @return number of written code units; 0 if input is not a valid UTF-16
1189
 * string
1190
 */
1191
simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *input,
1192
                                                 size_t length,
1193
                                                 char *utf8_buffer) noexcept;
1194
  #if SIMDUTF_SPAN
1195
/**
 * Span overload of convert_utf16_to_utf8: passes the input span's pointer and
 * size, plus the output buffer reinterpreted as char *, to the pointer-based
 * overload. The size of `utf8_output` is not forwarded.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_utf8(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
  char *const dst = reinterpret_cast<char *>(utf8_output.data());
  return convert_utf16_to_utf8(src, src_len, dst);
}
1201
  #endif // SIMDUTF_SPAN
1202
1203
/**
1204
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1205
 * string with output limit.
1206
 *
1207
 * We write as many characters as possible into the output buffer.
1208
 *
1209
 * During the conversion also validation of the input string is done.
1210
 * This function is suitable to work with inputs from untrusted sources.
1211
 *
1212
 * This function is not BOM-aware.
1213
 *
1214
 *
1215
 * @param input         the UTF-16 string to convert
1216
 * @param length        the length of the string in 16-bit code units (char16_t)
1217
 * @param utf8_output   the pointer to buffer that can hold conversion result
1218
 * @param utf8_len      the maximum output length
1219
 * @return the number of written char; 0 if conversion is not possible
1220
 */
1221
simdutf_warn_unused size_t convert_utf16_to_utf8_safe(const char16_t *input,
1222
                                                      size_t length,
1223
                                                      char *utf8_output,
1224
                                                      size_t utf8_len) noexcept;
1225
  #if SIMDUTF_SPAN
1226
/**
 * Span overload of convert_utf16_to_utf8_safe: passes the input span's pointer
 * and size, plus the output buffer (reinterpreted as char *) and the output's
 * size as the length limit, to the pointer-based overload.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_utf8_safe(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  // Implementation note: the output is taken by forwarding reference so that
  // nothing is copied and both lvalues and rvalues are accepted. Copying a
  // std::span is harmless, but copying a std::vector is not, and both must be
  // usable here. An owning temporary (for instance a temporary std::string)
  // is therefore accepted too, although the caller then has no way to read
  // the converted data back out of it.
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
  char *const dst = reinterpret_cast<char *>(utf8_output.data());
  const size_t dst_capacity = utf8_output.size();
  return convert_utf16_to_utf8_safe(src, src_len, dst, dst_capacity);
}
1239
  #endif // SIMDUTF_SPAN
1240
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1241
1242
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1243
/**
1244
 * Using native endianness, convert possibly broken UTF-16 string into Latin1
1245
 * string.
1246
 *
1247
 * During the conversion also validation of the input string is done.
1248
 * This function is suitable to work with inputs from untrusted sources.
1249
 *
1250
 * This function is not BOM-aware.
1251
 *
1252
 * @param input         the UTF-16 string to convert
1253
 * @param length        the length of the string in 2-byte code units (char16_t)
1254
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1255
 * @return number of written code units; 0 if input is not a valid UTF-16 string
1256
 * or if it cannot be represented as Latin1
1257
 */
1258
simdutf_warn_unused size_t convert_utf16_to_latin1(
1259
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1260
  #if SIMDUTF_SPAN
1261
/**
 * Span overload of convert_utf16_to_latin1: passes the input span's pointer
 * and size, plus the output buffer reinterpreted as char *, to the
 * pointer-based overload. The size of `latin1_output` is not forwarded.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_latin1(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
  char *const dst = reinterpret_cast<char *>(latin1_output.data());
  return convert_utf16_to_latin1(src, src_len, dst);
}
1268
  #endif // SIMDUTF_SPAN
1269
1270
/**
1271
 * Convert possibly broken UTF-16LE string into Latin1 string.
1272
 * If the string cannot be represented as Latin1, an error
1273
 * is returned.
1274
 *
1275
 * During the conversion also validation of the input string is done.
1276
 * This function is suitable to work with inputs from untrusted sources.
1277
 *
1278
 * This function is not BOM-aware.
1279
 *
1280
 * @param input         the UTF-16LE string to convert
1281
 * @param length        the length of the string in 2-byte code units (char16_t)
1282
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1283
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1284
 * string or if it cannot be represented as Latin1
1285
 */
1286
simdutf_warn_unused size_t convert_utf16le_to_latin1(
1287
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1288
  #if SIMDUTF_SPAN
1289
/**
 * Span overload of convert_utf16le_to_latin1: passes the input span's pointer
 * and size, plus the output buffer reinterpreted as char *, to the
 * pointer-based overload. The size of `latin1_output` is not forwarded.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf16le_to_latin1(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
  char *const dst = reinterpret_cast<char *>(latin1_output.data());
  return convert_utf16le_to_latin1(src, src_len, dst);
}
1296
  #endif // SIMDUTF_SPAN
1297
1298
/**
1299
 * Convert possibly broken UTF-16BE string into Latin1 string.
1300
 *
1301
 * During the conversion also validation of the input string is done.
1302
 * This function is suitable to work with inputs from untrusted sources.
1303
 *
1304
 * This function is not BOM-aware.
1305
 *
1306
 * @param input         the UTF-16BE string to convert
1307
 * @param length        the length of the string in 2-byte code units (char16_t)
1308
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1309
 * @return number of written code units; 0 if input is not a valid UTF-16BE
1310
 * string or if it cannot be represented as Latin1
1311
 */
1312
simdutf_warn_unused size_t convert_utf16be_to_latin1(
1313
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1314
  #if SIMDUTF_SPAN
1315
/**
 * Span overload of convert_utf16be_to_latin1: passes the input span's pointer
 * and size, plus the output buffer reinterpreted as char *, to the
 * pointer-based overload. The size of `latin1_output` is not forwarded.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf16be_to_latin1(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
  char *const dst = reinterpret_cast<char *>(latin1_output.data());
  return convert_utf16be_to_latin1(src, src_len, dst);
}
1322
  #endif // SIMDUTF_SPAN
1323
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1324
1325
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1326
/**
1327
 * Convert possibly broken UTF-16LE string into UTF-8 string.
1328
 *
1329
 * During the conversion also validation of the input string is done.
1330
 * This function is suitable to work with inputs from untrusted sources.
1331
 *
1332
 * This function is not BOM-aware.
1333
 *
1334
 * @param input         the UTF-16LE string to convert
1335
 * @param length        the length of the string in 2-byte code units (char16_t)
1336
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1337
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1338
 * string
1339
 */
1340
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *input,
1341
                                                   size_t length,
1342
                                                   char *utf8_buffer) noexcept;
1343
  #if SIMDUTF_SPAN
1344
/**
 * Span overload of convert_utf16le_to_utf8: passes the input span's pointer
 * and size, plus the output buffer reinterpreted as char *, to the
 * pointer-based overload. The size of `utf8_output` is not forwarded.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf16le_to_utf8(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
  char *const dst = reinterpret_cast<char *>(utf8_output.data());
  return convert_utf16le_to_utf8(src, src_len, dst);
}
1350
  #endif // SIMDUTF_SPAN
1351
1352
/**
1353
 * Convert possibly broken UTF-16BE string into UTF-8 string.
1354
 *
1355
 * During the conversion also validation of the input string is done.
1356
 * This function is suitable to work with inputs from untrusted sources.
1357
 *
1358
 * This function is not BOM-aware.
1359
 *
1360
 * @param input         the UTF-16BE string to convert
1361
 * @param length        the length of the string in 2-byte code units (char16_t)
1362
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1363
 * @return number of written code units; 0 if input is not a valid UTF-16BE
1364
 * string
1365
 */
1366
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *input,
1367
                                                   size_t length,
1368
                                                   char *utf8_buffer) noexcept;
1369
  #if SIMDUTF_SPAN
1370
/**
 * Span overload of convert_utf16be_to_utf8: passes the input span's pointer
 * and size, plus the output buffer reinterpreted as char *, to the
 * pointer-based overload. The size of `utf8_output` is not forwarded.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf16be_to_utf8(
    std::span<const char16_t> utf16_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  const char16_t *const src = utf16_input.data();
  const size_t src_len = utf16_input.size();
  char *const dst = reinterpret_cast<char *>(utf8_output.data());
  return convert_utf16be_to_utf8(src, src_len, dst);
}
1376
  #endif // SIMDUTF_SPAN
1377
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1378
1379
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1380
/**
1381
 * Using native endianness, convert possibly broken UTF-16 string into Latin1
1382
 * string.
1383
 *
1384
 * During the conversion also validation of the input string is done.
1385
 * This function is suitable to work with inputs from untrusted sources.
1386
 * This function is not BOM-aware.
1387
 *
1388
 * @param input         the UTF-16 string to convert
1389
 * @param length        the length of the string in 2-byte code units (char16_t)
1390
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1391
 * @return a result pair struct (of type simdutf::result containing the two
1392
 * fields error and count) with an error code and either position of the error
1393
 * (in the input in code units) if any, or the number of char written if
1394
 * successful.
1395
 */
1396
simdutf_warn_unused result convert_utf16_to_latin1_with_errors(
1397
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1398
  #if SIMDUTF_SPAN
1399
simdutf_really_inline simdutf_warn_unused result
1400
convert_utf16_to_latin1_with_errors(
1401
    std::span<const char16_t> utf16_input,
1402
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1403
  return convert_utf16_to_latin1_with_errors(
1404
      utf16_input.data(), utf16_input.size(),
1405
      reinterpret_cast<char *>(latin1_output.data()));
1406
}
1407
  #endif // SIMDUTF_SPAN
1408
1409
/**
1410
 * Convert possibly broken UTF-16LE string into Latin1 string.
1411
 *
1412
 * During the conversion also validation of the input string is done.
1413
 * This function is suitable to work with inputs from untrusted sources.
1414
 * This function is not BOM-aware.
1415
 *
1416
 * @param input         the UTF-16LE string to convert
1417
 * @param length        the length of the string in 2-byte code units (char16_t)
1418
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1419
 * @return a result pair struct (of type simdutf::result containing the two
1420
 * fields error and count) with an error code and either position of the error
1421
 * (in the input in code units) if any, or the number of char written if
1422
 * successful.
1423
 */
1424
simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
1425
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1426
  #if SIMDUTF_SPAN
1427
simdutf_really_inline simdutf_warn_unused result
1428
convert_utf16le_to_latin1_with_errors(
1429
    std::span<const char16_t> utf16_input,
1430
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1431
  return convert_utf16le_to_latin1_with_errors(
1432
      utf16_input.data(), utf16_input.size(),
1433
      reinterpret_cast<char *>(latin1_output.data()));
1434
}
1435
  #endif // SIMDUTF_SPAN
1436
1437
/**
1438
 * Convert possibly broken UTF-16BE string into Latin1 string.
1439
 * If the string cannot be represented as Latin1, an error
1440
 * is returned.
1441
 *
1442
 * During the conversion also validation of the input string is done.
1443
 * This function is suitable to work with inputs from untrusted sources.
1444
 * This function is not BOM-aware.
1445
 *
1446
 * @param input         the UTF-16BE string to convert
1447
 * @param length        the length of the string in 2-byte code units (char16_t)
1448
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1449
 * @return a result pair struct (of type simdutf::result containing the two
1450
 * fields error and count) with an error code and either position of the error
1451
 * (in the input in code units) if any, or the number of char written if
1452
 * successful.
1453
 */
1454
simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
1455
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1456
  #if SIMDUTF_SPAN
1457
simdutf_really_inline simdutf_warn_unused result
1458
convert_utf16be_to_latin1_with_errors(
1459
    std::span<const char16_t> utf16_input,
1460
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1461
  return convert_utf16be_to_latin1_with_errors(
1462
      utf16_input.data(), utf16_input.size(),
1463
      reinterpret_cast<char *>(latin1_output.data()));
1464
}
1465
  #endif // SIMDUTF_SPAN
1466
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1467
1468
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1469
/**
1470
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1471
 * string and stop on error.
1472
 *
1473
 * During the conversion also validation of the input string is done.
1474
 * This function is suitable to work with inputs from untrusted sources.
1475
 *
1476
 * This function is not BOM-aware.
1477
 *
1478
 * @param input         the UTF-16 string to convert
1479
 * @param length        the length of the string in 2-byte code units (char16_t)
1480
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1481
 * @return a result pair struct (of type simdutf::result containing the two
1482
 * fields error and count) with an error code and either position of the error
1483
 * (in the input in code units) if any, or the number of char written if
1484
 * successful.
1485
 */
1486
simdutf_warn_unused result convert_utf16_to_utf8_with_errors(
1487
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1488
  #if SIMDUTF_SPAN
1489
simdutf_really_inline simdutf_warn_unused result
1490
convert_utf16_to_utf8_with_errors(
1491
    std::span<const char16_t> utf16_input,
1492
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1493
  return convert_utf16_to_utf8_with_errors(
1494
      utf16_input.data(), utf16_input.size(),
1495
      reinterpret_cast<char *>(utf8_output.data()));
1496
}
1497
  #endif // SIMDUTF_SPAN
1498
1499
/**
1500
 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
1501
 *
1502
 * During the conversion also validation of the input string is done.
1503
 * This function is suitable to work with inputs from untrusted sources.
1504
 *
1505
 * This function is not BOM-aware.
1506
 *
1507
 * @param input         the UTF-16LE string to convert
1508
 * @param length        the length of the string in 2-byte code units (char16_t)
1509
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1510
 * @return a result pair struct (of type simdutf::result containing the two
1511
 * fields error and count) with an error code and either position of the error
1512
 * (in the input in code units) if any, or the number of char written if
1513
 * successful.
1514
 */
1515
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
1516
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1517
  #if SIMDUTF_SPAN
1518
simdutf_really_inline simdutf_warn_unused result
1519
convert_utf16le_to_utf8_with_errors(
1520
    std::span<const char16_t> utf16_input,
1521
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1522
  return convert_utf16le_to_utf8_with_errors(
1523
      utf16_input.data(), utf16_input.size(),
1524
      reinterpret_cast<char *>(utf8_output.data()));
1525
}
1526
  #endif // SIMDUTF_SPAN
1527
1528
/**
1529
 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
1530
 *
1531
 * During the conversion also validation of the input string is done.
1532
 * This function is suitable to work with inputs from untrusted sources.
1533
 *
1534
 * This function is not BOM-aware.
1535
 *
1536
 * @param input         the UTF-16BE string to convert
1537
 * @param length        the length of the string in 2-byte code units (char16_t)
1538
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1539
 * @return a result pair struct (of type simdutf::result containing the two
1540
 * fields error and count) with an error code and either position of the error
1541
 * (in the input in code units) if any, or the number of char written if
1542
 * successful.
1543
 */
1544
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
1545
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1546
  #if SIMDUTF_SPAN
1547
simdutf_really_inline simdutf_warn_unused result
1548
convert_utf16be_to_utf8_with_errors(
1549
    std::span<const char16_t> utf16_input,
1550
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1551
  return convert_utf16be_to_utf8_with_errors(
1552
      utf16_input.data(), utf16_input.size(),
1553
      reinterpret_cast<char *>(utf8_output.data()));
1554
}
1555
  #endif // SIMDUTF_SPAN
1556
1557
/**
1558
 * Using native endianness, convert valid UTF-16 string into UTF-8 string.
1559
 *
1560
 * This function assumes that the input string is valid native-endian UTF-16.
1561
 *
1562
 * This function is not BOM-aware.
1563
 *
1564
 * @param input         the UTF-16 string to convert
1565
 * @param length        the length of the string in 2-byte code units (char16_t)
1566
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
1567
 * result
1568
 * @return number of written code units; 0 if conversion is not possible
1569
 */
1570
simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
1571
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1572
  #if SIMDUTF_SPAN
1573
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
1574
    std::span<const char16_t> valid_utf16_input,
1575
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1576
  return convert_valid_utf16_to_utf8(
1577
      valid_utf16_input.data(), valid_utf16_input.size(),
1578
      reinterpret_cast<char *>(utf8_output.data()));
1579
}
1580
  #endif // SIMDUTF_SPAN
1581
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1582
1583
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1584
/**
1585
 * Using native endianness, convert UTF-16 string into Latin1 string.
1586
 *
1587
 * This function assumes that the input string is valid UTF-16 and that it can
1588
 * be represented as Latin1. If you violate this assumption, the result is
1589
 * implementation defined and may include system-dependent behavior such as
1590
 * crashes.
1591
 *
1592
 * This function is for expert users only and not part of our public API. Use
1593
 * convert_utf16_to_latin1 instead. The function may be removed from the library
1594
 * in the future.
1595
 *
1596
 * This function is not BOM-aware.
1597
 *
1598
 * @param input         the UTF-16 string to convert
1599
 * @param length        the length of the string in 2-byte code units (char16_t)
1600
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1601
 * @return number of written code units; 0 if conversion is not possible
1602
 */
1603
simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
1604
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1605
  #if SIMDUTF_SPAN
1606
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
1607
    std::span<const char16_t> valid_utf16_input,
1608
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1609
  return convert_valid_utf16_to_latin1(
1610
      valid_utf16_input.data(), valid_utf16_input.size(),
1611
      reinterpret_cast<char *>(latin1_output.data()));
1612
}
1613
  #endif // SIMDUTF_SPAN
1614
1615
/**
1616
 * Convert valid UTF-16LE string into Latin1 string.
1617
 *
1618
 * This function assumes that the input string is valid UTF-16LE and that it can
1619
 * be represented as Latin1. If you violate this assumption, the result is
1620
 * implementation defined and may include system-dependent behavior such as
1621
 * crashes.
1622
 *
1623
 * This function is for expert users only and not part of our public API. Use
1624
 * convert_utf16le_to_latin1 instead. The function may be removed from the
1625
 * library in the future.
1626
 *
1627
 * This function is not BOM-aware.
1628
 *
1629
 * @param input         the UTF-16LE string to convert
1630
 * @param length        the length of the string in 2-byte code units (char16_t)
1631
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1632
 * @return number of written code units; 0 if conversion is not possible
1633
 */
1634
simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
1635
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1636
  #if SIMDUTF_SPAN
1637
simdutf_really_inline simdutf_warn_unused size_t
1638
convert_valid_utf16le_to_latin1(
1639
    std::span<const char16_t> valid_utf16_input,
1640
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1641
  return convert_valid_utf16le_to_latin1(
1642
      valid_utf16_input.data(), valid_utf16_input.size(),
1643
      reinterpret_cast<char *>(latin1_output.data()));
1644
}
1645
  #endif // SIMDUTF_SPAN
1646
1647
/**
1648
 * Convert valid UTF-16BE string into Latin1 string.
1649
 *
1650
 * This function assumes that the input string is valid UTF-16BE and that it can
1651
 * be represented as Latin1. If you violate this assumption, the result is
1652
 * implementation defined and may include system-dependent behavior such as
1653
 * crashes.
1654
 *
1655
 * This function is for expert users only and not part of our public API. Use
1656
 * convert_utf16be_to_latin1 instead. The function may be removed from the
1657
 * library in the future.
1658
 *
1659
 * This function is not BOM-aware.
1660
 *
1661
 * @param input         the UTF-16BE string to convert
1662
 * @param length        the length of the string in 2-byte code units (char16_t)
1663
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1664
 * @return number of written code units; 0 if conversion is not possible
1665
 */
1666
simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
1667
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1668
  #if SIMDUTF_SPAN
1669
simdutf_really_inline simdutf_warn_unused size_t
1670
convert_valid_utf16be_to_latin1(
1671
    std::span<const char16_t> valid_utf16_input,
1672
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1673
  return convert_valid_utf16be_to_latin1(
1674
      valid_utf16_input.data(), valid_utf16_input.size(),
1675
      reinterpret_cast<char *>(latin1_output.data()));
1676
}
1677
  #endif // SIMDUTF_SPAN
1678
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1679
1680
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1681
/**
1682
 * Convert valid UTF-16LE string into UTF-8 string.
1683
 *
1684
 * This function assumes that the input string is valid
1685
 * UTF-16LE.
1686
 *
1687
 * This function is not BOM-aware.
1688
 *
1689
 * @param input         the UTF-16LE string to convert
1690
 * @param length        the length of the string in 2-byte code units (char16_t)
1691
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
1692
 * result
1693
 * @return number of written code units; 0 if conversion is not possible
1694
 */
1695
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
1696
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1697
  #if SIMDUTF_SPAN
1698
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
1699
    std::span<const char16_t> valid_utf16_input,
1700
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1701
  return convert_valid_utf16le_to_utf8(
1702
      valid_utf16_input.data(), valid_utf16_input.size(),
1703
      reinterpret_cast<char *>(utf8_output.data()));
1704
}
1705
  #endif // SIMDUTF_SPAN
1706
1707
/**
1708
 * Convert valid UTF-16BE string into UTF-8 string.
1709
 *
1710
 * This function assumes that the input string is valid UTF-16BE.
1711
 *
1712
 * This function is not BOM-aware.
1713
 *
1714
 * @param input         the UTF-16BE string to convert
1715
 * @param length        the length of the string in 2-byte code units (char16_t)
1716
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
1717
 * result
1718
 * @return number of written code units; 0 if conversion is not possible
1719
 */
1720
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
1721
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1722
  #if SIMDUTF_SPAN
1723
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
1724
    std::span<const char16_t> valid_utf16_input,
1725
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1726
  return convert_valid_utf16be_to_utf8(
1727
      valid_utf16_input.data(), valid_utf16_input.size(),
1728
      reinterpret_cast<char *>(utf8_output.data()));
1729
}
1730
  #endif // SIMDUTF_SPAN
1731
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1732
1733
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
1734
/**
1735
 * Using native endianness, convert possibly broken UTF-16 string into UTF-32
1736
 * string.
1737
 *
1738
 * During the conversion also validation of the input string is done.
1739
 * This function is suitable to work with inputs from untrusted sources.
1740
 *
1741
 * This function is not BOM-aware.
1742
 *
1743
 * @param input         the UTF-16 string to convert
1744
 * @param length        the length of the string in 2-byte code units (char16_t)
1745
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1746
 * @return number of written code units; 0 if input is not a valid UTF-16
1747
 * string
1748
 */
1749
simdutf_warn_unused size_t convert_utf16_to_utf32(
1750
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1751
  #if SIMDUTF_SPAN
1752
simdutf_really_inline simdutf_warn_unused size_t
1753
convert_utf16_to_utf32(std::span<const char16_t> utf16_input,
1754
0
                       std::span<char32_t> utf32_output) noexcept {
1755
0
  return convert_utf16_to_utf32(utf16_input.data(), utf16_input.size(),
1756
0
                                utf32_output.data());
1757
0
}
1758
  #endif // SIMDUTF_SPAN
1759
1760
/**
1761
 * Convert possibly broken UTF-16LE string into UTF-32 string.
1762
 *
1763
 * During the conversion also validation of the input string is done.
1764
 * This function is suitable to work with inputs from untrusted sources.
1765
 *
1766
 * This function is not BOM-aware.
1767
 *
1768
 * @param input         the UTF-16LE string to convert
1769
 * @param length        the length of the string in 2-byte code units (char16_t)
1770
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1771
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1772
 * string
1773
 */
1774
simdutf_warn_unused size_t convert_utf16le_to_utf32(
1775
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1776
  #if SIMDUTF_SPAN
1777
simdutf_really_inline simdutf_warn_unused size_t
1778
convert_utf16le_to_utf32(std::span<const char16_t> utf16_input,
1779
0
                         std::span<char32_t> utf32_output) noexcept {
1780
0
  return convert_utf16le_to_utf32(utf16_input.data(), utf16_input.size(),
1781
0
                                  utf32_output.data());
1782
0
}
1783
  #endif // SIMDUTF_SPAN
1784
1785
/**
1786
 * Convert possibly broken UTF-16BE string into UTF-32 string.
1787
 *
1788
 * During the conversion also validation of the input string is done.
1789
 * This function is suitable to work with inputs from untrusted sources.
1790
 *
1791
 * This function is not BOM-aware.
1792
 *
1793
 * @param input         the UTF-16BE string to convert
1794
 * @param length        the length of the string in 2-byte code units (char16_t)
1795
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1796
 * @return number of written code units; 0 if input is not a valid UTF-16BE
1797
 * string
1798
 */
1799
simdutf_warn_unused size_t convert_utf16be_to_utf32(
1800
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1801
  #if SIMDUTF_SPAN
1802
simdutf_really_inline simdutf_warn_unused size_t
1803
convert_utf16be_to_utf32(std::span<const char16_t> utf16_input,
1804
0
                         std::span<char32_t> utf32_output) noexcept {
1805
0
  return convert_utf16be_to_utf32(utf16_input.data(), utf16_input.size(),
1806
0
                                  utf32_output.data());
1807
0
}
1808
  #endif // SIMDUTF_SPAN
1809
1810
/**
1811
 * Using native endianness, convert possibly broken UTF-16 string into
1812
 * UTF-32 string and stop on error.
1813
 *
1814
 * During the conversion also validation of the input string is done.
1815
 * This function is suitable to work with inputs from untrusted sources.
1816
 *
1817
 * This function is not BOM-aware.
1818
 *
1819
 * @param input         the UTF-16 string to convert
1820
 * @param length        the length of the string in 2-byte code units (char16_t)
1821
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1822
 * @return a result pair struct (of type simdutf::result containing the two
1823
 * fields error and count) with an error code and either position of the error
1824
 * (in the input in code units) if any, or the number of char32_t written if
1825
 * successful.
1826
 */
1827
simdutf_warn_unused result convert_utf16_to_utf32_with_errors(
1828
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1829
  #if SIMDUTF_SPAN
1830
simdutf_really_inline simdutf_warn_unused result
1831
convert_utf16_to_utf32_with_errors(std::span<const char16_t> utf16_input,
1832
0
                                   std::span<char32_t> utf32_output) noexcept {
1833
0
  return convert_utf16_to_utf32_with_errors(
1834
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
1835
0
}
1836
  #endif // SIMDUTF_SPAN
1837
1838
/**
1839
 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
1840
 *
1841
 * During the conversion also validation of the input string is done.
1842
 * This function is suitable to work with inputs from untrusted sources.
1843
 *
1844
 * This function is not BOM-aware.
1845
 *
1846
 * @param input         the UTF-16LE string to convert
1847
 * @param length        the length of the string in 2-byte code units (char16_t)
1848
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1849
 * @return a result pair struct (of type simdutf::result containing the two
1850
 * fields error and count) with an error code and either position of the error
1851
 * (in the input in code units) if any, or the number of char32_t written if
1852
 * successful.
1853
 */
1854
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
1855
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1856
  #if SIMDUTF_SPAN
1857
simdutf_really_inline simdutf_warn_unused result
1858
convert_utf16le_to_utf32_with_errors(
1859
    std::span<const char16_t> utf16_input,
1860
0
    std::span<char32_t> utf32_output) noexcept {
1861
0
  return convert_utf16le_to_utf32_with_errors(
1862
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
1863
0
}
1864
  #endif // SIMDUTF_SPAN
1865
1866
/**
1867
 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
1868
 *
1869
 * During the conversion also validation of the input string is done.
1870
 * This function is suitable to work with inputs from untrusted sources.
1871
 *
1872
 * This function is not BOM-aware.
1873
 *
1874
 * @param input         the UTF-16BE string to convert
1875
 * @param length        the length of the string in 2-byte code units (char16_t)
1876
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1877
 * @return a result pair struct (of type simdutf::result containing the two
1878
 * fields error and count) with an error code and either position of the error
1879
 * (in the input in code units) if any, or the number of char32_t written if
1880
 * successful.
1881
 */
1882
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
1883
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1884
  #if SIMDUTF_SPAN
1885
simdutf_really_inline simdutf_warn_unused result
1886
convert_utf16be_to_utf32_with_errors(
1887
    std::span<const char16_t> utf16_input,
1888
0
    std::span<char32_t> utf32_output) noexcept {
1889
0
  return convert_utf16be_to_utf32_with_errors(
1890
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
1891
0
}
1892
  #endif // SIMDUTF_SPAN
1893
1894
/**
1895
 * Using native endianness, convert valid UTF-16 string into UTF-32 string.
1896
 *
1897
 * This function assumes that the input string is valid UTF-16 (native
1898
 * endianness).
1899
 *
1900
 * This function is not BOM-aware.
1901
 *
1902
 * @param input         the UTF-16 string to convert
1903
 * @param length        the length of the string in 2-byte code units (char16_t)
1904
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
1905
 * result
1906
 * @return number of written code units; 0 if conversion is not possible
1907
 */
1908
simdutf_warn_unused size_t convert_valid_utf16_to_utf32(
1909
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1910
  #if SIMDUTF_SPAN
1911
simdutf_really_inline simdutf_warn_unused size_t
1912
convert_valid_utf16_to_utf32(std::span<const char16_t> valid_utf16_input,
1913
0
                             std::span<char32_t> utf32_output) noexcept {
1914
0
  return convert_valid_utf16_to_utf32(
1915
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
1916
0
}
1917
  #endif // SIMDUTF_SPAN
1918
1919
/**
1920
 * Convert valid UTF-16LE string into UTF-32 string.
1921
 *
1922
 * This function assumes that the input string is valid UTF-16LE.
1923
 *
1924
 * This function is not BOM-aware.
1925
 *
1926
 * @param input         the UTF-16LE string to convert
1927
 * @param length        the length of the string in 2-byte code units (char16_t)
1928
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
1929
 * result
1930
 * @return number of written code units; 0 if conversion is not possible
1931
 */
1932
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
1933
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1934
  #if SIMDUTF_SPAN
1935
simdutf_really_inline simdutf_warn_unused size_t
1936
convert_valid_utf16le_to_utf32(std::span<const char16_t> valid_utf16_input,
1937
0
                               std::span<char32_t> utf32_output) noexcept {
1938
0
  return convert_valid_utf16le_to_utf32(
1939
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
1940
0
}
1941
  #endif // SIMDUTF_SPAN
1942
1943
/**
1944
 * Convert valid UTF-16BE string into UTF-32 string.
1945
 *
1946
 * This function assumes that the input string is valid UTF-16BE.
1947
 *
1948
 * This function is not BOM-aware.
1949
 *
1950
 * @param input         the UTF-16BE string to convert
1951
 * @param length        the length of the string in 2-byte code units (char16_t)
1952
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
1953
 * result
1954
 * @return number of written code units; 0 if conversion is not possible
1955
 */
1956
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
1957
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1958
  #if SIMDUTF_SPAN
1959
simdutf_really_inline simdutf_warn_unused size_t
1960
convert_valid_utf16be_to_utf32(std::span<const char16_t> valid_utf16_input,
1961
0
                               std::span<char32_t> utf32_output) noexcept {
1962
0
  return convert_valid_utf16be_to_utf32(
1963
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
1964
0
}
1965
  #endif // SIMDUTF_SPAN
1966
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
1967
1968
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1969
/**
1970
 * Compute the number of bytes that this UTF-16LE/BE string would require in
1971
 * Latin1 format.
1972
 *
1973
 * This function does not validate the input. It is acceptable to pass invalid
1974
 * UTF-16 strings but in such cases the result is implementation defined.
1975
 *
1976
 * This function is not BOM-aware.
1977
 *
1978
 * @param length        the length of the string in 2-byte code units (char16_t)
1979
 * @return the number of bytes required to encode the UTF-16 string as Latin1
1980
 */
1981
simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
1982
1983
/**
1984
 * Using native endianness, compute the number of bytes that this UTF-16
1985
 * string would require in UTF-8 format.
1986
 *
1987
 * This function does not validate the input. It is acceptable to pass invalid
1988
 * UTF-16 strings but in such cases the result is implementation defined.
1989
 *
1990
 * @param input         the UTF-16 string to convert
1991
 * @param length        the length of the string in 2-byte code units (char16_t)
1992
 * @return the number of bytes required to encode the UTF-16 string as UTF-8
1993
 */
1994
simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input,
1995
                                                  size_t length) noexcept;
1996
  #if SIMDUTF_SPAN
1997
simdutf_really_inline simdutf_warn_unused size_t
1998
0
utf8_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
1999
0
  return utf8_length_from_utf16(valid_utf16_input.data(),
2000
0
                                valid_utf16_input.size());
2001
0
}
2002
  #endif // SIMDUTF_SPAN
2003
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2004
2005
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2006
/**
2007
 * Compute the number of bytes that this UTF-16LE string would require in UTF-8
2008
 * format.
2009
 *
2010
 * This function does not validate the input. It is acceptable to pass invalid
2011
 * UTF-16 strings but in such cases the result is implementation defined.
2012
 *
2013
 * @param input         the UTF-16LE string to convert
2014
 * @param length        the length of the string in 2-byte code units (char16_t)
2015
 * @return the number of bytes required to encode the UTF-16LE string as UTF-8
2016
 */
2017
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input,
2018
                                                    size_t length) noexcept;
2019
  #if SIMDUTF_SPAN
2020
simdutf_really_inline simdutf_warn_unused size_t
2021
0
utf8_length_from_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
2022
0
  return utf8_length_from_utf16le(valid_utf16_input.data(),
2023
0
                                  valid_utf16_input.size());
2024
0
}
2025
  #endif // SIMDUTF_SPAN
2026
2027
/**
2028
 * Compute the number of bytes that this UTF-16BE string would require in UTF-8
2029
 * format.
2030
 *
2031
 * This function does not validate the input. It is acceptable to pass invalid
2032
 * UTF-16 strings but in such cases the result is implementation defined.
2033
 *
2034
 * @param input         the UTF-16BE string to convert
2035
 * @param length        the length of the string in 2-byte code units (char16_t)
2036
 * @return the number of bytes required to encode the UTF-16BE string as UTF-8
2037
 */
2038
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input,
2039
                                                    size_t length) noexcept;
2040
  #if SIMDUTF_SPAN
2041
simdutf_really_inline simdutf_warn_unused size_t
2042
0
utf8_length_from_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
2043
0
  return utf8_length_from_utf16be(valid_utf16_input.data(),
2044
0
                                  valid_utf16_input.size());
2045
0
}
2046
  #endif // SIMDUTF_SPAN
2047
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2048
2049
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2050
/**
2051
 * Convert possibly broken UTF-32 string into UTF-8 string.
2052
 *
2053
 * During the conversion also validation of the input string is done.
2054
 * This function is suitable to work with inputs from untrusted sources.
2055
 *
2056
 * This function is not BOM-aware.
2057
 *
2058
 * @param input         the UTF-32 string to convert
2059
 * @param length        the length of the string in 4-byte code units (char32_t)
2060
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2061
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2062
 */
2063
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *input,
2064
                                                 size_t length,
2065
                                                 char *utf8_buffer) noexcept;
2066
  #if SIMDUTF_SPAN
2067
simdutf_really_inline simdutf_warn_unused size_t convert_utf32_to_utf8(
2068
    std::span<const char32_t> utf32_input,
2069
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2070
  return convert_utf32_to_utf8(utf32_input.data(), utf32_input.size(),
2071
                               reinterpret_cast<char *>(utf8_output.data()));
2072
}
2073
  #endif // SIMDUTF_SPAN
2074
2075
/**
2076
 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
2077
 *
2078
 * During the conversion also validation of the input string is done.
2079
 * This function is suitable to work with inputs from untrusted sources.
2080
 *
2081
 * This function is not BOM-aware.
2082
 *
2083
 * @param input         the UTF-32 string to convert
2084
 * @param length        the length of the string in 4-byte code units (char32_t)
2085
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2086
 * @return a result pair struct (of type simdutf::result containing the two
2087
 * fields error and count) with an error code and either position of the error
2088
 * (in the input in code units) if any, or the number of char written if
2089
 * successful.
2090
 */
2091
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
2092
    const char32_t *input, size_t length, char *utf8_buffer) noexcept;
2093
  #if SIMDUTF_SPAN
2094
simdutf_really_inline simdutf_warn_unused result
2095
convert_utf32_to_utf8_with_errors(
2096
    std::span<const char32_t> utf32_input,
2097
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2098
  return convert_utf32_to_utf8_with_errors(
2099
      utf32_input.data(), utf32_input.size(),
2100
      reinterpret_cast<char *>(utf8_output.data()));
2101
}
2102
  #endif // SIMDUTF_SPAN
2103
2104
/**
2105
 * Convert valid UTF-32 string into UTF-8 string.
2106
 *
2107
 * This function assumes that the input string is valid UTF-32.
2108
 *
2109
 * This function is not BOM-aware.
2110
 *
2111
 * @param input         the UTF-32 string to convert
2112
 * @param length        the length of the string in 4-byte code units (char32_t)
2113
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
2114
 * result
2115
 * @return number of written code units; 0 if conversion is not possible
2116
 */
2117
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
2118
    const char32_t *input, size_t length, char *utf8_buffer) noexcept;
2119
  #if SIMDUTF_SPAN
2120
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
2121
    std::span<const char32_t> valid_utf32_input,
2122
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2123
  return convert_valid_utf32_to_utf8(
2124
      valid_utf32_input.data(), valid_utf32_input.size(),
2125
      reinterpret_cast<char *>(utf8_output.data()));
2126
}
2127
  #endif // SIMDUTF_SPAN
2128
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2129
2130
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2131
/**
2132
 * Using native endianness, convert possibly broken UTF-32 string into a UTF-16
2133
 * string.
2134
 *
2135
 * During the conversion also validation of the input string is done.
2136
 * This function is suitable to work with inputs from untrusted sources.
2137
 *
2138
 * This function is not BOM-aware.
2139
 *
2140
 * @param input         the UTF-32 string to convert
2141
 * @param length        the length of the string in 4-byte code units (char32_t)
2142
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2143
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2144
 */
2145
simdutf_warn_unused size_t convert_utf32_to_utf16(
2146
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2147
  #if SIMDUTF_SPAN
2148
simdutf_really_inline simdutf_warn_unused size_t
2149
convert_utf32_to_utf16(std::span<const char32_t> utf32_input,
2150
0
                       std::span<char16_t> utf16_output) noexcept {
2151
0
  return convert_utf32_to_utf16(utf32_input.data(), utf32_input.size(),
2152
0
                                utf16_output.data());
2153
0
}
2154
  #endif // SIMDUTF_SPAN
2155
2156
/**
2157
 * Convert possibly broken UTF-32 string into UTF-16LE string.
2158
 *
2159
 * During the conversion also validation of the input string is done.
2160
 * This function is suitable to work with inputs from untrusted sources.
2161
 *
2162
 * This function is not BOM-aware.
2163
 *
2164
 * @param input         the UTF-32 string to convert
2165
 * @param length        the length of the string in 4-byte code units (char32_t)
2166
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2167
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2168
 */
2169
simdutf_warn_unused size_t convert_utf32_to_utf16le(
2170
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2171
  #if SIMDUTF_SPAN
2172
simdutf_really_inline simdutf_warn_unused size_t
2173
convert_utf32_to_utf16le(std::span<const char32_t> utf32_input,
2174
0
                         std::span<char16_t> utf16_output) noexcept {
2175
0
  return convert_utf32_to_utf16le(utf32_input.data(), utf32_input.size(),
2176
0
                                  utf16_output.data());
2177
0
}
2178
  #endif // SIMDUTF_SPAN
2179
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2180
2181
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
2182
/**
2183
 * Convert possibly broken UTF-32 string into Latin1 string.
2184
 *
2185
 * During the conversion also validation of the input string is done.
2186
 * This function is suitable to work with inputs from untrusted sources.
2187
 *
2188
 * This function is not BOM-aware.
2189
 *
2190
 * @param input         the UTF-32 string to convert
2191
 * @param length        the length of the string in 4-byte code units (char32_t)
2192
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2193
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2194
 * or if it cannot be represented as Latin1
2195
 */
2196
simdutf_warn_unused size_t convert_utf32_to_latin1(
2197
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
2198
  #if SIMDUTF_SPAN
2199
simdutf_really_inline simdutf_warn_unused size_t convert_utf32_to_latin1(
2200
    std::span<const char32_t> utf32_input,
2201
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2202
  return convert_utf32_to_latin1(
2203
      utf32_input.data(), utf32_input.size(),
2204
      reinterpret_cast<char *>(latin1_output.data()));
2205
}
2206
  #endif // SIMDUTF_SPAN
2207
2208
/**
2209
 * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
2210
 * If the string cannot be represented as Latin1, an error is returned.
2211
 *
2212
 * During the conversion also validation of the input string is done.
2213
 * This function is suitable to work with inputs from untrusted sources.
2214
 *
2215
 * This function is not BOM-aware.
2216
 *
2217
 * @param input         the UTF-32 string to convert
2218
 * @param length        the length of the string in 4-byte code units (char32_t)
2219
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2220
 * @return a result pair struct (of type simdutf::result containing the two
2221
 * fields error and count) with an error code and either position of the error
2222
 * (in the input in code units) if any, or the number of char written if
2223
 * successful.
2224
 */
2225
simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
2226
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
2227
  #if SIMDUTF_SPAN
2228
simdutf_really_inline simdutf_warn_unused result
2229
convert_utf32_to_latin1_with_errors(
2230
    std::span<const char32_t> utf32_input,
2231
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2232
  return convert_utf32_to_latin1_with_errors(
2233
      utf32_input.data(), utf32_input.size(),
2234
      reinterpret_cast<char *>(latin1_output.data()));
2235
}
2236
  #endif // SIMDUTF_SPAN
2237
2238
/**
2239
 * Convert valid UTF-32 string into Latin1 string.
2240
 *
2241
 * This function assumes that the input string is valid UTF-32 and that it can
2242
 * be represented as Latin1. If you violate this assumption, the result is
2243
 * implementation defined and may include system-dependent behavior such as
2244
 * crashes.
2245
 *
2246
 * This function is for expert users only and not part of our public API. Use
2247
 * convert_utf32_to_latin1 instead. The function may be removed from the library
2248
 * in the future.
2249
 *
2250
 * This function is not BOM-aware.
2251
 *
2252
 * @param input         the UTF-32 string to convert
2253
 * @param length        the length of the string in 4-byte code units (char32_t)
2254
 * @param latin1_buffer   the pointer to a buffer that can hold the conversion
2255
 * result
2256
 * @return number of written code units; 0 if conversion is not possible
2257
 */
2258
simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
2259
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
2260
  #if SIMDUTF_SPAN
2261
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
2262
    std::span<const char32_t> valid_utf32_input,
2263
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2264
  return convert_valid_utf32_to_latin1(
2265
      valid_utf32_input.data(), valid_utf32_input.size(),
2266
      reinterpret_cast<char *>(latin1_output.data()));
2267
}
2268
  #endif // SIMDUTF_SPAN
2269
2270
/**
2271
 * Compute the number of bytes that this UTF-32 string would require in Latin1
2272
 * format.
2273
 *
2274
 * This function does not validate the input. It is acceptable to pass invalid
2275
 * UTF-32 strings but in such cases the result is implementation defined.
2276
 *
2277
 * This function is not BOM-aware.
2278
 *
2279
 * @param length        the length of the string in 4-byte code units (char32_t)
2280
 * @return the number of bytes required to encode the UTF-32 string as Latin1
2281
 */
2282
simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) noexcept;
2283
2284
/**
2285
 * Compute the number of 4-byte code units that this Latin1 string would require in UTF-32
2286
 * format.
2287
 *
2288
 * @param length        the length of the string in Latin1 code units (char)
2289
 * @return the length of the string in 4-byte code units (char32_t) required to
2290
 * encode the Latin1 string as UTF-32
2291
 */
2292
simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) noexcept;
2293
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
2294
2295
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2296
/**
2297
 * Convert possibly broken UTF-32 string into UTF-16BE string.
2298
 *
2299
 * During the conversion also validation of the input string is done.
2300
 * This function is suitable to work with inputs from untrusted sources.
2301
 *
2302
 * This function is not BOM-aware.
2303
 *
2304
 * @param input         the UTF-32 string to convert
2305
 * @param length        the length of the string in 4-byte code units (char32_t)
2306
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2307
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2308
 */
2309
simdutf_warn_unused size_t convert_utf32_to_utf16be(
2310
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2311
  #if SIMDUTF_SPAN
2312
simdutf_really_inline simdutf_warn_unused size_t
2313
convert_utf32_to_utf16be(std::span<const char32_t> utf32_input,
2314
0
                         std::span<char16_t> utf16_output) noexcept {
2315
0
  return convert_utf32_to_utf16be(utf32_input.data(), utf32_input.size(),
2316
0
                                  utf16_output.data());
2317
0
}
2318
  #endif // SIMDUTF_SPAN
2319
2320
/**
2321
 * Using native endianness, convert possibly broken UTF-32 string into UTF-16
2322
 * string and stop on error.
2323
 *
2324
 * During the conversion also validation of the input string is done.
2325
 * This function is suitable to work with inputs from untrusted sources.
2326
 *
2327
 * This function is not BOM-aware.
2328
 *
2329
 * @param input         the UTF-32 string to convert
2330
 * @param length        the length of the string in 4-byte code units (char32_t)
2331
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2332
 * @return a result pair struct (of type simdutf::result containing the two
2333
 * fields error and count) with an error code and either position of the error
2334
 * (in the input in code units) if any, or the number of char16_t written if
2335
 * successful.
2336
 */
2337
simdutf_warn_unused result convert_utf32_to_utf16_with_errors(
2338
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2339
  #if SIMDUTF_SPAN
2340
simdutf_really_inline simdutf_warn_unused result
2341
convert_utf32_to_utf16_with_errors(std::span<const char32_t> utf32_input,
2342
0
                                   std::span<char16_t> utf16_output) noexcept {
2343
0
  return convert_utf32_to_utf16_with_errors(
2344
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
2345
0
}
2346
  #endif // SIMDUTF_SPAN
2347
2348
/**
2349
 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
2350
 *
2351
 * During the conversion also validation of the input string is done.
2352
 * This function is suitable to work with inputs from untrusted sources.
2353
 *
2354
 * This function is not BOM-aware.
2355
 *
2356
 * @param input         the UTF-32 string to convert
2357
 * @param length        the length of the string in 4-byte code units (char32_t)
2358
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2359
 * @return a result pair struct (of type simdutf::result containing the two
2360
 * fields error and count) with an error code and either position of the error
2361
 * (in the input in code units) if any, or the number of char16_t written if
2362
 * successful.
2363
 */
2364
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
2365
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2366
  #if SIMDUTF_SPAN
2367
simdutf_really_inline simdutf_warn_unused result
2368
convert_utf32_to_utf16le_with_errors(
2369
    std::span<const char32_t> utf32_input,
2370
0
    std::span<char16_t> utf16_output) noexcept {
2371
0
  return convert_utf32_to_utf16le_with_errors(
2372
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
2373
0
}
2374
  #endif // SIMDUTF_SPAN
2375
2376
/**
2377
 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
2378
 *
2379
 * During the conversion also validation of the input string is done.
2380
 * This function is suitable to work with inputs from untrusted sources.
2381
 *
2382
 * This function is not BOM-aware.
2383
 *
2384
 * @param input         the UTF-32 string to convert
2385
 * @param length        the length of the string in 4-byte code units (char32_t)
2386
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2387
 * @return a result pair struct (of type simdutf::result containing the two
2388
 * fields error and count) with an error code and either position of the error
2389
 * (in the input in code units) if any, or the number of char16_t written if
2390
 * successful.
2391
 */
2392
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
2393
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2394
  #if SIMDUTF_SPAN
2395
simdutf_really_inline simdutf_warn_unused result
2396
convert_utf32_to_utf16be_with_errors(
2397
    std::span<const char32_t> utf32_input,
2398
0
    std::span<char16_t> utf16_output) noexcept {
2399
0
  return convert_utf32_to_utf16be_with_errors(
2400
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
2401
0
}
2402
  #endif // SIMDUTF_SPAN
2403
2404
/**
2405
 * Using native endianness, convert valid UTF-32 string into a UTF-16 string.
2406
 *
2407
 * This function assumes that the input string is valid UTF-32.
2408
 *
2409
 * This function is not BOM-aware.
2410
 *
2411
 * @param input         the UTF-32 string to convert
2412
 * @param length        the length of the string in 4-byte code units (char32_t)
2413
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
2414
 * result
2415
 * @return number of written code units; 0 if conversion is not possible
2416
 */
2417
simdutf_warn_unused size_t convert_valid_utf32_to_utf16(
2418
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2419
  #if SIMDUTF_SPAN
2420
simdutf_really_inline simdutf_warn_unused size_t
2421
convert_valid_utf32_to_utf16(std::span<const char32_t> valid_utf32_input,
2422
0
                             std::span<char16_t> utf16_output) noexcept {
2423
0
  return convert_valid_utf32_to_utf16(
2424
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
2425
0
}
2426
  #endif // SIMDUTF_SPAN
2427
2428
/**
2429
 * Convert valid UTF-32 string into UTF-16LE string.
2430
 *
2431
 * This function assumes that the input string is valid UTF-32.
2432
 *
2433
 * This function is not BOM-aware.
2434
 *
2435
 * @param input         the UTF-32 string to convert
2436
 * @param length        the length of the string in 4-byte code units (char32_t)
2437
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
2438
 * result
2439
 * @return number of written code units; 0 if conversion is not possible
2440
 */
2441
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
2442
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2443
  #if SIMDUTF_SPAN
2444
simdutf_really_inline simdutf_warn_unused size_t
2445
convert_valid_utf32_to_utf16le(std::span<const char32_t> valid_utf32_input,
2446
0
                               std::span<char16_t> utf16_output) noexcept {
2447
0
  return convert_valid_utf32_to_utf16le(
2448
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
2449
0
}
2450
  #endif // SIMDUTF_SPAN
2451
2452
/**
2453
 * Convert valid UTF-32 string into UTF-16BE string.
2454
 *
2455
 * This function assumes that the input string is valid UTF-32.
2456
 *
2457
 * This function is not BOM-aware.
2458
 *
2459
 * @param input         the UTF-32 string to convert
2460
 * @param length        the length of the string in 4-byte code units (char32_t)
2461
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
2462
 * result
2463
 * @return number of written code units; 0 if conversion is not possible
2464
 */
2465
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
2466
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2467
  #if SIMDUTF_SPAN
2468
simdutf_really_inline simdutf_warn_unused size_t
2469
convert_valid_utf32_to_utf16be(std::span<const char32_t> valid_utf32_input,
2470
0
                               std::span<char16_t> utf16_output) noexcept {
2471
0
  return convert_valid_utf32_to_utf16be(
2472
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
2473
0
}
2474
  #endif // SIMDUTF_SPAN
2475
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2476
2477
#if SIMDUTF_FEATURE_UTF16
2478
/**
2479
 * Change the endianness of the input. Can be used to go from UTF-16LE to
2480
 * UTF-16BE or from UTF-16BE to UTF-16LE.
2481
 *
2482
 * This function does not validate the input.
2483
 *
2484
 * This function is not BOM-aware.
2485
 *
2486
 * @param input         the UTF-16 string to process
2487
 * @param length        the length of the string in 2-byte code units (char16_t)
2488
 * @param output        the pointer to a buffer that can hold the conversion
2489
 * result
2490
 */
2491
void change_endianness_utf16(const char16_t *input, size_t length,
2492
                             char16_t *output) noexcept;
2493
  #if SIMDUTF_SPAN
2494
simdutf_really_inline void
2495
change_endianness_utf16(std::span<const char16_t> utf16_input,
2496
0
                        std::span<char16_t> utf16_output) noexcept {
2497
0
  return change_endianness_utf16(utf16_input.data(), utf16_input.size(),
2498
0
                                 utf16_output.data());
2499
0
}
2500
  #endif // SIMDUTF_SPAN
2501
#endif   // SIMDUTF_FEATURE_UTF16
2502
2503
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2504
/**
2505
 * Compute the number of bytes that this UTF-32 string would require in UTF-8
2506
 * format.
2507
 *
2508
 * This function does not validate the input. It is acceptable to pass invalid
2509
 * UTF-32 strings but in such cases the result is implementation defined.
2510
 *
2511
 * @param input         the UTF-32 string to convert
2512
 * @param length        the length of the string in 4-byte code units (char32_t)
2513
 * @return the number of bytes required to encode the UTF-32 string as UTF-8
2514
 */
2515
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input,
2516
                                                  size_t length) noexcept;
2517
  #if SIMDUTF_SPAN
2518
simdutf_really_inline simdutf_warn_unused size_t
2519
0
utf8_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
2520
0
  return utf8_length_from_utf32(valid_utf32_input.data(),
2521
0
                                valid_utf32_input.size());
2522
0
}
2523
  #endif // SIMDUTF_SPAN
2524
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2525
2526
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2527
/**
2528
 * Compute the number of two-byte code units that this UTF-32 string would
2529
 * require in UTF-16 format.
2530
 *
2531
 * This function does not validate the input. It is acceptable to pass invalid
2532
 * UTF-32 strings but in such cases the result is implementation defined.
2533
 *
2534
 * @param input         the UTF-32 string to convert
2535
 * @param length        the length of the string in 4-byte code units (char32_t)
2536
 * @return the number of two-byte code units required to encode the UTF-32 string as UTF-16
2537
 */
2538
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input,
2539
                                                   size_t length) noexcept;
2540
  #if SIMDUTF_SPAN
2541
simdutf_really_inline simdutf_warn_unused size_t
2542
0
utf16_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
2543
0
  return utf16_length_from_utf32(valid_utf32_input.data(),
2544
0
                                 valid_utf32_input.size());
2545
0
}
2546
  #endif // SIMDUTF_SPAN
2547
2548
/**
2549
 * Using native endianness, compute the number of 4-byte code units that this UTF-16
2550
 * string would require in UTF-32 format.
2551
 *
2552
 * This function is equivalent to count_utf16.
2553
 *
2554
 * This function does not validate the input. It is acceptable to pass invalid
2555
 * UTF-16 strings but in such cases the result is implementation defined.
2556
 *
2557
 * This function is not BOM-aware.
2558
 *
2559
 * @param input         the UTF-16 string to convert
2560
 * @param length        the length of the string in 2-byte code units (char16_t)
2561
 * @return the number of 4-byte code units required to encode the UTF-16 string as UTF-32
2562
 */
2563
simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input,
2564
                                                   size_t length) noexcept;
2565
  #if SIMDUTF_SPAN
2566
simdutf_really_inline simdutf_warn_unused size_t
2567
0
utf32_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2568
0
  return utf32_length_from_utf16(valid_utf16_input.data(),
2569
0
                                 valid_utf16_input.size());
2570
0
}
2571
  #endif // SIMDUTF_SPAN
2572
2573
/**
2574
 * Compute the number of 4-byte code units that this UTF-16LE string would require in UTF-32
2575
 * format.
2576
 *
2577
 * This function is equivalent to count_utf16le.
2578
 *
2579
 * This function does not validate the input. It is acceptable to pass invalid
2580
 * UTF-16 strings but in such cases the result is implementation defined.
2581
 *
2582
 * This function is not BOM-aware.
2583
 *
2584
 * @param input         the UTF-16LE string to convert
2585
 * @param length        the length of the string in 2-byte code units (char16_t)
2586
 * @return the number of 4-byte code units required to encode the UTF-16LE string as UTF-32
2587
 */
2588
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input,
2589
                                                     size_t length) noexcept;
2590
  #if SIMDUTF_SPAN
2591
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf16le(
2592
0
    std::span<const char16_t> valid_utf16_input) noexcept {
2593
0
  return utf32_length_from_utf16le(valid_utf16_input.data(),
2594
0
                                   valid_utf16_input.size());
2595
0
}
2596
  #endif // SIMDUTF_SPAN
2597
2598
/**
2599
 * Compute the number of 4-byte code units that this UTF-16BE string would require in UTF-32
2600
 * format.
2601
 *
2602
 * This function is equivalent to count_utf16be.
2603
 *
2604
 * This function does not validate the input. It is acceptable to pass invalid
2605
 * UTF-16 strings but in such cases the result is implementation defined.
2606
 *
2607
 * This function is not BOM-aware.
2608
 *
2609
 * @param input         the UTF-16BE string to convert
2610
 * @param length        the length of the string in 2-byte code units (char16_t)
2611
 * @return the number of 4-byte code units required to encode the UTF-16BE string as UTF-32
2612
 */
2613
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input,
2614
                                                     size_t length) noexcept;
2615
  #if SIMDUTF_SPAN
2616
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf16be(
2617
0
    std::span<const char16_t> valid_utf16_input) noexcept {
2618
0
  return utf32_length_from_utf16be(valid_utf16_input.data(),
2619
0
                                   valid_utf16_input.size());
2620
0
}
2621
  #endif // SIMDUTF_SPAN
2622
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2623
2624
#if SIMDUTF_FEATURE_UTF16
2625
/**
2626
 * Count the number of code points (characters) in the string assuming that
2627
 * it is valid.
2628
 *
2629
 * This function assumes that the input string is valid UTF-16 (native
2630
 * endianness). It is acceptable to pass invalid UTF-16 strings but in such
2631
 * cases the result is implementation defined.
2632
 *
2633
 * This function is not BOM-aware.
2634
 *
2635
 * @param input         the UTF-16 string to process
2636
 * @param length        the length of the string in 2-byte code units (char16_t)
2637
 * @return number of code points
2638
 */
2639
simdutf_warn_unused size_t count_utf16(const char16_t *input,
2640
                                       size_t length) noexcept;
2641
  #if SIMDUTF_SPAN
2642
simdutf_really_inline simdutf_warn_unused size_t
2643
0
count_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2644
0
  return count_utf16(valid_utf16_input.data(), valid_utf16_input.size());
2645
0
}
2646
  #endif // SIMDUTF_SPAN
2647
2648
/**
2649
 * Count the number of code points (characters) in the string assuming that
2650
 * it is valid.
2651
 *
2652
 * This function assumes that the input string is valid UTF-16LE.
2653
 * It is acceptable to pass invalid UTF-16 strings but in such cases
2654
 * the result is implementation defined.
2655
 *
2656
 * This function is not BOM-aware.
2657
 *
2658
 * @param input         the UTF-16LE string to process
2659
 * @param length        the length of the string in 2-byte code units (char16_t)
2660
 * @return number of code points
2661
 */
2662
simdutf_warn_unused size_t count_utf16le(const char16_t *input,
2663
                                         size_t length) noexcept;
2664
  #if SIMDUTF_SPAN
2665
simdutf_really_inline simdutf_warn_unused size_t
2666
0
count_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
2667
0
  return count_utf16le(valid_utf16_input.data(), valid_utf16_input.size());
2668
0
}
2669
  #endif // SIMDUTF_SPAN
2670
2671
/**
2672
 * Count the number of code points (characters) in the string assuming that
2673
 * it is valid.
2674
 *
2675
 * This function assumes that the input string is valid UTF-16BE.
2676
 * It is acceptable to pass invalid UTF-16 strings but in such cases
2677
 * the result is implementation defined.
2678
 *
2679
 * This function is not BOM-aware.
2680
 *
2681
 * @param input         the UTF-16BE string to process
2682
 * @param length        the length of the string in 2-byte code units (char16_t)
2683
 * @return number of code points
2684
 */
2685
simdutf_warn_unused size_t count_utf16be(const char16_t *input,
2686
                                         size_t length) noexcept;
2687
  #if SIMDUTF_SPAN
2688
simdutf_really_inline simdutf_warn_unused size_t
2689
0
count_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
2690
0
  return count_utf16be(valid_utf16_input.data(), valid_utf16_input.size());
2691
0
}
2692
  #endif // SIMDUTF_SPAN
2693
#endif   // SIMDUTF_FEATURE_UTF16
2694
2695
#if SIMDUTF_FEATURE_UTF8
2696
/**
2697
 * Count the number of code points (characters) in the string assuming that
2698
 * it is valid.
2699
 *
2700
 * This function assumes that the input string is valid UTF-8.
2701
 * It is acceptable to pass invalid UTF-8 strings but in such cases
2702
 * the result is implementation defined.
2703
 *
2704
 * @param input         the UTF-8 string to process
2705
 * @param length        the length of the string in bytes
2706
 * @return number of code points
2707
 */
2708
simdutf_warn_unused size_t count_utf8(const char *input,
2709
                                      size_t length) noexcept;
2710
  #if SIMDUTF_SPAN
2711
simdutf_really_inline simdutf_warn_unused size_t count_utf8(
2712
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
2713
  return count_utf8(reinterpret_cast<const char *>(valid_utf8_input.data()),
2714
                    valid_utf8_input.size());
2715
}
2716
  #endif // SIMDUTF_SPAN
2717
2718
/**
2719
 * Given a valid UTF-8 string having a possibly truncated last character,
2720
 * this function checks the end of string. If the last character is truncated
2721
 * (or partial), then it returns a shorter length (shorter by 1 to 3 bytes) so
2722
 * that the short UTF-8 strings only contain complete characters. If there is no
2723
 * truncated character, the original length is returned.
2724
 *
2725
 * This function assumes that the input string is valid UTF-8, but possibly
2726
 * truncated.
2727
 *
2728
 * @param input         the UTF-8 string to process
2729
 * @param length        the length of the string in bytes
2730
 * @return the length of the string in bytes, possibly shorter by 1 to 3 bytes
2731
 */
2732
simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length);
2733
  #if SIMDUTF_SPAN
2734
simdutf_really_inline simdutf_warn_unused size_t trim_partial_utf8(
2735
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
2736
  return trim_partial_utf8(
2737
      reinterpret_cast<const char *>(valid_utf8_input.data()),
2738
      valid_utf8_input.size());
2739
}
2740
  #endif // SIMDUTF_SPAN
2741
#endif   // SIMDUTF_FEATURE_UTF8
2742
2743
#if SIMDUTF_FEATURE_UTF16
2744
/**
2745
 * Given a valid UTF-16BE string having a possibly truncated last character,
2746
 * this function checks the end of string. If the last character is truncated
2747
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
2748
 * the short UTF-16BE strings only contain complete characters. If there is no
2749
 * truncated character, the original length is returned.
2750
 *
2751
 * This function assumes that the input string is valid UTF-16BE, but possibly
2752
 * truncated.
2753
 *
2754
 * @param input         the UTF-16BE string to process
2755
 * @param length        the length of the string in 16-bit units
2756
 * @return the length of the string in units, possibly shorter by 1 unit
2757
 */
2758
simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input,
2759
                                                size_t length);
2760
  #if SIMDUTF_SPAN
2761
simdutf_really_inline simdutf_warn_unused size_t
2762
0
trim_partial_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
2763
0
  return trim_partial_utf16be(valid_utf16_input.data(),
2764
0
                              valid_utf16_input.size());
2765
0
}
2766
  #endif // SIMDUTF_SPAN
2767
2768
/**
2769
 * Given a valid UTF-16LE string having a possibly truncated last character,
2770
 * this function checks the end of string. If the last character is truncated
2771
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
2772
 * the short UTF-16LE strings only contain complete characters. If there is no
2773
 * truncated character, the original length is returned.
2774
 *
2775
 * This function assumes that the input string is valid UTF-16LE, but possibly
2776
 * truncated.
2777
 *
2778
 * @param input         the UTF-16LE string to process
2779
 * @param length        the length of the string in 16-bit units
2780
 * @return the length of the string in units, possibly shorter by 1 unit
2781
 */
2782
simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input,
2783
                                                size_t length);
2784
  #if SIMDUTF_SPAN
2785
simdutf_really_inline simdutf_warn_unused size_t
2786
0
trim_partial_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
2787
0
  return trim_partial_utf16le(valid_utf16_input.data(),
2788
0
                              valid_utf16_input.size());
2789
0
}
2790
  #endif // SIMDUTF_SPAN
2791
2792
/**
2793
 * Given a valid UTF-16 string having a possibly truncated last character,
2794
 * this function checks the end of string. If the last character is truncated
2795
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
2796
 * the short UTF-16 strings only contain complete characters. If there is no
2797
 * truncated character, the original length is returned.
2798
 *
2799
 * This function assumes that the input string is valid UTF-16, but possibly
2800
 * truncated. We use the native endianness.
2801
 *
2802
 * @param input         the UTF-16 string to process
2803
 * @param length        the length of the string in 16-bit units
2804
 * @return the length of the string in units, possibly shorter by 1 unit
2805
 */
2806
simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input,
2807
                                              size_t length);
2808
  #if SIMDUTF_SPAN
2809
simdutf_really_inline simdutf_warn_unused size_t
2810
0
trim_partial_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2811
0
  return trim_partial_utf16(valid_utf16_input.data(), valid_utf16_input.size());
2812
0
}
2813
  #endif // SIMDUTF_SPAN
2814
#endif   // SIMDUTF_FEATURE_UTF16
2815
2816
#if SIMDUTF_FEATURE_BASE64
2817
  #ifndef SIMDUTF_NEED_TRAILING_ZEROES
2818
    #define SIMDUTF_NEED_TRAILING_ZEROES 1
2819
  #endif
2820
// base64_options are used to specify the base64 encoding options.
2821
// ASCII spaces are ' ', '\t', '\n', '\r', '\f'
2822
// garbage characters are characters that are not part of the base64 alphabet
2823
// nor ASCII spaces.
2824
constexpr uint64_t base64_reverse_padding =
2825
    2; /* modifier for base64_default and base64_url */
2826
enum base64_options : uint64_t {
2827
  base64_default = 0, /* standard base64 format (with padding) */
2828
  base64_url = 1,     /* base64url format (no padding) */
2829
  base64_default_no_padding =
2830
      base64_default |
2831
      base64_reverse_padding, /* standard base64 format without padding */
2832
  base64_url_with_padding =
2833
      base64_url | base64_reverse_padding, /* base64url with padding */
2834
  base64_default_accept_garbage =
2835
      4, /* standard base64 format accepting garbage characters, the input stops
2836
            with the first '=' if any */
2837
  base64_url_accept_garbage =
2838
      5, /* base64url format accepting garbage characters, the input stops with
2839
            the first '=' if any */
2840
  base64_default_or_url =
2841
      8, /* standard/base64url hybrid format (only meaningful for decoding!) */
2842
  base64_default_or_url_accept_garbage =
2843
      12, /* standard/base64url hybrid format accepting garbage characters
2844
             (only meaningful for decoding!), the input stops with the first '='
2845
             if any */
2846
};
2847
2848
  #if SIMDUTF_CPLUSPLUS17
2849
0
/**
 * Returns the name of a base64_options value as text.
 *
 * @param options       the base64 option to name
 * @return the identifier of the matching enumerator, or "<unknown>" when the
 * value matches none of the enumerators.
 */
inline std::string_view to_string(base64_options options) {
  switch (options) {
  case base64_default:
    return "base64_default";
  case base64_url:
    return "base64_url";
  // Value 2 is the enumerator base64_default_no_padding.
  // base64_reverse_padding is only the modifier constant used to build it, so
  // report the enumerator's own name.
  case base64_default_no_padding:
    return "base64_default_no_padding";
  case base64_url_with_padding:
    return "base64_url_with_padding";
  case base64_default_accept_garbage:
    return "base64_default_accept_garbage";
  case base64_url_accept_garbage:
    return "base64_url_accept_garbage";
  case base64_default_or_url:
    return "base64_default_or_url";
  case base64_default_or_url_accept_garbage:
    return "base64_default_or_url_accept_garbage";
  }
  // Reached only for values that are not named enumerators.
  return "<unknown>";
}
2870
  #endif // SIMDUTF_CPLUSPLUS17
2871
2872
// last_chunk_handling_options are used to specify the handling of the last
2873
// chunk in base64 decoding.
2874
// https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
2875
enum last_chunk_handling_options : uint64_t {
2876
  loose = 0,  /* standard base64 format, decode partial final chunk */
2877
  strict = 1, /* error when the last chunk is partial, 2 or 3 chars, and
2878
                 unpadded, or non-zero bit padding */
2879
  stop_before_partial =
2880
      2, /* if the last chunk is partial, ignore it (no error) */
2881
  only_full_chunks =
2882
      3 /* only decode full blocks (4 base64 characters, no padding) */
2883
};
2884
2885
/**
 * Returns true exactly when options is stop_before_partial or
 * only_full_chunks, and false for every other value.
 */
inline bool is_partial(last_chunk_handling_options options) {
  switch (options) {
  case stop_before_partial:
  case only_full_chunks:
    return true;
  default:
    return false;
  }
}
2888
2889
  #if SIMDUTF_CPLUSPLUS17
2890
0
/**
 * Returns the name of a last_chunk_handling_options value as text.
 *
 * @param options       the last chunk handling option to name
 * @return "loose", "strict", "stop_before_partial" or "only_full_chunks" for
 * the matching enumerator, and "<unknown>" for any other value.
 */
inline std::string_view to_string(last_chunk_handling_options options) {
  if (options == loose) {
    return "loose";
  }
  if (options == strict) {
    return "strict";
  }
  if (options == stop_before_partial) {
    return "stop_before_partial";
  }
  if (options == only_full_chunks) {
    return "only_full_chunks";
  }
  return "<unknown>";
}
2903
  #endif
2904
2905
/**
2906
 * Provide the maximal binary length in bytes given the base64 input.
2907
 * In general, if the input contains ASCII spaces, the result will be less than
2908
 * the maximum length.
2909
 *
2910
 * @param input         the base64 input to process
2911
 * @param length        the length of the base64 input in bytes
2912
 * @return maximum number of binary bytes
2913
 */
2914
simdutf_warn_unused size_t
2915
maximal_binary_length_from_base64(const char *input, size_t length) noexcept;
2916
  #if SIMDUTF_SPAN
2917
simdutf_really_inline simdutf_warn_unused size_t
2918
maximal_binary_length_from_base64(
2919
    const detail::input_span_of_byte_like auto &input) noexcept {
2920
  return maximal_binary_length_from_base64(
2921
      reinterpret_cast<const char *>(input.data()), input.size());
2922
}
2923
  #endif // SIMDUTF_SPAN
2924
2925
/**
2926
 * Provide the maximal binary length in bytes given the base64 input.
2927
 * In general, if the input contains ASCII spaces, the result will be less than
2928
 * the maximum length.
2929
 *
2930
 * @param input         the base64 input to process, in ASCII stored as 16-bit
2931
 * units
2932
 * @param length        the length of the base64 input in 16-bit units
2933
 * @return maximal number of binary bytes
2934
 */
2935
simdutf_warn_unused size_t maximal_binary_length_from_base64(
2936
    const char16_t *input, size_t length) noexcept;
2937
  #if SIMDUTF_SPAN
2938
simdutf_really_inline simdutf_warn_unused size_t
2939
0
maximal_binary_length_from_base64(std::span<const char16_t> input) noexcept {
2940
0
  return maximal_binary_length_from_base64(input.data(), input.size());
2941
0
}
2942
  #endif // SIMDUTF_SPAN
2943
2944
/**
2945
 * Convert a base64 input to a binary output.
2946
 *
2947
 * This function follows the WHATWG forgiving-base64 format, which means that it
2948
 * will ignore any ASCII spaces in the input. You may provide a padded input
2949
 * (with one or two equal signs at the end) or an unpadded input (without any
2950
 * equal signs at the end).
2951
 *
2952
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
2953
 *
2954
 * This function will fail in case of invalid input. When last_chunk_options =
2955
 * loose, there are two possible reasons for failure: the input contains a
2956
 * number of base64 characters that when divided by 4, leaves a single remainder
2957
 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
2958
 * not a valid base64 character (INVALID_BASE64_CHARACTER).
2959
 *
2960
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
2961
 * input where the invalid character was found. When the error is
2962
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
2963
 *
2964
 * The default option (simdutf::base64_default) expects the characters `+` and
2965
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
2966
 * characters `-` and `_` as part of its alphabet.
2967
 *
2968
 * The padding (`=`) is validated if present. There may be at most two padding
2969
 * characters at the end of the input. If there are any padding characters, the
2970
 * total number of characters (excluding spaces but including padding
2971
 * characters) must be divisible by four.
2972
 *
2973
 * You should call this function with a buffer that is at least
2974
 * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
2975
 * provide that much space, the function may cause a buffer overflow.
2976
 *
2977
 * Advanced users may want to tailor how the last chunk is handled. By default,
2978
 * we use a loose (forgiving) approach but we also support a strict approach
2979
 * as well as a stop_before_partial approach, as per the following proposal:
2980
 *
2981
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
2982
 *
2983
 * @param input         the base64 string to process
2984
 * @param length        the length of the string in bytes
2985
 * @param output        the pointer to a buffer that can hold the conversion
2986
 * result (should be at least maximal_binary_length_from_base64(input, length)
2987
 * bytes long).
2988
 * @param options       the base64 options to use, usually base64_default or
2989
 * base64_url, and base64_default by default.
2990
 * @param last_chunk_options the last chunk handling options,
2991
 * last_chunk_handling_options::loose by default
2992
 * but can also be last_chunk_handling_options::strict or
2993
 * last_chunk_handling_options::stop_before_partial.
2994
 * @return a result pair struct (of type simdutf::result containing the two
2995
 * fields error and count) with an error code and either position of the error
2996
 * (in the input in bytes) if any, or the number of bytes written if successful.
2997
 */
2998
simdutf_warn_unused result base64_to_binary(
2999
    const char *input, size_t length, char *output,
3000
    base64_options options = base64_default,
3001
    last_chunk_handling_options last_chunk_options = loose) noexcept;
3002
  #if SIMDUTF_SPAN
3003
simdutf_really_inline simdutf_warn_unused result base64_to_binary(
3004
    const detail::input_span_of_byte_like auto &input,
3005
    detail::output_span_of_byte_like auto &&binary_output,
3006
    base64_options options = base64_default,
3007
    last_chunk_handling_options last_chunk_options = loose) noexcept {
3008
  return base64_to_binary(reinterpret_cast<const char *>(input.data()),
3009
                          input.size(),
3010
                          reinterpret_cast<char *>(binary_output.data()),
3011
                          options, last_chunk_options);
3012
}
3013
  #endif // SIMDUTF_SPAN
3014
3015
/**
3016
 * Provide the base64 length in bytes given the length of a binary input.
3017
 *
3018
 * @param length        the length of the input in bytes
3019
 * @return number of base64 bytes
3020
 */
3021
simdutf_warn_unused size_t base64_length_from_binary(
3022
    size_t length, base64_options options = base64_default) noexcept;
3023
3024
/**
3025
 * Convert a binary input to a base64 output.
3026
 *
3027
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
3028
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
3029
 * output to ensure that the output length is a multiple of four.
3030
 *
3031
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
3032
 * of its alphabet. No padding is added at the end of the output.
3033
 *
3034
 * This function always succeeds.
3035
 *
3036
 * @param input         the binary to process
3037
 * @param length        the length of the input in bytes
3038
 * @param output        the pointer to a buffer that can hold the conversion
3039
 * result (should be at least base64_length_from_binary(length) bytes long)
3040
 * @param options       the base64 options to use, can be base64_default or
3041
 * base64_url, is base64_default by default.
3042
 * @return number of written bytes, will be equal to
3043
 * base64_length_from_binary(length, options)
3044
 */
3045
size_t binary_to_base64(const char *input, size_t length, char *output,
3046
                        base64_options options = base64_default) noexcept;
3047
  #if SIMDUTF_SPAN
3048
simdutf_really_inline simdutf_warn_unused size_t
3049
binary_to_base64(const detail::input_span_of_byte_like auto &input,
3050
                 detail::output_span_of_byte_like auto &&binary_output,
3051
                 base64_options options = base64_default) noexcept {
3052
  return binary_to_base64(
3053
      reinterpret_cast<const char *>(input.data()), input.size(),
3054
      reinterpret_cast<char *>(binary_output.data()), options);
3055
}
3056
  #endif // SIMDUTF_SPAN
3057
3058
  #if SIMDUTF_ATOMIC_REF
3059
/**
3060
 * Convert a binary input to a base64 output, using atomic accesses.
3061
 * This function comes with a potentially significant performance
3062
 * penalty, but it may be useful in some cases where the input
3063
 * buffers are shared between threads, to avoid undefined
3064
 * behavior in case of data races.
3065
 *
3066
 * The function is for advanced users. Its main use case is
3067
 * to silence sanitizer warnings. We have no documented use case
3068
 * where this function is actually necessary in terms of practical correctness.
3069
 *
3070
 * This function is only available when simdutf is compiled with
3071
 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
3072
 * the availability of this function by checking the macro
3073
 * SIMDUTF_ATOMIC_REF.
3074
 *
3075
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
3076
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
3077
 * output to ensure that the output length is a multiple of four.
3078
 *
3079
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
3080
 * of its alphabet. No padding is added at the end of the output.
3081
 *
3082
 * This function always succeeds.
3083
 *
3084
 * This function is considered experimental. It is not tested by default
3085
 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
3086
 * It is not documented in the public API documentation (README). It is
3087
 * offered on a best effort basis. We rely on the community for further
3088
 * testing and feedback.
3089
 *
3090
 * @brief atomic_binary_to_base64
3091
 * @param input         the binary to process
3092
 * @param length        the length of the input in bytes
3093
 * @param output        the pointer to a buffer that can hold the conversion
3094
 * result (should be at least base64_length_from_binary(length) bytes long)
3095
 * @param options       the base64 options to use, can be base64_default or
3096
 * base64_url, is base64_default by default.
3097
 * @return number of written bytes, will be equal to
3098
 * base64_length_from_binary(length, options)
3099
 */
3100
size_t
3101
atomic_binary_to_base64(const char *input, size_t length, char *output,
3102
                        base64_options options = base64_default) noexcept;
3103
    #if SIMDUTF_SPAN
3104
simdutf_really_inline simdutf_warn_unused size_t
3105
atomic_binary_to_base64(const detail::input_span_of_byte_like auto &input,
3106
                        detail::output_span_of_byte_like auto &&binary_output,
3107
                        base64_options options = base64_default) noexcept {
3108
  return atomic_binary_to_base64(
3109
      reinterpret_cast<const char *>(input.data()), input.size(),
3110
      reinterpret_cast<char *>(binary_output.data()), options);
3111
}
3112
    #endif // SIMDUTF_SPAN
3113
  #endif   // SIMDUTF_ATOMIC_REF
3114
3115
/**
3116
 * Convert a base64 input to a binary output.
3117
 *
3118
 * This function follows the WHATWG forgiving-base64 format, which means that it
3119
 * will ignore any ASCII spaces in the input. You may provide a padded input
3120
 * (with one or two equal signs at the end) or an unpadded input (without any
3121
 * equal signs at the end).
3122
 *
3123
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
3124
 *
3125
 * This function will fail in case of invalid input. When last_chunk_options =
3126
 * loose, there are two possible reasons for failure: the input contains a
3127
 * number of base64 characters that when divided by 4, leaves a single remainder
3128
 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
3129
 * not a valid base64 character (INVALID_BASE64_CHARACTER).
3130
 *
3131
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
3132
 * input where the invalid character was found. When the error is
3133
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
3134
 *
3135
 * The default option (simdutf::base64_default) expects the characters `+` and
3136
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
3137
 * characters `-` and `_` as part of its alphabet.
3138
 *
3139
 * The padding (`=`) is validated if present. There may be at most two padding
3140
 * characters at the end of the input. If there are any padding characters, the
3141
 * total number of characters (excluding spaces but including padding
3142
 * characters) must be divisible by four.
3143
 *
3144
 * You should call this function with a buffer that is at least
3145
 * maximal_binary_length_from_base64(input, length) bytes long. If you fail
3146
 * to provide that much space, the function may cause a buffer overflow.
3147
 *
3148
 * Advanced users may want to tailor how the last chunk is handled. By default,
3149
 * we use a loose (forgiving) approach but we also support a strict approach
3150
 * as well as a stop_before_partial approach, as per the following proposal:
3151
 *
3152
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3153
 *
3154
 * @param input         the base64 string to process, in ASCII stored as 16-bit
3155
 * units
3156
 * @param length        the length of the string in 16-bit units
3157
 * @param output        the pointer to a buffer that can hold the conversion
3158
 * result (should be at least maximal_binary_length_from_base64(input, length)
3159
 * bytes long).
3160
 * @param options       the base64 options to use, can be base64_default or
3161
 * base64_url, is base64_default by default.
3162
 * @param last_chunk_options the last chunk handling options,
3163
 * last_chunk_handling_options::loose by default
3164
 * but can also be last_chunk_handling_options::strict or
3165
 * last_chunk_handling_options::stop_before_partial.
3166
 * @return a result pair struct (of type simdutf::result containing the two
3167
 * fields error and count) with an error code and position of the
3168
 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
3169
 * of bytes written if successful.
3170
 */
3171
simdutf_warn_unused result
3172
base64_to_binary(const char16_t *input, size_t length, char *output,
3173
                 base64_options options = base64_default,
3174
                 last_chunk_handling_options last_chunk_options =
3175
                     last_chunk_handling_options::loose) noexcept;
3176
  #if SIMDUTF_SPAN
3177
simdutf_really_inline simdutf_warn_unused result base64_to_binary(
3178
    std::span<const char16_t> input,
3179
    detail::output_span_of_byte_like auto &&binary_output,
3180
    base64_options options = base64_default,
3181
    last_chunk_handling_options last_chunk_options = loose) noexcept {
3182
  return base64_to_binary(input.data(), input.size(),
3183
                          reinterpret_cast<char *>(binary_output.data()),
3184
                          options, last_chunk_options);
3185
}
3186
  #endif // SIMDUTF_SPAN
3187
3188
/**
3189
 * Check if a character is an ignorable base64 character.
3190
 * Checking a large input, character by character, is not computationally
3191
 * efficient.
3192
 *
3193
 * @param input         the character to check
3194
 * @param options       the base64 options to use, is base64_default by default.
3195
 * @return true if the character is an ignorable base64 character, false
3196
 * otherwise.
3197
 */
3198
simdutf_warn_unused bool
3199
base64_ignorable(char input, base64_options options = base64_default) noexcept;
3200
simdutf_warn_unused bool
3201
base64_ignorable(char16_t input,
3202
                 base64_options options = base64_default) noexcept;
3203
3204
/**
3205
 * Check if a character is a valid base64 character.
3206
 * Checking a large input, character by character, is not computationally
3207
 * efficient.
3208
 * Note that padding characters are not considered valid base64 characters in
3209
 * this context, nor are spaces.
3210
 *
3211
 * @param input         the character to check
3212
 * @param options       the base64 options to use, is base64_default by default.
3213
 * @return true if the character is a base64 character, false otherwise.
3214
 */
3215
simdutf_warn_unused bool
3216
base64_valid(char input, base64_options options = base64_default) noexcept;
3217
simdutf_warn_unused bool
3218
base64_valid(char16_t input, base64_options options = base64_default) noexcept;
3219
3220
/**
3221
 * Check if a character is a valid base64 character or the padding character
3222
 * ('='). Checking a large input, character by character, is not computationally
3223
 * efficient.
3224
 *
3225
 * @param input         the character to check
3226
 * @param options       the base64 options to use, is base64_default by default.
3227
 * @return true if the character is a base64 character or '=', false otherwise.
3228
 */
3229
simdutf_warn_unused bool
3230
base64_valid_or_padding(char input,
3231
                        base64_options options = base64_default) noexcept;
3232
simdutf_warn_unused bool
3233
base64_valid_or_padding(char16_t input,
3234
                        base64_options options = base64_default) noexcept;
3235
3236
/**
3237
 * Convert a base64 input to a binary output.
3238
 *
3239
 * This function follows the WHATWG forgiving-base64 format, which means that it
3240
 * will ignore any ASCII spaces in the input. You may provide a padded input
3241
 * (with one or two equal signs at the end) or an unpadded input (without any
3242
 * equal signs at the end).
3243
 *
3244
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
3245
 *
3246
 * This function will fail in case of invalid input. When last_chunk_options =
3247
 * loose, there are three possible reasons for failure: the input contains a
3248
 * number of base64 characters that when divided by 4, leaves a single remainder
3249
 * character (BASE64_INPUT_REMAINDER), the input contains a character that is
3250
 * not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer
3251
 * is too small (OUTPUT_BUFFER_TOO_SMALL).
3252
 *
3253
 * When OUTPUT_BUFFER_TOO_SMALL, we return both the number of bytes written
3254
 * and the number of units processed, see description of the parameters and
3255
 * returned value.
3256
 *
3257
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
3258
 * input where the invalid character was found. When the error is
3259
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
3260
 *
3261
 * The default option (simdutf::base64_default) expects the characters `+` and
3262
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
3263
 * characters `-` and `_` as part of its alphabet.
3264
 *
3265
 * The padding (`=`) is validated if present. There may be at most two padding
3266
 * characters at the end of the input. If there are any padding characters, the
3267
 * total number of characters (excluding spaces but including padding
3268
 * characters) must be divisible by four.
3269
 *
3270
 * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected
3271
 * to discard the output unless the parameter decode_up_to_bad_char is set to
3272
 * true. In that case, the function will decode up to the first invalid
3273
 * character. Extra padding characters ('=') are considered invalid characters.
3274
 *
3275
 * Advanced users may want to tailor how the last chunk is handled. By default,
3276
 * we use a loose (forgiving) approach but we also support a strict approach
3277
 * as well as a stop_before_partial approach, as per the following proposal:
3278
 *
3279
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3280
 *
3281
 * @param input         the base64 string to process, in ASCII stored as 8-bit
3282
 * or 16-bit units
3283
 * @param length        the length of the string in 8-bit or 16-bit units.
3284
 * @param output        the pointer to a buffer that can hold the conversion
3285
 * result.
3286
 * @param outlen        the number of bytes that can be written in the output
3287
 * buffer. Upon return, it is modified to reflect how many bytes were written.
3288
 * @param options       the base64 options to use, can be base64_default or
3289
 * base64_url, is base64_default by default.
3290
 * @param last_chunk_options the last chunk handling options,
3291
 * last_chunk_handling_options::loose by default
3292
 * but can also be last_chunk_handling_options::strict or
3293
 * last_chunk_handling_options::stop_before_partial.
3294
 * @param decode_up_to_bad_char if true, the function will decode up to the
3295
 * first invalid character. By default (false), it is assumed that the output
3296
 * buffer is to be discarded. When there are multiple errors in the input,
3297
 * using decode_up_to_bad_char might trigger a different error.
3298
 * @return a result pair struct (of type simdutf::result containing the two
3299
 * fields error and count) with an error code and position of the
3300
 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
3301
 * of units processed if successful.
3302
 */
3303
simdutf_warn_unused result
3304
base64_to_binary_safe(const char *input, size_t length, char *output,
3305
                      size_t &outlen, base64_options options = base64_default,
3306
                      last_chunk_handling_options last_chunk_options =
3307
                          last_chunk_handling_options::loose,
3308
                      bool decode_up_to_bad_char = false) noexcept;
3309
  #if SIMDUTF_SPAN
3310
/**
3311
 * @brief span overload
3312
 * @return a tuple of result and outlen
3313
 */
3314
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
3315
base64_to_binary_safe(const detail::input_span_of_byte_like auto &input,
3316
                      detail::output_span_of_byte_like auto &&binary_output,
3317
                      base64_options options = base64_default,
3318
                      last_chunk_handling_options last_chunk_options = loose,
3319
                      bool decode_up_to_bad_char = false) noexcept {
3320
  size_t outlen = binary_output.size();
3321
  auto r = base64_to_binary_safe(
3322
      reinterpret_cast<const char *>(input.data()), input.size(),
3323
      reinterpret_cast<char *>(binary_output.data()), outlen, options,
3324
      last_chunk_options, decode_up_to_bad_char);
3325
  return {r, outlen};
3326
}
3327
  #endif // SIMDUTF_SPAN
3328
3329
simdutf_warn_unused result
3330
base64_to_binary_safe(const char16_t *input, size_t length, char *output,
3331
                      size_t &outlen, base64_options options = base64_default,
3332
                      last_chunk_handling_options last_chunk_options =
3333
                          last_chunk_handling_options::loose,
3334
                      bool decode_up_to_bad_char = false) noexcept;
3335
  #if SIMDUTF_SPAN
3336
/**
3337
 * @brief span overload
3338
 * @return a tuple of result and outlen
3339
 */
3340
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
3341
base64_to_binary_safe(std::span<const char16_t> input,
3342
                      detail::output_span_of_byte_like auto &&binary_output,
3343
                      base64_options options = base64_default,
3344
                      last_chunk_handling_options last_chunk_options = loose,
3345
                      bool decode_up_to_bad_char = false) noexcept {
3346
  size_t outlen = binary_output.size();
3347
  auto r = base64_to_binary_safe(input.data(), input.size(),
3348
                                 reinterpret_cast<char *>(binary_output.data()),
3349
                                 outlen, options, last_chunk_options,
3350
                                 decode_up_to_bad_char);
3351
  return {r, outlen};
3352
}
3353
  #endif // SIMDUTF_SPAN
3354
3355
  #if SIMDUTF_ATOMIC_REF
3356
/**
3357
 * Convert a base64 input to a binary output with a size limit and using atomic
3358
 * operations.
3359
 *
3360
 * Like `base64_to_binary_safe` but using atomic operations, this function is
3361
 * thread-safe for concurrent memory access, allowing the output
3362
 * buffers to be shared between threads without undefined behavior in case of
3363
 * data races.
3364
 *
3365
 * This function comes with a potentially significant performance penalty, but
3366
 * is useful when thread safety is needed during base64 decoding.
3367
 *
3368
 * This function is only available when simdutf is compiled with
3369
 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
3370
 * the availability of this function by checking the macro
3371
 * SIMDUTF_ATOMIC_REF.
3372
 *
3373
 * This function is considered experimental. It is not tested by default
3374
 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
3375
 * It is not documented in the public API documentation (README). It is
3376
 * offered on a best effort basis. We rely on the community for further
3377
 * testing and feedback.
3378
 *
3379
 * @param input         the base64 input to decode
3380
 * @param length        the length of the input in bytes
3381
 * @param output        the pointer to buffer that can hold the conversion
3382
 * result
3383
 * @param outlen        the number of bytes that can be written in the output
3384
 * buffer. Upon return, it is modified to reflect how many bytes were written.
3385
 * @param options       the base64 options to use (default, url, etc.)
3386
 * @param last_chunk_options the last chunk handling options (loose, strict,
3387
 * stop_before_partial)
3388
 * @param decode_up_to_bad_char if true, the function will decode up to the
3389
 * first invalid character. By default (false), it is assumed that the output
3390
 * buffer is to be discarded. When there are multiple errors in the input,
3391
 * using decode_up_to_bad_char might trigger a different error.
3392
 * @return a result struct with an error code and count indicating error
3393
 * position or success
3394
 */
3395
simdutf_warn_unused result atomic_base64_to_binary_safe(
3396
    const char *input, size_t length, char *output, size_t &outlen,
3397
    base64_options options = base64_default,
3398
    last_chunk_handling_options last_chunk_options =
3399
        last_chunk_handling_options::loose,
3400
    bool decode_up_to_bad_char = false) noexcept;
3401
simdutf_warn_unused result atomic_base64_to_binary_safe(
3402
    const char16_t *input, size_t length, char *output, size_t &outlen,
3403
    base64_options options = base64_default,
3404
    last_chunk_handling_options last_chunk_options = loose,
3405
    bool decode_up_to_bad_char = false) noexcept;
3406
    #if SIMDUTF_SPAN
3407
/**
3408
 * @brief span overload
3409
 * @return a tuple of result and outlen
3410
 */
3411
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
3412
atomic_base64_to_binary_safe(
3413
    const detail::input_span_of_byte_like auto &binary_input,
3414
    detail::output_span_of_byte_like auto &&output,
3415
    base64_options options = base64_default,
3416
    last_chunk_handling_options last_chunk_options =
3417
        last_chunk_handling_options::loose,
3418
    bool decode_up_to_bad_char = false) noexcept {
3419
  size_t outlen = output.size();
3420
  auto ret = atomic_base64_to_binary_safe(
3421
      reinterpret_cast<const char *>(binary_input.data()), binary_input.size(),
3422
      reinterpret_cast<char *>(output.data()), outlen, options,
3423
      last_chunk_options, decode_up_to_bad_char);
3424
  return {ret, outlen};
3425
}
3426
/**
3427
 * @brief span overload
3428
 * @return a tuple of result and outlen
3429
 */
3430
simdutf_warn_unused std::tuple<result, std::size_t>
3431
atomic_base64_to_binary_safe(
3432
    std::span<const char16_t> base64_input,
3433
    detail::output_span_of_byte_like auto &&binary_output,
3434
    base64_options options = base64_default,
3435
    last_chunk_handling_options last_chunk_options = loose,
3436
    bool decode_up_to_bad_char = false) noexcept {
3437
  size_t outlen = binary_output.size();
3438
  auto ret = atomic_base64_to_binary_safe(
3439
      base64_input.data(), base64_input.size(),
3440
      reinterpret_cast<char *>(binary_output.data()), outlen, options,
3441
      last_chunk_options, decode_up_to_bad_char);
3442
  return {ret, outlen};
3443
}
3444
    #endif // SIMDUTF_SPAN
3445
  #endif   // SIMDUTF_ATOMIC_REF
3446
3447
/**
3448
 * Find the first occurrence of a character in a string. If the character is
3449
 * not found, return a pointer to the end of the string.
3450
 * @param start        the start of the string
3451
 * @param end          the end of the string
3452
 * @param character    the character to find
3453
 * @return a pointer to the first occurrence of the character in the string,
3454
 * or a pointer to the end of the string if the character is not found.
3455
 *
3456
 */
3457
simdutf_warn_unused const char *find(const char *start, const char *end,
3458
                                     char character) noexcept;
3459
simdutf_warn_unused const char16_t *
3460
find(const char16_t *start, const char16_t *end, char16_t character) noexcept;
3461
#endif // SIMDUTF_FEATURE_BASE64
3462
3463
/**
3464
 * An implementation of simdutf for a particular CPU architecture.
3465
 *
3466
 * Also used to maintain the currently active implementation. The active
3467
 * implementation is automatically initialized on first use to the most advanced
3468
 * implementation supported by the host.
3469
 */
3470
class implementation {
3471
public:
3472
  /**
3473
   * The name of this implementation.
3474
   *
3475
   *     const implementation *impl = simdutf::active_implementation;
3476
   *     cout << "simdutf is optimized for " << impl->name() << "(" <<
3477
   * impl->description() << ")" << endl;
3478
   *
3479
   * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
3480
   */
3481
  virtual std::string name() const {
    // Hand back an owning copy of the stored name.
    std::string impl_name(_name);
    return impl_name;
  }
3482
3483
  /**
3484
   * The description of this implementation.
3485
   *
3486
   *     const implementation *impl = simdutf::active_implementation;
3487
   *     cout << "simdutf is optimized for " << impl->name() << "(" <<
3488
   * impl->description() << ")" << endl;
3489
   *
3490
   * @return a human-readable description of the implementation
3491
   */
3492
  virtual std::string description() const {
    // Hand back an owning copy of the stored description.
    std::string impl_description(_description);
    return impl_description;
  }
3493
3494
  /**
3495
   * Whether the instruction sets this implementation is compiled against
3496
   * and the current CPU match. This function may poll the current CPU/system
3497
   * and should therefore not be called too often if performance is a concern.
3498
   *
3499
   *
3500
   * @return true if the implementation can be safely used on the current system
3501
   * (determined at runtime)
3502
   */
3503
  bool supported_by_runtime_system() const;
3504
3505
#if SIMDUTF_FEATURE_DETECT_ENCODING
3506
  /**
3507
   * This function will try to detect the encoding
3508
   * @param input the string to identify
3509
   * @param length the length of the string in bytes.
3510
   * @return the encoding type detected
3511
   */
3512
  virtual encoding_type autodetect_encoding(const char *input,
3513
                                            size_t length) const noexcept;
3514
3515
  /**
3516
   * This function will try to detect the possible encodings in one pass
3517
   * @param input the string to identify
3518
   * @param length the length of the string in bytes.
3519
   * @return an int describing the possible encodings detected
3520
   */
3521
  virtual int detect_encodings(const char *input,
3522
                               size_t length) const noexcept = 0;
3523
#endif // SIMDUTF_FEATURE_DETECT_ENCODING
3524
3525
  /**
3526
   * @private For internal implementation use
3527
   *
3528
   * The instruction sets this implementation is compiled against.
3529
   *
3530
   * @return a mask of all required `internal::instruction_set::` values
3531
   */
3532
  virtual uint32_t required_instruction_sets() const {
3533
    return _required_instruction_sets;
3534
  }
3535
3536
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
3537
  /**
3538
   * Validate the UTF-8 string.
3539
   *
3540
   * Overridden by each implementation.
3541
   *
3542
   * @param buf the UTF-8 string to validate.
3543
   * @param len the length of the string in bytes.
3544
   * @return true if and only if the string is valid UTF-8.
3545
   */
3546
  simdutf_warn_unused virtual bool validate_utf8(const char *buf,
3547
                                                 size_t len) const noexcept = 0;
3548
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
3549
3550
#if SIMDUTF_FEATURE_UTF8
3551
  /**
3552
   * Validate the UTF-8 string and stop on errors.
3553
   *
3554
   * Overridden by each implementation.
3555
   *
3556
   * @param buf the UTF-8 string to validate.
3557
   * @param len the length of the string in bytes.
3558
   * @return a result pair struct (of type simdutf::result containing the two
3559
   * fields error and count) with an error code and either position of the error
3560
   * (in the input in code units) if any, or the number of code units validated
3561
   * if successful.
3562
   */
3563
  simdutf_warn_unused virtual result
3564
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0;
3565
#endif // SIMDUTF_FEATURE_UTF8
3566
3567
#if SIMDUTF_FEATURE_ASCII
3568
  /**
3569
   * Validate the ASCII string.
3570
   *
3571
   * Overridden by each implementation.
3572
   *
3573
   * @param buf the ASCII string to validate.
3574
   * @param len the length of the string in bytes.
3575
   * @return true if and only if the string is valid ASCII.
3576
   */
3577
  simdutf_warn_unused virtual bool
3578
  validate_ascii(const char *buf, size_t len) const noexcept = 0;
3579
3580
  /**
3581
   * Validate the ASCII string and stop on error.
3582
   *
3583
   * Overridden by each implementation.
3584
   *
3585
   * @param buf the ASCII string to validate.
3586
   * @param len the length of the string in bytes.
3587
   * @return a result pair struct (of type simdutf::result containing the two
3588
   * fields error and count) with an error code and either position of the error
3589
   * (in the input in code units) if any, or the number of code units validated
3590
   * if successful.
3591
   */
3592
  simdutf_warn_unused virtual result
3593
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0;
3594
#endif // SIMDUTF_FEATURE_ASCII
3595
3596
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
3597
  /**
3598
   * Validate the UTF-16LE string. This function may be best when you expect
3599
   * the input to be almost always valid. Otherwise, consider using
3600
   * validate_utf16le_with_errors.
3601
   *
3602
   * Overridden by each implementation.
3603
   *
3604
   * This function is not BOM-aware.
3605
   *
3606
   * @param buf the UTF-16LE string to validate.
3607
   * @param len the length of the string in number of 2-byte code units
3608
   * (char16_t).
3609
   * @return true if and only if the string is valid UTF-16LE.
3610
   */
3611
  simdutf_warn_unused virtual bool
3612
  validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0;
3613
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
3614
3615
#if SIMDUTF_FEATURE_UTF16
3616
  /**
3617
   * Validate the UTF-16BE string. This function may be best when you expect
3618
   * the input to be almost always valid. Otherwise, consider using
3619
   * validate_utf16be_with_errors.
3620
   *
3621
   * Overridden by each implementation.
3622
   *
3623
   * This function is not BOM-aware.
3624
   *
3625
   * @param buf the UTF-16BE string to validate.
3626
   * @param len the length of the string in number of 2-byte code units
3627
   * (char16_t).
3628
   * @return true if and only if the string is valid UTF-16BE.
3629
   */
3630
  simdutf_warn_unused virtual bool
3631
  validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0;
3632
3633
  /**
3634
   * Validate the UTF-16LE string and stop on error.  It might be faster than
3635
   * validate_utf16le when an error is expected to occur early.
3636
   *
3637
   * Overridden by each implementation.
3638
   *
3639
   * This function is not BOM-aware.
3640
   *
3641
   * @param buf the UTF-16LE string to validate.
3642
   * @param len the length of the string in number of 2-byte code units
3643
   * (char16_t).
3644
   * @return a result pair struct (of type simdutf::result containing the two
3645
   * fields error and count) with an error code and either position of the error
3646
   * (in the input in code units) if any, or the number of code units validated
3647
   * if successful.
3648
   */
3649
  simdutf_warn_unused virtual result
3650
  validate_utf16le_with_errors(const char16_t *buf,
3651
                               size_t len) const noexcept = 0;
3652
3653
  /**
3654
   * Validate the UTF-16BE string and stop on error. It might be faster than
3655
   * validate_utf16be when an error is expected to occur early.
3656
   *
3657
   * Overridden by each implementation.
3658
   *
3659
   * This function is not BOM-aware.
3660
   *
3661
   * @param buf the UTF-16BE string to validate.
3662
   * @param len the length of the string in number of 2-byte code units
3663
   * (char16_t).
3664
   * @return a result pair struct (of type simdutf::result containing the two
3665
   * fields error and count) with an error code and either position of the error
3666
   * (in the input in code units) if any, or the number of code units validated
3667
   * if successful.
3668
   */
3669
  simdutf_warn_unused virtual result
3670
  validate_utf16be_with_errors(const char16_t *buf,
3671
                               size_t len) const noexcept = 0;
3672
  /**
3673
   * Copies the UTF-16LE string while replacing mismatched surrogates with the
3674
   * Unicode replacement character U+FFFD. We allow the input and output to be
3675
   * the same buffer so that the correction is done in-place.
3676
   *
3677
   * Overridden by each implementation.
3678
   *
3679
   * @param input the UTF-16LE string to correct.
3680
   * @param len the length of the string in number of 2-byte code units
3681
   * (char16_t).
3682
   * @param output the output buffer.
3683
   */
3684
  virtual void to_well_formed_utf16le(const char16_t *input, size_t len,
3685
                                      char16_t *output) const noexcept = 0;
3686
  /**
3687
   * Copies the UTF-16BE string while replacing mismatched surrogates with the
3688
   * Unicode replacement character U+FFFD. We allow the input and output to be
3689
   * the same buffer so that the correction is done in-place.
3690
   *
3691
   * Overridden by each implementation.
3692
   *
3693
   * @param input the UTF-16BE string to correct.
3694
   * @param len the length of the string in number of 2-byte code units
3695
   * (char16_t).
3696
   * @param output the output buffer.
3697
   */
3698
  virtual void to_well_formed_utf16be(const char16_t *input, size_t len,
3699
                                      char16_t *output) const noexcept = 0;
3700
#endif // SIMDUTF_FEATURE_UTF16
3701
3702
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
3703
  /**
3704
   * Validate the UTF-32 string.
3705
   *
3706
   * Overridden by each implementation.
3707
   *
3708
   * This function is not BOM-aware.
3709
   *
3710
   * @param buf the UTF-32 string to validate.
3711
   * @param len the length of the string in number of 4-byte code units
3712
   * (char32_t).
3713
   * @return true if and only if the string is valid UTF-32.
3714
   */
3715
  simdutf_warn_unused virtual bool
3716
  validate_utf32(const char32_t *buf, size_t len) const noexcept = 0;
3717
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
3718
3719
#if SIMDUTF_FEATURE_UTF32
3720
  /**
3721
   * Validate the UTF-32 string and stop on error.
3722
   *
3723
   * Overridden by each implementation.
3724
   *
3725
   * This function is not BOM-aware.
3726
   *
3727
   * @param buf the UTF-32 string to validate.
3728
   * @param len the length of the string in number of 4-byte code units
3729
   * (char32_t).
3730
   * @return a result pair struct (of type simdutf::result containing the two
3731
   * fields error and count) with an error code and either position of the error
3732
   * (in the input in code units) if any, or the number of code units validated
3733
   * if successful.
3734
   */
3735
  simdutf_warn_unused virtual result
3736
  validate_utf32_with_errors(const char32_t *buf,
3737
                             size_t len) const noexcept = 0;
3738
#endif // SIMDUTF_FEATURE_UTF32
3739
3740
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
3741
  /**
3742
   * Convert Latin1 string into UTF-8 string.
3743
   *
3744
   * This function is suitable to work with inputs from untrusted sources.
3745
   *
3746
   * @param input         the Latin1 string to convert
3747
   * @param length        the length of the string in bytes
3748
   * @param utf8_output  the pointer to buffer that can hold conversion result
3749
   * @return the number of written char; 0 if conversion is not possible
3750
   */
3751
  simdutf_warn_unused virtual size_t
3752
  convert_latin1_to_utf8(const char *input, size_t length,
3753
                         char *utf8_output) const noexcept = 0;
3754
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
3755
3756
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
3757
  /**
3758
   * Convert Latin1 string into UTF-16LE string.
3759
   *
3760
   * This function is suitable to work with inputs from untrusted sources.
3761
   *
3762
   * @param input         the Latin1 string to convert
3763
   * @param length        the length of the string in bytes
3764
   * @param utf16_output  the pointer to buffer that can hold conversion result
3765
   * @return the number of written char16_t; 0 if conversion is not possible
3766
   */
3767
  simdutf_warn_unused virtual size_t
3768
  convert_latin1_to_utf16le(const char *input, size_t length,
3769
                            char16_t *utf16_output) const noexcept = 0;
3770
3771
  /**
3772
   * Convert Latin1 string into UTF-16BE string.
3773
   *
3774
   * This function is suitable to work with inputs from untrusted sources.
3775
   *
3776
   * @param input         the Latin1 string to convert
3777
   * @param length        the length of the string in bytes
3778
   * @param utf16_output  the pointer to buffer that can hold conversion result
3779
   * @return the number of written char16_t; 0 if conversion is not possible
3780
   */
3781
  simdutf_warn_unused virtual size_t
3782
  convert_latin1_to_utf16be(const char *input, size_t length,
3783
                            char16_t *utf16_output) const noexcept = 0;
3784
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
3785
3786
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
3787
  /**
3788
   * Convert Latin1 string into UTF-32 string.
3789
   *
3790
   * This function is suitable to work with inputs from untrusted sources.
3791
   *
3792
   * @param input         the Latin1 string to convert
3793
   * @param length        the length of the string in bytes
3794
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
3795
   * @return the number of written char32_t; 0 if conversion is not possible
3796
   */
3797
  simdutf_warn_unused virtual size_t
3798
  convert_latin1_to_utf32(const char *input, size_t length,
3799
                          char32_t *utf32_buffer) const noexcept = 0;
3800
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
3801
3802
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
3803
  /**
3804
   * Convert possibly broken UTF-8 string into latin1 string.
3805
   *
3806
   * During the conversion also validation of the input string is done.
3807
   * This function is suitable to work with inputs from untrusted sources.
3808
   *
3809
   * @param input         the UTF-8 string to convert
3810
   * @param length        the length of the string in bytes
3811
   * @param latin1_output  the pointer to buffer that can hold conversion result
3812
   * @return the number of written char; 0 if the input was not valid UTF-8
3813
   * string or if it cannot be represented as Latin1
3814
   */
3815
  simdutf_warn_unused virtual size_t
3816
  convert_utf8_to_latin1(const char *input, size_t length,
3817
                         char *latin1_output) const noexcept = 0;
3818
3819
  /**
3820
   * Convert possibly broken UTF-8 string into latin1 string with errors.
3821
   * If the string cannot be represented as Latin1, an error
3822
   * code is returned.
3823
   *
3824
   * During the conversion also validation of the input string is done.
3825
   * This function is suitable to work with inputs from untrusted sources.
3826
   *
3827
   * @param input         the UTF-8 string to convert
3828
   * @param length        the length of the string in bytes
3829
   * @param latin1_output  the pointer to buffer that can hold conversion result
3830
   * @return a result pair struct (of type simdutf::result containing the two
3831
   * fields error and count) with an error code and either position of the error
3832
   * (in the input in code units) if any, or the number of char written
3833
   * if successful.
3834
   */
3835
  simdutf_warn_unused virtual result
3836
  convert_utf8_to_latin1_with_errors(const char *input, size_t length,
3837
                                     char *latin1_output) const noexcept = 0;
3838
3839
  /**
3840
   * Convert valid UTF-8 string into latin1 string.
3841
   *
3842
   * This function assumes that the input string is valid UTF-8 and that it can
3843
   * be represented as Latin1. If you violate this assumption, the result is
3844
   * implementation defined and may include system-dependent behavior such as
3845
   * crashes.
3846
   *
3847
   * This function is for expert users only and not part of our public API. Use
3848
   * convert_utf8_to_latin1 instead.
3849
   *
3850
   * This function is not BOM-aware.
3851
   *
3852
   * @param input         the UTF-8 string to convert
3853
   * @param length        the length of the string in bytes
3854
   * @param latin1_output  the pointer to buffer that can hold conversion result
3855
   * @return the number of written char; 0 if the input was not valid UTF-8
3856
   * string
3857
   */
3858
  simdutf_warn_unused virtual size_t
3859
  convert_valid_utf8_to_latin1(const char *input, size_t length,
3860
                               char *latin1_output) const noexcept = 0;
3861
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
3862
3863
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
3864
  /**
3865
   * Convert possibly broken UTF-8 string into UTF-16LE string.
3866
   *
3867
   * During the conversion also validation of the input string is done.
3868
   * This function is suitable to work with inputs from untrusted sources.
3869
   *
3870
   * @param input         the UTF-8 string to convert
3871
   * @param length        the length of the string in bytes
3872
   * @param utf16_output  the pointer to buffer that can hold conversion result
3873
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
3874
   * string
3875
   */
3876
  simdutf_warn_unused virtual size_t
3877
  convert_utf8_to_utf16le(const char *input, size_t length,
3878
                          char16_t *utf16_output) const noexcept = 0;
3879
3880
  /**
3881
   * Convert possibly broken UTF-8 string into UTF-16BE string.
3882
   *
3883
   * During the conversion also validation of the input string is done.
3884
   * This function is suitable to work with inputs from untrusted sources.
3885
   *
3886
   * @param input         the UTF-8 string to convert
3887
   * @param length        the length of the string in bytes
3888
   * @param utf16_output  the pointer to buffer that can hold conversion result
3889
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
3890
   * string
3891
   */
3892
  simdutf_warn_unused virtual size_t
3893
  convert_utf8_to_utf16be(const char *input, size_t length,
3894
                          char16_t *utf16_output) const noexcept = 0;
3895
3896
  /**
3897
   * Convert possibly broken UTF-8 string into UTF-16LE string and stop on
3898
   * error.
3899
   *
3900
   * During the conversion also validation of the input string is done.
3901
   * This function is suitable to work with inputs from untrusted sources.
3902
   *
3903
   * @param input         the UTF-8 string to convert
3904
   * @param length        the length of the string in bytes
3905
   * @param utf16_output  the pointer to buffer that can hold conversion result
3906
   * @return a result pair struct (of type simdutf::result containing the two
3907
   * fields error and count) with an error code and either position of the error
3908
   * (in the input in code units) if any, or the number of char16_t written
3909
   * if successful.
3910
   */
3911
  simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(
3912
      const char *input, size_t length,
3913
      char16_t *utf16_output) const noexcept = 0;
3914
3915
  /**
3916
   * Convert possibly broken UTF-8 string into UTF-16BE string and stop on
3917
   * error.
3918
   *
3919
   * During the conversion also validation of the input string is done.
3920
   * This function is suitable to work with inputs from untrusted sources.
3921
   *
3922
   * @param input         the UTF-8 string to convert
3923
   * @param length        the length of the string in bytes
3924
   * @param utf16_output  the pointer to buffer that can hold conversion result
3925
   * @return a result pair struct (of type simdutf::result containing the two
3926
   * fields error and count) with an error code and either position of the error
3927
   * (in the input in code units) if any, or the number of char16_t written
3928
   * if successful.
3929
   */
3930
  simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(
3931
      const char *input, size_t length,
3932
      char16_t *utf16_output) const noexcept = 0;
3933
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
3934
3935
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3936
  /**
3937
   * Convert possibly broken UTF-8 string into UTF-32 string.
3938
   *
3939
   * During the conversion also validation of the input string is done.
3940
   * This function is suitable to work with inputs from untrusted sources.
3941
   *
3942
   * @param input         the UTF-8 string to convert
3943
   * @param length        the length of the string in bytes
3944
   * @param utf32_output  the pointer to buffer that can hold conversion result
3945
   * @return the number of written char32_t; 0 if the input was not valid UTF-8
3946
   * string
3947
   */
3948
  simdutf_warn_unused virtual size_t
3949
  convert_utf8_to_utf32(const char *input, size_t length,
3950
                        char32_t *utf32_output) const noexcept = 0;
3951
3952
  /**
3953
   * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
3954
   *
3955
   * During the conversion also validation of the input string is done.
3956
   * This function is suitable to work with inputs from untrusted sources.
3957
   *
3958
   * @param input         the UTF-8 string to convert
3959
   * @param length        the length of the string in bytes
3960
   * @param utf32_output  the pointer to buffer that can hold conversion result
3961
   * @return a result pair struct (of type simdutf::result containing the two
3962
   * fields error and count) with an error code and either position of the error
3963
   * (in the input in code units) if any, or the number of char32_t written if
3964
   * successful.
3965
   */
3966
  simdutf_warn_unused virtual result
3967
  convert_utf8_to_utf32_with_errors(const char *input, size_t length,
3968
                                    char32_t *utf32_output) const noexcept = 0;
3969
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3970
3971
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
3972
  /**
3973
   * Convert valid UTF-8 string into UTF-16LE string.
3974
   *
3975
   * This function assumes that the input string is valid UTF-8.
3976
   *
3977
   * @param input         the UTF-8 string to convert
3978
   * @param length        the length of the string in bytes
3979
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
3980
   * @return the number of written char16_t
3981
   */
3982
  simdutf_warn_unused virtual size_t
3983
  convert_valid_utf8_to_utf16le(const char *input, size_t length,
3984
                                char16_t *utf16_buffer) const noexcept = 0;
3985
3986
  /**
3987
   * Convert valid UTF-8 string into UTF-16BE string.
3988
   *
3989
   * This function assumes that the input string is valid UTF-8.
3990
   *
3991
   * @param input         the UTF-8 string to convert
3992
   * @param length        the length of the string in bytes
3993
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
3994
   * @return the number of written char16_t
3995
   */
3996
  simdutf_warn_unused virtual size_t
3997
  convert_valid_utf8_to_utf16be(const char *input, size_t length,
3998
                                char16_t *utf16_buffer) const noexcept = 0;
3999
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4000
4001
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4002
  /**
4003
   * Convert valid UTF-8 string into UTF-32 string.
4004
   *
4005
   * This function assumes that the input string is valid UTF-8.
4006
   *
4007
   * @param input         the UTF-8 string to convert
4008
   * @param length        the length of the string in bytes
4009
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
4010
   * @return the number of written char32_t
4011
   */
4012
  simdutf_warn_unused virtual size_t
4013
  convert_valid_utf8_to_utf32(const char *input, size_t length,
4014
                              char32_t *utf32_buffer) const noexcept = 0;
4015
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4016
4017
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4018
  /**
4019
   * Compute the number of 2-byte code units that this UTF-8 string would
4020
   * require in UTF-16LE format.
4021
   *
4022
   * This function does not validate the input. It is acceptable to pass invalid
4023
   * UTF-8 strings but in such cases the result is implementation defined.
4024
   *
4025
   * @param input         the UTF-8 string to process
4026
   * @param length        the length of the string in bytes
4027
   * @return the number of char16_t code units required to encode the UTF-8
4028
   * string as UTF-16LE
4029
   */
4030
  simdutf_warn_unused virtual size_t
4031
  utf16_length_from_utf8(const char *input, size_t length) const noexcept = 0;
4032
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4033
4034
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4035
  /**
4036
   * Compute the number of 4-byte code units that this UTF-8 string would
4037
   * require in UTF-32 format.
4038
   *
4039
   * This function is equivalent to count_utf8. It is acceptable to pass invalid
4040
   * UTF-8 strings but in such cases the result is implementation defined.
4041
   *
4042
   * This function does not validate the input.
4043
   *
4044
   * @param input         the UTF-8 string to process
4045
   * @param length        the length of the string in bytes
4046
   * @return the number of char32_t code units required to encode the UTF-8
4047
   * string as UTF-32
4048
   */
4049
  simdutf_warn_unused virtual size_t
4050
  utf32_length_from_utf8(const char *input, size_t length) const noexcept = 0;
4051
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4052
4053
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4054
  /**
4055
   * Convert possibly broken UTF-16LE string into Latin1 string.
4056
   *
4057
   * During the conversion also validation of the input string is done.
4058
   * This function is suitable to work with inputs from untrusted sources.
4059
   *
4060
   * This function is not BOM-aware.
4061
   *
4062
   * @param input         the UTF-16LE string to convert
4063
   * @param length        the length of the string in 2-byte code units
4064
   * (char16_t)
4065
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4066
   * result
4067
   * @return number of written code units; 0 if input is not a valid UTF-16LE
4068
   * string or if it cannot be represented as Latin1
4069
   */
4070
  simdutf_warn_unused virtual size_t
4071
  convert_utf16le_to_latin1(const char16_t *input, size_t length,
4072
                            char *latin1_buffer) const noexcept = 0;
4073
4074
  /**
4075
   * Convert possibly broken UTF-16BE string into Latin1 string.
4076
   *
4077
   * During the conversion also validation of the input string is done.
4078
   * This function is suitable to work with inputs from untrusted sources.
4079
   *
4080
   * This function is not BOM-aware.
4081
   *
4082
   * @param input         the UTF-16BE string to convert
4083
   * @param length        the length of the string in 2-byte code units
4084
   * (char16_t)
4085
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4086
   * result
4087
   * @return number of written code units; 0 if input is not a valid UTF-16BE
4088
   * string or if it cannot be represented as Latin1
4089
   */
4090
  simdutf_warn_unused virtual size_t
4091
  convert_utf16be_to_latin1(const char16_t *input, size_t length,
4092
                            char *latin1_buffer) const noexcept = 0;
4093
4094
  /**
4095
   * Convert possibly broken UTF-16LE string into Latin1 string.
4096
   * If the string cannot be represented as Latin1, an error
4097
   * is returned.
4098
   *
4099
   * During the conversion also validation of the input string is done.
4100
   * This function is suitable to work with inputs from untrusted sources.
4101
   * This function is not BOM-aware.
4102
   *
4103
   * @param input         the UTF-16LE string to convert
4104
   * @param length        the length of the string in 2-byte code units
4105
   * (char16_t)
4106
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4107
   * result
4108
   * @return a result pair struct (of type simdutf::result containing the two
4109
   * fields error and count) with an error code and either position of the error
4110
   * (in the input in code units) if any, or the number of char written if
4111
   * successful.
4112
   */
4113
  simdutf_warn_unused virtual result
4114
  convert_utf16le_to_latin1_with_errors(const char16_t *input, size_t length,
4115
                                        char *latin1_buffer) const noexcept = 0;
4116
4117
  /**
4118
   * Convert possibly broken UTF-16BE string into Latin1 string.
4119
   * If the string cannot be represented as Latin1, an error
4120
   * is returned.
4121
   *
4122
   * During the conversion also validation of the input string is done.
4123
   * This function is suitable to work with inputs from untrusted sources.
4124
   * This function is not BOM-aware.
4125
   *
4126
   * @param input         the UTF-16BE string to convert
4127
   * @param length        the length of the string in 2-byte code units
4128
   * (char16_t)
4129
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4130
   * result
4131
   * @return a result pair struct (of type simdutf::result containing the two
4132
   * fields error and count) with an error code and either position of the error
4133
   * (in the input in code units) if any, or the number of char written if
4134
   * successful.
4135
   */
4136
  simdutf_warn_unused virtual result
4137
  convert_utf16be_to_latin1_with_errors(const char16_t *input, size_t length,
4138
                                        char *latin1_buffer) const noexcept = 0;
4139
4140
  /**
4141
   * Convert valid UTF-16LE string into Latin1 string.
4142
   *
4143
   * This function assumes that the input string is valid UTF-16LE and that it
4144
   * can be represented as Latin1. If you violate this assumption, the result is
4145
   * implementation defined and may include system-dependent behavior such as
4146
   * crashes.
4147
   *
4148
   * This function is for expert users only and not part of our public API. Use
4149
   * convert_utf16le_to_latin1 instead.
4150
   *
4151
   * This function is not BOM-aware.
4152
   *
4153
   * @param input         the UTF-16LE string to convert
4154
   * @param length        the length of the string in 2-byte code units
4155
   * (char16_t)
4156
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4157
   * result
4158
   * @return number of written code units; 0 if conversion is not possible
4159
   */
4160
  simdutf_warn_unused virtual size_t
4161
  convert_valid_utf16le_to_latin1(const char16_t *input, size_t length,
4162
                                  char *latin1_buffer) const noexcept = 0;
4163
4164
  /**
4165
   * Convert valid UTF-16BE string into Latin1 string.
4166
   *
4167
   * This function assumes that the input string is valid UTF-16BE and that it
4168
   * can be represented as Latin1. If you violate this assumption, the result is
4169
   * implementation defined and may include system-dependent behavior such as
4170
   * crashes.
4171
   *
4172
   * This function is for expert users only and not part of our public API. Use
4173
   * convert_utf16be_to_latin1 instead.
4174
   *
4175
   * This function is not BOM-aware.
4176
   *
4177
   * @param input         the UTF-16BE string to convert
4178
   * @param length        the length of the string in 2-byte code units
4179
   * (char16_t)
4180
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4181
   * result
4182
   * @return number of written code units; 0 if conversion is not possible
4183
   */
4184
  simdutf_warn_unused virtual size_t
4185
  convert_valid_utf16be_to_latin1(const char16_t *input, size_t length,
4186
                                  char *latin1_buffer) const noexcept = 0;
4187
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4188
4189
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4190
  /**
4191
   * Convert possibly broken UTF-16LE string into UTF-8 string.
4192
   *
4193
   * During the conversion also validation of the input string is done.
4194
   * This function is suitable to work with inputs from untrusted sources.
4195
   *
4196
   * This function is not BOM-aware.
4197
   *
4198
   * @param input         the UTF-16LE string to convert
4199
   * @param length        the length of the string in 2-byte code units
4200
   * (char16_t)
4201
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4202
   * @return number of written code units; 0 if input is not a valid UTF-16LE
4203
   * string
4204
   */
4205
  simdutf_warn_unused virtual size_t
4206
  convert_utf16le_to_utf8(const char16_t *input, size_t length,
4207
                          char *utf8_buffer) const noexcept = 0;
4208
4209
  /**
4210
   * Convert possibly broken UTF-16BE string into UTF-8 string.
4211
   *
4212
   * During the conversion also validation of the input string is done.
4213
   * This function is suitable to work with inputs from untrusted sources.
4214
   *
4215
   * This function is not BOM-aware.
4216
   *
4217
   * @param input         the UTF-16BE string to convert
4218
   * @param length        the length of the string in 2-byte code units
4219
   * (char16_t)
4220
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4221
   * @return number of written code units; 0 if input is not a valid UTF-16BE
4222
   * string
4223
   */
4224
  simdutf_warn_unused virtual size_t
4225
  convert_utf16be_to_utf8(const char16_t *input, size_t length,
4226
                          char *utf8_buffer) const noexcept = 0;
4227
4228
  /**
4229
   * Convert possibly broken UTF-16LE string into UTF-8 string and stop on
4230
   * error.
4231
   *
4232
   * During the conversion also validation of the input string is done.
4233
   * This function is suitable to work with inputs from untrusted sources.
4234
   *
4235
   * This function is not BOM-aware.
4236
   *
4237
   * @param input         the UTF-16LE string to convert
4238
   * @param length        the length of the string in 2-byte code units
4239
   * (char16_t)
4240
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4241
   * @return a result pair struct (of type simdutf::result containing the two
4242
   * fields error and count) with an error code and either position of the error
4243
   * (in the input in code units) if any, or the number of char written if
4244
   * successful.
4245
   */
4246
  simdutf_warn_unused virtual result
4247
  convert_utf16le_to_utf8_with_errors(const char16_t *input, size_t length,
4248
                                      char *utf8_buffer) const noexcept = 0;
4249
4250
  /**
4251
   * Convert possibly broken UTF-16BE string into UTF-8 string and stop on
4252
   * error.
4253
   *
4254
   * During the conversion also validation of the input string is done.
4255
   * This function is suitable to work with inputs from untrusted sources.
4256
   *
4257
   * This function is not BOM-aware.
4258
   *
4259
   * @param input         the UTF-16BE string to convert
4260
   * @param length        the length of the string in 2-byte code units
4261
   * (char16_t)
4262
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4263
   * @return a result pair struct (of type simdutf::result containing the two
4264
   * fields error and count) with an error code and either position of the error
4265
   * (in the input in code units) if any, or the number of char written if
4266
   * successful.
4267
   */
4268
  simdutf_warn_unused virtual result
4269
  convert_utf16be_to_utf8_with_errors(const char16_t *input, size_t length,
4270
                                      char *utf8_buffer) const noexcept = 0;
4271
4272
  /**
4273
   * Convert valid UTF-16LE string into UTF-8 string.
4274
   *
4275
   * This function assumes that the input string is valid UTF-16LE.
4276
   *
4277
   * This function is not BOM-aware.
4278
   *
4279
   * @param input         the UTF-16LE string to convert
4280
   * @param length        the length of the string in 2-byte code units
4281
   * (char16_t)
4282
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
4283
   * result
4284
   * @return number of written code units; 0 if conversion is not possible
4285
   */
4286
  simdutf_warn_unused virtual size_t
4287
  convert_valid_utf16le_to_utf8(const char16_t *input, size_t length,
4288
                                char *utf8_buffer) const noexcept = 0;
4289
4290
  /**
4291
   * Convert valid UTF-16BE string into UTF-8 string.
4292
   *
4293
   * This function assumes that the input string is valid UTF-16BE.
4294
   *
4295
   * This function is not BOM-aware.
4296
   *
4297
   * @param input         the UTF-16BE string to convert
4298
   * @param length        the length of the string in 2-byte code units
4299
   * (char16_t)
4300
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
4301
   * result
4302
   * @return number of written code units; 0 if conversion is not possible
4303
   */
4304
  simdutf_warn_unused virtual size_t
4305
  convert_valid_utf16be_to_utf8(const char16_t *input, size_t length,
4306
                                char *utf8_buffer) const noexcept = 0;
4307
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4308
4309
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4310
  /**
4311
   * Convert possibly broken UTF-16LE string into UTF-32 string.
4312
   *
4313
   * During the conversion also validation of the input string is done.
4314
   * This function is suitable to work with inputs from untrusted sources.
4315
   *
4316
   * This function is not BOM-aware.
4317
   *
4318
   * @param input         the UTF-16LE string to convert
4319
   * @param length        the length of the string in 2-byte code units
4320
   * (char16_t)
4321
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4322
   * @return number of written code units; 0 if input is not a valid UTF-16LE
4323
   * string
4324
   */
4325
  simdutf_warn_unused virtual size_t
4326
  convert_utf16le_to_utf32(const char16_t *input, size_t length,
4327
                           char32_t *utf32_buffer) const noexcept = 0;
4328
4329
  /**
4330
   * Convert possibly broken UTF-16BE string into UTF-32 string.
4331
   *
4332
   * During the conversion also validation of the input string is done.
4333
   * This function is suitable to work with inputs from untrusted sources.
4334
   *
4335
   * This function is not BOM-aware.
4336
   *
4337
   * @param input         the UTF-16BE string to convert
4338
   * @param length        the length of the string in 2-byte code units
4339
   * (char16_t)
4340
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4341
   * @return number of written code units; 0 if input is not a valid UTF-16BE
4342
   * string
4343
   */
4344
  simdutf_warn_unused virtual size_t
4345
  convert_utf16be_to_utf32(const char16_t *input, size_t length,
4346
                           char32_t *utf32_buffer) const noexcept = 0;
4347
4348
  /**
4349
   * Convert possibly broken UTF-16LE string into UTF-32 string and stop on
4350
   * error.
4351
   *
4352
   * During the conversion also validation of the input string is done.
4353
   * This function is suitable to work with inputs from untrusted sources.
4354
   *
4355
   * This function is not BOM-aware.
4356
   *
4357
   * @param input         the UTF-16LE string to convert
4358
   * @param length        the length of the string in 2-byte code units
4359
   * (char16_t)
4360
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4361
   * @return a result pair struct (of type simdutf::result containing the two
4362
   * fields error and count) with an error code and either position of the error
4363
   * (in the input in code units) if any, or the number of char32_t written if
4364
   * successful.
4365
   */
4366
  simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(
4367
      const char16_t *input, size_t length,
4368
      char32_t *utf32_buffer) const noexcept = 0;
4369
4370
  /**
4371
   * Convert possibly broken UTF-16BE string into UTF-32 string and stop on
4372
   * error.
4373
   *
4374
   * During the conversion also validation of the input string is done.
4375
   * This function is suitable to work with inputs from untrusted sources.
4376
   *
4377
   * This function is not BOM-aware.
4378
   *
4379
   * @param input         the UTF-16BE string to convert
4380
   * @param length        the length of the string in 2-byte code units
4381
   * (char16_t)
4382
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4383
   * @return a result pair struct (of type simdutf::result containing the two
4384
   * fields error and count) with an error code and either position of the error
4385
   * (in the input in code units) if any, or the number of char32_t written if
4386
   * successful.
4387
   */
4388
  simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(
4389
      const char16_t *input, size_t length,
4390
      char32_t *utf32_buffer) const noexcept = 0;
4391
4392
  /**
4393
   * Convert valid UTF-16LE string into UTF-32 string.
4394
   *
4395
   * This function assumes that the input string is valid UTF-16LE.
4396
   *
4397
   * This function is not BOM-aware.
4398
   *
4399
   * @param input         the UTF-16LE string to convert
4400
   * @param length        the length of the string in 2-byte code units
4401
   * (char16_t)
4402
   * @param utf32_buffer   the pointer to a buffer that can hold the conversion
4403
   * result
4404
   * @return number of written code units; 0 if conversion is not possible
4405
   */
4406
  simdutf_warn_unused virtual size_t
4407
  convert_valid_utf16le_to_utf32(const char16_t *input, size_t length,
4408
                                 char32_t *utf32_buffer) const noexcept = 0;
4409
4410
  /**
4411
   * Convert valid UTF-16BE string into UTF-32 string.
4412
   *
4413
   * This function assumes that the input string is valid UTF-16BE.
4414
   *
4415
   * This function is not BOM-aware.
4416
   *
4417
   * @param input         the UTF-16BE string to convert
4418
   * @param length        the length of the string in 2-byte code units
4419
   * (char16_t)
4420
   * @param utf32_buffer   the pointer to a buffer that can hold the conversion
4421
   * result
4422
   * @return number of written code units; 0 if conversion is not possible
4423
   */
4424
  simdutf_warn_unused virtual size_t
4425
  convert_valid_utf16be_to_utf32(const char16_t *input, size_t length,
4426
                                 char32_t *utf32_buffer) const noexcept = 0;
4427
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4428
4429
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4430
  /**
4431
   * Compute the number of bytes that this UTF-16LE string would require in
4432
   * UTF-8 format.
4433
   *
4434
   * This function does not validate the input. It is acceptable to pass invalid
4435
   * UTF-16 strings but in such cases the result is implementation defined.
4436
   *
4437
   * This function is not BOM-aware.
4438
   *
4439
   * @param input         the UTF-16LE string to convert
4440
   * @param length        the length of the string in 2-byte code units
4441
   * (char16_t)
4442
   * @return the number of bytes required to encode the UTF-16LE string as UTF-8
4443
   */
4444
  simdutf_warn_unused virtual size_t
4445
  utf8_length_from_utf16le(const char16_t *input,
4446
                           size_t length) const noexcept = 0;
4447
4448
  /**
4449
   * Compute the number of bytes that this UTF-16BE string would require in
4450
   * UTF-8 format.
4451
   *
4452
   * This function does not validate the input. It is acceptable to pass invalid
4453
   * UTF-16 strings but in such cases the result is implementation defined.
4454
   *
4455
   * This function is not BOM-aware.
4456
   *
4457
   * @param input         the UTF-16BE string to convert
4458
   * @param length        the length of the string in 2-byte code units
4459
   * (char16_t)
4460
   * @return the number of bytes required to encode the UTF-16BE string as UTF-8
4461
   */
4462
  simdutf_warn_unused virtual size_t
4463
  utf8_length_from_utf16be(const char16_t *input,
4464
                           size_t length) const noexcept = 0;
4465
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4466
4467
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4468
  /**
4469
   * Convert possibly broken UTF-32 string into Latin1 string.
4470
   *
4471
   * During the conversion also validation of the input string is done.
4472
   * This function is suitable to work with inputs from untrusted sources.
4473
   *
4474
   * This function is not BOM-aware.
4475
   *
4476
   * @param input         the UTF-32 string to convert
4477
   * @param length        the length of the string in 4-byte code units
4478
   * (char32_t)
4479
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4480
   * result
4481
   * @return number of written code units; 0 if input is not a valid UTF-32
4482
   * string
4483
   */
4484
  simdutf_warn_unused virtual size_t
4485
  convert_utf32_to_latin1(const char32_t *input, size_t length,
4486
                          char *latin1_buffer) const noexcept = 0;
4487
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4488
4489
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4490
  /**
4491
   * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
4492
   * If the string cannot be represented as Latin1, an error is returned.
4493
   *
4494
   * During the conversion also validation of the input string is done.
4495
   * This function is suitable to work with inputs from untrusted sources.
4496
   *
4497
   * This function is not BOM-aware.
4498
   *
4499
   * @param input         the UTF-32 string to convert
4500
   * @param length        the length of the string in 4-byte code units
4501
   * (char32_t)
4502
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4503
   * result
4504
   * @return a result pair struct (of type simdutf::result containing the two
4505
   * fields error and count) with an error code and either position of the error
4506
   * (in the input in code units) if any, or the number of char written if
4507
   * successful.
4508
   */
4509
  simdutf_warn_unused virtual result
4510
  convert_utf32_to_latin1_with_errors(const char32_t *input, size_t length,
4511
                                      char *latin1_buffer) const noexcept = 0;
4512
4513
  /**
4514
   * Convert valid UTF-32 string into Latin1 string.
4515
   *
4516
   * This function assumes that the input string is valid UTF-32 and can be
4517
   * represented as Latin1. If you violate this assumption, the result is
4518
   * implementation defined and may include system-dependent behavior such as
4519
   * crashes.
4520
   *
4521
   * This function is for expert users only and not part of our public API. Use
4522
   * convert_utf32_to_latin1 instead.
4523
   *
4524
   * This function is not BOM-aware.
4525
   *
4526
   * @param input         the UTF-32 string to convert
4527
   * @param length        the length of the string in 4-byte code units
4528
   * (char32_t)
4529
   * @param latin1_buffer   the pointer to a buffer that can hold the conversion
4530
   * result
4531
   * @return number of written code units; 0 if conversion is not possible
4532
   */
4533
  simdutf_warn_unused virtual size_t
4534
  convert_valid_utf32_to_latin1(const char32_t *input, size_t length,
4535
                                char *latin1_buffer) const noexcept = 0;
4536
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4537
4538
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4539
  /**
4540
   * Convert possibly broken UTF-32 string into UTF-8 string.
4541
   *
4542
   * During the conversion also validation of the input string is done.
4543
   * This function is suitable to work with inputs from untrusted sources.
4544
   *
4545
   * This function is not BOM-aware.
4546
   *
4547
   * @param input         the UTF-32 string to convert
4548
   * @param length        the length of the string in 4-byte code units
4549
   * (char32_t)
4550
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4551
   * @return number of written code units; 0 if input is not a valid UTF-32
4552
   * string
4553
   */
4554
  simdutf_warn_unused virtual size_t
4555
  convert_utf32_to_utf8(const char32_t *input, size_t length,
4556
                        char *utf8_buffer) const noexcept = 0;
4557
4558
  /**
4559
   * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
4560
   *
4561
   * During the conversion also validation of the input string is done.
4562
   * This function is suitable to work with inputs from untrusted sources.
4563
   *
4564
   * This function is not BOM-aware.
4565
   *
4566
   * @param input         the UTF-32 string to convert
4567
   * @param length        the length of the string in 4-byte code units
4568
   * (char32_t)
4569
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4570
   * @return a result pair struct (of type simdutf::result containing the two
4571
   * fields error and count) with an error code and either position of the error
4572
   * (in the input in code units) if any, or the number of char written if
4573
   * successful.
4574
   */
4575
  simdutf_warn_unused virtual result
4576
  convert_utf32_to_utf8_with_errors(const char32_t *input, size_t length,
4577
                                    char *utf8_buffer) const noexcept = 0;
4578
4579
  /**
4580
   * Convert valid UTF-32 string into UTF-8 string.
4581
   *
4582
   * This function assumes that the input string is valid UTF-32.
4583
   *
4584
   * This function is not BOM-aware.
4585
   *
4586
   * @param input         the UTF-32 string to convert
4587
   * @param length        the length of the string in 4-byte code units
4588
   * (char32_t)
4589
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
4590
   * result
4591
   * @return number of written code units; 0 if conversion is not possible
4592
   */
4593
  simdutf_warn_unused virtual size_t
4594
  convert_valid_utf32_to_utf8(const char32_t *input, size_t length,
4595
                              char *utf8_buffer) const noexcept = 0;
4596
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4597
4598
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4599
  /**
4600
   * Return the number of 2-byte code units (char16_t) that a Latin1 string
4601
   * would require in UTF-16 format.
4602
   *
4603
   * Each Latin1 byte maps to exactly one UTF-16 code unit, so the result is
4604
   * the input length itself; the string contents are not needed.
4605
   *
4606
   * @param length        the length of the Latin1 string in bytes
4607
   * @return the number of char16_t code units required to encode the string
4608
   */
4609
  simdutf_warn_unused virtual size_t
4610
  utf16_length_from_latin1(size_t length) const noexcept {
4611
    return length;
4612
  }
4613
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4614
4615
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4616
  /**
4617
   * Convert possibly broken UTF-32 string into UTF-16LE string.
4618
   *
4619
   * During the conversion also validation of the input string is done.
4620
   * This function is suitable to work with inputs from untrusted sources.
4621
   *
4622
   * This function is not BOM-aware.
4623
   *
4624
   * @param input         the UTF-32 string to convert
4625
   * @param length        the length of the string in 4-byte code units
4626
   * (char32_t)
4627
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4628
   * @return number of written code units; 0 if input is not a valid UTF-32
4629
   * string
4630
   */
4631
  simdutf_warn_unused virtual size_t
4632
  convert_utf32_to_utf16le(const char32_t *input, size_t length,
4633
                           char16_t *utf16_buffer) const noexcept = 0;
4634
4635
  /**
4636
   * Convert possibly broken UTF-32 string into UTF-16BE string.
4637
   *
4638
   * During the conversion also validation of the input string is done.
4639
   * This function is suitable to work with inputs from untrusted sources.
4640
   *
4641
   * This function is not BOM-aware.
4642
   *
4643
   * @param input         the UTF-32 string to convert
4644
   * @param length        the length of the string in 4-byte code units
4645
   * (char32_t)
4646
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4647
   * @return number of written code units; 0 if input is not a valid UTF-32
4648
   * string
4649
   */
4650
  simdutf_warn_unused virtual size_t
4651
  convert_utf32_to_utf16be(const char32_t *input, size_t length,
4652
                           char16_t *utf16_buffer) const noexcept = 0;
4653
4654
  /**
4655
   * Convert possibly broken UTF-32 string into UTF-16LE string and stop on
4656
   * error.
4657
   *
4658
   * During the conversion also validation of the input string is done.
4659
   * This function is suitable to work with inputs from untrusted sources.
4660
   *
4661
   * This function is not BOM-aware.
4662
   *
4663
   * @param input         the UTF-32 string to convert
4664
   * @param length        the length of the string in 4-byte code units
4665
   * (char32_t)
4666
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4667
   * @return a result pair struct (of type simdutf::result containing the two
4668
   * fields error and count) with an error code and either position of the error
4669
   * (in the input in code units) if any, or the number of char16_t written if
4670
   * successful.
4671
   */
4672
  simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(
4673
      const char32_t *input, size_t length,
4674
      char16_t *utf16_buffer) const noexcept = 0;
4675
4676
  /**
4677
   * Convert possibly broken UTF-32 string into UTF-16BE string and stop on
4678
   * error.
4679
   *
4680
   * During the conversion also validation of the input string is done.
4681
   * This function is suitable to work with inputs from untrusted sources.
4682
   *
4683
   * This function is not BOM-aware.
4684
   *
4685
   * @param input         the UTF-32 string to convert
4686
   * @param length        the length of the string in 4-byte code units
4687
   * (char32_t)
4688
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4689
   * @return a result pair struct (of type simdutf::result containing the two
4690
   * fields error and count) with an error code and either position of the error
4691
   * (in the input in code units) if any, or the number of char16_t written if
4692
   * successful.
4693
   */
4694
  simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(
4695
      const char32_t *input, size_t length,
4696
      char16_t *utf16_buffer) const noexcept = 0;
4697
4698
  /**
4699
   * Convert valid UTF-32 string into UTF-16LE string.
4700
   *
4701
   * This function assumes that the input string is valid UTF-32.
4702
   *
4703
   * This function is not BOM-aware.
4704
   *
4705
   * @param input         the UTF-32 string to convert
4706
   * @param length        the length of the string in 4-byte code units
4707
   * (char32_t)
4708
   * @param utf16_buffer   the pointer to a buffer that can hold the conversion
4709
   * result
4710
   * @return number of written code units; 0 if conversion is not possible
4711
   */
4712
  simdutf_warn_unused virtual size_t
4713
  convert_valid_utf32_to_utf16le(const char32_t *input, size_t length,
4714
                                 char16_t *utf16_buffer) const noexcept = 0;
4715
4716
  /**
4717
   * Convert valid UTF-32 string into UTF-16BE string.
4718
   *
4719
   * This function assumes that the input string is valid UTF-32.
4720
   *
4721
   * This function is not BOM-aware.
4722
   *
4723
   * @param input         the UTF-32 string to convert
4724
   * @param length        the length of the string in 4-byte code units
4725
   * (char32_t)
4726
   * @param utf16_buffer   the pointer to a buffer that can hold the conversion
4727
   * result
4728
   * @return number of written code units; 0 if conversion is not possible
4729
   */
4730
  simdutf_warn_unused virtual size_t
4731
  convert_valid_utf32_to_utf16be(const char32_t *input, size_t length,
4732
                                 char16_t *utf16_buffer) const noexcept = 0;
4733
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4734
4735
#if SIMDUTF_FEATURE_UTF16
4736
  /**
4737
   * Change the endianness of the input. Can be used to go from UTF-16LE to
4738
   * UTF-16BE or from UTF-16BE to UTF-16LE.
4739
   *
4740
   * This function does not validate the input.
4741
   *
4742
   * This function is not BOM-aware.
4743
   *
4744
   * @param input         the UTF-16 string to process
4745
   * @param length        the length of the string in 2-byte code units
4746
   * (char16_t)
4747
   * @param output        the pointer to a buffer that can hold the conversion
4748
   * result
4749
   */
4750
  virtual void change_endianness_utf16(const char16_t *input, size_t length,
4751
                                       char16_t *output) const noexcept = 0;
4752
#endif // SIMDUTF_FEATURE_UTF16
4753
4754
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4755
  /**
4756
   * Return the number of bytes that this Latin1 string would require in UTF-8
4757
   * format.
4758
   *
4759
   * @param input         the Latin1 string to convert
4760
   * @param length        the length of the string bytes
4761
   * @return the number of bytes required to encode the Latin1 string as UTF-8
4762
   */
4763
  simdutf_warn_unused virtual size_t
4764
  utf8_length_from_latin1(const char *input, size_t length) const noexcept = 0;
4765
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4766
4767
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4768
  /**
4769
   * Compute the number of bytes that this UTF-32 string would require in UTF-8
4770
   * format.
4771
   *
4772
   * This function does not validate the input. It is acceptable to pass invalid
4773
   * UTF-32 strings but in such cases the result is implementation defined.
4774
   *
4775
   * @param input         the UTF-32 string to convert
4776
   * @param length        the length of the string in 4-byte code units
4777
   * (char32_t)
4778
   * @return the number of bytes required to encode the UTF-32 string as UTF-8
4779
   */
4780
  simdutf_warn_unused virtual size_t
4781
  utf8_length_from_utf32(const char32_t *input,
4782
                         size_t length) const noexcept = 0;
4783
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4784
4785
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4786
  /**
4787
   * Compute the number of bytes that this UTF-32 string would require in Latin1
4788
   * format.
4789
   *
4790
   * This function does not validate the input. It is acceptable to pass invalid
4791
   * UTF-32 strings but in such cases the result is implementation defined.
4792
   *
4793
   * @param length        the length of the string in 4-byte code units
4794
   * (char32_t)
4795
   * @return the number of bytes required to encode the UTF-32 string as Latin1
4796
   */
4797
  simdutf_warn_unused virtual size_t
4798
  latin1_length_from_utf32(size_t length) const noexcept {
4799
    return length; // default: the input length is returned unchanged
4800
  }
4801
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4802
4803
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4804
  /**
4805
   * Compute the number of bytes that this UTF-8 string would require in Latin1
4806
   * format.
4807
   *
4808
   * This function does not validate the input. It is acceptable to pass invalid
4809
   * UTF-8 strings but in such cases the result is implementation defined.
4810
   *
4811
   * @param input         the UTF-8 string to convert
4812
   * @param length        the length of the string in byte
4813
   * @return the number of bytes required to encode the UTF-8 string as Latin1
4814
   */
4815
  simdutf_warn_unused virtual size_t
4816
  latin1_length_from_utf8(const char *input, size_t length) const noexcept = 0;
4817
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4818
4819
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4820
  /**
4821
   * Compute the number of bytes that this UTF-16LE/BE string would require in
4822
   * Latin1 format.
4823
   *
4824
   * This function does not validate the input. It is acceptable to pass invalid
4825
   * UTF-16 strings but in such cases the result is implementation defined.
4826
   *
4827
   * This function is not BOM-aware.
4828
   *
4829
   * Only the length is needed; the UTF-16 data itself is not inspected.
4830
   * @param length        the length of the string in 2-byte code units
4831
   * (char16_t)
4832
   * @return the number of bytes required to encode the UTF-16LE string as
4833
   * Latin1
4834
   */
4835
  simdutf_warn_unused virtual size_t
4836
  latin1_length_from_utf16(size_t length) const noexcept {
4837
    return length; // default: the input length is returned unchanged
4838
  }
4839
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4840
4841
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4842
  /**
4843
   * Compute the number of two-byte code units that this UTF-32 string would
4844
   * require in UTF-16 format.
4845
   *
4846
   * This function does not validate the input. It is acceptable to pass invalid
4847
   * UTF-32 strings but in such cases the result is implementation defined.
4848
   *
4849
   * @param input         the UTF-32 string to convert
4850
   * @param length        the length of the string in 4-byte code units
4851
   * (char32_t)
4852
   * @return the number of char16_t units needed to encode the string as UTF-16
4853
   */
4854
  simdutf_warn_unused virtual size_t
4855
  utf16_length_from_utf32(const char32_t *input,
4856
                          size_t length) const noexcept = 0;
4857
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4858
4859
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4860
  /**
4861
   * Return the number of 4-byte code units (char32_t) that this Latin1 string
4862
   * would require in UTF-32 format.
4863
   *
4864
   * @param length        the length of the Latin1 string in bytes (one byte
4865
   * per character)
4866
   * @return the number of char32_t units needed to encode the string as UTF-32
4867
   */
4868
  simdutf_warn_unused virtual size_t
4869
  utf32_length_from_latin1(size_t length) const noexcept {
4870
    return length; // default: the input length is returned unchanged
4871
  }
4872
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4873
4874
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4875
  /**
4876
   * Compute the number of 4-byte code units (char32_t) that this UTF-16LE
4877
   * string would require in UTF-32 format.
4878
   *
4879
   * This function is equivalent to count_utf16le.
4880
   *
4881
   * This function does not validate the input. It is acceptable to pass invalid
4882
   * UTF-16 strings but in such cases the result is implementation defined.
4883
   *
4884
   * This function is not BOM-aware.
4885
   *
4886
   * @param input         the UTF-16LE string to convert
4887
   * @param length        the length of the string in 2-byte code units
4888
   * (char16_t)
4889
   * @return the number of 4-byte code units (char32_t) required to encode the
4890
   * UTF-16LE string as UTF-32
4891
   */
4892
  simdutf_warn_unused virtual size_t
4893
  utf32_length_from_utf16le(const char16_t *input,
4894
                            size_t length) const noexcept = 0;
4895
4896
  /**
4897
   * Compute the number of 4-byte code units (char32_t) that this UTF-16BE
4898
   * string would require in UTF-32 format.
4899
   *
4900
   * This function is equivalent to count_utf16be.
4901
   *
4902
   * This function does not validate the input. It is acceptable to pass invalid
4903
   * UTF-16 strings but in such cases the result is implementation defined.
4904
   *
4905
   * This function is not BOM-aware.
4906
   *
4907
   * @param input         the UTF-16BE string to convert
4908
   * @param length        the length of the string in 2-byte code units
4909
   * (char16_t)
4910
   * @return the number of 4-byte code units (char32_t) required to encode the
4911
   * UTF-16BE string as UTF-32
4912
   */
4913
  simdutf_warn_unused virtual size_t
4914
  utf32_length_from_utf16be(const char16_t *input,
4915
                            size_t length) const noexcept = 0;
4916
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4917
4918
#if SIMDUTF_FEATURE_UTF16
4919
  /**
4920
   * Count the number of code points (characters) in the string assuming that
4921
   * it is valid.
4922
   *
4923
   * This function assumes that the input string is valid UTF-16LE.
4924
   * It is acceptable to pass invalid UTF-16 strings but in such cases
4925
   * the result is implementation defined.
4926
   *
4927
   * This function is not BOM-aware.
4928
   *
4929
   * @param input         the UTF-16LE string to process
4930
   * @param length        the length of the string in 2-byte code units
4931
   * (char16_t)
4932
   * @return number of code points
4933
   */
4934
  simdutf_warn_unused virtual size_t
4935
  count_utf16le(const char16_t *input, size_t length) const noexcept = 0;
4936
4937
  /**
4938
   * Count the number of code points (characters) in the string assuming that
4939
   * it is valid.
4940
   *
4941
   * This function assumes that the input string is valid UTF-16BE.
4942
   * It is acceptable to pass invalid UTF-16 strings but in such cases
4943
   * the result is implementation defined.
4944
   *
4945
   * This function is not BOM-aware.
4946
   *
4947
   * @param input         the UTF-16BE string to process
4948
   * @param length        the length of the string in 2-byte code units
4949
   * (char16_t)
4950
   * @return number of code points
4951
   */
4952
  simdutf_warn_unused virtual size_t
4953
  count_utf16be(const char16_t *input, size_t length) const noexcept = 0;
4954
#endif // SIMDUTF_FEATURE_UTF16
4955
4956
#if SIMDUTF_FEATURE_UTF8
4957
  /**
4958
   * Count the number of code points (characters) in the string assuming that
4959
   * it is valid.
4960
   *
4961
   * This function assumes that the input string is valid UTF-8.
4962
   * It is acceptable to pass invalid UTF-8 strings but in such cases
4963
   * the result is implementation defined.
4964
   *
4965
   * @param input         the UTF-8 string to process
4966
   * @param length        the length of the string in bytes
4967
   * @return number of code points
4968
   */
4969
  simdutf_warn_unused virtual size_t
4970
  count_utf8(const char *input, size_t length) const noexcept = 0;
4971
#endif // SIMDUTF_FEATURE_UTF8
4972
4973
#if SIMDUTF_FEATURE_BASE64
4974
  /**
4975
   * Provide the maximal binary length in bytes given the base64 input.
4976
   * In general, if the input contains ASCII spaces, the result will be less
4977
   * than the maximum length. It is acceptable to pass invalid base64 strings
4978
   * but in such cases the result is implementation defined.
4979
   *
4980
   * @param input         the base64 input to process
4981
   * @param length        the length of the base64 input in bytes
4982
   * @return maximal number of binary bytes
4983
   */
4984
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
4985
      const char *input, size_t length) const noexcept;
4986
4987
  /**
4988
   * Provide the maximal binary length in bytes given the base64 input.
4989
   * In general, if the input contains ASCII spaces, the result will be less
4990
   * than the maximum length. It is acceptable to pass invalid base64 strings
4991
   * but in such cases the result is implementation defined.
4992
   *
4993
   * @param input         the base64 input to process, in ASCII stored as 16-bit
4994
   * units
4995
   * @param length        the length of the base64 input in 16-bit units
4996
   * @return maximal number of binary bytes
4997
   */
4998
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
4999
      const char16_t *input, size_t length) const noexcept;
5000
5001
  /**
5002
   * Convert a base64 input to a binary output.
5003
   *
5004
   * This function follows the WHATWG forgiving-base64 format, which means that
5005
   * it will ignore any ASCII spaces in the input. You may provide a padded
5006
   * input (with one or two equal signs at the end) or an unpadded input
5007
   * (without any equal signs at the end).
5008
   *
5009
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5010
   *
5011
   * This function will fail in case of invalid input. When last_chunk_options =
5012
   * loose, there are two possible reasons for failure: the input contains a
5013
   * number of base64 characters that when divided by 4, leaves a single
5014
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5015
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5016
   *
5017
   * You should call this function with a buffer that is at least
5018
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
5019
   * provide that much space, the function may cause a buffer overflow.
5020
   *
5021
   * @param input         the base64 string to process
5022
   * @param length        the length of the string in bytes
5023
   * @param output        the pointer to a buffer that can hold the conversion
5024
   * result (should be at least maximal_binary_length_from_base64(input, length)
5025
   * bytes long).
5026
   * @param options       the base64 options to use, can be base64_default or
5027
   * base64_url, is base64_default by default.
5028
   * @return a result pair struct (of type simdutf::result containing the two
5029
   * fields error and count) with an error code and either position of the error
5030
   * (in the input in bytes) if any, or the number of bytes written if
5031
   * successful.
5032
   */
5033
  simdutf_warn_unused virtual result
5034
  base64_to_binary(const char *input, size_t length, char *output,
5035
                   base64_options options = base64_default,
5036
                   last_chunk_handling_options last_chunk_options =
5037
                       last_chunk_handling_options::loose) const noexcept = 0;
5038
5039
  /**
5040
   * Convert a base64 input to a binary output while returning more details
5041
   * than base64_to_binary.
5042
   *
5043
   * This function follows the WHATWG forgiving-base64 format, which means that
5044
   * it will ignore any ASCII spaces in the input. You may provide a padded
5045
   * input (with one or two equal signs at the end) or an unpadded input
5046
   * (without any equal signs at the end).
5047
   *
5048
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5049
   *
5050
   * This function will fail in case of invalid input. When last_chunk_options =
5051
   * loose, there are two possible reasons for failure: the input contains a
5052
   * number of base64 characters that when divided by 4, leaves a single
5053
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5054
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5055
   *
5056
   * You should call this function with a buffer that is at least
5057
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
5058
   * provide that much space, the function may cause a buffer overflow.
5059
   *
5060
   * @param input         the base64 string to process
5061
   * @param length        the length of the string in bytes
5062
   * @param output        the pointer to a buffer that can hold the conversion
5063
   * result (should be at least maximal_binary_length_from_base64(input, length)
5064
   * bytes long).
5065
   * @param options       the base64 options to use, can be base64_default or
5066
   * base64_url, is base64_default by default.
5067
   * @return a full_result struct (of type simdutf::full_result containing the
5068
   * three fields error, input_count and output_count).
5069
   */
5070
  simdutf_warn_unused virtual full_result base64_to_binary_details(
5071
      const char *input, size_t length, char *output,
5072
      base64_options options = base64_default,
5073
      last_chunk_handling_options last_chunk_options =
5074
          last_chunk_handling_options::loose) const noexcept = 0;
5075
  /**
5076
   * Convert a base64 input to a binary output.
5077
   *
5078
   * This function follows the WHATWG forgiving-base64 format, which means that
5079
   * it will ignore any ASCII spaces in the input. You may provide a padded
5080
   * input (with one or two equal signs at the end) or an unpadded input
5081
   * (without any equal signs at the end).
5082
   *
5083
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5084
   *
5085
   * This function will fail in case of invalid input. When last_chunk_options =
5086
   * loose, there are two possible reasons for failure: the input contains a
5087
   * number of base64 characters that when divided by 4, leaves a single
5088
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5089
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5090
   *
5091
   * You should call this function with a buffer that is at least
5092
   * maximal_binary_length_from_base64(input, length) bytes long. If you
5093
   * fail to provide that much space, the function may cause a buffer overflow.
5094
   *
5095
   * @param input         the base64 string to process, in ASCII stored as
5096
   * 16-bit units
5097
   * @param length        the length of the string in 16-bit units
5098
   * @param output        the pointer to a buffer that can hold the conversion
5099
   * result (should be at least maximal_binary_length_from_base64(input, length)
5100
   * bytes long).
5101
   * @param options       the base64 options to use, can be base64_default or
5102
   * base64_url, is base64_default by default.
5103
   * @return a result pair struct (of type simdutf::result containing the two
5104
   * fields error and count) with an error code and position of the
5105
   * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the
5106
   * number of bytes written if successful.
5107
   */
5108
  simdutf_warn_unused virtual result
5109
  base64_to_binary(const char16_t *input, size_t length, char *output,
5110
                   base64_options options = base64_default,
5111
                   last_chunk_handling_options last_chunk_options =
5112
                       last_chunk_handling_options::loose) const noexcept = 0;
5113
5114
  /**
5115
   * Convert a base64 input to a binary output while returning more details
5116
   * than base64_to_binary.
5117
   *
5118
   * This function follows the WHATWG forgiving-base64 format, which means that
5119
   * it will ignore any ASCII spaces in the input. You may provide a padded
5120
   * input (with one or two equal signs at the end) or an unpadded input
5121
   * (without any equal signs at the end).
5122
   *
5123
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5124
   *
5125
   * This function will fail in case of invalid input. When last_chunk_options =
5126
   * loose, there are two possible reasons for failure: the input contains a
5127
   * number of base64 characters that when divided by 4, leaves a single
5128
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5129
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5130
   *
5131
   * You should call this function with a buffer that is at least
5132
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
5133
   * provide that much space, the function may cause a buffer overflow.
5134
   *
5135
   * @param input         the base64 string to process (ASCII as 16-bit units)
5136
   * @param length        the length of the string in 16-bit units
5137
   * @param output        the pointer to a buffer that can hold the conversion
5138
   * result (should be at least maximal_binary_length_from_base64(input, length)
5139
   * bytes long).
5140
   * @param options       the base64 options to use, can be base64_default or
5141
   * base64_url, is base64_default by default.
5142
   * @return a full_result struct (of type simdutf::full_result containing the
5143
   * three fields error, input_count and output_count).
5144
   */
5145
  simdutf_warn_unused virtual full_result base64_to_binary_details(
5146
      const char16_t *input, size_t length, char *output,
5147
      base64_options options = base64_default,
5148
      last_chunk_handling_options last_chunk_options =
5149
          last_chunk_handling_options::loose) const noexcept = 0;
5150
  /**
5151
   * Provide the base64 length in bytes given the length of a binary input.
5152
   *
5153
   * @param length        the length of the input in bytes
5154
   * @param options       the base64 options to use, can be base64_default or
5155
   * base64_url, is base64_default by default.
5156
   * @return number of base64 bytes
5157
   */
5158
  simdutf_warn_unused size_t base64_length_from_binary(
5159
      size_t length, base64_options options = base64_default) const noexcept;
5160
5161
  /**
5162
   * Convert a binary input to a base64 output.
5163
   *
5164
   * The default option (simdutf::base64_default) uses the characters `+` and
5165
   * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of
5166
   * the output to ensure that the output length is a multiple of four.
5167
   *
5168
   * The URL option (simdutf::base64_url) uses the characters `-` and `_` as
5169
   * part of its alphabet. No padding is added at the end of the output.
5170
   *
5171
   * This function always succeeds.
5172
   *
5173
   * @param input         the binary to process
5174
   * @param length        the length of the input in bytes
5175
   * @param output        the pointer to a buffer that can hold the conversion
5176
   * result (should be at least base64_length_from_binary(length) bytes long)
5177
   * @param options       the base64 options to use, can be base64_default or
5178
   * base64_url, is base64_default by default.
5179
   * @return number of written bytes, will be equal to
5180
   * base64_length_from_binary(length, options)
5181
   */
5182
  virtual size_t
5183
  binary_to_base64(const char *input, size_t length, char *output,
5184
                   base64_options options = base64_default) const noexcept = 0;
5185
  /**
5186
   * Find the first occurrence of a character in a string. If the character is
5187
   * not found, return a pointer to the end of the string.
5188
   * @param start        the start of the string
5189
   * @param end          the end of the string
5190
   * @param character    the character to find
5191
   * @return a pointer to the first occurrence of the character in the string,
5192
   * or a pointer to the end of the string if the character is not found.
5193
   *
5194
   */
5195
  virtual const char *find(const char *start, const char *end,
5196
                           char character) const noexcept = 0;
5197
  virtual const char16_t *find(const char16_t *start, const char16_t *end,
5198
                               char16_t character) const noexcept = 0;
5199
#endif // SIMDUTF_FEATURE_BASE64
5200
5201
#ifdef SIMDUTF_INTERNAL_TESTS
5202
  // This method is exported only in developer mode, its purpose
5203
  // is to expose some internal test procedures from the given
5204
  // implementation and then use them through our standard test
5205
  // framework.
5206
  //
5207
  // Regular users should not use it, the tests of the public
5208
  // API are enough.
5209
5210
  struct TestProcedure { // one named internal test
5211
    // display name
5212
    std::string name;
5213
5214
    // test to run on the given implementation; returns nothing (void)
5215
    void (*procedure)(const implementation &);
5216
  };
5217
5218
  virtual std::vector<TestProcedure> internal_tests() const;
5219
#endif
5220
5221
protected:
5222
  /** @private Construct an implementation with the given name and description.
5223
   * For subclasses. */
5224
  simdutf_really_inline implementation(const char *name,
5225
                                       const char *description,
5226
                                       uint32_t required_instruction_sets)
5227
      : _name(name), _description(description), // pointers stored, not copied
5228
        _required_instruction_sets(required_instruction_sets) {}
5229
5230
protected:
5231
  ~implementation() = default; // protected: not callable by outside code
5232
5233
private:
5234
  /**
5235
   * The name of this implementation.
5236
   */
5237
  const char *_name;
5238
5239
  /**
5240
   * The description of this implementation.
5241
   */
5242
  const char *_description;
5243
5244
  /**
5245
   * Instruction sets required for this implementation.
5246
   */
5247
  const uint32_t _required_instruction_sets;
5248
};
5249
5250
/** @private */
5251
namespace internal {
5252
5253
/**
5254
 * The list of available implementations compiled into simdutf.
5255
 */
5256
class available_implementation_list {
5257
public:
5258
  /** Construct the list; the class itself holds no data members */
5259
  simdutf_really_inline available_implementation_list() {}
5260
  /** Number of implementations */
5261
  size_t size() const noexcept;
5262
  /** STL const begin() iterator (enables range-for over the list) */
5263
  const implementation *const *begin() const noexcept;
5264
  /** STL const end() iterator */
5265
  const implementation *const *end() const noexcept;
5266
5267
  /**
5268
   * Get the implementation with the given name.
5269
   *
5270
   * Case sensitive.
5271
   *
5272
   *     const implementation *impl =
5273
   *         simdutf::get_available_implementations()["westmere"];
5274
   *     if (!impl || !impl->supported_by_runtime_system()) { exit(1); }
5275
   *     simdutf::get_active_implementation() = impl;
5276
   *
5277
   * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
5278
   * @return the implementation, or nullptr if no implementation has that name.
5279
   */
5280
  const implementation *operator[](const std::string &name) const noexcept {
5281
    for (const implementation *impl : *this) { // linear scan via begin()/end()
5282
      if (impl->name() == name) {
5283
        return impl; // first match wins
5284
      }
5285
    }
5286
    return nullptr;
5287
  }
5288
5289
  /**
5290
   * Detect the most advanced implementation supported by the current host.
5291
   *
5292
   * This is used to initialize the implementation on startup.
5293
   *
5294
   *     const implementation *impl =
5295
   *         simdutf::get_available_implementations().detect_best_supported();
5296
   *     simdutf::get_active_implementation() = impl;
5297
   *
5298
   * @return the most advanced supported implementation for the current host, or
5299
   * an implementation that returns UNSUPPORTED_ARCHITECTURE if there is no
5300
   * supported implementation. Will never return nullptr.
5301
   */
5302
  const implementation *detect_best_supported() const noexcept;
5303
};
5304
5305
template <typename T> class atomic_ptr { // holds a T*; atomic unless NO_THREADS
5306
public:
5307
  atomic_ptr(T *_ptr) : ptr{_ptr} {} // not explicit: converts from T* implicitly
5308
5309
#if defined(SIMDUTF_NO_THREADS) // no-threads build: plain pointer accesses
5310
  operator const T *() const { return ptr; }
5311
  const T &operator*() const { return *ptr; }
5312
  const T *operator->() const { return ptr; }
5313
5314
  operator T *() { return ptr; }
5315
  T &operator*() { return *ptr; }
5316
  T *operator->() { return ptr; }
5317
  atomic_ptr &operator=(T *_ptr) {
5318
    ptr = _ptr;
5319
    return *this;
5320
  }
5321
5322
#else // threaded build: every access goes through the std::atomic member
5323
  operator const T *() const { return ptr.load(); }
5324
  const T &operator*() const { return *ptr; } // *ptr converts the atomic to T*
5325
  const T *operator->() const { return ptr.load(); }
5326
5327
  operator T *() { return ptr.load(); }
5328
  T &operator*() { return *ptr; } // same implicit atomic load as above
5329
  T *operator->() { return ptr.load(); }
5330
  atomic_ptr &operator=(T *_ptr) {
5331
    ptr = _ptr; // atomic store through std::atomic's operator=
5332
    return *this;
5333
  }
5334
5335
#endif
5336
5337
private:
5338
#if defined(SIMDUTF_NO_THREADS)
5339
  T *ptr; // raw pointer, no synchronization
5340
#else
5341
  std::atomic<T *> ptr; // loads/stores use the default memory order
5342
#endif
5343
};
5344
5345
class detect_best_supported_implementation_on_first_use;
5346
5347
} // namespace internal
5348
5349
/**
5350
 * The list of available implementations compiled into simdutf.
5351
 */
5352
extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list &
5353
get_available_implementations();
5354
5355
/**
5356
 * The active implementation.
5357
 *
5358
 * Automatically initialized on first use to the most advanced implementation
5359
 * supported by this hardware.
5360
 */
5361
extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> &
5362
get_active_implementation();
5363
5364
} // namespace simdutf
5365
5366
#endif // SIMDUTF_IMPLEMENTATION_H