Coverage Report

Created: 2025-11-24 06:33

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/include/simdutf/implementation.h
Line
Count
Source
1
#ifndef SIMDUTF_IMPLEMENTATION_H
2
#define SIMDUTF_IMPLEMENTATION_H
3
#if !defined(SIMDUTF_NO_THREADS)
4
  #include <atomic>
5
#endif
6
#include <string>
7
#ifdef SIMDUTF_INTERNAL_TESTS
8
  #include <vector>
9
#endif
10
#include "simdutf/common_defs.h"
11
#include "simdutf/compiler_check.h"
12
#include "simdutf/encoding_types.h"
13
#include "simdutf/error.h"
14
#include "simdutf/internal/isadetection.h"
15
16
#if SIMDUTF_SPAN
17
  #include <concepts>
18
  #include <type_traits>
19
  #include <span>
20
  #include <tuple>
21
#endif
22
#if SIMDUTF_CPLUSPLUS17
23
  #include <string_view>
24
#endif
25
// The following defines are conditionally enabled/disabled during amalgamation.
26
// By default all features are enabled, regular code shouldn't check them. Only
27
// when user code really relies on a selected subset, it's good to verify these
28
// flags, like:
29
//
30
//      #if !SIMDUTF_FEATURE_UTF16
31
//      #   error("Please amalgamate simdutf with UTF-16 support")
32
//      #endif
33
//
34
#define SIMDUTF_FEATURE_DETECT_ENCODING 1
35
#define SIMDUTF_FEATURE_ASCII 1
36
#define SIMDUTF_FEATURE_LATIN1 1
37
#define SIMDUTF_FEATURE_UTF8 1
38
#define SIMDUTF_FEATURE_UTF16 1
39
#define SIMDUTF_FEATURE_UTF32 1
40
#define SIMDUTF_FEATURE_BASE64 1
41
42
namespace simdutf {
43
44
constexpr size_t default_line_length =
45
    76; ///< default line length for base64 encoding with lines
46
47
#if SIMDUTF_SPAN
48
/// helpers placed in namespace detail are not a part of the public API
49
namespace detail {
50
/**
51
 * matches a byte, in the many ways C++ allows. note that these
52
 * are all distinct types.
53
 */
54
template <typename T>
55
concept byte_like = std::is_same_v<T, std::byte> ||   //
56
                    std::is_same_v<T, char> ||        //
57
                    std::is_same_v<T, signed char> || //
58
                    std::is_same_v<T, unsigned char>;
59
60
/**
 * byte_like applied to T after removing references and cv-qualifiers, so
 * that e.g. `const char &` is accepted as well.
 */
template <typename T>
61
concept is_byte_like = byte_like<std::remove_cvref_t<T>>;
62
63
/**
 * satisfied when T is a pointer type (as reported by std::is_pointer_v).
 */
template <typename T>
64
concept is_pointer = std::is_pointer_v<T>;
65
66
/**
67
 * matches anything that behaves like std::span and points to character-like
68
 * data such as: std::byte, char, unsigned char, signed char, std::int8_t,
69
 * std::uint8_t
70
 *
 * The requirements are checked on a const object: size() must be noexcept and
 * convertible to std::size_t, data() must be noexcept and return a pointer,
 * and dereferencing that pointer must yield a byte-like type.
 */
71
template <typename T>
72
concept input_span_of_byte_like = requires(const T &t) {
73
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
74
  { t.data() } noexcept -> is_pointer;
75
  { *t.data() } noexcept -> is_byte_like;
76
};
77
78
/**
 * satisfied when T, with any reference removed, is not const-qualified.
 */
template <typename T>
79
concept is_mutable = !std::is_const_v<std::remove_reference_t<T>>;
80
81
/**
82
 * like input_span_of_byte_like, but for an output span (intended to be
 * written to): the requirements are checked on a non-const lvalue and the
 * dereferenced element must additionally not be const.
83
 */
84
template <typename T>
85
concept output_span_of_byte_like = requires(T &t) {
86
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
87
  { t.data() } noexcept -> is_pointer;
88
  { *t.data() } noexcept -> is_byte_like;
89
  { *t.data() } noexcept -> is_mutable;
90
};
91
} // namespace detail
92
#endif
93
94
#if SIMDUTF_FEATURE_DETECT_ENCODING
95
/**
96
 * Autodetect the encoding of the input, a single encoding is recommended.
97
 * E.g., the function might return simdutf::encoding_type::UTF8,
98
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
99
 * simdutf::encoding_type::UTF32_LE.
100
 *
101
 * @param input the string to analyze.
102
 * @param length the length of the string in bytes.
103
 * @return the detected encoding type
104
 */
105
simdutf_warn_unused simdutf::encoding_type
106
autodetect_encoding(const char *input, size_t length) noexcept;
107
/**
 * Overload taking the input as unsigned bytes. The pointer is reinterpreted
 * as `const char *` and handed, together with the unchanged length, to the
 * `const char *` overload of autodetect_encoding.
 */
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
autodetect_encoding(const uint8_t *input, size_t length) noexcept {
  const char *const as_chars = reinterpret_cast<const char *>(input);
  return autodetect_encoding(as_chars, length);
}
111
  #if SIMDUTF_SPAN
112
/**
113
 * Autodetect the encoding of the input, a single encoding is recommended.
114
 * E.g., the function might return simdutf::encoding_type::UTF8,
115
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
116
 * simdutf::encoding_type::UTF32_LE.
117
 *
118
 * @param input the string to analyze. can be anything span-like that has a
119
 * data() and size() that points to character data: std::string,
120
 * std::string_view, std::vector<char>, std::span<const std::byte> etc.
121
 * @return the detected encoding type
122
 */
123
/**
 * Span-like overload: takes the data pointer (viewed as `const char *`) and
 * the size of `input` and forwards both to the pointer/length overload.
 */
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
autodetect_encoding(
    const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  const size_t byte_count = input.size();
  return autodetect_encoding(bytes, byte_count);
}
129
  #endif // SIMDUTF_SPAN
130
131
/**
132
 * Autodetect the possible encodings of the input in one pass.
133
 * E.g., if the input might be UTF-16LE or UTF-8, this function returns
134
 * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE).
135
 *
136
 * Overridden by each implementation.
137
 *
138
 * @param input the string to analyze.
139
 * @param length the length of the string in bytes.
140
 * @return the detected encoding type
141
 */
142
simdutf_warn_unused int detect_encodings(const char *input,
143
                                         size_t length) noexcept;
144
/**
 * Overload taking the input as unsigned bytes. The pointer is reinterpreted
 * as `const char *` and handed, together with the unchanged length, to the
 * `const char *` overload of detect_encodings.
 */
simdutf_really_inline simdutf_warn_unused int
detect_encodings(const uint8_t *input, size_t length) noexcept {
  const char *const as_chars = reinterpret_cast<const char *>(input);
  return detect_encodings(as_chars, length);
}
148
  #if SIMDUTF_SPAN
149
/**
 * Span-like overload: takes the data pointer (viewed as `const char *`) and
 * the size of `input` and forwards both to the pointer/length overload.
 */
simdutf_really_inline simdutf_warn_unused int
detect_encodings(const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  const size_t byte_count = input.size();
  return detect_encodings(bytes, byte_count);
}
154
  #endif // SIMDUTF_SPAN
155
#endif   // SIMDUTF_FEATURE_DETECT_ENCODING
156
157
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
158
/**
159
 * Validate the UTF-8 string. This function may be best when you expect
160
 * the input to be almost always valid. Otherwise, consider using
161
 * validate_utf8_with_errors.
162
 *
163
 * Overridden by each implementation.
164
 *
165
 * @param buf the UTF-8 string to validate.
166
 * @param len the length of the string in bytes.
167
 * @return true if and only if the string is valid UTF-8.
168
 */
169
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
170
  #if SIMDUTF_SPAN
171
/**
 * Span-like overload: takes the data pointer (viewed as `const char *`) and
 * the size of `input` and forwards both to the pointer/length overload.
 */
simdutf_really_inline simdutf_warn_unused bool
validate_utf8(const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  const size_t byte_count = input.size();
  return validate_utf8(bytes, byte_count);
}
176
  #endif // SIMDUTF_SPAN
177
#endif   // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
178
179
#if SIMDUTF_FEATURE_UTF8
180
/**
181
 * Validate the UTF-8 string and stop on error.
182
 *
183
 * Overridden by each implementation.
184
 *
185
 * @param buf the UTF-8 string to validate.
186
 * @param len the length of the string in bytes.
187
 * @return a result pair struct (of type simdutf::result containing the two
188
 * fields error and count) with an error code and either position of the error
189
 * (in the input in code units) if any, or the number of code units validated if
190
 * successful.
191
 */
192
simdutf_warn_unused result validate_utf8_with_errors(const char *buf,
193
                                                     size_t len) noexcept;
194
  #if SIMDUTF_SPAN
195
/**
 * Span-like overload: takes the data pointer (viewed as `const char *`) and
 * the size of `input` and forwards both to the pointer/length overload.
 */
simdutf_really_inline simdutf_warn_unused result validate_utf8_with_errors(
    const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  const size_t byte_count = input.size();
  return validate_utf8_with_errors(bytes, byte_count);
}
200
  #endif // SIMDUTF_SPAN
201
#endif   // SIMDUTF_FEATURE_UTF8
202
203
#if SIMDUTF_FEATURE_ASCII
204
/**
205
 * Validate the ASCII string.
206
 *
207
 * Overridden by each implementation.
208
 *
209
 * @param buf the ASCII string to validate.
210
 * @param len the length of the string in bytes.
211
 * @return true if and only if the string is valid ASCII.
212
 */
213
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept;
214
  #if SIMDUTF_SPAN
215
/**
 * Span-like overload: takes the data pointer (viewed as `const char *`) and
 * the size of `input` and forwards both to the pointer/length overload.
 */
simdutf_really_inline simdutf_warn_unused bool
validate_ascii(const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  const size_t byte_count = input.size();
  return validate_ascii(bytes, byte_count);
}
220
  #endif // SIMDUTF_SPAN
221
222
/**
223
 * Validate the ASCII string and stop on error. It might be faster than
224
 * validate_ascii when an error is expected to occur early.
225
 *
226
 * Overridden by each implementation.
227
 *
228
 * @param buf the ASCII string to validate.
229
 * @param len the length of the string in bytes.
230
 * @return a result pair struct (of type simdutf::result containing the two
231
 * fields error and count) with an error code and either position of the error
232
 * (in the input in code units) if any, or the number of code units validated if
233
 * successful.
234
 */
235
simdutf_warn_unused result validate_ascii_with_errors(const char *buf,
236
                                                      size_t len) noexcept;
237
  #if SIMDUTF_SPAN
238
/**
 * Span-like overload: takes the data pointer (viewed as `const char *`) and
 * the size of `input` and forwards both to the pointer/length overload.
 */
simdutf_really_inline simdutf_warn_unused result validate_ascii_with_errors(
    const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const bytes = reinterpret_cast<const char *>(input.data());
  const size_t byte_count = input.size();
  return validate_ascii_with_errors(bytes, byte_count);
}
243
  #endif // SIMDUTF_SPAN
244
#endif   // SIMDUTF_FEATURE_ASCII
245
246
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
247
/**
248
 * Validate the ASCII string as a UTF-16 sequence.
249
 * An UTF-16 sequence is considered an ASCII sequence
250
 * if it could be converted to an ASCII string losslessly.
251
 *
252
 * Overridden by each implementation.
253
 *
254
 * @param buf the UTF-16 string to validate.
255
 * @param len the length of the string in 2-byte code units (char16_t).
256
 * @return true if and only if the string is valid ASCII.
257
 */
258
simdutf_warn_unused bool validate_utf16_as_ascii(const char16_t *buf,
259
                                                 size_t len) noexcept;
260
  #if SIMDUTF_SPAN
261
/**
 * Span overload: forwards the span's data pointer and element count to the
 * pointer/length overload of validate_utf16_as_ascii.
 */
simdutf_really_inline simdutf_warn_unused bool
validate_utf16_as_ascii(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16_as_ascii(units, unit_count);
}
265
  #endif // SIMDUTF_SPAN
266
267
/**
268
 * Validate the ASCII string as a UTF-16BE sequence.
269
 * An UTF-16 sequence is considered an ASCII sequence
270
 * if it could be converted to an ASCII string losslessly.
271
 *
272
 * Overridden by each implementation.
273
 *
274
 * @param buf the UTF-16BE string to validate.
275
 * @param len the length of the string in 2-byte code units (char16_t).
276
 * @return true if and only if the string is valid ASCII.
277
 */
278
simdutf_warn_unused bool validate_utf16be_as_ascii(const char16_t *buf,
279
                                                   size_t len) noexcept;
280
  #if SIMDUTF_SPAN
281
/**
 * Span overload: forwards the span's data pointer and element count to the
 * pointer/length overload of validate_utf16be_as_ascii.
 */
simdutf_really_inline simdutf_warn_unused bool
validate_utf16be_as_ascii(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16be_as_ascii(units, unit_count);
}
285
  #endif // SIMDUTF_SPAN
286
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
287
288
/**
289
 * Validate the ASCII string as a UTF-16LE sequence.
290
 * An UTF-16 sequence is considered an ASCII sequence
291
 * if it could be converted to an ASCII string losslessly.
292
 *
293
 * Overridden by each implementation.
294
 *
295
 * @param buf the UTF-16LE string to validate.
296
 * @param len the length of the string in 2-byte code units (char16_t).
297
 * @return true if and only if the string is valid ASCII.
298
 */
299
simdutf_warn_unused bool validate_utf16le_as_ascii(const char16_t *buf,
300
                                                   size_t len) noexcept;
301
#if SIMDUTF_SPAN
302
/**
 * Span overload: forwards the span's data pointer and element count to the
 * pointer/length overload of validate_utf16le_as_ascii.
 */
simdutf_really_inline simdutf_warn_unused bool
validate_utf16le_as_ascii(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16le_as_ascii(units, unit_count);
}
306
#endif // SIMDUTF_SPAN
307
308
#if SIMDUTF_FEATURE_UTF16
309
/**
310
 * Using native endianness; Validate the UTF-16 string.
311
 * This function may be best when you expect the input to be almost always
312
 * valid. Otherwise, consider using validate_utf16_with_errors.
313
 *
314
 * Overridden by each implementation.
315
 *
316
 * This function is not BOM-aware.
317
 *
318
 * @param buf the UTF-16 string to validate.
319
 * @param len the length of the string in number of 2-byte code units
320
 * (char16_t).
321
 * @return true if and only if the string is valid UTF-16.
322
 */
323
simdutf_warn_unused bool validate_utf16(const char16_t *buf,
324
                                        size_t len) noexcept;
325
  #if SIMDUTF_SPAN
326
/**
 * Span overload: forwards the span's data pointer and element count to the
 * pointer/length overload of validate_utf16.
 */
simdutf_really_inline simdutf_warn_unused bool
validate_utf16(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16(units, unit_count);
}
330
  #endif // SIMDUTF_SPAN
331
#endif   // SIMDUTF_FEATURE_UTF16
332
333
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
334
/**
335
 * Validate the UTF-16LE string. This function may be best when you expect
336
 * the input to be almost always valid. Otherwise, consider using
337
 * validate_utf16le_with_errors.
338
 *
339
 * Overridden by each implementation.
340
 *
341
 * This function is not BOM-aware.
342
 *
343
 * @param buf the UTF-16LE string to validate.
344
 * @param len the length of the string in number of 2-byte code units
345
 * (char16_t).
346
 * @return true if and only if the string is valid UTF-16LE.
347
 */
348
simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
349
                                          size_t len) noexcept;
350
  #if SIMDUTF_SPAN
351
/**
 * Span overload: forwards the span's data pointer and element count to the
 * pointer/length overload of validate_utf16le.
 */
simdutf_really_inline simdutf_warn_unused bool
validate_utf16le(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16le(units, unit_count);
}
355
  #endif // SIMDUTF_SPAN
356
#endif   // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
357
358
#if SIMDUTF_FEATURE_UTF16
359
/**
360
 * Validate the UTF-16BE string. This function may be best when you expect
361
 * the input to be almost always valid. Otherwise, consider using
362
 * validate_utf16be_with_errors.
363
 *
364
 * Overridden by each implementation.
365
 *
366
 * This function is not BOM-aware.
367
 *
368
 * @param buf the UTF-16BE string to validate.
369
 * @param len the length of the string in number of 2-byte code units
370
 * (char16_t).
371
 * @return true if and only if the string is valid UTF-16BE.
372
 */
373
simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
374
                                          size_t len) noexcept;
375
  #if SIMDUTF_SPAN
376
/**
 * Span overload: forwards the span's data pointer and element count to the
 * pointer/length overload of validate_utf16be.
 */
simdutf_really_inline simdutf_warn_unused bool
validate_utf16be(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16be(units, unit_count);
}
380
  #endif // SIMDUTF_SPAN
381
382
/**
383
 * Using native endianness; Validate the UTF-16 string and stop on error.
384
 * It might be faster than validate_utf16 when an error is expected to occur
385
 * early.
386
 *
387
 * Overridden by each implementation.
388
 *
389
 * This function is not BOM-aware.
390
 *
391
 * @param buf the UTF-16 string to validate.
392
 * @param len the length of the string in number of 2-byte code units
393
 * (char16_t).
394
 * @return a result pair struct (of type simdutf::result containing the two
395
 * fields error and count) with an error code and either position of the error
396
 * (in the input in code units) if any, or the number of code units validated if
397
 * successful.
398
 */
399
simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf,
400
                                                      size_t len) noexcept;
401
  #if SIMDUTF_SPAN
402
/**
 * Span overload: forwards the span's data pointer and element count to the
 * pointer/length overload of validate_utf16_with_errors.
 */
simdutf_really_inline simdutf_warn_unused result
validate_utf16_with_errors(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16_with_errors(units, unit_count);
}
406
  #endif // SIMDUTF_SPAN
407
408
/**
409
 * Validate the UTF-16LE string and stop on error. It might be faster than
410
 * validate_utf16le when an error is expected to occur early.
411
 *
412
 * Overridden by each implementation.
413
 *
414
 * This function is not BOM-aware.
415
 *
416
 * @param buf the UTF-16LE string to validate.
417
 * @param len the length of the string in number of 2-byte code units
418
 * (char16_t).
419
 * @return a result pair struct (of type simdutf::result containing the two
420
 * fields error and count) with an error code and either position of the error
421
 * (in the input in code units) if any, or the number of code units validated if
422
 * successful.
423
 */
424
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf,
425
                                                        size_t len) noexcept;
426
  #if SIMDUTF_SPAN
427
/**
 * Span overload: forwards the span's data pointer and element count to the
 * pointer/length overload of validate_utf16le_with_errors.
 */
simdutf_really_inline simdutf_warn_unused result
validate_utf16le_with_errors(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16le_with_errors(units, unit_count);
}
431
  #endif // SIMDUTF_SPAN
432
433
/**
434
 * Validate the UTF-16BE string and stop on error. It might be faster than
435
 * validate_utf16be when an error is expected to occur early.
436
 *
437
 * Overridden by each implementation.
438
 *
439
 * This function is not BOM-aware.
440
 *
441
 * @param buf the UTF-16BE string to validate.
442
 * @param len the length of the string in number of 2-byte code units
443
 * (char16_t).
444
 * @return a result pair struct (of type simdutf::result containing the two
445
 * fields error and count) with an error code and either position of the error
446
 * (in the input in code units) if any, or the number of code units validated if
447
 * successful.
448
 */
449
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf,
450
                                                        size_t len) noexcept;
451
  #if SIMDUTF_SPAN
452
/**
 * Span overload: forwards the span's data pointer and element count to the
 * pointer/length overload of validate_utf16be_with_errors.
 */
simdutf_really_inline simdutf_warn_unused result
validate_utf16be_with_errors(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf16be_with_errors(units, unit_count);
}
456
  #endif // SIMDUTF_SPAN
457
458
/**
459
 * Fixes an ill-formed UTF-16LE string by replacing mismatched surrogates with
460
 * the Unicode replacement character U+FFFD. If input and output points to
461
 * different memory areas, the procedure copies string, and it's expected that
462
 * output memory is at least as big as the input. It's also possible to set
463
 * input equal output, that makes replacements an in-place operation.
464
 *
465
 * @param input the UTF-16LE string to correct.
466
 * @param len the length of the string in number of 2-byte code units
467
 * (char16_t).
468
 * @param output the output buffer.
469
 */
470
void to_well_formed_utf16le(const char16_t *input, size_t len,
471
                            char16_t *output) noexcept;
472
  #if SIMDUTF_SPAN
473
/**
 * Span overload: forwards the input pointer, the input element count and the
 * output pointer to the pointer-based to_well_formed_utf16le. The size of
 * `output` is not consulted here.
 */
simdutf_really_inline void
to_well_formed_utf16le(std::span<const char16_t> input,
                       std::span<char16_t> output) noexcept {
  const char16_t *const source = input.data();
  const size_t unit_count = input.size();
  char16_t *const destination = output.data();
  to_well_formed_utf16le(source, unit_count, destination);
}
478
  #endif // SIMDUTF_SPAN
479
480
/**
481
 * Fixes an ill-formed UTF-16BE string by replacing mismatched surrogates with
482
 * the Unicode replacement character U+FFFD. If input and output points to
483
 * different memory areas, the procedure copies string, and it's expected that
484
 * output memory is at least as big as the input. It's also possible to set
485
 * input equal output, that makes replacements an in-place operation.
486
 *
487
 * @param input the UTF-16BE string to correct.
488
 * @param len the length of the string in number of 2-byte code units
489
 * (char16_t).
490
 * @param output the output buffer.
491
 */
492
void to_well_formed_utf16be(const char16_t *input, size_t len,
493
                            char16_t *output) noexcept;
494
  #if SIMDUTF_SPAN
495
/**
 * Span overload: forwards the input pointer, the input element count and the
 * output pointer to the pointer-based to_well_formed_utf16be. The size of
 * `output` is not consulted here.
 */
simdutf_really_inline void
to_well_formed_utf16be(std::span<const char16_t> input,
                       std::span<char16_t> output) noexcept {
  const char16_t *const source = input.data();
  const size_t unit_count = input.size();
  char16_t *const destination = output.data();
  to_well_formed_utf16be(source, unit_count, destination);
}
500
  #endif // SIMDUTF_SPAN
501
502
/**
503
 * Fixes an ill-formed UTF-16 string by replacing mismatched surrogates with the
504
 * Unicode replacement character U+FFFD. If input and output points to different
505
 * memory areas, the procedure copies string, and it's expected that output
506
 * memory is at least as big as the input. It's also possible to set input equal
507
 * output, that makes replacements an in-place operation.
508
 *
509
 * @param input the UTF-16 string to correct.
510
 * @param len the length of the string in number of 2-byte code units
511
 * (char16_t).
512
 * @param output the output buffer.
513
 */
514
void to_well_formed_utf16(const char16_t *input, size_t len,
515
                          char16_t *output) noexcept;
516
  #if SIMDUTF_SPAN
517
/**
 * Span overload: forwards the input pointer, the input element count and the
 * output pointer to the pointer-based to_well_formed_utf16. The size of
 * `output` is not consulted here.
 */
simdutf_really_inline void
to_well_formed_utf16(std::span<const char16_t> input,
                     std::span<char16_t> output) noexcept {
  const char16_t *const source = input.data();
  const size_t unit_count = input.size();
  char16_t *const destination = output.data();
  to_well_formed_utf16(source, unit_count, destination);
}
522
  #endif // SIMDUTF_SPAN
523
524
#endif // SIMDUTF_FEATURE_UTF16
525
526
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
527
/**
528
 * Validate the UTF-32 string. This function may be best when you expect
529
 * the input to be almost always valid. Otherwise, consider using
530
 * validate_utf32_with_errors.
531
 *
532
 * Overridden by each implementation.
533
 *
534
 * This function is not BOM-aware.
535
 *
536
 * @param buf the UTF-32 string to validate.
537
 * @param len the length of the string in number of 4-byte code units
538
 * (char32_t).
539
 * @return true if and only if the string is valid UTF-32.
540
 */
541
simdutf_warn_unused bool validate_utf32(const char32_t *buf,
542
                                        size_t len) noexcept;
543
  #if SIMDUTF_SPAN
544
/**
 * Span overload: forwards the span's data pointer and element count to the
 * pointer/length overload of validate_utf32.
 */
simdutf_really_inline simdutf_warn_unused bool
validate_utf32(std::span<const char32_t> input) noexcept {
  const char32_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf32(units, unit_count);
}
548
  #endif // SIMDUTF_SPAN
549
#endif   // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
550
551
#if SIMDUTF_FEATURE_UTF32
552
/**
553
 * Validate the UTF-32 string and stop on error. It might be faster than
554
 * validate_utf32 when an error is expected to occur early.
555
 *
556
 * Overridden by each implementation.
557
 *
558
 * This function is not BOM-aware.
559
 *
560
 * @param buf the UTF-32 string to validate.
561
 * @param len the length of the string in number of 4-byte code units
562
 * (char32_t).
563
 * @return a result pair struct (of type simdutf::result containing the two
564
 * fields error and count) with an error code and either position of the error
565
 * (in the input in code units) if any, or the number of code units validated if
566
 * successful.
567
 */
568
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf,
569
                                                      size_t len) noexcept;
570
  #if SIMDUTF_SPAN
571
/**
 * Span overload: forwards the span's data pointer and element count to the
 * pointer/length overload of validate_utf32_with_errors.
 */
simdutf_really_inline simdutf_warn_unused result
validate_utf32_with_errors(std::span<const char32_t> input) noexcept {
  const char32_t *const units = input.data();
  const size_t unit_count = input.size();
  return validate_utf32_with_errors(units, unit_count);
}
575
  #endif // SIMDUTF_SPAN
576
#endif   // SIMDUTF_FEATURE_UTF32
577
578
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
579
/**
580
 * Convert Latin1 string into UTF-8 string.
581
 *
582
 * This function is suitable to work with inputs from untrusted sources.
583
 *
584
 * @param input         the Latin1 string to convert
585
 * @param length        the length of the string in bytes
586
 * @param utf8_output   the pointer to buffer that can hold conversion result
587
 * @return the number of written char; 0 if conversion is not possible
588
 */
589
simdutf_warn_unused size_t convert_latin1_to_utf8(const char *input,
590
                                                  size_t length,
591
                                                  char *utf8_output) noexcept;
592
  #if SIMDUTF_SPAN
593
/**
 * Span-like overload of convert_latin1_to_utf8.
 *
 * Both arguments may be any span-like object over byte-like elements
 * (std::byte, char, signed char, unsigned char); the output must be writable.
 * The data pointers are reinterpreted as `const char *` / `char *` and
 * forwarded to the pointer-based overload. The size of `utf8_output` is not
 * consulted here.
 *
 * @param latin1_input  the Latin1 data to convert
 * @param utf8_output   the buffer receiving the conversion result
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf8(
    const detail::input_span_of_byte_like auto &latin1_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  // The output element type is only constrained to be byte-like, so its
  // pointer must be cast explicitly: without the cast, outputs over
  // std::byte, signed char or unsigned char would not compile.
  return convert_latin1_to_utf8(
      reinterpret_cast<const char *>(latin1_input.data()), latin1_input.size(),
      reinterpret_cast<char *>(utf8_output.data()));
}
600
  #endif // SIMDUTF_SPAN
601
602
/**
603
 * Convert Latin1 string into UTF-8 string with output limit.
604
 *
605
 * This function is suitable to work with inputs from untrusted sources.
606
 *
607
 * We write as many characters as possible.
608
 *
609
 * @param input         the Latin1 string to convert
610
 * @param length        the length of the string in bytes
611
 * @param utf8_output   the pointer to buffer that can hold conversion result
612
 * @param utf8_len      the maximum output length
613
 * @return the number of written char; 0 if conversion is not possible
614
 */
615
simdutf_warn_unused size_t
616
convert_latin1_to_utf8_safe(const char *input, size_t length, char *utf8_output,
617
                            size_t utf8_len) noexcept;
618
  #if SIMDUTF_SPAN
619
/**
 * Span-like overload of convert_latin1_to_utf8_safe.
 *
 * Both arguments may be any span-like object over byte-like elements
 * (std::byte, char, signed char, unsigned char); the output must be writable.
 * The data pointers are reinterpreted as `const char *` / `char *` and
 * forwarded to the pointer-based overload, with the size of `utf8_output`
 * passed as the maximum output length.
 *
 * @param input         the Latin1 data to convert
 * @param utf8_output   the buffer receiving the conversion result
 * @return the value returned by the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf8_safe(
    const detail::input_span_of_byte_like auto &input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  // implementation note: outputspan is a forwarding ref to avoid copying and
  // allow both lvalues and rvalues. std::span can be copied without problems,
  // but std::vector should not, and this function should accept both. it will
  // allow using an owning rvalue ref (example: passing a temporary std::string)
  // as output, but the user will quickly find out that he has no way of getting
  // the data out of the object in that case.
  //
  // The input element type is only constrained to be byte-like, so its
  // pointer must be cast explicitly: without the cast, inputs over std::byte,
  // signed char or unsigned char would not compile.
  return convert_latin1_to_utf8_safe(
      reinterpret_cast<const char *>(input.data()), input.size(),
      reinterpret_cast<char *>(utf8_output.data()), utf8_output.size());
}
632
  #endif // SIMDUTF_SPAN
633
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
634
635
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
636
/**
637
 * Convert Latin1 string into UTF-16LE string.
638
 *
639
 * This function is suitable to work with inputs from untrusted sources.
640
 *
641
 * @param input         the Latin1 string to convert
642
 * @param length        the length of the string in bytes
643
 * @param utf16_output  the pointer to buffer that can hold conversion result
644
 * @return the number of written char16_t; 0 if conversion is not possible
645
 */
646
simdutf_warn_unused size_t convert_latin1_to_utf16le(
647
    const char *input, size_t length, char16_t *utf16_output) noexcept;
648
  #if SIMDUTF_SPAN
649
/**
 * Span-like overload: views the input data as `const char *` and forwards it,
 * the input size and the output pointer to the pointer-based overload. The
 * size of `utf16_output` is not consulted here.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf16le(
    const detail::input_span_of_byte_like auto &latin1_input,
    std::span<char16_t> utf16_output) noexcept {
  const char *const source =
      reinterpret_cast<const char *>(latin1_input.data());
  const size_t source_length = latin1_input.size();
  char16_t *const destination = utf16_output.data();
  return convert_latin1_to_utf16le(source, source_length, destination);
}
656
  #endif // SIMDUTF_SPAN
657
658
/**
659
 * Convert Latin1 string into UTF-16BE string.
660
 *
661
 * This function is suitable to work with inputs from untrusted sources.
662
 *
663
 * @param input         the Latin1 string to convert
664
 * @param length        the length of the string in bytes
665
 * @param utf16_output  the pointer to buffer that can hold conversion result
666
 * @return the number of written char16_t; 0 if conversion is not possible
667
 */
668
simdutf_warn_unused size_t convert_latin1_to_utf16be(
669
    const char *input, size_t length, char16_t *utf16_output) noexcept;
670
  #if SIMDUTF_SPAN
671
/**
 * Span-like overload: views the input data as `const char *` and forwards it,
 * the input size and the output pointer to the pointer-based overload. The
 * size of `output` is not consulted here.
 */
simdutf_really_inline simdutf_warn_unused size_t
convert_latin1_to_utf16be(const detail::input_span_of_byte_like auto &input,
                          std::span<char16_t> output) noexcept {
  const char *const source = reinterpret_cast<const char *>(input.data());
  const size_t source_length = input.size();
  char16_t *const destination = output.data();
  return convert_latin1_to_utf16be(source, source_length, destination);
}
677
  #endif // SIMDUTF_SPAN
678
/**
679
 * Compute the number of bytes that this UTF-16 string would require in Latin1
680
 * format.
681
 *
682
 * @param length        the length of the string in 2-byte code units (char16_t)
683
 * @return the length of the string in Latin1 code units (char) required to
684
 * encode the UTF-16 string as Latin1
685
 */
686
simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
687
688
/**
689
 * Compute the number of code units that this Latin1 string would require in
690
 * UTF-16 format.
691
 *
692
 * @param length        the length of the string in Latin1 code units (char)
693
 * @return the length of the string in 2-byte code units (char16_t) required to
694
 * encode the Latin1 string as UTF-16
695
 */
696
simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept;
697
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
698
699
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
700
/**
701
 * Convert Latin1 string into UTF-32 string.
702
 *
703
 * This function is suitable to work with inputs from untrusted sources.
704
 *
705
 * @param input         the Latin1 string to convert
706
 * @param length        the length of the string in bytes
707
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
708
 * @return the number of written char32_t; 0 if conversion is not possible
709
 */
710
simdutf_warn_unused size_t convert_latin1_to_utf32(
711
    const char *input, size_t length, char32_t *utf32_buffer) noexcept;
712
  #if SIMDUTF_SPAN
713
/**
 * Span-like overload: views the input data as `const char *` and forwards it,
 * the input size and the output pointer to the pointer-based overload. The
 * size of `utf32_output` is not consulted here.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf32(
    const detail::input_span_of_byte_like auto &latin1_input,
    std::span<char32_t> utf32_output) noexcept {
  const char *const source =
      reinterpret_cast<const char *>(latin1_input.data());
  const size_t source_length = latin1_input.size();
  char32_t *const destination = utf32_output.data();
  return convert_latin1_to_utf32(source, source_length, destination);
}
720
  #endif // SIMDUTF_SPAN
721
#endif   // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
722
723
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
724
/**
725
 * Convert possibly broken UTF-8 string into latin1 string.
726
 *
727
 * During the conversion also validation of the input string is done.
728
 * This function is suitable to work with inputs from untrusted sources.
729
 *
730
 * @param input         the UTF-8 string to convert
731
 * @param length        the length of the string in bytes
732
 * @param latin1_output  the pointer to buffer that can hold conversion result
733
 * @return the number of written char; 0 if the input was not valid UTF-8 string
734
 * or if it cannot be represented as Latin1
735
 */
736
simdutf_warn_unused size_t convert_utf8_to_latin1(const char *input,
737
                                                  size_t length,
738
                                                  char *latin1_output) noexcept;
739
  #if SIMDUTF_SPAN
740
/**
 * Span-like overload: views the input data as `const char *` and the output
 * data as `char *`, then forwards them with the input size to the
 * pointer-based overload. The size of `output` is not consulted here.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf8_to_latin1(
    const detail::input_span_of_byte_like auto &input,
    detail::output_span_of_byte_like auto &&output) noexcept {
  const char *const source = reinterpret_cast<const char *>(input.data());
  const size_t source_length = input.size();
  char *const destination = reinterpret_cast<char *>(output.data());
  return convert_utf8_to_latin1(source, source_length, destination);
}
747
  #endif // SIMDUTF_SPAN
748
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
749
750
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
751
/**
752
 * Using native endianness, convert possibly broken UTF-8 string into a UTF-16
753
 * string.
754
 *
755
 * During the conversion also validation of the input string is done.
756
 * This function is suitable to work with inputs from untrusted sources.
757
 *
758
 * @param input         the UTF-8 string to convert
759
 * @param length        the length of the string in bytes
760
 * @param utf16_output  the pointer to buffer that can hold conversion result
761
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
762
 * string
763
 */
764
simdutf_warn_unused size_t convert_utf8_to_utf16(
765
    const char *input, size_t length, char16_t *utf16_output) noexcept;
766
  #if SIMDUTF_SPAN
767
simdutf_really_inline simdutf_warn_unused size_t
768
convert_utf8_to_utf16(const detail::input_span_of_byte_like auto &input,
769
                      std::span<char16_t> output) noexcept {
770
  return convert_utf8_to_utf16(reinterpret_cast<const char *>(input.data()),
771
                               input.size(), output.data());
772
}
773
  #endif // SIMDUTF_SPAN
774
775
/**
776
 * Compute the number of bytes that this UTF-16LE string would require in UTF-8
777
 * format even when the UTF-16LE content contains mismatched surrogates
778
 * that have to be replaced by the replacement character (0xFFFD).
779
 *
780
 * @param input         the UTF-16LE string to convert
781
 * @param length        the length of the string in 2-byte code units (char16_t)
782
 * @return a result pair struct (of type simdutf::result containing the two
783
 * fields error and count) where the count is the number of bytes required to
784
 * encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS or
785
 * SURROGATE. The count is correct regardless of the error field.
786
 * When SURROGATE is returned, it does not indicate an error in the case of this
787
 * function: it indicates that at least one surrogate has been encountered: the
788
 * surrogates may be matched or not (thus this function does not validate). If
789
 * the returned error code is SUCCESS, then the input contains no surrogate, is
790
 * in the Basic Multilingual Plane, and is necessarily valid.
791
 */
792
simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(
793
    const char16_t *input, size_t length) noexcept;
794
  #if SIMDUTF_SPAN
795
simdutf_really_inline simdutf_warn_unused result
796
utf8_length_from_utf16le_with_replacement(
797
0
    std::span<const char16_t> valid_utf16_input) noexcept {
798
0
  return utf8_length_from_utf16le_with_replacement(valid_utf16_input.data(),
799
0
                                                   valid_utf16_input.size());
800
0
}
801
  #endif // SIMDUTF_SPAN
802
803
/**
804
 * Compute the number of bytes that this UTF-16BE string would require in UTF-8
805
 * format even when the UTF-16BE content contains mismatched surrogates
806
 * that have to be replaced by the replacement character (0xFFFD).
807
 *
808
 * @param input         the UTF-16BE string to convert
809
 * @param length        the length of the string in 2-byte code units (char16_t)
810
 * @return a result pair struct (of type simdutf::result containing the two
811
 * fields error and count) where the count is the number of bytes required to
812
 * encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS or
813
 * SURROGATE. The count is correct regardless of the error field.
814
 * When SURROGATE is returned, it does not indicate an error in the case of this
815
 * function: it indicates that at least one surrogate has been encountered: the
816
 * surrogates may be matched or not (thus this function does not validate). If
817
 * the returned error code is SUCCESS, then the input contains no surrogate, is
818
 * in the Basic Multilingual Plane, and is necessarily valid.
819
 */
820
simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(
821
    const char16_t *input, size_t length) noexcept;
822
  #if SIMDUTF_SPAN
823
simdutf_really_inline simdutf_warn_unused result
824
utf8_length_from_utf16be_with_replacement(
825
0
    std::span<const char16_t> valid_utf16_input) noexcept {
826
0
  return utf8_length_from_utf16be_with_replacement(valid_utf16_input.data(),
827
0
                                                   valid_utf16_input.size());
828
0
}
829
  #endif // SIMDUTF_SPAN
830
831
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
832
833
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
834
/**
835
 * Using native endianness, convert a Latin1 string into a UTF-16 string.
836
 *
837
 * @param input         the Latin1 string to convert
838
 * @param length        the length of the string in bytes
839
 * @param utf16_output  the pointer to buffer that can hold conversion result
840
 * @return the number of written char16_t.
841
 */
842
simdutf_warn_unused size_t convert_latin1_to_utf16(
843
    const char *input, size_t length, char16_t *utf16_output) noexcept;
844
  #if SIMDUTF_SPAN
845
simdutf_really_inline simdutf_warn_unused size_t
846
convert_latin1_to_utf16(const detail::input_span_of_byte_like auto &input,
847
                        std::span<char16_t> output) noexcept {
848
  return convert_latin1_to_utf16(reinterpret_cast<const char *>(input.data()),
849
                                 input.size(), output.data());
850
}
851
  #endif // SIMDUTF_SPAN
852
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
853
854
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
855
/**
856
 * Convert possibly broken UTF-8 string into UTF-16LE string.
857
 *
858
 * During the conversion also validation of the input string is done.
859
 * This function is suitable to work with inputs from untrusted sources.
860
 *
861
 * @param input         the UTF-8 string to convert
862
 * @param length        the length of the string in bytes
863
 * @param utf16_output  the pointer to buffer that can hold conversion result
864
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
865
 * string
866
 */
867
simdutf_warn_unused size_t convert_utf8_to_utf16le(
868
    const char *input, size_t length, char16_t *utf16_output) noexcept;
869
  #if SIMDUTF_SPAN
870
simdutf_really_inline simdutf_warn_unused size_t
871
convert_utf8_to_utf16le(const detail::input_span_of_byte_like auto &utf8_input,
872
                        std::span<char16_t> utf16_output) noexcept {
873
  return convert_utf8_to_utf16le(
874
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
875
      utf16_output.data());
876
}
877
  #endif // SIMDUTF_SPAN
878
879
/**
880
 * Convert possibly broken UTF-8 string into UTF-16BE string.
881
 *
882
 * During the conversion also validation of the input string is done.
883
 * This function is suitable to work with inputs from untrusted sources.
884
 *
885
 * @param input         the UTF-8 string to convert
886
 * @param length        the length of the string in bytes
887
 * @param utf16_output  the pointer to buffer that can hold conversion result
888
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
889
 * string
890
 */
891
simdutf_warn_unused size_t convert_utf8_to_utf16be(
892
    const char *input, size_t length, char16_t *utf16_output) noexcept;
893
  #if SIMDUTF_SPAN
894
simdutf_really_inline simdutf_warn_unused size_t
895
convert_utf8_to_utf16be(const detail::input_span_of_byte_like auto &utf8_input,
896
                        std::span<char16_t> utf16_output) noexcept {
897
  return convert_utf8_to_utf16be(
898
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
899
      utf16_output.data());
900
}
901
  #endif // SIMDUTF_SPAN
902
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
903
904
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
905
/**
906
 * Convert possibly broken UTF-8 string into latin1 string with errors.
907
 * If the string cannot be represented as Latin1, an error
908
 * code is returned.
909
 *
910
 * During the conversion also validation of the input string is done.
911
 * This function is suitable to work with inputs from untrusted sources.
912
 *
913
 * @param input         the UTF-8 string to convert
914
 * @param length        the length of the string in bytes
915
 * @param latin1_output  the pointer to buffer that can hold conversion result
916
 * @return a result pair struct (of type simdutf::result containing the two
917
 * fields error and count) with an error code and either position of the error
918
 * (in the input in code units) if any, or the number of code units validated if
919
 * successful.
920
 */
921
simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
922
    const char *input, size_t length, char *latin1_output) noexcept;
923
  #if SIMDUTF_SPAN
924
simdutf_really_inline simdutf_warn_unused result
925
convert_utf8_to_latin1_with_errors(
926
    const detail::input_span_of_byte_like auto &utf8_input,
927
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
928
  return convert_utf8_to_latin1_with_errors(
929
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
930
      reinterpret_cast<char *>(latin1_output.data()));
931
}
932
  #endif // SIMDUTF_SPAN
933
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
934
935
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
936
/**
937
 * Using native endianness, convert possibly broken UTF-8 string into UTF-16
938
 * string and stop on error.
939
 *
940
 * During the conversion also validation of the input string is done.
941
 * This function is suitable to work with inputs from untrusted sources.
942
 *
943
 * @param input         the UTF-8 string to convert
944
 * @param length        the length of the string in bytes
945
 * @param utf16_output  the pointer to buffer that can hold conversion result
946
 * @return a result pair struct (of type simdutf::result containing the two
947
 * fields error and count) with an error code and either position of the error
948
 * (in the input in code units) if any, or the number of char16_t written if
949
 * successful.
950
 */
951
simdutf_warn_unused result convert_utf8_to_utf16_with_errors(
952
    const char *input, size_t length, char16_t *utf16_output) noexcept;
953
  #if SIMDUTF_SPAN
954
simdutf_really_inline simdutf_warn_unused result
955
convert_utf8_to_utf16_with_errors(
956
    const detail::input_span_of_byte_like auto &utf8_input,
957
    std::span<char16_t> utf16_output) noexcept {
958
  return convert_utf8_to_utf16_with_errors(
959
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
960
      utf16_output.data());
961
}
962
  #endif // SIMDUTF_SPAN
963
964
/**
965
 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
966
 *
967
 * During the conversion also validation of the input string is done.
968
 * This function is suitable to work with inputs from untrusted sources.
969
 *
970
 * @param input         the UTF-8 string to convert
971
 * @param length        the length of the string in bytes
972
 * @param utf16_output  the pointer to buffer that can hold conversion result
973
 * @return a result pair struct (of type simdutf::result containing the two
974
 * fields error and count) with an error code and either position of the error
975
 * (in the input in code units) if any, or the number of char16_t written if
976
 * successful.
977
 */
978
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
979
    const char *input, size_t length, char16_t *utf16_output) noexcept;
980
  #if SIMDUTF_SPAN
981
simdutf_really_inline simdutf_warn_unused result
982
convert_utf8_to_utf16le_with_errors(
983
    const detail::input_span_of_byte_like auto &utf8_input,
984
    std::span<char16_t> utf16_output) noexcept {
985
  return convert_utf8_to_utf16le_with_errors(
986
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
987
      utf16_output.data());
988
}
989
  #endif // SIMDUTF_SPAN
990
991
/**
992
 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
993
 *
994
 * During the conversion also validation of the input string is done.
995
 * This function is suitable to work with inputs from untrusted sources.
996
 *
997
 * @param input         the UTF-8 string to convert
998
 * @param length        the length of the string in bytes
999
 * @param utf16_output  the pointer to buffer that can hold conversion result
1000
 * @return a result pair struct (of type simdutf::result containing the two
1001
 * fields error and count) with an error code and either position of the error
1002
 * (in the input in code units) if any, or the number of char16_t written if
1003
 * successful.
1004
 */
1005
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
1006
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1007
  #if SIMDUTF_SPAN
1008
simdutf_really_inline simdutf_warn_unused result
1009
convert_utf8_to_utf16be_with_errors(
1010
    const detail::input_span_of_byte_like auto &utf8_input,
1011
    std::span<char16_t> utf16_output) noexcept {
1012
  return convert_utf8_to_utf16be_with_errors(
1013
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1014
      utf16_output.data());
1015
}
1016
  #endif // SIMDUTF_SPAN
1017
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1018
1019
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1020
/**
1021
 * Convert possibly broken UTF-8 string into UTF-32 string.
1022
 *
1023
 * During the conversion also validation of the input string is done.
1024
 * This function is suitable to work with inputs from untrusted sources.
1025
 *
1026
 * @param input         the UTF-8 string to convert
1027
 * @param length        the length of the string in bytes
1028
 * @param utf32_output  the pointer to buffer that can hold conversion result
1029
 * @return the number of written char32_t; 0 if the input was not valid UTF-8
1030
 * string
1031
 */
1032
simdutf_warn_unused size_t convert_utf8_to_utf32(
1033
    const char *input, size_t length, char32_t *utf32_output) noexcept;
1034
  #if SIMDUTF_SPAN
1035
simdutf_really_inline simdutf_warn_unused size_t
1036
convert_utf8_to_utf32(const detail::input_span_of_byte_like auto &utf8_input,
1037
                      std::span<char32_t> utf32_output) noexcept {
1038
  return convert_utf8_to_utf32(
1039
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1040
      utf32_output.data());
1041
}
1042
  #endif // SIMDUTF_SPAN
1043
1044
/**
1045
 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
1046
 *
1047
 * During the conversion also validation of the input string is done.
1048
 * This function is suitable to work with inputs from untrusted sources.
1049
 *
1050
 * @param input         the UTF-8 string to convert
1051
 * @param length        the length of the string in bytes
1052
 * @param utf32_output  the pointer to buffer that can hold conversion result
1053
 * @return a result pair struct (of type simdutf::result containing the two
1054
 * fields error and count) with an error code and either position of the error
1055
 * (in the input in code units) if any, or the number of char32_t written if
1056
 * successful.
1057
 */
1058
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
1059
    const char *input, size_t length, char32_t *utf32_output) noexcept;
1060
  #if SIMDUTF_SPAN
1061
simdutf_really_inline simdutf_warn_unused result
1062
convert_utf8_to_utf32_with_errors(
1063
    const detail::input_span_of_byte_like auto &utf8_input,
1064
    std::span<char32_t> utf32_output) noexcept {
1065
  return convert_utf8_to_utf32_with_errors(
1066
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1067
      utf32_output.data());
1068
}
1069
  #endif // SIMDUTF_SPAN
1070
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1071
1072
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1073
/**
1074
 * Convert valid UTF-8 string into latin1 string.
1075
 *
1076
 * This function assumes that the input string is valid UTF-8 and that it can be
1077
 * represented as Latin1. If you violate this assumption, the result is
1078
 * implementation defined and may include system-dependent behavior such as
1079
 * crashes.
1080
 *
1081
 * This function is for expert users only and not part of our public API. Use
1082
 * convert_utf8_to_latin1 instead. The function may be removed from the library
1083
 * in the future.
1084
 *
1085
 * This function is not BOM-aware.
1086
 *
1087
 * @param input         the UTF-8 string to convert
1088
 * @param length        the length of the string in bytes
1089
 * @param latin1_output  the pointer to buffer that can hold conversion result
1090
 * @return the number of written char; 0 if the input was not valid UTF-8 string
1091
 */
1092
simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
1093
    const char *input, size_t length, char *latin1_output) noexcept;
1094
  #if SIMDUTF_SPAN
1095
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
1096
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1097
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  // Span-based convenience overload: forwards to the pointer-based
  // convert_valid_utf8_to_latin1. Both spans may hold any byte-like element
  // type, so both data pointers are cast to (const) char * to match the
  // pointer-based signature; without the cast on the output, only spans of
  // plain char would compile.
1098
  return convert_valid_utf8_to_latin1(
1099
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1100
      valid_utf8_input.size(), reinterpret_cast<char *>(latin1_output.data()));
1101
}
1102
  #endif // SIMDUTF_SPAN
1103
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1104
1105
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1106
/**
1107
 * Using native endianness, convert valid UTF-8 string into a UTF-16 string.
1108
 *
1109
 * This function assumes that the input string is valid UTF-8.
1110
 *
1111
 * @param input         the UTF-8 string to convert
1112
 * @param length        the length of the string in bytes
1113
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1114
 * @return the number of written char16_t
1115
 */
1116
simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
1117
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1118
  #if SIMDUTF_SPAN
1119
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
1120
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1121
    std::span<char16_t> utf16_output) noexcept {
1122
  return convert_valid_utf8_to_utf16(
1123
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1124
      valid_utf8_input.size(), utf16_output.data());
1125
}
1126
  #endif // SIMDUTF_SPAN
1127
1128
/**
1129
 * Convert valid UTF-8 string into UTF-16LE string.
1130
 *
1131
 * This function assumes that the input string is valid UTF-8.
1132
 *
1133
 * @param input         the UTF-8 string to convert
1134
 * @param length        the length of the string in bytes
1135
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1136
 * @return the number of written char16_t
1137
 */
1138
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
1139
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1140
  #if SIMDUTF_SPAN
1141
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
1142
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1143
    std::span<char16_t> utf16_output) noexcept {
1144
  return convert_valid_utf8_to_utf16le(
1145
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1146
      valid_utf8_input.size(), utf16_output.data());
1147
}
1148
  #endif // SIMDUTF_SPAN
1149
1150
/**
1151
 * Convert valid UTF-8 string into UTF-16BE string.
1152
 *
1153
 * This function assumes that the input string is valid UTF-8.
1154
 *
1155
 * @param input         the UTF-8 string to convert
1156
 * @param length        the length of the string in bytes
1157
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1158
 * @return the number of written char16_t
1159
 */
1160
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
1161
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1162
  #if SIMDUTF_SPAN
1163
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
1164
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1165
    std::span<char16_t> utf16_output) noexcept {
1166
  return convert_valid_utf8_to_utf16be(
1167
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1168
      valid_utf8_input.size(), utf16_output.data());
1169
}
1170
  #endif // SIMDUTF_SPAN
1171
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1172
1173
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1174
/**
1175
 * Convert valid UTF-8 string into UTF-32 string.
1176
 *
1177
 * This function assumes that the input string is valid UTF-8.
1178
 *
1179
 * @param input         the UTF-8 string to convert
1180
 * @param length        the length of the string in bytes
1181
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
1182
 * @return the number of written char32_t
1183
 */
1184
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
1185
    const char *input, size_t length, char32_t *utf32_buffer) noexcept;
1186
  #if SIMDUTF_SPAN
1187
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
1188
    const detail::input_span_of_byte_like auto &valid_utf8_input,
1189
    std::span<char32_t> utf32_output) noexcept {
1190
  return convert_valid_utf8_to_utf32(
1191
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1192
      valid_utf8_input.size(), utf32_output.data());
1193
}
1194
  #endif // SIMDUTF_SPAN
1195
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1196
1197
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1198
/**
1199
 * Return the number of bytes that this Latin1 string would require in UTF-8
1200
 * format.
1201
 *
1202
 * @param input         the Latin1 string to convert
1203
 * @param length        the length of the string in bytes
1204
 * @return the number of bytes required to encode the Latin1 string as UTF-8
1205
 */
1206
simdutf_warn_unused size_t utf8_length_from_latin1(const char *input,
1207
                                                   size_t length) noexcept;
1208
  #if SIMDUTF_SPAN
1209
simdutf_really_inline simdutf_warn_unused size_t utf8_length_from_latin1(
1210
    const detail::input_span_of_byte_like auto &latin1_input) noexcept {
1211
  return utf8_length_from_latin1(
1212
      reinterpret_cast<const char *>(latin1_input.data()), latin1_input.size());
1213
}
1214
  #endif // SIMDUTF_SPAN
1215
1216
/**
1217
 * Compute the number of bytes that this UTF-8 string would require in Latin1
1218
 * format.
1219
 *
1220
 * This function does not validate the input. It is acceptable to pass invalid
1221
 * UTF-8 strings but in such cases the result is implementation defined.
1222
 *
1223
 * This function is not BOM-aware.
1224
 *
1225
 * @param input         the UTF-8 string to convert
1226
 * @param length        the length of the string in bytes
1227
 * @return the number of bytes required to encode the UTF-8 string as Latin1
1228
 */
1229
simdutf_warn_unused size_t latin1_length_from_utf8(const char *input,
1230
                                                   size_t length) noexcept;
1231
  #if SIMDUTF_SPAN
1232
simdutf_really_inline simdutf_warn_unused size_t latin1_length_from_utf8(
1233
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
1234
  return latin1_length_from_utf8(
1235
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1236
      valid_utf8_input.size());
1237
}
1238
  #endif // SIMDUTF_SPAN
1239
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1240
1241
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1242
/**
1243
 * Compute the number of 2-byte code units that this UTF-8 string would require
1244
 * in UTF-16 format (the count does not depend on endianness).
1245
 *
1246
 * This function does not validate the input. It is acceptable to pass invalid
1247
 * UTF-8 strings but in such cases the result is implementation defined.
1248
 *
1249
 * This function is not BOM-aware.
1250
 *
1251
 * @param input         the UTF-8 string to process
1252
 * @param length        the length of the string in bytes
1253
 * @return the number of char16_t code units required to encode the UTF-8 string
1254
 * as UTF-16
1255
 */
1256
simdutf_warn_unused size_t utf16_length_from_utf8(const char *input,
1257
                                                  size_t length) noexcept;
1258
  #if SIMDUTF_SPAN
1259
simdutf_really_inline simdutf_warn_unused size_t utf16_length_from_utf8(
1260
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
1261
  return utf16_length_from_utf8(
1262
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1263
      valid_utf8_input.size());
1264
}
1265
  #endif // SIMDUTF_SPAN
1266
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1267
1268
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1269
/**
1270
 * Compute the number of 4-byte code units that this UTF-8 string would require
1271
 * in UTF-32 format.
1272
 *
1273
 * This function is equivalent to count_utf8.
1274
 *
1275
 * This function does not validate the input. It is acceptable to pass invalid
1276
 * UTF-8 strings but in such cases the result is implementation defined.
1277
 *
1278
 * This function is not BOM-aware.
1279
 *
1280
 * @param input         the UTF-8 string to process
1281
 * @param length        the length of the string in bytes
1282
 * @return the number of char32_t code units required to encode the UTF-8 string
1283
 * as UTF-32
1284
 */
1285
simdutf_warn_unused size_t utf32_length_from_utf8(const char *input,
1286
                                                  size_t length) noexcept;
1287
  #if SIMDUTF_SPAN
1288
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf8(
1289
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
1290
  return utf32_length_from_utf8(
1291
      reinterpret_cast<const char *>(valid_utf8_input.data()),
1292
      valid_utf8_input.size());
1293
}
1294
  #endif // SIMDUTF_SPAN
1295
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1296
1297
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1298
/**
1299
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1300
 * string.
1301
 *
1302
 * During the conversion also validation of the input string is done.
1303
 * This function is suitable to work with inputs from untrusted sources.
1304
 *
1305
 * This function is not BOM-aware.
1306
 *
1307
 * @param input         the UTF-16 string to convert
1308
 * @param length        the length of the string in 2-byte code units (char16_t)
1309
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1310
 * @return number of written code units; 0 if input is not a valid UTF-16
1311
 * string
1312
 */
1313
simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *input,
1314
                                                 size_t length,
1315
                                                 char *utf8_buffer) noexcept;
1316
  #if SIMDUTF_SPAN
1317
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_utf8(
1318
    std::span<const char16_t> utf16_input,
1319
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1320
  return convert_utf16_to_utf8(utf16_input.data(), utf16_input.size(),
1321
                               reinterpret_cast<char *>(utf8_output.data()));
1322
}
1323
  #endif // SIMDUTF_SPAN
1324
1325
/**
1326
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1327
 * string with output limit.
1328
 *
1329
 * We write as many characters as possible into the output buffer.
1330
 *
1331
 * During the conversion also validation of the input string is done.
1332
 * This function is suitable to work with inputs from untrusted sources.
1333
 *
1334
 * This function is not BOM-aware.
1335
 *
1336
 *
1337
 * @param input         the UTF-16 string to convert
1338
 * @param length        the length of the string in 16-bit code units (char16_t)
1339
 * @param utf8_output   the pointer to buffer that can hold conversion result
1340
 * @param utf8_len      the maximum output length
1341
 * @return the number of written char; 0 if conversion is not possible
1342
 */
1343
simdutf_warn_unused size_t convert_utf16_to_utf8_safe(const char16_t *input,
1344
                                                      size_t length,
1345
                                                      char *utf8_output,
1346
                                                      size_t utf8_len) noexcept;
1347
  #if SIMDUTF_SPAN
1348
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_utf8_safe(
1349
    std::span<const char16_t> utf16_input,
1350
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1351
  // implementation note: outputspan is a forwarding ref to avoid copying and
1352
  // allow both lvalues and rvalues. std::span can be copied without problems,
1353
  // but std::vector should not, and this function should accept both. it will
1354
  // allow using an owning rvalue ref (example: passing a temporary std::string)
1355
  // as output, but the user will quickly find out that he has no way of getting
1356
  // the data out of the object in that case.
1357
  return convert_utf16_to_utf8_safe(
1358
      utf16_input.data(), utf16_input.size(),
1359
      reinterpret_cast<char *>(utf8_output.data()), utf8_output.size());
1360
}
1361
  #endif // SIMDUTF_SPAN
1362
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1363
1364
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1365
/**
1366
 * Using native endianness, convert possibly broken UTF-16 string into Latin1
1367
 * string.
1368
 *
1369
 * During the conversion also validation of the input string is done.
1370
 * This function is suitable to work with inputs from untrusted sources.
1371
 *
1372
 * This function is not BOM-aware.
1373
 *
1374
 * @param input         the UTF-16 string to convert
1375
 * @param length        the length of the string in 2-byte code units (char16_t)
1376
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1377
 * @return number of written code units; 0 if input is not a valid UTF-16 string
1378
 * or if it cannot be represented as Latin1
1379
 */
1380
simdutf_warn_unused size_t convert_utf16_to_latin1(
1381
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1382
  #if SIMDUTF_SPAN
1383
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_latin1(
1384
    std::span<const char16_t> utf16_input,
1385
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1386
  return convert_utf16_to_latin1(
1387
      utf16_input.data(), utf16_input.size(),
1388
      reinterpret_cast<char *>(latin1_output.data()));
1389
}
1390
  #endif // SIMDUTF_SPAN
1391
1392
/**
1393
 * Convert possibly broken UTF-16LE string into Latin1 string.
1394
 * If the string cannot be represented as Latin1, an error
1395
 * is returned.
1396
 *
1397
 * During the conversion also validation of the input string is done.
1398
 * This function is suitable to work with inputs from untrusted sources.
1399
 *
1400
 * This function is not BOM-aware.
1401
 *
1402
 * @param input         the UTF-16LE string to convert
1403
 * @param length        the length of the string in 2-byte code units (char16_t)
1404
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1405
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1406
 * string or if it cannot be represented as Latin1
1407
 */
1408
simdutf_warn_unused size_t convert_utf16le_to_latin1(
1409
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1410
  #if SIMDUTF_SPAN
1411
simdutf_really_inline simdutf_warn_unused size_t convert_utf16le_to_latin1(
1412
    std::span<const char16_t> utf16_input,
1413
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1414
  return convert_utf16le_to_latin1(
1415
      utf16_input.data(), utf16_input.size(),
1416
      reinterpret_cast<char *>(latin1_output.data()));
1417
}
1418
  #endif // SIMDUTF_SPAN
1419
1420
/**
1421
 * Convert possibly broken UTF-16BE string into Latin1 string.
1422
 *
1423
 * During the conversion also validation of the input string is done.
1424
 * This function is suitable to work with inputs from untrusted sources.
1425
 *
1426
 * This function is not BOM-aware.
1427
 *
1428
 * @param input         the UTF-16BE string to convert
1429
 * @param length        the length of the string in 2-byte code units (char16_t)
1430
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1431
 * @return number of written code units; 0 if input is not a valid UTF-16BE
1432
 * string or if it cannot be represented as Latin1
1433
 */
1434
simdutf_warn_unused size_t convert_utf16be_to_latin1(
1435
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1436
  #if SIMDUTF_SPAN
1437
simdutf_really_inline simdutf_warn_unused size_t convert_utf16be_to_latin1(
1438
    std::span<const char16_t> utf16_input,
1439
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1440
  return convert_utf16be_to_latin1(
1441
      utf16_input.data(), utf16_input.size(),
1442
      reinterpret_cast<char *>(latin1_output.data()));
1443
}
1444
  #endif // SIMDUTF_SPAN
1445
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1446
1447
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1448
/**
1449
 * Convert possibly broken UTF-16LE string into UTF-8 string.
1450
 *
1451
 * During the conversion also validation of the input string is done.
1452
 * This function is suitable to work with inputs from untrusted sources.
1453
 *
1454
 * This function is not BOM-aware.
1455
 *
1456
 * @param input         the UTF-16LE string to convert
1457
 * @param length        the length of the string in 2-byte code units (char16_t)
1458
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1459
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1460
 * string
1461
 */
1462
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *input,
1463
                                                   size_t length,
1464
                                                   char *utf8_buffer) noexcept;
1465
  #if SIMDUTF_SPAN
1466
simdutf_really_inline simdutf_warn_unused size_t convert_utf16le_to_utf8(
1467
    std::span<const char16_t> utf16_input,
1468
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1469
  return convert_utf16le_to_utf8(utf16_input.data(), utf16_input.size(),
1470
                                 reinterpret_cast<char *>(utf8_output.data()));
1471
}
1472
  #endif // SIMDUTF_SPAN
1473
1474
/**
1475
 * Convert possibly broken UTF-16BE string into UTF-8 string.
1476
 *
1477
 * During the conversion also validation of the input string is done.
1478
 * This function is suitable to work with inputs from untrusted sources.
1479
 *
1480
 * This function is not BOM-aware.
1481
 *
1482
 * @param input         the UTF-16BE string to convert
1483
 * @param length        the length of the string in 2-byte code units (char16_t)
1484
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1485
 * @return number of written code units; 0 if input is not a valid UTF-16BE
1486
 * string
1487
 */
1488
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *input,
1489
                                                   size_t length,
1490
                                                   char *utf8_buffer) noexcept;
1491
  #if SIMDUTF_SPAN
1492
simdutf_really_inline simdutf_warn_unused size_t convert_utf16be_to_utf8(
1493
    std::span<const char16_t> utf16_input,
1494
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1495
  return convert_utf16be_to_utf8(utf16_input.data(), utf16_input.size(),
1496
                                 reinterpret_cast<char *>(utf8_output.data()));
1497
}
1498
  #endif // SIMDUTF_SPAN
1499
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1500
1501
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1502
/**
1503
 * Using native endianness, convert possibly broken UTF-16 string into Latin1
1504
 * string.
1505
 *
1506
 * During the conversion also validation of the input string is done.
1507
 * This function is suitable to work with inputs from untrusted sources.
1508
 * This function is not BOM-aware.
1509
 *
1510
 * @param input         the UTF-16 string to convert
1511
 * @param length        the length of the string in 2-byte code units (char16_t)
1512
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1513
 * @return a result pair struct (of type simdutf::result containing the two
1514
 * fields error and count) with an error code and either position of the error
1515
 * (in the input in code units) if any, or the number of char written if
1516
 * successful.
1517
 */
1518
simdutf_warn_unused result convert_utf16_to_latin1_with_errors(
1519
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1520
  #if SIMDUTF_SPAN
1521
simdutf_really_inline simdutf_warn_unused result
1522
convert_utf16_to_latin1_with_errors(
1523
    std::span<const char16_t> utf16_input,
1524
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1525
  return convert_utf16_to_latin1_with_errors(
1526
      utf16_input.data(), utf16_input.size(),
1527
      reinterpret_cast<char *>(latin1_output.data()));
1528
}
1529
  #endif // SIMDUTF_SPAN
1530
1531
/**
1532
 * Convert possibly broken UTF-16LE string into Latin1 string.
1533
 *
1534
 * During the conversion also validation of the input string is done.
1535
 * This function is suitable to work with inputs from untrusted sources.
1536
 * This function is not BOM-aware.
1537
 *
1538
 * @param input         the UTF-16LE string to convert
1539
 * @param length        the length of the string in 2-byte code units (char16_t)
1540
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1541
 * @return a result pair struct (of type simdutf::result containing the two
1542
 * fields error and count) with an error code and either position of the error
1543
 * (in the input in code units) if any, or the number of char written if
1544
 * successful.
1545
 */
1546
simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
1547
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1548
  #if SIMDUTF_SPAN
1549
simdutf_really_inline simdutf_warn_unused result
1550
convert_utf16le_to_latin1_with_errors(
1551
    std::span<const char16_t> utf16_input,
1552
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1553
  return convert_utf16le_to_latin1_with_errors(
1554
      utf16_input.data(), utf16_input.size(),
1555
      reinterpret_cast<char *>(latin1_output.data()));
1556
}
1557
  #endif // SIMDUTF_SPAN
1558
1559
/**
1560
 * Convert possibly broken UTF-16BE string into Latin1 string.
1561
 * If the string cannot be represented as Latin1, an error
1562
 * is returned.
1563
 *
1564
 * During the conversion also validation of the input string is done.
1565
 * This function is suitable to work with inputs from untrusted sources.
1566
 * This function is not BOM-aware.
1567
 *
1568
 * @param input         the UTF-16BE string to convert
1569
 * @param length        the length of the string in 2-byte code units (char16_t)
1570
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1571
 * @return a result pair struct (of type simdutf::result containing the two
1572
 * fields error and count) with an error code and either position of the error
1573
 * (in the input in code units) if any, or the number of char written if
1574
 * successful.
1575
 */
1576
simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
1577
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1578
  #if SIMDUTF_SPAN
1579
simdutf_really_inline simdutf_warn_unused result
1580
convert_utf16be_to_latin1_with_errors(
1581
    std::span<const char16_t> utf16_input,
1582
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1583
  return convert_utf16be_to_latin1_with_errors(
1584
      utf16_input.data(), utf16_input.size(),
1585
      reinterpret_cast<char *>(latin1_output.data()));
1586
}
1587
  #endif // SIMDUTF_SPAN
1588
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1589
1590
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1591
/**
1592
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1593
 * string and stop on error.
1594
 *
1595
 * During the conversion also validation of the input string is done.
1596
 * This function is suitable to work with inputs from untrusted sources.
1597
 *
1598
 * This function is not BOM-aware.
1599
 *
1600
 * @param input         the UTF-16 string to convert
1601
 * @param length        the length of the string in 2-byte code units (char16_t)
1602
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1603
 * @return a result pair struct (of type simdutf::result containing the two
1604
 * fields error and count) with an error code and either position of the error
1605
 * (in the input in code units) if any, or the number of char written if
1606
 * successful.
1607
 */
1608
simdutf_warn_unused result convert_utf16_to_utf8_with_errors(
1609
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1610
  #if SIMDUTF_SPAN
1611
simdutf_really_inline simdutf_warn_unused result
1612
convert_utf16_to_utf8_with_errors(
1613
    std::span<const char16_t> utf16_input,
1614
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1615
  return convert_utf16_to_utf8_with_errors(
1616
      utf16_input.data(), utf16_input.size(),
1617
      reinterpret_cast<char *>(utf8_output.data()));
1618
}
1619
  #endif // SIMDUTF_SPAN
1620
1621
/**
1622
 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
1623
 *
1624
 * During the conversion also validation of the input string is done.
1625
 * This function is suitable to work with inputs from untrusted sources.
1626
 *
1627
 * This function is not BOM-aware.
1628
 *
1629
 * @param input         the UTF-16LE string to convert
1630
 * @param length        the length of the string in 2-byte code units (char16_t)
1631
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1632
 * @return a result pair struct (of type simdutf::result containing the two
1633
 * fields error and count) with an error code and either position of the error
1634
 * (in the input in code units) if any, or the number of char written if
1635
 * successful.
1636
 */
1637
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
1638
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1639
  #if SIMDUTF_SPAN
1640
simdutf_really_inline simdutf_warn_unused result
1641
convert_utf16le_to_utf8_with_errors(
1642
    std::span<const char16_t> utf16_input,
1643
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1644
  return convert_utf16le_to_utf8_with_errors(
1645
      utf16_input.data(), utf16_input.size(),
1646
      reinterpret_cast<char *>(utf8_output.data()));
1647
}
1648
  #endif // SIMDUTF_SPAN
1649
1650
/**
1651
 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
1652
 *
1653
 * During the conversion also validation of the input string is done.
1654
 * This function is suitable to work with inputs from untrusted sources.
1655
 *
1656
 * This function is not BOM-aware.
1657
 *
1658
 * @param input         the UTF-16BE string to convert
1659
 * @param length        the length of the string in 2-byte code units (char16_t)
1660
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1661
 * @return a result pair struct (of type simdutf::result containing the two
1662
 * fields error and count) with an error code and either position of the error
1663
 * (in the input in code units) if any, or the number of char written if
1664
 * successful.
1665
 */
1666
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
1667
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1668
  #if SIMDUTF_SPAN
1669
simdutf_really_inline simdutf_warn_unused result
1670
convert_utf16be_to_utf8_with_errors(
1671
    std::span<const char16_t> utf16_input,
1672
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1673
  return convert_utf16be_to_utf8_with_errors(
1674
      utf16_input.data(), utf16_input.size(),
1675
      reinterpret_cast<char *>(utf8_output.data()));
1676
}
1677
  #endif // SIMDUTF_SPAN
1678
1679
/**
1680
 * Using native endianness, convert valid UTF-16 string into UTF-8 string.
1681
 *
1682
 * This function assumes that the input string is valid native-endian UTF-16.
1683
 *
1684
 * This function is not BOM-aware.
1685
 *
1686
 * @param input         the UTF-16 string to convert
1687
 * @param length        the length of the string in 2-byte code units (char16_t)
1688
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
1689
 * result
1690
 * @return number of written code units; 0 if conversion is not possible
1691
 */
1692
simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
1693
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1694
  #if SIMDUTF_SPAN
1695
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
1696
    std::span<const char16_t> valid_utf16_input,
1697
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1698
  return convert_valid_utf16_to_utf8(
1699
      valid_utf16_input.data(), valid_utf16_input.size(),
1700
      reinterpret_cast<char *>(utf8_output.data()));
1701
}
1702
  #endif // SIMDUTF_SPAN
1703
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1704
1705
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1706
/**
1707
 * Using native endianness, convert UTF-16 string into Latin1 string.
1708
 *
1709
 * This function assumes that the input string is valid UTF-16 and that it can
1710
 * be represented as Latin1. If you violate this assumption, the result is
1711
 * implementation defined and may include system-dependent behavior such as
1712
 * crashes.
1713
 *
1714
 * This function is for expert users only and not part of our public API. Use
1715
 * convert_utf16_to_latin1 instead. The function may be removed from the library
1716
 * in the future.
1717
 *
1718
 * This function is not BOM-aware.
1719
 *
1720
 * @param input         the UTF-16 string to convert
1721
 * @param length        the length of the string in 2-byte code units (char16_t)
1722
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1723
 * @return number of written code units; 0 if conversion is not possible
1724
 */
1725
simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
1726
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1727
  #if SIMDUTF_SPAN
1728
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
1729
    std::span<const char16_t> valid_utf16_input,
1730
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1731
  return convert_valid_utf16_to_latin1(
1732
      valid_utf16_input.data(), valid_utf16_input.size(),
1733
      reinterpret_cast<char *>(latin1_output.data()));
1734
}
1735
  #endif // SIMDUTF_SPAN
1736
1737
/**
1738
 * Convert valid UTF-16LE string into Latin1 string.
1739
 *
1740
 * This function assumes that the input string is valid UTF-16LE and that it can
1741
 * be represented as Latin1. If you violate this assumption, the result is
1742
 * implementation defined and may include system-dependent behavior such as
1743
 * crashes.
1744
 *
1745
 * This function is for expert users only and not part of our public API. Use
1746
 * convert_utf16le_to_latin1 instead. The function may be removed from the
1747
 * library in the future.
1748
 *
1749
 * This function is not BOM-aware.
1750
 *
1751
 * @param input         the UTF-16LE string to convert
1752
 * @param length        the length of the string in 2-byte code units (char16_t)
1753
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1754
 * @return number of written code units; 0 if conversion is not possible
1755
 */
1756
simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
1757
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1758
  #if SIMDUTF_SPAN
1759
simdutf_really_inline simdutf_warn_unused size_t
1760
convert_valid_utf16le_to_latin1(
1761
    std::span<const char16_t> valid_utf16_input,
1762
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1763
  return convert_valid_utf16le_to_latin1(
1764
      valid_utf16_input.data(), valid_utf16_input.size(),
1765
      reinterpret_cast<char *>(latin1_output.data()));
1766
}
1767
  #endif // SIMDUTF_SPAN
1768
1769
/**
1770
 * Convert valid UTF-16BE string into Latin1 string.
1771
 *
1772
 * This function assumes that the input string is valid UTF-16BE and that it can
1773
 * be represented as Latin1. If you violate this assumption, the result is
1774
 * implementation defined and may include system-dependent behavior such as
1775
 * crashes.
1776
 *
1777
 * This function is for expert users only and not part of our public API. Use
1778
 * convert_utf16be_to_latin1 instead. The function may be removed from the
1779
 * library in the future.
1780
 *
1781
 * This function is not BOM-aware.
1782
 *
1783
 * @param input         the UTF-16BE string to convert
1784
 * @param length        the length of the string in 2-byte code units (char16_t)
1785
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1786
 * @return number of written code units; 0 if conversion is not possible
1787
 */
1788
simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
1789
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1790
  #if SIMDUTF_SPAN
1791
simdutf_really_inline simdutf_warn_unused size_t
1792
convert_valid_utf16be_to_latin1(
1793
    std::span<const char16_t> valid_utf16_input,
1794
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1795
  return convert_valid_utf16be_to_latin1(
1796
      valid_utf16_input.data(), valid_utf16_input.size(),
1797
      reinterpret_cast<char *>(latin1_output.data()));
1798
}
1799
  #endif // SIMDUTF_SPAN
1800
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1801
1802
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1803
/**
1804
 * Convert valid UTF-16LE string into UTF-8 string.
1805
 *
1806
 * This function assumes that the input string is valid UTF-16LE (little-endian
1807
 * byte order).
1808
 *
1809
 * This function is not BOM-aware.
1810
 *
1811
 * @param input         the UTF-16LE string to convert
1812
 * @param length        the length of the string in 2-byte code units (char16_t)
1813
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
1814
 * result
1815
 * @return number of written code units; 0 if conversion is not possible
1816
 */
1817
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
1818
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1819
  #if SIMDUTF_SPAN
1820
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
1821
    std::span<const char16_t> valid_utf16_input,
1822
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1823
  return convert_valid_utf16le_to_utf8(
1824
      valid_utf16_input.data(), valid_utf16_input.size(),
1825
      reinterpret_cast<char *>(utf8_output.data()));
1826
}
1827
  #endif // SIMDUTF_SPAN
1828
1829
/**
1830
 * Convert valid UTF-16BE string into UTF-8 string.
1831
 *
1832
 * This function assumes that the input string is valid UTF-16BE.
1833
 *
1834
 * This function is not BOM-aware.
1835
 *
1836
 * @param input         the UTF-16BE string to convert
1837
 * @param length        the length of the string in 2-byte code units (char16_t)
1838
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
1839
 * result
1840
 * @return number of written code units; 0 if conversion is not possible
1841
 */
1842
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
1843
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
1844
  #if SIMDUTF_SPAN
1845
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
1846
    std::span<const char16_t> valid_utf16_input,
1847
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1848
  return convert_valid_utf16be_to_utf8(
1849
      valid_utf16_input.data(), valid_utf16_input.size(),
1850
      reinterpret_cast<char *>(utf8_output.data()));
1851
}
1852
  #endif // SIMDUTF_SPAN
1853
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1854
1855
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
1856
/**
1857
 * Using native endianness, convert possibly broken UTF-16 string into UTF-32
1858
 * string.
1859
 *
1860
 * During the conversion also validation of the input string is done.
1861
 * This function is suitable to work with inputs from untrusted sources.
1862
 *
1863
 * This function is not BOM-aware.
1864
 *
1865
 * @param input         the UTF-16 string to convert
1866
 * @param length        the length of the string in 2-byte code units (char16_t)
1867
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1868
 * @return number of written code units; 0 if input is not a valid UTF-16
1869
 * string
1870
 */
1871
simdutf_warn_unused size_t convert_utf16_to_utf32(
1872
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1873
  #if SIMDUTF_SPAN
1874
simdutf_really_inline simdutf_warn_unused size_t
1875
convert_utf16_to_utf32(std::span<const char16_t> utf16_input,
1876
0
                       std::span<char32_t> utf32_output) noexcept {
1877
0
  return convert_utf16_to_utf32(utf16_input.data(), utf16_input.size(),
1878
0
                                utf32_output.data());
1879
0
}
1880
  #endif // SIMDUTF_SPAN
1881
1882
/**
1883
 * Convert possibly broken UTF-16LE string into UTF-32 string.
1884
 *
1885
 * During the conversion also validation of the input string is done.
1886
 * This function is suitable to work with inputs from untrusted sources.
1887
 *
1888
 * This function is not BOM-aware.
1889
 *
1890
 * @param input         the UTF-16LE string to convert
1891
 * @param length        the length of the string in 2-byte code units (char16_t)
1892
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1893
 * @return number of written code units; 0 if input is not a valid UTF-16LE
1894
 * string
1895
 */
1896
simdutf_warn_unused size_t convert_utf16le_to_utf32(
1897
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1898
  #if SIMDUTF_SPAN
1899
simdutf_really_inline simdutf_warn_unused size_t
1900
convert_utf16le_to_utf32(std::span<const char16_t> utf16_input,
1901
0
                         std::span<char32_t> utf32_output) noexcept {
1902
0
  return convert_utf16le_to_utf32(utf16_input.data(), utf16_input.size(),
1903
0
                                  utf32_output.data());
1904
0
}
1905
  #endif // SIMDUTF_SPAN
1906
1907
/**
1908
 * Convert possibly broken UTF-16BE string into UTF-32 string.
1909
 *
1910
 * During the conversion also validation of the input string is done.
1911
 * This function is suitable to work with inputs from untrusted sources.
1912
 *
1913
 * This function is not BOM-aware.
1914
 *
1915
 * @param input         the UTF-16BE string to convert
1916
 * @param length        the length of the string in 2-byte code units (char16_t)
1917
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1918
 * @return number of written code units; 0 if input is not a valid UTF-16BE
1919
 * string
1920
 */
1921
simdutf_warn_unused size_t convert_utf16be_to_utf32(
1922
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1923
  #if SIMDUTF_SPAN
1924
simdutf_really_inline simdutf_warn_unused size_t
1925
convert_utf16be_to_utf32(std::span<const char16_t> utf16_input,
1926
0
                         std::span<char32_t> utf32_output) noexcept {
1927
0
  return convert_utf16be_to_utf32(utf16_input.data(), utf16_input.size(),
1928
0
                                  utf32_output.data());
1929
0
}
1930
  #endif // SIMDUTF_SPAN
1931
1932
/**
1933
 * Using native endianness, convert possibly broken UTF-16 string into
1934
 * UTF-32 string and stop on error.
1935
 *
1936
 * During the conversion also validation of the input string is done.
1937
 * This function is suitable to work with inputs from untrusted sources.
1938
 *
1939
 * This function is not BOM-aware.
1940
 *
1941
 * @param input         the UTF-16 string to convert
1942
 * @param length        the length of the string in 2-byte code units (char16_t)
1943
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1944
 * @return a result pair struct (of type simdutf::result containing the two
1945
 * fields error and count) with an error code and either position of the error
1946
 * (in the input in code units) if any, or the number of char32_t written if
1947
 * successful.
1948
 */
1949
simdutf_warn_unused result convert_utf16_to_utf32_with_errors(
1950
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1951
  #if SIMDUTF_SPAN
1952
simdutf_really_inline simdutf_warn_unused result
1953
convert_utf16_to_utf32_with_errors(std::span<const char16_t> utf16_input,
1954
0
                                   std::span<char32_t> utf32_output) noexcept {
1955
0
  return convert_utf16_to_utf32_with_errors(
1956
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
1957
0
}
1958
  #endif // SIMDUTF_SPAN
1959
1960
/**
1961
 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
1962
 *
1963
 * During the conversion also validation of the input string is done.
1964
 * This function is suitable to work with inputs from untrusted sources.
1965
 *
1966
 * This function is not BOM-aware.
1967
 *
1968
 * @param input         the UTF-16LE string to convert
1969
 * @param length        the length of the string in 2-byte code units (char16_t)
1970
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1971
 * @return a result pair struct (of type simdutf::result containing the two
1972
 * fields error and count) with an error code and either position of the error
1973
 * (in the input in code units) if any, or the number of char32_t written if
1974
 * successful.
1975
 */
1976
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
1977
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
1978
  #if SIMDUTF_SPAN
1979
simdutf_really_inline simdutf_warn_unused result
1980
convert_utf16le_to_utf32_with_errors(
1981
    std::span<const char16_t> utf16_input,
1982
0
    std::span<char32_t> utf32_output) noexcept {
1983
0
  return convert_utf16le_to_utf32_with_errors(
1984
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
1985
0
}
1986
  #endif // SIMDUTF_SPAN
1987
1988
/**
1989
 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
1990
 *
1991
 * During the conversion also validation of the input string is done.
1992
 * This function is suitable to work with inputs from untrusted sources.
1993
 *
1994
 * This function is not BOM-aware.
1995
 *
1996
 * @param input         the UTF-16BE string to convert
1997
 * @param length        the length of the string in 2-byte code units (char16_t)
1998
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1999
 * @return a result pair struct (of type simdutf::result containing the two
2000
 * fields error and count) with an error code and either position of the error
2001
 * (in the input in code units) if any, or the number of char32_t written if
2002
 * successful.
2003
 */
2004
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
2005
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2006
  #if SIMDUTF_SPAN
2007
simdutf_really_inline simdutf_warn_unused result
2008
convert_utf16be_to_utf32_with_errors(
2009
    std::span<const char16_t> utf16_input,
2010
0
    std::span<char32_t> utf32_output) noexcept {
2011
0
  return convert_utf16be_to_utf32_with_errors(
2012
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
2013
0
}
2014
  #endif // SIMDUTF_SPAN
2015
2016
/**
2017
 * Using native endianness, convert valid UTF-16 string into UTF-32 string.
2018
 *
2019
 * This function assumes that the input string is valid UTF-16 (native
2020
 * endianness).
2021
 *
2022
 * This function is not BOM-aware.
2023
 *
2024
 * @param input         the UTF-16 string to convert
2025
 * @param length        the length of the string in 2-byte code units (char16_t)
2026
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
2027
 * result
2028
 * @return number of written code units; 0 if conversion is not possible
2029
 */
2030
simdutf_warn_unused size_t convert_valid_utf16_to_utf32(
2031
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2032
  #if SIMDUTF_SPAN
2033
simdutf_really_inline simdutf_warn_unused size_t
2034
convert_valid_utf16_to_utf32(std::span<const char16_t> valid_utf16_input,
2035
0
                             std::span<char32_t> utf32_output) noexcept {
2036
0
  return convert_valid_utf16_to_utf32(
2037
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
2038
0
}
2039
  #endif // SIMDUTF_SPAN
2040
2041
/**
2042
 * Convert valid UTF-16LE string into UTF-32 string.
2043
 *
2044
 * This function assumes that the input string is valid UTF-16LE.
2045
 *
2046
 * This function is not BOM-aware.
2047
 *
2048
 * @param input         the UTF-16LE string to convert
2049
 * @param length        the length of the string in 2-byte code units (char16_t)
2050
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
2051
 * result
2052
 * @return number of written code units; 0 if conversion is not possible
2053
 */
2054
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
2055
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2056
  #if SIMDUTF_SPAN
2057
simdutf_really_inline simdutf_warn_unused size_t
2058
convert_valid_utf16le_to_utf32(std::span<const char16_t> valid_utf16_input,
2059
0
                               std::span<char32_t> utf32_output) noexcept {
2060
0
  return convert_valid_utf16le_to_utf32(
2061
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
2062
0
}
2063
  #endif // SIMDUTF_SPAN
2064
2065
/**
2066
 * Convert valid UTF-16BE string into UTF-32 string.
2067
 *
2068
 * This function assumes that the input string is valid UTF-16BE.
2069
 *
2070
 * This function is not BOM-aware.
2071
 *
2072
 * @param input         the UTF-16BE string to convert
2073
 * @param length        the length of the string in 2-byte code units (char16_t)
2074
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
2075
 * result
2076
 * @return number of written code units; 0 if conversion is not possible
2077
 */
2078
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
2079
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2080
  #if SIMDUTF_SPAN
2081
simdutf_really_inline simdutf_warn_unused size_t
2082
convert_valid_utf16be_to_utf32(std::span<const char16_t> valid_utf16_input,
2083
0
                               std::span<char32_t> utf32_output) noexcept {
2084
0
  return convert_valid_utf16be_to_utf32(
2085
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
2086
0
}
2087
  #endif // SIMDUTF_SPAN
2088
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2089
2090
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2091
/**
2092
 * Compute the number of bytes that this UTF-16LE/BE string would require in
2093
 * Latin1 format.
2094
 *
2095
 * This function does not validate the input. It is acceptable to pass invalid
2096
 * UTF-16 strings but in such cases the result is implementation defined.
2097
 *
2098
 * This function is not BOM-aware.
2099
 *
2100
 * @param length        the length of the string in 2-byte code units (char16_t)
2101
 * @return the number of bytes required to encode the UTF-16 string as Latin1
2102
 */
2103
simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
2104
2105
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2106
2107
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2108
/**
2109
 * Using native endianness; Compute the number of bytes that this UTF-16
2110
 * string would require in UTF-8 format.
2111
 *
2112
 * This function does not validate the input. It is acceptable to pass invalid
2113
 * UTF-16 strings but in such cases the result is implementation defined.
2114
 *
2115
 * @param input         the UTF-16 string to convert
2116
 * @param length        the length of the string in 2-byte code units (char16_t)
2117
 * @return the number of bytes required to encode the UTF-16 string as UTF-8
2118
 */
2119
simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input,
2120
                                                  size_t length) noexcept;
2121
  #if SIMDUTF_SPAN
2122
simdutf_really_inline simdutf_warn_unused size_t
2123
0
utf8_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2124
0
  return utf8_length_from_utf16(valid_utf16_input.data(),
2125
0
                                valid_utf16_input.size());
2126
0
}
2127
  #endif // SIMDUTF_SPAN
2128
2129
/**
2130
 * Using native endianness; compute the number of bytes that this UTF-16
2131
 * string would require in UTF-8 format even when the UTF-16 content contains
2132
 * mismatched surrogates that have to be replaced by the replacement character
2133
 * (0xFFFD).
2134
 *
2135
 * @param input         the UTF-16 string to convert
2136
 * @param length        the length of the string in 2-byte code units (char16_t)
2137
 * @return a result pair struct (of type simdutf::result containing the two
2138
 * fields error and count) where the count is the number of bytes required to
2139
 * encode the UTF-16 string as UTF-8, and the error code is either SUCCESS or
2140
 * SURROGATE. The count is correct regardless of the error field.
2141
 * When SURROGATE is returned, it does not indicate an error in the case of this
2142
 * function: it indicates that at least one surrogate has been encountered: the
2143
 * surrogates may be matched or not (thus this function does not validate). If
2144
 * the returned error code is SUCCESS, then the input contains no surrogate, is
2145
 * in the Basic Multilingual Plane, and is necessarily valid.
2146
 */
2147
simdutf_warn_unused result utf8_length_from_utf16_with_replacement(
2148
    const char16_t *input, size_t length) noexcept;
2149
  #if SIMDUTF_SPAN
2150
simdutf_really_inline simdutf_warn_unused result
2151
utf8_length_from_utf16_with_replacement(
2152
0
    std::span<const char16_t> valid_utf16_input) noexcept {
2153
0
  return utf8_length_from_utf16_with_replacement(valid_utf16_input.data(),
2154
0
                                                 valid_utf16_input.size());
2155
0
}
2156
  #endif // SIMDUTF_SPAN
2157
2158
/**
2159
 * Compute the number of bytes that this UTF-16LE string would require in UTF-8
2160
 * format.
2161
 *
2162
 * This function does not validate the input. It is acceptable to pass invalid
2163
 * UTF-16 strings but in such cases the result is implementation defined.
2164
 *
2165
 * @param input         the UTF-16LE string to convert
2166
 * @param length        the length of the string in 2-byte code units (char16_t)
2167
 * @return the number of bytes required to encode the UTF-16LE string as UTF-8
2168
 */
2169
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input,
2170
                                                    size_t length) noexcept;
2171
  #if SIMDUTF_SPAN
2172
simdutf_really_inline simdutf_warn_unused size_t
2173
0
utf8_length_from_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
2174
0
  return utf8_length_from_utf16le(valid_utf16_input.data(),
2175
0
                                  valid_utf16_input.size());
2176
0
}
2177
  #endif // SIMDUTF_SPAN
2178
2179
/**
2180
 * Compute the number of bytes that this UTF-16BE string would require in UTF-8
2181
 * format.
2182
 *
2183
 * This function does not validate the input. It is acceptable to pass invalid
2184
 * UTF-16 strings but in such cases the result is implementation defined.
2185
 *
2186
 * @param input         the UTF-16BE string to convert
2187
 * @param length        the length of the string in 2-byte code units (char16_t)
2188
 * @return the number of bytes required to encode the UTF-16BE string as UTF-8
2189
 */
2190
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input,
2191
                                                    size_t length) noexcept;
2192
  #if SIMDUTF_SPAN
2193
simdutf_really_inline simdutf_warn_unused size_t
2194
0
utf8_length_from_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
2195
0
  return utf8_length_from_utf16be(valid_utf16_input.data(),
2196
0
                                  valid_utf16_input.size());
2197
0
}
2198
  #endif // SIMDUTF_SPAN
2199
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2200
2201
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2202
/**
2203
 * Convert possibly broken UTF-32 string into UTF-8 string.
2204
 *
2205
 * During the conversion also validation of the input string is done.
2206
 * This function is suitable to work with inputs from untrusted sources.
2207
 *
2208
 * This function is not BOM-aware.
2209
 *
2210
 * @param input         the UTF-32 string to convert
2211
 * @param length        the length of the string in 4-byte code units (char32_t)
2212
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2213
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2214
 */
2215
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *input,
2216
                                                 size_t length,
2217
                                                 char *utf8_buffer) noexcept;
2218
  #if SIMDUTF_SPAN
2219
simdutf_really_inline simdutf_warn_unused size_t convert_utf32_to_utf8(
2220
    std::span<const char32_t> utf32_input,
2221
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2222
  return convert_utf32_to_utf8(utf32_input.data(), utf32_input.size(),
2223
                               reinterpret_cast<char *>(utf8_output.data()));
2224
}
2225
  #endif // SIMDUTF_SPAN
2226
2227
/**
2228
 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
2229
 *
2230
 * During the conversion also validation of the input string is done.
2231
 * This function is suitable to work with inputs from untrusted sources.
2232
 *
2233
 * This function is not BOM-aware.
2234
 *
2235
 * @param input         the UTF-32 string to convert
2236
 * @param length        the length of the string in 4-byte code units (char32_t)
2237
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2238
 * @return a result pair struct (of type simdutf::result containing the two
2239
 * fields error and count) with an error code and either position of the error
2240
 * (in the input in code units) if any, or the number of char written if
2241
 * successful.
2242
 */
2243
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
2244
    const char32_t *input, size_t length, char *utf8_buffer) noexcept;
2245
  #if SIMDUTF_SPAN
2246
simdutf_really_inline simdutf_warn_unused result
2247
convert_utf32_to_utf8_with_errors(
2248
    std::span<const char32_t> utf32_input,
2249
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2250
  return convert_utf32_to_utf8_with_errors(
2251
      utf32_input.data(), utf32_input.size(),
2252
      reinterpret_cast<char *>(utf8_output.data()));
2253
}
2254
  #endif // SIMDUTF_SPAN
2255
2256
/**
2257
 * Convert valid UTF-32 string into UTF-8 string.
2258
 *
2259
 * This function assumes that the input string is valid UTF-32.
2260
 *
2261
 * This function is not BOM-aware.
2262
 *
2263
 * @param input         the UTF-32 string to convert
2264
 * @param length        the length of the string in 4-byte code units (char32_t)
2265
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
2266
 * result
2267
 * @return number of written code units; 0 if conversion is not possible
2268
 */
2269
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
2270
    const char32_t *input, size_t length, char *utf8_buffer) noexcept;
2271
  #if SIMDUTF_SPAN
2272
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
2273
    std::span<const char32_t> valid_utf32_input,
2274
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2275
  return convert_valid_utf32_to_utf8(
2276
      valid_utf32_input.data(), valid_utf32_input.size(),
2277
      reinterpret_cast<char *>(utf8_output.data()));
2278
}
2279
  #endif // SIMDUTF_SPAN
2280
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2281
2282
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2283
/**
2284
 * Using native endianness, convert possibly broken UTF-32 string into a UTF-16
2285
 * string.
2286
 *
2287
 * During the conversion also validation of the input string is done.
2288
 * This function is suitable to work with inputs from untrusted sources.
2289
 *
2290
 * This function is not BOM-aware.
2291
 *
2292
 * @param input         the UTF-32 string to convert
2293
 * @param length        the length of the string in 4-byte code units (char32_t)
2294
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2295
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2296
 */
2297
simdutf_warn_unused size_t convert_utf32_to_utf16(
2298
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2299
  #if SIMDUTF_SPAN
2300
simdutf_really_inline simdutf_warn_unused size_t
2301
convert_utf32_to_utf16(std::span<const char32_t> utf32_input,
2302
0
                       std::span<char16_t> utf16_output) noexcept {
2303
0
  return convert_utf32_to_utf16(utf32_input.data(), utf32_input.size(),
2304
0
                                utf16_output.data());
2305
0
}
2306
  #endif // SIMDUTF_SPAN
2307
2308
/**
2309
 * Convert possibly broken UTF-32 string into UTF-16LE string.
2310
 *
2311
 * During the conversion also validation of the input string is done.
2312
 * This function is suitable to work with inputs from untrusted sources.
2313
 *
2314
 * This function is not BOM-aware.
2315
 *
2316
 * @param input         the UTF-32 string to convert
2317
 * @param length        the length of the string in 4-byte code units (char32_t)
2318
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2319
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2320
 */
2321
simdutf_warn_unused size_t convert_utf32_to_utf16le(
2322
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2323
  #if SIMDUTF_SPAN
2324
simdutf_really_inline simdutf_warn_unused size_t
2325
convert_utf32_to_utf16le(std::span<const char32_t> utf32_input,
2326
0
                         std::span<char16_t> utf16_output) noexcept {
2327
0
  return convert_utf32_to_utf16le(utf32_input.data(), utf32_input.size(),
2328
0
                                  utf16_output.data());
2329
0
}
2330
  #endif // SIMDUTF_SPAN
2331
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2332
2333
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
2334
/**
2335
 * Convert possibly broken UTF-32 string into Latin1 string.
2336
 *
2337
 * During the conversion also validation of the input string is done.
2338
 * This function is suitable to work with inputs from untrusted sources.
2339
 *
2340
 * This function is not BOM-aware.
2341
 *
2342
 * @param input         the UTF-32 string to convert
2343
 * @param length        the length of the string in 4-byte code units (char32_t)
2344
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2345
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2346
 * or if it cannot be represented as Latin1
2347
 */
2348
simdutf_warn_unused size_t convert_utf32_to_latin1(
2349
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
2350
  #if SIMDUTF_SPAN
2351
simdutf_really_inline simdutf_warn_unused size_t convert_utf32_to_latin1(
2352
    std::span<const char32_t> utf32_input,
2353
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2354
  return convert_utf32_to_latin1(
2355
      utf32_input.data(), utf32_input.size(),
2356
      reinterpret_cast<char *>(latin1_output.data()));
2357
}
2358
  #endif // SIMDUTF_SPAN
2359
2360
/**
2361
 * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
2362
 * If the string cannot be represented as Latin1, an error is returned.
2363
 *
2364
 * During the conversion also validation of the input string is done.
2365
 * This function is suitable to work with inputs from untrusted sources.
2366
 *
2367
 * This function is not BOM-aware.
2368
 *
2369
 * @param input         the UTF-32 string to convert
2370
 * @param length        the length of the string in 4-byte code units (char32_t)
2371
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2372
 * @return a result pair struct (of type simdutf::result containing the two
2373
 * fields error and count) with an error code and either position of the error
2374
 * (in the input in code units) if any, or the number of char written if
2375
 * successful.
2376
 */
2377
simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
2378
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
2379
  #if SIMDUTF_SPAN
2380
simdutf_really_inline simdutf_warn_unused result
2381
convert_utf32_to_latin1_with_errors(
2382
    std::span<const char32_t> utf32_input,
2383
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2384
  return convert_utf32_to_latin1_with_errors(
2385
      utf32_input.data(), utf32_input.size(),
2386
      reinterpret_cast<char *>(latin1_output.data()));
2387
}
2388
  #endif // SIMDUTF_SPAN
2389
2390
/**
2391
 * Convert valid UTF-32 string into Latin1 string.
2392
 *
2393
 * This function assumes that the input string is valid UTF-32 and that it can
2394
 * be represented as Latin1. If you violate this assumption, the result is
2395
 * implementation defined and may include system-dependent behavior such as
2396
 * crashes.
2397
 *
2398
 * This function is for expert users only and not part of our public API. Use
2399
 * convert_utf32_to_latin1 instead. The function may be removed from the library
2400
 * in the future.
2401
 *
2402
 * This function is not BOM-aware.
2403
 *
2404
 * @param input         the UTF-32 string to convert
2405
 * @param length        the length of the string in 4-byte code units (char32_t)
2406
 * @param latin1_buffer   the pointer to a buffer that can hold the conversion
2407
 * result
2408
 * @return number of written code units; 0 if conversion is not possible
2409
 */
2410
simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
2411
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
2412
  #if SIMDUTF_SPAN
2413
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
2414
    std::span<const char32_t> valid_utf32_input,
2415
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2416
  return convert_valid_utf32_to_latin1(
2417
      valid_utf32_input.data(), valid_utf32_input.size(),
2418
      reinterpret_cast<char *>(latin1_output.data()));
2419
}
2420
  #endif // SIMDUTF_SPAN
2421
2422
/**
2423
 * Compute the number of bytes that this UTF-32 string would require in Latin1
2424
 * format.
2425
 *
2426
 * This function does not validate the input. It is acceptable to pass invalid
2427
 * UTF-32 strings but in such cases the result is implementation defined.
2428
 *
2429
 * This function is not BOM-aware.
2430
 *
2431
 * @param length        the length of the string in 4-byte code units (char32_t)
2432
 * @return the number of bytes required to encode the UTF-32 string as Latin1
2433
 */
2434
simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) noexcept;
2435
2436
/**
2437
 * Compute the number of 4-byte code units (char32_t) that this Latin1 string
2438
 * would require in UTF-32 format.
2439
 *
2440
 * @param length        the length of the string in Latin1 code units (char)
2441
 * @return the length of the string in 4-byte code units (char32_t) required to
2442
 * encode the Latin1 string as UTF-32
2443
 */
2444
simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) noexcept;
2445
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
2446
2447
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2448
/**
2449
 * Convert possibly broken UTF-32 string into UTF-16BE string.
2450
 *
2451
 * During the conversion also validation of the input string is done.
2452
 * This function is suitable to work with inputs from untrusted sources.
2453
 *
2454
 * This function is not BOM-aware.
2455
 *
2456
 * @param input         the UTF-32 string to convert
2457
 * @param length        the length of the string in 4-byte code units (char32_t)
2458
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2459
 * @return number of written code units; 0 if input is not a valid UTF-32 string
2460
 */
2461
simdutf_warn_unused size_t convert_utf32_to_utf16be(
2462
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2463
  #if SIMDUTF_SPAN
2464
simdutf_really_inline simdutf_warn_unused size_t
2465
convert_utf32_to_utf16be(std::span<const char32_t> utf32_input,
2466
0
                         std::span<char16_t> utf16_output) noexcept {
2467
0
  return convert_utf32_to_utf16be(utf32_input.data(), utf32_input.size(),
2468
0
                                  utf16_output.data());
2469
0
}
2470
  #endif // SIMDUTF_SPAN
2471
2472
/**
2473
 * Using native endianness, convert possibly broken UTF-32 string into UTF-16
2474
 * string and stop on error.
2475
 *
2476
 * During the conversion also validation of the input string is done.
2477
 * This function is suitable to work with inputs from untrusted sources.
2478
 *
2479
 * This function is not BOM-aware.
2480
 *
2481
 * @param input         the UTF-32 string to convert
2482
 * @param length        the length of the string in 4-byte code units (char32_t)
2483
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2484
 * @return a result pair struct (of type simdutf::result containing the two
2485
 * fields error and count) with an error code and either position of the error
2486
 * (in the input in code units) if any, or the number of char16_t written if
2487
 * successful.
2488
 */
2489
simdutf_warn_unused result convert_utf32_to_utf16_with_errors(
2490
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2491
  #if SIMDUTF_SPAN
2492
simdutf_really_inline simdutf_warn_unused result
2493
convert_utf32_to_utf16_with_errors(std::span<const char32_t> utf32_input,
2494
0
                                   std::span<char16_t> utf16_output) noexcept {
2495
0
  return convert_utf32_to_utf16_with_errors(
2496
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
2497
0
}
2498
  #endif // SIMDUTF_SPAN
2499
2500
/**
2501
 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
2502
 *
2503
 * During the conversion also validation of the input string is done.
2504
 * This function is suitable to work with inputs from untrusted sources.
2505
 *
2506
 * This function is not BOM-aware.
2507
 *
2508
 * @param input         the UTF-32 string to convert
2509
 * @param length        the length of the string in 4-byte code units (char32_t)
2510
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2511
 * @return a result pair struct (of type simdutf::result containing the two
2512
 * fields error and count) with an error code and either position of the error
2513
 * (in the input in code units) if any, or the number of char16_t written if
2514
 * successful.
2515
 */
2516
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
2517
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2518
  #if SIMDUTF_SPAN
2519
simdutf_really_inline simdutf_warn_unused result
2520
convert_utf32_to_utf16le_with_errors(
2521
    std::span<const char32_t> utf32_input,
2522
0
    std::span<char16_t> utf16_output) noexcept {
2523
0
  return convert_utf32_to_utf16le_with_errors(
2524
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
2525
0
}
2526
  #endif // SIMDUTF_SPAN
2527
2528
/**
2529
 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
2530
 *
2531
 * During the conversion also validation of the input string is done.
2532
 * This function is suitable to work with inputs from untrusted sources.
2533
 *
2534
 * This function is not BOM-aware.
2535
 *
2536
 * @param input         the UTF-32 string to convert
2537
 * @param length        the length of the string in 4-byte code units (char32_t)
2538
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2539
 * @return a result pair struct (of type simdutf::result containing the two
2540
 * fields error and count) with an error code and either position of the error
2541
 * (in the input in code units) if any, or the number of char16_t written if
2542
 * successful.
2543
 */
2544
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
2545
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2546
  #if SIMDUTF_SPAN
2547
simdutf_really_inline simdutf_warn_unused result
2548
convert_utf32_to_utf16be_with_errors(
2549
    std::span<const char32_t> utf32_input,
2550
0
    std::span<char16_t> utf16_output) noexcept {
2551
0
  return convert_utf32_to_utf16be_with_errors(
2552
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
2553
0
}
2554
  #endif // SIMDUTF_SPAN
2555
2556
/**
2557
 * Using native endianness, convert valid UTF-32 string into a UTF-16 string.
2558
 *
2559
 * This function assumes that the input string is valid UTF-32.
2560
 *
2561
 * This function is not BOM-aware.
2562
 *
2563
 * @param input         the UTF-32 string to convert
2564
 * @param length        the length of the string in 4-byte code units (char32_t)
2565
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
2566
 * result
2567
 * @return number of written code units; 0 if conversion is not possible
2568
 */
2569
simdutf_warn_unused size_t convert_valid_utf32_to_utf16(
2570
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2571
  #if SIMDUTF_SPAN
2572
simdutf_really_inline simdutf_warn_unused size_t
2573
convert_valid_utf32_to_utf16(std::span<const char32_t> valid_utf32_input,
2574
0
                             std::span<char16_t> utf16_output) noexcept {
2575
0
  return convert_valid_utf32_to_utf16(
2576
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
2577
0
}
2578
  #endif // SIMDUTF_SPAN
2579
2580
/**
2581
 * Convert valid UTF-32 string into UTF-16LE string.
2582
 *
2583
 * This function assumes that the input string is valid UTF-32.
2584
 *
2585
 * This function is not BOM-aware.
2586
 *
2587
 * @param input         the UTF-32 string to convert
2588
 * @param length        the length of the string in 4-byte code units (char32_t)
2589
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
2590
 * result
2591
 * @return number of written code units; 0 if conversion is not possible
2592
 */
2593
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
2594
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2595
  #if SIMDUTF_SPAN
2596
simdutf_really_inline simdutf_warn_unused size_t
2597
convert_valid_utf32_to_utf16le(std::span<const char32_t> valid_utf32_input,
2598
0
                               std::span<char16_t> utf16_output) noexcept {
2599
0
  return convert_valid_utf32_to_utf16le(
2600
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
2601
0
}
2602
  #endif // SIMDUTF_SPAN
2603
2604
/**
2605
 * Convert valid UTF-32 string into UTF-16BE string.
2606
 *
2607
 * This function assumes that the input string is valid UTF-32.
2608
 *
2609
 * This function is not BOM-aware.
2610
 *
2611
 * @param input         the UTF-32 string to convert
2612
 * @param length        the length of the string in 4-byte code units (char32_t)
2613
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
2614
 * result
2615
 * @return number of written code units; 0 if conversion is not possible
2616
 */
2617
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
2618
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
2619
  #if SIMDUTF_SPAN
2620
simdutf_really_inline simdutf_warn_unused size_t
2621
convert_valid_utf32_to_utf16be(std::span<const char32_t> valid_utf32_input,
2622
0
                               std::span<char16_t> utf16_output) noexcept {
2623
0
  return convert_valid_utf32_to_utf16be(
2624
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
2625
0
}
2626
  #endif // SIMDUTF_SPAN
2627
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2628
2629
#if SIMDUTF_FEATURE_UTF16
2630
/**
2631
 * Change the endianness of the input. Can be used to go from UTF-16LE to
2632
 * UTF-16BE or from UTF-16BE to UTF-16LE.
2633
 *
2634
 * This function does not validate the input.
2635
 *
2636
 * This function is not BOM-aware.
2637
 *
2638
 * @param input         the UTF-16 string to process
2639
 * @param length        the length of the string in 2-byte code units (char16_t)
2640
 * @param output        the pointer to a buffer that can hold the conversion
2641
 * result
2642
 */
2643
void change_endianness_utf16(const char16_t *input, size_t length,
2644
                             char16_t *output) noexcept;
2645
  #if SIMDUTF_SPAN
2646
simdutf_really_inline void
2647
change_endianness_utf16(std::span<const char16_t> utf16_input,
2648
0
                        std::span<char16_t> utf16_output) noexcept {
2649
0
  return change_endianness_utf16(utf16_input.data(), utf16_input.size(),
2650
0
                                 utf16_output.data());
2651
0
}
2652
  #endif // SIMDUTF_SPAN
2653
#endif   // SIMDUTF_FEATURE_UTF16
2654
2655
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2656
/**
2657
 * Compute the number of bytes that this UTF-32 string would require in UTF-8
2658
 * format.
2659
 *
2660
 * This function does not validate the input. It is acceptable to pass invalid
2661
 * UTF-32 strings but in such cases the result is implementation defined.
2662
 *
2663
 * @param input         the UTF-32 string to convert
2664
 * @param length        the length of the string in 4-byte code units (char32_t)
2665
 * @return the number of bytes required to encode the UTF-32 string as UTF-8
2666
 */
2667
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input,
2668
                                                  size_t length) noexcept;
2669
  #if SIMDUTF_SPAN
2670
simdutf_really_inline simdutf_warn_unused size_t
2671
0
utf8_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
2672
0
  return utf8_length_from_utf32(valid_utf32_input.data(),
2673
0
                                valid_utf32_input.size());
2674
0
}
2675
  #endif // SIMDUTF_SPAN
2676
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2677
2678
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2679
/**
2680
 * Compute the number of two-byte code units that this UTF-32 string would
2681
 * require in UTF-16 format.
2682
 *
2683
 * This function does not validate the input. It is acceptable to pass invalid
2684
 * UTF-32 strings but in such cases the result is implementation defined.
2685
 *
2686
 * @param input         the UTF-32 string to convert
2687
 * @param length        the length of the string in 4-byte code units (char32_t)
2688
 * @return the number of char16_t required to encode the UTF-32 string as UTF-16
2689
 */
2690
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input,
2691
                                                   size_t length) noexcept;
2692
  #if SIMDUTF_SPAN
2693
simdutf_really_inline simdutf_warn_unused size_t
2694
0
utf16_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
2695
0
  return utf16_length_from_utf32(valid_utf32_input.data(),
2696
0
                                 valid_utf32_input.size());
2697
0
}
2698
  #endif // SIMDUTF_SPAN
2699
2700
/**
2701
 * Using native endianness; compute the number of code units that this UTF-16
2702
 * string would require in UTF-32 format.
2703
 *
2704
 * This function is equivalent to count_utf16.
2705
 *
2706
 * This function does not validate the input. It is acceptable to pass invalid
2707
 * UTF-16 strings but in such cases the result is implementation defined.
2708
 *
2709
 * This function is not BOM-aware.
2710
 *
2711
 * @param input         the UTF-16 string to convert
2712
 * @param length        the length of the string in 2-byte code units (char16_t)
2713
 * @return the number of char32_t required to encode the UTF-16 string as UTF-32
2714
 */
2715
simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input,
2716
                                                   size_t length) noexcept;
2717
  #if SIMDUTF_SPAN
2718
simdutf_really_inline simdutf_warn_unused size_t
2719
0
utf32_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2720
0
  return utf32_length_from_utf16(valid_utf16_input.data(),
2721
0
                                 valid_utf16_input.size());
2722
0
}
2723
  #endif // SIMDUTF_SPAN
2724
2725
/**
2726
 * Compute the number of 4-byte code units (char32_t) that this UTF-16LE string
2727
 * would require in UTF-32 format.
2728
 *
2729
 * This function is equivalent to count_utf16le.
2730
 *
2731
 * This function does not validate the input. It is acceptable to pass invalid
2732
 * UTF-16 strings but in such cases the result is implementation defined.
2733
 *
2734
 * This function is not BOM-aware.
2735
 *
2736
 * @param input         the UTF-16LE string to convert
2737
 * @param length        the length of the string in 2-byte code units (char16_t)
2738
 * @return the number of char32_t needed to encode the UTF-16LE string as UTF-32
2739
 */
2740
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input,
2741
                                                     size_t length) noexcept;
2742
  #if SIMDUTF_SPAN
2743
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf16le(
2744
0
    std::span<const char16_t> valid_utf16_input) noexcept {
2745
0
  return utf32_length_from_utf16le(valid_utf16_input.data(),
2746
0
                                   valid_utf16_input.size());
2747
0
}
2748
  #endif // SIMDUTF_SPAN
2749
2750
/**
2751
 * Compute the number of 4-byte code units (char32_t) that this UTF-16BE string
2752
 * would require in UTF-32 format.
2753
 *
2754
 * This function is equivalent to count_utf16be.
2755
 *
2756
 * This function does not validate the input. It is acceptable to pass invalid
2757
 * UTF-16 strings but in such cases the result is implementation defined.
2758
 *
2759
 * This function is not BOM-aware.
2760
 *
2761
 * @param input         the UTF-16BE string to convert
2762
 * @param length        the length of the string in 2-byte code units (char16_t)
2763
 * @return the number of char32_t needed to encode the UTF-16BE string as UTF-32
2764
 */
2765
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input,
2766
                                                     size_t length) noexcept;
2767
  #if SIMDUTF_SPAN
2768
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf16be(
2769
0
    std::span<const char16_t> valid_utf16_input) noexcept {
2770
0
  return utf32_length_from_utf16be(valid_utf16_input.data(),
2771
0
                                   valid_utf16_input.size());
2772
0
}
2773
  #endif // SIMDUTF_SPAN
2774
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2775
2776
#if SIMDUTF_FEATURE_UTF16
2777
/**
2778
 * Count the number of code points (characters) in the string assuming that
2779
 * it is valid.
2780
 *
2781
 * This function assumes that the input string is valid UTF-16 (native
2782
 * endianness). It is acceptable to pass invalid UTF-16 strings but in such
2783
 * cases the result is implementation defined.
2784
 *
2785
 * This function is not BOM-aware.
2786
 *
2787
 * @param input         the UTF-16 string to process
2788
 * @param length        the length of the string in 2-byte code units (char16_t)
2789
 * @return number of code points
2790
 */
2791
simdutf_warn_unused size_t count_utf16(const char16_t *input,
2792
                                       size_t length) noexcept;
2793
  #if SIMDUTF_SPAN
2794
/// Span convenience overload: hands the span's pointer and element count to
/// the (pointer, length) overload of count_utf16.
simdutf_really_inline simdutf_warn_unused size_t
count_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return count_utf16(units, unit_count);
}
2798
  #endif // SIMDUTF_SPAN
2799
2800
/**
2801
 * Count the number of code points (characters) in the string assuming that
2802
 * it is valid.
2803
 *
2804
 * This function assumes that the input string is valid UTF-16LE.
2805
 * It is acceptable to pass invalid UTF-16 strings but in such cases
2806
 * the result is implementation defined.
2807
 *
2808
 * This function is not BOM-aware.
2809
 *
2810
 * @param input         the UTF-16LE string to process
2811
 * @param length        the length of the string in 2-byte code units (char16_t)
2812
 * @return number of code points
2813
 */
2814
simdutf_warn_unused size_t count_utf16le(const char16_t *input,
2815
                                         size_t length) noexcept;
2816
  #if SIMDUTF_SPAN
2817
/// Span convenience overload: hands the span's pointer and element count to
/// the (pointer, length) overload of count_utf16le.
simdutf_really_inline simdutf_warn_unused size_t
count_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return count_utf16le(units, unit_count);
}
2821
  #endif // SIMDUTF_SPAN
2822
2823
/**
2824
 * Count the number of code points (characters) in the string assuming that
2825
 * it is valid.
2826
 *
2827
 * This function assumes that the input string is valid UTF-16BE.
2828
 * It is acceptable to pass invalid UTF-16 strings but in such cases
2829
 * the result is implementation defined.
2830
 *
2831
 * This function is not BOM-aware.
2832
 *
2833
 * @param input         the UTF-16BE string to process
2834
 * @param length        the length of the string in 2-byte code units (char16_t)
2835
 * @return number of code points
2836
 */
2837
simdutf_warn_unused size_t count_utf16be(const char16_t *input,
2838
                                         size_t length) noexcept;
2839
  #if SIMDUTF_SPAN
2840
/// Span convenience overload: hands the span's pointer and element count to
/// the (pointer, length) overload of count_utf16be.
simdutf_really_inline simdutf_warn_unused size_t
count_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return count_utf16be(units, unit_count);
}
2844
  #endif // SIMDUTF_SPAN
2845
#endif   // SIMDUTF_FEATURE_UTF16
2846
2847
#if SIMDUTF_FEATURE_UTF8
2848
/**
2849
 * Count the number of code points (characters) in the string assuming that
2850
 * it is valid.
2851
 *
2852
 * This function assumes that the input string is valid UTF-8.
2853
 * It is acceptable to pass invalid UTF-8 strings but in such cases
2854
 * the result is implementation defined.
2855
 *
2856
 * @param input         the UTF-8 string to process
2857
 * @param length        the length of the string in bytes
2858
 * @return number of code points
2859
 */
2860
simdutf_warn_unused size_t count_utf8(const char *input,
2861
                                      size_t length) noexcept;
2862
  #if SIMDUTF_SPAN
2863
/// Convenience overload for byte-like ranges: views the range's storage as
/// const char and forwards it, with its size, to the (pointer, length)
/// overload of count_utf8.
simdutf_really_inline simdutf_warn_unused size_t count_utf8(
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
  const char *const bytes =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t byte_count = valid_utf8_input.size();
  return count_utf8(bytes, byte_count);
}
2868
  #endif // SIMDUTF_SPAN
2869
2870
/**
2871
 * Given a valid UTF-8 string having a possibly truncated last character,
2872
 * this function checks the end of string. If the last character is truncated
2873
 * (or partial), then it returns a shorter length (shorter by 1 to 3 bytes) so
2874
 * that the short UTF-8 strings only contain complete characters. If there is no
2875
 * truncated character, the original length is returned.
2876
 *
2877
 * This function assumes that the input string is valid UTF-8, but possibly
2878
 * truncated.
2879
 *
2880
 * @param input         the UTF-8 string to process
2881
 * @param length        the length of the string in bytes
2882
 * @return the length of the string in bytes, possibly shorter by 1 to 3 bytes
2883
 */
2884
simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length);
2885
  #if SIMDUTF_SPAN
2886
/// Convenience overload for byte-like ranges: views the range's storage as
/// const char and forwards it, with its size, to the (pointer, length)
/// overload of trim_partial_utf8.
simdutf_really_inline simdutf_warn_unused size_t trim_partial_utf8(
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
  const char *const bytes =
      reinterpret_cast<const char *>(valid_utf8_input.data());
  const size_t byte_count = valid_utf8_input.size();
  return trim_partial_utf8(bytes, byte_count);
}
2892
  #endif // SIMDUTF_SPAN
2893
#endif   // SIMDUTF_FEATURE_UTF8
2894
2895
#if SIMDUTF_FEATURE_UTF16
2896
/**
2897
 * Given a valid UTF-16BE string having a possibly truncated last character,
2898
 * this function checks the end of string. If the last character is truncated
2899
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
2900
 * the short UTF-16BE strings only contain complete characters. If there is no
2901
 * truncated character, the original length is returned.
2902
 *
2903
 * This function assumes that the input string is valid UTF-16BE, but possibly
2904
 * truncated.
2905
 *
2906
 * @param input         the UTF-16BE string to process
2907
 * @param length        the length of the string in 2-byte code units (char16_t)
2908
 * @return the length in 2-byte code units, possibly shorter by 1 unit
2909
 */
2910
simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input,
2911
                                                size_t length);
2912
  #if SIMDUTF_SPAN
2913
/// Span convenience overload: hands the span's pointer and element count to
/// the (pointer, length) overload of trim_partial_utf16be.
simdutf_really_inline simdutf_warn_unused size_t
trim_partial_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return trim_partial_utf16be(units, unit_count);
}
2918
  #endif // SIMDUTF_SPAN
2919
2920
/**
2921
 * Given a valid UTF-16LE string having a possibly truncated last character,
2922
 * this function checks the end of string. If the last character is truncated
2923
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
2924
 * the short UTF-16LE strings only contain complete characters. If there is no
2925
 * truncated character, the original length is returned.
2926
 *
2927
 * This function assumes that the input string is valid UTF-16LE, but possibly
2928
 * truncated.
2929
 *
2930
 * @param input         the UTF-16LE string to process
2931
 * @param length        the length of the string in 2-byte code units (char16_t)
2932
 * @return the length in 2-byte code units, possibly shorter by 1 unit
2933
 */
2934
simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input,
2935
                                                size_t length);
2936
  #if SIMDUTF_SPAN
2937
/// Span convenience overload: hands the span's pointer and element count to
/// the (pointer, length) overload of trim_partial_utf16le.
simdutf_really_inline simdutf_warn_unused size_t
trim_partial_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return trim_partial_utf16le(units, unit_count);
}
2942
  #endif // SIMDUTF_SPAN
2943
2944
/**
2945
 * Given a valid UTF-16 string having a possibly truncated last character,
2946
 * this function checks the end of string. If the last character is truncated
2947
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
2948
 * the short UTF-16 strings only contain complete characters. If there is no
2949
 * truncated character, the original length is returned.
2950
 *
2951
 * This function assumes that the input string is valid UTF-16, but possibly
2952
 * truncated. We use the native endianness.
2953
 *
2954
 * @param input         the UTF-16 string to process
2955
 * @param length        the length of the string in 2-byte code units (char16_t)
2956
 * @return the length in 2-byte code units, possibly shorter by 1 unit
2957
 */
2958
simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input,
2959
                                              size_t length);
2960
  #if SIMDUTF_SPAN
2961
/// Span convenience overload: hands the span's pointer and element count to
/// the (pointer, length) overload of trim_partial_utf16.
simdutf_really_inline simdutf_warn_unused size_t
trim_partial_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
  const char16_t *const units = valid_utf16_input.data();
  const size_t unit_count = valid_utf16_input.size();
  return trim_partial_utf16(units, unit_count);
}
2965
  #endif // SIMDUTF_SPAN
2966
#endif   // SIMDUTF_FEATURE_UTF16
2967
2968
#if SIMDUTF_FEATURE_BASE64
2969
  #ifndef SIMDUTF_NEED_TRAILING_ZEROES
2970
    #define SIMDUTF_NEED_TRAILING_ZEROES 1
2971
  #endif
2972
// base64_options are used to specify the base64 encoding options.
2973
// ASCII spaces are ' ', '\t', '\n', '\r', '\f'
2974
// garbage characters are characters that are not part of the base64 alphabet
2975
// nor ASCII spaces.
2976
/// modifier for base64_default and base64_url
constexpr uint64_t base64_reverse_padding = 2;

enum base64_options : uint64_t {
  /// standard base64 format (with padding)
  base64_default = 0,
  /// base64url format (no padding)
  base64_url = 1,
  /// standard base64 format without padding
  base64_default_no_padding = base64_default | base64_reverse_padding,
  /// base64url with padding
  base64_url_with_padding = base64_url | base64_reverse_padding,
  /// standard base64 format accepting garbage characters, the input stops
  /// with the first '=' if any
  base64_default_accept_garbage = 4,
  /// base64url format accepting garbage characters, the input stops with the
  /// first '=' if any
  base64_url_accept_garbage = 5,
  /// standard/base64url hybrid format (only meaningful for decoding!)
  base64_default_or_url = 8,
  /// standard/base64url hybrid format accepting garbage characters (only
  /// meaningful for decoding!), the input stops with the first '=' if any
  base64_default_or_url_accept_garbage = 12,
};
2999
3000
  #if SIMDUTF_CPLUSPLUS17
3001
0
/**
 * Return the enumerator name of a base64_options value, e.g. for logging.
 *
 * @param options       the option value to describe
 * @return the name of the matching enumerator, or "<unknown>" when the value
 * matches none of them. The returned view refers to a string literal.
 */
inline std::string_view to_string(base64_options options) {
  switch (options) {
  case base64_default:
    return "base64_default";
  case base64_url:
    return "base64_url";
  // The enumerator with value 2 is base64_default_no_padding;
  // base64_reverse_padding is only the modifier constant used to build it,
  // so report the enumerator's own name like every other case does.
  case base64_default_no_padding:
    return "base64_default_no_padding";
  case base64_url_with_padding:
    return "base64_url_with_padding";
  case base64_default_accept_garbage:
    return "base64_default_accept_garbage";
  case base64_url_accept_garbage:
    return "base64_url_accept_garbage";
  case base64_default_or_url:
    return "base64_default_or_url";
  case base64_default_or_url_accept_garbage:
    return "base64_default_or_url_accept_garbage";
  }
  // Reached for values that are not enumerators (e.g. arbitrary bit mixes).
  return "<unknown>";
}
3022
  #endif // SIMDUTF_CPLUSPLUS17
3023
3024
// last_chunk_handling_options are used to specify the handling of the last
3025
// chunk in base64 decoding.
3026
// https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3027
enum last_chunk_handling_options : uint64_t {
  /// standard base64 format, decode partial final chunk
  loose = 0,
  /// error when the last chunk is partial, 2 or 3 chars, and unpadded, or
  /// non-zero bit padding
  strict = 1,
  /// if the last chunk is partial, ignore it (no error)
  stop_before_partial = 2,
  /// only decode full blocks (4 base64 characters, no padding)
  only_full_chunks = 3
};
3036
3037
/// Tell whether `options` is one of the two partial-chunk modes,
/// stop_before_partial or only_full_chunks.
inline bool is_partial(last_chunk_handling_options options) {
  switch (options) {
  case stop_before_partial:
  case only_full_chunks:
    return true;
  default:
    return false;
  }
}
3040
3041
  #if SIMDUTF_CPLUSPLUS17
3042
0
/// Return the enumerator name of a last_chunk_handling_options value, or
/// "<unknown>" when the value matches none of the enumerators.
inline std::string_view to_string(last_chunk_handling_options options) {
  if (options == loose) {
    return "loose";
  }
  if (options == strict) {
    return "strict";
  }
  if (options == stop_before_partial) {
    return "stop_before_partial";
  }
  if (options == only_full_chunks) {
    return "only_full_chunks";
  }
  return "<unknown>";
}
3055
  #endif
3056
3057
/**
3058
 * Provide the maximal binary length in bytes given the base64 input.
3059
 * In general, if the input contains ASCII spaces, the result will be less than
3060
 * the maximum length.
3061
 *
3062
 * @param input         the base64 input to process
3063
 * @param length        the length of the base64 input in bytes
3064
 * @return maximum number of binary bytes
3065
 */
3066
simdutf_warn_unused size_t
3067
maximal_binary_length_from_base64(const char *input, size_t length) noexcept;
3068
  #if SIMDUTF_SPAN
3069
/// Convenience overload for byte-like ranges: views the range's storage as
/// const char and forwards it, with its size, to the (pointer, length)
/// overload of maximal_binary_length_from_base64.
simdutf_really_inline simdutf_warn_unused size_t
maximal_binary_length_from_base64(
    const detail::input_span_of_byte_like auto &input) noexcept {
  const char *const base64_chars = reinterpret_cast<const char *>(input.data());
  const size_t char_count = input.size();
  return maximal_binary_length_from_base64(base64_chars, char_count);
}
3075
  #endif // SIMDUTF_SPAN
3076
3077
/**
3078
 * Provide the maximal binary length in bytes given the base64 input.
3079
 * In general, if the input contains ASCII spaces, the result will be less than
3080
 * the maximum length.
3081
 *
3082
 * @param input         the base64 input to process, in ASCII stored as 16-bit
3083
 * units
3084
 * @param length        the length of the base64 input in 16-bit units
3085
 * @return maximal number of binary bytes
3086
 */
3087
simdutf_warn_unused size_t maximal_binary_length_from_base64(
3088
    const char16_t *input, size_t length) noexcept;
3089
  #if SIMDUTF_SPAN
3090
/// Span convenience overload for base64 text stored as 16-bit units: hands
/// the span's pointer and element count to the (pointer, length) overload.
simdutf_really_inline simdutf_warn_unused size_t
maximal_binary_length_from_base64(std::span<const char16_t> input) noexcept {
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();
  return maximal_binary_length_from_base64(units, unit_count);
}
3094
  #endif // SIMDUTF_SPAN
3095
3096
/**
3097
 * Convert a base64 input to a binary output.
3098
 *
3099
 * This function follows the WHATWG forgiving-base64 format, which means that it
3100
 * will ignore any ASCII spaces in the input. You may provide a padded input
3101
 * (with one or two equal signs at the end) or an unpadded input (without any
3102
 * equal signs at the end).
3103
 *
3104
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
3105
 *
3106
 * This function will fail in case of invalid input. When last_chunk_options =
3107
 * loose, there are two possible reasons for failure: the input contains a
3108
 * number of base64 characters that when divided by 4, leaves a single remainder
3109
 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
3110
 * not a valid base64 character (INVALID_BASE64_CHARACTER).
3111
 *
3112
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
3113
 * input where the invalid character was found. When the error is
3114
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
3115
 *
3116
 * The default option (simdutf::base64_default) expects the characters `+` and
3117
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
3118
 * characters `-` and `_` as part of its alphabet.
3119
 *
3120
 * The padding (`=`) is validated if present. There may be at most two padding
3121
 * characters at the end of the input. If there are any padding characters, the
3122
 * total number of characters (excluding spaces but including padding
3123
 * characters) must be divisible by four.
3124
 *
3125
 * You should call this function with a buffer that is at least
3126
 * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
3127
 * provide that much space, the function may cause a buffer overflow.
3128
 *
3129
 * Advanced users may want to tailor how the last chunk is handled. By default,
3130
 * we use a loose (forgiving) approach but we also support a strict approach
3131
 * as well as a stop_before_partial approach, as per the following proposal:
3132
 *
3133
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3134
 *
3135
 * @param input         the base64 string to process
3136
 * @param length        the length of the string in bytes
3137
 * @param output        the pointer to a buffer that can hold the conversion
3138
 * result (should be at least maximal_binary_length_from_base64(input, length)
3139
 * bytes long).
3140
 * @param options       the base64 options to use, usually base64_default or
3141
 * base64_url, and base64_default by default.
3142
 * @param last_chunk_options the last chunk handling options,
3143
 * last_chunk_handling_options::loose by default
3144
 * but can also be last_chunk_handling_options::strict or
3145
 * last_chunk_handling_options::stop_before_partial.
3146
 * @return a result pair struct (of type simdutf::result containing the two
3147
 * fields error and count) with an error code and either position of the error
3148
 * (in the input in bytes) if any, or the number of bytes written if successful.
3149
 */
3150
simdutf_warn_unused result base64_to_binary(
3151
    const char *input, size_t length, char *output,
3152
    base64_options options = base64_default,
3153
    last_chunk_handling_options last_chunk_options = loose) noexcept;
3154
  #if SIMDUTF_SPAN
3155
/// Convenience overload for byte-like ranges: views both buffers as char
/// storage and forwards to the (pointer, length) overload. Only the size of
/// `input` is forwarded; the size of `binary_output` is not passed along.
simdutf_really_inline simdutf_warn_unused result base64_to_binary(
    const detail::input_span_of_byte_like auto &input,
    detail::output_span_of_byte_like auto &&binary_output,
    base64_options options = base64_default,
    last_chunk_handling_options last_chunk_options = loose) noexcept {
  const char *const src = reinterpret_cast<const char *>(input.data());
  char *const dst = reinterpret_cast<char *>(binary_output.data());
  return base64_to_binary(src, input.size(), dst, options, last_chunk_options);
}
3165
  #endif // SIMDUTF_SPAN
3166
3167
/**
3168
 * Provide the base64 length in bytes given the length of a binary input.
3169
 *
3170
 * @param length        the length of the input in bytes
 * @param options       the base64 options to use, is base64_default by default.
3171
 * @return number of base64 bytes
3172
 */
3173
simdutf_warn_unused size_t base64_length_from_binary(
3174
    size_t length, base64_options options = base64_default) noexcept;
3175
3176
/**
3177
 * Provide the base64 length in bytes given the length of a binary input,
3178
 * taking into account line breaks.
3179
 *
3180
 * @param length        the length of the input in bytes
 * @param options       the base64 options to use, is base64_default by default.
3181
 * @param line_length   the length of lines, must be at least 4 (otherwise it is
3182
 * interpreted as 4),
3183
 * @return number of base64 bytes
3184
 */
3185
simdutf_warn_unused size_t base64_length_from_binary_with_lines(
3186
    size_t length, base64_options options = base64_default,
3187
    size_t line_length = default_line_length) noexcept;
3188
3189
/**
3190
 * Convert a binary input to a base64 output.
3191
 *
3192
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
3193
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
3194
 * output to ensure that the output length is a multiple of four.
3195
 *
3196
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
3197
 * of its alphabet. No padding is added at the end of the output.
3198
 *
3199
 * This function always succeeds.
3200
 *
3201
 * @param input         the binary to process
3202
 * @param length        the length of the input in bytes
3203
 * @param output        the pointer to a buffer that can hold the conversion
3204
 * result (should be at least base64_length_from_binary(length) bytes long)
3205
 * @param options       the base64 options to use, can be base64_default or
3206
 * base64_url, is base64_default by default.
3207
 * @return number of written bytes, will be equal to
3208
 * base64_length_from_binary(length, options)
3209
 */
3210
size_t binary_to_base64(const char *input, size_t length, char *output,
3211
                        base64_options options = base64_default) noexcept;
3212
  #if SIMDUTF_SPAN
3213
/// Convenience overload for byte-like ranges: views both buffers as char
/// storage and forwards to the (pointer, length) overload. Only the size of
/// `input` is forwarded; the size of `binary_output` is not passed along.
simdutf_really_inline simdutf_warn_unused size_t
binary_to_base64(const detail::input_span_of_byte_like auto &input,
                 detail::output_span_of_byte_like auto &&binary_output,
                 base64_options options = base64_default) noexcept {
  const char *const src = reinterpret_cast<const char *>(input.data());
  char *const dst = reinterpret_cast<char *>(binary_output.data());
  return binary_to_base64(src, input.size(), dst, options);
}
3221
  #endif // SIMDUTF_SPAN
3222
3223
/**
3224
 * Convert a binary input to a base64 output with line breaks.
3225
 *
3226
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
3227
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
3228
 * output to ensure that the output length is a multiple of four.
3229
 *
3230
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
3231
 * of its alphabet. No padding is added at the end of the output.
3232
 *
3233
 * This function always succeeds.
3234
 *
3235
 * @param input         the binary to process
3236
 * @param length        the length of the input in bytes
3237
 * @param output        the pointer to a buffer that can hold the conversion
3238
 * result (should be at least base64_length_from_binary_with_lines(length,
3239
 * options, line_length) bytes long)
3240
 * @param line_length   the length of lines, must be at least 4 (otherwise it is
3241
 * interpreted as 4),
3242
 * @param options       the base64 options to use, can be base64_default or
3243
 * base64_url, is base64_default by default.
3244
 * @return number of written bytes, will be equal to
3245
 * base64_length_from_binary_with_lines(length, options, line_length)
3246
 */
3247
size_t
3248
binary_to_base64_with_lines(const char *input, size_t length, char *output,
3249
                            size_t line_length = simdutf::default_line_length,
3250
                            base64_options options = base64_default) noexcept;
3251
  #if SIMDUTF_SPAN
3252
/// Convenience overload for byte-like ranges: views both buffers as char
/// storage and forwards to the (pointer, length) overload. Only the size of
/// `input` is forwarded; the size of `binary_output` is not passed along.
simdutf_really_inline simdutf_warn_unused size_t binary_to_base64_with_lines(
    const detail::input_span_of_byte_like auto &input,
    detail::output_span_of_byte_like auto &&binary_output,
    size_t line_length = simdutf::default_line_length,
    base64_options options = base64_default) noexcept {
  const char *const src = reinterpret_cast<const char *>(input.data());
  char *const dst = reinterpret_cast<char *>(binary_output.data());
  return binary_to_base64_with_lines(src, input.size(), dst, line_length,
                                     options);
}
3261
  #endif // SIMDUTF_SPAN
3262
3263
  #if SIMDUTF_ATOMIC_REF
3264
/**
3265
 * Convert a binary input to a base64 output, using atomic accesses.
3266
 * This function comes with a potentially significant performance
3267
 * penalty, but it may be useful in some cases where the input
3268
 * buffers are shared between threads, to avoid undefined
3269
 * behavior in case of data races.
3270
 *
3271
 * The function is for advanced users. Its main use case is
3272
 * to silence sanitizer warnings. We have no documented use case
3273
 * where this function is actually necessary in terms of practical correctness.
3274
 *
3275
 * This function is only available when simdutf is compiled with
3276
 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
3277
 * the availability of this function by checking the macro
3278
 * SIMDUTF_ATOMIC_REF.
3279
 *
3280
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
3281
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
3282
 * output to ensure that the output length is a multiple of four.
3283
 *
3284
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
3285
 * of its alphabet. No padding is added at the end of the output.
3286
 *
3287
 * This function always succeeds.
3288
 *
3289
 * This function is considered experimental. It is not tested by default
3290
 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
3291
 * It is not documented in the public API documentation (README). It is
3292
 * offered on a best effort basis. We rely on the community for further
3293
 * testing and feedback.
3294
 *
3295
 * @brief atomic_binary_to_base64
3296
 * @param input         the binary to process
3297
 * @param length        the length of the input in bytes
3298
 * @param output        the pointer to a buffer that can hold the conversion
3299
 * result (should be at least base64_length_from_binary(length) bytes long)
3300
 * @param options       the base64 options to use, can be base64_default or
3301
 * base64_url, is base64_default by default.
3302
 * @return number of written bytes, will be equal to
3303
 * base64_length_from_binary(length, options)
3304
 */
3305
size_t
3306
atomic_binary_to_base64(const char *input, size_t length, char *output,
3307
                        base64_options options = base64_default) noexcept;
3308
    #if SIMDUTF_SPAN
3309
/// Convenience overload for byte-like ranges: views both buffers as char
/// storage and forwards to the (pointer, length) overload. Only the size of
/// `input` is forwarded; the size of `binary_output` is not passed along.
simdutf_really_inline simdutf_warn_unused size_t
atomic_binary_to_base64(const detail::input_span_of_byte_like auto &input,
                        detail::output_span_of_byte_like auto &&binary_output,
                        base64_options options = base64_default) noexcept {
  const char *const src = reinterpret_cast<const char *>(input.data());
  char *const dst = reinterpret_cast<char *>(binary_output.data());
  return atomic_binary_to_base64(src, input.size(), dst, options);
}
3317
    #endif // SIMDUTF_SPAN
3318
  #endif   // SIMDUTF_ATOMIC_REF
3319
3320
/**
3321
 * Convert a base64 input to a binary output.
3322
 *
3323
 * This function follows the WHATWG forgiving-base64 format, which means that it
3324
 * will ignore any ASCII spaces in the input. You may provide a padded input
3325
 * (with one or two equal signs at the end) or an unpadded input (without any
3326
 * equal signs at the end).
3327
 *
3328
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
3329
 *
3330
 * This function will fail in case of invalid input. When last_chunk_options =
3331
 * loose, there are two possible reasons for failure: the input contains a
3332
 * number of base64 characters that when divided by 4, leaves a single remainder
3333
 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
3334
 * not a valid base64 character (INVALID_BASE64_CHARACTER).
3335
 *
3336
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
3337
 * input where the invalid character was found. When the error is
3338
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
3339
 *
3340
 * The default option (simdutf::base64_default) expects the characters `+` and
3341
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
3342
 * characters `-` and `_` as part of its alphabet.
3343
 *
3344
 * The padding (`=`) is validated if present. There may be at most two padding
3345
 * characters at the end of the input. If there are any padding characters, the
3346
 * total number of characters (excluding spaces but including padding
3347
 * characters) must be divisible by four.
3348
 *
3349
 * You should call this function with a buffer that is at least
3350
 * maximal_binary_length_from_base64(input, length) bytes long. If you fail
3351
 * to provide that much space, the function may cause a buffer overflow.
3352
 *
3353
 * Advanced users may want to tailor how the last chunk is handled. By default,
3354
 * we use a loose (forgiving) approach but we also support a strict approach
3355
 * as well as a stop_before_partial approach, as per the following proposal:
3356
 *
3357
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3358
 *
3359
 * @param input         the base64 string to process, in ASCII stored as 16-bit
3360
 * units
3361
 * @param length        the length of the string in 16-bit units
3362
 * @param output        the pointer to a buffer that can hold the conversion
3363
 * result (should be at least maximal_binary_length_from_base64(input, length)
3364
 * bytes long).
3365
 * @param options       the base64 options to use, can be base64_default or
3366
 * base64_url, is base64_default by default.
3367
 * @param last_chunk_options the last chunk handling options,
3368
 * last_chunk_handling_options::loose by default
3369
 * but can also be last_chunk_handling_options::strict or
3370
 * last_chunk_handling_options::stop_before_partial.
3371
 * @return a result pair struct (of type simdutf::result containing the two
3372
 * fields error and count) with an error code and position of the
3373
 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
3374
 * of bytes written if successful.
3375
 */
3376
simdutf_warn_unused result
3377
base64_to_binary(const char16_t *input, size_t length, char *output,
3378
                 base64_options options = base64_default,
3379
                 last_chunk_handling_options last_chunk_options =
3380
                     last_chunk_handling_options::loose) noexcept;
3381
  #if SIMDUTF_SPAN
3382
/// Span convenience overload for base64 text stored as 16-bit units: forwards
/// the input's pointer and element count, and the output viewed as char
/// storage, to the (pointer, length) overload. The size of `binary_output`
/// is not passed along.
simdutf_really_inline simdutf_warn_unused result base64_to_binary(
    std::span<const char16_t> input,
    detail::output_span_of_byte_like auto &&binary_output,
    base64_options options = base64_default,
    last_chunk_handling_options last_chunk_options = loose) noexcept {
  const char16_t *const src = input.data();
  char *const dst = reinterpret_cast<char *>(binary_output.data());
  return base64_to_binary(src, input.size(), dst, options, last_chunk_options);
}
3391
  #endif // SIMDUTF_SPAN
3392
3393
/**
3394
 * Check if a character is an ignorable base64 character.
3395
 * Checking a large input, character by character, is not computationally
3396
 * efficient.
3397
 *
3398
 * @param input         the character to check
3399
 * @param options       the base64 options to use, is base64_default by default.
3400
 * @return true if the character is an ignorable base64 character, false
3401
 * otherwise.
3402
 */
3403
simdutf_warn_unused bool
3404
base64_ignorable(char input, base64_options options = base64_default) noexcept;
3405
simdutf_warn_unused bool
3406
base64_ignorable(char16_t input,
3407
                 base64_options options = base64_default) noexcept;
3408
3409
/**
3410
 * Check if a character is a valid base64 character.
3411
 * Checking a large input, character by character, is not computationally
3412
 * efficient.
3413
 * Note that padding characters are not considered valid base64 characters in
3414
 * this context, nor are spaces.
3415
 *
3416
 * @param input         the character to check
3417
 * @param options       the base64 options to use, is base64_default by default.
3418
 * @return true if the character is a base64 character, false otherwise.
3419
 */
3420
simdutf_warn_unused bool
3421
base64_valid(char input, base64_options options = base64_default) noexcept;
3422
simdutf_warn_unused bool
3423
base64_valid(char16_t input, base64_options options = base64_default) noexcept;
3424
3425
/**
3426
 * Check if a character is a valid base64 character or the padding character
3427
 * ('='). Checking a large input, character by character, is not computationally
3428
 * efficient.
3429
 *
3430
 * @param input         the character to check
3431
 * @param options       the base64 options to use, is base64_default by default.
3432
 * @return true if the character is a base64 character, false otherwise.
3433
 */
3434
simdutf_warn_unused bool
3435
base64_valid_or_padding(char input,
3436
                        base64_options options = base64_default) noexcept;
3437
simdutf_warn_unused bool
3438
base64_valid_or_padding(char16_t input,
3439
                        base64_options options = base64_default) noexcept;
3440
3441
/**
3442
 * Convert a base64 input to a binary output.
3443
 *
3444
 * This function follows the WHATWG forgiving-base64 format, which means that it
3445
 * will ignore any ASCII spaces in the input. You may provide a padded input
3446
 * (with one or two equal signs at the end) or an unpadded input (without any
3447
 * equal signs at the end).
3448
 *
3449
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
3450
 *
3451
 * This function will fail in case of invalid input. When last_chunk_options =
3452
 * loose, there are three possible reasons for failure: the input contains a
3453
 * number of base64 characters that when divided by 4, leaves a single remainder
3454
 * character (BASE64_INPUT_REMAINDER), the input contains a character that is
3455
 * not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer
3456
 * is too small (OUTPUT_BUFFER_TOO_SMALL).
3457
 *
3458
 * When OUTPUT_BUFFER_TOO_SMALL, we return both the number of bytes written
3459
 * and the number of units processed, see description of the parameters and
3460
 * returned value.
3461
 *
3462
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
3463
 * input where the invalid character was found. When the error is
3464
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
3465
 *
3466
 * The default option (simdutf::base64_default) expects the characters `+` and
3467
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
3468
 * characters `-` and `_` as part of its alphabet.
3469
 *
3470
 * The padding (`=`) is validated if present. There may be at most two padding
3471
 * characters at the end of the input. If there are any padding characters, the
3472
 * total number of characters (excluding spaces but including padding
3473
 * characters) must be divisible by four.
3474
 *
3475
 * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected
3476
 * to discard the output unless the parameter decode_up_to_bad_char is set to
3477
 * true. In that case, the function will decode up to the first invalid
3478
 * character. Extra padding characters ('=') are considered invalid characters.
3479
 *
3480
 * Advanced users may want to tailor how the last chunk is handled. By default,
3481
 * we use a loose (forgiving) approach but we also support a strict approach
3482
 * as well as a stop_before_partial approach, as per the following proposal:
3483
 *
3484
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3485
 *
3486
 * @param input         the base64 string to process, in ASCII stored as 8-bit
3487
 * or 16-bit units
3488
 * @param length        the length of the string in 8-bit or 16-bit units.
3489
 * @param output        the pointer to a buffer that can hold the conversion
3490
 * result.
3491
 * @param outlen        the number of bytes that can be written in the output
3492
 * buffer. Upon return, it is modified to reflect how many bytes were written.
3493
 * @param options       the base64 options to use, can be base64_default or
3494
 * base64_url, is base64_default by default.
3495
 * @param last_chunk_options the last chunk handling options,
3496
 * last_chunk_handling_options::loose by default
3497
 * but can also be last_chunk_handling_options::strict or
3498
 * last_chunk_handling_options::stop_before_partial.
3499
 * @param decode_up_to_bad_char if true, the function will decode up to the
3500
 * first invalid character. By default (false), it is assumed that the output
3501
 * buffer is to be discarded. When there are multiple errors in the input,
3502
 * using decode_up_to_bad_char might trigger a different error.
3503
 * @return a result pair struct (of type simdutf::result containing the two
3504
 * fields error and count) with an error code and position of the
3505
 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
3506
 * of units processed if successful.
3507
 */
3508
simdutf_warn_unused result
3509
base64_to_binary_safe(const char *input, size_t length, char *output,
3510
                      size_t &outlen, base64_options options = base64_default,
3511
                      last_chunk_handling_options last_chunk_options =
3512
                          last_chunk_handling_options::loose,
3513
                      bool decode_up_to_bad_char = false) noexcept;
3514
  #if SIMDUTF_SPAN
3515
/**
3516
 * @brief span overload
3517
 * @return a tuple of result and outlen
3518
 */
3519
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
3520
base64_to_binary_safe(const detail::input_span_of_byte_like auto &input,
3521
                      detail::output_span_of_byte_like auto &&binary_output,
3522
                      base64_options options = base64_default,
3523
                      last_chunk_handling_options last_chunk_options = loose,
3524
                      bool decode_up_to_bad_char = false) noexcept {
3525
  size_t outlen = binary_output.size();
3526
  auto r = base64_to_binary_safe(
3527
      reinterpret_cast<const char *>(input.data()), input.size(),
3528
      reinterpret_cast<char *>(binary_output.data()), outlen, options,
3529
      last_chunk_options, decode_up_to_bad_char);
3530
  return {r, outlen};
3531
}
3532
  #endif // SIMDUTF_SPAN
3533
3534
simdutf_warn_unused result
3535
base64_to_binary_safe(const char16_t *input, size_t length, char *output,
3536
                      size_t &outlen, base64_options options = base64_default,
3537
                      last_chunk_handling_options last_chunk_options =
3538
                          last_chunk_handling_options::loose,
3539
                      bool decode_up_to_bad_char = false) noexcept;
3540
  #if SIMDUTF_SPAN
3541
/**
3542
 * @brief span overload
3543
 * @return a tuple of result and outlen
3544
 */
3545
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
3546
base64_to_binary_safe(std::span<const char16_t> input,
3547
                      detail::output_span_of_byte_like auto &&binary_output,
3548
                      base64_options options = base64_default,
3549
                      last_chunk_handling_options last_chunk_options = loose,
3550
                      bool decode_up_to_bad_char = false) noexcept {
3551
  size_t outlen = binary_output.size();
3552
  auto r = base64_to_binary_safe(input.data(), input.size(),
3553
                                 reinterpret_cast<char *>(binary_output.data()),
3554
                                 outlen, options, last_chunk_options,
3555
                                 decode_up_to_bad_char);
3556
  return {r, outlen};
3557
}
3558
  #endif // SIMDUTF_SPAN
3559
3560
  #if SIMDUTF_ATOMIC_REF
3561
/**
3562
 * Convert a base64 input to a binary output with a size limit and using atomic
3563
 * operations.
3564
 *
3565
 * Like `base64_to_binary_safe` but using atomic operations, this function is
3566
 * thread-safe for concurrent memory access, allowing the output
3567
 * buffers to be shared between threads without undefined behavior in case of
3568
 * data races.
3569
 *
3570
 * This function comes with a potentially significant performance penalty, but
3571
 * is useful when thread safety is needed during base64 decoding.
3572
 *
3573
 * This function is only available when simdutf is compiled with
3574
 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
3575
 * the availability of this function by checking the macro
3576
 * SIMDUTF_ATOMIC_REF.
3577
 *
3578
 * This function is considered experimental. It is not tested by default
3579
 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
3580
 * It is not documented in the public API documentation (README). It is
3581
 * offered on a best effort basis. We rely on the community for further
3582
 * testing and feedback.
3583
 *
3584
 * @param input         the base64 input to decode
3585
 * @param length        the length of the input in bytes
3586
 * @param output        the pointer to buffer that can hold the conversion
3587
 * result
3588
 * @param outlen        the number of bytes that can be written in the output
3589
 * buffer. Upon return, it is modified to reflect how many bytes were written.
3590
 * @param options       the base64 options to use (default, url, etc.)
3591
 * @param last_chunk_options the last chunk handling options (loose, strict,
3592
 * stop_before_partial)
3593
 * @param decode_up_to_bad_char if true, the function will decode up to the
3594
 * first invalid character. By default (false), it is assumed that the output
3595
 * buffer is to be discarded. When there are multiple errors in the input,
3596
 * using decode_up_to_bad_char might trigger a different error.
3597
 * @return a result struct with an error code and count indicating error
3598
 * position or success
3599
 */
3600
simdutf_warn_unused result atomic_base64_to_binary_safe(
3601
    const char *input, size_t length, char *output, size_t &outlen,
3602
    base64_options options = base64_default,
3603
    last_chunk_handling_options last_chunk_options =
3604
        last_chunk_handling_options::loose,
3605
    bool decode_up_to_bad_char = false) noexcept;
3606
simdutf_warn_unused result atomic_base64_to_binary_safe(
3607
    const char16_t *input, size_t length, char *output, size_t &outlen,
3608
    base64_options options = base64_default,
3609
    last_chunk_handling_options last_chunk_options = loose,
3610
    bool decode_up_to_bad_char = false) noexcept;
3611
    #if SIMDUTF_SPAN
3612
/**
3613
 * @brief span overload
3614
 * @return a tuple of result and outlen
3615
 */
3616
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
3617
atomic_base64_to_binary_safe(
3618
    const detail::input_span_of_byte_like auto &binary_input,
3619
    detail::output_span_of_byte_like auto &&output,
3620
    base64_options options = base64_default,
3621
    last_chunk_handling_options last_chunk_options =
3622
        last_chunk_handling_options::loose,
3623
    bool decode_up_to_bad_char = false) noexcept {
3624
  size_t outlen = output.size();
3625
  auto ret = atomic_base64_to_binary_safe(
3626
      reinterpret_cast<const char *>(binary_input.data()), binary_input.size(),
3627
      reinterpret_cast<char *>(output.data()), outlen, options,
3628
      last_chunk_options, decode_up_to_bad_char);
3629
  return {ret, outlen};
3630
}
3631
/**
3632
 * @brief span overload
3633
 * @return a tuple of result and outlen
3634
 */
3635
simdutf_warn_unused std::tuple<result, std::size_t>
3636
atomic_base64_to_binary_safe(
3637
    std::span<const char16_t> base64_input,
3638
    detail::output_span_of_byte_like auto &&binary_output,
3639
    base64_options options = base64_default,
3640
    last_chunk_handling_options last_chunk_options = loose,
3641
    bool decode_up_to_bad_char = false) noexcept {
3642
  size_t outlen = binary_output.size();
3643
  auto ret = atomic_base64_to_binary_safe(
3644
      base64_input.data(), base64_input.size(),
3645
      reinterpret_cast<char *>(binary_output.data()), outlen, options,
3646
      last_chunk_options, decode_up_to_bad_char);
3647
  return {ret, outlen};
3648
}
3649
    #endif // SIMDUTF_SPAN
3650
  #endif   // SIMDUTF_ATOMIC_REF
3651
3652
/**
3653
 * Find the first occurrence of a character in a string. If the character is
3654
 * not found, return a pointer to the end of the string.
3655
 * @param start        the start of the string
3656
 * @param end          the end of the string
3657
 * @param character    the character to find
3658
 * @return a pointer to the first occurrence of the character in the string,
3659
 * or a pointer to the end of the string if the character is not found.
3660
 *
3661
 */
3662
simdutf_warn_unused const char *find(const char *start, const char *end,
3663
                                     char character) noexcept;
3664
simdutf_warn_unused const char16_t *
3665
find(const char16_t *start, const char16_t *end, char16_t character) noexcept;
3666
#endif // SIMDUTF_FEATURE_BASE64
3667
3668
/**
3669
 * An implementation of simdutf for a particular CPU architecture.
3670
 *
3671
 * Also used to maintain the currently active implementation. The active
3672
 * implementation is automatically initialized on first use to the most advanced
3673
 * implementation supported by the host.
3674
 */
3675
class implementation {
3676
public:
3677
  /**
3678
   * The name of this implementation.
3679
   *
3680
   *     const implementation *impl = simdutf::active_implementation;
3681
   *     cout << "simdutf is optimized for " << impl->name() << "(" <<
3682
   * impl->description() << ")" << endl;
3683
   *
3684
   * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
3685
   */
3686
  virtual std::string name() const { return std::string(_name); }
3687
3688
  /**
3689
   * The description of this implementation.
3690
   *
3691
   *     const implementation *impl = simdutf::active_implementation;
3692
   *     cout << "simdutf is optimized for " << impl->name() << "(" <<
3693
   * impl->description() << ")" << endl;
3694
   *
3695
   * @return a human-readable description of the implementation
3696
   */
3697
  virtual std::string description() const { return std::string(_description); }
3698
3699
  /**
3700
   * The instruction sets this implementation is compiled against
3701
   * and the current CPU match. This function may poll the current CPU/system
3702
   * and should therefore not be called too often if performance is a concern.
3703
   *
3704
   *
3705
   * @return true if the implementation can be safely used on the current system
3706
   * (determined at runtime)
3707
   */
3708
  bool supported_by_runtime_system() const;
3709
3710
#if SIMDUTF_FEATURE_DETECT_ENCODING
3711
  /**
3712
   * This function will try to detect the encoding
3713
   * @param input the string to identify
3714
   * @param length the length of the string in bytes.
3715
   * @return the encoding type detected
3716
   */
3717
  virtual encoding_type autodetect_encoding(const char *input,
3718
                                            size_t length) const noexcept;
3719
3720
  /**
3721
   * This function will try to detect the possible encodings in one pass
3722
   * @param input the string to identify
3723
   * @param length the length of the string in bytes.
3724
   * @return the encoding type detected
3725
   */
3726
  virtual int detect_encodings(const char *input,
3727
                               size_t length) const noexcept = 0;
3728
#endif // SIMDUTF_FEATURE_DETECT_ENCODING
3729
3730
  /**
3731
   * @private For internal implementation use
3732
   *
3733
   * The instruction sets this implementation is compiled against.
3734
   *
3735
   * @return a mask of all required `internal::instruction_set::` values
3736
   */
3737
  virtual uint32_t required_instruction_sets() const {
3738
    return _required_instruction_sets;
3739
  }
3740
3741
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
3742
  /**
3743
   * Validate the UTF-8 string.
3744
   *
3745
   * Overridden by each implementation.
3746
   *
3747
   * @param buf the UTF-8 string to validate.
3748
   * @param len the length of the string in bytes.
3749
   * @return true if and only if the string is valid UTF-8.
3750
   */
3751
  simdutf_warn_unused virtual bool validate_utf8(const char *buf,
3752
                                                 size_t len) const noexcept = 0;
3753
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
3754
3755
#if SIMDUTF_FEATURE_UTF8
3756
  /**
3757
   * Validate the UTF-8 string and stop on errors.
3758
   *
3759
   * Overridden by each implementation.
3760
   *
3761
   * @param buf the UTF-8 string to validate.
3762
   * @param len the length of the string in bytes.
3763
   * @return a result pair struct (of type simdutf::result containing the two
3764
   * fields error and count) with an error code and either position of the error
3765
   * (in the input in code units) if any, or the number of code units validated
3766
   * if successful.
3767
   */
3768
  simdutf_warn_unused virtual result
3769
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0;
3770
#endif // SIMDUTF_FEATURE_UTF8
3771
3772
#if SIMDUTF_FEATURE_ASCII
3773
  /**
3774
   * Validate the ASCII string.
3775
   *
3776
   * Overridden by each implementation.
3777
   *
3778
   * @param buf the ASCII string to validate.
3779
   * @param len the length of the string in bytes.
3780
   * @return true if and only if the string is valid ASCII.
3781
   */
3782
  simdutf_warn_unused virtual bool
3783
  validate_ascii(const char *buf, size_t len) const noexcept = 0;
3784
3785
  /**
3786
   * Validate the ASCII string and stop on error.
3787
   *
3788
   * Overridden by each implementation.
3789
   *
3790
   * @param buf the ASCII string to validate.
3791
   * @param len the length of the string in bytes.
3792
   * @return a result pair struct (of type simdutf::result containing the two
3793
   * fields error and count) with an error code and either position of the error
3794
   * (in the input in code units) if any, or the number of code units validated
3795
   * if successful.
3796
   */
3797
  simdutf_warn_unused virtual result
3798
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0;
3799
3800
#endif // SIMDUTF_FEATURE_ASCII
3801
3802
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
3803
  /**
3804
   * Validate the ASCII string as a UTF-16BE sequence.
3805
   * A UTF-16 sequence is considered an ASCII sequence
3806
   * if it could be converted to an ASCII string losslessly.
3807
   *
3808
   * Overridden by each implementation.
3809
   *
3810
   * @param buf the UTF-16BE string to validate.
3811
   * @param len the length of the string in bytes.
3812
   * @return true if and only if the string is valid ASCII.
3813
   */
3814
  simdutf_warn_unused virtual bool
3815
  validate_utf16be_as_ascii(const char16_t *buf, size_t len) const noexcept = 0;
3816
3817
  /**
3818
   * Validate the ASCII string as a UTF-16LE sequence.
3819
   * A UTF-16 sequence is considered an ASCII sequence
3820
   * if it could be converted to an ASCII string losslessly.
3821
   *
3822
   * Overridden by each implementation.
3823
   *
3824
   * @param buf the UTF-16LE string to validate.
3825
   * @param len the length of the string in bytes.
3826
   * @return true if and only if the string is valid ASCII.
3827
   */
3828
  simdutf_warn_unused virtual bool
3829
  validate_utf16le_as_ascii(const char16_t *buf, size_t len) const noexcept = 0;
3830
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
3831
3832
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
3833
  /**
3834
   * Validate the UTF-16LE string. This function may be best when you expect
3835
   * the input to be almost always valid. Otherwise, consider using
3836
   * validate_utf16le_with_errors.
3837
   *
3838
   * Overridden by each implementation.
3839
   *
3840
   * This function is not BOM-aware.
3841
   *
3842
   * @param buf the UTF-16LE string to validate.
3843
   * @param len the length of the string in number of 2-byte code units
3844
   * (char16_t).
3845
   * @return true if and only if the string is valid UTF-16LE.
3846
   */
3847
  simdutf_warn_unused virtual bool
3848
  validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0;
3849
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
3850
3851
#if SIMDUTF_FEATURE_UTF16
3852
  /**
3853
   * Validate the UTF-16BE string. This function may be best when you expect
3854
   * the input to be almost always valid. Otherwise, consider using
3855
   * validate_utf16be_with_errors.
3856
   *
3857
   * Overridden by each implementation.
3858
   *
3859
   * This function is not BOM-aware.
3860
   *
3861
   * @param buf the UTF-16BE string to validate.
3862
   * @param len the length of the string in number of 2-byte code units
3863
   * (char16_t).
3864
   * @return true if and only if the string is valid UTF-16BE.
3865
   */
3866
  simdutf_warn_unused virtual bool
3867
  validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0;
3868
3869
  /**
3870
   * Validate the UTF-16LE string and stop on error.  It might be faster than
3871
   * validate_utf16le when an error is expected to occur early.
3872
   *
3873
   * Overridden by each implementation.
3874
   *
3875
   * This function is not BOM-aware.
3876
   *
3877
   * @param buf the UTF-16LE string to validate.
3878
   * @param len the length of the string in number of 2-byte code units
3879
   * (char16_t).
3880
   * @return a result pair struct (of type simdutf::result containing the two
3881
   * fields error and count) with an error code and either position of the error
3882
   * (in the input in code units) if any, or the number of code units validated
3883
   * if successful.
3884
   */
3885
  simdutf_warn_unused virtual result
3886
  validate_utf16le_with_errors(const char16_t *buf,
3887
                               size_t len) const noexcept = 0;
3888
3889
  /**
3890
   * Validate the UTF-16BE string and stop on error. It might be faster than
3891
   * validate_utf16be when an error is expected to occur early.
3892
   *
3893
   * Overridden by each implementation.
3894
   *
3895
   * This function is not BOM-aware.
3896
   *
3897
   * @param buf the UTF-16BE string to validate.
3898
   * @param len the length of the string in number of 2-byte code units
3899
   * (char16_t).
3900
   * @return a result pair struct (of type simdutf::result containing the two
3901
   * fields error and count) with an error code and either position of the error
3902
   * (in the input in code units) if any, or the number of code units validated
3903
   * if successful.
3904
   */
3905
  simdutf_warn_unused virtual result
3906
  validate_utf16be_with_errors(const char16_t *buf,
3907
                               size_t len) const noexcept = 0;
3908
  /**
3909
   * Copies the UTF-16LE string while replacing mismatched surrogates with the
3910
   * Unicode replacement character U+FFFD. We allow the input and output to be
3911
   * the same buffer so that the correction is done in-place.
3912
   *
3913
   * Overridden by each implementation.
3914
   *
3915
   * @param input the UTF-16LE string to correct.
3916
   * @param len the length of the string in number of 2-byte code units
3917
   * (char16_t).
3918
   * @param output the output buffer.
3919
   */
3920
  virtual void to_well_formed_utf16le(const char16_t *input, size_t len,
3921
                                      char16_t *output) const noexcept = 0;
3922
  /**
3923
   * Copies the UTF-16BE string while replacing mismatched surrogates with the
3924
   * Unicode replacement character U+FFFD. We allow the input and output to be
3925
   * the same buffer so that the correction is done in-place.
3926
   *
3927
   * Overridden by each implementation.
3928
   *
3929
   * @param input the UTF-16BE string to correct.
3930
   * @param len the length of the string in number of 2-byte code units
3931
   * (char16_t).
3932
   * @param output the output buffer.
3933
   */
3934
  virtual void to_well_formed_utf16be(const char16_t *input, size_t len,
3935
                                      char16_t *output) const noexcept = 0;
3936
#endif // SIMDUTF_FEATURE_UTF16
3937
3938
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
3939
  /**
3940
   * Validate the UTF-32 string.
3941
   *
3942
   * Overridden by each implementation.
3943
   *
3944
   * This function is not BOM-aware.
3945
   *
3946
   * @param buf the UTF-32 string to validate.
3947
   * @param len the length of the string in number of 4-byte code units
3948
   * (char32_t).
3949
   * @return true if and only if the string is valid UTF-32.
3950
   */
3951
  simdutf_warn_unused virtual bool
3952
  validate_utf32(const char32_t *buf, size_t len) const noexcept = 0;
3953
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
3954
3955
#if SIMDUTF_FEATURE_UTF32
3956
  /**
3957
   * Validate the UTF-32 string and stop on error.
3958
   *
3959
   * Overridden by each implementation.
3960
   *
3961
   * This function is not BOM-aware.
3962
   *
3963
   * @param buf the UTF-32 string to validate.
3964
   * @param len the length of the string in number of 4-byte code units
3965
   * (char32_t).
3966
   * @return a result pair struct (of type simdutf::result containing the two
3967
   * fields error and count) with an error code and either position of the error
3968
   * (in the input in code units) if any, or the number of code units validated
3969
   * if successful.
3970
   */
3971
  simdutf_warn_unused virtual result
3972
  validate_utf32_with_errors(const char32_t *buf,
3973
                             size_t len) const noexcept = 0;
3974
#endif // SIMDUTF_FEATURE_UTF32
3975
3976
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
3977
  /**
3978
   * Convert Latin1 string into UTF-8 string.
3979
   *
3980
   * This function is suitable to work with inputs from untrusted sources.
3981
   *
3982
   * @param input         the Latin1 string to convert
3983
   * @param length        the length of the string in bytes
3984
   * @param utf8_output  the pointer to buffer that can hold conversion result
3985
   * @return the number of written char; 0 if conversion is not possible
3986
   */
3987
  simdutf_warn_unused virtual size_t
3988
  convert_latin1_to_utf8(const char *input, size_t length,
3989
                         char *utf8_output) const noexcept = 0;
3990
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
3991
3992
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
3993
  /**
3994
   * Convert Latin1 string into UTF-16LE string.
3995
   *
3996
   * This function is suitable to work with inputs from untrusted sources.
3997
   *
3998
   * @param input         the Latin1 string to convert
3999
   * @param length        the length of the string in bytes
4000
   * @param utf16_output  the pointer to buffer that can hold conversion result
4001
   * @return the number of written char16_t; 0 if conversion is not possible
4002
   */
4003
  simdutf_warn_unused virtual size_t
4004
  convert_latin1_to_utf16le(const char *input, size_t length,
4005
                            char16_t *utf16_output) const noexcept = 0;
4006
4007
  /**
4008
   * Convert Latin1 string into UTF-16BE string.
4009
   *
4010
   * This function is suitable to work with inputs from untrusted sources.
4011
   *
4012
   * @param input         the Latin1 string to convert
4013
   * @param length        the length of the string in bytes
4014
   * @param utf16_output  the pointer to buffer that can hold conversion result
4015
   * @return the number of written char16_t; 0 if conversion is not possible
4016
   */
4017
  simdutf_warn_unused virtual size_t
4018
  convert_latin1_to_utf16be(const char *input, size_t length,
4019
                            char16_t *utf16_output) const noexcept = 0;
4020
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4021
4022
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4023
  /**
4024
   * Convert Latin1 string into UTF-32 string.
4025
   *
4026
   * This function is suitable to work with inputs from untrusted sources.
4027
   *
4028
   * @param input         the Latin1 string to convert
4029
   * @param length        the length of the string in bytes
4030
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
4031
   * @return the number of written char32_t; 0 if conversion is not possible
4032
   */
4033
  simdutf_warn_unused virtual size_t
4034
  convert_latin1_to_utf32(const char *input, size_t length,
4035
                          char32_t *utf32_buffer) const noexcept = 0;
4036
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4037
4038
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4039
  /**
4040
   * Convert possibly broken UTF-8 string into latin1 string.
4041
   *
4042
   * During the conversion also validation of the input string is done.
4043
   * This function is suitable to work with inputs from untrusted sources.
4044
   *
4045
   * @param input         the UTF-8 string to convert
4046
   * @param length        the length of the string in bytes
4047
   * @param latin1_output  the pointer to buffer that can hold conversion result
4048
   * @return the number of written char; 0 if the input was not valid UTF-8
4049
   * string or if it cannot be represented as Latin1
4050
   */
4051
  simdutf_warn_unused virtual size_t
4052
  convert_utf8_to_latin1(const char *input, size_t length,
4053
                         char *latin1_output) const noexcept = 0;
4054
4055
  /**
4056
   * Convert possibly broken UTF-8 string into latin1 string with errors.
4057
   * If the string cannot be represented as Latin1, an error
4058
   * code is returned.
4059
   *
4060
   * During the conversion also validation of the input string is done.
4061
   * This function is suitable to work with inputs from untrusted sources.
4062
   *
4063
   * @param input         the UTF-8 string to convert
4064
   * @param length        the length of the string in bytes
4065
   * @param latin1_output  the pointer to buffer that can hold conversion result
4066
   * @return a result pair struct (of type simdutf::result containing the two
4067
   * fields error and count) with an error code and either position of the error
4068
   * (in the input in code units) if any, or the number of code units validated
4069
   * if successful.
4070
   */
4071
  simdutf_warn_unused virtual result
4072
  convert_utf8_to_latin1_with_errors(const char *input, size_t length,
4073
                                     char *latin1_output) const noexcept = 0;
4074
4075
  /**
4076
   * Convert valid UTF-8 string into latin1 string.
4077
   *
4078
   * This function assumes that the input string is valid UTF-8 and that it can
4079
   * be represented as Latin1. If you violate this assumption, the result is
4080
   * implementation defined and may include system-dependent behavior such as
4081
   * crashes.
4082
   *
4083
   * This function is for expert users only and not part of our public API. Use
4084
   * convert_utf8_to_latin1 instead.
4085
   *
4086
   * This function is not BOM-aware.
4087
   *
4088
   * @param input         the UTF-8 string to convert
4089
   * @param length        the length of the string in bytes
4090
   * @param latin1_output  the pointer to buffer that can hold conversion result
4091
   * @return the number of written char; 0 if the input was not valid UTF-8
4092
   * string
4093
   */
4094
  simdutf_warn_unused virtual size_t
4095
  convert_valid_utf8_to_latin1(const char *input, size_t length,
4096
                               char *latin1_output) const noexcept = 0;
4097
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4098
4099
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4100
  /**
4101
   * Convert possibly broken UTF-8 string into UTF-16LE string.
4102
   *
4103
   * During the conversion also validation of the input string is done.
4104
   * This function is suitable to work with inputs from untrusted sources.
4105
   *
4106
   * @param input         the UTF-8 string to convert
4107
   * @param length        the length of the string in bytes
4108
   * @param utf16_output  the pointer to buffer that can hold conversion result
4109
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
4110
   * string
4111
   */
4112
  simdutf_warn_unused virtual size_t
4113
  convert_utf8_to_utf16le(const char *input, size_t length,
4114
                          char16_t *utf16_output) const noexcept = 0;
4115
4116
  /**
4117
   * Convert possibly broken UTF-8 string into UTF-16BE string.
4118
   *
4119
   * During the conversion also validation of the input string is done.
4120
   * This function is suitable to work with inputs from untrusted sources.
4121
   *
4122
   * @param input         the UTF-8 string to convert
4123
   * @param length        the length of the string in bytes
4124
   * @param utf16_output  the pointer to buffer that can hold conversion result
4125
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
4126
   * string
4127
   */
4128
  simdutf_warn_unused virtual size_t
4129
  convert_utf8_to_utf16be(const char *input, size_t length,
4130
                          char16_t *utf16_output) const noexcept = 0;
4131
4132
  /**
4133
   * Convert possibly broken UTF-8 string into UTF-16LE string and stop on
4134
   * error.
4135
   *
4136
   * During the conversion also validation of the input string is done.
4137
   * This function is suitable to work with inputs from untrusted sources.
4138
   *
4139
   * @param input         the UTF-8 string to convert
4140
   * @param length        the length of the string in bytes
4141
   * @param utf16_output  the pointer to buffer that can hold conversion result
4142
   * @return a result pair struct (of type simdutf::result containing the two
4143
   * fields error and count) with an error code and either position of the error
4144
   * (in the input in code units) if any, or the number of code units validated
4145
   * if successful.
4146
   */
4147
  simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(
4148
      const char *input, size_t length,
4149
      char16_t *utf16_output) const noexcept = 0;
4150
4151
  /**
4152
   * Convert possibly broken UTF-8 string into UTF-16BE string and stop on
4153
   * error.
4154
   *
4155
   * During the conversion also validation of the input string is done.
4156
   * This function is suitable to work with inputs from untrusted sources.
4157
   *
4158
   * @param input         the UTF-8 string to convert
4159
   * @param length        the length of the string in bytes
4160
   * @param utf16_output  the pointer to buffer that can hold conversion result
4161
   * @return a result pair struct (of type simdutf::result containing the two
4162
   * fields error and count) with an error code and either position of the error
4163
   * (in the input in code units) if any, or the number of code units validated
4164
   * if successful.
4165
   */
4166
  simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(
4167
      const char *input, size_t length,
4168
      char16_t *utf16_output) const noexcept = 0;
4169
  /**
4170
   * Compute the number of bytes that this UTF-16LE string would require in
4171
   * UTF-8 format even when the UTF-16LE content contains mismatched
4172
   * surrogates that have to be replaced by the replacement character (0xFFFD).
4173
   *
4174
   * @param input         the UTF-16LE string to convert
4175
   * @param length        the length of the string in 2-byte code units
4176
   * (char16_t)
4177
   * @return a result pair struct (of type simdutf::result containing the two
4178
   * fields error and count) where the count is the number of bytes required to
4179
   * encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS
4180
   * or SURROGATE. The count is correct regardless of the error field.
4181
   * When SURROGATE is returned, it does not indicate an error in the case of
4182
   * this function: it indicates that at least one surrogate has been
4183
   * encountered: the surrogates may be matched or not (thus this function does
4184
   * not validate). If the returned error code is SUCCESS, then the input
4185
   * contains no surrogate, is in the Basic Multilingual Plane, and is
4186
   * necessarily valid.
4187
   */
4188
  simdutf_warn_unused virtual result utf8_length_from_utf16le_with_replacement(
4189
      const char16_t *input, size_t length) const noexcept = 0;
4190
4191
  /**
4192
   * Compute the number of bytes that this UTF-16BE string would require in
4193
   * UTF-8 format even when the UTF-16BE content contains mismatched
4194
   * surrogates that have to be replaced by the replacement character (0xFFFD).
4195
   *
4196
   * @param input         the UTF-16BE string to convert
4197
   * @param length        the length of the string in 2-byte code units
4198
   * (char16_t)
4199
   * @return a result pair struct (of type simdutf::result containing the two
4200
   * fields error and count) where the count is the number of bytes required to
4201
   * encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS
4202
   * or SURROGATE. The count is correct regardless of the error field.
4203
   * When SURROGATE is returned, it does not indicate an error in the case of
4204
   * this function: it indicates that at least one surrogate has been
4205
   * encountered: the surrogates may be matched or not (thus this function does
4206
   * not validate). If the returned error code is SUCCESS, then the input
4207
   * contains no surrogate, is in the Basic Multilingual Plane, and is
4208
   * necessarily valid.
4209
   */
4210
  simdutf_warn_unused virtual result utf8_length_from_utf16be_with_replacement(
4211
      const char16_t *input, size_t length) const noexcept = 0;
4212
4213
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4214
4215
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4216
  /**
4217
   * Convert possibly broken UTF-8 string into UTF-32 string.
4218
   *
4219
   * During the conversion also validation of the input string is done.
4220
   * This function is suitable to work with inputs from untrusted sources.
4221
   *
4222
   * @param input         the UTF-8 string to convert
4223
   * @param length        the length of the string in bytes
4224
   * @param utf32_output  the pointer to buffer that can hold conversion result
4225
   * @return the number of written char32_t; 0 if the input was not valid UTF-8
4226
   * string
4227
   */
4228
  simdutf_warn_unused virtual size_t
4229
  convert_utf8_to_utf32(const char *input, size_t length,
4230
                        char32_t *utf32_output) const noexcept = 0;
4231
4232
  /**
4233
   * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
4234
   *
4235
   * During the conversion also validation of the input string is done.
4236
   * This function is suitable to work with inputs from untrusted sources.
4237
   *
4238
   * @param input         the UTF-8 string to convert
4239
   * @param length        the length of the string in bytes
4240
   * @param utf32_output  the pointer to buffer that can hold conversion result
4241
   * @return a result pair struct (of type simdutf::result containing the two
4242
   * fields error and count) with an error code and either position of the error
4243
   * (in the input in code units) if any, or the number of char32_t written if
4244
   * successful.
4245
   */
4246
  simdutf_warn_unused virtual result
4247
  convert_utf8_to_utf32_with_errors(const char *input, size_t length,
4248
                                    char32_t *utf32_output) const noexcept = 0;
4249
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4250
4251
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4252
  /**
4253
   * Convert valid UTF-8 string into UTF-16LE string.
4254
   *
4255
   * This function assumes that the input string is valid UTF-8.
4256
   *
4257
   * @param input         the UTF-8 string to convert
4258
   * @param length        the length of the string in bytes
4259
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
4260
   * @return the number of written char16_t
4261
   */
4262
  simdutf_warn_unused virtual size_t
4263
  convert_valid_utf8_to_utf16le(const char *input, size_t length,
4264
                                char16_t *utf16_buffer) const noexcept = 0;
4265
4266
  /**
4267
   * Convert valid UTF-8 string into UTF-16BE string.
4268
   *
4269
   * This function assumes that the input string is valid UTF-8.
4270
   *
4271
   * @param input         the UTF-8 string to convert
4272
   * @param length        the length of the string in bytes
4273
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
4274
   * @return the number of written char16_t
4275
   */
4276
  simdutf_warn_unused virtual size_t
4277
  convert_valid_utf8_to_utf16be(const char *input, size_t length,
4278
                                char16_t *utf16_buffer) const noexcept = 0;
4279
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4280
4281
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4282
  /**
4283
   * Convert valid UTF-8 string into UTF-32 string.
4284
   *
4285
   * This function assumes that the input string is valid UTF-8.
4286
   *
4287
   * @param input         the UTF-8 string to convert
4288
   * @param length        the length of the string in bytes
4289
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
4290
   * @return the number of written char32_t
4291
   */
4292
  simdutf_warn_unused virtual size_t
4293
  convert_valid_utf8_to_utf32(const char *input, size_t length,
4294
                              char32_t *utf32_buffer) const noexcept = 0;
4295
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4296
4297
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4298
  /**
4299
   * Compute the number of 2-byte code units that this UTF-8 string would
4300
   * require in UTF-16LE format.
4301
   *
4302
   * This function does not validate the input. It is acceptable to pass invalid
4303
   * UTF-8 strings but in such cases the result is implementation defined.
4304
   *
4305
   * @param input         the UTF-8 string to process
4306
   * @param length        the length of the string in bytes
4307
   * @return the number of char16_t code units required to encode the UTF-8
4308
   * string as UTF-16LE
4309
   */
4310
  simdutf_warn_unused virtual size_t
4311
  utf16_length_from_utf8(const char *input, size_t length) const noexcept = 0;
4312
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4313
4314
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4315
  /**
4316
   * Compute the number of 4-byte code units that this UTF-8 string would
4317
   * require in UTF-32 format.
4318
   *
4319
   * This function is equivalent to count_utf8. It is acceptable to pass invalid
4320
   * UTF-8 strings but in such cases the result is implementation defined.
4321
   *
4322
   * This function does not validate the input.
4323
   *
4324
   * @param input         the UTF-8 string to process
4325
   * @param length        the length of the string in bytes
4326
   * @return the number of char32_t code units required to encode the UTF-8
4327
   * string as UTF-32
4328
   */
4329
  simdutf_warn_unused virtual size_t
4330
  utf32_length_from_utf8(const char *input, size_t length) const noexcept = 0;
4331
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4332
4333
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4334
  /**
4335
   * Convert possibly broken UTF-16LE string into Latin1 string.
4336
   *
4337
   * During the conversion also validation of the input string is done.
4338
   * This function is suitable to work with inputs from untrusted sources.
4339
   *
4340
   * This function is not BOM-aware.
4341
   *
4342
   * @param input         the UTF-16LE string to convert
4343
   * @param length        the length of the string in 2-byte code units
4344
   * (char16_t)
4345
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4346
   * result
4347
   * @return number of written code units; 0 if input is not a valid UTF-16LE
4348
   * string or if it cannot be represented as Latin1
4349
   */
4350
  simdutf_warn_unused virtual size_t
4351
  convert_utf16le_to_latin1(const char16_t *input, size_t length,
4352
                            char *latin1_buffer) const noexcept = 0;
4353
4354
  /**
4355
   * Convert possibly broken UTF-16BE string into Latin1 string.
4356
   *
4357
   * During the conversion also validation of the input string is done.
4358
   * This function is suitable to work with inputs from untrusted sources.
4359
   *
4360
   * This function is not BOM-aware.
4361
   *
4362
   * @param input         the UTF-16BE string to convert
4363
   * @param length        the length of the string in 2-byte code units
4364
   * (char16_t)
4365
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4366
   * result
4367
   * @return number of written code units; 0 if input is not a valid UTF-16BE
4368
   * string or if it cannot be represented as Latin1
4369
   */
4370
  simdutf_warn_unused virtual size_t
4371
  convert_utf16be_to_latin1(const char16_t *input, size_t length,
4372
                            char *latin1_buffer) const noexcept = 0;
4373
4374
  /**
4375
   * Convert possibly broken UTF-16LE string into Latin1 string.
4376
   * If the string cannot be represented as Latin1, an error
4377
   * is returned.
4378
   *
4379
   * During the conversion also validation of the input string is done.
4380
   * This function is suitable to work with inputs from untrusted sources.
4381
   * This function is not BOM-aware.
4382
   *
4383
   * @param input         the UTF-16LE string to convert
4384
   * @param length        the length of the string in 2-byte code units
4385
   * (char16_t)
4386
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4387
   * result
4388
   * @return a result pair struct (of type simdutf::result containing the two
4389
   * fields error and count) with an error code and either position of the error
4390
   * (in the input in code units) if any, or the number of char written if
4391
   * successful.
4392
   */
4393
  simdutf_warn_unused virtual result
4394
  convert_utf16le_to_latin1_with_errors(const char16_t *input, size_t length,
4395
                                        char *latin1_buffer) const noexcept = 0;
4396
4397
  /**
4398
   * Convert possibly broken UTF-16BE string into Latin1 string.
4399
   * If the string cannot be represented as Latin1, an error
4400
   * is returned.
4401
   *
4402
   * During the conversion also validation of the input string is done.
4403
   * This function is suitable to work with inputs from untrusted sources.
4404
   * This function is not BOM-aware.
4405
   *
4406
   * @param input         the UTF-16BE string to convert
4407
   * @param length        the length of the string in 2-byte code units
4408
   * (char16_t)
4409
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4410
   * result
4411
   * @return a result pair struct (of type simdutf::result containing the two
4412
   * fields error and count) with an error code and either position of the error
4413
   * (in the input in code units) if any, or the number of char written if
4414
   * successful.
4415
   */
4416
  simdutf_warn_unused virtual result
4417
  convert_utf16be_to_latin1_with_errors(const char16_t *input, size_t length,
4418
                                        char *latin1_buffer) const noexcept = 0;
4419
4420
  /**
4421
   * Convert valid UTF-16LE string into Latin1 string.
4422
   *
4423
   * This function assumes that the input string is valid UTF-16LE and that it
4424
   * can be represented as Latin1. If you violate this assumption, the result is
4425
   * implementation defined and may include system-dependent behavior such as
4426
   * crashes.
4427
   *
4428
   * This function is for expert users only and not part of our public API. Use
4429
   * convert_utf16le_to_latin1 instead.
4430
   *
4431
   * This function is not BOM-aware.
4432
   *
4433
   * @param input         the UTF-16LE string to convert
4434
   * @param length        the length of the string in 2-byte code units
4435
   * (char16_t)
4436
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4437
   * result
4438
   * @return number of written code units; 0 if conversion is not possible
4439
   */
4440
  simdutf_warn_unused virtual size_t
4441
  convert_valid_utf16le_to_latin1(const char16_t *input, size_t length,
4442
                                  char *latin1_buffer) const noexcept = 0;
4443
4444
  /**
4445
   * Convert valid UTF-16BE string into Latin1 string.
4446
   *
4447
   * This function assumes that the input string is valid UTF-16BE and that it
4448
   * can be represented as Latin1. If you violate this assumption, the result is
4449
   * implementation defined and may include system-dependent behavior such as
4450
   * crashes.
4451
   *
4452
   * This function is for expert users only and not part of our public API. Use
4453
   * convert_utf16be_to_latin1 instead.
4454
   *
4455
   * This function is not BOM-aware.
4456
   *
4457
   * @param input         the UTF-16BE string to convert
4458
   * @param length        the length of the string in 2-byte code units
4459
   * (char16_t)
4460
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4461
   * result
4462
   * @return number of written code units; 0 if conversion is not possible
4463
   */
4464
  simdutf_warn_unused virtual size_t
4465
  convert_valid_utf16be_to_latin1(const char16_t *input, size_t length,
4466
                                  char *latin1_buffer) const noexcept = 0;
4467
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4468
4469
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4470
  /**
4471
   * Convert possibly broken UTF-16LE string into UTF-8 string.
4472
   *
4473
   * During the conversion also validation of the input string is done.
4474
   * This function is suitable to work with inputs from untrusted sources.
4475
   *
4476
   * This function is not BOM-aware.
4477
   *
4478
   * @param input         the UTF-16LE string to convert
4479
   * @param length        the length of the string in 2-byte code units
4480
   * (char16_t)
4481
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4482
   * @return number of written code units; 0 if input is not a valid UTF-16LE
4483
   * string
4484
   */
4485
  simdutf_warn_unused virtual size_t
4486
  convert_utf16le_to_utf8(const char16_t *input, size_t length,
4487
                          char *utf8_buffer) const noexcept = 0;
4488
4489
  /**
4490
   * Convert possibly broken UTF-16BE string into UTF-8 string.
4491
   *
4492
   * During the conversion also validation of the input string is done.
4493
   * This function is suitable to work with inputs from untrusted sources.
4494
   *
4495
   * This function is not BOM-aware.
4496
   *
4497
   * @param input         the UTF-16BE string to convert
4498
   * @param length        the length of the string in 2-byte code units
4499
   * (char16_t)
4500
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4501
   * @return number of written code units; 0 if input is not a valid UTF-16BE
4502
   * string
4503
   */
4504
  simdutf_warn_unused virtual size_t
4505
  convert_utf16be_to_utf8(const char16_t *input, size_t length,
4506
                          char *utf8_buffer) const noexcept = 0;
4507
4508
  /**
4509
   * Convert possibly broken UTF-16LE string into UTF-8 string and stop on
4510
   * error.
4511
   *
4512
   * During the conversion also validation of the input string is done.
4513
   * This function is suitable to work with inputs from untrusted sources.
4514
   *
4515
   * This function is not BOM-aware.
4516
   *
4517
   * @param input         the UTF-16LE string to convert
4518
   * @param length        the length of the string in 2-byte code units
4519
   * (char16_t)
4520
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4521
   * @return a result pair struct (of type simdutf::result containing the two
4522
   * fields error and count) with an error code and either position of the error
4523
   * (in the input in code units) if any, or the number of char written if
4524
   * successful.
4525
   */
4526
  simdutf_warn_unused virtual result
4527
  convert_utf16le_to_utf8_with_errors(const char16_t *input, size_t length,
4528
                                      char *utf8_buffer) const noexcept = 0;
4529
4530
  /**
4531
   * Convert possibly broken UTF-16BE string into UTF-8 string and stop on
4532
   * error.
4533
   *
4534
   * During the conversion also validation of the input string is done.
4535
   * This function is suitable to work with inputs from untrusted sources.
4536
   *
4537
   * This function is not BOM-aware.
4538
   *
4539
   * @param input         the UTF-16BE string to convert
4540
   * @param length        the length of the string in 2-byte code units
4541
   * (char16_t)
4542
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4543
   * @return a result pair struct (of type simdutf::result containing the two
4544
   * fields error and count) with an error code and either position of the error
4545
   * (in the input in code units) if any, or the number of char written if
4546
   * successful.
4547
   */
4548
  simdutf_warn_unused virtual result
4549
  convert_utf16be_to_utf8_with_errors(const char16_t *input, size_t length,
4550
                                      char *utf8_buffer) const noexcept = 0;
4551
4552
  /**
4553
   * Convert valid UTF-16LE string into UTF-8 string.
4554
   *
4555
   * This function assumes that the input string is valid UTF-16LE.
4556
   *
4557
   * This function is not BOM-aware.
4558
   *
4559
   * @param input         the UTF-16LE string to convert
4560
   * @param length        the length of the string in 2-byte code units
4561
   * (char16_t)
4562
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
4563
   * result
4564
   * @return number of written code units; 0 if conversion is not possible
4565
   */
4566
  simdutf_warn_unused virtual size_t
4567
  convert_valid_utf16le_to_utf8(const char16_t *input, size_t length,
4568
                                char *utf8_buffer) const noexcept = 0;
4569
4570
  /**
4571
   * Convert valid UTF-16BE string into UTF-8 string.
4572
   *
4573
   * This function assumes that the input string is valid UTF-16BE.
4574
   *
4575
   * This function is not BOM-aware.
4576
   *
4577
   * @param input         the UTF-16BE string to convert
4578
   * @param length        the length of the string in 2-byte code units
4579
   * (char16_t)
4580
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
4581
   * result
4582
   * @return number of written code units; 0 if conversion is not possible
4583
   */
4584
  simdutf_warn_unused virtual size_t
4585
  convert_valid_utf16be_to_utf8(const char16_t *input, size_t length,
4586
                                char *utf8_buffer) const noexcept = 0;
4587
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4588
4589
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4590
  /**
4591
   * Convert possibly broken UTF-16LE string into UTF-32 string.
4592
   *
4593
   * During the conversion also validation of the input string is done.
4594
   * This function is suitable to work with inputs from untrusted sources.
4595
   *
4596
   * This function is not BOM-aware.
4597
   *
4598
   * @param input         the UTF-16LE string to convert
4599
   * @param length        the length of the string in 2-byte code units
4600
   * (char16_t)
4601
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4602
   * @return number of written code units; 0 if input is not a valid UTF-16LE
4603
   * string
4604
   */
4605
  simdutf_warn_unused virtual size_t
4606
  convert_utf16le_to_utf32(const char16_t *input, size_t length,
4607
                           char32_t *utf32_buffer) const noexcept = 0;
4608
4609
  /**
4610
   * Convert possibly broken UTF-16BE string into UTF-32 string.
4611
   *
4612
   * During the conversion also validation of the input string is done.
4613
   * This function is suitable to work with inputs from untrusted sources.
4614
   *
4615
   * This function is not BOM-aware.
4616
   *
4617
   * @param input         the UTF-16BE string to convert
4618
   * @param length        the length of the string in 2-byte code units
4619
   * (char16_t)
4620
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4621
   * @return number of written code units; 0 if input is not a valid UTF-16BE
4622
   * string
4623
   */
4624
  simdutf_warn_unused virtual size_t
4625
  convert_utf16be_to_utf32(const char16_t *input, size_t length,
4626
                           char32_t *utf32_buffer) const noexcept = 0;
4627
4628
  /**
4629
   * Convert possibly broken UTF-16LE string into UTF-32 string and stop on
4630
   * error.
4631
   *
4632
   * During the conversion also validation of the input string is done.
4633
   * This function is suitable to work with inputs from untrusted sources.
4634
   *
4635
   * This function is not BOM-aware.
4636
   *
4637
   * @param input         the UTF-16LE string to convert
4638
   * @param length        the length of the string in 2-byte code units
4639
   * (char16_t)
4640
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4641
   * @return a result pair struct (of type simdutf::result containing the two
4642
   * fields error and count) with an error code and either position of the error
4643
   * (in the input in code units) if any, or the number of char32_t written if
4644
   * successful.
4645
   */
4646
  simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(
4647
      const char16_t *input, size_t length,
4648
      char32_t *utf32_buffer) const noexcept = 0;
4649
4650
  /**
4651
   * Convert possibly broken UTF-16BE string into UTF-32 string and stop on
4652
   * error.
4653
   *
4654
   * During the conversion also validation of the input string is done.
4655
   * This function is suitable to work with inputs from untrusted sources.
4656
   *
4657
   * This function is not BOM-aware.
4658
   *
4659
   * @param input         the UTF-16BE string to convert
4660
   * @param length        the length of the string in 2-byte code units
4661
   * (char16_t)
4662
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
4663
   * @return a result pair struct (of type simdutf::result containing the two
4664
   * fields error and count) with an error code and either position of the error
4665
   * (in the input in code units) if any, or the number of char32_t written if
4666
   * successful.
4667
   */
4668
  simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(
4669
      const char16_t *input, size_t length,
4670
      char32_t *utf32_buffer) const noexcept = 0;
4671
4672
  /**
4673
   * Convert valid UTF-16LE string into UTF-32 string.
4674
   *
4675
   * This function assumes that the input string is valid UTF-16LE.
4676
   *
4677
   * This function is not BOM-aware.
4678
   *
4679
   * @param input         the UTF-16LE string to convert
4680
   * @param length        the length of the string in 2-byte code units
4681
   * (char16_t)
4682
   * @param utf32_buffer   the pointer to a buffer that can hold the conversion
4683
   * result
4684
   * @return number of written code units; 0 if conversion is not possible
4685
   */
4686
  simdutf_warn_unused virtual size_t
4687
  convert_valid_utf16le_to_utf32(const char16_t *input, size_t length,
4688
                                 char32_t *utf32_buffer) const noexcept = 0;
4689
4690
  /**
4691
   * Convert valid UTF-16BE string into UTF-32 string.
4692
   *
4693
   * This function assumes that the input string is valid UTF-16BE.
4694
   *
4695
   * This function is not BOM-aware.
4696
   *
4697
   * @param input         the UTF-16BE string to convert
4698
   * @param length        the length of the string in 2-byte code units
4699
   * (char16_t)
4700
   * @param utf32_buffer   the pointer to a buffer that can hold the conversion
4701
   * result
4702
   * @return number of written code units; 0 if conversion is not possible
4703
   */
4704
  simdutf_warn_unused virtual size_t
4705
  convert_valid_utf16be_to_utf32(const char16_t *input, size_t length,
4706
                                 char32_t *utf32_buffer) const noexcept = 0;
4707
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4708
4709
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4710
  /**
4711
   * Compute the number of bytes that this UTF-16LE string would require in
4712
   * UTF-8 format.
4713
   *
4714
   * This function does not validate the input. It is acceptable to pass invalid
4715
   * UTF-16 strings but in such cases the result is implementation defined.
4716
   *
4717
   * This function is not BOM-aware.
4718
   *
4719
   * @param input         the UTF-16LE string to convert
4720
   * @param length        the length of the string in 2-byte code units
4721
   * (char16_t)
4722
   * @return the number of bytes required to encode the UTF-16LE string as UTF-8
4723
   */
4724
  simdutf_warn_unused virtual size_t
4725
  utf8_length_from_utf16le(const char16_t *input,
4726
                           size_t length) const noexcept = 0;
4727
4728
  /**
4729
   * Compute the number of bytes that this UTF-16BE string would require in
4730
   * UTF-8 format.
4731
   *
4732
   * This function does not validate the input. It is acceptable to pass invalid
4733
   * UTF-16 strings but in such cases the result is implementation defined.
4734
   *
4735
   * This function is not BOM-aware.
4736
   *
4737
   * @param input         the UTF-16BE string to convert
4738
   * @param length        the length of the string in 2-byte code units
4739
   * (char16_t)
4740
   * @return the number of bytes required to encode the UTF-16BE string as UTF-8
4741
   */
4742
  simdutf_warn_unused virtual size_t
4743
  utf8_length_from_utf16be(const char16_t *input,
4744
                           size_t length) const noexcept = 0;
4745
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4746
4747
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4748
  /**
4749
   * Convert possibly broken UTF-32 string into Latin1 string.
4750
   *
4751
   * During the conversion also validation of the input string is done.
4752
   * This function is suitable to work with inputs from untrusted sources.
4753
   *
4754
   * This function is not BOM-aware.
4755
   *
4756
   * @param input         the UTF-32 string to convert
4757
   * @param length        the length of the string in 4-byte code units
4758
   * (char32_t)
4759
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4760
   * result
4761
   * @return number of written code units; 0 if input is not a valid UTF-32
4762
   * string
4763
   */
4764
  simdutf_warn_unused virtual size_t
4765
  convert_utf32_to_latin1(const char32_t *input, size_t length,
4766
                          char *latin1_buffer) const noexcept = 0;
4767
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4768
4769
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4770
  /**
4771
   * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
4772
   * If the string cannot be represented as Latin1, an error is returned.
4773
   *
4774
   * During the conversion also validation of the input string is done.
4775
   * This function is suitable to work with inputs from untrusted sources.
4776
   *
4777
   * This function is not BOM-aware.
4778
   *
4779
   * @param input         the UTF-32 string to convert
4780
   * @param length        the length of the string in 4-byte code units
4781
   * (char32_t)
4782
   * @param latin1_buffer   the pointer to buffer that can hold conversion
4783
   * result
4784
   * @return a result pair struct (of type simdutf::result containing the two
4785
   * fields error and count) with an error code and either position of the error
4786
   * (in the input in code units) if any, or the number of char written if
4787
   * successful.
4788
   */
4789
  simdutf_warn_unused virtual result
4790
  convert_utf32_to_latin1_with_errors(const char32_t *input, size_t length,
4791
                                      char *latin1_buffer) const noexcept = 0;
4792
4793
  /**
4794
   * Convert valid UTF-32 string into Latin1 string.
4795
   *
4796
   * This function assumes that the input string is valid UTF-32 and can be
4797
   * represented as Latin1. If you violate this assumption, the result is
4798
   * implementation defined and may include system-dependent behavior such as
4799
   * crashes.
4800
   *
4801
   * This function is for expert users only and not part of our public API. Use
4802
   * convert_utf32_to_latin1 instead.
4803
   *
4804
   * This function is not BOM-aware.
4805
   *
4806
   * @param input         the UTF-32 string to convert
4807
   * @param length        the length of the string in 4-byte code units
4808
   * (char32_t)
4809
   * @param latin1_buffer   the pointer to a buffer that can hold the conversion
4810
   * result
4811
   * @return number of written code units; 0 if conversion is not possible
4812
   */
4813
  simdutf_warn_unused virtual size_t
4814
  convert_valid_utf32_to_latin1(const char32_t *input, size_t length,
4815
                                char *latin1_buffer) const noexcept = 0;
4816
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4817
4818
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4819
  /**
4820
   * Convert possibly broken UTF-32 string into UTF-8 string.
4821
   *
4822
   * During the conversion also validation of the input string is done.
4823
   * This function is suitable to work with inputs from untrusted sources.
4824
   *
4825
   * This function is not BOM-aware.
4826
   *
4827
   * @param input         the UTF-32 string to convert
4828
   * @param length        the length of the string in 4-byte code units
4829
   * (char32_t)
4830
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4831
   * @return number of written code units; 0 if input is not a valid UTF-32
4832
   * string
4833
   */
4834
  simdutf_warn_unused virtual size_t
4835
  convert_utf32_to_utf8(const char32_t *input, size_t length,
4836
                        char *utf8_buffer) const noexcept = 0;
4837
4838
  /**
4839
   * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
4840
   *
4841
   * During the conversion also validation of the input string is done.
4842
   * This function is suitable to work with inputs from untrusted sources.
4843
   *
4844
   * This function is not BOM-aware.
4845
   *
4846
   * @param input         the UTF-32 string to convert
4847
   * @param length        the length of the string in 4-byte code units
4848
   * (char32_t)
4849
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
4850
   * @return a result pair struct (of type simdutf::result containing the two
4851
   * fields error and count) with an error code and either position of the error
4852
   * (in the input in code units) if any, or the number of char written if
4853
   * successful.
4854
   */
4855
  simdutf_warn_unused virtual result
4856
  convert_utf32_to_utf8_with_errors(const char32_t *input, size_t length,
4857
                                    char *utf8_buffer) const noexcept = 0;
4858
4859
  /**
4860
   * Convert valid UTF-32 string into UTF-8 string.
4861
   *
4862
   * This function assumes that the input string is valid UTF-32.
4863
   *
4864
   * This function is not BOM-aware.
4865
   *
4866
   * @param input         the UTF-32 string to convert
4867
   * @param length        the length of the string in 4-byte code units
4868
   * (char32_t)
4869
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
4870
   * result
4871
   * @return number of written code units; 0 if conversion is not possible
4872
   */
4873
  simdutf_warn_unused virtual size_t
4874
  convert_valid_utf32_to_utf8(const char32_t *input, size_t length,
4875
                              char *utf8_buffer) const noexcept = 0;
4876
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
4877
4878
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4879
  /**
4880
   * Return the number of 2-byte code units (char16_t) that a Latin1 string of
4881
   * the given length would require in UTF-16 format.
4882
   *
4883
   * This default implementation returns length unchanged.
4884
   *
4885
   * @param length        the length of the Latin1 string in bytes
4886
   * @return the number of char16_t code units required to encode the Latin1
4887
   * string as UTF-16
4888
   */
4889
  simdutf_warn_unused virtual size_t
4890
  utf16_length_from_latin1(size_t length) const noexcept {
4891
    return length;
4892
  }
4893
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4894
4895
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
4896
  /**
4897
   * Convert possibly broken UTF-32 string into UTF-16LE string.
4898
   *
4899
   * During the conversion also validation of the input string is done.
4900
   * This function is suitable to work with inputs from untrusted sources.
4901
   *
4902
   * This function is not BOM-aware.
4903
   *
4904
   * @param input         the UTF-32 string to convert
4905
   * @param length        the length of the string in 4-byte code units
4906
   * (char32_t)
4907
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4908
   * @return number of written code units; 0 if input is not a valid UTF-32
4909
   * string
4910
   */
4911
  simdutf_warn_unused virtual size_t
4912
  convert_utf32_to_utf16le(const char32_t *input, size_t length,
4913
                           char16_t *utf16_buffer) const noexcept = 0;
4914
4915
  /**
4916
   * Convert possibly broken UTF-32 string into UTF-16BE string.
4917
   *
4918
   * During the conversion also validation of the input string is done.
4919
   * This function is suitable to work with inputs from untrusted sources.
4920
   *
4921
   * This function is not BOM-aware.
4922
   *
4923
   * @param input         the UTF-32 string to convert
4924
   * @param length        the length of the string in 4-byte code units
4925
   * (char32_t)
4926
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4927
   * @return number of written code units; 0 if input is not a valid UTF-32
4928
   * string
4929
   */
4930
  simdutf_warn_unused virtual size_t
4931
  convert_utf32_to_utf16be(const char32_t *input, size_t length,
4932
                           char16_t *utf16_buffer) const noexcept = 0;
4933
4934
  /**
4935
   * Convert possibly broken UTF-32 string into UTF-16LE string and stop on
4936
   * error.
4937
   *
4938
   * During the conversion also validation of the input string is done.
4939
   * This function is suitable to work with inputs from untrusted sources.
4940
   *
4941
   * This function is not BOM-aware.
4942
   *
4943
   * @param input         the UTF-32 string to convert
4944
   * @param length        the length of the string in 4-byte code units
4945
   * (char32_t)
4946
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4947
   * @return a result pair struct (of type simdutf::result containing the two
4948
   * fields error and count) with an error code and either position of the error
4949
   * (in the input in code units) if any, or the number of char16_t written if
4950
   * successful.
4951
   */
4952
  simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(
4953
      const char32_t *input, size_t length,
4954
      char16_t *utf16_buffer) const noexcept = 0;
4955
4956
  /**
4957
   * Convert possibly broken UTF-32 string into UTF-16BE string and stop on
4958
   * error.
4959
   *
4960
   * During the conversion also validation of the input string is done.
4961
   * This function is suitable to work with inputs from untrusted sources.
4962
   *
4963
   * This function is not BOM-aware.
4964
   *
4965
   * @param input         the UTF-32 string to convert
4966
   * @param length        the length of the string in 4-byte code units
4967
   * (char32_t)
4968
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
4969
   * @return a result pair struct (of type simdutf::result containing the two
4970
   * fields error and count) with an error code and either position of the error
4971
   * (in the input in code units) if any, or the number of char16_t written if
4972
   * successful.
4973
   */
4974
  simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(
4975
      const char32_t *input, size_t length,
4976
      char16_t *utf16_buffer) const noexcept = 0;
4977
4978
  /**
4979
   * Convert valid UTF-32 string into UTF-16LE string.
4980
   *
4981
   * This function assumes that the input string is valid UTF-32.
4982
   *
4983
   * This function is not BOM-aware.
4984
   *
4985
   * @param input         the UTF-32 string to convert
4986
   * @param length        the length of the string in 4-byte code units
4987
   * (char32_t)
4988
   * @param utf16_buffer   the pointer to a buffer that can hold the conversion
4989
   * result
4990
   * @return number of written code units; 0 if conversion is not possible
4991
   */
4992
  simdutf_warn_unused virtual size_t
4993
  convert_valid_utf32_to_utf16le(const char32_t *input, size_t length,
4994
                                 char16_t *utf16_buffer) const noexcept = 0;
4995
4996
  /**
4997
   * Convert valid UTF-32 string into UTF-16BE string.
4998
   *
4999
   * This function assumes that the input string is valid UTF-32.
5000
   *
5001
   * This function is not BOM-aware.
5002
   *
5003
   * @param input         the UTF-32 string to convert
5004
   * @param length        the length of the string in 4-byte code units
5005
   * (char32_t)
5006
   * @param utf16_buffer   the pointer to a buffer that can hold the conversion
5007
   * result
5008
   * @return number of written code units; 0 if conversion is not possible
5009
   */
5010
  simdutf_warn_unused virtual size_t
5011
  convert_valid_utf32_to_utf16be(const char32_t *input, size_t length,
5012
                                 char16_t *utf16_buffer) const noexcept = 0;
5013
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5014
5015
#if SIMDUTF_FEATURE_UTF16
5016
  /**
5017
   * Change the endianness of the input. Can be used to go from UTF-16LE to
5018
   * UTF-16BE or from UTF-16BE to UTF-16LE.
5019
   *
5020
   * This function does not validate the input.
5021
   *
5022
   * This function is not BOM-aware.
5023
   *
5024
   * @param input         the UTF-16 string to process
5025
   * @param length        the length of the string in 2-byte code units
5026
   * (char16_t)
5027
   * @param output        the pointer to a buffer that can hold the conversion
5028
   * result
5029
   */
5030
  virtual void change_endianness_utf16(const char16_t *input, size_t length,
5031
                                       char16_t *output) const noexcept = 0;
5032
#endif // SIMDUTF_FEATURE_UTF16
5033
5034
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5035
  /**
5036
   * Return the number of bytes that this Latin1 string would require in UTF-8
5037
   * format.
5038
   *
5039
   * @param input         the Latin1 string to convert
5040
   * @param length        the length of the string in bytes
5041
   * @return the number of bytes required to encode the Latin1 string as UTF-8
5042
   */
5043
  simdutf_warn_unused virtual size_t
5044
  utf8_length_from_latin1(const char *input, size_t length) const noexcept = 0;
5045
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5046
5047
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5048
  /**
5049
   * Compute the number of bytes that this UTF-32 string would require in UTF-8
5050
   * format.
5051
   *
5052
   * This function does not validate the input. It is acceptable to pass invalid
5053
   * UTF-32 strings but in such cases the result is implementation defined.
5054
   *
5055
   * @param input         the UTF-32 string to convert
5056
   * @param length        the length of the string in 4-byte code units
5057
   * (char32_t)
5058
   * @return the number of bytes required to encode the UTF-32 string as UTF-8
5059
   */
5060
  simdutf_warn_unused virtual size_t
5061
  utf8_length_from_utf32(const char32_t *input,
5062
                         size_t length) const noexcept = 0;
5063
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5064
5065
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5066
  /**
5067
   * Compute the number of bytes that this UTF-32 string would require in Latin1
5068
   * format.
5069
   *
5070
   * This function does not validate the input. It is acceptable to pass invalid
5071
   * UTF-32 strings but in such cases the result is implementation defined.
5072
   *
5073
   * @param length        the length of the string in 4-byte code units
5074
   * (char32_t)
5075
   * @return the number of bytes required to encode the UTF-32 string as Latin1
5076
   */
5077
  simdutf_warn_unused virtual size_t
5078
  latin1_length_from_utf32(size_t length) const noexcept {
5079
    return length; // one Latin1 byte per UTF-32 code unit
5080
  }
5081
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5082
5083
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5084
  /**
5085
   * Compute the number of bytes that this UTF-8 string would require in Latin1
5086
   * format.
5087
   *
5088
   * This function does not validate the input. It is acceptable to pass invalid
5089
   * UTF-8 strings but in such cases the result is implementation defined.
5090
   *
5091
   * @param input         the UTF-8 string to convert
5092
   * @param length        the length of the string in bytes
5093
   * @return the number of bytes required to encode the UTF-8 string as Latin1
5094
   */
5095
  simdutf_warn_unused virtual size_t
5096
  latin1_length_from_utf8(const char *input, size_t length) const noexcept = 0;
5097
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5098
5099
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5100
  /**
5101
   * Compute the number of bytes that this UTF-16LE/BE string would require in
5102
   * Latin1 format.
5103
   *
5104
   * This function does not validate the input. It is acceptable to pass invalid
5105
   * UTF-16 strings but in such cases the result is implementation defined.
5106
   *
5107
   * This function is not BOM-aware.
5108
   *
5109
   * This default implementation returns length unchanged.
5110
   *
5111
   * @param length        the length of the string in 2-byte code units
5112
   * (char16_t)
5113
   * @return the number of bytes required to encode the UTF-16 string as Latin1
5114
   */
5115
  simdutf_warn_unused virtual size_t
5116
  latin1_length_from_utf16(size_t length) const noexcept {
5117
    return length;
5118
  }
5119
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5120
5121
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5122
  /**
5123
   * Compute the number of two-byte code units that this UTF-32 string would
5124
   * require in UTF-16 format.
5125
   *
5126
   * This function does not validate the input. It is acceptable to pass invalid
5127
   * UTF-32 strings but in such cases the result is implementation defined.
5128
   *
5129
   * @param input         the UTF-32 string to convert
5130
   * @param length        the length of the string in 4-byte code units
5131
   * (char32_t)
5132
   * @return the number of char16_t code units required for the UTF-16 encoding
5133
   */
5134
  simdutf_warn_unused virtual size_t
5135
  utf16_length_from_utf32(const char32_t *input,
5136
                          size_t length) const noexcept = 0;
5137
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5138
5139
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5140
  /**
5141
   * Return the number of 4-byte code units (char32_t) that a Latin1 string of
5142
   * the given length would require in UTF-32 format.
5143
   *
5144
   * @param length        the length of the Latin1 string in bytes
5145
   * @return the number of char32_t code units required to encode the Latin1
5146
   * string as UTF-32
5147
   */
5148
  simdutf_warn_unused virtual size_t
5149
  utf32_length_from_latin1(size_t length) const noexcept {
5150
    return length;
5151
  }
5152
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5153
5154
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5155
  /**
5156
   * Compute the number of 4-byte code units (char32_t) that this UTF-16LE
5157
   * string would require in UTF-32 format.
5158
   *
5159
   * This function is equivalent to count_utf16le.
5160
   *
5161
   * This function does not validate the input. It is acceptable to pass invalid
5162
   * UTF-16 strings but in such cases the result is implementation defined.
5163
   *
5164
   * This function is not BOM-aware.
5165
   *
5166
   * @param input         the UTF-16LE string to convert
5167
   * @param length        the length of the string in 2-byte code units
5168
   * (char16_t)
5169
   * @return the number of char32_t code units required to encode the UTF-16LE
5170
   * string as UTF-32
5171
   */
5172
  simdutf_warn_unused virtual size_t
5173
  utf32_length_from_utf16le(const char16_t *input,
5174
                            size_t length) const noexcept = 0;
5175
5176
  /**
5177
   * Compute the number of 4-byte code units (char32_t) that this UTF-16BE
5178
   * string would require in UTF-32 format.
5179
   *
5180
   * This function is equivalent to count_utf16be.
5181
   *
5182
   * This function does not validate the input. It is acceptable to pass invalid
5183
   * UTF-16 strings but in such cases the result is implementation defined.
5184
   *
5185
   * This function is not BOM-aware.
5186
   *
5187
   * @param input         the UTF-16BE string to convert
5188
   * @param length        the length of the string in 2-byte code units
5189
   * (char16_t)
5190
   * @return the number of char32_t code units required to encode the UTF-16BE
5191
   * string as UTF-32
5192
   */
5193
  simdutf_warn_unused virtual size_t
5194
  utf32_length_from_utf16be(const char16_t *input,
5195
                            size_t length) const noexcept = 0;
5196
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5197
5198
#if SIMDUTF_FEATURE_UTF16
5199
  /**
5200
   * Count the number of code points (characters) in the string assuming that
5201
   * it is valid.
5202
   *
5203
   * This function assumes that the input string is valid UTF-16LE.
5204
   * It is acceptable to pass invalid UTF-16 strings but in such cases
5205
   * the result is implementation defined.
5206
   *
5207
   * This function is not BOM-aware.
5208
   *
5209
   * @param input         the UTF-16LE string to process
5210
   * @param length        the length of the string in 2-byte code units
5211
   * (char16_t)
5212
   * @return number of code points
5213
   */
5214
  simdutf_warn_unused virtual size_t
5215
  count_utf16le(const char16_t *input, size_t length) const noexcept = 0;
5216
5217
  /**
5218
   * Count the number of code points (characters) in the string assuming that
5219
   * it is valid.
5220
   *
5221
   * This function assumes that the input string is valid UTF-16BE.
5222
   * It is acceptable to pass invalid UTF-16 strings but in such cases
5223
   * the result is implementation defined.
5224
   *
5225
   * This function is not BOM-aware.
5226
   *
5227
   * @param input         the UTF-16BE string to process
5228
   * @param length        the length of the string in 2-byte code units
5229
   * (char16_t)
5230
   * @return number of code points
5231
   */
5232
  simdutf_warn_unused virtual size_t
5233
  count_utf16be(const char16_t *input, size_t length) const noexcept = 0;
5234
#endif // SIMDUTF_FEATURE_UTF16
5235
5236
#if SIMDUTF_FEATURE_UTF8
5237
  /**
5238
   * Count the number of code points (characters) in the string assuming that
5239
   * it is valid.
5240
   *
5241
   * This function assumes that the input string is valid UTF-8.
5242
   * It is acceptable to pass invalid UTF-8 strings but in such cases
5243
   * the result is implementation defined.
5244
   *
5245
   * @param input         the UTF-8 string to process
5246
   * @param length        the length of the string in bytes
5247
   * @return number of code points
5248
   */
5249
  simdutf_warn_unused virtual size_t
5250
  count_utf8(const char *input, size_t length) const noexcept = 0;
5251
#endif // SIMDUTF_FEATURE_UTF8
5252
5253
#if SIMDUTF_FEATURE_BASE64
5254
  /**
5255
   * Provide the maximal binary length in bytes given the base64 input.
5256
   * In general, if the input contains ASCII spaces, the result will be less
5257
   * than the maximum length. It is acceptable to pass invalid base64 strings
5258
   * but in such cases the result is implementation defined.
5259
   *
5260
   * @param input         the base64 input to process
5261
   * @param length        the length of the base64 input in bytes
5262
   * @return maximal number of binary bytes
5263
   */
5264
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
5265
      const char *input, size_t length) const noexcept;
5266
5267
  /**
5268
   * Provide the maximal binary length in bytes given the base64 input.
5269
   * In general, if the input contains ASCII spaces, the result will be less
5270
   * than the maximum length. It is acceptable to pass invalid base64 strings
5271
   * but in such cases the result is implementation defined.
5272
   *
5273
   * @param input         the base64 input to process, in ASCII stored as 16-bit
5274
   * units
5275
   * @param length        the length of the base64 input in 16-bit units
5276
   * @return maximal number of binary bytes
5277
   */
5278
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
5279
      const char16_t *input, size_t length) const noexcept;
5280
5281
  /**
5282
   * Convert a base64 input to a binary output.
5283
   *
5284
   * This function follows the WHATWG forgiving-base64 format, which means that
5285
   * it will ignore any ASCII spaces in the input. You may provide a padded
5286
   * input (with one or two equal signs at the end) or an unpadded input
5287
   * (without any equal signs at the end).
5288
   *
5289
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5290
   *
5291
   * This function will fail in case of invalid input. When last_chunk_options =
5292
   * loose, there are two possible reasons for failure: the input contains a
5293
   * number of base64 characters that when divided by 4, leaves a single
5294
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5295
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5296
   *
5297
   * You should call this function with a buffer that is at least
5298
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
5299
   * provide that much space, the function may cause a buffer overflow.
5300
   *
5301
   * @param input         the base64 string to process
5302
   * @param length        the length of the string in bytes
5303
   * @param output        the pointer to a buffer that can hold the conversion
5304
   * result (should be at least maximal_binary_length_from_base64(input, length)
5305
   * bytes long).
5306
   * @param options       the base64 options to use, can be base64_default or
5307
   * base64_url, is base64_default by default.
5308
   * @return a result pair struct (of type simdutf::result containing the two
5309
   * fields error and count) with an error code and either position of the error
5310
   * (in the input in bytes) if any, or the number of bytes written if
5311
   * successful.
5312
   */
5313
  simdutf_warn_unused virtual result
5314
  base64_to_binary(const char *input, size_t length, char *output,
5315
                   base64_options options = base64_default,
5316
                   last_chunk_handling_options last_chunk_options =
5317
                       last_chunk_handling_options::loose) const noexcept = 0;
5318
5319
  /**
5320
   * Convert a base64 input to a binary output while returning more details
5321
   * than base64_to_binary.
5322
   *
5323
   * This function follows the WHATWG forgiving-base64 format, which means that
5324
   * it will ignore any ASCII spaces in the input. You may provide a padded
5325
   * input (with one or two equal signs at the end) or an unpadded input
5326
   * (without any equal signs at the end).
5327
   *
5328
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5329
   *
5330
   * This function will fail in case of invalid input. When last_chunk_options =
5331
   * loose, there are two possible reasons for failure: the input contains a
5332
   * number of base64 characters that when divided by 4, leaves a single
5333
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5334
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5335
   *
5336
   * You should call this function with a buffer that is at least
5337
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
5338
   * provide that much space, the function may cause a buffer overflow.
5339
   *
5340
   * @param input         the base64 string to process
5341
   * @param length        the length of the string in bytes
5342
   * @param output        the pointer to a buffer that can hold the conversion
5343
   * result (should be at least maximal_binary_length_from_base64(input, length)
5344
   * bytes long).
5345
   * @param options       the base64 options to use, can be base64_default or
5346
   * base64_url, is base64_default by default.
5347
   * @return a full_result struct (of type simdutf::full_result containing the
5348
   * three fields error, input_count and output_count).
5349
   */
5350
  simdutf_warn_unused virtual full_result base64_to_binary_details(
5351
      const char *input, size_t length, char *output,
5352
      base64_options options = base64_default,
5353
      last_chunk_handling_options last_chunk_options =
5354
          last_chunk_handling_options::loose) const noexcept = 0;
5355
  /**
5356
   * Convert a base64 input to a binary output.
5357
   *
5358
   * This function follows the WHATWG forgiving-base64 format, which means that
5359
   * it will ignore any ASCII spaces in the input. You may provide a padded
5360
   * input (with one or two equal signs at the end) or an unpadded input
5361
   * (without any equal signs at the end).
5362
   *
5363
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5364
   *
5365
   * This function will fail in case of invalid input. When last_chunk_options =
5366
   * loose, there are two possible reasons for failure: the input contains a
5367
   * number of base64 characters that when divided by 4, leaves a single
5368
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5369
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5370
   *
5371
   * You should call this function with a buffer that is at least
5372
   * maximal_binary_length_from_base64(input, length) bytes long. If you
5373
   * fail to provide that much space, the function may cause a buffer overflow.
5374
   *
5375
   * @param input         the base64 string to process, in ASCII stored as
5376
   * 16-bit units
5377
   * @param length        the length of the string in 16-bit units
5378
   * @param output        the pointer to a buffer that can hold the conversion
5379
   * result (should be at least maximal_binary_length_from_base64(input, length)
5380
   * bytes long).
5381
   * @param options       the base64 options to use, can be base64_default or
5382
   * base64_url, is base64_default by default.
5383
   * @return a result pair struct (of type simdutf::result containing the two
5384
   * fields error and count) with an error code and position of the
5385
   * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the
5386
   * number of bytes written if successful.
5387
   */
5388
  simdutf_warn_unused virtual result
5389
  base64_to_binary(const char16_t *input, size_t length, char *output,
5390
                   base64_options options = base64_default,
5391
                   last_chunk_handling_options last_chunk_options =
5392
                       last_chunk_handling_options::loose) const noexcept = 0;
5393
5394
  /**
5395
   * Convert a base64 input to a binary output while returning more details
5396
   * than base64_to_binary.
5397
   *
5398
   * This function follows the WHATWG forgiving-base64 format, which means that
5399
   * it will ignore any ASCII spaces in the input. You may provide a padded
5400
   * input (with one or two equal signs at the end) or an unpadded input
5401
   * (without any equal signs at the end).
5402
   *
5403
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
5404
   *
5405
   * This function will fail in case of invalid input. When last_chunk_options =
5406
   * loose, there are two possible reasons for failure: the input contains a
5407
   * number of base64 characters that when divided by 4, leaves a single
5408
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
5409
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
5410
   *
5411
   * You should call this function with a buffer that is at least
5412
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
5413
   * provide that much space, the function may cause a buffer overflow.
5414
   *
5415
   * @param input         the base64 string to process
5416
   * @param length        the length of the string in 16-bit units
5417
   * @param output        the pointer to a buffer that can hold the conversion
5418
   * result (should be at least maximal_binary_length_from_base64(input, length)
5419
   * bytes long).
5420
   * @param options       the base64 options to use, can be base64_default or
5421
   * base64_url, is base64_default by default.
5422
   * @return a full_result struct (of type simdutf::full_result containing the
5423
   * three fields error, input_count and output_count).
5424
   */
5425
  simdutf_warn_unused virtual full_result base64_to_binary_details(
5426
      const char16_t *input, size_t length, char *output,
5427
      base64_options options = base64_default,
5428
      last_chunk_handling_options last_chunk_options =
5429
          last_chunk_handling_options::loose) const noexcept = 0;
5430
5431
  /**
5432
   * Provide the base64 length in bytes given the length of a binary input.
5433
   *
5434
   * @param length        the length of the input in bytes
5435
   * @param options       the base64 options to use, can be base64_default or
5436
   * base64_url, is base64_default by default.
5437
   * @return number of base64 bytes
5438
   */
5439
  simdutf_warn_unused size_t base64_length_from_binary(
5440
      size_t length, base64_options options = base64_default) const noexcept;
5441
5442
  /**
5443
   * Convert a binary input to a base64 output.
5444
   *
5445
   * The default option (simdutf::base64_default) uses the characters `+` and
5446
   * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of
5447
   * the output to ensure that the output length is a multiple of four.
5448
   *
5449
   * The URL option (simdutf::base64_url) uses the characters `-` and `_` as
5450
   * part of its alphabet. No padding is added at the end of the output.
5451
   *
5452
   * This function always succeeds.
5453
   *
5454
   * @param input         the binary to process
5455
   * @param length        the length of the input in bytes
5456
   * @param output        the pointer to a buffer that can hold the conversion
5457
   * result (should be at least base64_length_from_binary(length) bytes long)
5458
   * @param options       the base64 options to use, can be base64_default or
5459
   * base64_url, is base64_default by default.
5460
   * @return number of written bytes, will be equal to
5461
   * base64_length_from_binary(length, options)
5462
   */
5463
  virtual size_t
5464
  binary_to_base64(const char *input, size_t length, char *output,
5465
                   base64_options options = base64_default) const noexcept = 0;
5466
5467
  /**
5468
   * Convert a binary input to a base64 output with lines of given length.
5469
   * Lines are separated by a single linefeed character.
5470
   *
5471
   * The default option (simdutf::base64_default) uses the characters `+` and
5472
   * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of
5473
   * the output to ensure that the output length is a multiple of four.
5474
   *
5475
   * The URL option (simdutf::base64_url) uses the characters `-` and `_` as
5476
   * part of its alphabet. No padding is added at the end of the output.
5477
   *
5478
   * This function always succeeds.
5479
   *
5480
   * @param input         the binary to process
5481
   * @param length        the length of the input in bytes
5482
   * @param output        the pointer to a buffer that can hold the conversion
5483
   * result (should be at least base64_length_from_binary_with_lines(length,
5484
   * options, line_length) bytes long)
5485
   * @param line_length   the length of each line, values smaller than 4 are
5486
   * interpreted as 4
5487
   * @param options       the base64 options to use, can be base64_default or
5488
   * base64_url, is base64_default by default.
5489
   * @return number of written bytes, will be equal to
5490
   * base64_length_from_binary_with_lines(length, options, line_length)
5491
   */
5492
  virtual size_t binary_to_base64_with_lines(
5493
      const char *input, size_t length, char *output,
5494
      size_t line_length = simdutf::default_line_length,
5495
      base64_options options = base64_default) const noexcept = 0;
5496
  /**
5497
   * Find the first occurrence of a character in a string. If the character is
5498
   * not found, return a pointer to the end of the string.
5499
   * @param start        the start of the string
5500
   * @param end          the end of the string
5501
   * @param character    the character to find
5502
   * @return a pointer to the first occurrence of the character in the string,
5503
   * or a pointer to the end of the string if the character is not found.
5504
   *
5505
   */
5506
  virtual const char *find(const char *start, const char *end,
5507
                           char character) const noexcept = 0;
5508
  virtual const char16_t *find(const char16_t *start, const char16_t *end,
5509
                               char16_t character) const noexcept = 0;
5510
#endif // SIMDUTF_FEATURE_BASE64
5511
5512
#ifdef SIMDUTF_INTERNAL_TESTS
5513
  // This method is exported only in developer mode, its purpose
5514
  // is to expose some internal test procedures from the given
5515
  // implementation and then use them through our standard test
5516
  // framework.
5517
  //
5518
  // Regular users should not use it, the tests of the public
5519
  // API are enough.
5520
5521
  // Describes one internal test: a display name plus the routine to run.
  struct TestProcedure {
5522
    // display name
5523
    std::string name;
5524
5525
    // test routine, invoked with an implementation; note that the pointer
    // type returns void, so a pass/fail outcome is not conveyed through a
    // return value
5526
    void (*procedure)(const implementation &);
5527
  };
5528
5529
  // Returns the list of internal test procedures exposed by this
  // implementation. Only declared when SIMDUTF_INTERNAL_TESTS is defined.
  virtual std::vector<TestProcedure> internal_tests() const;
5530
#endif
5531
5532
protected:
5533
  /** @private Construct an implementation with the given name and description.
5534
   * For subclasses. */
5535
  simdutf_really_inline implementation(const char *name,
5536
                                       const char *description,
5537
                                       uint32_t required_instruction_sets)
5538
      : _name(name), _description(description),
5539
        _required_instruction_sets(required_instruction_sets) {}
5540
5541
protected:
5542
  // The destructor sits in the protected section and is not declared virtual
  // here, so code outside the class hierarchy cannot destroy an object
  // through an `implementation` pointer.
  ~implementation() = default;
5543
5544
private:
5545
  /**
5546
   * The name of this implementation.
5547
   */
5548
  const char *_name;
5549
5550
  /**
5551
   * The description of this implementation.
5552
   */
5553
  const char *_description;
5554
5555
  /**
5556
   * Instruction sets required for this implementation.
5557
   */
5558
  const uint32_t _required_instruction_sets;
5559
};
5560
5561
/** @private */
5562
namespace internal {
5563
5564
/**
5565
 * The list of available implementations compiled into simdutf.
5566
 */
5567
class available_implementation_list {
5568
public:
5569
  /** Get the list of available implementations compiled into simdutf */
5570
  simdutf_really_inline available_implementation_list() {}
5571
  /** Number of implementations */
5572
  size_t size() const noexcept;
5573
  /** STL const begin() iterator */
5574
  const implementation *const *begin() const noexcept;
5575
  /** STL const end() iterator */
5576
  const implementation *const *end() const noexcept;
5577
5578
  /**
5579
   * Get the implementation with the given name.
5580
   *
5581
   * Case sensitive.
5582
   *
5583
   *     const implementation *impl =
5584
   * simdutf::available_implementations["westmere"]; if (!impl) { exit(1); } if
5585
   * (!imp->supported_by_runtime_system()) { exit(1); }
5586
   *     simdutf::active_implementation = impl;
5587
   *
5588
   * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
5589
   * @return the implementation, or nullptr if the parse failed.
5590
   */
5591
  const implementation *operator[](const std::string &name) const noexcept {
5592
    for (const implementation *impl : *this) {
5593
      if (impl->name() == name) {
5594
        return impl;
5595
      }
5596
    }
5597
    return nullptr;
5598
  }
5599
5600
  /**
5601
   * Detect the most advanced implementation supported by the current host.
5602
   *
5603
   * This is used to initialize the implementation on startup.
5604
   *
5605
   *     const implementation *impl =
5606
   * simdutf::available_implementation::detect_best_supported();
5607
   *     simdutf::active_implementation = impl;
5608
   *
5609
   * @return the most advanced supported implementation for the current host, or
5610
   * an implementation that returns UNSUPPORTED_ARCHITECTURE if there is no
5611
   * supported implementation. Will never return nullptr.
5612
   */
5613
  const implementation *detect_best_supported() const noexcept;
5614
};
5615
5616
/**
 * Small holder for a `T *`.
 *
 * When SIMDUTF_NO_THREADS is defined the pointer is stored as a plain `T *`;
 * otherwise it is stored in a `std::atomic<T *>` and every read goes through
 * an atomic load, every assignment through an atomic store.
 */
template <typename T> class atomic_ptr {
#if defined(SIMDUTF_NO_THREADS)
  using storage_type = T *;
#else
  using storage_type = std::atomic<T *>;
#endif

public:
  /** Wrap a raw pointer. Not explicit, so a `T *` converts implicitly. */
  atomic_ptr(T *initial) : ptr{initial} {}

  // Read-only access through a const holder.
  operator const T *() const { return current(); }
  const T &operator*() const { return *current(); }
  const T *operator->() const { return current(); }

  // Access through a non-const holder.
  operator T *() { return current(); }
  T &operator*() { return *current(); }
  T *operator->() { return current(); }

  /** Replace the stored pointer and return this holder. */
  atomic_ptr &operator=(T *replacement) {
    ptr = replacement;
    return *this;
  }

private:
  /** Read the stored pointer (an atomic load unless SIMDUTF_NO_THREADS). */
  T *current() const {
#if defined(SIMDUTF_NO_THREADS)
    return ptr;
#else
    return ptr.load();
#endif
  }

  storage_type ptr;
};
5655
5656
class detect_best_supported_implementation_on_first_use;
5657
5658
} // namespace internal
5659
5660
/**
5661
 * The list of available implementations compiled into simdutf.
5662
 */
5663
extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list &
5664
get_available_implementations();
5665
5666
/**
5667
 * The active implementation.
5668
 *
5669
 * Automatically initialized on first use to the most advanced implementation
5670
 * supported by this hardware.
5671
 */
5672
extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> &
5673
get_active_implementation();
5674
5675
} // namespace simdutf
5676
5677
#endif // SIMDUTF_IMPLEMENTATION_H