Coverage Report

Created: 2025-10-31 09:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/node/deps/v8/third_party/simdutf/simdutf.h
Line
Count
Source
1
/* auto-generated on 2025-07-13 10:46:57 -0400. Do not edit! */
2
/* begin file include/simdutf.h */
3
#ifndef SIMDUTF_H
4
#define SIMDUTF_H
5
#include <cstring>
6
7
/* begin file include/simdutf/compiler_check.h */
8
#ifndef SIMDUTF_COMPILER_CHECK_H
9
#define SIMDUTF_COMPILER_CHECK_H
10
11
#ifndef __cplusplus
12
  #error simdutf requires a C++ compiler
13
#endif
14
15
#ifndef SIMDUTF_CPLUSPLUS
16
  #if defined(_MSVC_LANG) && !defined(__clang__)
17
    #define SIMDUTF_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG)
18
  #else
19
    #define SIMDUTF_CPLUSPLUS __cplusplus
20
  #endif
21
#endif
22
23
// C++ 23
24
#if !defined(SIMDUTF_CPLUSPLUS23) && (SIMDUTF_CPLUSPLUS >= 202302L)
25
  #define SIMDUTF_CPLUSPLUS23 1
26
#endif
27
28
// C++ 20
29
#if !defined(SIMDUTF_CPLUSPLUS20) && (SIMDUTF_CPLUSPLUS >= 202002L)
30
  #define SIMDUTF_CPLUSPLUS20 1
31
#endif
32
33
// C++ 17
34
#if !defined(SIMDUTF_CPLUSPLUS17) && (SIMDUTF_CPLUSPLUS >= 201703L)
35
  #define SIMDUTF_CPLUSPLUS17 1
36
#endif
37
38
// C++ 14
39
#if !defined(SIMDUTF_CPLUSPLUS14) && (SIMDUTF_CPLUSPLUS >= 201402L)
40
  #define SIMDUTF_CPLUSPLUS14 1
41
#endif
42
43
// C++ 11
44
#if !defined(SIMDUTF_CPLUSPLUS11) && (SIMDUTF_CPLUSPLUS >= 201103L)
45
  #define SIMDUTF_CPLUSPLUS11 1
46
#endif
47
48
#ifndef SIMDUTF_CPLUSPLUS11
49
  #error simdutf requires a compiler compliant with the C++11 standard
50
#endif
51
52
#endif // SIMDUTF_COMPILER_CHECK_H
53
/* end file include/simdutf/compiler_check.h */
54
/* begin file include/simdutf/common_defs.h */
55
#ifndef SIMDUTF_COMMON_DEFS_H
56
#define SIMDUTF_COMMON_DEFS_H
57
58
/* begin file include/simdutf/portability.h */
59
#ifndef SIMDUTF_PORTABILITY_H
60
#define SIMDUTF_PORTABILITY_H
61
62
63
#include <cfloat>
64
#include <cstddef>
65
#include <cstdint>
66
#include <cstdlib>
67
#ifndef _WIN32
68
  // strcasecmp, strncasecmp
69
  #include <strings.h>
70
#endif
71
72
#if defined(__apple_build_version__)
73
  #if __apple_build_version__ < 14000000
74
    #define SIMDUTF_SPAN_DISABLED                                              \
75
      1 // apple-clang/13 doesn't support std::convertible_to
76
  #endif
77
#endif
78
79
#if SIMDUTF_CPLUSPLUS20
80
  #include <version>
81
  #if __cpp_concepts >= 201907L && __cpp_lib_span >= 202002L &&                \
82
      !defined(SIMDUTF_SPAN_DISABLED)
83
    #define SIMDUTF_SPAN 1
84
  #endif // __cpp_concepts >= 201907L && __cpp_lib_span >= 202002L
85
  #if __cpp_lib_atomic_ref >= 201806L
86
    #define SIMDUTF_ATOMIC_REF 1
87
  #endif // __cpp_lib_atomic_ref
88
  #if __has_cpp_attribute(maybe_unused) >= 201603L
89
    #define SIMDUTF_MAYBE_UNUSED_AVAILABLE 1
90
  #endif // __has_cpp_attribute(maybe_unused) >= 201603L
91
#endif
92
93
/**
94
 * We want to check that it is actually a little endian system at
95
 * compile-time.
96
 */
97
98
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
99
  #define SIMDUTF_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
100
#elif defined(_WIN32)
101
  #define SIMDUTF_IS_BIG_ENDIAN 0
102
#else
103
  #if defined(__APPLE__) ||                                                    \
104
      defined(__FreeBSD__) // defined __BYTE_ORDER__ && defined
105
                           // __ORDER_BIG_ENDIAN__
106
    #include <machine/endian.h>
107
  #elif defined(sun) ||                                                        \
108
      defined(__sun) // defined(__APPLE__) || defined(__FreeBSD__)
109
    #include <sys/byteorder.h>
110
  #else // defined(__APPLE__) || defined(__FreeBSD__)
111
112
    #ifdef __has_include
113
      #if __has_include(<endian.h>)
114
        #include <endian.h>
115
      #endif //__has_include(<endian.h>)
116
    #endif   //__has_include
117
118
  #endif // defined(__APPLE__) || defined(__FreeBSD__)
119
120
  #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__)
121
    #define SIMDUTF_IS_BIG_ENDIAN 0
122
  #endif // byte-order macros unavailable: fall back to little endian
123
124
  #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
125
    #define SIMDUTF_IS_BIG_ENDIAN 0
126
  #else // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
127
    #define SIMDUTF_IS_BIG_ENDIAN 1
128
  #endif // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
129
130
#endif // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__
131
132
/**
133
 * At this point in time, SIMDUTF_IS_BIG_ENDIAN is defined.
134
 */
135
136
#ifdef _MSC_VER
137
  #define SIMDUTF_VISUAL_STUDIO 1
138
  /**
139
   * We want to differentiate carefully between
140
   * clang under visual studio and regular visual
141
   * studio.
142
   *
143
   * Under clang for Windows, we enable:
144
   *  * target pragmas so that part and only part of the
145
   *     code gets compiled for advanced instructions.
146
   *
147
   */
148
  #ifdef __clang__
149
    // clang under visual studio
150
    #define SIMDUTF_CLANG_VISUAL_STUDIO 1
151
  #else
152
    // just regular visual studio (best guess)
153
    #define SIMDUTF_REGULAR_VISUAL_STUDIO 1
154
  #endif // __clang__
155
#endif   // _MSC_VER
156
157
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
158
  // https://en.wikipedia.org/wiki/C_alternative_tokens
159
  // This header should have no effect, except maybe
160
  // under Visual Studio.
161
  #include <iso646.h>
162
#endif
163
164
#if (defined(__x86_64__) || defined(_M_AMD64)) && !defined(_M_ARM64EC)
165
  #define SIMDUTF_IS_X86_64 1
166
#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
167
  #define SIMDUTF_IS_ARM64 1
168
#elif defined(__PPC64__) || defined(_M_PPC64)
169
  #if defined(__VEC__) && defined(__ALTIVEC__)
170
    #define SIMDUTF_IS_PPC64 1
171
  #endif
172
#elif defined(__s390__)
173
// s390 IBM system. Big endian.
174
#elif (defined(__riscv) || defined(__riscv__)) && __riscv_xlen == 64
175
  // RISC-V 64-bit
176
  #define SIMDUTF_IS_RISCV64 1
177
178
  // #if __riscv_v_intrinsic >= 1000000
179
  //   #define SIMDUTF_HAS_RVV_INTRINSICS 1
180
  //   #define SIMDUTF_HAS_RVV_TARGET_REGION 1
181
  // #elif ...
182
  //  Check for special compiler versions that implement pre v1.0 intrinsics
183
  #if __riscv_v_intrinsic >= 11000
184
    #define SIMDUTF_HAS_RVV_INTRINSICS 1
185
  #endif
186
187
  #define SIMDUTF_HAS_ZVBB_INTRINSICS                                          \
188
    0 // there is currently no way to detect this
189
190
  #if SIMDUTF_HAS_RVV_INTRINSICS && __riscv_vector &&                          \
191
      __riscv_v_min_vlen >= 128 && __riscv_v_elen >= 64
192
    // RISC-V V extension
193
    #define SIMDUTF_IS_RVV 1
194
    #if SIMDUTF_HAS_ZVBB_INTRINSICS && __riscv_zvbb >= 1000000
195
      // RISC-V Vector Basic Bit-manipulation
196
      #define SIMDUTF_IS_ZVBB 1
197
    #endif
198
  #endif
199
200
#elif defined(__loongarch_lp64)
201
  #if defined(__loongarch_sx) && defined(__loongarch_asx)
202
    #define SIMDUTF_IS_LSX 1
203
    #define SIMDUTF_IS_LASX 1
204
  #elif defined(__loongarch_sx)
205
    #define SIMDUTF_IS_LSX 1
206
  #endif
207
#else
208
  // The simdutf library is designed
209
  // for 64-bit processors and it seems that you are not
210
  // compiling for a known 64-bit platform. Please
211
  // use a 64-bit target such as x64 or 64-bit ARM for best performance.
212
  #define SIMDUTF_IS_32BITS 1
213
214
  // We do not support 32-bit platforms, but it can be
215
  // handy to identify them.
216
  #if defined(_M_IX86) || defined(__i386__)
217
    #define SIMDUTF_IS_X86_32BITS 1
218
  #elif defined(__arm__) || defined(_M_ARM)
219
    #define SIMDUTF_IS_ARM_32BITS 1
220
  #elif defined(__PPC__) || defined(_M_PPC)
221
    #define SIMDUTF_IS_PPC_32BITS 1
222
  #endif
223
224
#endif // defined(__x86_64__) || defined(_M_AMD64)
225
226
#ifdef SIMDUTF_IS_32BITS
227
  #ifndef SIMDUTF_NO_PORTABILITY_WARNING
228
  // In the future, we may want to warn users of 32-bit systems that
229
  // the simdutf does not support accelerated kernels for such systems.
230
  #endif // SIMDUTF_NO_PORTABILITY_WARNING
231
#endif   // SIMDUTF_IS_32BITS
232
233
// this is almost standard?
234
#define SIMDUTF_STRINGIFY_IMPLEMENTATION_(a) #a
235
#define SIMDUTF_STRINGIFY(a) SIMDUTF_STRINGIFY_IMPLEMENTATION_(a)
236
237
// Our fast kernels require 64-bit systems.
238
//
239
// On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions.
240
// Furthermore, the number of SIMD registers is reduced.
241
//
242
// On 32-bit ARM, we would have smaller registers.
243
//
244
// The simdutf users should still have the fallback kernel. It is
245
// slower, but it should run everywhere.
246
247
//
248
// Enable valid runtime implementations, and select
249
// SIMDUTF_BUILTIN_IMPLEMENTATION
250
//
251
252
// We are going to use runtime dispatch.
253
#ifdef SIMDUTF_IS_X86_64
254
  #ifdef __clang__
255
    // clang does not have GCC push pop
256
    // warning: clang attribute push can't be used within a namespace in clang
257
    // up until 8.0, so SIMDUTF_TARGET_REGION and SIMDUTF_UNTARGET_REGION must be
258
    // *outside* of a namespace.
259
    #define SIMDUTF_TARGET_REGION(T)                                           \
260
      _Pragma(SIMDUTF_STRINGIFY(clang attribute push(                          \
261
          __attribute__((target(T))), apply_to = function)))
262
    #define SIMDUTF_UNTARGET_REGION _Pragma("clang attribute pop")
263
  #elif defined(__GNUC__)
264
    // GCC is easier
265
    #define SIMDUTF_TARGET_REGION(T)                                           \
266
      _Pragma("GCC push_options") _Pragma(SIMDUTF_STRINGIFY(GCC target(T)))
267
    #define SIMDUTF_UNTARGET_REGION _Pragma("GCC pop_options")
268
  #endif // clang then gcc
269
270
#endif // x86
271
272
// Default target region macros don't do anything.
273
#ifndef SIMDUTF_TARGET_REGION
274
  #define SIMDUTF_TARGET_REGION(T)
275
  #define SIMDUTF_UNTARGET_REGION
276
#endif
277
278
// Is threading enabled?
279
#if defined(_REENTRANT) || defined(_MT)
280
  #ifndef SIMDUTF_THREADS_ENABLED
281
    #define SIMDUTF_THREADS_ENABLED
282
  #endif
283
#endif
284
285
// workaround for large stack sizes under -O0.
286
// https://github.com/simdutf/simdutf/issues/691
287
#ifdef __APPLE__
288
  #ifndef __OPTIMIZE__
289
    // Apple systems have small stack sizes in secondary threads.
290
    // Lack of compiler optimization may generate high stack usage.
291
    // Users may want to disable threads for safety, but only when
292
    // in debug mode which we detect by the fact that the __OPTIMIZE__
293
    // macro is not defined.
294
    #undef SIMDUTF_THREADS_ENABLED
295
  #endif
296
#endif
297
298
#ifdef SIMDUTF_VISUAL_STUDIO
299
  // This is one case where we do not distinguish between
300
  // regular visual studio and clang under visual studio.
301
  // clang under Windows has _stricmp (like visual studio) but not strcasecmp
302
  // (as clang normally has)
303
  #define simdutf_strcasecmp _stricmp
304
  #define simdutf_strncasecmp _strnicmp
305
#else
306
  // The strcasecmp, strncasecmp, and strcasestr functions do not work with
307
  // multibyte strings (e.g. UTF-8). So they are only useful for ASCII in our
308
  // context.
309
  // https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings
310
  #define simdutf_strcasecmp strcasecmp
311
  #define simdutf_strncasecmp strncasecmp
312
#endif
313
314
#if defined(__GNUC__) && !defined(__clang__)
315
  #if __GNUC__ >= 11
316
    #define SIMDUTF_GCC11ORMORE 1
317
  #endif //  __GNUC__ >= 11
318
#endif   // defined(__GNUC__) && !defined(__clang__)
319
320
#endif // SIMDUTF_PORTABILITY_H
321
/* end file include/simdutf/portability.h */
322
/* begin file include/simdutf/avx512.h */
323
#ifndef SIMDUTF_AVX512_H_
324
#define SIMDUTF_AVX512_H_
325
326
/*
327
    It's possible to override AVX512 settings with cmake DCMAKE_CXX_FLAGS.
328
329
    All preprocessor directives have the form `SIMDUTF_HAS_AVX512{feature}`,
330
    where a feature is a code name for extensions.
331
332
    Please see the listing below to find which are supported.
333
*/
334
335
#ifndef SIMDUTF_HAS_AVX512F
336
  #if defined(__AVX512F__) && __AVX512F__ == 1
337
    #define SIMDUTF_HAS_AVX512F 1
338
  #endif
339
#endif
340
341
#ifndef SIMDUTF_HAS_AVX512DQ
342
  #if defined(__AVX512DQ__) && __AVX512DQ__ == 1
343
    #define SIMDUTF_HAS_AVX512DQ 1
344
  #endif
345
#endif
346
347
#ifndef SIMDUTF_HAS_AVX512IFMA
348
  #if defined(__AVX512IFMA__) && __AVX512IFMA__ == 1
349
    #define SIMDUTF_HAS_AVX512IFMA 1
350
  #endif
351
#endif
352
353
#ifndef SIMDUTF_HAS_AVX512CD
354
  #if defined(__AVX512CD__) && __AVX512CD__ == 1
355
    #define SIMDUTF_HAS_AVX512CD 1
356
  #endif
357
#endif
358
359
#ifndef SIMDUTF_HAS_AVX512BW
360
  #if defined(__AVX512BW__) && __AVX512BW__ == 1
361
    #define SIMDUTF_HAS_AVX512BW 1
362
  #endif
363
#endif
364
365
#ifndef SIMDUTF_HAS_AVX512VL
366
  #if defined(__AVX512VL__) && __AVX512VL__ == 1
367
    #define SIMDUTF_HAS_AVX512VL 1
368
  #endif
369
#endif
370
371
#ifndef SIMDUTF_HAS_AVX512VBMI
372
  #if defined(__AVX512VBMI__) && __AVX512VBMI__ == 1
373
    #define SIMDUTF_HAS_AVX512VBMI 1
374
  #endif
375
#endif
376
377
#ifndef SIMDUTF_HAS_AVX512VBMI2
378
  #if defined(__AVX512VBMI2__) && __AVX512VBMI2__ == 1
379
    #define SIMDUTF_HAS_AVX512VBMI2 1
380
  #endif
381
#endif
382
383
#ifndef SIMDUTF_HAS_AVX512VNNI
384
  #if defined(__AVX512VNNI__) && __AVX512VNNI__ == 1
385
    #define SIMDUTF_HAS_AVX512VNNI 1
386
  #endif
387
#endif
388
389
#ifndef SIMDUTF_HAS_AVX512BITALG
390
  #if defined(__AVX512BITALG__) && __AVX512BITALG__ == 1
391
    #define SIMDUTF_HAS_AVX512BITALG 1
392
  #endif
393
#endif
394
395
#ifndef SIMDUTF_HAS_AVX512VPOPCNTDQ
396
  #if defined(__AVX512VPOPCNTDQ__) && __AVX512VPOPCNTDQ__ == 1
397
    #define SIMDUTF_HAS_AVX512VPOPCNTDQ 1
398
  #endif
399
#endif
400
401
#endif // SIMDUTF_AVX512_H_
402
/* end file include/simdutf/avx512.h */
403
404
// Sometimes logging is useful, but we want it disabled by default
405
// and free of any logging code in release builds.
406
#ifdef SIMDUTF_LOGGING
407
  #include <iostream>
408
  #define simdutf_log(msg)                                                     \
409
    std::cout << "[" << __FUNCTION__ << "]: " << msg << std::endl              \
410
              << "\t" << __FILE__ << ":" << __LINE__ << std::endl;
411
  #define simdutf_log_assert(cond, msg)                                        \
412
    do {                                                                       \
413
      if (!(cond)) {                                                           \
414
        std::cerr << "[" << __FUNCTION__ << "]: " << msg << std::endl          \
415
                  << "\t" << __FILE__ << ":" << __LINE__ << std::endl;         \
416
        std::abort();                                                          \
417
      }                                                                        \
418
    } while (0)
419
#else
420
  #define simdutf_log(msg)
421
  #define simdutf_log_assert(cond, msg)
422
#endif
423
424
#if defined(SIMDUTF_REGULAR_VISUAL_STUDIO)
425
  #define SIMDUTF_DEPRECATED __declspec(deprecated)
426
427
  #define simdutf_really_inline __forceinline // really inline in release mode
428
  #define simdutf_always_inline __forceinline // always inline, no matter what
429
  #define simdutf_never_inline __declspec(noinline)
430
431
  #define simdutf_unused
432
  #define simdutf_warn_unused
433
434
  #ifndef simdutf_likely
435
    #define simdutf_likely(x) x
436
  #endif
437
  #ifndef simdutf_unlikely
438
    #define simdutf_unlikely(x) x
439
  #endif
440
441
  #define SIMDUTF_PUSH_DISABLE_WARNINGS __pragma(warning(push))
442
  #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS __pragma(warning(push, 0))
443
  #define SIMDUTF_DISABLE_VS_WARNING(WARNING_NUMBER)                           \
444
    __pragma(warning(disable : WARNING_NUMBER))
445
  // Get rid of Intellisense-only warnings (Code Analysis)
446
  // Though __has_include is C++17, it is supported in Visual Studio 2017 or
447
  // better (_MSC_VER>=1910).
448
  #ifdef __has_include
449
    #if __has_include(<CppCoreCheck\Warnings.h>)
450
      #include <CppCoreCheck\Warnings.h>
451
      #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS                               \
452
        SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS)
453
    #endif
454
  #endif
455
456
  #ifndef SIMDUTF_DISABLE_UNDESIRED_WARNINGS
457
    #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
458
  #endif
459
460
  #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_VS_WARNING(4996)
461
  #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING
462
  #define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning(pop))
463
  #define SIMDUTF_DISABLE_UNUSED_WARNING
464
#else // SIMDUTF_REGULAR_VISUAL_STUDIO
465
  #if defined(__OPTIMIZE__) || defined(NDEBUG)
466
    #define simdutf_really_inline inline __attribute__((always_inline))
467
  #else
468
    #define simdutf_really_inline inline
469
  #endif
470
  #define simdutf_always_inline                                                \
471
    inline __attribute__((always_inline)) // always inline, no matter what
472
  #define SIMDUTF_DEPRECATED __attribute__((deprecated))
473
  #define simdutf_never_inline inline __attribute__((noinline))
474
475
  #define simdutf_unused __attribute__((unused))
476
  #define simdutf_warn_unused __attribute__((warn_unused_result))
477
478
  #ifndef simdutf_likely
479
    #define simdutf_likely(x) __builtin_expect(!!(x), 1)
480
  #endif
481
  #ifndef simdutf_unlikely
482
    #define simdutf_unlikely(x) __builtin_expect(!!(x), 0)
483
  #endif
484
  // clang-format off
485
  #define SIMDUTF_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push")
486
  // gcc doesn't seem to disable all warnings with all and extra, add warnings
487
  // here as necessary
488
  #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS                                    \
489
    SIMDUTF_PUSH_DISABLE_WARNINGS                                              \
490
    SIMDUTF_DISABLE_GCC_WARNING(-Weffc++)                                      \
491
    SIMDUTF_DISABLE_GCC_WARNING(-Wall)                                         \
492
    SIMDUTF_DISABLE_GCC_WARNING(-Wconversion)                                  \
493
    SIMDUTF_DISABLE_GCC_WARNING(-Wextra)                                       \
494
    SIMDUTF_DISABLE_GCC_WARNING(-Wattributes)                                  \
495
    SIMDUTF_DISABLE_GCC_WARNING(-Wimplicit-fallthrough)                        \
496
    SIMDUTF_DISABLE_GCC_WARNING(-Wnon-virtual-dtor)                            \
497
    SIMDUTF_DISABLE_GCC_WARNING(-Wreturn-type)                                 \
498
    SIMDUTF_DISABLE_GCC_WARNING(-Wshadow)                                      \
499
    SIMDUTF_DISABLE_GCC_WARNING(-Wunused-parameter)                            \
500
    SIMDUTF_DISABLE_GCC_WARNING(-Wunused-variable)
501
  #define SIMDUTF_PRAGMA(P) _Pragma(#P)
502
  #define SIMDUTF_DISABLE_GCC_WARNING(WARNING)                                 \
503
    SIMDUTF_PRAGMA(GCC diagnostic ignored #WARNING)
504
  #if defined(SIMDUTF_CLANG_VISUAL_STUDIO)
505
    #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS                                 \
506
      SIMDUTF_DISABLE_GCC_WARNING(-Wmicrosoft-include)
507
  #else
508
    #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
509
  #endif
510
  #define SIMDUTF_DISABLE_DEPRECATED_WARNING                                   \
511
    SIMDUTF_DISABLE_GCC_WARNING(-Wdeprecated-declarations)
512
  #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING                              \
513
    SIMDUTF_DISABLE_GCC_WARNING(-Wstrict-overflow)
514
  #define SIMDUTF_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop")
515
  #define SIMDUTF_DISABLE_UNUSED_WARNING                                       \
516
    SIMDUTF_PUSH_DISABLE_WARNINGS                                              \
517
    SIMDUTF_DISABLE_GCC_WARNING(-Wunused-function)                             \
518
    SIMDUTF_DISABLE_GCC_WARNING(-Wunused-const-variable)
519
  // clang-format on
520
521
#endif // MSC_VER
522
523
#ifndef SIMDUTF_DLLIMPORTEXPORT
524
  #if defined(SIMDUTF_VISUAL_STUDIO)
525
    /**
526
     * It does not matter here whether you are using
527
     * the regular visual studio or clang under visual
528
     * studio.
529
     */
530
    #if SIMDUTF_USING_LIBRARY
531
      #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport)
532
    #else
533
      #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport)
534
    #endif
535
  #else
536
    #define SIMDUTF_DLLIMPORTEXPORT
537
  #endif
538
#endif
539
540
#if SIMDUTF_MAYBE_UNUSED_AVAILABLE
541
  #define simdutf_maybe_unused [[maybe_unused]]
542
#else
543
  #define simdutf_maybe_unused
544
#endif
545
546
#endif // SIMDUTF_COMMON_DEFS_H
547
/* end file include/simdutf/common_defs.h */
548
/* begin file include/simdutf/encoding_types.h */
549
#ifndef SIMDUTF_ENCODING_TYPES_H
550
#define SIMDUTF_ENCODING_TYPES_H
551
#include <string>
552
553
namespace simdutf {
554
555
enum encoding_type {
556
  UTF8 = 1,      // BOM 0xef 0xbb 0xbf
557
  UTF16_LE = 2,  // BOM 0xff 0xfe
558
  UTF16_BE = 4,  // BOM 0xfe 0xff
559
  UTF32_LE = 8,  // BOM 0xff 0xfe 0x00 0x00
560
  UTF32_BE = 16, // BOM 0x00 0x00 0xfe 0xff
561
  Latin1 = 32,
562
563
  unspecified = 0
564
};
565
566
enum endianness { LITTLE = 0, BIG = 1 };
567
568
bool match_system(endianness e);
569
570
std::string to_string(encoding_type bom);
571
572
// Note that BOM for UTF8 is discouraged.
573
namespace BOM {
574
575
/**
576
 * Checks for a BOM. If none is found, returns unspecified
577
 * @param byte          the string to process
578
 * @param length        the length of the string in code units
579
 * @return the corresponding encoding
580
 */
581
582
encoding_type check_bom(const uint8_t *byte, size_t length);
583
encoding_type check_bom(const char *byte, size_t length);
584
/**
585
 * Returns the size, in bytes, of the BOM for a given encoding type.
586
 * Note that a UTF-8 BOM is discouraged.
587
 * @param bom         the encoding type
588
 * @return the size in bytes of the corresponding BOM
589
 */
590
size_t bom_byte_size(encoding_type bom);
591
592
} // namespace BOM
593
} // namespace simdutf
594
#endif
595
/* end file include/simdutf/encoding_types.h */
596
/* begin file include/simdutf/error.h */
597
#ifndef SIMDUTF_ERROR_H
598
#define SIMDUTF_ERROR_H
599
namespace simdutf {
600
601
enum error_code {
602
  SUCCESS = 0,
603
  HEADER_BITS, // Any byte must have fewer than 5 header bits.
604
  TOO_SHORT,   // The leading byte must be followed by N-1 continuation bytes,
605
               // where N is the UTF-8 character length. This is also the error
606
               // when the input is truncated.
607
  TOO_LONG,    // We either have too many consecutive continuation bytes or the
608
               // string starts with a continuation byte.
609
  OVERLONG, // The decoded character must be above U+7F for two-byte characters,
610
            // U+7FF for three-byte characters, and U+FFFF for four-byte
611
            // characters.
612
  TOO_LARGE, // The decoded character must be less than or equal to
613
             // U+10FFFF, less than or equal to U+7F for ASCII, OR less than or
614
             // equal to U+FF for Latin1.
615
  SURROGATE, // The decoded character must not be in U+D800...DFFF (UTF-8 or
616
             // UTF-32) OR a high surrogate must be followed by a low surrogate
617
             // and a low surrogate must be preceded by a high surrogate
618
             // (UTF-16) OR there must be no surrogate at all (Latin1).
619
  INVALID_BASE64_CHARACTER, // Found a character that cannot be part of a valid
620
                            // base64 string. This may include a misplaced
621
                            // padding character ('=').
622
  BASE64_INPUT_REMAINDER,   // The base64 input terminates with a single
623
                            // character, excluding padding (=). It is also used
624
                            // in strict mode when padding is not adequate.
625
  BASE64_EXTRA_BITS,        // The base64 input terminates with non-zero
626
                            // padding bits.
627
  OUTPUT_BUFFER_TOO_SMALL,  // The provided buffer is too small.
628
  OTHER                     // Not related to validation/transcoding.
629
};
630
#if SIMDUTF_CPLUSPLUS17
631
0
inline std::string_view error_to_string(error_code code) noexcept {
632
0
  switch (code) {
633
0
  case SUCCESS:
634
0
    return "SUCCESS";
635
0
  case HEADER_BITS:
636
0
    return "HEADER_BITS";
637
0
  case TOO_SHORT:
638
0
    return "TOO_SHORT";
639
0
  case TOO_LONG:
640
0
    return "TOO_LONG";
641
0
  case OVERLONG:
642
0
    return "OVERLONG";
643
0
  case TOO_LARGE:
644
0
    return "TOO_LARGE";
645
0
  case SURROGATE:
646
0
    return "SURROGATE";
647
0
  case INVALID_BASE64_CHARACTER:
648
0
    return "INVALID_BASE64_CHARACTER";
649
0
  case BASE64_INPUT_REMAINDER:
650
0
    return "BASE64_INPUT_REMAINDER";
651
0
  case BASE64_EXTRA_BITS:
652
0
    return "BASE64_EXTRA_BITS";
653
0
  case OUTPUT_BUFFER_TOO_SMALL:
654
0
    return "OUTPUT_BUFFER_TOO_SMALL";
655
0
  default:
656
0
    return "OTHER";
657
0
  }
658
0
}
659
#endif
660
661
struct result {
662
  error_code error;
663
  size_t count; // In case of error, indicates the position of the error. In
664
                // case of success, indicates the number of code units
665
                // validated/written.
666
667
  simdutf_really_inline result() noexcept
668
0
      : error{error_code::SUCCESS}, count{0} {}
669
670
  simdutf_really_inline result(error_code err, size_t pos) noexcept
671
0
      : error{err}, count{pos} {}
672
673
0
  simdutf_really_inline bool is_ok() const noexcept {
674
0
    return error == error_code::SUCCESS;
675
0
  }
676
677
0
  simdutf_really_inline bool is_err() const noexcept {
678
0
    return error != error_code::SUCCESS;
679
0
  }
680
};
681
682
struct full_result { // error code plus separate input and output counts
683
  error_code error;
684
  size_t input_count;  // becomes result::count when error != SUCCESS
685
  size_t output_count; // becomes result::count when error == SUCCESS
686
  bool padding_error = false; // true if the error is due to padding, only
687
                              // meaningful when error is not SUCCESS
688
689
  simdutf_really_inline full_result() noexcept
690
0
      : error{error_code::SUCCESS}, input_count{0}, output_count{0} {}
691
692
  simdutf_really_inline full_result(error_code err, size_t pos_in,
693
                                    size_t pos_out) noexcept
694
0
      : error{err}, input_count{pos_in}, output_count{pos_out} {} // padding_error keeps its default (false)
695
  simdutf_really_inline full_result(error_code err, size_t pos_in,
696
                                    size_t pos_out, bool padding_err) noexcept
697
      : error{err}, input_count{pos_in}, output_count{pos_out},
698
0
        padding_error{padding_err} {}
699
700
0
  simdutf_really_inline operator result() const noexcept { // narrows to a result carrying a single count
701
0
    if (error == error_code::SUCCESS) {
702
0
      return result{error, output_count}; // success: keep the output count
703
0
    } else {
704
0
      return result{error, input_count}; // failure: keep the input count
705
0
    }
706
0
  }
707
};
708
709
} // namespace simdutf
710
#endif
711
/* end file include/simdutf/error.h */
712
713
SIMDUTF_PUSH_DISABLE_WARNINGS
714
SIMDUTF_DISABLE_UNDESIRED_WARNINGS
715
716
// Public API
717
/* begin file include/simdutf/simdutf_version.h */
718
// /include/simdutf/simdutf_version.h automatically generated by release.py,
719
// do not change by hand
720
#ifndef SIMDUTF_SIMDUTF_VERSION_H
721
#define SIMDUTF_SIMDUTF_VERSION_H
722
723
/** The version of simdutf being used (major.minor.revision) */
724
72
#define SIMDUTF_VERSION "7.3.3"
725
726
namespace simdutf {
727
enum {
728
  /**
729
   * The major version (MAJOR.minor.revision) of simdutf being used.
730
   */
731
  SIMDUTF_VERSION_MAJOR = 7,
732
  /**
733
   * The minor version (major.MINOR.revision) of simdutf being used.
734
   */
735
  SIMDUTF_VERSION_MINOR = 3,
736
  /**
737
   * The revision (major.minor.REVISION) of simdutf being used.
738
   */
739
  SIMDUTF_VERSION_REVISION = 3
740
};
741
} // namespace simdutf
742
743
#endif // SIMDUTF_SIMDUTF_VERSION_H
744
/* end file include/simdutf/simdutf_version.h */
745
/* begin file include/simdutf/implementation.h */
746
#ifndef SIMDUTF_IMPLEMENTATION_H
747
#define SIMDUTF_IMPLEMENTATION_H
748
#if !defined(SIMDUTF_NO_THREADS)
749
  #include <atomic>
750
#endif
751
#include <string>
752
#ifdef SIMDUTF_INTERNAL_TESTS
753
  #include <vector>
754
#endif
755
/* begin file include/simdutf/internal/isadetection.h */
756
/* From
757
https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
758
Highly modified.
759
760
Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
761
Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
762
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
763
Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
764
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
765
Copyright (c) 2011-2013 NYU                      (Clement Farabet)
766
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
767
Iain Melvin, Jason Weston) Copyright (c) 2006      Idiap Research Institute
768
(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
769
Samy Bengio, Johnny Mariethoz)
770
771
All rights reserved.
772
773
Redistribution and use in source and binary forms, with or without
774
modification, are permitted provided that the following conditions are met:
775
776
1. Redistributions of source code must retain the above copyright
777
   notice, this list of conditions and the following disclaimer.
778
779
2. Redistributions in binary form must reproduce the above copyright
780
   notice, this list of conditions and the following disclaimer in the
781
   documentation and/or other materials provided with the distribution.
782
783
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
784
America and IDIAP Research Institute nor the names of its contributors may be
785
   used to endorse or promote products derived from this software without
786
   specific prior written permission.
787
788
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
789
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
790
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
791
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
792
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
793
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
794
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
795
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
796
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
797
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
798
POSSIBILITY OF SUCH DAMAGE.
799
*/
800
801
#ifndef SIMDutf_INTERNAL_ISADETECTION_H
802
#define SIMDutf_INTERNAL_ISADETECTION_H
803
804
#include <cstdint>
805
#include <cstdlib>
806
#if defined(_MSC_VER)
807
  #include <intrin.h>
808
#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
809
  #include <cpuid.h>
810
#endif
811
812
813
// RISC-V ISA detection utilities
814
#if SIMDUTF_IS_RISCV64 && defined(__linux__)
815
  #include <unistd.h> // for syscall
816
// We define these ourselves, for backwards compatibility
817
struct simdutf_riscv_hwprobe {
818
  int64_t key;
819
  uint64_t value;
820
};
821
  #define simdutf_riscv_hwprobe(...) syscall(258, __VA_ARGS__)
822
  #define SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0 4
823
  #define SIMDUTF_RISCV_HWPROBE_IMA_V (1 << 2)
824
  #define SIMDUTF_RISCV_HWPROBE_EXT_ZVBB (1 << 17)
825
#endif // SIMDUTF_IS_RISCV64 && defined(__linux__)
826
827
#if defined(__loongarch__) && defined(__linux__)
828
  #include <sys/auxv.h>
829
// bits/hwcap.h
830
// #define HWCAP_LOONGARCH_LSX             (1 << 4)
831
// #define HWCAP_LOONGARCH_LASX            (1 << 5)
832
#endif
833
834
namespace simdutf {
835
namespace internal {
836
837
/**
 * Bit flags naming the instruction-set extensions that runtime detection can
 * report. detect_supported_architectures() ORs these together into a
 * uint32_t mask, so every enumerator other than DEFAULT must occupy a bit of
 * its own; two names sharing a bit cannot be told apart in the mask.
 */
enum instruction_set {
  DEFAULT = 0x0,
  NEON = 0x1,
  AVX2 = 0x4,
  SSE42 = 0x8,
  PCLMULQDQ = 0x10,
  BMI1 = 0x20,
  BMI2 = 0x40,
  ALTIVEC = 0x80,
  AVX512F = 0x100,
  AVX512DQ = 0x200,
  AVX512IFMA = 0x400,
  AVX512PF = 0x800,
  AVX512ER = 0x1000,
  AVX512CD = 0x2000,
  AVX512BW = 0x4000,
  AVX512VL = 0x8000,
  AVX512VBMI2 = 0x10000,
  // Was 0x2000, which is the same bit as AVX512CD: a CPU reporting only one of
  // the two was indistinguishable from one reporting the other.
  AVX512VPOPCNTDQ = 0x20000,
  LSX = 0x40000,
  LASX = 0x80000,
  // Were 0x4000 / 0x8000, the same bits as AVX512BW / AVX512VL.
  RVV = 0x100000,
  ZVBB = 0x200000,
};
861
862
#if defined(__PPC64__)
863
864
/**
 * PowerPC64 variant (selected by the surrounding __PPC64__ check): performs
 * no runtime probing and always reports ALTIVEC.
 *
 * @return instruction_set::ALTIVEC
 */
static inline uint32_t detect_supported_architectures() {
865
  return instruction_set::ALTIVEC;
866
}
867
868
#elif SIMDUTF_IS_RISCV64
869
870
static inline uint32_t detect_supported_architectures() {
871
  uint32_t host_isa = instruction_set::DEFAULT;
872
  #if SIMDUTF_IS_RVV
873
  host_isa |= instruction_set::RVV;
874
  #endif
875
  #if SIMDUTF_IS_ZVBB
876
  host_isa |= instruction_set::ZVBB;
877
  #endif
878
  #if defined(__linux__)
879
  simdutf_riscv_hwprobe probes[] = {{SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0, 0}};
880
  long ret = simdutf_riscv_hwprobe(&probes, sizeof probes / sizeof *probes, 0,
881
                                   nullptr, 0);
882
  if (ret == 0) {
883
    uint64_t extensions = probes[0].value;
884
    if (extensions & SIMDUTF_RISCV_HWPROBE_IMA_V)
885
      host_isa |= instruction_set::RVV;
886
    if (extensions & SIMDUTF_RISCV_HWPROBE_EXT_ZVBB)
887
      host_isa |= instruction_set::ZVBB;
888
  }
889
  #endif
890
  #if defined(RUN_IN_SPIKE_SIMULATOR)
891
  // Proxy Kernel does not implement yet hwprobe syscall
892
  host_isa |= instruction_set::RVV;
893
  #endif
894
  return host_isa;
895
}
896
897
#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
898
899
/**
 * AArch64 variant (selected by __aarch64__ / _M_ARM64 / _M_ARM64EC): performs
 * no runtime probing and always reports NEON.
 *
 * @return instruction_set::NEON
 */
static inline uint32_t detect_supported_architectures() {
900
  return instruction_set::NEON;
901
}
902
903
#elif defined(__x86_64__) || defined(_M_AMD64) // x64
904
905
namespace {
906
namespace cpuid_bit {
907
// Can be found on Intel ISA Reference for CPUID
908
909
// EAX = 0x01
910
constexpr uint32_t pclmulqdq = uint32_t(1)
911
                               << 1; ///< @private bit  1 of ECX for EAX=0x1
912
constexpr uint32_t sse42 = uint32_t(1)
913
                           << 20; ///< @private bit 20 of ECX for EAX=0x1
914
constexpr uint32_t osxsave =
915
    (uint32_t(1) << 26) |
916
    (uint32_t(1) << 27); ///< @private bits 26+27 of ECX for EAX=0x1
917
918
// EAX = 0x7 (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf)
919
// See: "Table 3-8. Information Returned by CPUID Instruction"
920
namespace ebx {
921
constexpr uint32_t bmi1 = uint32_t(1) << 3;
922
constexpr uint32_t avx2 = uint32_t(1) << 5;
923
constexpr uint32_t bmi2 = uint32_t(1) << 8;
924
constexpr uint32_t avx512f = uint32_t(1) << 16;
925
constexpr uint32_t avx512dq = uint32_t(1) << 17;
926
constexpr uint32_t avx512ifma = uint32_t(1) << 21;
927
constexpr uint32_t avx512cd = uint32_t(1) << 28;
928
constexpr uint32_t avx512bw = uint32_t(1) << 30;
929
constexpr uint32_t avx512vl = uint32_t(1) << 31;
930
} // namespace ebx
931
932
namespace ecx {
933
constexpr uint32_t avx512vbmi = uint32_t(1) << 1;
934
constexpr uint32_t avx512vbmi2 = uint32_t(1) << 6;
935
constexpr uint32_t avx512vnni = uint32_t(1) << 11;
936
constexpr uint32_t avx512bitalg = uint32_t(1) << 12;
937
constexpr uint32_t avx512vpopcnt = uint32_t(1) << 14;
938
} // namespace ecx
939
namespace edx {
940
constexpr uint32_t avx512vp2intersect = uint32_t(1) << 8;
941
}
942
namespace xcr0_bit {
943
constexpr uint64_t avx256_saved = uint64_t(1) << 2; ///< @private bit 2 = AVX
944
constexpr uint64_t avx512_saved =
945
    uint64_t(7) << 5; ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM
946
} // namespace xcr0_bit
947
} // namespace cpuid_bit
948
} // namespace
949
950
/**
 * Executes the CPUID instruction.
 *
 * @param eax in: leaf to query; out: EAX returned by the CPU
 * @param ebx out: EBX returned by the CPU
 * @param ecx in: sub-leaf to query; out: ECX returned by the CPU
 * @param edx out: EDX returned by the CPU
 *
 * On the <cpuid.h> path, a leaf above the maximum the CPU reports yields all
 * four outputs set to zero.
 */
static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
                         uint32_t *edx) {
  #if defined(_MSC_VER)
  int cpu_info[4];
  __cpuidex(cpu_info, *eax, *ecx);
  *eax = cpu_info[0];
  *ebx = cpu_info[1];
  *ecx = cpu_info[2];
  *edx = cpu_info[3];
  #elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
  const uint32_t level = *eax;
  const uint32_t subleaf = *ecx;
  // __get_cpuid() is not used here: it does not load the sub-leaf into ECX
  // (required for leaf 0x7) and it leaves the outputs untouched when the leaf
  // is unsupported, which would hand stale register values to the caller.
  const unsigned int max_level = __get_cpuid_max(level & 0x80000000u, nullptr);
  if (max_level == 0 || max_level < level) {
    *eax = 0;
    *ebx = 0;
    *ecx = 0;
    *edx = 0;
    return;
  }
  unsigned int a = 0, b = 0, c = 0, d = 0;
  __cpuid_count(level, subleaf, a, b, c, d);
  *eax = a;
  *ebx = b;
  *ecx = c;
  *edx = d;
  #else
  // "+a"/"+c": EAX and ECX are both inputs (leaf, sub-leaf) and outputs.
  uint32_t a = *eax, b, c = *ecx, d;
  asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
  *eax = a;
  *ebx = b;
  *ecx = c;
  *edx = d;
  #endif
}
Unexecuted instantiation: node_buffer.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: node_builtins.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: node_i18n.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: node_metadata.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: string_bytes.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: util.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: inspector_profiler.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: main_thread_interface.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: node_string.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: encoding_binding.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: inspector_socket.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
971
972
0
/**
 * Reads an extended control register with the XGETBV instruction, always
 * passing register index 0. The caller in this file stores the result as
 * XCR0 and tests its "state saved by the OS" bits.
 *
 * @return the 64-bit register value (EDX:EAX on the inline-asm path).
 */
static inline uint64_t xgetbv() {
973
0
  #if defined(_MSC_VER)
974
0
  return _xgetbv(0);
975
0
  #else
976
0
  uint32_t xcr0_lo, xcr0_hi;
977
0
  // ECX = 0 selects the register; the low half comes back in EAX, the high
  // half in EDX.
  asm volatile("xgetbv\n\t" : "=a"(xcr0_lo), "=d"(xcr0_hi) : "c"(0));
978
0
  return xcr0_lo | ((uint64_t)xcr0_hi << 32);
979
0
  #endif
980
0
}
Unexecuted instantiation: node_buffer.cc:simdutf::internal::xgetbv()
Unexecuted instantiation: node_builtins.cc:simdutf::internal::xgetbv()
Unexecuted instantiation: node_i18n.cc:simdutf::internal::xgetbv()
Unexecuted instantiation: node_metadata.cc:simdutf::internal::xgetbv()
Unexecuted instantiation: string_bytes.cc:simdutf::internal::xgetbv()
Unexecuted instantiation: util.cc:simdutf::internal::xgetbv()
Unexecuted instantiation: inspector_profiler.cc:simdutf::internal::xgetbv()
Unexecuted instantiation: main_thread_interface.cc:simdutf::internal::xgetbv()
Unexecuted instantiation: node_string.cc:simdutf::internal::xgetbv()
Unexecuted instantiation: encoding_binding.cc:simdutf::internal::xgetbv()
Unexecuted instantiation: inspector_socket.cc:simdutf::internal::xgetbv()
981
982
0
static inline uint32_t detect_supported_architectures() {
983
0
  uint32_t eax;
984
0
  uint32_t ebx = 0;
985
0
  uint32_t ecx = 0;
986
0
  uint32_t edx = 0;
987
0
  uint32_t host_isa = 0x0;
988
0
989
0
  // EBX for EAX=0x1
990
0
  eax = 0x1;
991
0
  cpuid(&eax, &ebx, &ecx, &edx);
992
0
993
0
  if (ecx & cpuid_bit::sse42) {
994
0
    host_isa |= instruction_set::SSE42;
995
0
  }
996
0
997
0
  if (ecx & cpuid_bit::pclmulqdq) {
998
0
    host_isa |= instruction_set::PCLMULQDQ;
999
0
  }
1000
0
1001
0
  if ((ecx & cpuid_bit::osxsave) != cpuid_bit::osxsave) {
1002
0
    return host_isa;
1003
0
  }
1004
0
1005
0
  // xgetbv for checking if the OS saves registers
1006
0
  uint64_t xcr0 = xgetbv();
1007
0
1008
0
  if ((xcr0 & cpuid_bit::xcr0_bit::avx256_saved) == 0) {
1009
0
    return host_isa;
1010
0
  }
1011
0
  // ECX for EAX=0x7
1012
0
  eax = 0x7;
1013
0
  ecx = 0x0; // Sub-leaf = 0
1014
0
  cpuid(&eax, &ebx, &ecx, &edx);
1015
0
  if (ebx & cpuid_bit::ebx::avx2) {
1016
0
    host_isa |= instruction_set::AVX2;
1017
0
  }
1018
0
  if (ebx & cpuid_bit::ebx::bmi1) {
1019
0
    host_isa |= instruction_set::BMI1;
1020
0
  }
1021
0
  if (ebx & cpuid_bit::ebx::bmi2) {
1022
0
    host_isa |= instruction_set::BMI2;
1023
0
  }
1024
0
  if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) ==
1025
0
        cpuid_bit::xcr0_bit::avx512_saved)) {
1026
0
    return host_isa;
1027
0
  }
1028
0
  if (ebx & cpuid_bit::ebx::avx512f) {
1029
0
    host_isa |= instruction_set::AVX512F;
1030
0
  }
1031
0
  if (ebx & cpuid_bit::ebx::avx512bw) {
1032
0
    host_isa |= instruction_set::AVX512BW;
1033
0
  }
1034
0
  if (ebx & cpuid_bit::ebx::avx512cd) {
1035
0
    host_isa |= instruction_set::AVX512CD;
1036
0
  }
1037
0
  if (ebx & cpuid_bit::ebx::avx512dq) {
1038
0
    host_isa |= instruction_set::AVX512DQ;
1039
0
  }
1040
0
  if (ebx & cpuid_bit::ebx::avx512vl) {
1041
0
    host_isa |= instruction_set::AVX512VL;
1042
0
  }
1043
0
  if (ecx & cpuid_bit::ecx::avx512vbmi2) {
1044
0
    host_isa |= instruction_set::AVX512VBMI2;
1045
0
  }
1046
0
  if (ecx & cpuid_bit::ecx::avx512vpopcnt) {
1047
0
    host_isa |= instruction_set::AVX512VPOPCNTDQ;
1048
0
  }
1049
0
  return host_isa;
1050
0
}
Unexecuted instantiation: node_buffer.cc:simdutf::internal::detect_supported_architectures()
Unexecuted instantiation: node_builtins.cc:simdutf::internal::detect_supported_architectures()
Unexecuted instantiation: node_i18n.cc:simdutf::internal::detect_supported_architectures()
Unexecuted instantiation: node_metadata.cc:simdutf::internal::detect_supported_architectures()
Unexecuted instantiation: string_bytes.cc:simdutf::internal::detect_supported_architectures()
Unexecuted instantiation: util.cc:simdutf::internal::detect_supported_architectures()
Unexecuted instantiation: inspector_profiler.cc:simdutf::internal::detect_supported_architectures()
Unexecuted instantiation: main_thread_interface.cc:simdutf::internal::detect_supported_architectures()
Unexecuted instantiation: node_string.cc:simdutf::internal::detect_supported_architectures()
Unexecuted instantiation: encoding_binding.cc:simdutf::internal::detect_supported_architectures()
Unexecuted instantiation: inspector_socket.cc:simdutf::internal::detect_supported_architectures()
1051
#elif defined(__loongarch__)
1052
1053
static inline uint32_t detect_supported_architectures() {
1054
  uint32_t host_isa = instruction_set::DEFAULT;
1055
  #if defined(__linux__)
1056
  uint64_t hwcap = 0;
1057
  hwcap = getauxval(AT_HWCAP);
1058
  if (hwcap & HWCAP_LOONGARCH_LSX) {
1059
    host_isa |= instruction_set::LSX;
1060
  }
1061
  if (hwcap & HWCAP_LOONGARCH_LASX) {
1062
    host_isa |= instruction_set::LASX;
1063
  }
1064
  #endif
1065
  return host_isa;
1066
}
1067
#else // fallback
1068
1069
// includes 32-bit ARM.
1070
/**
 * Fallback variant used when none of the architecture checks above match:
 * no SIMD extension is reported.
 *
 * @return instruction_set::DEFAULT
 */
static inline uint32_t detect_supported_architectures() {
1071
  return instruction_set::DEFAULT;
1072
}
1073
1074
#endif // end SIMD extension detection code
1075
1076
} // namespace internal
1077
} // namespace simdutf
1078
1079
#endif // SIMDutf_INTERNAL_ISADETECTION_H
1080
/* end file include/simdutf/internal/isadetection.h */
1081
1082
#if SIMDUTF_SPAN
1083
  #include <concepts>
1084
  #include <type_traits>
1085
  #include <span>
1086
  #include <tuple>
1087
#endif
1088
#if SIMDUTF_CPLUSPLUS17
1089
  #include <string_view>
1090
#endif
1091
// The following defines are conditionally enabled/disabled during amalgamation.
1092
// By default all features are enabled, regular code shouldn't check them. Only
1093
// when user code really relies on a selected subset, it's good to verify these
1094
// flags, like:
1095
//
1096
//      #if !SIMDUTF_FEATURE_UTF16
1097
//      #   error("Please amalgamate simdutf with UTF-16 support")
1098
//      #endif
1099
//
1100
#define SIMDUTF_FEATURE_DETECT_ENCODING 1
1101
#define SIMDUTF_FEATURE_ASCII 1
1102
#define SIMDUTF_FEATURE_LATIN1 1
1103
#define SIMDUTF_FEATURE_UTF8 1
1104
#define SIMDUTF_FEATURE_UTF16 1
1105
#define SIMDUTF_FEATURE_UTF32 1
1106
#define SIMDUTF_FEATURE_BASE64 1
1107
1108
namespace simdutf {
1109
1110
#if SIMDUTF_SPAN
1111
/// helpers placed in namespace detail are not a part of the public API
1112
namespace detail {
1113
/**
1114
 * matches a byte, in the many ways C++ allows. note that these
1115
 * are all distinct types.
1116
 */
1117
template <typename T>
1118
concept byte_like = std::is_same_v<T, std::byte> ||   //
1119
                    std::is_same_v<T, char> ||        //
1120
                    std::is_same_v<T, signed char> || //
1121
                    std::is_same_v<T, unsigned char>;
1122
1123
template <typename T>
1124
concept is_byte_like = byte_like<std::remove_cvref_t<T>>;
1125
1126
template <typename T>
1127
concept is_pointer = std::is_pointer_v<T>;
1128
1129
/**
1130
 * matches anything that behaves like std::span and points to character-like
1131
 * data such as: std::byte, char, unsigned char, signed char, std::int8_t,
1132
 * std::uint8_t
1133
 */
1134
template <typename T>
1135
concept input_span_of_byte_like = requires(const T &t) {
1136
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
1137
  { t.data() } noexcept -> is_pointer;
1138
  { *t.data() } noexcept -> is_byte_like;
1139
};
1140
1141
template <typename T>
1142
concept is_mutable = !std::is_const_v<std::remove_reference_t<T>>;
1143
1144
/**
1145
 * like input_span_of_byte_like, but for an output span (intended to be written to)
1146
 */
1147
template <typename T>
1148
concept output_span_of_byte_like = requires(T &t) {
1149
  { t.size() } noexcept -> std::convertible_to<std::size_t>;
1150
  { t.data() } noexcept -> is_pointer;
1151
  { *t.data() } noexcept -> is_byte_like;
1152
  { *t.data() } noexcept -> is_mutable;
1153
};
1154
} // namespace detail
1155
#endif
1156
1157
#if SIMDUTF_FEATURE_DETECT_ENCODING
1158
/**
1159
 * Autodetect the encoding of the input, a single encoding is recommended.
1160
 * E.g., the function might return simdutf::encoding_type::UTF8,
1161
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
1162
 * simdutf::encoding_type::UTF32_LE.
1163
 *
1164
 * @param input the string to analyze.
1165
 * @param length the length of the string in bytes.
1166
 * @return the detected encoding type
1167
 */
1168
simdutf_warn_unused simdutf::encoding_type
1169
autodetect_encoding(const char *input, size_t length) noexcept;
1170
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
1171
0
autodetect_encoding(const uint8_t *input, size_t length) noexcept {
1172
0
  return autodetect_encoding(reinterpret_cast<const char *>(input), length);
1173
0
}
1174
  #if SIMDUTF_SPAN
1175
/**
1176
 * Autodetect the encoding of the input, a single encoding is recommended.
1177
 * E.g., the function might return simdutf::encoding_type::UTF8,
1178
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
1179
 * simdutf::encoding_type::UTF32_LE.
1180
 *
1181
 * @param input the string to analyze. Can be anything span-like that has a
1182
 * data() and size() that points to character data: std::string,
1183
 * std::string_view, std::vector<char>, std::span<const std::byte> etc.
1184
 * @return the detected encoding type
1185
 */
1186
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
1187
autodetect_encoding(
1188
    const detail::input_span_of_byte_like auto &input) noexcept {
1189
  return autodetect_encoding(reinterpret_cast<const char *>(input.data()),
1190
                             input.size());
1191
}
1192
  #endif // SIMDUTF_SPAN
1193
1194
/**
1195
 * Autodetect the possible encodings of the input in one pass.
1196
 * E.g., if the input might be UTF-16LE or UTF-8, this function returns
1197
 * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE).
1198
 *
1199
 * Overridden by each implementation.
1200
 *
1201
 * @param input the string to analyze.
1202
 * @param length the length of the string in bytes.
1203
 * @return the detected encoding type
1204
 */
1205
simdutf_warn_unused int detect_encodings(const char *input,
1206
                                         size_t length) noexcept;
1207
simdutf_really_inline simdutf_warn_unused int
1208
0
detect_encodings(const uint8_t *input, size_t length) noexcept {
1209
0
  return detect_encodings(reinterpret_cast<const char *>(input), length);
1210
0
}
1211
  #if SIMDUTF_SPAN
1212
simdutf_really_inline simdutf_warn_unused int
1213
detect_encodings(const detail::input_span_of_byte_like auto &input) noexcept {
1214
  return detect_encodings(reinterpret_cast<const char *>(input.data()),
1215
                          input.size());
1216
}
1217
  #endif // SIMDUTF_SPAN
1218
#endif   // SIMDUTF_FEATURE_DETECT_ENCODING
1219
1220
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
1221
/**
1222
 * Validate the UTF-8 string. This function may be best when you expect
1223
 * the input to be almost always valid. Otherwise, consider using
1224
 * validate_utf8_with_errors.
1225
 *
1226
 * Overridden by each implementation.
1227
 *
1228
 * @param buf the UTF-8 string to validate.
1229
 * @param len the length of the string in bytes.
1230
 * @return true if and only if the string is valid UTF-8.
1231
 */
1232
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
1233
  #if SIMDUTF_SPAN
1234
simdutf_really_inline simdutf_warn_unused bool
1235
validate_utf8(const detail::input_span_of_byte_like auto &input) noexcept {
1236
  return validate_utf8(reinterpret_cast<const char *>(input.data()),
1237
                       input.size());
1238
}
1239
  #endif // SIMDUTF_SPAN
1240
#endif   // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
1241
1242
#if SIMDUTF_FEATURE_UTF8
1243
/**
1244
 * Validate the UTF-8 string and stop on error.
1245
 *
1246
 * Overridden by each implementation.
1247
 *
1248
 * @param buf the UTF-8 string to validate.
1249
 * @param len the length of the string in bytes.
1250
 * @return a result pair struct (of type simdutf::result containing the two
1251
 * fields error and count) with an error code and either position of the error
1252
 * (in the input in code units) if any, or the number of code units validated if
1253
 * successful.
1254
 */
1255
simdutf_warn_unused result validate_utf8_with_errors(const char *buf,
1256
                                                     size_t len) noexcept;
1257
  #if SIMDUTF_SPAN
1258
simdutf_really_inline simdutf_warn_unused result validate_utf8_with_errors(
1259
    const detail::input_span_of_byte_like auto &input) noexcept {
1260
  return validate_utf8_with_errors(reinterpret_cast<const char *>(input.data()),
1261
                                   input.size());
1262
}
1263
  #endif // SIMDUTF_SPAN
1264
#endif   // SIMDUTF_FEATURE_UTF8
1265
1266
#if SIMDUTF_FEATURE_ASCII
1267
/**
1268
 * Validate the ASCII string.
1269
 *
1270
 * Overridden by each implementation.
1271
 *
1272
 * @param buf the ASCII string to validate.
1273
 * @param len the length of the string in bytes.
1274
 * @return true if and only if the string is valid ASCII.
1275
 */
1276
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept;
1277
  #if SIMDUTF_SPAN
1278
simdutf_really_inline simdutf_warn_unused bool
1279
validate_ascii(const detail::input_span_of_byte_like auto &input) noexcept {
1280
  return validate_ascii(reinterpret_cast<const char *>(input.data()),
1281
                        input.size());
1282
}
1283
  #endif // SIMDUTF_SPAN
1284
1285
/**
1286
 * Validate the ASCII string and stop on error. It might be faster than
1287
 * validate_ascii when an error is expected to occur early.
1288
 *
1289
 * Overridden by each implementation.
1290
 *
1291
 * @param buf the ASCII string to validate.
1292
 * @param len the length of the string in bytes.
1293
 * @return a result pair struct (of type simdutf::result containing the two
1294
 * fields error and count) with an error code and either position of the error
1295
 * (in the input in code units) if any, or the number of code units validated if
1296
 * successful.
1297
 */
1298
simdutf_warn_unused result validate_ascii_with_errors(const char *buf,
1299
                                                      size_t len) noexcept;
1300
  #if SIMDUTF_SPAN
1301
simdutf_really_inline simdutf_warn_unused result validate_ascii_with_errors(
1302
    const detail::input_span_of_byte_like auto &input) noexcept {
1303
  return validate_ascii_with_errors(
1304
      reinterpret_cast<const char *>(input.data()), input.size());
1305
}
1306
  #endif // SIMDUTF_SPAN
1307
#endif   // SIMDUTF_FEATURE_ASCII
1308
1309
#if SIMDUTF_FEATURE_UTF16
1310
/**
1311
 * Using native endianness; Validate the UTF-16 string.
1312
 * This function may be best when you expect the input to be almost always
1313
 * valid. Otherwise, consider using validate_utf16_with_errors.
1314
 *
1315
 * Overridden by each implementation.
1316
 *
1317
 * This function is not BOM-aware.
1318
 *
1319
 * @param buf the UTF-16 string to validate.
1320
 * @param len the length of the string in number of 2-byte code units
1321
 * (char16_t).
1322
 * @return true if and only if the string is valid UTF-16.
1323
 */
1324
simdutf_warn_unused bool validate_utf16(const char16_t *buf,
1325
                                        size_t len) noexcept;
1326
  #if SIMDUTF_SPAN
1327
simdutf_really_inline simdutf_warn_unused bool
1328
0
validate_utf16(std::span<const char16_t> input) noexcept {
1329
0
  return validate_utf16(input.data(), input.size());
1330
0
}
1331
  #endif // SIMDUTF_SPAN
1332
#endif   // SIMDUTF_FEATURE_UTF16
1333
1334
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
1335
/**
1336
 * Validate the UTF-16LE string. This function may be best when you expect
1337
 * the input to be almost always valid. Otherwise, consider using
1338
 * validate_utf16le_with_errors.
1339
 *
1340
 * Overridden by each implementation.
1341
 *
1342
 * This function is not BOM-aware.
1343
 *
1344
 * @param buf the UTF-16LE string to validate.
1345
 * @param len the length of the string in number of 2-byte code units
1346
 * (char16_t).
1347
 * @return true if and only if the string is valid UTF-16LE.
1348
 */
1349
simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
1350
                                          size_t len) noexcept;
1351
  #if SIMDUTF_SPAN
1352
simdutf_really_inline simdutf_warn_unused bool
1353
0
validate_utf16le(std::span<const char16_t> input) noexcept {
1354
0
  return validate_utf16le(input.data(), input.size());
1355
0
}
1356
  #endif // SIMDUTF_SPAN
1357
#endif   // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
1358
1359
#if SIMDUTF_FEATURE_UTF16
1360
/**
1361
 * Validate the UTF-16BE string. This function may be best when you expect
1362
 * the input to be almost always valid. Otherwise, consider using
1363
 * validate_utf16be_with_errors.
1364
 *
1365
 * Overridden by each implementation.
1366
 *
1367
 * This function is not BOM-aware.
1368
 *
1369
 * @param buf the UTF-16BE string to validate.
1370
 * @param len the length of the string in number of 2-byte code units
1371
 * (char16_t).
1372
 * @return true if and only if the string is valid UTF-16BE.
1373
 */
1374
simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
1375
                                          size_t len) noexcept;
1376
  #if SIMDUTF_SPAN
1377
simdutf_really_inline simdutf_warn_unused bool
1378
0
validate_utf16be(std::span<const char16_t> input) noexcept {
1379
0
  return validate_utf16be(input.data(), input.size());
1380
0
}
1381
  #endif // SIMDUTF_SPAN
1382
1383
/**
1384
 * Using native endianness; Validate the UTF-16 string and stop on error.
1385
 * It might be faster than validate_utf16 when an error is expected to occur
1386
 * early.
1387
 *
1388
 * Overridden by each implementation.
1389
 *
1390
 * This function is not BOM-aware.
1391
 *
1392
 * @param buf the UTF-16 string to validate.
1393
 * @param len the length of the string in number of 2-byte code units
1394
 * (char16_t).
1395
 * @return a result pair struct (of type simdutf::result containing the two
1396
 * fields error and count) with an error code and either position of the error
1397
 * (in the input in code units) if any, or the number of code units validated if
1398
 * successful.
1399
 */
1400
simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf,
1401
                                                      size_t len) noexcept;
1402
  #if SIMDUTF_SPAN
1403
simdutf_really_inline simdutf_warn_unused result
1404
0
validate_utf16_with_errors(std::span<const char16_t> input) noexcept {
1405
0
  return validate_utf16_with_errors(input.data(), input.size());
1406
0
}
1407
  #endif // SIMDUTF_SPAN
1408
1409
/**
1410
 * Validate the UTF-16LE string and stop on error. It might be faster than
1411
 * validate_utf16le when an error is expected to occur early.
1412
 *
1413
 * Overridden by each implementation.
1414
 *
1415
 * This function is not BOM-aware.
1416
 *
1417
 * @param buf the UTF-16LE string to validate.
1418
 * @param len the length of the string in number of 2-byte code units
1419
 * (char16_t).
1420
 * @return a result pair struct (of type simdutf::result containing the two
1421
 * fields error and count) with an error code and either position of the error
1422
 * (in the input in code units) if any, or the number of code units validated if
1423
 * successful.
1424
 */
1425
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf,
1426
                                                        size_t len) noexcept;
1427
  #if SIMDUTF_SPAN
1428
simdutf_really_inline simdutf_warn_unused result
1429
0
validate_utf16le_with_errors(std::span<const char16_t> input) noexcept {
1430
0
  return validate_utf16le_with_errors(input.data(), input.size());
1431
0
}
1432
  #endif // SIMDUTF_SPAN
1433
1434
/**
1435
 * Validate the UTF-16BE string and stop on error. It might be faster than
1436
 * validate_utf16be when an error is expected to occur early.
1437
 *
1438
 * Overridden by each implementation.
1439
 *
1440
 * This function is not BOM-aware.
1441
 *
1442
 * @param buf the UTF-16BE string to validate.
1443
 * @param len the length of the string in number of 2-byte code units
1444
 * (char16_t).
1445
 * @return a result pair struct (of type simdutf::result containing the two
1446
 * fields error and count) with an error code and either position of the error
1447
 * (in the input in code units) if any, or the number of code units validated if
1448
 * successful.
1449
 */
1450
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf,
1451
                                                        size_t len) noexcept;
1452
  #if SIMDUTF_SPAN
1453
simdutf_really_inline simdutf_warn_unused result
1454
0
validate_utf16be_with_errors(std::span<const char16_t> input) noexcept {
1455
0
  return validate_utf16be_with_errors(input.data(), input.size());
1456
0
}
1457
  #endif // SIMDUTF_SPAN
1458
1459
/**
1460
 * Fixes an ill-formed UTF-16LE string by replacing mismatched surrogates with
1461
 * the Unicode replacement character U+FFFD. If input and output point to
1462
 * different memory areas, the procedure copies string, and it's expected that
1463
 * output memory is at least as big as the input. It's also possible to set
1464
 * input equal output, that makes replacements an in-place operation.
1465
 *
1466
 * @param input the UTF-16LE string to correct.
1467
 * @param len the length of the string in number of 2-byte code units
1468
 * (char16_t).
1469
 * @param output the output buffer.
1470
 */
1471
void to_well_formed_utf16le(const char16_t *input, size_t len,
1472
                            char16_t *output) noexcept;
1473
  #if SIMDUTF_SPAN
1474
simdutf_really_inline void
1475
to_well_formed_utf16le(std::span<const char16_t> input,
1476
0
                       std::span<char16_t> output) noexcept {
1477
0
  to_well_formed_utf16le(input.data(), input.size(), output.data());
1478
0
}
1479
  #endif // SIMDUTF_SPAN
1480
1481
/**
1482
 * Fixes an ill-formed UTF-16BE string by replacing mismatched surrogates with
1483
 * the Unicode replacement character U+FFFD. If input and output point to
1484
 * different memory areas, the procedure copies string, and it's expected that
1485
 * output memory is at least as big as the input. It's also possible to set
1486
 * input equal output, that makes replacements an in-place operation.
1487
 *
1488
 * @param input the UTF-16BE string to correct.
1489
 * @param len the length of the string in number of 2-byte code units
1490
 * (char16_t).
1491
 * @param output the output buffer.
1492
 */
1493
void to_well_formed_utf16be(const char16_t *input, size_t len,
1494
                            char16_t *output) noexcept;
1495
  #if SIMDUTF_SPAN
1496
simdutf_really_inline void
1497
to_well_formed_utf16be(std::span<const char16_t> input,
1498
0
                       std::span<char16_t> output) noexcept {
1499
0
  to_well_formed_utf16be(input.data(), input.size(), output.data());
1500
0
}
1501
  #endif // SIMDUTF_SPAN
1502
1503
/**
1504
 * Fixes an ill-formed UTF-16 string by replacing mismatched surrogates with the
1505
 * Unicode replacement character U+FFFD. If input and output point to different
1506
 * memory areas, the procedure copies string, and it's expected that output
1507
 * memory is at least as big as the input. It's also possible to set input equal
1508
 * output, that makes replacements an in-place operation.
1509
 *
1510
 * @param input the UTF-16 string to correct.
1511
 * @param len the length of the string in number of 2-byte code units
1512
 * (char16_t).
1513
 * @param output the output buffer.
1514
 */
1515
void to_well_formed_utf16(const char16_t *input, size_t len,
1516
                          char16_t *output) noexcept;
1517
  #if SIMDUTF_SPAN
1518
simdutf_really_inline void
1519
to_well_formed_utf16(std::span<const char16_t> input,
1520
0
                     std::span<char16_t> output) noexcept {
1521
0
  to_well_formed_utf16(input.data(), input.size(), output.data());
1522
0
}
1523
  #endif // SIMDUTF_SPAN
1524
1525
#endif // SIMDUTF_FEATURE_UTF16
1526
1527
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
1528
/**
1529
 * Validate the UTF-32 string. This function may be best when you expect
1530
 * the input to be almost always valid. Otherwise, consider using
1531
 * validate_utf32_with_errors.
1532
 *
1533
 * Overridden by each implementation.
1534
 *
1535
 * This function is not BOM-aware.
1536
 *
1537
 * @param buf the UTF-32 string to validate.
1538
 * @param len the length of the string in number of 4-byte code units
1539
 * (char32_t).
1540
 * @return true if and only if the string is valid UTF-32.
1541
 */
1542
simdutf_warn_unused bool validate_utf32(const char32_t *buf,
1543
                                        size_t len) noexcept;
1544
  #if SIMDUTF_SPAN
1545
simdutf_really_inline simdutf_warn_unused bool
1546
0
validate_utf32(std::span<const char32_t> input) noexcept {
1547
0
  return validate_utf32(input.data(), input.size());
1548
0
}
1549
  #endif // SIMDUTF_SPAN
1550
#endif   // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
1551
1552
#if SIMDUTF_FEATURE_UTF32
1553
/**
1554
 * Validate the UTF-32 string and stop on error. It might be faster than
1555
 * validate_utf32 when an error is expected to occur early.
1556
 *
1557
 * Overridden by each implementation.
1558
 *
1559
 * This function is not BOM-aware.
1560
 *
1561
 * @param buf the UTF-32 string to validate.
1562
 * @param len the length of the string in number of 4-byte code units
1563
 * (char32_t).
1564
 * @return a result pair struct (of type simdutf::result containing the two
1565
 * fields error and count) with an error code and either position of the error
1566
 * (in the input in code units) if any, or the number of code units validated if
1567
 * successful.
1568
 */
1569
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf,
1570
                                                      size_t len) noexcept;
1571
  #if SIMDUTF_SPAN
1572
simdutf_really_inline simdutf_warn_unused result
1573
0
validate_utf32_with_errors(std::span<const char32_t> input) noexcept {
1574
0
  return validate_utf32_with_errors(input.data(), input.size());
1575
0
}
1576
  #endif // SIMDUTF_SPAN
1577
#endif   // SIMDUTF_FEATURE_UTF32
1578
1579
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1580
/**
1581
 * Convert Latin1 string into UTF-8 string.
1582
 *
1583
 * This function is suitable to work with inputs from untrusted sources.
1584
 *
1585
 * @param input         the Latin1 string to convert
1586
 * @param length        the length of the string in bytes
1587
 * @param utf8_output   the pointer to buffer that can hold conversion result
1588
 * @return the number of written char; 0 if conversion is not possible
1589
 */
1590
simdutf_warn_unused size_t convert_latin1_to_utf8(const char *input,
1591
                                                  size_t length,
1592
                                                  char *utf8_output) noexcept;
1593
  #if SIMDUTF_SPAN
1594
// Span overload: forwards to the pointer-based convert_latin1_to_utf8.
// Both data pointers are cast to the char pointers that overload expects, so
// that byte-like spans whose element type is not char are accepted as well
// (matching the sibling span overloads, e.g. convert_utf8_to_latin1).
// The size of utf8_output is not consulted: the pointer-based overload takes
// no output length, so the caller must provide a large enough buffer.
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf8(
    const detail::input_span_of_byte_like auto &latin1_input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  return convert_latin1_to_utf8(
      reinterpret_cast<const char *>(latin1_input.data()), latin1_input.size(),
      reinterpret_cast<char *>(utf8_output.data()));
}
1601
  #endif // SIMDUTF_SPAN
1602
1603
/**
1604
 * Convert Latin1 string into UTF-8 string with output limit.
1605
 *
1606
 * This function is suitable to work with inputs from untrusted sources.
1607
 *
1608
 * We write as many characters as possible.
1609
 *
1610
 * @param input         the Latin1 string to convert
1611
 * @param length        the length of the string in bytes
1612
 * @param utf8_output   the pointer to buffer that can hold conversion result
1613
 * @param utf8_len      the maximum output length
1614
 * @return the number of written char; 0 if conversion is not possible
1615
 */
1616
simdutf_warn_unused size_t
1617
convert_latin1_to_utf8_safe(const char *input, size_t length, char *utf8_output,
1618
                            size_t utf8_len) noexcept;
1619
  #if SIMDUTF_SPAN
1620
// Span overload: forwards to the pointer-based convert_latin1_to_utf8_safe,
// passing the size of utf8_output as the output limit.
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf8_safe(
    const detail::input_span_of_byte_like auto &input,
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
  // implementation note: outputspan is a forwarding ref to avoid copying and
  // allow both lvalues and rvalues. std::span can be copied without problems,
  // but std::vector should not, and this function should accept both. it will
  // allow using an owning rvalue ref (example: passing a temporary std::string)
  // as output, but the user will quickly find out that they have no way of
  // getting the data out of the object in that case.
  //
  // Both data pointers are cast so that byte-like spans whose element type is
  // not char are accepted on the input side as well as the output side.
  return convert_latin1_to_utf8_safe(
      reinterpret_cast<const char *>(input.data()), input.size(),
      reinterpret_cast<char *>(utf8_output.data()), utf8_output.size());
}
1633
  #endif // SIMDUTF_SPAN
1634
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1635
1636
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1637
/**
1638
 * Convert Latin1 string into UTF-16LE string.
1639
 *
1640
 * This function is suitable to work with inputs from untrusted sources.
1641
 *
1642
 * @param input         the Latin1 string to convert
1643
 * @param length        the length of the string in bytes
1644
 * @param utf16_output  the pointer to buffer that can hold conversion result
1645
 * @return the number of written char16_t; 0 if conversion is not possible
1646
 */
1647
simdutf_warn_unused size_t convert_latin1_to_utf16le(
1648
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1649
  #if SIMDUTF_SPAN
1650
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf16le(
1651
    const detail::input_span_of_byte_like auto &latin1_input,
1652
    std::span<char16_t> utf16_output) noexcept {
1653
  return convert_latin1_to_utf16le(
1654
      reinterpret_cast<const char *>(latin1_input.data()), latin1_input.size(),
1655
      utf16_output.data());
1656
}
1657
  #endif // SIMDUTF_SPAN
1658
1659
/**
1660
 * Convert Latin1 string into UTF-16BE string.
1661
 *
1662
 * This function is suitable to work with inputs from untrusted sources.
1663
 *
1664
 * @param input         the Latin1 string to convert
1665
 * @param length        the length of the string in bytes
1666
 * @param utf16_output  the pointer to buffer that can hold conversion result
1667
 * @return the number of written char16_t; 0 if conversion is not possible
1668
 */
1669
simdutf_warn_unused size_t convert_latin1_to_utf16be(
1670
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1671
  #if SIMDUTF_SPAN
1672
simdutf_really_inline simdutf_warn_unused size_t
1673
convert_latin1_to_utf16be(const detail::input_span_of_byte_like auto &input,
1674
                          std::span<char16_t> output) noexcept {
1675
  return convert_latin1_to_utf16be(reinterpret_cast<const char *>(input.data()),
1676
                                   input.size(), output.data());
1677
}
1678
  #endif // SIMDUTF_SPAN
1679
/**
1680
 * Compute the number of bytes that this UTF-16 string would require in Latin1
1681
 * format.
1682
 *
1683
 * @param length        the length of the string in 2-byte code units (char16_t)
1684
 * @return the length of the string in Latin1 code units (char) required to
1685
 * encode the UTF-16 string as Latin1
1686
 */
1687
simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
1688
1689
/**
1690
 * Compute the number of code units that this Latin1 string would require in
1691
 * UTF-16 format.
1692
 *
1693
 * @param length        the length of the string in Latin1 code units (char)
1694
 * @return the length of the string in 2-byte code units (char16_t) required to
1695
 * encode the Latin1 string as UTF-16
1696
 */
1697
simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept;
1698
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1699
1700
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
1701
/**
1702
 * Convert Latin1 string into UTF-32 string.
1703
 *
1704
 * This function is suitable to work with inputs from untrusted sources.
1705
 *
1706
 * @param input         the Latin1 string to convert
1707
 * @param length        the length of the string in bytes
1708
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
1709
 * @return the number of written char32_t; 0 if conversion is not possible
1710
 */
1711
simdutf_warn_unused size_t convert_latin1_to_utf32(
1712
    const char *input, size_t length, char32_t *utf32_buffer) noexcept;
1713
  #if SIMDUTF_SPAN
1714
simdutf_really_inline simdutf_warn_unused size_t convert_latin1_to_utf32(
1715
    const detail::input_span_of_byte_like auto &latin1_input,
1716
    std::span<char32_t> utf32_output) noexcept {
1717
  return convert_latin1_to_utf32(
1718
      reinterpret_cast<const char *>(latin1_input.data()), latin1_input.size(),
1719
      utf32_output.data());
1720
}
1721
  #endif // SIMDUTF_SPAN
1722
#endif   // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
1723
1724
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1725
/**
1726
 * Convert possibly broken UTF-8 string into latin1 string.
1727
 *
1728
 * During the conversion also validation of the input string is done.
1729
 * This function is suitable to work with inputs from untrusted sources.
1730
 *
1731
 * @param input         the UTF-8 string to convert
1732
 * @param length        the length of the string in bytes
1733
 * @param latin1_output  the pointer to buffer that can hold conversion result
1734
 * @return the number of written char; 0 if the input was not valid UTF-8 string
1735
 * or if it cannot be represented as Latin1
1736
 */
1737
simdutf_warn_unused size_t convert_utf8_to_latin1(const char *input,
1738
                                                  size_t length,
1739
                                                  char *latin1_output) noexcept;
1740
  #if SIMDUTF_SPAN
1741
simdutf_really_inline simdutf_warn_unused size_t convert_utf8_to_latin1(
1742
    const detail::input_span_of_byte_like auto &input,
1743
    detail::output_span_of_byte_like auto &&output) noexcept {
1744
  return convert_utf8_to_latin1(reinterpret_cast<const char *>(input.data()),
1745
                                input.size(),
1746
                                reinterpret_cast<char *>(output.data()));
1747
}
1748
  #endif // SIMDUTF_SPAN
1749
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1750
1751
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1752
/**
1753
 * Using native endianness, convert possibly broken UTF-8 string into a UTF-16
1754
 * string.
1755
 *
1756
 * During the conversion also validation of the input string is done.
1757
 * This function is suitable to work with inputs from untrusted sources.
1758
 *
1759
 * @param input         the UTF-8 string to convert
1760
 * @param length        the length of the string in bytes
1761
 * @param utf16_output  the pointer to buffer that can hold conversion result
1762
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
1763
 * string
1764
 */
1765
simdutf_warn_unused size_t convert_utf8_to_utf16(
1766
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1767
  #if SIMDUTF_SPAN
1768
simdutf_really_inline simdutf_warn_unused size_t
1769
convert_utf8_to_utf16(const detail::input_span_of_byte_like auto &input,
1770
                      std::span<char16_t> output) noexcept {
1771
  return convert_utf8_to_utf16(reinterpret_cast<const char *>(input.data()),
1772
                               input.size(), output.data());
1773
}
1774
  #endif // SIMDUTF_SPAN
1775
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1776
1777
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1778
/**
1779
 * Using native endianness, convert a Latin1 string into a UTF-16 string.
1780
 *
1781
 * @param input         the Latin1 string to convert
1782
 * @param length        the length of the string in bytes
1783
 * @param utf16_output  the pointer to buffer that can hold conversion result
1784
 * @return the number of written char16_t.
1785
 */
1786
simdutf_warn_unused size_t convert_latin1_to_utf16(
1787
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1788
  #if SIMDUTF_SPAN
1789
simdutf_really_inline simdutf_warn_unused size_t
1790
convert_latin1_to_utf16(const detail::input_span_of_byte_like auto &input,
1791
                        std::span<char16_t> output) noexcept {
1792
  return convert_latin1_to_utf16(reinterpret_cast<const char *>(input.data()),
1793
                                 input.size(), output.data());
1794
}
1795
  #endif // SIMDUTF_SPAN
1796
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1797
1798
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1799
/**
1800
 * Convert possibly broken UTF-8 string into UTF-16LE string.
1801
 *
1802
 * During the conversion also validation of the input string is done.
1803
 * This function is suitable to work with inputs from untrusted sources.
1804
 *
1805
 * @param input         the UTF-8 string to convert
1806
 * @param length        the length of the string in bytes
1807
 * @param utf16_output  the pointer to buffer that can hold conversion result
1808
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
1809
 * string
1810
 */
1811
simdutf_warn_unused size_t convert_utf8_to_utf16le(
1812
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1813
  #if SIMDUTF_SPAN
1814
simdutf_really_inline simdutf_warn_unused size_t
1815
convert_utf8_to_utf16le(const detail::input_span_of_byte_like auto &utf8_input,
1816
                        std::span<char16_t> utf16_output) noexcept {
1817
  return convert_utf8_to_utf16le(
1818
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1819
      utf16_output.data());
1820
}
1821
  #endif // SIMDUTF_SPAN
1822
1823
/**
1824
 * Convert possibly broken UTF-8 string into UTF-16BE string.
1825
 *
1826
 * During the conversion also validation of the input string is done.
1827
 * This function is suitable to work with inputs from untrusted sources.
1828
 *
1829
 * @param input         the UTF-8 string to convert
1830
 * @param length        the length of the string in bytes
1831
 * @param utf16_output  the pointer to buffer that can hold conversion result
1832
 * @return the number of written char16_t; 0 if the input was not valid UTF-8
1833
 * string
1834
 */
1835
simdutf_warn_unused size_t convert_utf8_to_utf16be(
1836
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1837
  #if SIMDUTF_SPAN
1838
simdutf_really_inline simdutf_warn_unused size_t
1839
convert_utf8_to_utf16be(const detail::input_span_of_byte_like auto &utf8_input,
1840
                        std::span<char16_t> utf16_output) noexcept {
1841
  return convert_utf8_to_utf16be(
1842
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1843
      utf16_output.data());
1844
}
1845
  #endif // SIMDUTF_SPAN
1846
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1847
1848
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1849
/**
1850
 * Convert possibly broken UTF-8 string into latin1 string with errors.
1851
 * If the string cannot be represented as Latin1, an error
1852
 * code is returned.
1853
 *
1854
 * During the conversion also validation of the input string is done.
1855
 * This function is suitable to work with inputs from untrusted sources.
1856
 *
1857
 * @param input         the UTF-8 string to convert
1858
 * @param length        the length of the string in bytes
1859
 * @param latin1_output  the pointer to buffer that can hold conversion result
1860
 * @return a result pair struct (of type simdutf::result containing the two
1861
 * fields error and count) with an error code and either position of the error
1862
 * (in the input in code units) if any, or the number of char written if
1863
 * successful.
1864
 */
1865
simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
1866
    const char *input, size_t length, char *latin1_output) noexcept;
1867
  #if SIMDUTF_SPAN
1868
simdutf_really_inline simdutf_warn_unused result
1869
convert_utf8_to_latin1_with_errors(
1870
    const detail::input_span_of_byte_like auto &utf8_input,
1871
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1872
  return convert_utf8_to_latin1_with_errors(
1873
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1874
      reinterpret_cast<char *>(latin1_output.data()));
1875
}
1876
  #endif // SIMDUTF_SPAN
1877
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1878
1879
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1880
/**
1881
 * Using native endianness, convert possibly broken UTF-8 string into UTF-16
1882
 * string and stop on error.
1883
 *
1884
 * During the conversion also validation of the input string is done.
1885
 * This function is suitable to work with inputs from untrusted sources.
1886
 *
1887
 * @param input         the UTF-8 string to convert
1888
 * @param length        the length of the string in bytes
1889
 * @param utf16_output  the pointer to buffer that can hold conversion result
1890
 * @return a result pair struct (of type simdutf::result containing the two
1891
 * fields error and count) with an error code and either position of the error
1892
 * (in the input in code units) if any, or the number of char16_t written if
1893
 * successful.
1894
 */
1895
simdutf_warn_unused result convert_utf8_to_utf16_with_errors(
1896
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1897
  #if SIMDUTF_SPAN
1898
simdutf_really_inline simdutf_warn_unused result
1899
convert_utf8_to_utf16_with_errors(
1900
    const detail::input_span_of_byte_like auto &utf8_input,
1901
    std::span<char16_t> utf16_output) noexcept {
1902
  return convert_utf8_to_utf16_with_errors(
1903
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1904
      utf16_output.data());
1905
}
1906
  #endif // SIMDUTF_SPAN
1907
1908
/**
1909
 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
1910
 *
1911
 * During the conversion also validation of the input string is done.
1912
 * This function is suitable to work with inputs from untrusted sources.
1913
 *
1914
 * @param input         the UTF-8 string to convert
1915
 * @param length        the length of the string in bytes
1916
 * @param utf16_output  the pointer to buffer that can hold conversion result
1917
 * @return a result pair struct (of type simdutf::result containing the two
1918
 * fields error and count) with an error code and either position of the error
1919
 * (in the input in code units) if any, or the number of char16_t written if
1920
 * successful.
1921
 */
1922
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
1923
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1924
  #if SIMDUTF_SPAN
1925
simdutf_really_inline simdutf_warn_unused result
1926
convert_utf8_to_utf16le_with_errors(
1927
    const detail::input_span_of_byte_like auto &utf8_input,
1928
    std::span<char16_t> utf16_output) noexcept {
1929
  return convert_utf8_to_utf16le_with_errors(
1930
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1931
      utf16_output.data());
1932
}
1933
  #endif // SIMDUTF_SPAN
1934
1935
/**
1936
 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
1937
 *
1938
 * During the conversion also validation of the input string is done.
1939
 * This function is suitable to work with inputs from untrusted sources.
1940
 *
1941
 * @param input         the UTF-8 string to convert
1942
 * @param length        the length of the string in bytes
1943
 * @param utf16_output  the pointer to buffer that can hold conversion result
1944
 * @return a result pair struct (of type simdutf::result containing the two
1945
 * fields error and count) with an error code and either position of the error
1946
 * (in the input in code units) if any, or the number of char16_t written if
1947
 * successful.
1948
 */
1949
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
1950
    const char *input, size_t length, char16_t *utf16_output) noexcept;
1951
  #if SIMDUTF_SPAN
1952
simdutf_really_inline simdutf_warn_unused result
1953
convert_utf8_to_utf16be_with_errors(
1954
    const detail::input_span_of_byte_like auto &utf8_input,
1955
    std::span<char16_t> utf16_output) noexcept {
1956
  return convert_utf8_to_utf16be_with_errors(
1957
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1958
      utf16_output.data());
1959
}
1960
  #endif // SIMDUTF_SPAN
1961
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1962
1963
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1964
/**
1965
 * Convert possibly broken UTF-8 string into UTF-32 string.
1966
 *
1967
 * During the conversion also validation of the input string is done.
1968
 * This function is suitable to work with inputs from untrusted sources.
1969
 *
1970
 * @param input         the UTF-8 string to convert
1971
 * @param length        the length of the string in bytes
1972
 * @param utf32_output  the pointer to buffer that can hold conversion result
1973
 * @return the number of written char32_t; 0 if the input was not valid UTF-8
1974
 * string
1975
 */
1976
simdutf_warn_unused size_t convert_utf8_to_utf32(
1977
    const char *input, size_t length, char32_t *utf32_output) noexcept;
1978
  #if SIMDUTF_SPAN
1979
simdutf_really_inline simdutf_warn_unused size_t
1980
convert_utf8_to_utf32(const detail::input_span_of_byte_like auto &utf8_input,
1981
                      std::span<char32_t> utf32_output) noexcept {
1982
  return convert_utf8_to_utf32(
1983
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1984
      utf32_output.data());
1985
}
1986
  #endif // SIMDUTF_SPAN
1987
1988
/**
1989
 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
1990
 *
1991
 * During the conversion also validation of the input string is done.
1992
 * This function is suitable to work with inputs from untrusted sources.
1993
 *
1994
 * @param input         the UTF-8 string to convert
1995
 * @param length        the length of the string in bytes
1996
 * @param utf32_output  the pointer to buffer that can hold conversion result
1997
 * @return a result pair struct (of type simdutf::result containing the two
1998
 * fields error and count) with an error code and either position of the error
1999
 * (in the input in code units) if any, or the number of char32_t written if
2000
 * successful.
2001
 */
2002
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
2003
    const char *input, size_t length, char32_t *utf32_output) noexcept;
2004
  #if SIMDUTF_SPAN
2005
simdutf_really_inline simdutf_warn_unused result
2006
convert_utf8_to_utf32_with_errors(
2007
    const detail::input_span_of_byte_like auto &utf8_input,
2008
    std::span<char32_t> utf32_output) noexcept {
2009
  return convert_utf8_to_utf32_with_errors(
2010
      reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
2011
      utf32_output.data());
2012
}
2013
  #endif // SIMDUTF_SPAN
2014
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2015
2016
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
2017
/**
2018
 * Convert valid UTF-8 string into latin1 string.
2019
 *
2020
 * This function assumes that the input string is valid UTF-8 and that it can be
2021
 * represented as Latin1. If you violate this assumption, the result is
2022
 * implementation defined and may include system-dependent behavior such as
2023
 * crashes.
2024
 *
2025
 * This function is for expert users only and not part of our public API. Use
2026
 * convert_utf8_to_latin1 instead. The function may be removed from the library
2027
 * in the future.
2028
 *
2029
 * This function is not BOM-aware.
2030
 *
2031
 * @param input         the UTF-8 string to convert
2032
 * @param length        the length of the string in bytes
2033
 * @param latin1_output  the pointer to buffer that can hold conversion result
2034
 * @return the number of written char
2035
 */
2036
simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
2037
    const char *input, size_t length, char *latin1_output) noexcept;
2038
  #if SIMDUTF_SPAN
2039
// Span overload: forwards to the pointer-based convert_valid_utf8_to_latin1.
// Both data pointers are cast to the char pointers that overload expects, so
// that byte-like spans whose element type is not char are accepted as well
// (matching the sibling span overloads, e.g. convert_utf8_to_latin1).
// The size of latin1_output is not consulted: the pointer-based overload takes
// no output length, so the caller must provide a large enough buffer.
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
    const detail::input_span_of_byte_like auto &valid_utf8_input,
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
  return convert_valid_utf8_to_latin1(
      reinterpret_cast<const char *>(valid_utf8_input.data()),
      valid_utf8_input.size(),
      reinterpret_cast<char *>(latin1_output.data()));
}
2046
  #endif // SIMDUTF_SPAN
2047
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
2048
2049
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2050
/**
2051
 * Using native endianness, convert valid UTF-8 string into a UTF-16 string.
2052
 *
2053
 * This function assumes that the input string is valid UTF-8.
2054
 *
2055
 * @param input         the UTF-8 string to convert
2056
 * @param length        the length of the string in bytes
2057
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
2058
 * @return the number of written char16_t
2059
 */
2060
simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
2061
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
2062
  #if SIMDUTF_SPAN
2063
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
2064
    const detail::input_span_of_byte_like auto &valid_utf8_input,
2065
    std::span<char16_t> utf16_output) noexcept {
2066
  return convert_valid_utf8_to_utf16(
2067
      reinterpret_cast<const char *>(valid_utf8_input.data()),
2068
      valid_utf8_input.size(), utf16_output.data());
2069
}
2070
  #endif // SIMDUTF_SPAN
2071
2072
/**
2073
 * Convert valid UTF-8 string into UTF-16LE string.
2074
 *
2075
 * This function assumes that the input string is valid UTF-8.
2076
 *
2077
 * @param input         the UTF-8 string to convert
2078
 * @param length        the length of the string in bytes
2079
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
2080
 * @return the number of written char16_t
2081
 */
2082
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
2083
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
2084
  #if SIMDUTF_SPAN
2085
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
2086
    const detail::input_span_of_byte_like auto &valid_utf8_input,
2087
    std::span<char16_t> utf16_output) noexcept {
2088
  return convert_valid_utf8_to_utf16le(
2089
      reinterpret_cast<const char *>(valid_utf8_input.data()),
2090
      valid_utf8_input.size(), utf16_output.data());
2091
}
2092
  #endif // SIMDUTF_SPAN
2093
2094
/**
2095
 * Convert valid UTF-8 string into UTF-16BE string.
2096
 *
2097
 * This function assumes that the input string is valid UTF-8.
2098
 *
2099
 * @param input         the UTF-8 string to convert
2100
 * @param length        the length of the string in bytes
2101
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
2102
 * @return the number of written char16_t
2103
 */
2104
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
2105
    const char *input, size_t length, char16_t *utf16_buffer) noexcept;
2106
  #if SIMDUTF_SPAN
2107
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
2108
    const detail::input_span_of_byte_like auto &valid_utf8_input,
2109
    std::span<char16_t> utf16_output) noexcept {
2110
  return convert_valid_utf8_to_utf16be(
2111
      reinterpret_cast<const char *>(valid_utf8_input.data()),
2112
      valid_utf8_input.size(), utf16_output.data());
2113
}
2114
  #endif // SIMDUTF_SPAN
2115
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2116
2117
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2118
/**
2119
 * Convert valid UTF-8 string into UTF-32 string.
2120
 *
2121
 * This function assumes that the input string is valid UTF-8.
2122
 *
2123
 * @param input         the UTF-8 string to convert
2124
 * @param length        the length of the string in bytes
2125
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
2126
 * @return the number of written char32_t
2127
 */
2128
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
2129
    const char *input, size_t length, char32_t *utf32_buffer) noexcept;
2130
  #if SIMDUTF_SPAN
2131
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
2132
    const detail::input_span_of_byte_like auto &valid_utf8_input,
2133
    std::span<char32_t> utf32_output) noexcept {
2134
  return convert_valid_utf8_to_utf32(
2135
      reinterpret_cast<const char *>(valid_utf8_input.data()),
2136
      valid_utf8_input.size(), utf32_output.data());
2137
}
2138
  #endif // SIMDUTF_SPAN
2139
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2140
2141
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
2142
/**
2143
 * Return the number of bytes that this Latin1 string would require in UTF-8
2144
 * format.
2145
 *
2146
 * @param input         the Latin1 string to convert
2147
 * @param length        the length of the string in bytes
2148
 * @return the number of bytes required to encode the Latin1 string as UTF-8
2149
 */
2150
simdutf_warn_unused size_t utf8_length_from_latin1(const char *input,
2151
                                                   size_t length) noexcept;
2152
  #if SIMDUTF_SPAN
2153
simdutf_really_inline simdutf_warn_unused size_t utf8_length_from_latin1(
2154
    const detail::input_span_of_byte_like auto &latin1_input) noexcept {
2155
  return utf8_length_from_latin1(
2156
      reinterpret_cast<const char *>(latin1_input.data()), latin1_input.size());
2157
}
2158
  #endif // SIMDUTF_SPAN
2159
2160
/**
2161
 * Compute the number of bytes that this UTF-8 string would require in Latin1
2162
 * format.
2163
 *
2164
 * This function does not validate the input. It is acceptable to pass invalid
2165
 * UTF-8 strings but in such cases the result is implementation defined.
2166
 *
2167
 * This function is not BOM-aware.
2168
 *
2169
 * @param input         the UTF-8 string to convert
2170
 * @param length        the length of the string in bytes
2171
 * @return the number of bytes required to encode the UTF-8 string as Latin1
2172
 */
2173
simdutf_warn_unused size_t latin1_length_from_utf8(const char *input,
2174
                                                   size_t length) noexcept;
2175
  #if SIMDUTF_SPAN
2176
simdutf_really_inline simdutf_warn_unused size_t latin1_length_from_utf8(
2177
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
2178
  return latin1_length_from_utf8(
2179
      reinterpret_cast<const char *>(valid_utf8_input.data()),
2180
      valid_utf8_input.size());
2181
}
2182
  #endif // SIMDUTF_SPAN
2183
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
2184
2185
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2186
/**
2187
 * Compute the number of 2-byte code units that this UTF-8 string would require
2188
 * in UTF-16LE format.
2189
 *
2190
 * This function does not validate the input. It is acceptable to pass invalid
2191
 * UTF-8 strings but in such cases the result is implementation defined.
2192
 *
2193
 * This function is not BOM-aware.
2194
 *
2195
 * @param input         the UTF-8 string to process
2196
 * @param length        the length of the string in bytes
2197
 * @return the number of char16_t code units required to encode the UTF-8 string
2198
 * as UTF-16LE
2199
 */
2200
simdutf_warn_unused size_t utf16_length_from_utf8(const char *input,
2201
                                                  size_t length) noexcept;
2202
  #if SIMDUTF_SPAN
2203
simdutf_really_inline simdutf_warn_unused size_t utf16_length_from_utf8(
2204
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
2205
  return utf16_length_from_utf8(
2206
      reinterpret_cast<const char *>(valid_utf8_input.data()),
2207
      valid_utf8_input.size());
2208
}
2209
  #endif // SIMDUTF_SPAN
2210
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2211
2212
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2213
/**
2214
 * Compute the number of 4-byte code units that this UTF-8 string would require
2215
 * in UTF-32 format.
2216
 *
2217
 * This function is equivalent to count_utf8
2218
 *
2219
 * This function does not validate the input. It is acceptable to pass invalid
2220
 * UTF-8 strings but in such cases the result is implementation defined.
2221
 *
2222
 * This function is not BOM-aware.
2223
 *
2224
 * @param input         the UTF-8 string to process
2225
 * @param length        the length of the string in bytes
2226
 * @return the number of char32_t code units required to encode the UTF-8 string
2227
 * as UTF-32
2228
 */
2229
simdutf_warn_unused size_t utf32_length_from_utf8(const char *input,
2230
                                                  size_t length) noexcept;
2231
  #if SIMDUTF_SPAN
2232
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf8(
2233
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
2234
  return utf32_length_from_utf8(
2235
      reinterpret_cast<const char *>(valid_utf8_input.data()),
2236
      valid_utf8_input.size());
2237
}
2238
  #endif // SIMDUTF_SPAN
2239
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
2240
2241
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2242
/**
2243
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
2244
 * string.
2245
 *
2246
 * During the conversion also validation of the input string is done.
2247
 * This function is suitable to work with inputs from untrusted sources.
2248
 *
2249
 * This function is not BOM-aware.
2250
 *
2251
 * @param input         the UTF-16 string to convert
2252
 * @param length        the length of the string in 2-byte code units (char16_t)
2253
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2254
 * @return number of written code units; 0 if input is not a valid UTF-16
2255
 * string
2256
 */
2257
simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *input,
2258
                                                 size_t length,
2259
                                                 char *utf8_buffer) noexcept;
2260
  #if SIMDUTF_SPAN
2261
/**
 * Span overload of convert_utf16_to_utf8.
 *
 * Forwards utf16_input.data()/size() and the output span's data() pointer
 * (reinterpreted as char *) to the pointer-based overload. The size() of
 * utf8_output is not forwarded, so this wrapper performs no check on the
 * output capacity.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_utf8(
2262
    std::span<const char16_t> utf16_input,
2263
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2264
  return convert_utf16_to_utf8(utf16_input.data(), utf16_input.size(),
2265
                               reinterpret_cast<char *>(utf8_output.data()));
2266
}
2267
  #endif // SIMDUTF_SPAN
2268
2269
/**
2270
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
2271
 * string with output limit.
2272
 *
2273
 * We write as many characters as possible into the output buffer.
2274
 *
2275
 * During the conversion also validation of the input string is done.
2276
 * This function is suitable to work with inputs from untrusted sources.
2277
 *
2278
 * This function is not BOM-aware.
2279
 *
2280
 *
2281
 * @param input         the UTF-16 string to convert
2282
 * @param length        the length of the string in 16-bit code units (char16_t)
2283
 * @param utf8_output   the pointer to buffer that can hold conversion result
2284
 * @param utf8_len      the maximum output length
2285
 * @return the number of written char; 0 if conversion is not possible
2286
 */
2287
simdutf_warn_unused size_t convert_utf16_to_utf8_safe(const char16_t *input,
2288
                                                      size_t length,
2289
                                                      char *utf8_output,
2290
                                                      size_t utf8_len) noexcept;
2291
  #if SIMDUTF_SPAN
2292
/**
 * Span overload of convert_utf16_to_utf8_safe.
 *
 * Forwards utf16_input.data()/size(), the output span's data() pointer
 * (reinterpreted as char *) and the output span's size() to the pointer-based
 * overload, where that size is the utf8_len (maximum output length) argument.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_utf8_safe(
2293
    std::span<const char16_t> utf16_input,
2294
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2295
  // Implementation note: utf8_output is a forwarding reference so that it is
2296
  // not copied and both lvalues and rvalues are accepted. std::span can be
2297
  // copied without problems, but std::vector should not, and this function
2298
  // should accept both. An owning rvalue (example: a temporary std::string)
2299
  // is also accepted as output, but the caller then has no way of getting
2300
  // the data out of the object.
2301
  return convert_utf16_to_utf8_safe(
2302
      utf16_input.data(), utf16_input.size(),
2303
      reinterpret_cast<char *>(utf8_output.data()), utf8_output.size());
2304
}
2305
  #endif // SIMDUTF_SPAN
2306
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2307
2308
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2309
/**
2310
 * Using native endianness, convert possibly broken UTF-16 string into Latin1
2311
 * string.
2312
 *
2313
 * During the conversion also validation of the input string is done.
2314
 * This function is suitable to work with inputs from untrusted sources.
2315
 *
2316
 * This function is not BOM-aware.
2317
 *
2318
 * @param input         the UTF-16 string to convert
2319
 * @param length        the length of the string in 2-byte code units (char16_t)
2320
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2321
 * @return number of written code units; 0 if input is not a valid UTF-16 string
2322
 * or if it cannot be represented as Latin1
2323
 */
2324
simdutf_warn_unused size_t convert_utf16_to_latin1(
2325
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2326
  #if SIMDUTF_SPAN
2327
/**
 * Span overload of convert_utf16_to_latin1.
 *
 * Forwards utf16_input.data()/size() and the output span's data() pointer
 * (reinterpreted as char *) to the pointer-based overload. The size() of
 * latin1_output is not forwarded, so this wrapper performs no check on the
 * output capacity.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf16_to_latin1(
2328
    std::span<const char16_t> utf16_input,
2329
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2330
  return convert_utf16_to_latin1(
2331
      utf16_input.data(), utf16_input.size(),
2332
      reinterpret_cast<char *>(latin1_output.data()));
2333
}
2334
  #endif // SIMDUTF_SPAN
2335
2336
/**
2337
 * Convert possibly broken UTF-16LE string into Latin1 string.
2338
 * If the string cannot be represented as Latin1, an error
2339
 * is returned.
2340
 *
2341
 * During the conversion also validation of the input string is done.
2342
 * This function is suitable to work with inputs from untrusted sources.
2343
 *
2344
 * This function is not BOM-aware.
2345
 *
2346
 * @param input         the UTF-16LE string to convert
2347
 * @param length        the length of the string in 2-byte code units (char16_t)
2348
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2349
 * @return number of written code units; 0 if input is not a valid UTF-16LE
2350
 * string or if it cannot be represented as Latin1
2351
 */
2352
simdutf_warn_unused size_t convert_utf16le_to_latin1(
2353
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2354
  #if SIMDUTF_SPAN
2355
/**
 * Span overload of convert_utf16le_to_latin1.
 *
 * Forwards utf16_input.data()/size() and the output span's data() pointer
 * (reinterpreted as char *) to the pointer-based overload. The size() of
 * latin1_output is not forwarded, so this wrapper performs no check on the
 * output capacity.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf16le_to_latin1(
2356
    std::span<const char16_t> utf16_input,
2357
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2358
  return convert_utf16le_to_latin1(
2359
      utf16_input.data(), utf16_input.size(),
2360
      reinterpret_cast<char *>(latin1_output.data()));
2361
}
2362
  #endif // SIMDUTF_SPAN
2363
2364
/**
2365
 * Convert possibly broken UTF-16BE string into Latin1 string.
2366
 *
2367
 * During the conversion also validation of the input string is done.
2368
 * This function is suitable to work with inputs from untrusted sources.
2369
 *
2370
 * This function is not BOM-aware.
2371
 *
2372
 * @param input         the UTF-16BE string to convert
2373
 * @param length        the length of the string in 2-byte code units (char16_t)
2374
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2375
 * @return number of written code units; 0 if input is not a valid UTF-16BE
2376
 * string or if it cannot be represented as Latin1
2377
 */
2378
simdutf_warn_unused size_t convert_utf16be_to_latin1(
2379
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2380
  #if SIMDUTF_SPAN
2381
/**
 * Span overload of convert_utf16be_to_latin1.
 *
 * Forwards utf16_input.data()/size() and the output span's data() pointer
 * (reinterpreted as char *) to the pointer-based overload. The size() of
 * latin1_output is not forwarded, so this wrapper performs no check on the
 * output capacity.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf16be_to_latin1(
2382
    std::span<const char16_t> utf16_input,
2383
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2384
  return convert_utf16be_to_latin1(
2385
      utf16_input.data(), utf16_input.size(),
2386
      reinterpret_cast<char *>(latin1_output.data()));
2387
}
2388
  #endif // SIMDUTF_SPAN
2389
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2390
2391
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2392
/**
2393
 * Convert possibly broken UTF-16LE string into UTF-8 string.
2394
 *
2395
 * During the conversion also validation of the input string is done.
2396
 * This function is suitable to work with inputs from untrusted sources.
2397
 *
2398
 * This function is not BOM-aware.
2399
 *
2400
 * @param input         the UTF-16LE string to convert
2401
 * @param length        the length of the string in 2-byte code units (char16_t)
2402
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2403
 * @return number of written code units; 0 if input is not a valid UTF-16LE
2404
 * string
2405
 */
2406
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *input,
2407
                                                   size_t length,
2408
                                                   char *utf8_buffer) noexcept;
2409
  #if SIMDUTF_SPAN
2410
/**
 * Span overload of convert_utf16le_to_utf8.
 *
 * Forwards utf16_input.data()/size() and the output span's data() pointer
 * (reinterpreted as char *) to the pointer-based overload. The size() of
 * utf8_output is not forwarded, so this wrapper performs no check on the
 * output capacity.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf16le_to_utf8(
2411
    std::span<const char16_t> utf16_input,
2412
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2413
  return convert_utf16le_to_utf8(utf16_input.data(), utf16_input.size(),
2414
                                 reinterpret_cast<char *>(utf8_output.data()));
2415
}
2416
  #endif // SIMDUTF_SPAN
2417
2418
/**
2419
 * Convert possibly broken UTF-16BE string into UTF-8 string.
2420
 *
2421
 * During the conversion also validation of the input string is done.
2422
 * This function is suitable to work with inputs from untrusted sources.
2423
 *
2424
 * This function is not BOM-aware.
2425
 *
2426
 * @param input         the UTF-16BE string to convert
2427
 * @param length        the length of the string in 2-byte code units (char16_t)
2428
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2429
 * @return number of written code units; 0 if input is not a valid UTF-16BE
2430
 * string
2431
 */
2432
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *input,
2433
                                                   size_t length,
2434
                                                   char *utf8_buffer) noexcept;
2435
  #if SIMDUTF_SPAN
2436
/**
 * Span overload of convert_utf16be_to_utf8.
 *
 * Forwards utf16_input.data()/size() and the output span's data() pointer
 * (reinterpreted as char *) to the pointer-based overload. The size() of
 * utf8_output is not forwarded, so this wrapper performs no check on the
 * output capacity.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf16be_to_utf8(
2437
    std::span<const char16_t> utf16_input,
2438
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2439
  return convert_utf16be_to_utf8(utf16_input.data(), utf16_input.size(),
2440
                                 reinterpret_cast<char *>(utf8_output.data()));
2441
}
2442
  #endif // SIMDUTF_SPAN
2443
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2444
2445
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2446
/**
2447
 * Using native endianness, convert possibly broken UTF-16 string into Latin1
2448
 * string.
2449
 *
2450
 * During the conversion also validation of the input string is done.
2451
 * This function is suitable to work with inputs from untrusted sources.
2452
 * This function is not BOM-aware.
2453
 *
2454
 * @param input         the UTF-16 string to convert
2455
 * @param length        the length of the string in 2-byte code units (char16_t)
2456
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2457
 * @return a result pair struct (of type simdutf::result containing the two
2458
 * fields error and count) with an error code and either position of the error
2459
 * (in the input in code units) if any, or the number of char written if
2460
 * successful.
2461
 */
2462
simdutf_warn_unused result convert_utf16_to_latin1_with_errors(
2463
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2464
  #if SIMDUTF_SPAN
2465
/**
 * Span overload of convert_utf16_to_latin1_with_errors.
 *
 * Forwards utf16_input.data()/size() and the output span's data() pointer
 * (reinterpreted as char *) to the pointer-based overload and returns its
 * result unchanged. The size() of latin1_output is not forwarded, so this
 * wrapper performs no check on the output capacity.
 */
simdutf_really_inline simdutf_warn_unused result
2466
convert_utf16_to_latin1_with_errors(
2467
    std::span<const char16_t> utf16_input,
2468
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2469
  return convert_utf16_to_latin1_with_errors(
2470
      utf16_input.data(), utf16_input.size(),
2471
      reinterpret_cast<char *>(latin1_output.data()));
2472
}
2473
  #endif // SIMDUTF_SPAN
2474
2475
/**
2476
 * Convert possibly broken UTF-16LE string into Latin1 string.
2477
 *
2478
 * During the conversion also validation of the input string is done.
2479
 * This function is suitable to work with inputs from untrusted sources.
2480
 * This function is not BOM-aware.
2481
 *
2482
 * @param input         the UTF-16LE string to convert
2483
 * @param length        the length of the string in 2-byte code units (char16_t)
2484
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2485
 * @return a result pair struct (of type simdutf::result containing the two
2486
 * fields error and count) with an error code and either position of the error
2487
 * (in the input in code units) if any, or the number of char written if
2488
 * successful.
2489
 */
2490
simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
2491
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2492
  #if SIMDUTF_SPAN
2493
/**
 * Span overload of convert_utf16le_to_latin1_with_errors.
 *
 * Forwards utf16_input.data()/size() and the output span's data() pointer
 * (reinterpreted as char *) to the pointer-based overload and returns its
 * result unchanged. The size() of latin1_output is not forwarded, so this
 * wrapper performs no check on the output capacity.
 */
simdutf_really_inline simdutf_warn_unused result
2494
convert_utf16le_to_latin1_with_errors(
2495
    std::span<const char16_t> utf16_input,
2496
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2497
  return convert_utf16le_to_latin1_with_errors(
2498
      utf16_input.data(), utf16_input.size(),
2499
      reinterpret_cast<char *>(latin1_output.data()));
2500
}
2501
  #endif // SIMDUTF_SPAN
2502
2503
/**
2504
 * Convert possibly broken UTF-16BE string into Latin1 string.
2505
 * If the string cannot be represented as Latin1, an error
2506
 * is returned.
2507
 *
2508
 * During the conversion also validation of the input string is done.
2509
 * This function is suitable to work with inputs from untrusted sources.
2510
 * This function is not BOM-aware.
2511
 *
2512
 * @param input         the UTF-16BE string to convert
2513
 * @param length        the length of the string in 2-byte code units (char16_t)
2514
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2515
 * @return a result pair struct (of type simdutf::result containing the two
2516
 * fields error and count) with an error code and either position of the error
2517
 * (in the input in code units) if any, or the number of char written if
2518
 * successful.
2519
 */
2520
simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
2521
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2522
  #if SIMDUTF_SPAN
2523
/**
 * Span overload of convert_utf16be_to_latin1_with_errors.
 *
 * Forwards utf16_input.data()/size() and the output span's data() pointer
 * (reinterpreted as char *) to the pointer-based overload and returns its
 * result unchanged. The size() of latin1_output is not forwarded, so this
 * wrapper performs no check on the output capacity.
 */
simdutf_really_inline simdutf_warn_unused result
2524
convert_utf16be_to_latin1_with_errors(
2525
    std::span<const char16_t> utf16_input,
2526
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2527
  return convert_utf16be_to_latin1_with_errors(
2528
      utf16_input.data(), utf16_input.size(),
2529
      reinterpret_cast<char *>(latin1_output.data()));
2530
}
2531
  #endif // SIMDUTF_SPAN
2532
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2533
2534
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2535
/**
2536
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
2537
 * string and stop on error.
2538
 *
2539
 * During the conversion also validation of the input string is done.
2540
 * This function is suitable to work with inputs from untrusted sources.
2541
 *
2542
 * This function is not BOM-aware.
2543
 *
2544
 * @param input         the UTF-16 string to convert
2545
 * @param length        the length of the string in 2-byte code units (char16_t)
2546
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2547
 * @return a result pair struct (of type simdutf::result containing the two
2548
 * fields error and count) with an error code and either position of the error
2549
 * (in the input in code units) if any, or the number of char written if
2550
 * successful.
2551
 */
2552
simdutf_warn_unused result convert_utf16_to_utf8_with_errors(
2553
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2554
  #if SIMDUTF_SPAN
2555
/**
 * Span overload of convert_utf16_to_utf8_with_errors.
 *
 * Forwards utf16_input.data()/size() and the output span's data() pointer
 * (reinterpreted as char *) to the pointer-based overload and returns its
 * result unchanged. The size() of utf8_output is not forwarded, so this
 * wrapper performs no check on the output capacity.
 */
simdutf_really_inline simdutf_warn_unused result
2556
convert_utf16_to_utf8_with_errors(
2557
    std::span<const char16_t> utf16_input,
2558
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2559
  return convert_utf16_to_utf8_with_errors(
2560
      utf16_input.data(), utf16_input.size(),
2561
      reinterpret_cast<char *>(utf8_output.data()));
2562
}
2563
  #endif // SIMDUTF_SPAN
2564
2565
/**
2566
 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
2567
 *
2568
 * During the conversion also validation of the input string is done.
2569
 * This function is suitable to work with inputs from untrusted sources.
2570
 *
2571
 * This function is not BOM-aware.
2572
 *
2573
 * @param input         the UTF-16LE string to convert
2574
 * @param length        the length of the string in 2-byte code units (char16_t)
2575
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2576
 * @return a result pair struct (of type simdutf::result containing the two
2577
 * fields error and count) with an error code and either position of the error
2578
 * (in the input in code units) if any, or the number of char written if
2579
 * successful.
2580
 */
2581
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
2582
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2583
  #if SIMDUTF_SPAN
2584
/**
 * Span overload of convert_utf16le_to_utf8_with_errors.
 *
 * Forwards utf16_input.data()/size() and the output span's data() pointer
 * (reinterpreted as char *) to the pointer-based overload and returns its
 * result unchanged. The size() of utf8_output is not forwarded, so this
 * wrapper performs no check on the output capacity.
 */
simdutf_really_inline simdutf_warn_unused result
2585
convert_utf16le_to_utf8_with_errors(
2586
    std::span<const char16_t> utf16_input,
2587
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2588
  return convert_utf16le_to_utf8_with_errors(
2589
      utf16_input.data(), utf16_input.size(),
2590
      reinterpret_cast<char *>(utf8_output.data()));
2591
}
2592
  #endif // SIMDUTF_SPAN
2593
2594
/**
2595
 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
2596
 *
2597
 * During the conversion also validation of the input string is done.
2598
 * This function is suitable to work with inputs from untrusted sources.
2599
 *
2600
 * This function is not BOM-aware.
2601
 *
2602
 * @param input         the UTF-16BE string to convert
2603
 * @param length        the length of the string in 2-byte code units (char16_t)
2604
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
2605
 * @return a result pair struct (of type simdutf::result containing the two
2606
 * fields error and count) with an error code and either position of the error
2607
 * (in the input in code units) if any, or the number of char written if
2608
 * successful.
2609
 */
2610
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
2611
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2612
  #if SIMDUTF_SPAN
2613
/**
 * Span overload of convert_utf16be_to_utf8_with_errors.
 *
 * Forwards utf16_input.data()/size() and the output span's data() pointer
 * (reinterpreted as char *) to the pointer-based overload and returns its
 * result unchanged. The size() of utf8_output is not forwarded, so this
 * wrapper performs no check on the output capacity.
 */
simdutf_really_inline simdutf_warn_unused result
2614
convert_utf16be_to_utf8_with_errors(
2615
    std::span<const char16_t> utf16_input,
2616
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2617
  return convert_utf16be_to_utf8_with_errors(
2618
      utf16_input.data(), utf16_input.size(),
2619
      reinterpret_cast<char *>(utf8_output.data()));
2620
}
2621
  #endif // SIMDUTF_SPAN
2622
2623
/**
2624
 * Using native endianness, convert valid UTF-16 string into UTF-8 string.
2625
 *
2626
 * This function assumes that the input string is valid UTF-16.
2627
 *
2628
 * This function is not BOM-aware.
2629
 *
2630
 * @param input         the UTF-16 string to convert
2631
 * @param length        the length of the string in 2-byte code units (char16_t)
2632
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
2633
 * result
2634
 * @return number of written code units; 0 if conversion is not possible
2635
 */
2636
simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
2637
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2638
  #if SIMDUTF_SPAN
2639
/**
 * Span overload of convert_valid_utf16_to_utf8.
 *
 * Forwards valid_utf16_input.data()/size() and the output span's data()
 * pointer (reinterpreted as char *) to the pointer-based overload. The size()
 * of utf8_output is not forwarded, so this wrapper performs no check on the
 * output capacity.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
2640
    std::span<const char16_t> valid_utf16_input,
2641
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2642
  return convert_valid_utf16_to_utf8(
2643
      valid_utf16_input.data(), valid_utf16_input.size(),
2644
      reinterpret_cast<char *>(utf8_output.data()));
2645
}
2646
  #endif // SIMDUTF_SPAN
2647
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2648
2649
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2650
/**
2651
 * Using native endianness, convert UTF-16 string into Latin1 string.
2652
 *
2653
 * This function assumes that the input string is valid UTF-16 and that it can
2654
 * be represented as Latin1. If you violate this assumption, the result is
2655
 * implementation defined and may include system-dependent behavior such as
2656
 * crashes.
2657
 *
2658
 * This function is for expert users only and not part of our public API. Use
2659
 * convert_utf16_to_latin1 instead. The function may be removed from the library
2660
 * in the future.
2661
 *
2662
 * This function is not BOM-aware.
2663
 *
2664
 * @param input         the UTF-16 string to convert
2665
 * @param length        the length of the string in 2-byte code units (char16_t)
2666
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2667
 * @return number of written code units; 0 if conversion is not possible
2668
 */
2669
simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
2670
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2671
  #if SIMDUTF_SPAN
2672
/**
 * Span overload of convert_valid_utf16_to_latin1.
 *
 * Forwards valid_utf16_input.data()/size() and the output span's data()
 * pointer (reinterpreted as char *) to the pointer-based overload. The size()
 * of latin1_output is not forwarded, so this wrapper performs no check on the
 * output capacity.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
2673
    std::span<const char16_t> valid_utf16_input,
2674
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2675
  return convert_valid_utf16_to_latin1(
2676
      valid_utf16_input.data(), valid_utf16_input.size(),
2677
      reinterpret_cast<char *>(latin1_output.data()));
2678
}
2679
  #endif // SIMDUTF_SPAN
2680
2681
/**
2682
 * Convert valid UTF-16LE string into Latin1 string.
2683
 *
2684
 * This function assumes that the input string is valid UTF-16LE and that it can
2685
 * be represented as Latin1. If you violate this assumption, the result is
2686
 * implementation defined and may include system-dependent behavior such as
2687
 * crashes.
2688
 *
2689
 * This function is for expert users only and not part of our public API. Use
2690
 * convert_utf16le_to_latin1 instead. The function may be removed from the
2691
 * library in the future.
2692
 *
2693
 * This function is not BOM-aware.
2694
 *
2695
 * @param input         the UTF-16LE string to convert
2696
 * @param length        the length of the string in 2-byte code units (char16_t)
2697
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2698
 * @return number of written code units; 0 if conversion is not possible
2699
 */
2700
simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
2701
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2702
  #if SIMDUTF_SPAN
2703
/**
 * Span overload of convert_valid_utf16le_to_latin1.
 *
 * Forwards valid_utf16_input.data()/size() and the output span's data()
 * pointer (reinterpreted as char *) to the pointer-based overload. The size()
 * of latin1_output is not forwarded, so this wrapper performs no check on the
 * output capacity.
 */
simdutf_really_inline simdutf_warn_unused size_t
2704
convert_valid_utf16le_to_latin1(
2705
    std::span<const char16_t> valid_utf16_input,
2706
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2707
  return convert_valid_utf16le_to_latin1(
2708
      valid_utf16_input.data(), valid_utf16_input.size(),
2709
      reinterpret_cast<char *>(latin1_output.data()));
2710
}
2711
  #endif // SIMDUTF_SPAN
2712
2713
/**
2714
 * Convert valid UTF-16BE string into Latin1 string.
2715
 *
2716
 * This function assumes that the input string is valid UTF-16BE and that it can
2717
 * be represented as Latin1. If you violate this assumption, the result is
2718
 * implementation defined and may include system-dependent behavior such as
2719
 * crashes.
2720
 *
2721
 * This function is for expert users only and not part of our public API. Use
2722
 * convert_utf16be_to_latin1 instead. The function may be removed from the
2723
 * library in the future.
2724
 *
2725
 * This function is not BOM-aware.
2726
 *
2727
 * @param input         the UTF-16BE string to convert
2728
 * @param length        the length of the string in 2-byte code units (char16_t)
2729
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
2730
 * @return number of written code units; 0 if conversion is not possible
2731
 */
2732
simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
2733
    const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2734
  #if SIMDUTF_SPAN
2735
/**
 * Span overload of convert_valid_utf16be_to_latin1.
 *
 * Forwards valid_utf16_input.data()/size() and the output span's data()
 * pointer (reinterpreted as char *) to the pointer-based overload. The size()
 * of latin1_output is not forwarded, so this wrapper performs no check on the
 * output capacity.
 */
simdutf_really_inline simdutf_warn_unused size_t
2736
convert_valid_utf16be_to_latin1(
2737
    std::span<const char16_t> valid_utf16_input,
2738
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2739
  return convert_valid_utf16be_to_latin1(
2740
      valid_utf16_input.data(), valid_utf16_input.size(),
2741
      reinterpret_cast<char *>(latin1_output.data()));
2742
}
2743
  #endif // SIMDUTF_SPAN
2744
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2745
2746
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2747
/**
2748
 * Convert valid UTF-16LE string into UTF-8 string.
2749
 *
2750
 * This function assumes that the input string is valid UTF-16LE.
2751
 *
2752
 *
2753
 * This function is not BOM-aware.
2754
 *
2755
 * @param input         the UTF-16LE string to convert
2756
 * @param length        the length of the string in 2-byte code units (char16_t)
2757
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
2758
 * result
2759
 * @return number of written code units; 0 if conversion is not possible
2760
 */
2761
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
2762
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2763
  #if SIMDUTF_SPAN
2764
/**
 * Span overload of convert_valid_utf16le_to_utf8.
 *
 * Forwards valid_utf16_input.data()/size() and the output span's data()
 * pointer (reinterpreted as char *) to the pointer-based overload. The size()
 * of utf8_output is not forwarded, so this wrapper performs no check on the
 * output capacity.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
2765
    std::span<const char16_t> valid_utf16_input,
2766
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2767
  return convert_valid_utf16le_to_utf8(
2768
      valid_utf16_input.data(), valid_utf16_input.size(),
2769
      reinterpret_cast<char *>(utf8_output.data()));
2770
}
2771
  #endif // SIMDUTF_SPAN
2772
2773
/**
2774
 * Convert valid UTF-16BE string into UTF-8 string.
2775
 *
2776
 * This function assumes that the input string is valid UTF-16BE.
2777
 *
2778
 * This function is not BOM-aware.
2779
 *
2780
 * @param input         the UTF-16BE string to convert
2781
 * @param length        the length of the string in 2-byte code units (char16_t)
2782
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
2783
 * result
2784
 * @return number of written code units; 0 if conversion is not possible
2785
 */
2786
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
2787
    const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2788
  #if SIMDUTF_SPAN
2789
/**
 * Span overload of convert_valid_utf16be_to_utf8.
 *
 * Forwards valid_utf16_input.data()/size() and the output span's data()
 * pointer (reinterpreted as char *) to the pointer-based overload. The size()
 * of utf8_output is not forwarded, so this wrapper performs no check on the
 * output capacity.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
2790
    std::span<const char16_t> valid_utf16_input,
2791
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2792
  return convert_valid_utf16be_to_utf8(
2793
      valid_utf16_input.data(), valid_utf16_input.size(),
2794
      reinterpret_cast<char *>(utf8_output.data()));
2795
}
2796
  #endif // SIMDUTF_SPAN
2797
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2798
2799
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2800
/**
2801
 * Using native endianness, convert possibly broken UTF-16 string into UTF-32
2802
 * string.
2803
 *
2804
 * During the conversion also validation of the input string is done.
2805
 * This function is suitable to work with inputs from untrusted sources.
2806
 *
2807
 * This function is not BOM-aware.
2808
 *
2809
 * @param input         the UTF-16 string to convert
2810
 * @param length        the length of the string in 2-byte code units (char16_t)
2811
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2812
 * @return number of written code units; 0 if input is not a valid UTF-16
2813
 * string
2814
 */
2815
simdutf_warn_unused size_t convert_utf16_to_utf32(
2816
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2817
  #if SIMDUTF_SPAN
2818
/**
 * std::span overload. Forwards utf16_input.data()/size() and
 * utf32_output.data() to the pointer-based convert_utf16_to_utf32 and returns
 * its result. utf32_output.size() is never read here, so no capacity check is
 * performed.
 */
simdutf_really_inline simdutf_warn_unused size_t
2819
convert_utf16_to_utf32(std::span<const char16_t> utf16_input,
2820
0
                       std::span<char32_t> utf32_output) noexcept {
2821
0
  return convert_utf16_to_utf32(utf16_input.data(), utf16_input.size(),
2822
0
                                utf32_output.data());
2823
0
}
2824
  #endif // SIMDUTF_SPAN
2825
2826
/**
2827
 * Convert possibly broken UTF-16LE string into UTF-32 string.
2828
 *
2829
 * During the conversion also validation of the input string is done.
2830
 * This function is suitable to work with inputs from untrusted sources.
2831
 *
2832
 * This function is not BOM-aware.
2833
 *
2834
 * @param input         the UTF-16LE string to convert
2835
 * @param length        the length of the string in 2-byte code units (char16_t)
2836
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2837
 * @return number of written code units; 0 if input is not a valid UTF-16LE
2838
 * string
2839
 */
2840
simdutf_warn_unused size_t convert_utf16le_to_utf32(
2841
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2842
  #if SIMDUTF_SPAN
2843
/**
 * std::span overload. Forwards utf16_input.data()/size() and
 * utf32_output.data() to the pointer-based convert_utf16le_to_utf32 and
 * returns its result. utf32_output.size() is never read here, so no capacity
 * check is performed.
 */
simdutf_really_inline simdutf_warn_unused size_t
2844
convert_utf16le_to_utf32(std::span<const char16_t> utf16_input,
2845
0
                         std::span<char32_t> utf32_output) noexcept {
2846
0
  return convert_utf16le_to_utf32(utf16_input.data(), utf16_input.size(),
2847
0
                                  utf32_output.data());
2848
0
}
2849
  #endif // SIMDUTF_SPAN
2850
2851
/**
2852
 * Convert possibly broken UTF-16BE string into UTF-32 string.
2853
 *
2854
 * During the conversion also validation of the input string is done.
2855
 * This function is suitable to work with inputs from untrusted sources.
2856
 *
2857
 * This function is not BOM-aware.
2858
 *
2859
 * @param input         the UTF-16BE string to convert
2860
 * @param length        the length of the string in 2-byte code units (char16_t)
2861
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2862
 * @return number of written code units; 0 if input is not a valid UTF-16BE
2863
 * string
2864
 */
2865
simdutf_warn_unused size_t convert_utf16be_to_utf32(
2866
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2867
  #if SIMDUTF_SPAN
2868
/**
 * std::span overload. Forwards utf16_input.data()/size() and
 * utf32_output.data() to the pointer-based convert_utf16be_to_utf32 and
 * returns its result. utf32_output.size() is never read here, so no capacity
 * check is performed.
 */
simdutf_really_inline simdutf_warn_unused size_t
2869
convert_utf16be_to_utf32(std::span<const char16_t> utf16_input,
2870
0
                         std::span<char32_t> utf32_output) noexcept {
2871
0
  return convert_utf16be_to_utf32(utf16_input.data(), utf16_input.size(),
2872
0
                                  utf32_output.data());
2873
0
}
2874
  #endif // SIMDUTF_SPAN
2875
2876
/**
2877
 * Using native endianness, convert possibly broken UTF-16 string into
2878
 * UTF-32 string and stop on error.
2879
 *
2880
 * During the conversion also validation of the input string is done.
2881
 * This function is suitable to work with inputs from untrusted sources.
2882
 *
2883
 * This function is not BOM-aware.
2884
 *
2885
 * @param input         the UTF-16 string to convert
2886
 * @param length        the length of the string in 2-byte code units (char16_t)
2887
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2888
 * @return a result pair struct (of type simdutf::result containing the two
2889
 * fields error and count) with an error code and either position of the error
2890
 * (in the input in code units) if any, or the number of char32_t written if
2891
 * successful.
2892
 */
2893
simdutf_warn_unused result convert_utf16_to_utf32_with_errors(
2894
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2895
  #if SIMDUTF_SPAN
2896
/**
 * std::span overload. Forwards utf16_input.data()/size() and
 * utf32_output.data() to the pointer-based convert_utf16_to_utf32_with_errors
 * and returns its result unchanged. utf32_output.size() is never read here, so
 * no capacity check is performed.
 */
simdutf_really_inline simdutf_warn_unused result
2897
convert_utf16_to_utf32_with_errors(std::span<const char16_t> utf16_input,
2898
0
                                   std::span<char32_t> utf32_output) noexcept {
2899
0
  return convert_utf16_to_utf32_with_errors(
2900
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
2901
0
}
2902
  #endif // SIMDUTF_SPAN
2903
2904
/**
2905
 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
2906
 *
2907
 * During the conversion also validation of the input string is done.
2908
 * This function is suitable to work with inputs from untrusted sources.
2909
 *
2910
 * This function is not BOM-aware.
2911
 *
2912
 * @param input         the UTF-16LE string to convert
2913
 * @param length        the length of the string in 2-byte code units (char16_t)
2914
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2915
 * @return a result pair struct (of type simdutf::result containing the two
2916
 * fields error and count) with an error code and either position of the error
2917
 * (in the input in code units) if any, or the number of char32_t written if
2918
 * successful.
2919
 */
2920
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
2921
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2922
  #if SIMDUTF_SPAN
2923
/**
 * std::span overload. Forwards utf16_input.data()/size() and
 * utf32_output.data() to the pointer-based
 * convert_utf16le_to_utf32_with_errors and returns its result unchanged.
 * utf32_output.size() is never read here, so no capacity check is performed.
 */
simdutf_really_inline simdutf_warn_unused result
2924
convert_utf16le_to_utf32_with_errors(
2925
    std::span<const char16_t> utf16_input,
2926
0
    std::span<char32_t> utf32_output) noexcept {
2927
0
  return convert_utf16le_to_utf32_with_errors(
2928
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
2929
0
}
2930
  #endif // SIMDUTF_SPAN
2931
2932
/**
2933
 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
2934
 *
2935
 * During the conversion also validation of the input string is done.
2936
 * This function is suitable to work with inputs from untrusted sources.
2937
 *
2938
 * This function is not BOM-aware.
2939
 *
2940
 * @param input         the UTF-16BE string to convert
2941
 * @param length        the length of the string in 2-byte code units (char16_t)
2942
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
2943
 * @return a result pair struct (of type simdutf::result containing the two
2944
 * fields error and count) with an error code and either position of the error
2945
 * (in the input in code units) if any, or the number of char32_t written if
2946
 * successful.
2947
 */
2948
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
2949
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2950
  #if SIMDUTF_SPAN
2951
/**
 * std::span overload. Forwards utf16_input.data()/size() and
 * utf32_output.data() to the pointer-based
 * convert_utf16be_to_utf32_with_errors and returns its result unchanged.
 * utf32_output.size() is never read here, so no capacity check is performed.
 */
simdutf_really_inline simdutf_warn_unused result
2952
convert_utf16be_to_utf32_with_errors(
2953
    std::span<const char16_t> utf16_input,
2954
0
    std::span<char32_t> utf32_output) noexcept {
2955
0
  return convert_utf16be_to_utf32_with_errors(
2956
0
      utf16_input.data(), utf16_input.size(), utf32_output.data());
2957
0
}
2958
  #endif // SIMDUTF_SPAN
2959
2960
/**
2961
 * Using native endianness, convert valid UTF-16 string into UTF-32 string.
2962
 *
2963
 * This function assumes that the input string is valid UTF-16 (native
2964
 * endianness).
2965
 *
2966
 * This function is not BOM-aware.
2967
 *
2968
 * @param input         the UTF-16 string to convert
2969
 * @param length        the length of the string in 2-byte code units (char16_t)
2970
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
2971
 * result
2972
 * @return number of written code units; 0 if conversion is not possible
2973
 */
2974
simdutf_warn_unused size_t convert_valid_utf16_to_utf32(
2975
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2976
  #if SIMDUTF_SPAN
2977
/**
 * std::span overload. Forwards valid_utf16_input.data()/size() and
 * utf32_output.data() to the pointer-based convert_valid_utf16_to_utf32 and
 * returns its result. utf32_output.size() is never read here, so no capacity
 * check is performed.
 */
simdutf_really_inline simdutf_warn_unused size_t
2978
convert_valid_utf16_to_utf32(std::span<const char16_t> valid_utf16_input,
2979
0
                             std::span<char32_t> utf32_output) noexcept {
2980
0
  return convert_valid_utf16_to_utf32(
2981
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
2982
0
}
2983
  #endif // SIMDUTF_SPAN
2984
2985
/**
2986
 * Convert valid UTF-16LE string into UTF-32 string.
2987
 *
2988
 * This function assumes that the input string is valid UTF-16LE.
2989
 *
2990
 * This function is not BOM-aware.
2991
 *
2992
 * @param input         the UTF-16LE string to convert
2993
 * @param length        the length of the string in 2-byte code units (char16_t)
2994
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
2995
 * result
2996
 * @return number of written code units; 0 if conversion is not possible
2997
 */
2998
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
2999
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
3000
  #if SIMDUTF_SPAN
3001
/**
 * std::span overload. Forwards valid_utf16_input.data()/size() and
 * utf32_output.data() to the pointer-based convert_valid_utf16le_to_utf32 and
 * returns its result. utf32_output.size() is never read here, so no capacity
 * check is performed.
 */
simdutf_really_inline simdutf_warn_unused size_t
3002
convert_valid_utf16le_to_utf32(std::span<const char16_t> valid_utf16_input,
3003
0
                               std::span<char32_t> utf32_output) noexcept {
3004
0
  return convert_valid_utf16le_to_utf32(
3005
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
3006
0
}
3007
  #endif // SIMDUTF_SPAN
3008
3009
/**
3010
 * Convert valid UTF-16BE string into UTF-32 string.
3011
 *
3012
 * This function assumes that the input string is valid UTF-16BE.
3013
 *
3014
 * This function is not BOM-aware.
3015
 *
3016
 * @param input         the UTF-16BE string to convert
3017
 * @param length        the length of the string in 2-byte code units (char16_t)
3018
 * @param utf32_buffer   the pointer to a buffer that can hold the conversion
3019
 * result
3020
 * @return number of written code units; 0 if conversion is not possible
3021
 */
3022
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
3023
    const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
3024
  #if SIMDUTF_SPAN
3025
/**
 * std::span overload. Forwards valid_utf16_input.data()/size() and
 * utf32_output.data() to the pointer-based convert_valid_utf16be_to_utf32 and
 * returns its result. utf32_output.size() is never read here, so no capacity
 * check is performed.
 */
simdutf_really_inline simdutf_warn_unused size_t
3026
convert_valid_utf16be_to_utf32(std::span<const char16_t> valid_utf16_input,
3027
0
                               std::span<char32_t> utf32_output) noexcept {
3028
0
  return convert_valid_utf16be_to_utf32(
3029
0
      valid_utf16_input.data(), valid_utf16_input.size(), utf32_output.data());
3030
0
}
3031
  #endif // SIMDUTF_SPAN
3032
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3033
3034
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
3035
/**
3036
 * Compute the number of bytes that this UTF-16LE/BE string would require in
3037
 * Latin1 format.
3038
 *
3039
 * This function does not validate the input. It is acceptable to pass invalid
3040
 * UTF-16 strings but in such cases the result is implementation defined.
3041
 *
3042
 * This function is not BOM-aware.
3043
 *
3044
 * @param length        the length of the string in 2-byte code units (char16_t)
3045
 * @return the number of bytes required to encode the UTF-16 string as Latin1
3046
 */
3047
simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
3048
3049
/**
3050
 * Using native endianness, compute the number of bytes that this UTF-16
3051
 * string would require in UTF-8 format.
3052
 *
3053
 * This function does not validate the input. It is acceptable to pass invalid
3054
 * UTF-16 strings but in such cases the result is implementation defined.
3055
 *
3056
 * @param input         the UTF-16 string to convert
3057
 * @param length        the length of the string in 2-byte code units (char16_t)
3058
 * @return the number of bytes required to encode the UTF-16 string as UTF-8
3059
 */
3060
simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input,
3061
                                                  size_t length) noexcept;
3062
  #if SIMDUTF_SPAN
3063
/**
 * std::span overload. Forwards valid_utf16_input.data() and
 * valid_utf16_input.size() to the pointer-based utf8_length_from_utf16 and
 * returns its result.
 */
simdutf_really_inline simdutf_warn_unused size_t
3064
0
utf8_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
3065
0
  return utf8_length_from_utf16(valid_utf16_input.data(),
3066
0
                                valid_utf16_input.size());
3067
0
}
3068
  #endif // SIMDUTF_SPAN
3069
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
3070
3071
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
3072
/**
3073
 * Compute the number of bytes that this UTF-16LE string would require in UTF-8
3074
 * format.
3075
 *
3076
 * This function does not validate the input. It is acceptable to pass invalid
3077
 * UTF-16 strings but in such cases the result is implementation defined.
3078
 *
3079
 * @param input         the UTF-16LE string to convert
3080
 * @param length        the length of the string in 2-byte code units (char16_t)
3081
 * @return the number of bytes required to encode the UTF-16LE string as UTF-8
3082
 */
3083
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input,
3084
                                                    size_t length) noexcept;
3085
  #if SIMDUTF_SPAN
3086
/**
 * std::span overload. Forwards valid_utf16_input.data() and
 * valid_utf16_input.size() to the pointer-based utf8_length_from_utf16le and
 * returns its result.
 */
simdutf_really_inline simdutf_warn_unused size_t
3087
0
utf8_length_from_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
3088
0
  return utf8_length_from_utf16le(valid_utf16_input.data(),
3089
0
                                  valid_utf16_input.size());
3090
0
}
3091
  #endif // SIMDUTF_SPAN
3092
3093
/**
3094
 * Compute the number of bytes that this UTF-16BE string would require in UTF-8
3095
 * format.
3096
 *
3097
 * This function does not validate the input. It is acceptable to pass invalid
3098
 * UTF-16 strings but in such cases the result is implementation defined.
3099
 *
3100
 * @param input         the UTF-16BE string to convert
3101
 * @param length        the length of the string in 2-byte code units (char16_t)
3102
 * @return the number of bytes required to encode the UTF-16BE string as UTF-8
3103
 */
3104
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input,
3105
                                                    size_t length) noexcept;
3106
  #if SIMDUTF_SPAN
3107
/**
 * std::span overload. Forwards valid_utf16_input.data() and
 * valid_utf16_input.size() to the pointer-based utf8_length_from_utf16be and
 * returns its result.
 */
simdutf_really_inline simdutf_warn_unused size_t
3108
0
utf8_length_from_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
3109
0
  return utf8_length_from_utf16be(valid_utf16_input.data(),
3110
0
                                  valid_utf16_input.size());
3111
0
}
3112
  #endif // SIMDUTF_SPAN
3113
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
3114
3115
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3116
/**
3117
 * Convert possibly broken UTF-32 string into UTF-8 string.
3118
 *
3119
 * During the conversion also validation of the input string is done.
3120
 * This function is suitable to work with inputs from untrusted sources.
3121
 *
3122
 * This function is not BOM-aware.
3123
 *
3124
 * @param input         the UTF-32 string to convert
3125
 * @param length        the length of the string in 4-byte code units (char32_t)
3126
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
3127
 * @return number of written code units; 0 if input is not a valid UTF-32 string
3128
 */
3129
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *input,
3130
                                                 size_t length,
3131
                                                 char *utf8_buffer) noexcept;
3132
  #if SIMDUTF_SPAN
3133
/**
 * std::span overload. Forwards utf32_input.data()/size() and
 * utf8_output.data() (reinterpreted as char *) to the pointer-based
 * convert_utf32_to_utf8 and returns its result. utf8_output.size() is never
 * read here, so no capacity check is performed.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf32_to_utf8(
3134
    std::span<const char32_t> utf32_input,
3135
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
3136
  return convert_utf32_to_utf8(utf32_input.data(), utf32_input.size(),
3137
                               reinterpret_cast<char *>(utf8_output.data()));
3138
}
3139
  #endif // SIMDUTF_SPAN
3140
3141
/**
3142
 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
3143
 *
3144
 * During the conversion also validation of the input string is done.
3145
 * This function is suitable to work with inputs from untrusted sources.
3146
 *
3147
 * This function is not BOM-aware.
3148
 *
3149
 * @param input         the UTF-32 string to convert
3150
 * @param length        the length of the string in 4-byte code units (char32_t)
3151
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
3152
 * @return a result pair struct (of type simdutf::result containing the two
3153
 * fields error and count) with an error code and either position of the error
3154
 * (in the input in code units) if any, or the number of char written if
3155
 * successful.
3156
 */
3157
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
3158
    const char32_t *input, size_t length, char *utf8_buffer) noexcept;
3159
  #if SIMDUTF_SPAN
3160
/**
 * std::span overload. Forwards utf32_input.data()/size() and
 * utf8_output.data() (reinterpreted as char *) to the pointer-based
 * convert_utf32_to_utf8_with_errors and returns its result unchanged.
 * utf8_output.size() is never read here, so no capacity check is performed.
 */
simdutf_really_inline simdutf_warn_unused result
3161
convert_utf32_to_utf8_with_errors(
3162
    std::span<const char32_t> utf32_input,
3163
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
3164
  return convert_utf32_to_utf8_with_errors(
3165
      utf32_input.data(), utf32_input.size(),
3166
      reinterpret_cast<char *>(utf8_output.data()));
3167
}
3168
  #endif // SIMDUTF_SPAN
3169
3170
/**
3171
 * Convert valid UTF-32 string into UTF-8 string.
3172
 *
3173
 * This function assumes that the input string is valid UTF-32.
3174
 *
3175
 * This function is not BOM-aware.
3176
 *
3177
 * @param input         the UTF-32 string to convert
3178
 * @param length        the length of the string in 4-byte code units (char32_t)
3179
 * @param utf8_buffer   the pointer to a buffer that can hold the conversion
3180
 * result
3181
 * @return number of written code units; 0 if conversion is not possible
3182
 */
3183
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
3184
    const char32_t *input, size_t length, char *utf8_buffer) noexcept;
3185
  #if SIMDUTF_SPAN
3186
/**
 * std::span overload. Forwards valid_utf32_input.data()/size() and
 * utf8_output.data() (reinterpreted as char *) to the pointer-based
 * convert_valid_utf32_to_utf8 and returns its result. utf8_output.size() is
 * never read here, so no capacity check is performed.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
3187
    std::span<const char32_t> valid_utf32_input,
3188
    detail::output_span_of_byte_like auto &&utf8_output) noexcept {
3189
  return convert_valid_utf32_to_utf8(
3190
      valid_utf32_input.data(), valid_utf32_input.size(),
3191
      reinterpret_cast<char *>(utf8_output.data()));
3192
}
3193
  #endif // SIMDUTF_SPAN
3194
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3195
3196
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3197
/**
3198
 * Using native endianness, convert possibly broken UTF-32 string into a UTF-16
3199
 * string.
3200
 *
3201
 * During the conversion also validation of the input string is done.
3202
 * This function is suitable to work with inputs from untrusted sources.
3203
 *
3204
 * This function is not BOM-aware.
3205
 *
3206
 * @param input         the UTF-32 string to convert
3207
 * @param length        the length of the string in 4-byte code units (char32_t)
3208
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3209
 * @return number of written code units; 0 if input is not a valid UTF-32 string
3210
 */
3211
simdutf_warn_unused size_t convert_utf32_to_utf16(
3212
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3213
  #if SIMDUTF_SPAN
3214
/**
 * std::span overload. Forwards utf32_input.data()/size() and
 * utf16_output.data() to the pointer-based convert_utf32_to_utf16 and returns
 * its result. utf16_output.size() is never read here, so no capacity check is
 * performed.
 */
simdutf_really_inline simdutf_warn_unused size_t
3215
convert_utf32_to_utf16(std::span<const char32_t> utf32_input,
3216
0
                       std::span<char16_t> utf16_output) noexcept {
3217
0
  return convert_utf32_to_utf16(utf32_input.data(), utf32_input.size(),
3218
0
                                utf16_output.data());
3219
0
}
3220
  #endif // SIMDUTF_SPAN
3221
3222
/**
3223
 * Convert possibly broken UTF-32 string into UTF-16LE string.
3224
 *
3225
 * During the conversion also validation of the input string is done.
3226
 * This function is suitable to work with inputs from untrusted sources.
3227
 *
3228
 * This function is not BOM-aware.
3229
 *
3230
 * @param input         the UTF-32 string to convert
3231
 * @param length        the length of the string in 4-byte code units (char32_t)
3232
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3233
 * @return number of written code units; 0 if input is not a valid UTF-32 string
3234
 */
3235
simdutf_warn_unused size_t convert_utf32_to_utf16le(
3236
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3237
  #if SIMDUTF_SPAN
3238
/**
 * std::span overload. Forwards utf32_input.data()/size() and
 * utf16_output.data() to the pointer-based convert_utf32_to_utf16le and
 * returns its result. utf16_output.size() is never read here, so no capacity
 * check is performed.
 */
simdutf_really_inline simdutf_warn_unused size_t
3239
convert_utf32_to_utf16le(std::span<const char32_t> utf32_input,
3240
0
                         std::span<char16_t> utf16_output) noexcept {
3241
0
  return convert_utf32_to_utf16le(utf32_input.data(), utf32_input.size(),
3242
0
                                  utf16_output.data());
3243
0
}
3244
  #endif // SIMDUTF_SPAN
3245
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3246
3247
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
3248
/**
3249
 * Convert possibly broken UTF-32 string into Latin1 string.
3250
 *
3251
 * During the conversion also validation of the input string is done.
3252
 * This function is suitable to work with inputs from untrusted sources.
3253
 *
3254
 * This function is not BOM-aware.
3255
 *
3256
 * @param input         the UTF-32 string to convert
3257
 * @param length        the length of the string in 4-byte code units (char32_t)
3258
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
3259
 * @return number of written code units; 0 if input is not a valid UTF-32 string
3260
 * or if it cannot be represented as Latin1
3261
 */
3262
simdutf_warn_unused size_t convert_utf32_to_latin1(
3263
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
3264
  #if SIMDUTF_SPAN
3265
/**
 * std::span overload. Forwards utf32_input.data()/size() and
 * latin1_output.data() (reinterpreted as char *) to the pointer-based
 * convert_utf32_to_latin1 and returns its result. latin1_output.size() is
 * never read here, so no capacity check is performed.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_utf32_to_latin1(
3266
    std::span<const char32_t> utf32_input,
3267
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
3268
  return convert_utf32_to_latin1(
3269
      utf32_input.data(), utf32_input.size(),
3270
      reinterpret_cast<char *>(latin1_output.data()));
3271
}
3272
  #endif // SIMDUTF_SPAN
3273
3274
/**
3275
 * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
3276
 * If the string cannot be represented as Latin1, an error is returned.
3277
 *
3278
 * During the conversion also validation of the input string is done.
3279
 * This function is suitable to work with inputs from untrusted sources.
3280
 *
3281
 * This function is not BOM-aware.
3282
 *
3283
 * @param input         the UTF-32 string to convert
3284
 * @param length        the length of the string in 4-byte code units (char32_t)
3285
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
3286
 * @return a result pair struct (of type simdutf::result containing the two
3287
 * fields error and count) with an error code and either position of the error
3288
 * (in the input in code units) if any, or the number of char written if
3289
 * successful.
3290
 */
3291
simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
3292
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
3293
  #if SIMDUTF_SPAN
3294
/**
 * std::span overload. Forwards utf32_input.data()/size() and
 * latin1_output.data() (reinterpreted as char *) to the pointer-based
 * convert_utf32_to_latin1_with_errors and returns its result unchanged.
 * latin1_output.size() is never read here, so no capacity check is performed.
 */
simdutf_really_inline simdutf_warn_unused result
3295
convert_utf32_to_latin1_with_errors(
3296
    std::span<const char32_t> utf32_input,
3297
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
3298
  return convert_utf32_to_latin1_with_errors(
3299
      utf32_input.data(), utf32_input.size(),
3300
      reinterpret_cast<char *>(latin1_output.data()));
3301
}
3302
  #endif // SIMDUTF_SPAN
3303
3304
/**
3305
 * Convert valid UTF-32 string into Latin1 string.
3306
 *
3307
 * This function assumes that the input string is valid UTF-32 and that it can
3308
 * be represented as Latin1. If you violate this assumption, the result is
3309
 * implementation defined and may include system-dependent behavior such as
3310
 * crashes.
3311
 *
3312
 * This function is for expert users only and not part of our public API. Use
3313
 * convert_utf32_to_latin1 instead. The function may be removed from the library
3314
 * in the future.
3315
 *
3316
 * This function is not BOM-aware.
3317
 *
3318
 * @param input         the UTF-32 string to convert
3319
 * @param length        the length of the string in 4-byte code units (char32_t)
3320
 * @param latin1_buffer   the pointer to a buffer that can hold the conversion
3321
 * result
3322
 * @return number of written code units; 0 if conversion is not possible
3323
 */
3324
simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
3325
    const char32_t *input, size_t length, char *latin1_buffer) noexcept;
3326
  #if SIMDUTF_SPAN
3327
/**
 * std::span overload. Forwards valid_utf32_input.data()/size() and
 * latin1_output.data() (reinterpreted as char *) to the pointer-based
 * convert_valid_utf32_to_latin1 and returns its result. latin1_output.size()
 * is never read here, so no capacity check is performed.
 */
simdutf_really_inline simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
3328
    std::span<const char32_t> valid_utf32_input,
3329
    detail::output_span_of_byte_like auto &&latin1_output) noexcept {
3330
  return convert_valid_utf32_to_latin1(
3331
      valid_utf32_input.data(), valid_utf32_input.size(),
3332
      reinterpret_cast<char *>(latin1_output.data()));
3333
}
3334
  #endif // SIMDUTF_SPAN
3335
3336
/**
3337
 * Compute the number of bytes that this UTF-32 string would require in Latin1
3338
 * format.
3339
 *
3340
 * This function does not validate the input. It is acceptable to pass invalid
3341
 * UTF-32 strings but in such cases the result is implementation defined.
3342
 *
3343
 * This function is not BOM-aware.
3344
 *
3345
 * @param length        the length of the string in 4-byte code units (char32_t)
3346
 * @return the number of bytes required to encode the UTF-32 string as Latin1
3347
 */
3348
simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) noexcept;
3349
3350
/**
3351
 * Compute the number of 4-byte code units (char32_t) that this Latin1 string
3352
 * would require in UTF-32 format.
3353
 *
3354
 * @param length        the length of the string in Latin1 code units (char)
3355
 * @return the length of the string in 4-byte code units (char32_t) required to
3356
 * encode the Latin1 string as UTF-32
3357
 */
3358
simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) noexcept;
3359
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
3360
3361
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3362
/**
3363
 * Convert possibly broken UTF-32 string into UTF-16BE string.
3364
 *
3365
 * During the conversion also validation of the input string is done.
3366
 * This function is suitable to work with inputs from untrusted sources.
3367
 *
3368
 * This function is not BOM-aware.
3369
 *
3370
 * @param input         the UTF-32 string to convert
3371
 * @param length        the length of the string in 4-byte code units (char32_t)
3372
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3373
 * @return number of written code units; 0 if input is not a valid UTF-32 string
3374
 */
3375
simdutf_warn_unused size_t convert_utf32_to_utf16be(
3376
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3377
  #if SIMDUTF_SPAN
3378
/**
 * std::span overload. Forwards utf32_input.data()/size() and
 * utf16_output.data() to the pointer-based convert_utf32_to_utf16be and
 * returns its result. utf16_output.size() is never read here, so no capacity
 * check is performed.
 */
simdutf_really_inline simdutf_warn_unused size_t
3379
convert_utf32_to_utf16be(std::span<const char32_t> utf32_input,
3380
0
                         std::span<char16_t> utf16_output) noexcept {
3381
0
  return convert_utf32_to_utf16be(utf32_input.data(), utf32_input.size(),
3382
0
                                  utf16_output.data());
3383
0
}
3384
  #endif // SIMDUTF_SPAN
3385
3386
/**
3387
 * Using native endianness, convert possibly broken UTF-32 string into UTF-16
3388
 * string and stop on error.
3389
 *
3390
 * During the conversion also validation of the input string is done.
3391
 * This function is suitable to work with inputs from untrusted sources.
3392
 *
3393
 * This function is not BOM-aware.
3394
 *
3395
 * @param input         the UTF-32 string to convert
3396
 * @param length        the length of the string in 4-byte code units (char32_t)
3397
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3398
 * @return a result pair struct (of type simdutf::result containing the two
3399
 * fields error and count) with an error code and either position of the error
3400
 * (in the input in code units) if any, or the number of char16_t written if
3401
 * successful.
3402
 */
3403
simdutf_warn_unused result convert_utf32_to_utf16_with_errors(
3404
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3405
  #if SIMDUTF_SPAN
3406
/**
 * std::span overload. Forwards utf32_input.data()/size() and
 * utf16_output.data() to the pointer-based convert_utf32_to_utf16_with_errors
 * and returns its result unchanged. utf16_output.size() is never read here, so
 * no capacity check is performed.
 */
simdutf_really_inline simdutf_warn_unused result
3407
convert_utf32_to_utf16_with_errors(std::span<const char32_t> utf32_input,
3408
0
                                   std::span<char16_t> utf16_output) noexcept {
3409
0
  return convert_utf32_to_utf16_with_errors(
3410
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
3411
0
}
3412
  #endif // SIMDUTF_SPAN
3413
3414
/**
3415
 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
3416
 *
3417
 * During the conversion also validation of the input string is done.
3418
 * This function is suitable to work with inputs from untrusted sources.
3419
 *
3420
 * This function is not BOM-aware.
3421
 *
3422
 * @param input         the UTF-32 string to convert
3423
 * @param length        the length of the string in 4-byte code units (char32_t)
3424
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3425
 * @return a result pair struct (of type simdutf::result containing the two
3426
 * fields error and count) with an error code and either position of the error
3427
 * (in the input in code units) if any, or the number of char16_t written if
3428
 * successful.
3429
 */
3430
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
3431
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3432
  #if SIMDUTF_SPAN
3433
/**
 * std::span overload. Forwards utf32_input.data()/size() and
 * utf16_output.data() to the pointer-based
 * convert_utf32_to_utf16le_with_errors and returns its result unchanged.
 * utf16_output.size() is never read here, so no capacity check is performed.
 */
simdutf_really_inline simdutf_warn_unused result
3434
convert_utf32_to_utf16le_with_errors(
3435
    std::span<const char32_t> utf32_input,
3436
0
    std::span<char16_t> utf16_output) noexcept {
3437
0
  return convert_utf32_to_utf16le_with_errors(
3438
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
3439
0
}
3440
  #endif // SIMDUTF_SPAN
3441
3442
/**
3443
 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
3444
 *
3445
 * During the conversion also validation of the input string is done.
3446
 * This function is suitable to work with inputs from untrusted sources.
3447
 *
3448
 * This function is not BOM-aware.
3449
 *
3450
 * @param input         the UTF-32 string to convert
3451
 * @param length        the length of the string in 4-byte code units (char32_t)
3452
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
3453
 * @return a result pair struct (of type simdutf::result containing the two
3454
 * fields error and count) with an error code and either position of the error
3455
 * (in the input in code units) if any, or the number of char16_t written if
3456
 * successful.
3457
 */
3458
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
3459
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3460
  #if SIMDUTF_SPAN
3461
/**
 * std::span overload. Forwards utf32_input.data()/size() and
 * utf16_output.data() to the pointer-based
 * convert_utf32_to_utf16be_with_errors and returns its result unchanged.
 * utf16_output.size() is never read here, so no capacity check is performed.
 */
simdutf_really_inline simdutf_warn_unused result
3462
convert_utf32_to_utf16be_with_errors(
3463
    std::span<const char32_t> utf32_input,
3464
0
    std::span<char16_t> utf16_output) noexcept {
3465
0
  return convert_utf32_to_utf16be_with_errors(
3466
0
      utf32_input.data(), utf32_input.size(), utf16_output.data());
3467
0
}
3468
  #endif // SIMDUTF_SPAN
3469
3470
/**
3471
 * Using native endianness, convert valid UTF-32 string into a UTF-16 string.
3472
 *
3473
 * This function assumes that the input string is valid UTF-32.
3474
 *
3475
 * This function is not BOM-aware.
3476
 *
3477
 * @param input         the UTF-32 string to convert
3478
 * @param length        the length of the string in 4-byte code units (char32_t)
3479
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
3480
 * result
3481
 * @return number of written code units; 0 if conversion is not possible
3482
 */
3483
simdutf_warn_unused size_t convert_valid_utf32_to_utf16(
3484
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3485
  #if SIMDUTF_SPAN
3486
/**
 * std::span overload. Forwards valid_utf32_input.data()/size() and
 * utf16_output.data() to the pointer-based convert_valid_utf32_to_utf16 and
 * returns its result. utf16_output.size() is never read here, so no capacity
 * check is performed.
 */
simdutf_really_inline simdutf_warn_unused size_t
3487
convert_valid_utf32_to_utf16(std::span<const char32_t> valid_utf32_input,
3488
0
                             std::span<char16_t> utf16_output) noexcept {
3489
0
  return convert_valid_utf32_to_utf16(
3490
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
3491
0
}
3492
  #endif // SIMDUTF_SPAN
3493
3494
/**
3495
 * Convert valid UTF-32 string into UTF-16LE string.
3496
 *
3497
 * This function assumes that the input string is valid UTF-32.
3498
 *
3499
 * This function is not BOM-aware.
3500
 *
3501
 * @param input         the UTF-32 string to convert
3502
 * @param length        the length of the string in 4-byte code units (char32_t)
3503
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
3504
 * result
3505
 * @return number of written code units; 0 if conversion is not possible
3506
 */
3507
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
3508
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3509
  #if SIMDUTF_SPAN
3510
/**
 * std::span overload. Forwards valid_utf32_input.data()/size() and
 * utf16_output.data() to the pointer-based convert_valid_utf32_to_utf16le and
 * returns its result. utf16_output.size() is never read here, so no capacity
 * check is performed.
 */
simdutf_really_inline simdutf_warn_unused size_t
3511
convert_valid_utf32_to_utf16le(std::span<const char32_t> valid_utf32_input,
3512
0
                               std::span<char16_t> utf16_output) noexcept {
3513
0
  return convert_valid_utf32_to_utf16le(
3514
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
3515
0
}
3516
  #endif // SIMDUTF_SPAN
3517
3518
/**
3519
 * Convert valid UTF-32 string into UTF-16BE string.
3520
 *
3521
 * This function assumes that the input string is valid UTF-32.
3522
 *
3523
 * This function is not BOM-aware.
3524
 *
3525
 * @param input         the UTF-32 string to convert
3526
 * @param length        the length of the string in 4-byte code units (char32_t)
3527
 * @param utf16_buffer   the pointer to a buffer that can hold the conversion
3528
 * result
3529
 * @return number of written code units; 0 if conversion is not possible
3530
 */
3531
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
3532
    const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3533
  #if SIMDUTF_SPAN
3534
/**
 * std::span overload. Forwards valid_utf32_input.data()/size() and
 * utf16_output.data() to the pointer-based convert_valid_utf32_to_utf16be and
 * returns its result. utf16_output.size() is never read here, so no capacity
 * check is performed.
 */
simdutf_really_inline simdutf_warn_unused size_t
3535
convert_valid_utf32_to_utf16be(std::span<const char32_t> valid_utf32_input,
3536
0
                               std::span<char16_t> utf16_output) noexcept {
3537
0
  return convert_valid_utf32_to_utf16be(
3538
0
      valid_utf32_input.data(), valid_utf32_input.size(), utf16_output.data());
3539
0
}
3540
  #endif // SIMDUTF_SPAN
3541
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3542
3543
#if SIMDUTF_FEATURE_UTF16
3544
/**
3545
 * Change the endianness of the input. Can be used to go from UTF-16LE to
3546
 * UTF-16BE or from UTF-16BE to UTF-16LE.
3547
 *
3548
 * This function does not validate the input.
3549
 *
3550
 * This function is not BOM-aware.
3551
 *
3552
 * @param input         the UTF-16 string to process
3553
 * @param length        the length of the string in 2-byte code units (char16_t)
3554
 * @param output        the pointer to a buffer that can hold the conversion
3555
 * result
3556
 */
3557
void change_endianness_utf16(const char16_t *input, size_t length,
3558
                             char16_t *output) noexcept;
3559
  #if SIMDUTF_SPAN
3560
simdutf_really_inline void
3561
change_endianness_utf16(std::span<const char16_t> utf16_input,
3562
0
                        std::span<char16_t> utf16_output) noexcept {
3563
0
  return change_endianness_utf16(utf16_input.data(), utf16_input.size(),
3564
0
                                 utf16_output.data());
3565
0
}
3566
  #endif // SIMDUTF_SPAN
3567
#endif   // SIMDUTF_FEATURE_UTF16
3568
3569
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3570
/**
3571
 * Compute the number of bytes that this UTF-32 string would require in UTF-8
3572
 * format.
3573
 *
3574
 * This function does not validate the input. It is acceptable to pass invalid
3575
 * UTF-32 strings but in such cases the result is implementation defined.
3576
 *
3577
 * @param input         the UTF-32 string to convert
3578
 * @param length        the length of the string in 4-byte code units (char32_t)
3579
 * @return the number of bytes required to encode the UTF-32 string as UTF-8
3580
 */
3581
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input,
3582
                                                  size_t length) noexcept;
3583
  #if SIMDUTF_SPAN
3584
simdutf_really_inline simdutf_warn_unused size_t
3585
0
utf8_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
3586
0
  return utf8_length_from_utf32(valid_utf32_input.data(),
3587
0
                                valid_utf32_input.size());
3588
0
}
3589
  #endif // SIMDUTF_SPAN
3590
#endif   // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3591
3592
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3593
/**
3594
 * Compute the number of two-byte code units that this UTF-32 string would
3595
 * require in UTF-16 format.
3596
 *
3597
 * This function does not validate the input. It is acceptable to pass invalid
3598
 * UTF-32 strings but in such cases the result is implementation defined.
3599
 *
3600
 * @param input         the UTF-32 string to convert
3601
 * @param length        the length of the string in 4-byte code units (char32_t)
3602
 * @return the number of two-byte code units required to encode the UTF-32 string as UTF-16
3603
 */
3604
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input,
3605
                                                   size_t length) noexcept;
3606
  #if SIMDUTF_SPAN
3607
simdutf_really_inline simdutf_warn_unused size_t
3608
0
utf16_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
3609
0
  return utf16_length_from_utf32(valid_utf32_input.data(),
3610
0
                                 valid_utf32_input.size());
3611
0
}
3612
  #endif // SIMDUTF_SPAN
3613
3614
/**
3615
 * Using native endianness, compute the number of 4-byte code units that this UTF-16
3616
 * string would require in UTF-32 format.
3617
 *
3618
 * This function is equivalent to count_utf16.
3619
 *
3620
 * This function does not validate the input. It is acceptable to pass invalid
3621
 * UTF-16 strings but in such cases the result is implementation defined.
3622
 *
3623
 * This function is not BOM-aware.
3624
 *
3625
 * @param input         the UTF-16 string to convert
3626
 * @param length        the length of the string in 2-byte code units (char16_t)
3627
 * @return the number of 4-byte code units required to encode the UTF-16 string as UTF-32
3628
 */
3629
simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input,
3630
                                                   size_t length) noexcept;
3631
  #if SIMDUTF_SPAN
3632
simdutf_really_inline simdutf_warn_unused size_t
3633
0
utf32_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
3634
0
  return utf32_length_from_utf16(valid_utf16_input.data(),
3635
0
                                 valid_utf16_input.size());
3636
0
}
3637
  #endif // SIMDUTF_SPAN
3638
3639
/**
3640
 * Compute the number of 4-byte code units that this UTF-16LE string would require in UTF-32
3641
 * format.
3642
 *
3643
 * This function is equivalent to count_utf16le.
3644
 *
3645
 * This function does not validate the input. It is acceptable to pass invalid
3646
 * UTF-16 strings but in such cases the result is implementation defined.
3647
 *
3648
 * This function is not BOM-aware.
3649
 *
3650
 * @param input         the UTF-16LE string to convert
3651
 * @param length        the length of the string in 2-byte code units (char16_t)
3652
 * @return the number of 4-byte code units required to encode the UTF-16LE string as UTF-32
3653
 */
3654
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input,
3655
                                                     size_t length) noexcept;
3656
  #if SIMDUTF_SPAN
3657
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf16le(
3658
0
    std::span<const char16_t> valid_utf16_input) noexcept {
3659
0
  return utf32_length_from_utf16le(valid_utf16_input.data(),
3660
0
                                   valid_utf16_input.size());
3661
0
}
3662
  #endif // SIMDUTF_SPAN
3663
3664
/**
3665
 * Compute the number of 4-byte code units that this UTF-16BE string would require in UTF-32
3666
 * format.
3667
 *
3668
 * This function is equivalent to count_utf16be.
3669
 *
3670
 * This function does not validate the input. It is acceptable to pass invalid
3671
 * UTF-16 strings but in such cases the result is implementation defined.
3672
 *
3673
 * This function is not BOM-aware.
3674
 *
3675
 * @param input         the UTF-16BE string to convert
3676
 * @param length        the length of the string in 2-byte code units (char16_t)
3677
 * @return the number of 4-byte code units required to encode the UTF-16BE string as UTF-32
3678
 */
3679
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input,
3680
                                                     size_t length) noexcept;
3681
  #if SIMDUTF_SPAN
3682
simdutf_really_inline simdutf_warn_unused size_t utf32_length_from_utf16be(
3683
0
    std::span<const char16_t> valid_utf16_input) noexcept {
3684
0
  return utf32_length_from_utf16be(valid_utf16_input.data(),
3685
0
                                   valid_utf16_input.size());
3686
0
}
3687
  #endif // SIMDUTF_SPAN
3688
#endif   // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3689
3690
#if SIMDUTF_FEATURE_UTF16
3691
/**
3692
 * Count the number of code points (characters) in the string assuming that
3693
 * it is valid.
3694
 *
3695
 * This function assumes that the input string is valid UTF-16 (native
3696
 * endianness). It is acceptable to pass invalid UTF-16 strings but in such
3697
 * cases the result is implementation defined.
3698
 *
3699
 * This function is not BOM-aware.
3700
 *
3701
 * @param input         the UTF-16 string to process
3702
 * @param length        the length of the string in 2-byte code units (char16_t)
3703
 * @return number of code points
3704
 */
3705
simdutf_warn_unused size_t count_utf16(const char16_t *input,
3706
                                       size_t length) noexcept;
3707
  #if SIMDUTF_SPAN
3708
simdutf_really_inline simdutf_warn_unused size_t
3709
0
count_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
3710
0
  return count_utf16(valid_utf16_input.data(), valid_utf16_input.size());
3711
0
}
3712
  #endif // SIMDUTF_SPAN
3713
3714
/**
3715
 * Count the number of code points (characters) in the string assuming that
3716
 * it is valid.
3717
 *
3718
 * This function assumes that the input string is valid UTF-16LE.
3719
 * It is acceptable to pass invalid UTF-16 strings but in such cases
3720
 * the result is implementation defined.
3721
 *
3722
 * This function is not BOM-aware.
3723
 *
3724
 * @param input         the UTF-16LE string to process
3725
 * @param length        the length of the string in 2-byte code units (char16_t)
3726
 * @return number of code points
3727
 */
3728
simdutf_warn_unused size_t count_utf16le(const char16_t *input,
3729
                                         size_t length) noexcept;
3730
  #if SIMDUTF_SPAN
3731
simdutf_really_inline simdutf_warn_unused size_t
3732
0
count_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
3733
0
  return count_utf16le(valid_utf16_input.data(), valid_utf16_input.size());
3734
0
}
3735
  #endif // SIMDUTF_SPAN
3736
3737
/**
3738
 * Count the number of code points (characters) in the string assuming that
3739
 * it is valid.
3740
 *
3741
 * This function assumes that the input string is valid UTF-16BE.
3742
 * It is acceptable to pass invalid UTF-16 strings but in such cases
3743
 * the result is implementation defined.
3744
 *
3745
 * This function is not BOM-aware.
3746
 *
3747
 * @param input         the UTF-16BE string to process
3748
 * @param length        the length of the string in 2-byte code units (char16_t)
3749
 * @return number of code points
3750
 */
3751
simdutf_warn_unused size_t count_utf16be(const char16_t *input,
3752
                                         size_t length) noexcept;
3753
  #if SIMDUTF_SPAN
3754
simdutf_really_inline simdutf_warn_unused size_t
3755
0
count_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
3756
0
  return count_utf16be(valid_utf16_input.data(), valid_utf16_input.size());
3757
0
}
3758
  #endif // SIMDUTF_SPAN
3759
#endif   // SIMDUTF_FEATURE_UTF16
3760
3761
#if SIMDUTF_FEATURE_UTF8
3762
/**
3763
 * Count the number of code points (characters) in the string assuming that
3764
 * it is valid.
3765
 *
3766
 * This function assumes that the input string is valid UTF-8.
3767
 * It is acceptable to pass invalid UTF-8 strings but in such cases
3768
 * the result is implementation defined.
3769
 *
3770
 * @param input         the UTF-8 string to process
3771
 * @param length        the length of the string in bytes
3772
 * @return number of code points
3773
 */
3774
simdutf_warn_unused size_t count_utf8(const char *input,
3775
                                      size_t length) noexcept;
3776
  #if SIMDUTF_SPAN
3777
simdutf_really_inline simdutf_warn_unused size_t count_utf8(
3778
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
3779
  return count_utf8(reinterpret_cast<const char *>(valid_utf8_input.data()),
3780
                    valid_utf8_input.size());
3781
}
3782
  #endif // SIMDUTF_SPAN
3783
3784
/**
3785
 * Given a valid UTF-8 string having a possibly truncated last character,
3786
 * this function checks the end of string. If the last character is truncated
3787
 * (or partial), then it returns a shorter length (shorter by 1 to 3 bytes) so
3788
 * that the short UTF-8 strings only contain complete characters. If there is no
3789
 * truncated character, the original length is returned.
3790
 *
3791
 * This function assumes that the input string is valid UTF-8, but possibly
3792
 * truncated.
3793
 *
3794
 * @param input         the UTF-8 string to process
3795
 * @param length        the length of the string in bytes
3796
 * @return the length of the string in bytes, possibly shorter by 1 to 3 bytes
3797
 */
3798
simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length);
3799
  #if SIMDUTF_SPAN
3800
simdutf_really_inline simdutf_warn_unused size_t trim_partial_utf8(
3801
    const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
3802
  return trim_partial_utf8(
3803
      reinterpret_cast<const char *>(valid_utf8_input.data()),
3804
      valid_utf8_input.size());
3805
}
3806
  #endif // SIMDUTF_SPAN
3807
#endif   // SIMDUTF_FEATURE_UTF8
3808
3809
#if SIMDUTF_FEATURE_UTF16
3810
/**
3811
 * Given a valid UTF-16BE string having a possibly truncated last character,
3812
 * this function checks the end of string. If the last character is truncated
3813
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
3814
 * the short UTF-16BE strings only contain complete characters. If there is no
3815
 * truncated character, the original length is returned.
3816
 *
3817
 * This function assumes that the input string is valid UTF-16BE, but possibly
3818
 * truncated.
3819
 *
3820
 * @param input         the UTF-16BE string to process
3821
 * @param length        the length of the string in 2-byte code units (char16_t)
3822
 * @return the length of the string in 2-byte code units, possibly shorter by 1 unit
3823
 */
3824
simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input,
3825
                                                size_t length);
3826
  #if SIMDUTF_SPAN
3827
simdutf_really_inline simdutf_warn_unused size_t
3828
0
trim_partial_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
3829
0
  return trim_partial_utf16be(valid_utf16_input.data(),
3830
0
                              valid_utf16_input.size());
3831
0
}
3832
  #endif // SIMDUTF_SPAN
3833
3834
/**
3835
 * Given a valid UTF-16LE string having a possibly truncated last character,
3836
 * this function checks the end of string. If the last character is truncated
3837
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
3838
 * the short UTF-16LE strings only contain complete characters. If there is no
3839
 * truncated character, the original length is returned.
3840
 *
3841
 * This function assumes that the input string is valid UTF-16LE, but possibly
3842
 * truncated.
3843
 *
3844
 * @param input         the UTF-16LE string to process
3845
 * @param length        the length of the string in 2-byte code units (char16_t)
3846
 * @return the length of the string in 2-byte code units, possibly shorter by 1 unit
3847
 */
3848
simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input,
3849
                                                size_t length);
3850
  #if SIMDUTF_SPAN
3851
simdutf_really_inline simdutf_warn_unused size_t
3852
0
trim_partial_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
3853
0
  return trim_partial_utf16le(valid_utf16_input.data(),
3854
0
                              valid_utf16_input.size());
3855
0
}
3856
  #endif // SIMDUTF_SPAN
3857
3858
/**
3859
 * Given a valid UTF-16 string having a possibly truncated last character,
3860
 * this function checks the end of string. If the last character is truncated
3861
 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
3862
 * the short UTF-16 strings only contain complete characters. If there is no
3863
 * truncated character, the original length is returned.
3864
 *
3865
 * This function assumes that the input string is valid UTF-16, but possibly
3866
 * truncated. We use the native endianness.
3867
 *
3868
 * @param input         the UTF-16 string to process
3869
 * @param length        the length of the string in 2-byte code units (char16_t)
3870
 * @return the length of the string in 2-byte code units, possibly shorter by 1 unit
3871
 */
3872
simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input,
3873
                                              size_t length);
3874
  #if SIMDUTF_SPAN
3875
simdutf_really_inline simdutf_warn_unused size_t
3876
0
trim_partial_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
3877
0
  return trim_partial_utf16(valid_utf16_input.data(), valid_utf16_input.size());
3878
0
}
3879
  #endif // SIMDUTF_SPAN
3880
#endif   // SIMDUTF_FEATURE_UTF16
3881
3882
#if SIMDUTF_FEATURE_BASE64
3883
  #ifndef SIMDUTF_NEED_TRAILING_ZEROES
3884
    #define SIMDUTF_NEED_TRAILING_ZEROES 1
3885
  #endif
3886
// base64_options are used to specify the base64 encoding options.
3887
// ASCII spaces are ' ', '\t', '\n', '\r', '\f'
3888
// garbage characters are characters that are not part of the base64 alphabet
3889
// nor ASCII spaces.
3890
constexpr uint64_t base64_reverse_padding =
3891
    2; /* modifier for base64_default and base64_url */
3892
enum base64_options : uint64_t {
3893
  base64_default = 0, /* standard base64 format (with padding) */
3894
  base64_url = 1,     /* base64url format (no padding) */
3895
  base64_default_no_padding =
3896
      base64_default |
3897
      base64_reverse_padding, /* standard base64 format without padding */
3898
  base64_url_with_padding =
3899
      base64_url | base64_reverse_padding, /* base64url with padding */
3900
  base64_default_accept_garbage =
3901
      4, /* standard base64 format accepting garbage characters, the input stops
3902
            with the first '=' if any */
3903
  base64_url_accept_garbage =
3904
      5, /* base64url format accepting garbage characters, the input stops with
3905
            the first '=' if any */
3906
  base64_default_or_url =
3907
      8, /* standard/base64url hybrid format (only meaningful for decoding!) */
3908
  base64_default_or_url_accept_garbage =
3909
      12, /* standard/base64url hybrid format accepting garbage characters
3910
             (only meaningful for decoding!), the input stops with the first '='
3911
             if any */
3912
};
3913
3914
  #if SIMDUTF_CPLUSPLUS17
3915
0
inline std::string_view to_string(base64_options options) {
3916
0
  switch (options) {
3917
0
  case base64_default:
3918
0
    return "base64_default";
3919
0
  case base64_url:
3920
0
    return "base64_url";
3921
0
  case base64_reverse_padding:
3922
0
    return "base64_reverse_padding";
3923
0
  case base64_url_with_padding:
3924
0
    return "base64_url_with_padding";
3925
0
  case base64_default_accept_garbage:
3926
0
    return "base64_default_accept_garbage";
3927
0
  case base64_url_accept_garbage:
3928
0
    return "base64_url_accept_garbage";
3929
0
  case base64_default_or_url:
3930
0
    return "base64_default_or_url";
3931
0
  case base64_default_or_url_accept_garbage:
3932
0
    return "base64_default_or_url_accept_garbage";
3933
0
  }
3934
0
  return "<unknown>";
3935
0
}
3936
  #endif // SIMDUTF_CPLUSPLUS17
3937
3938
// last_chunk_handling_options are used to specify the handling of the last
3939
// chunk in base64 decoding.
3940
// https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
3941
enum last_chunk_handling_options : uint64_t {
3942
  loose = 0,  /* standard base64 format, decode partial final chunk */
3943
  strict = 1, /* error when the last chunk is partial, 2 or 3 chars, and
3944
                 unpadded, or non-zero bit padding */
3945
  stop_before_partial =
3946
      2, /* if the last chunk is partial, ignore it (no error) */
3947
  only_full_chunks =
3948
      3 /* only decode full blocks (4 base64 characters, no padding) */
3949
};
3950
3951
0
inline bool is_partial(last_chunk_handling_options options) {
3952
0
  return (options == stop_before_partial) || (options == only_full_chunks);
3953
0
}
3954
3955
  #if SIMDUTF_CPLUSPLUS17
3956
0
inline std::string_view to_string(last_chunk_handling_options options) {
3957
0
  switch (options) {
3958
0
  case loose:
3959
0
    return "loose";
3960
0
  case strict:
3961
0
    return "strict";
3962
0
  case stop_before_partial:
3963
0
    return "stop_before_partial";
3964
0
  case only_full_chunks:
3965
0
    return "only_full_chunks";
3966
0
  }
3967
0
  return "<unknown>";
3968
0
}
3969
  #endif
3970
3971
/**
3972
 * Provide the maximal binary length in bytes given the base64 input.
3973
 * In general, if the input contains ASCII spaces, the result will be less than
3974
 * the maximum length.
3975
 *
3976
 * @param input         the base64 input to process
3977
 * @param length        the length of the base64 input in bytes
3978
 * @return maximum number of binary bytes
3979
 */
3980
simdutf_warn_unused size_t
3981
maximal_binary_length_from_base64(const char *input, size_t length) noexcept;
3982
  #if SIMDUTF_SPAN
3983
simdutf_really_inline simdutf_warn_unused size_t
3984
maximal_binary_length_from_base64(
3985
    const detail::input_span_of_byte_like auto &input) noexcept {
3986
  return maximal_binary_length_from_base64(
3987
      reinterpret_cast<const char *>(input.data()), input.size());
3988
}
3989
  #endif // SIMDUTF_SPAN
3990
3991
/**
3992
 * Provide the maximal binary length in bytes given the base64 input.
3993
 * In general, if the input contains ASCII spaces, the result will be less than
3994
 * the maximum length.
3995
 *
3996
 * @param input         the base64 input to process, in ASCII stored as 16-bit
3997
 * units
3998
 * @param length        the length of the base64 input in 16-bit units
3999
 * @return maximal number of binary bytes
4000
 */
4001
simdutf_warn_unused size_t maximal_binary_length_from_base64(
4002
    const char16_t *input, size_t length) noexcept;
4003
  #if SIMDUTF_SPAN
4004
simdutf_really_inline simdutf_warn_unused size_t
4005
0
maximal_binary_length_from_base64(std::span<const char16_t> input) noexcept {
4006
0
  return maximal_binary_length_from_base64(input.data(), input.size());
4007
0
}
4008
  #endif // SIMDUTF_SPAN
4009
4010
/**
4011
 * Convert a base64 input to a binary output.
4012
 *
4013
 * This function follows the WHATWG forgiving-base64 format, which means that it
4014
 * will ignore any ASCII spaces in the input. You may provide a padded input
4015
 * (with one or two equal signs at the end) or an unpadded input (without any
4016
 * equal signs at the end).
4017
 *
4018
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
4019
 *
4020
 * This function will fail in case of invalid input. When last_chunk_options =
4021
 * loose, there are two possible reasons for failure: the input contains a
4022
 * number of base64 characters that when divided by 4, leaves a single remainder
4023
 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
4024
 * not a valid base64 character (INVALID_BASE64_CHARACTER).
4025
 *
4026
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
4027
 * input where the invalid character was found. When the error is
4028
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
4029
 *
4030
 * The default option (simdutf::base64_default) expects the characters `+` and
4031
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
4032
 * characters `-` and `_` as part of its alphabet.
4033
 *
4034
 * The padding (`=`) is validated if present. There may be at most two padding
4035
 * characters at the end of the input. If there are any padding characters, the
4036
 * total number of characters (excluding spaces but including padding
4037
 * characters) must be divisible by four.
4038
 *
4039
 * You should call this function with a buffer that is at least
4040
 * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
4041
 * provide that much space, the function may cause a buffer overflow.
4042
 *
4043
 * Advanced users may want to tailor how the last chunk is handled. By default,
4044
 * we use a loose (forgiving) approach but we also support a strict approach
4045
 * as well as a stop_before_partial approach, as per the following proposal:
4046
 *
4047
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
4048
 *
4049
 * @param input         the base64 string to process
4050
 * @param length        the length of the string in bytes
4051
 * @param output        the pointer to a buffer that can hold the conversion
4052
 * result (should be at least maximal_binary_length_from_base64(input, length)
4053
 * bytes long).
4054
 * @param options       the base64 options to use, usually base64_default or
4055
 * base64_url, and base64_default by default.
4056
 * @param last_chunk_options the last chunk handling options,
4057
 * last_chunk_handling_options::loose by default
4058
 * but can also be last_chunk_handling_options::strict or
4059
 * last_chunk_handling_options::stop_before_partial.
4060
 * @return a result pair struct (of type simdutf::result containing the two
4061
 * fields error and count) with an error code and either position of the error
4062
 * (in the input in bytes) if any, or the number of bytes written if successful.
4063
 */
4064
simdutf_warn_unused result base64_to_binary(
4065
    const char *input, size_t length, char *output,
4066
    base64_options options = base64_default,
4067
    last_chunk_handling_options last_chunk_options = loose) noexcept;
4068
  #if SIMDUTF_SPAN
4069
simdutf_really_inline simdutf_warn_unused result base64_to_binary(
4070
    const detail::input_span_of_byte_like auto &input,
4071
    detail::output_span_of_byte_like auto &&binary_output,
4072
    base64_options options = base64_default,
4073
    last_chunk_handling_options last_chunk_options = loose) noexcept {
4074
  return base64_to_binary(reinterpret_cast<const char *>(input.data()),
4075
                          input.size(),
4076
                          reinterpret_cast<char *>(binary_output.data()),
4077
                          options, last_chunk_options);
4078
}
4079
  #endif // SIMDUTF_SPAN
4080
4081
/**
4082
 * Provide the base64 length in bytes given the length of a binary input.
4083
 *
4084
 * @param length        the length of the input in bytes
4085
 * @return number of base64 bytes
4086
 */
4087
simdutf_warn_unused size_t base64_length_from_binary(
4088
    size_t length, base64_options options = base64_default) noexcept;
4089
4090
/**
4091
 * Convert a binary input to a base64 output.
4092
 *
4093
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
4094
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
4095
 * output to ensure that the output length is a multiple of four.
4096
 *
4097
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
4098
 * of its alphabet. No padding is added at the end of the output.
4099
 *
4100
 * This function always succeeds.
4101
 *
4102
 * @param input         the binary to process
4103
 * @param length        the length of the input in bytes
4104
 * @param output        the pointer to a buffer that can hold the conversion
4105
 * result (should be at least base64_length_from_binary(length) bytes long)
4106
 * @param options       the base64 options to use, can be base64_default or
4107
 * base64_url, is base64_default by default.
4108
 * @return number of written bytes, will be equal to
4109
 * base64_length_from_binary(length, options)
4110
 */
4111
size_t binary_to_base64(const char *input, size_t length, char *output,
4112
                        base64_options options = base64_default) noexcept;
4113
  #if SIMDUTF_SPAN
4114
simdutf_really_inline simdutf_warn_unused size_t
4115
binary_to_base64(const detail::input_span_of_byte_like auto &input,
4116
                 detail::output_span_of_byte_like auto &&binary_output,
4117
                 base64_options options = base64_default) noexcept {
4118
  return binary_to_base64(
4119
      reinterpret_cast<const char *>(input.data()), input.size(),
4120
      reinterpret_cast<char *>(binary_output.data()), options);
4121
}
4122
  #endif // SIMDUTF_SPAN
4123
4124
  #if SIMDUTF_ATOMIC_REF
4125
/**
4126
 * Convert a binary input to a base64 output, using atomic accesses.
4127
 * This function comes with a potentially significant performance
4128
 * penalty, but it may be useful in some cases where the input
4129
 * buffers are shared between threads, to avoid undefined
4130
 * behavior in case of data races.
4131
 *
4132
 * The function is for advanced users. Its main use case is
4133
 * to silence sanitizer warnings. We have no documented use case
4134
 * where this function is actually necessary in terms of practical correctness.
4135
 *
4136
 * This function is only available when simdutf is compiled with
4137
 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
4138
 * the availability of this function by checking the macro
4139
 * SIMDUTF_ATOMIC_REF.
4140
 *
4141
 * The default option (simdutf::base64_default) uses the characters `+` and `/`
4142
 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
4143
 * output to ensure that the output length is a multiple of four.
4144
 *
4145
 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
4146
 * of its alphabet. No padding is added at the end of the output.
4147
 *
4148
 * This function always succeeds.
4149
 *
4150
 * This function is considered experimental. It is not tested by default
4151
 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
4152
 * It is not documented in the public API documentation (README). It is
4153
 * offered on a best effort basis. We rely on the community for further
4154
 * testing and feedback.
4155
 *
4156
 * @brief atomic_binary_to_base64
4157
 * @param input         the binary to process
4158
 * @param length        the length of the input in bytes
4159
 * @param output        the pointer to a buffer that can hold the conversion
4160
 * result (should be at least base64_length_from_binary(length) bytes long)
4161
 * @param options       the base64 options to use, can be base64_default or
4162
 * base64_url, is base64_default by default.
4163
 * @return number of written bytes, will be equal to
4164
 * base64_length_from_binary(length, options)
4165
 */
4166
size_t
4167
atomic_binary_to_base64(const char *input, size_t length, char *output,
4168
                        base64_options options = base64_default) noexcept;
4169
    #if SIMDUTF_SPAN
4170
simdutf_really_inline simdutf_warn_unused size_t
4171
atomic_binary_to_base64(const detail::input_span_of_byte_like auto &input,
4172
                        detail::output_span_of_byte_like auto &&binary_output,
4173
                        base64_options options = base64_default) noexcept {
4174
  return atomic_binary_to_base64(
4175
      reinterpret_cast<const char *>(input.data()), input.size(),
4176
      reinterpret_cast<char *>(binary_output.data()), options);
4177
}
4178
    #endif // SIMDUTF_SPAN
4179
  #endif   // SIMDUTF_ATOMIC_REF
4180
4181
/**
4182
 * Convert a base64 input to a binary output.
4183
 *
4184
 * This function follows the WHATWG forgiving-base64 format, which means that it
4185
 * will ignore any ASCII spaces in the input. You may provide a padded input
4186
 * (with one or two equal signs at the end) or an unpadded input (without any
4187
 * equal signs at the end).
4188
 *
4189
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
4190
 *
4191
 * This function will fail in case of invalid input. When last_chunk_options =
4192
 * loose, there are two possible reasons for failure: the input contains a
4193
 * number of base64 characters that when divided by 4, leaves a single remainder
4194
 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
4195
 * not a valid base64 character (INVALID_BASE64_CHARACTER).
4196
 *
4197
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
4198
 * input where the invalid character was found. When the error is
4199
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
4200
 *
4201
 * The default option (simdutf::base64_default) expects the characters `+` and
4202
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
4203
 * characters `-` and `_` as part of its alphabet.
4204
 *
4205
 * The padding (`=`) is validated if present. There may be at most two padding
4206
 * characters at the end of the input. If there are any padding characters, the
4207
 * total number of characters (excluding spaces but including padding
4208
 * characters) must be divisible by four.
4209
 *
4210
 * You should call this function with a buffer that is at least
4211
 * maximal_binary_length_from_base64(input, length) bytes long. If you fail
4212
 * to provide that much space, the function may cause a buffer overflow.
4213
 *
4214
 * Advanced users may want to tailor how the last chunk is handled. By default,
4215
 * we use a loose (forgiving) approach but we also support a strict approach
4216
 * as well as a stop_before_partial approach, as per the following proposal:
4217
 *
4218
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
4219
 *
4220
 * @param input         the base64 string to process, in ASCII stored as 16-bit
4221
 * units
4222
 * @param length        the length of the string in 16-bit units
4223
 * @param output        the pointer to a buffer that can hold the conversion
4224
 * result (should be at least maximal_binary_length_from_base64(input, length)
4225
 * bytes long).
4226
 * @param options       the base64 options to use, can be base64_default or
4227
 * base64_url, is base64_default by default.
4228
 * @param last_chunk_options the last chunk handling options,
4229
 * last_chunk_handling_options::loose by default
4230
 * but can also be last_chunk_handling_options::strict or
4231
 * last_chunk_handling_options::stop_before_partial.
4232
 * @return a result pair struct (of type simdutf::result containing the two
4233
 * fields error and count) with an error code and position of the
4234
 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
4235
 * of bytes written if successful.
4236
 */
4237
simdutf_warn_unused result
4238
base64_to_binary(const char16_t *input, size_t length, char *output,
4239
                 base64_options options = base64_default,
4240
                 last_chunk_handling_options last_chunk_options =
4241
                     last_chunk_handling_options::loose) noexcept;
4242
  #if SIMDUTF_SPAN
4243
/**
 * @brief span overload
 *
 * Forwards input.data()/input.size() and the start of binary_output to the
 * pointer-based base64_to_binary overload for 16-bit input. The size of
 * binary_output is not passed along, so the caller remains responsible for
 * providing a large enough output span.
 * @return the result of the pointer-based overload
 */
simdutf_really_inline simdutf_warn_unused result base64_to_binary(
4244
    std::span<const char16_t> input,
4245
    detail::output_span_of_byte_like auto &&binary_output,
4246
    base64_options options = base64_default,
4247
    last_chunk_handling_options last_chunk_options = loose) noexcept {
4248
  return base64_to_binary(input.data(), input.size(),
4249
                          reinterpret_cast<char *>(binary_output.data()),
4250
                          options, last_chunk_options);
4251
}
4252
  #endif // SIMDUTF_SPAN
4253
4254
/**
4255
 * Check if a character is an ignorable base64 character.
4256
 * Checking a large input, character by character, is not computationally
4257
 * efficient.
4258
 *
4259
 * @param input         the character to check
4260
 * @param options       the base64 options to use, is base64_default by default.
4261
 * @return true if the character is an ignorable base64 character, false
4262
 * otherwise.
4263
 */
4264
simdutf_warn_unused bool
4265
base64_ignorable(char input, base64_options options = base64_default) noexcept;
4266
simdutf_warn_unused bool
4267
base64_ignorable(char16_t input,
4268
                 base64_options options = base64_default) noexcept;
4269
4270
/**
4271
 * Check if a character is a valid base64 character.
4272
 * Checking a large input, character by character, is not computationally
4273
 * efficient.
4274
 * Note that padding characters are not considered valid base64 characters in
4275
 * this context, nor are spaces.
4276
 *
4277
 * @param input         the character to check
4278
 * @param options       the base64 options to use, is base64_default by default.
4279
 * @return true if the character is a base64 character, false otherwise.
4280
 */
4281
simdutf_warn_unused bool
4282
base64_valid(char input, base64_options options = base64_default) noexcept;
4283
simdutf_warn_unused bool
4284
base64_valid(char16_t input, base64_options options = base64_default) noexcept;
4285
4286
/**
4287
 * Check if a character is a valid base64 character or the padding character
4288
 * ('='). Checking a large input, character by character, is not computationally
4289
 * efficient.
4290
 *
4291
 * @param input         the character to check
4292
 * @param options       the base64 options to use, is base64_default by default.
4293
 * @return true if it is a base64 or padding ('=') character, false otherwise.
4294
 */
4295
simdutf_warn_unused bool
4296
base64_valid_or_padding(char input,
4297
                        base64_options options = base64_default) noexcept;
4298
simdutf_warn_unused bool
4299
base64_valid_or_padding(char16_t input,
4300
                        base64_options options = base64_default) noexcept;
4301
4302
/**
4303
 * Convert a base64 input to a binary output.
4304
 *
4305
 * This function follows the WHATWG forgiving-base64 format, which means that it
4306
 * will ignore any ASCII spaces in the input. You may provide a padded input
4307
 * (with one or two equal signs at the end) or an unpadded input (without any
4308
 * equal signs at the end).
4309
 *
4310
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
4311
 *
4312
 * This function will fail in case of invalid input. When last_chunk_options =
4313
 * loose, there are three possible reasons for failure: the input contains a
4314
 * number of base64 characters that when divided by 4, leaves a single remainder
4315
 * character (BASE64_INPUT_REMAINDER), the input contains a character that is
4316
 * not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer
4317
 * is too small (OUTPUT_BUFFER_TOO_SMALL).
4318
 *
4319
 * When OUTPUT_BUFFER_TOO_SMALL, we return both the number of bytes written
4320
 * and the number of units processed, see description of the parameters and
4321
 * returned value.
4322
 *
4323
 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
4324
 * input where the invalid character was found. When the error is
4325
 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
4326
 *
4327
 * The default option (simdutf::base64_default) expects the characters `+` and
4328
 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
4329
 * characters `-` and `_` as part of its alphabet.
4330
 *
4331
 * The padding (`=`) is validated if present. There may be at most two padding
4332
 * characters at the end of the input. If there are any padding characters, the
4333
 * total number of characters (excluding spaces but including padding
4334
 * characters) must be divisible by four.
4335
 *
4336
 * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected
4337
 * to discard the output unless the parameter decode_up_to_bad_char is set to
4338
 * true. In that case, the function will decode up to the first invalid
4339
 * character. Extra padding characters ('=') are considered invalid characters.
4340
 *
4341
 * Advanced users may want to tailor how the last chunk is handled. By default,
4342
 * we use a loose (forgiving) approach but we also support a strict approach
4343
 * as well as a stop_before_partial approach, as per the following proposal:
4344
 *
4345
 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
4346
 *
4347
 * @param input         the base64 string to process, in ASCII stored as 8-bit
4348
 * or 16-bit units
4349
 * @param length        the length of the string in 8-bit or 16-bit units.
4350
 * @param output        the pointer to a buffer that can hold the conversion
4351
 * result.
4352
 * @param outlen        the number of bytes that can be written in the output
4353
 * buffer. Upon return, it is modified to reflect how many bytes were written.
4354
 * @param options       the base64 options to use, can be base64_default or
4355
 * base64_url, is base64_default by default.
4356
 * @param last_chunk_options the last chunk handling options,
4357
 * last_chunk_handling_options::loose by default
4358
 * but can also be last_chunk_handling_options::strict or
4359
 * last_chunk_handling_options::stop_before_partial.
4360
 * @param decode_up_to_bad_char if true, the function will decode up to the
4361
 * first invalid character. By default (false), it is assumed that the output
4362
 * buffer is to be discarded. When there are multiple errors in the input,
4363
 * using decode_up_to_bad_char might trigger a different error.
4364
 * @return a result pair struct (of type simdutf::result containing the two
4365
 * fields error and count) with an error code and position of the
4366
 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
4367
 * of units processed if successful.
4368
 */
4369
simdutf_warn_unused result
4370
base64_to_binary_safe(const char *input, size_t length, char *output,
4371
                      size_t &outlen, base64_options options = base64_default,
4372
                      last_chunk_handling_options last_chunk_options =
4373
                          last_chunk_handling_options::loose,
4374
                      bool decode_up_to_bad_char = false) noexcept;
4375
  #if SIMDUTF_SPAN
4376
/**
4377
 * @brief span overload
 *
 * Uses binary_output.size() as the output capacity (outlen) and forwards to
 * the pointer-based base64_to_binary_safe overload for 8-bit input.
4378
 * @return a tuple of result and outlen (outlen as updated by that overload)
4379
 */
4380
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
4381
base64_to_binary_safe(const detail::input_span_of_byte_like auto &input,
4382
                      detail::output_span_of_byte_like auto &&binary_output,
4383
                      base64_options options = base64_default,
4384
                      last_chunk_handling_options last_chunk_options = loose,
4385
                      bool decode_up_to_bad_char = false) noexcept {
4386
  size_t outlen = binary_output.size(); // capacity in; passed by reference below
4387
  auto r = base64_to_binary_safe(
4388
      reinterpret_cast<const char *>(input.data()), input.size(),
4389
      reinterpret_cast<char *>(binary_output.data()), outlen, options,
4390
      last_chunk_options, decode_up_to_bad_char);
4391
  return {r, outlen};
4392
}
4393
  #endif // SIMDUTF_SPAN
4394
4395
simdutf_warn_unused result
4396
base64_to_binary_safe(const char16_t *input, size_t length, char *output,
4397
                      size_t &outlen, base64_options options = base64_default,
4398
                      last_chunk_handling_options last_chunk_options =
4399
                          last_chunk_handling_options::loose,
4400
                      bool decode_up_to_bad_char = false) noexcept;
4401
  #if SIMDUTF_SPAN
4402
/**
4403
 * @brief span overload
 *
 * Uses binary_output.size() as the output capacity (outlen) and forwards to
 * the pointer-based base64_to_binary_safe overload for 16-bit input.
4404
 * @return a tuple of result and outlen (outlen as updated by that overload)
4405
 */
4406
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
4407
base64_to_binary_safe(std::span<const char16_t> input,
4408
                      detail::output_span_of_byte_like auto &&binary_output,
4409
                      base64_options options = base64_default,
4410
                      last_chunk_handling_options last_chunk_options = loose,
4411
                      bool decode_up_to_bad_char = false) noexcept {
4412
  size_t outlen = binary_output.size(); // capacity in; passed by reference below
4413
  auto r = base64_to_binary_safe(input.data(), input.size(),
4414
                                 reinterpret_cast<char *>(binary_output.data()),
4415
                                 outlen, options, last_chunk_options,
4416
                                 decode_up_to_bad_char);
4417
  return {r, outlen};
4418
}
4419
  #endif // SIMDUTF_SPAN
4420
4421
  #if SIMDUTF_ATOMIC_REF
4422
/**
4423
 * Convert a base64 input to a binary output with a size limit and using atomic
4424
 * operations.
4425
 *
4426
 * Like `base64_to_binary_safe` but using atomic operations, this function is
4427
 * thread-safe for concurrent memory access, allowing the output
4428
 * buffers to be shared between threads without undefined behavior in case of
4429
 * data races.
4430
 *
4431
 * This function comes with a potentially significant performance penalty, but
4432
 * is useful when thread safety is needed during base64 decoding.
4433
 *
4434
 * This function is only available when simdutf is compiled with
4435
 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
4436
 * the availability of this function by checking the macro
4437
 * SIMDUTF_ATOMIC_REF.
4438
 *
4439
 * This function is considered experimental. It is not tested by default
4440
 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
4441
 * It is not documented in the public API documentation (README). It is
4442
 * offered on a best effort basis. We rely on the community for further
4443
 * testing and feedback.
4444
 *
4445
 * @param input         the base64 input to decode
4446
 * @param length        the length of the input in bytes
4447
 * @param output        the pointer to buffer that can hold the conversion
4448
 * result
4449
 * @param outlen        the number of bytes that can be written in the output
4450
 * buffer. Upon return, it is modified to reflect how many bytes were written.
4451
 * @param options       the base64 options to use (default, url, etc.)
4452
 * @param last_chunk_options the last chunk handling options (loose, strict,
4453
 * stop_before_partial)
4454
 * @param decode_up_to_bad_char if true, the function will decode up to the
4455
 * first invalid character. By default (false), it is assumed that the output
4456
 * buffer is to be discarded. When there are multiple errors in the input,
4457
 * using decode_up_to_bad_char might trigger a different error.
4458
 * @return a result struct with an error code and count indicating error
4459
 * position or success
4460
 */
4461
simdutf_warn_unused result atomic_base64_to_binary_safe(
4462
    const char *input, size_t length, char *output, size_t &outlen,
4463
    base64_options options = base64_default,
4464
    last_chunk_handling_options last_chunk_options =
4465
        last_chunk_handling_options::loose,
4466
    bool decode_up_to_bad_char = false) noexcept;
4467
simdutf_warn_unused result atomic_base64_to_binary_safe(
4468
    const char16_t *input, size_t length, char *output, size_t &outlen,
4469
    base64_options options = base64_default,
4470
    last_chunk_handling_options last_chunk_options = loose,
4471
    bool decode_up_to_bad_char = false) noexcept;
4472
    #if SIMDUTF_SPAN
4473
/**
4474
 * @brief span overload
 *
 * Uses output.size() as the output capacity (outlen) and forwards to the
 * pointer-based atomic_base64_to_binary_safe overload for 8-bit input.
4475
 * @return a tuple of result and outlen (outlen as updated by that overload)
4476
 */
4477
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
4478
atomic_base64_to_binary_safe(
4479
    const detail::input_span_of_byte_like auto &binary_input,
4480
    detail::output_span_of_byte_like auto &&output,
4481
    base64_options options = base64_default,
4482
    last_chunk_handling_options last_chunk_options =
4483
        last_chunk_handling_options::loose,
4484
    bool decode_up_to_bad_char = false) noexcept {
4485
  size_t outlen = output.size(); // capacity in; passed by reference below
4486
  auto ret = atomic_base64_to_binary_safe(
4487
      reinterpret_cast<const char *>(binary_input.data()), binary_input.size(),
4488
      reinterpret_cast<char *>(output.data()), outlen, options,
4489
      last_chunk_options, decode_up_to_bad_char);
4490
  return {ret, outlen};
4491
}
4492
/**
4493
 * @brief span overload
 *
 * Uses binary_output.size() as the output capacity (outlen) and forwards to
 * the pointer-based atomic_base64_to_binary_safe overload for 16-bit input.
4494
 * @return a tuple of result and outlen (outlen as updated by that overload)
4495
 */
4496
simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
4497
atomic_base64_to_binary_safe(
4498
    std::span<const char16_t> base64_input,
4499
    detail::output_span_of_byte_like auto &&binary_output,
4500
    base64_options options = base64_default,
4501
    last_chunk_handling_options last_chunk_options = loose,
4502
    bool decode_up_to_bad_char = false) noexcept {
4503
  size_t outlen = binary_output.size(); // capacity in; passed by reference below
4504
  auto ret = atomic_base64_to_binary_safe(
4505
      base64_input.data(), base64_input.size(),
4506
      reinterpret_cast<char *>(binary_output.data()), outlen, options,
4507
      last_chunk_options, decode_up_to_bad_char);
4508
  return {ret, outlen};
4509
}
4510
    #endif // SIMDUTF_SPAN
4511
  #endif   // SIMDUTF_ATOMIC_REF
4512
4513
/**
4514
 * Find the first occurrence of a character in a string. If the character is
4515
 * not found, return a pointer to the end of the string.
4516
 * @param start        the start of the string
4517
 * @param end          the end of the string
4518
 * @param character    the character to find
4519
 * @return a pointer to the first occurrence of the character in the string,
4520
 * or a pointer to the end of the string if the character is not found.
4521
 *
4522
 */
4523
simdutf_warn_unused const char *find(const char *start, const char *end,
4524
                                     char character) noexcept;
4525
simdutf_warn_unused const char16_t *
4526
find(const char16_t *start, const char16_t *end, char16_t character) noexcept;
4527
#endif // SIMDUTF_FEATURE_BASE64
4528
4529
/**
4530
 * An implementation of simdutf for a particular CPU architecture.
4531
 *
4532
 * Also used to maintain the currently active implementation. The active
4533
 * implementation is automatically initialized on first use to the most advanced
4534
 * implementation supported by the host.
4535
 */
4536
class implementation {
4537
public:
4538
  /**
4539
   * The name of this implementation.
4540
   *
   * Example:
   *
4541
   *     const implementation *impl = simdutf::active_implementation;
4542
   *     cout << "simdutf is optimized for " << impl->name() << "(" <<
4543
   * impl->description() << ")" << endl;
4544
   *
4545
   * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
   * (a std::string built from the stored _name)
4546
   */
4547
0
  virtual std::string name() const { return std::string(_name); }
4548
4549
  /**
4550
   * The description of this implementation.
4551
   *
   * Example:
   *
4552
   *     const implementation *impl = simdutf::active_implementation;
4553
   *     cout << "simdutf is optimized for " << impl->name() << "(" <<
4554
   * impl->description() << ")" << endl;
4555
   *
4556
   * @return the description of the implementation (a std::string built from
   * the stored _description)
4557
   */
4558
0
  virtual std::string description() const { return std::string(_description); }
4559
4560
  /**
4561
   * The instruction sets this implementation is compiled against
4562
   * and the current CPU match. This function may poll the current CPU/system
4563
   * and should therefore not be called too often if performance is a concern.
4564
   *
4565
   *
4566
   * @return true if the implementation can be safely used on the current system
4567
   * (determined at runtime)
4568
   */
4569
  bool supported_by_runtime_system() const;
4570
4571
#if SIMDUTF_FEATURE_DETECT_ENCODING
4572
  /**
4573
   * This function will try to detect the encoding
4574
   * @param input the string to identify
4575
   * @param length the length of the string in bytes.
4576
   * @return the encoding type detected
4577
   */
4578
  virtual encoding_type autodetect_encoding(const char *input,
4579
                                            size_t length) const noexcept;
4580
4581
  /**
4582
   * This function will try to detect the possible encodings in one pass
4583
   * @param input the string to identify
4584
   * @param length the length of the string in bytes.
4585
   * @return the encoding type detected
4586
   */
4587
  virtual int detect_encodings(const char *input,
4588
                               size_t length) const noexcept = 0;
4589
#endif // SIMDUTF_FEATURE_DETECT_ENCODING
4590
4591
  /**
4592
   * @private For internal implementation use
4593
   *
4594
   * The instruction sets this implementation is compiled against.
4595
   *
4596
   * @return a mask of all required `internal::instruction_set::` values
   * (the stored _required_instruction_sets member)
4597
   */
4598
0
  virtual uint32_t required_instruction_sets() const {
4599
0
    return _required_instruction_sets;
4600
0
  }
4601
4602
#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
4603
  /**
4604
   * Validate the UTF-8 string.
4605
   *
4606
   * Overridden by each implementation.
4607
   *
4608
   * @param buf the UTF-8 string to validate.
4609
   * @param len the length of the string in bytes.
4610
   * @return true if and only if the string is valid UTF-8.
4611
   */
4612
  simdutf_warn_unused virtual bool validate_utf8(const char *buf,
4613
                                                 size_t len) const noexcept = 0;
4614
#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
4615
4616
#if SIMDUTF_FEATURE_UTF8
4617
  /**
4618
   * Validate the UTF-8 string and stop on errors.
4619
   *
4620
   * Overridden by each implementation.
4621
   *
4622
   * @param buf the UTF-8 string to validate.
4623
   * @param len the length of the string in bytes.
4624
   * @return a result pair struct (of type simdutf::result containing the two
4625
   * fields error and count) with an error code and either position of the error
4626
   * (in the input in code units) if any, or the number of code units validated
4627
   * if successful.
4628
   */
4629
  simdutf_warn_unused virtual result
4630
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0;
4631
#endif // SIMDUTF_FEATURE_UTF8
4632
4633
#if SIMDUTF_FEATURE_ASCII
4634
  /**
4635
   * Validate the ASCII string.
4636
   *
4637
   * Overridden by each implementation.
4638
   *
4639
   * @param buf the ASCII string to validate.
4640
   * @param len the length of the string in bytes.
4641
   * @return true if and only if the string is valid ASCII.
4642
   */
4643
  simdutf_warn_unused virtual bool
4644
  validate_ascii(const char *buf, size_t len) const noexcept = 0;
4645
4646
  /**
4647
   * Validate the ASCII string and stop on error.
4648
   *
4649
   * Overridden by each implementation.
4650
   *
4651
   * @param buf the ASCII string to validate.
4652
   * @param len the length of the string in bytes.
4653
   * @return a result pair struct (of type simdutf::result containing the two
4654
   * fields error and count) with an error code and either position of the error
4655
   * (in the input in code units) if any, or the number of code units validated
4656
   * if successful.
4657
   */
4658
  simdutf_warn_unused virtual result
4659
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0;
4660
#endif // SIMDUTF_FEATURE_ASCII
4661
4662
#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
4663
  /**
4664
   * Validate the UTF-16LE string. This function may be best when you expect
4665
   * the input to be almost always valid. Otherwise, consider using
4666
   * validate_utf16le_with_errors.
4667
   *
4668
   * Overridden by each implementation.
4669
   *
4670
   * This function is not BOM-aware.
4671
   *
4672
   * @param buf the UTF-16LE string to validate.
4673
   * @param len the length of the string in number of 2-byte code units
4674
   * (char16_t).
4675
   * @return true if and only if the string is valid UTF-16LE.
4676
   */
4677
  simdutf_warn_unused virtual bool
4678
  validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0;
4679
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
4680
4681
#if SIMDUTF_FEATURE_UTF16
4682
  /**
4683
   * Validate the UTF-16BE string. This function may be best when you expect
4684
   * the input to be almost always valid. Otherwise, consider using
4685
   * validate_utf16be_with_errors.
4686
   *
4687
   * Overridden by each implementation.
4688
   *
4689
   * This function is not BOM-aware.
4690
   *
4691
   * @param buf the UTF-16BE string to validate.
4692
   * @param len the length of the string in number of 2-byte code units
4693
   * (char16_t).
4694
   * @return true if and only if the string is valid UTF-16BE.
4695
   */
4696
  simdutf_warn_unused virtual bool
4697
  validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0;
4698
4699
  /**
4700
   * Validate the UTF-16LE string and stop on error.  It might be faster than
4701
   * validate_utf16le when an error is expected to occur early.
4702
   *
4703
   * Overridden by each implementation.
4704
   *
4705
   * This function is not BOM-aware.
4706
   *
4707
   * @param buf the UTF-16LE string to validate.
4708
   * @param len the length of the string in number of 2-byte code units
4709
   * (char16_t).
4710
   * @return a result pair struct (of type simdutf::result containing the two
4711
   * fields error and count) with an error code and either position of the error
4712
   * (in the input in code units) if any, or the number of code units validated
4713
   * if successful.
4714
   */
4715
  simdutf_warn_unused virtual result
4716
  validate_utf16le_with_errors(const char16_t *buf,
4717
                               size_t len) const noexcept = 0;
4718
4719
  /**
4720
   * Validate the UTF-16BE string and stop on error. It might be faster than
4721
   * validate_utf16be when an error is expected to occur early.
4722
   *
4723
   * Overridden by each implementation.
4724
   *
4725
   * This function is not BOM-aware.
4726
   *
4727
   * @param buf the UTF-16BE string to validate.
4728
   * @param len the length of the string in number of 2-byte code units
4729
   * (char16_t).
4730
   * @return a result pair struct (of type simdutf::result containing the two
4731
   * fields error and count) with an error code and either position of the error
4732
   * (in the input in code units) if any, or the number of code units validated
4733
   * if successful.
4734
   */
4735
  simdutf_warn_unused virtual result
4736
  validate_utf16be_with_errors(const char16_t *buf,
4737
                               size_t len) const noexcept = 0;
4738
  /**
4739
   * Copies the UTF-16LE string while replacing mismatched surrogates with the
4740
   * Unicode replacement character U+FFFD. We allow the input and output to be
4741
   * the same buffer so that the correction is done in-place.
4742
   *
4743
   * Overridden by each implementation.
4744
   *
4745
   * @param input the UTF-16LE string to correct.
4746
   * @param len the length of the string in number of 2-byte code units
4747
   * (char16_t).
4748
   * @param output the output buffer.
4749
   */
4750
  virtual void to_well_formed_utf16le(const char16_t *input, size_t len,
4751
                                      char16_t *output) const noexcept = 0;
4752
  /**
4753
   * Copies the UTF-16BE string while replacing mismatched surrogates with the
4754
   * Unicode replacement character U+FFFD. We allow the input and output to be
4755
   * the same buffer so that the correction is done in-place.
4756
   *
4757
   * Overridden by each implementation.
4758
   *
4759
   * @param input the UTF-16BE string to correct.
4760
   * @param len the length of the string in number of 2-byte code units
4761
   * (char16_t).
4762
   * @param output the output buffer.
4763
   */
4764
  virtual void to_well_formed_utf16be(const char16_t *input, size_t len,
4765
                                      char16_t *output) const noexcept = 0;
4766
#endif // SIMDUTF_FEATURE_UTF16
4767
4768
#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
4769
  /**
4770
   * Validate the UTF-32 string.
4771
   *
4772
   * Overridden by each implementation.
4773
   *
4774
   * This function is not BOM-aware.
4775
   *
4776
   * @param buf the UTF-32 string to validate.
4777
   * @param len the length of the string in number of 4-byte code units
4778
   * (char32_t).
4779
   * @return true if and only if the string is valid UTF-32.
4780
   */
4781
  simdutf_warn_unused virtual bool
4782
  validate_utf32(const char32_t *buf, size_t len) const noexcept = 0;
4783
#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
4784
4785
#if SIMDUTF_FEATURE_UTF32
4786
  /**
4787
   * Validate the UTF-32 string and stop on error.
4788
   *
4789
   * Overridden by each implementation.
4790
   *
4791
   * This function is not BOM-aware.
4792
   *
4793
   * @param buf the UTF-32 string to validate.
4794
   * @param len the length of the string in number of 4-byte code units
4795
   * (char32_t).
4796
   * @return a result pair struct (of type simdutf::result containing the two
4797
   * fields error and count) with an error code and either position of the error
4798
   * (in the input in code units) if any, or the number of code units validated
4799
   * if successful.
4800
   */
4801
  simdutf_warn_unused virtual result
4802
  validate_utf32_with_errors(const char32_t *buf,
4803
                             size_t len) const noexcept = 0;
4804
#endif // SIMDUTF_FEATURE_UTF32
4805
4806
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4807
  /**
4808
   * Convert Latin1 string into UTF-8 string.
4809
   *
4810
   * This function is suitable to work with inputs from untrusted sources.
4811
   *
4812
   * @param input         the Latin1 string to convert
4813
   * @param length        the length of the string in bytes
4814
   * @param utf8_output  the pointer to buffer that can hold conversion result
4815
   * @return the number of written char; 0 if conversion is not possible
4816
   */
4817
  simdutf_warn_unused virtual size_t
4818
  convert_latin1_to_utf8(const char *input, size_t length,
4819
                         char *utf8_output) const noexcept = 0;
4820
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4821
4822
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4823
  /**
4824
   * Convert Latin1 string into UTF-16LE string.
4825
   *
4826
   * This function is suitable to work with inputs from untrusted sources.
4827
   *
4828
   * @param input         the Latin1 string to convert
4829
   * @param length        the length of the string in bytes
4830
   * @param utf16_output  the pointer to buffer that can hold conversion result
4831
   * @return the number of written char16_t; 0 if conversion is not possible
4832
   */
4833
  simdutf_warn_unused virtual size_t
4834
  convert_latin1_to_utf16le(const char *input, size_t length,
4835
                            char16_t *utf16_output) const noexcept = 0;
4836
4837
  /**
4838
   * Convert Latin1 string into UTF-16BE string.
4839
   *
4840
   * This function is suitable to work with inputs from untrusted sources.
4841
   *
4842
   * @param input         the Latin1 string to convert
4843
   * @param length        the length of the string in bytes
4844
   * @param utf16_output  the pointer to buffer that can hold conversion result
4845
   * @return the number of written char16_t; 0 if conversion is not possible
4846
   */
4847
  simdutf_warn_unused virtual size_t
4848
  convert_latin1_to_utf16be(const char *input, size_t length,
4849
                            char16_t *utf16_output) const noexcept = 0;
4850
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
4851
4852
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4853
  /**
4854
   * Convert Latin1 string into UTF-32 string.
4855
   *
4856
   * This function is suitable to work with inputs from untrusted sources.
4857
   *
4858
   * @param input         the Latin1 string to convert
4859
   * @param length        the length of the string in bytes
4860
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
4861
   * @return the number of written char32_t; 0 if conversion is not possible
4862
   */
4863
  simdutf_warn_unused virtual size_t
4864
  convert_latin1_to_utf32(const char *input, size_t length,
4865
                          char32_t *utf32_buffer) const noexcept = 0;
4866
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
4867
4868
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4869
  /**
4870
   * Convert possibly broken UTF-8 string into latin1 string.
4871
   *
4872
   * During the conversion also validation of the input string is done.
4873
   * This function is suitable to work with inputs from untrusted sources.
4874
   *
4875
   * @param input         the UTF-8 string to convert
4876
   * @param length        the length of the string in bytes
4877
   * @param latin1_output  the pointer to buffer that can hold conversion result
4878
   * @return the number of written char; 0 if the input was not valid UTF-8
4879
   * string or if it cannot be represented as Latin1
4880
   */
4881
  simdutf_warn_unused virtual size_t
4882
  convert_utf8_to_latin1(const char *input, size_t length,
4883
                         char *latin1_output) const noexcept = 0;
4884
4885
  /**
4886
   * Convert possibly broken UTF-8 string into latin1 string with errors.
4887
   * If the string cannot be represented as Latin1, an error
4888
   * code is returned.
4889
   *
4890
   * During the conversion also validation of the input string is done.
4891
   * This function is suitable to work with inputs from untrusted sources.
4892
   *
4893
   * @param input         the UTF-8 string to convert
4894
   * @param length        the length of the string in bytes
4895
   * @param latin1_output  the pointer to buffer that can hold conversion result
4896
   * @return a result pair struct (of type simdutf::result containing the two
4897
   * fields error and count) with an error code and either position of the error
4898
   * (in the input in code units) if any, or the number of code units validated
4899
   * if successful.
4900
   */
4901
  simdutf_warn_unused virtual result
4902
  convert_utf8_to_latin1_with_errors(const char *input, size_t length,
4903
                                     char *latin1_output) const noexcept = 0;
4904
4905
  /**
4906
   * Convert valid UTF-8 string into latin1 string.
4907
   *
4908
   * This function assumes that the input string is valid UTF-8 and that it can
4909
   * be represented as Latin1. If you violate this assumption, the result is
4910
   * implementation defined and may include system-dependent behavior such as
4911
   * crashes.
4912
   *
4913
   * This function is for expert users only and not part of our public API. Use
4914
   * convert_utf8_to_latin1 instead.
4915
   *
4916
   * This function is not BOM-aware.
4917
   *
4918
   * @param input         the UTF-8 string to convert
4919
   * @param length        the length of the string in bytes
4920
   * @param latin1_output  the pointer to buffer that can hold conversion result
4921
   * @return the number of written char; 0 if the input was not valid UTF-8
4922
   * string
4923
   */
4924
  simdutf_warn_unused virtual size_t
4925
  convert_valid_utf8_to_latin1(const char *input, size_t length,
4926
                               char *latin1_output) const noexcept = 0;
4927
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
4928
4929
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
4930
  /**
4931
   * Convert possibly broken UTF-8 string into UTF-16LE string.
4932
   *
4933
   * During the conversion also validation of the input string is done.
4934
   * This function is suitable to work with inputs from untrusted sources.
4935
   *
4936
   * @param input         the UTF-8 string to convert
4937
   * @param length        the length of the string in bytes
4938
   * @param utf16_output  the pointer to buffer that can hold conversion result
4939
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
4940
   * string
4941
   */
4942
  simdutf_warn_unused virtual size_t
4943
  convert_utf8_to_utf16le(const char *input, size_t length,
4944
                          char16_t *utf16_output) const noexcept = 0;
4945
4946
  /**
4947
   * Convert possibly broken UTF-8 string into UTF-16BE string.
4948
   *
4949
   * During the conversion also validation of the input string is done.
4950
   * This function is suitable to work with inputs from untrusted sources.
4951
   *
4952
   * @param input         the UTF-8 string to convert
4953
   * @param length        the length of the string in bytes
4954
   * @param utf16_output  the pointer to buffer that can hold conversion result
4955
   * @return the number of written char16_t; 0 if the input was not valid UTF-8
4956
   * string
4957
   */
4958
  simdutf_warn_unused virtual size_t
4959
  convert_utf8_to_utf16be(const char *input, size_t length,
4960
                          char16_t *utf16_output) const noexcept = 0;
4961
4962
  /**
4963
   * Convert possibly broken UTF-8 string into UTF-16LE string and stop on
4964
   * error.
4965
   *
4966
   * During the conversion also validation of the input string is done.
4967
   * This function is suitable to work with inputs from untrusted sources.
4968
   *
4969
   * @param input         the UTF-8 string to convert
4970
   * @param length        the length of the string in bytes
4971
   * @param utf16_output  the pointer to buffer that can hold conversion result
4972
   * @return a result pair struct (of type simdutf::result containing the two
4973
   * fields error and count) with an error code and either position of the error
4974
   * (in the input in code units) if any, or the number of code units validated
4975
   * if successful.
4976
   */
4977
  simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(
4978
      const char *input, size_t length,
4979
      char16_t *utf16_output) const noexcept = 0;
4980
4981
  /**
4982
   * Convert possibly broken UTF-8 string into UTF-16BE string and stop on
4983
   * error.
4984
   *
4985
   * During the conversion also validation of the input string is done.
4986
   * This function is suitable to work with inputs from untrusted sources.
4987
   *
4988
   * @param input         the UTF-8 string to convert
4989
   * @param length        the length of the string in bytes
4990
   * @param utf16_output  the pointer to buffer that can hold conversion result
4991
   * @return a result pair struct (of type simdutf::result containing the two
4992
   * fields error and count) with an error code and either position of the error
4993
   * (in the input in code units) if any, or the number of code units validated
4994
   * if successful.
4995
   */
4996
  simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(
4997
      const char *input, size_t length,
4998
      char16_t *utf16_output) const noexcept = 0;
4999
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5000
5001
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5002
  /**
5003
   * Convert possibly broken UTF-8 string into UTF-32 string.
5004
   *
5005
   * During the conversion also validation of the input string is done.
5006
   * This function is suitable to work with inputs from untrusted sources.
5007
   *
5008
   * @param input         the UTF-8 string to convert
5009
   * @param length        the length of the string in bytes
5010
   * @param utf32_output  the pointer to buffer that can hold conversion result
5011
   * @return the number of written char32_t; 0 if the input was not valid UTF-8
5012
   * string
5013
   */
5014
  simdutf_warn_unused virtual size_t
5015
  convert_utf8_to_utf32(const char *input, size_t length,
5016
                        char32_t *utf32_output) const noexcept = 0;
5017
5018
  /**
5019
   * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
5020
   *
5021
   * During the conversion also validation of the input string is done.
5022
   * This function is suitable to work with inputs from untrusted sources.
5023
   *
5024
   * @param input         the UTF-8 string to convert
5025
   * @param length        the length of the string in bytes
5026
   * @param utf32_output  the pointer to buffer that can hold conversion result
5027
   * @return a result pair struct (of type simdutf::result containing the two
5028
   * fields error and count) with an error code and either position of the error
5029
   * (in the input in code units) if any, or the number of char32_t written if
5030
   * successful.
5031
   */
5032
  simdutf_warn_unused virtual result
5033
  convert_utf8_to_utf32_with_errors(const char *input, size_t length,
5034
                                    char32_t *utf32_output) const noexcept = 0;
5035
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5036
5037
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5038
  /**
5039
   * Convert valid UTF-8 string into UTF-16LE string.
5040
   *
5041
   * This function assumes that the input string is valid UTF-8.
5042
   *
5043
   * @param input         the UTF-8 string to convert
5044
   * @param length        the length of the string in bytes
5045
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
5046
   * @return the number of written char16_t
5047
   */
5048
  simdutf_warn_unused virtual size_t
5049
  convert_valid_utf8_to_utf16le(const char *input, size_t length,
5050
                                char16_t *utf16_buffer) const noexcept = 0;
5051
5052
  /**
5053
   * Convert valid UTF-8 string into UTF-16BE string.
5054
   *
5055
   * This function assumes that the input string is valid UTF-8.
5056
   *
5057
   * @param input         the UTF-8 string to convert
5058
   * @param length        the length of the string in bytes
5059
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
5060
   * @return the number of written char16_t
5061
   */
5062
  simdutf_warn_unused virtual size_t
5063
  convert_valid_utf8_to_utf16be(const char *input, size_t length,
5064
                                char16_t *utf16_buffer) const noexcept = 0;
5065
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5066
5067
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5068
  /**
5069
   * Convert valid UTF-8 string into UTF-32 string.
5070
   *
5071
   * This function assumes that the input string is valid UTF-8.
5072
   *
5073
   * @param input         the UTF-8 string to convert
5074
   * @param length        the length of the string in bytes
5075
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
5076
   * @return the number of written char32_t
5077
   */
5078
  simdutf_warn_unused virtual size_t
5079
  convert_valid_utf8_to_utf32(const char *input, size_t length,
5080
                              char32_t *utf32_buffer) const noexcept = 0;
5081
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5082
5083
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5084
  /**
5085
   * Compute the number of 2-byte code units that this UTF-8 string would
5086
   * require in UTF-16LE format.
5087
   *
5088
   * This function does not validate the input. It is acceptable to pass invalid
5089
   * UTF-8 strings but in such cases the result is implementation defined.
5090
   *
5091
   * @param input         the UTF-8 string to process
5092
   * @param length        the length of the string in bytes
5093
   * @return the number of char16_t code units required to encode the UTF-8
5094
   * string as UTF-16LE
5095
   */
5096
  simdutf_warn_unused virtual size_t
5097
  utf16_length_from_utf8(const char *input, size_t length) const noexcept = 0;
5098
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5099
5100
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5101
  /**
5102
   * Compute the number of 4-byte code units that this UTF-8 string would
5103
   * require in UTF-32 format.
5104
   *
5105
   * This function is equivalent to count_utf8. It is acceptable to pass invalid
5106
   * UTF-8 strings but in such cases the result is implementation defined.
5107
   *
5108
   * This function does not validate the input.
5109
   *
5110
   * @param input         the UTF-8 string to process
5111
   * @param length        the length of the string in bytes
5112
   * @return the number of char32_t code units required to encode the UTF-8
5113
   * string as UTF-32
5114
   */
5115
  simdutf_warn_unused virtual size_t
5116
  utf32_length_from_utf8(const char *input, size_t length) const noexcept = 0;
5117
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5118
5119
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5120
  /**
5121
   * Convert possibly broken UTF-16LE string into Latin1 string.
5122
   *
5123
   * During the conversion also validation of the input string is done.
5124
   * This function is suitable to work with inputs from untrusted sources.
5125
   *
5126
   * This function is not BOM-aware.
5127
   *
5128
   * @param input         the UTF-16LE string to convert
5129
   * @param length        the length of the string in 2-byte code units
5130
   * (char16_t)
5131
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5132
   * result
5133
   * @return number of written code units; 0 if input is not a valid UTF-16LE
5134
   * string or if it cannot be represented as Latin1
5135
   */
5136
  simdutf_warn_unused virtual size_t
5137
  convert_utf16le_to_latin1(const char16_t *input, size_t length,
5138
                            char *latin1_buffer) const noexcept = 0;
5139
5140
  /**
5141
   * Convert possibly broken UTF-16BE string into Latin1 string.
5142
   *
5143
   * During the conversion also validation of the input string is done.
5144
   * This function is suitable to work with inputs from untrusted sources.
5145
   *
5146
   * This function is not BOM-aware.
5147
   *
5148
   * @param input         the UTF-16BE string to convert
5149
   * @param length        the length of the string in 2-byte code units
5150
   * (char16_t)
5151
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5152
   * result
5153
   * @return number of written code units; 0 if input is not a valid UTF-16BE
5154
   * string or if it cannot be represented as Latin1
5155
   */
5156
  simdutf_warn_unused virtual size_t
5157
  convert_utf16be_to_latin1(const char16_t *input, size_t length,
5158
                            char *latin1_buffer) const noexcept = 0;
5159
5160
  /**
5161
   * Convert possibly broken UTF-16LE string into Latin1 string.
5162
   * If the string cannot be represented as Latin1, an error
5163
   * is returned.
5164
   *
5165
   * During the conversion also validation of the input string is done.
5166
   * This function is suitable to work with inputs from untrusted sources.
5167
   * This function is not BOM-aware.
5168
   *
5169
   * @param input         the UTF-16LE string to convert
5170
   * @param length        the length of the string in 2-byte code units
5171
   * (char16_t)
5172
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5173
   * result
5174
   * @return a result pair struct (of type simdutf::result containing the two
5175
   * fields error and count) with an error code and either position of the error
5176
   * (in the input in code units) if any, or the number of char written if
5177
   * successful.
5178
   */
5179
  simdutf_warn_unused virtual result
5180
  convert_utf16le_to_latin1_with_errors(const char16_t *input, size_t length,
5181
                                        char *latin1_buffer) const noexcept = 0;
5182
5183
  /**
5184
   * Convert possibly broken UTF-16BE string into Latin1 string.
5185
   * If the string cannot be represented as Latin1, an error
5186
   * is returned.
5187
   *
5188
   * During the conversion also validation of the input string is done.
5189
   * This function is suitable to work with inputs from untrusted sources.
5190
   * This function is not BOM-aware.
5191
   *
5192
   * @param input         the UTF-16BE string to convert
5193
   * @param length        the length of the string in 2-byte code units
5194
   * (char16_t)
5195
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5196
   * result
5197
   * @return a result pair struct (of type simdutf::result containing the two
5198
   * fields error and count) with an error code and either position of the error
5199
   * (in the input in code units) if any, or the number of char written if
5200
   * successful.
5201
   */
5202
  simdutf_warn_unused virtual result
5203
  convert_utf16be_to_latin1_with_errors(const char16_t *input, size_t length,
5204
                                        char *latin1_buffer) const noexcept = 0;
5205
5206
  /**
5207
   * Convert valid UTF-16LE string into Latin1 string.
5208
   *
5209
   * This function assumes that the input string is valid UTF-16LE and that it
5210
   * can be represented as Latin1. If you violate this assumption, the result is
5211
   * implementation defined and may include system-dependent behavior such as
5212
   * crashes.
5213
   *
5214
   * This function is for expert users only and not part of our public API. Use
5215
   * convert_utf16le_to_latin1 instead.
5216
   *
5217
   * This function is not BOM-aware.
5218
   *
5219
   * @param input         the UTF-16LE string to convert
5220
   * @param length        the length of the string in 2-byte code units
5221
   * (char16_t)
5222
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5223
   * result
5224
   * @return number of written code units; 0 if conversion is not possible
5225
   */
5226
  simdutf_warn_unused virtual size_t
5227
  convert_valid_utf16le_to_latin1(const char16_t *input, size_t length,
5228
                                  char *latin1_buffer) const noexcept = 0;
5229
5230
  /**
5231
   * Convert valid UTF-16BE string into Latin1 string.
5232
   *
5233
   * This function assumes that the input string is valid UTF-16BE and that it
5234
   * can be represented as Latin1. If you violate this assumption, the result is
5235
   * implementation defined and may include system-dependent behavior such as
5236
   * crashes.
5237
   *
5238
   * This function is for expert users only and not part of our public API. Use
5239
   * convert_utf16be_to_latin1 instead.
5240
   *
5241
   * This function is not BOM-aware.
5242
   *
5243
   * @param input         the UTF-16BE string to convert
5244
   * @param length        the length of the string in 2-byte code units
5245
   * (char16_t)
5246
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5247
   * result
5248
   * @return number of written code units; 0 if conversion is not possible
5249
   */
5250
  simdutf_warn_unused virtual size_t
5251
  convert_valid_utf16be_to_latin1(const char16_t *input, size_t length,
5252
                                  char *latin1_buffer) const noexcept = 0;
5253
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5254
5255
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5256
  /**
5257
   * Convert possibly broken UTF-16LE string into UTF-8 string.
5258
   *
5259
   * During the conversion also validation of the input string is done.
5260
   * This function is suitable to work with inputs from untrusted sources.
5261
   *
5262
   * This function is not BOM-aware.
5263
   *
5264
   * @param input         the UTF-16LE string to convert
5265
   * @param length        the length of the string in 2-byte code units
5266
   * (char16_t)
5267
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5268
   * @return number of written code units; 0 if input is not a valid UTF-16LE
5269
   * string
5270
   */
5271
  simdutf_warn_unused virtual size_t
5272
  convert_utf16le_to_utf8(const char16_t *input, size_t length,
5273
                          char *utf8_buffer) const noexcept = 0;
5274
5275
  /**
5276
   * Convert possibly broken UTF-16BE string into UTF-8 string.
5277
   *
5278
   * During the conversion also validation of the input string is done.
5279
   * This function is suitable to work with inputs from untrusted sources.
5280
   *
5281
   * This function is not BOM-aware.
5282
   *
5283
   * @param input         the UTF-16BE string to convert
5284
   * @param length        the length of the string in 2-byte code units
5285
   * (char16_t)
5286
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5287
   * @return number of written code units; 0 if input is not a valid UTF-16BE
5288
   * string
5289
   */
5290
  simdutf_warn_unused virtual size_t
5291
  convert_utf16be_to_utf8(const char16_t *input, size_t length,
5292
                          char *utf8_buffer) const noexcept = 0;
5293
5294
  /**
5295
   * Convert possibly broken UTF-16LE string into UTF-8 string and stop on
5296
   * error.
5297
   *
5298
   * During the conversion also validation of the input string is done.
5299
   * This function is suitable to work with inputs from untrusted sources.
5300
   *
5301
   * This function is not BOM-aware.
5302
   *
5303
   * @param input         the UTF-16LE string to convert
5304
   * @param length        the length of the string in 2-byte code units
5305
   * (char16_t)
5306
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5307
   * @return a result pair struct (of type simdutf::result containing the two
5308
   * fields error and count) with an error code and either position of the error
5309
   * (in the input in code units) if any, or the number of char written if
5310
   * successful.
5311
   */
5312
  simdutf_warn_unused virtual result
5313
  convert_utf16le_to_utf8_with_errors(const char16_t *input, size_t length,
5314
                                      char *utf8_buffer) const noexcept = 0;
5315
5316
  /**
5317
   * Convert possibly broken UTF-16BE string into UTF-8 string and stop on
5318
   * error.
5319
   *
5320
   * During the conversion also validation of the input string is done.
5321
   * This function is suitable to work with inputs from untrusted sources.
5322
   *
5323
   * This function is not BOM-aware.
5324
   *
5325
   * @param input         the UTF-16BE string to convert
5326
   * @param length        the length of the string in 2-byte code units
5327
   * (char16_t)
5328
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5329
   * @return a result pair struct (of type simdutf::result containing the two
5330
   * fields error and count) with an error code and either position of the error
5331
   * (in the input in code units) if any, or the number of char written if
5332
   * successful.
5333
   */
5334
  simdutf_warn_unused virtual result
5335
  convert_utf16be_to_utf8_with_errors(const char16_t *input, size_t length,
5336
                                      char *utf8_buffer) const noexcept = 0;
5337
5338
  /**
5339
   * Convert valid UTF-16LE string into UTF-8 string.
5340
   *
5341
   * This function assumes that the input string is valid UTF-16LE.
5342
   *
5343
   * This function is not BOM-aware.
5344
   *
5345
   * @param input         the UTF-16LE string to convert
5346
   * @param length        the length of the string in 2-byte code units
5347
   * (char16_t)
5348
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
5349
   * result
5350
   * @return number of written code units; 0 if conversion is not possible
5351
   */
5352
  simdutf_warn_unused virtual size_t
5353
  convert_valid_utf16le_to_utf8(const char16_t *input, size_t length,
5354
                                char *utf8_buffer) const noexcept = 0;
5355
5356
  /**
5357
   * Convert valid UTF-16BE string into UTF-8 string.
5358
   *
5359
   * This function assumes that the input string is valid UTF-16BE.
5360
   *
5361
   * This function is not BOM-aware.
5362
   *
5363
   * @param input         the UTF-16BE string to convert
5364
   * @param length        the length of the string in 2-byte code units
5365
   * (char16_t)
5366
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
5367
   * result
5368
   * @return number of written code units; 0 if conversion is not possible
5369
   */
5370
  simdutf_warn_unused virtual size_t
5371
  convert_valid_utf16be_to_utf8(const char16_t *input, size_t length,
5372
                                char *utf8_buffer) const noexcept = 0;
5373
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5374
5375
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5376
  /**
5377
   * Convert possibly broken UTF-16LE string into UTF-32 string.
5378
   *
5379
   * During the conversion also validation of the input string is done.
5380
   * This function is suitable to work with inputs from untrusted sources.
5381
   *
5382
   * This function is not BOM-aware.
5383
   *
5384
   * @param input         the UTF-16LE string to convert
5385
   * @param length        the length of the string in 2-byte code units
5386
   * (char16_t)
5387
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
5388
   * @return number of written code units; 0 if input is not a valid UTF-16LE
5389
   * string
5390
   */
5391
  simdutf_warn_unused virtual size_t
5392
  convert_utf16le_to_utf32(const char16_t *input, size_t length,
5393
                           char32_t *utf32_buffer) const noexcept = 0;
5394
5395
  /**
5396
   * Convert possibly broken UTF-16BE string into UTF-32 string.
5397
   *
5398
   * During the conversion also validation of the input string is done.
5399
   * This function is suitable to work with inputs from untrusted sources.
5400
   *
5401
   * This function is not BOM-aware.
5402
   *
5403
   * @param input         the UTF-16BE string to convert
5404
   * @param length        the length of the string in 2-byte code units
5405
   * (char16_t)
5406
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
5407
   * @return number of written code units; 0 if input is not a valid UTF-16BE
5408
   * string
5409
   */
5410
  simdutf_warn_unused virtual size_t
5411
  convert_utf16be_to_utf32(const char16_t *input, size_t length,
5412
                           char32_t *utf32_buffer) const noexcept = 0;
5413
5414
  /**
5415
   * Convert possibly broken UTF-16LE string into UTF-32 string and stop on
5416
   * error.
5417
   *
5418
   * During the conversion also validation of the input string is done.
5419
   * This function is suitable to work with inputs from untrusted sources.
5420
   *
5421
   * This function is not BOM-aware.
5422
   *
5423
   * @param input         the UTF-16LE string to convert
5424
   * @param length        the length of the string in 2-byte code units
5425
   * (char16_t)
5426
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
5427
   * @return a result pair struct (of type simdutf::result containing the two
5428
   * fields error and count) with an error code and either position of the error
5429
   * (in the input in code units) if any, or the number of char32_t written if
5430
   * successful.
5431
   */
5432
  simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(
5433
      const char16_t *input, size_t length,
5434
      char32_t *utf32_buffer) const noexcept = 0;
5435
5436
  /**
5437
   * Convert possibly broken UTF-16BE string into UTF-32 string and stop on
5438
   * error.
5439
   *
5440
   * During the conversion also validation of the input string is done.
5441
   * This function is suitable to work with inputs from untrusted sources.
5442
   *
5443
   * This function is not BOM-aware.
5444
   *
5445
   * @param input         the UTF-16BE string to convert
5446
   * @param length        the length of the string in 2-byte code units
5447
   * (char16_t)
5448
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
5449
   * @return a result pair struct (of type simdutf::result containing the two
5450
   * fields error and count) with an error code and either position of the error
5451
   * (in the input in code units) if any, or the number of char32_t written if
5452
   * successful.
5453
   */
5454
  simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(
5455
      const char16_t *input, size_t length,
5456
      char32_t *utf32_buffer) const noexcept = 0;
5457
5458
  /**
5459
   * Convert valid UTF-16LE string into UTF-32 string.
5460
   *
5461
   * This function assumes that the input string is valid UTF-16LE.
5462
   *
5463
   * This function is not BOM-aware.
5464
   *
5465
   * @param input         the UTF-16LE string to convert
5466
   * @param length        the length of the string in 2-byte code units
5467
   * (char16_t)
5468
   * @param utf32_buffer   the pointer to a buffer that can hold the conversion
5469
   * result
5470
   * @return number of written code units; 0 if conversion is not possible
5471
   */
5472
  simdutf_warn_unused virtual size_t
5473
  convert_valid_utf16le_to_utf32(const char16_t *input, size_t length,
5474
                                 char32_t *utf32_buffer) const noexcept = 0;
5475
5476
  /**
5477
   * Convert valid UTF-16BE string into UTF-32 string.
5478
   *
5479
   * This function assumes that the input string is valid UTF-16BE.
5480
   *
5481
   * This function is not BOM-aware.
5482
   *
5483
   * @param input         the UTF-16BE string to convert
5484
   * @param length        the length of the string in 2-byte code units
5485
   * (char16_t)
5486
   * @param utf32_buffer   the pointer to a buffer that can hold the conversion
5487
   * result
5488
   * @return number of written code units; 0 if conversion is not possible
5489
   */
5490
  simdutf_warn_unused virtual size_t
5491
  convert_valid_utf16be_to_utf32(const char16_t *input, size_t length,
5492
                                 char32_t *utf32_buffer) const noexcept = 0;
5493
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5494
5495
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5496
  /**
5497
   * Compute the number of bytes that this UTF-16LE string would require in
5498
   * UTF-8 format.
5499
   *
5500
   * This function does not validate the input. It is acceptable to pass invalid
5501
   * UTF-16 strings but in such cases the result is implementation defined.
5502
   *
5503
   * This function is not BOM-aware.
5504
   *
5505
   * @param input         the UTF-16LE string to convert
5506
   * @param length        the length of the string in 2-byte code units
5507
   * (char16_t)
5508
   * @return the number of bytes required to encode the UTF-16LE string as UTF-8
5509
   */
5510
  simdutf_warn_unused virtual size_t
5511
  utf8_length_from_utf16le(const char16_t *input,
5512
                           size_t length) const noexcept = 0;
5513
5514
  /**
5515
   * Compute the number of bytes that this UTF-16BE string would require in
5516
   * UTF-8 format.
5517
   *
5518
   * This function does not validate the input. It is acceptable to pass invalid
5519
   * UTF-16 strings but in such cases the result is implementation defined.
5520
   *
5521
   * This function is not BOM-aware.
5522
   *
5523
   * @param input         the UTF-16BE string to convert
5524
   * @param length        the length of the string in 2-byte code units
5525
   * (char16_t)
5526
   * @return the number of bytes required to encode the UTF-16BE string as UTF-8
5527
   */
5528
  simdutf_warn_unused virtual size_t
5529
  utf8_length_from_utf16be(const char16_t *input,
5530
                           size_t length) const noexcept = 0;
5531
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5532
5533
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5534
  /**
5535
   * Convert possibly broken UTF-32 string into Latin1 string.
5536
   *
5537
   * During the conversion also validation of the input string is done.
5538
   * This function is suitable to work with inputs from untrusted sources.
5539
   *
5540
   * This function is not BOM-aware.
5541
   *
5542
   * @param input         the UTF-32 string to convert
5543
   * @param length        the length of the string in 4-byte code units
5544
   * (char32_t)
5545
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5546
   * result
5547
   * @return number of written code units; 0 if input is not a valid UTF-32
5548
   * string
5549
   */
5550
  simdutf_warn_unused virtual size_t
5551
  convert_utf32_to_latin1(const char32_t *input, size_t length,
5552
                          char *latin1_buffer) const noexcept = 0;
5553
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5554
5555
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5556
  /**
5557
   * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
5558
   * If the string cannot be represented as Latin1, an error is returned.
5559
   *
5560
   * During the conversion also validation of the input string is done.
5561
   * This function is suitable to work with inputs from untrusted sources.
5562
   *
5563
   * This function is not BOM-aware.
5564
   *
5565
   * @param input         the UTF-32 string to convert
5566
   * @param length        the length of the string in 4-byte code units
5567
   * (char32_t)
5568
   * @param latin1_buffer   the pointer to buffer that can hold conversion
5569
   * result
5570
   * @return a result pair struct (of type simdutf::result containing the two
5571
   * fields error and count) with an error code and either position of the error
5572
   * (in the input in code units) if any, or the number of char written if
5573
   * successful.
5574
   */
5575
  simdutf_warn_unused virtual result
5576
  convert_utf32_to_latin1_with_errors(const char32_t *input, size_t length,
5577
                                      char *latin1_buffer) const noexcept = 0;
5578
5579
  /**
5580
   * Convert valid UTF-32 string into Latin1 string.
5581
   *
5582
   * This function assumes that the input string is valid UTF-32 and can be
5583
   * represented as Latin1. If you violate this assumption, the result is
5584
   * implementation defined and may include system-dependent behavior such as
5585
   * crashes.
5586
   *
5587
   * This function is for expert users only and not part of our public API. Use
5588
   * convert_utf32_to_latin1 instead.
5589
   *
5590
   * This function is not BOM-aware.
5591
   *
5592
   * @param input         the UTF-32 string to convert
5593
   * @param length        the length of the string in 4-byte code units
5594
   * (char32_t)
5595
   * @param latin1_buffer   the pointer to a buffer that can hold the conversion
5596
   * result
5597
   * @return number of written code units; 0 if conversion is not possible
5598
   */
5599
  simdutf_warn_unused virtual size_t
5600
  convert_valid_utf32_to_latin1(const char32_t *input, size_t length,
5601
                                char *latin1_buffer) const noexcept = 0;
5602
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5603
5604
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5605
  /**
5606
   * Convert possibly broken UTF-32 string into UTF-8 string.
5607
   *
5608
   * During the conversion also validation of the input string is done.
5609
   * This function is suitable to work with inputs from untrusted sources.
5610
   *
5611
   * This function is not BOM-aware.
5612
   *
5613
   * @param input         the UTF-32 string to convert
5614
   * @param length        the length of the string in 4-byte code units
5615
   * (char32_t)
5616
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5617
   * @return number of written code units; 0 if input is not a valid UTF-32
5618
   * string
5619
   */
5620
  simdutf_warn_unused virtual size_t
5621
  convert_utf32_to_utf8(const char32_t *input, size_t length,
5622
                        char *utf8_buffer) const noexcept = 0;
5623
5624
  /**
5625
   * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
5626
   *
5627
   * During the conversion also validation of the input string is done.
5628
   * This function is suitable to work with inputs from untrusted sources.
5629
   *
5630
   * This function is not BOM-aware.
5631
   *
5632
   * @param input         the UTF-32 string to convert
5633
   * @param length        the length of the string in 4-byte code units
5634
   * (char32_t)
5635
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
5636
   * @return a result pair struct (of type simdutf::result containing the two
5637
   * fields error and count) with an error code and either position of the error
5638
   * (in the input in code units) if any, or the number of char written if
5639
   * successful.
5640
   */
5641
  simdutf_warn_unused virtual result
5642
  convert_utf32_to_utf8_with_errors(const char32_t *input, size_t length,
5643
                                    char *utf8_buffer) const noexcept = 0;
5644
5645
  /**
5646
   * Convert valid UTF-32 string into UTF-8 string.
5647
   *
5648
   * This function assumes that the input string is valid UTF-32.
5649
   *
5650
   * This function is not BOM-aware.
5651
   *
5652
   * @param input         the UTF-32 string to convert
5653
   * @param length        the length of the string in 4-byte code units
5654
   * (char32_t)
5655
   * @param utf8_buffer   the pointer to a buffer that can hold the conversion
5656
   * result
5657
   * @return number of written code units; 0 if conversion is not possible
5658
   */
5659
  simdutf_warn_unused virtual size_t
5660
  convert_valid_utf32_to_utf8(const char32_t *input, size_t length,
5661
                              char *utf8_buffer) const noexcept = 0;
5662
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5663
5664
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5665
  /**
5666
   * Return the number of 2-byte code units (char16_t) that a Latin1 string of
5667
   * the given length would require in UTF-16 format.
5668
   *
5669
   * Each Latin1 byte maps to exactly one UTF-16 code unit, so only the length
5670
   * of the input is needed.
5671
   *
5672
   * @param length        the length of the Latin1 string in bytes
5673
   * @return the number of char16_t code units required to encode it as UTF-16
5674
   */
5675
  simdutf_warn_unused virtual size_t
5676
0
  utf16_length_from_latin1(size_t length) const noexcept {
5677
0
    return length;
5678
0
  }
5679
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5680
5681
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5682
  /**
5683
   * Convert possibly broken UTF-32 string into UTF-16LE string.
5684
   *
5685
   * During the conversion also validation of the input string is done.
5686
   * This function is suitable to work with inputs from untrusted sources.
5687
   *
5688
   * This function is not BOM-aware.
5689
   *
5690
   * @param input         the UTF-32 string to convert
5691
   * @param length        the length of the string in 4-byte code units
5692
   * (char32_t)
5693
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
5694
   * @return number of written code units; 0 if input is not a valid UTF-32
5695
   * string
5696
   */
5697
  simdutf_warn_unused virtual size_t
5698
  convert_utf32_to_utf16le(const char32_t *input, size_t length,
5699
                           char16_t *utf16_buffer) const noexcept = 0;
5700
5701
  /**
5702
   * Convert possibly broken UTF-32 string into UTF-16BE string.
5703
   *
5704
   * During the conversion also validation of the input string is done.
5705
   * This function is suitable to work with inputs from untrusted sources.
5706
   *
5707
   * This function is not BOM-aware.
5708
   *
5709
   * @param input         the UTF-32 string to convert
5710
   * @param length        the length of the string in 4-byte code units
5711
   * (char32_t)
5712
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
5713
   * @return number of written code units; 0 if input is not a valid UTF-32
5714
   * string
5715
   */
5716
  simdutf_warn_unused virtual size_t
5717
  convert_utf32_to_utf16be(const char32_t *input, size_t length,
5718
                           char16_t *utf16_buffer) const noexcept = 0;
5719
5720
  /**
5721
   * Convert possibly broken UTF-32 string into UTF-16LE string and stop on
5722
   * error.
5723
   *
5724
   * During the conversion also validation of the input string is done.
5725
   * This function is suitable to work with inputs from untrusted sources.
5726
   *
5727
   * This function is not BOM-aware.
5728
   *
5729
   * @param input         the UTF-32 string to convert
5730
   * @param length        the length of the string in 4-byte code units
5731
   * (char32_t)
5732
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
5733
   * @return a result pair struct (of type simdutf::result containing the two
5734
   * fields error and count) with an error code and either position of the error
5735
   * (in the input in code units) if any, or the number of char16_t written if
5736
   * successful.
5737
   */
5738
  simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(
5739
      const char32_t *input, size_t length,
5740
      char16_t *utf16_buffer) const noexcept = 0;
5741
5742
  /**
5743
   * Convert possibly broken UTF-32 string into UTF-16BE string and stop on
5744
   * error.
5745
   *
5746
   * During the conversion also validation of the input string is done.
5747
   * This function is suitable to work with inputs from untrusted sources.
5748
   *
5749
   * This function is not BOM-aware.
5750
   *
5751
   * @param input         the UTF-32 string to convert
5752
   * @param length        the length of the string in 4-byte code units
5753
   * (char32_t)
5754
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
5755
   * @return a result pair struct (of type simdutf::result containing the two
5756
   * fields error and count) with an error code and either position of the error
5757
   * (in the input in code units) if any, or the number of char16_t written if
5758
   * successful.
5759
   */
5760
  simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(
5761
      const char32_t *input, size_t length,
5762
      char16_t *utf16_buffer) const noexcept = 0;
5763
5764
  /**
5765
   * Convert valid UTF-32 string into UTF-16LE string.
5766
   *
5767
   * This function assumes that the input string is valid UTF-32.
5768
   *
5769
   * This function is not BOM-aware.
5770
   *
5771
   * @param input         the UTF-32 string to convert
5772
   * @param length        the length of the string in 4-byte code units
5773
   * (char32_t)
5774
   * @param utf16_buffer   the pointer to a buffer that can hold the conversion
5775
   * result
5776
   * @return number of written code units; 0 if conversion is not possible
5777
   */
5778
  simdutf_warn_unused virtual size_t
5779
  convert_valid_utf32_to_utf16le(const char32_t *input, size_t length,
5780
                                 char16_t *utf16_buffer) const noexcept = 0;
5781
5782
  /**
5783
   * Convert valid UTF-32 string into UTF-16BE string.
5784
   *
5785
   * This function assumes that the input string is valid UTF-32.
5786
   *
5787
   * This function is not BOM-aware.
5788
   *
5789
   * @param input         the UTF-32 string to convert
5790
   * @param length        the length of the string in 4-byte code units
5791
   * (char32_t)
5792
   * @param utf16_buffer   the pointer to a buffer that can hold the conversion
5793
   * result
5794
   * @return number of written code units; 0 if conversion is not possible
5795
   */
5796
  simdutf_warn_unused virtual size_t
5797
  convert_valid_utf32_to_utf16be(const char32_t *input, size_t length,
5798
                                 char16_t *utf16_buffer) const noexcept = 0;
5799
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5800
5801
#if SIMDUTF_FEATURE_UTF16
5802
  /**
5803
   * Change the endianness of the input. Can be used to go from UTF-16LE to
5804
   * UTF-16BE or from UTF-16BE to UTF-16LE.
5805
   *
5806
   * This function does not validate the input.
5807
   *
5808
   * This function is not BOM-aware.
5809
   *
5810
   * @param input         the UTF-16 string to process
5811
   * @param length        the length of the string in 2-byte code units
5812
   * (char16_t)
5813
   * @param output        the pointer to a buffer that can hold the conversion
5814
   * result
5815
   */
5816
  virtual void change_endianness_utf16(const char16_t *input, size_t length,
5817
                                       char16_t *output) const noexcept = 0;
5818
#endif // SIMDUTF_FEATURE_UTF16
5819
5820
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5821
  /**
5822
   * Return the number of bytes that this Latin1 string would require in UTF-8
5823
   * format.
5824
   *
5825
   * @param input         the Latin1 string to convert
5826
   * @param length        the length of the string in bytes
5827
   * @return the number of bytes required to encode the Latin1 string as UTF-8
5828
   */
5829
  simdutf_warn_unused virtual size_t
5830
  utf8_length_from_latin1(const char *input, size_t length) const noexcept = 0;
5831
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5832
5833
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5834
  /**
5835
   * Compute the number of bytes that this UTF-32 string would require in UTF-8
5836
   * format.
5837
   *
5838
   * This function does not validate the input. It is acceptable to pass invalid
5839
   * UTF-32 strings but in such cases the result is implementation defined.
5840
   *
5841
   * @param input         the UTF-32 string to convert
5842
   * @param length        the length of the string in 4-byte code units
5843
   * (char32_t)
5844
   * @return the number of bytes required to encode the UTF-32 string as UTF-8
5845
   */
5846
  simdutf_warn_unused virtual size_t
5847
  utf8_length_from_utf32(const char32_t *input,
5848
                         size_t length) const noexcept = 0;
5849
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5850
5851
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5852
  /**
5853
   * Compute the number of bytes that this UTF-32 string would require in Latin1
5854
   * format.
5855
   *
5856
   * This function does not validate the input. It is acceptable to pass invalid
5857
   * UTF-32 strings but in such cases the result is implementation defined.
5858
   *
5859
   * @param length        the length of the string in 4-byte code units
5860
   * (char32_t)
5861
   * @return the number of bytes required to encode the UTF-32 string as Latin1
5862
   */
5863
  simdutf_warn_unused virtual size_t
5864
0
  latin1_length_from_utf32(size_t length) const noexcept {
5865
0
    return length;
5866
0
  }
5867
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5868
5869
#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5870
  /**
5871
   * Compute the number of bytes that this UTF-8 string would require in Latin1
5872
   * format.
5873
   *
5874
   * This function does not validate the input. It is acceptable to pass invalid
5875
   * UTF-8 strings but in such cases the result is implementation defined.
5876
   *
5877
   * @param input         the UTF-8 string to convert
5878
   * @param length        the length of the string in bytes
5879
   * @return the number of bytes required to encode the UTF-8 string as Latin1
5880
   */
5881
  simdutf_warn_unused virtual size_t
5882
  latin1_length_from_utf8(const char *input, size_t length) const noexcept = 0;
5883
#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5884
5885
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5886
  /**
5887
   * Compute the number of bytes that this UTF-16LE/BE string would require in
5888
   * Latin1 format.
5889
   *
5890
   * This function does not validate the input. It is acceptable to pass invalid
5891
   * UTF-16 strings but in such cases the result is implementation defined.
5892
   *
5893
   * This function is not BOM-aware.
5894
   *
5895
   * @param length        the length of the UTF-16LE/BE string in 2-byte code
5896
   * units (char16_t); the string content itself is not inspected, only its
5897
   * length
5898
   * @return the number of bytes required to encode the UTF-16LE/BE string as
5899
   * Latin1
5900
   */
5901
  simdutf_warn_unused virtual size_t
5902
0
  latin1_length_from_utf16(size_t length) const noexcept {
5903
0
    return length;
5904
0
  }
5905
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5906
5907
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5908
  /**
5909
   * Compute the number of two-byte code units that this UTF-32 string would
5910
   * require in UTF-16 format.
5911
   *
5912
   * This function does not validate the input. It is acceptable to pass invalid
5913
   * UTF-32 strings but in such cases the result is implementation defined.
5914
   *
5915
   * @param input         the UTF-32 string to convert
5916
   * @param length        the length of the string in 4-byte code units
5917
   * (char32_t)
5918
   * @return number of 2-byte code units needed to encode the string as UTF-16
5919
   */
5920
  simdutf_warn_unused virtual size_t
5921
  utf16_length_from_utf32(const char32_t *input,
5922
                          size_t length) const noexcept = 0;
5923
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5924
5925
#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5926
  /**
5927
   * Return the number of 4-byte code units (char32_t) that a Latin1 string of
5928
   * the given length would require in UTF-32 format.
5929
   *
5930
   * @param length        the length of the Latin1 string in bytes
5931
   * @return the number of char32_t code units required to encode the Latin1
5932
   * string as UTF-32
5933
   */
5934
  simdutf_warn_unused virtual size_t
5935
0
  utf32_length_from_latin1(size_t length) const noexcept {
5936
0
    return length;
5937
0
  }
5938
#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5939
5940
#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5941
  /**
5942
   * Compute the number of 4-byte code units (char32_t) that this UTF-16LE
5943
   * string would require in UTF-32 format.
5944
   *
5945
   * This function is equivalent to count_utf16le.
5946
   *
5947
   * This function does not validate the input. It is acceptable to pass invalid
5948
   * UTF-16 strings but in such cases the result is implementation defined.
5949
   *
5950
   * This function is not BOM-aware.
5951
   *
5952
   * @param input         the UTF-16LE string to convert
5953
   * @param length        the length of the string in 2-byte code units
5954
   * (char16_t)
5955
   * @return the number of 4-byte code units required to encode the UTF-16LE
5956
   * string as UTF-32
5957
   */
5958
  simdutf_warn_unused virtual size_t
5959
  utf32_length_from_utf16le(const char16_t *input,
5960
                            size_t length) const noexcept = 0;
5961
5962
  /**
5963
   * Compute the number of 4-byte code units (char32_t) that this UTF-16BE
5964
   * string would require in UTF-32 format.
5965
   *
5966
   * This function is equivalent to count_utf16be.
5967
   *
5968
   * This function does not validate the input. It is acceptable to pass invalid
5969
   * UTF-16 strings but in such cases the result is implementation defined.
5970
   *
5971
   * This function is not BOM-aware.
5972
   *
5973
   * @param input         the UTF-16BE string to convert
5974
   * @param length        the length of the string in 2-byte code units
5975
   * (char16_t)
5976
   * @return the number of 4-byte code units required to encode the UTF-16BE
5977
   * string as UTF-32
5978
   */
5979
  simdutf_warn_unused virtual size_t
5980
  utf32_length_from_utf16be(const char16_t *input,
5981
                            size_t length) const noexcept = 0;
5982
#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
5983
5984
#if SIMDUTF_FEATURE_UTF16
5985
  /**
5986
   * Count the number of code points (characters) in the string assuming that
5987
   * it is valid.
5988
   *
5989
   * This function assumes that the input string is valid UTF-16LE.
5990
   * It is acceptable to pass invalid UTF-16 strings but in such cases
5991
   * the result is implementation defined.
5992
   *
5993
   * This function is not BOM-aware.
5994
   *
5995
   * @param input         the UTF-16LE string to process
5996
   * @param length        the length of the string in 2-byte code units
5997
   * (char16_t)
5998
   * @return number of code points
5999
   */
6000
  simdutf_warn_unused virtual size_t
6001
  count_utf16le(const char16_t *input, size_t length) const noexcept = 0;
6002
6003
  /**
6004
   * Count the number of code points (characters) in the string assuming that
6005
   * it is valid.
6006
   *
6007
   * This function assumes that the input string is valid UTF-16BE.
6008
   * It is acceptable to pass invalid UTF-16 strings but in such cases
6009
   * the result is implementation defined.
6010
   *
6011
   * This function is not BOM-aware.
6012
   *
6013
   * @param input         the UTF-16BE string to process
6014
   * @param length        the length of the string in 2-byte code units
6015
   * (char16_t)
6016
   * @return number of code points
6017
   */
6018
  simdutf_warn_unused virtual size_t
6019
  count_utf16be(const char16_t *input, size_t length) const noexcept = 0;
6020
#endif // SIMDUTF_FEATURE_UTF16
6021
6022
#if SIMDUTF_FEATURE_UTF8
6023
  /**
6024
   * Count the number of code points (characters) in the string assuming that
6025
   * it is valid.
6026
   *
6027
   * This function assumes that the input string is valid UTF-8.
6028
   * It is acceptable to pass invalid UTF-8 strings but in such cases
6029
   * the result is implementation defined.
6030
   *
6031
   * @param input         the UTF-8 string to process
6032
   * @param length        the length of the string in bytes
6033
   * @return number of code points
6034
   */
6035
  simdutf_warn_unused virtual size_t
6036
  count_utf8(const char *input, size_t length) const noexcept = 0;
6037
#endif // SIMDUTF_FEATURE_UTF8
6038
6039
#if SIMDUTF_FEATURE_BASE64
6040
  /**
6041
   * Provide the maximal binary length in bytes given the base64 input.
6042
   * In general, if the input contains ASCII spaces, the result will be less
6043
   * than the maximum length. It is acceptable to pass invalid base64 strings
6044
   * but in such cases the result is implementation defined.
6045
   *
6046
   * @param input         the base64 input to process
6047
   * @param length        the length of the base64 input in bytes
6048
   * @return maximal number of binary bytes
6049
   */
6050
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
6051
      const char *input, size_t length) const noexcept;
6052
6053
  /**
6054
   * Provide the maximal binary length in bytes given the base64 input.
6055
   * In general, if the input contains ASCII spaces, the result will be less
6056
   * than the maximum length. It is acceptable to pass invalid base64 strings
6057
   * but in such cases the result is implementation defined.
6058
   *
6059
   * @param input         the base64 input to process, in ASCII stored as 16-bit
6060
   * units
6061
   * @param length        the length of the base64 input in 16-bit units
6062
   * @return maximal number of binary bytes
6063
   */
6064
  simdutf_warn_unused size_t maximal_binary_length_from_base64(
6065
      const char16_t *input, size_t length) const noexcept;
6066
6067
  /**
6068
   * Convert a base64 input to a binary output.
6069
   *
6070
   * This function follows the WHATWG forgiving-base64 format, which means that
6071
   * it will ignore any ASCII spaces in the input. You may provide a padded
6072
   * input (with one or two equal signs at the end) or an unpadded input
6073
   * (without any equal signs at the end).
6074
   *
6075
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
6076
   *
6077
   * This function will fail in case of invalid input. When last_chunk_options =
6078
   * loose, there are two possible reasons for failure: the input contains a
6079
   * number of base64 characters that when divided by 4, leaves a single
6080
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
6081
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
6082
   *
6083
   * You should call this function with a buffer that is at least
6084
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
6085
   * provide that much space, the function may cause a buffer overflow.
6086
   *
6087
   * @param input         the base64 string to process
6088
   * @param length        the length of the string in bytes
6089
   * @param output        the pointer to a buffer that can hold the conversion
6090
   * result (should be at least maximal_binary_length_from_base64(input, length)
6091
   * bytes long).
6092
   * @param options       the base64 options to use, can be base64_default or
6093
   * base64_url, is base64_default by default.
6094
   * @return a result pair struct (of type simdutf::result containing the two
6095
   * fields error and count) with an error code and either position of the error
6096
   * (in the input in bytes) if any, or the number of bytes written if
6097
   * successful.
6098
   */
6099
  simdutf_warn_unused virtual result
6100
  base64_to_binary(const char *input, size_t length, char *output,
6101
                   base64_options options = base64_default,
6102
                   last_chunk_handling_options last_chunk_options =
6103
                       last_chunk_handling_options::loose) const noexcept = 0;
6104
6105
  /**
6106
   * Convert a base64 input to a binary output while returning more details
6107
   * than base64_to_binary.
6108
   *
6109
   * This function follows the WHATWG forgiving-base64 format, which means that
6110
   * it will ignore any ASCII spaces in the input. You may provide a padded
6111
   * input (with one or two equal signs at the end) or an unpadded input
6112
   * (without any equal signs at the end).
6113
   *
6114
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
6115
   *
6116
   * This function will fail in case of invalid input. When last_chunk_options =
6117
   * loose, there are two possible reasons for failure: the input contains a
6118
   * number of base64 characters that when divided by 4, leaves a single
6119
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
6120
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
6121
   *
6122
   * You should call this function with a buffer that is at least
6123
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
6124
   * provide that much space, the function may cause a buffer overflow.
6125
   *
6126
   * @param input         the base64 string to process
6127
   * @param length        the length of the string in bytes
6128
   * @param output        the pointer to a buffer that can hold the conversion
6129
   * result (should be at least maximal_binary_length_from_base64(input, length)
6130
   * bytes long).
6131
   * @param options       the base64 options to use, can be base64_default or
6132
   * base64_url, is base64_default by default.
6133
   * @return a full_result struct (of type simdutf::full_result containing the
6134
   * three fields error, input_count and output_count).
6135
   */
6136
  simdutf_warn_unused virtual full_result base64_to_binary_details(
6137
      const char *input, size_t length, char *output,
6138
      base64_options options = base64_default,
6139
      last_chunk_handling_options last_chunk_options =
6140
          last_chunk_handling_options::loose) const noexcept = 0;
6141
  /**
6142
   * Convert a base64 input to a binary output.
6143
   *
6144
   * This function follows the WHATWG forgiving-base64 format, which means that
6145
   * it will ignore any ASCII spaces in the input. You may provide a padded
6146
   * input (with one or two equal signs at the end) or an unpadded input
6147
   * (without any equal signs at the end).
6148
   *
6149
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
6150
   *
6151
   * This function will fail in case of invalid input. When last_chunk_options =
6152
   * loose, there are two possible reasons for failure: the input contains a
6153
   * number of base64 characters that when divided by 4, leaves a single
6154
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
6155
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
6156
   *
6157
   * You should call this function with a buffer that is at least
6158
   * maximal_binary_length_from_base64(input, length) bytes long. If you
6159
   * fail to provide that much space, the function may cause a buffer overflow.
6160
   *
6161
   * @param input         the base64 string to process, in ASCII stored as
6162
   * 16-bit units
6163
   * @param length        the length of the string in 16-bit units
6164
   * @param output        the pointer to a buffer that can hold the conversion
6165
   * result (should be at least maximal_binary_length_from_base64(input, length)
6166
   * bytes long).
6167
   * @param options       the base64 options to use, can be base64_default or
6168
   * base64_url, is base64_default by default.
6169
   * @return a result pair struct (of type simdutf::result containing the two
6170
   * fields error and count) with an error code and position of the
6171
   * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the
6172
   * number of bytes written if successful.
6173
   */
6174
  simdutf_warn_unused virtual result
6175
  base64_to_binary(const char16_t *input, size_t length, char *output,
6176
                   base64_options options = base64_default,
6177
                   last_chunk_handling_options last_chunk_options =
6178
                       last_chunk_handling_options::loose) const noexcept = 0;
6179
6180
  /**
6181
   * Convert a base64 input to a binary output while returning more details
6182
   * than base64_to_binary.
6183
   *
6184
   * This function follows the WHATWG forgiving-base64 format, which means that
6185
   * it will ignore any ASCII spaces in the input. You may provide a padded
6186
   * input (with one or two equal signs at the end) or an unpadded input
6187
   * (without any equal signs at the end).
6188
   *
6189
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
6190
   *
6191
   * This function will fail in case of invalid input. When last_chunk_options =
6192
   * loose, there are two possible reasons for failure: the input contains a
6193
   * number of base64 characters that when divided by 4, leaves a single
6194
   * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
6195
   * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
6196
   *
6197
   * You should call this function with a buffer that is at least
6198
   * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
6199
   * provide that much space, the function may cause a buffer overflow.
6200
   *
6201
   * @param input         the base64 string to process, as 16-bit units
6202
   * @param length        the length of the string in 16-bit units
6203
   * @param output        the pointer to a buffer that can hold the conversion
6204
   * result (should be at least maximal_binary_length_from_base64(input, length)
6205
   * bytes long).
6206
   * @param options       the base64 options to use, can be base64_default or
6207
   * base64_url, is base64_default by default.
6208
   * @return a full_result struct (of type simdutf::full_result containing the
6209
   * three fields error, input_count and output_count).
6210
   */
6211
  simdutf_warn_unused virtual full_result base64_to_binary_details(
6212
      const char16_t *input, size_t length, char *output,
6213
      base64_options options = base64_default,
6214
      last_chunk_handling_options last_chunk_options =
6215
          last_chunk_handling_options::loose) const noexcept = 0;
6216
  /**
6217
   * Provide the base64 length in bytes given the length of a binary input.
6218
   *
6219
   * @param length        the length of the input in bytes
6220
   * @param options       the base64 options to use, can be base64_default or
6221
   * base64_url, is base64_default by default.
6222
   * @return number of base64 bytes
6223
   */
6224
  simdutf_warn_unused size_t base64_length_from_binary(
6225
      size_t length, base64_options options = base64_default) const noexcept;
6226
6227
  /**
6228
   * Convert a binary input to a base64 output.
6229
   *
6230
   * The default option (simdutf::base64_default) uses the characters `+` and
6231
   * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of
6232
   * the output to ensure that the output length is a multiple of four.
6233
   *
6234
   * The URL option (simdutf::base64_url) uses the characters `-` and `_` as
6235
   * part of its alphabet. No padding is added at the end of the output.
6236
   *
6237
   * This function always succeeds.
6238
   *
6239
   * @param input         the binary to process
6240
   * @param length        the length of the input in bytes
6241
   * @param output        the pointer to a buffer that can hold the conversion
6242
   * result (should be at least base64_length_from_binary(length) bytes long)
6243
   * @param options       the base64 options to use, can be base64_default or
6244
   * base64_url, is base64_default by default.
6245
   * @return number of written bytes, will be equal to
6246
   * base64_length_from_binary(length, options)
6247
   */
6248
  virtual size_t
6249
  binary_to_base64(const char *input, size_t length, char *output,
6250
                   base64_options options = base64_default) const noexcept = 0;
6251
  /**
6252
   * Find the first occurrence of a character in a string. If the character is
6253
   * not found, return a pointer to the end of the string.
6254
   * @param start        the start of the string
6255
   * @param end          the end of the string
6256
   * @param character    the character to find
6257
   * @return a pointer to the first occurrence of the character in the string,
6258
   * or a pointer to the end of the string if the character is not found.
6259
   *
6260
   */
6261
  virtual const char *find(const char *start, const char *end,
6262
                           char character) const noexcept = 0;
6263
  virtual const char16_t *find(const char16_t *start, const char16_t *end,
6264
                               char16_t character) const noexcept = 0;
6265
#endif // SIMDUTF_FEATURE_BASE64
6266
6267
#ifdef SIMDUTF_INTERNAL_TESTS
6268
  // This method is only declared in developer builds (when
6269
  // SIMDUTF_INTERNAL_TESTS is defined). It exposes internal test
6270
  // procedures of the given implementation so that they can be run
6271
  // through our standard test framework.
6272
  //
6273
  // Regular users should not use it; the tests of the public
6274
  // API are enough.
6275
6276
  struct TestProcedure {
6277
    // display name of the test
6278
    std::string name;
6279
6280
    // test body; returns void, so pass/fail is not signalled via return value
6281
    void (*procedure)(const implementation &);
6282
  };
6283
6284
  virtual std::vector<TestProcedure> internal_tests() const;
6285
#endif
6286
6287
protected:
6288
  /** @private Construct an implementation with the given name and description.
6289
   * For subclasses (the constructor is protected). The `name` and
   * `description` pointers are stored as-is, no copy of the text is made;
   * `required_instruction_sets` is stored by value. */
6290
  simdutf_really_inline implementation(const char *name,
6291
                                       const char *description,
6292
                                       uint32_t required_instruction_sets)
6293
      : _name(name), _description(description),
6294
0
        _required_instruction_sets(required_instruction_sets) {}
6295
6296
protected:
6297
  // Protected, defaulted destructor (not declared `virtual` here): code
  // outside this class, its subclasses and friends cannot destroy an object
  // through an `implementation` pointer or reference.
  ~implementation() = default;
6298
6299
private:
6300
  /**
6301
   * The name of this implementation (pointer received by the constructor).
6302
   */
6303
  const char *_name;
6304
6305
  /**
6306
   * The description of this implementation (pointer received by the
   * constructor).
6307
   */
6308
  const char *_description;
6309
6310
  /**
6311
   * Instruction sets required for this implementation; fixed at construction.
6312
   */
6313
  const uint32_t _required_instruction_sets;
6314
};
6315
6316
/** @private */
6317
namespace internal {
6318
6319
/**
6320
 * The list of available implementations compiled into simdutf.
6321
 */
6322
class available_implementation_list {
6323
public:
6324
  /** Default constructor; the class has no data members, the body is empty */
6325
0
  simdutf_really_inline available_implementation_list() {}
6326
  /** Number of implementations */
6327
  size_t size() const noexcept;
6328
  /** STL-style const begin() iterator over the implementation pointers */
6329
  const implementation *const *begin() const noexcept;
6330
  /** STL-style const end() iterator over the implementation pointers */
6331
  const implementation *const *end() const noexcept;
6332
6333
  /**
6334
   * Get the implementation with the given name.
6335
   *
6336
   * Case sensitive (names are compared with `==`).
6337
   *
6338
   *     const implementation *impl =
6339
   *         simdutf::get_available_implementations()["westmere"];
6340
   *     if (!impl || !impl->supported_by_runtime_system()) { exit(1); }
6341
   *     simdutf::get_active_implementation() = impl;
6342
   *
6343
   * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
6344
   * @return the implementation, or nullptr if no name matches.
6345
   */
6346
0
  const implementation *operator[](const std::string &name) const noexcept {
6347
0
    for (const implementation *impl : *this) {
6348
0
      if (impl->name() == name) {
6349
0
        return impl;
6350
0
      }
6351
0
    }
6352
0
    return nullptr;
6353
0
  }
6354
6355
  /**
6356
   * Detect the most advanced implementation supported by the current host.
6357
   *
6358
   * This is used to initialize the implementation on startup.
6359
   *
6360
   *     const implementation *impl =
6361
   *         simdutf::get_available_implementations().detect_best_supported();
6362
   *     simdutf::get_active_implementation() = impl;
6363
   *
6364
   * @return the most advanced supported implementation for the current host, or
6365
   * an implementation that returns UNSUPPORTED_ARCHITECTURE if there is no
6366
   * supported implementation. Will never return nullptr.
6367
   */
6368
  const implementation *detect_best_supported() const noexcept;
6369
};
6370
6371
/**
 * Thin wrapper around a `T *`. Unless SIMDUTF_NO_THREADS is defined, the
 * pointer is held in a `std::atomic<T *>`, so reads and writes of the pointer
 * value itself are atomic; with SIMDUTF_NO_THREADS it is a plain `T *`.
 * Both variants expose the same conversion, dereference and assignment
 * operators.
 */
template <typename T> class atomic_ptr {
6372
public:
6373
  // Not `explicit`: a `T *` converts implicitly to an atomic_ptr.
  atomic_ptr(T *_ptr) : ptr{_ptr} {}
6374
6375
  // Single-threaded variant: the operators act directly on the raw pointer.
#if defined(SIMDUTF_NO_THREADS)
6376
  operator const T *() const { return ptr; }
6377
  const T &operator*() const { return *ptr; }
6378
  const T *operator->() const { return ptr; }
6379
6380
  operator T *() { return ptr; }
6381
  T &operator*() { return *ptr; }
6382
  T *operator->() { return ptr; }
6383
  atomic_ptr &operator=(T *_ptr) {
6384
    ptr = _ptr;
6385
    return *this;
6386
  }
6387
6388
  // Default variant: the pointer is read with std::atomic::load(); `*ptr` and
  // `ptr = _ptr` go through std::atomic's own conversion and assignment
  // operators.
#else
6389
  operator const T *() const { return ptr.load(); }
6390
  const T &operator*() const { return *ptr; }
6391
  const T *operator->() const { return ptr.load(); }
6392
6393
  operator T *() { return ptr.load(); }
6394
  T &operator*() { return *ptr; }
6395
  T *operator->() { return ptr.load(); }
6396
  atomic_ptr &operator=(T *_ptr) {
6397
    ptr = _ptr;
6398
    return *this;
6399
  }
6400
6401
#endif
6402
6403
private:
6404
  // The wrapped pointer; its type depends on SIMDUTF_NO_THREADS.
#if defined(SIMDUTF_NO_THREADS)
6405
  T *ptr;
6406
#else
6407
  std::atomic<T *> ptr;
6408
#endif
6409
};
6410
6411
class detect_best_supported_implementation_on_first_use;
6412
6413
} // namespace internal
6414
6415
/**
6416
 * The list of available implementations compiled into simdutf.
 *
 * @return a const reference to the list; it can be iterated with
 * begin()/end() or indexed by implementation name with operator[].
6417
 */
6418
extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list &
6419
get_available_implementations();
6420
6421
/**
6422
 * The active implementation.
6423
 *
6424
 * Automatically initialized on first use to the most advanced implementation
6425
 * supported by this hardware.
 *
 * @return a non-const reference to the atomic_ptr holding the active
 * implementation; assigning a `const implementation *` to it replaces the
 * stored pointer.
6426
 */
6427
extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> &
6428
get_active_implementation();
6429
6430
} // namespace simdutf
6431
6432
#endif // SIMDUTF_IMPLEMENTATION_H
6433
/* end file include/simdutf/implementation.h */
6434
6435
// Implementation-internal files (must be included before the implementations
6436
// themselves, to keep amalgamation working--otherwise, the first time a file is
6437
// included, it might be put inside the #ifdef
6438
// SIMDUTF_IMPLEMENTATION_ARM64/FALLBACK/etc., which means the other
6439
// implementations can't compile unless that implementation is turned on).
6440
6441
SIMDUTF_POP_DISABLE_WARNINGS
6442
6443
#endif // SIMDUTF_H
6444
/* end file include/simdutf.h */