Coverage Report

Created: 2025-07-04 09:33

/src/node/deps/simdutf/simdutf.h
Line
Count
Source (jump to first uncovered line)
1
/* auto-generated on 2024-03-18 10:58:28 -0400. Do not edit! */
2
/* begin file include/simdutf.h */
3
#ifndef SIMDUTF_H
4
#define SIMDUTF_H
5
#include <cstring>
6
7
/* begin file include/simdutf/compiler_check.h */
8
#ifndef SIMDUTF_COMPILER_CHECK_H
9
#define SIMDUTF_COMPILER_CHECK_H
10
11
#ifndef __cplusplus
12
#error simdutf requires a C++ compiler
13
#endif
14
15
#ifndef SIMDUTF_CPLUSPLUS
16
#if defined(_MSVC_LANG) && !defined(__clang__)
17
#define SIMDUTF_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG)
18
#else
19
#define SIMDUTF_CPLUSPLUS __cplusplus
20
#endif
21
#endif
22
23
// C++ 17
24
#if !defined(SIMDUTF_CPLUSPLUS17) && (SIMDUTF_CPLUSPLUS >= 201703L)
25
#define SIMDUTF_CPLUSPLUS17 1
26
#endif
27
28
// C++ 14
29
#if !defined(SIMDUTF_CPLUSPLUS14) && (SIMDUTF_CPLUSPLUS >= 201402L)
30
#define SIMDUTF_CPLUSPLUS14 1
31
#endif
32
33
// C++ 11
34
#if !defined(SIMDUTF_CPLUSPLUS11) && (SIMDUTF_CPLUSPLUS >= 201103L)
35
#define SIMDUTF_CPLUSPLUS11 1
36
#endif
37
38
#ifndef SIMDUTF_CPLUSPLUS11
39
#error simdutf requires a compiler compliant with the C++11 standard
40
#endif
41
42
#endif // SIMDUTF_COMPILER_CHECK_H
43
/* end file include/simdutf/compiler_check.h */
44
/* begin file include/simdutf/common_defs.h */
45
#ifndef SIMDUTF_COMMON_DEFS_H
46
#define SIMDUTF_COMMON_DEFS_H
47
48
#include <cassert>
49
/* begin file include/simdutf/portability.h */
50
#ifndef SIMDUTF_PORTABILITY_H
51
#define SIMDUTF_PORTABILITY_H
52
53
#include <cstddef>
54
#include <cstdint>
55
#include <cstdlib>
56
#include <cfloat>
57
#include <cassert>
58
#ifndef _WIN32
59
// strcasecmp, strncasecmp
60
#include <strings.h>
61
#endif
62
63
/**
64
 * We want to check that it is actually a little endian system at
65
 * compile-time.
66
 */
67
68
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
69
#define SIMDUTF_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
70
#elif defined(_WIN32)
71
#define SIMDUTF_IS_BIG_ENDIAN 0
72
#else
73
#if defined(__APPLE__) || defined(__FreeBSD__) // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__
74
#include <machine/endian.h>
75
#elif defined(sun) || defined(__sun) // defined(__APPLE__) || defined(__FreeBSD__)
76
#include <sys/byteorder.h>
77
#else  // defined(__APPLE__) || defined(__FreeBSD__)
78
79
#ifdef __has_include
80
#if __has_include(<endian.h>)
81
#include <endian.h>
82
#endif //__has_include(<endian.h>)
83
#endif //__has_include
84
85
#endif // defined(__APPLE__) || defined(__FreeBSD__)
86
87
88
// Fallback endianness detection, used after the platform endian header (if
// any) has been included.
//
// The previous form used `#ifndef` with a full expression, which is not valid
// preprocessor syntax (`#ifndef` accepts a single macro name), and it could
// define SIMDUTF_IS_BIG_ENDIAN twice. A single #if/#elif/#else chain defines
// the macro exactly once.
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__)
// No byte-order information is available: assume a little endian system.
#define SIMDUTF_IS_BIG_ENDIAN 0
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define SIMDUTF_IS_BIG_ENDIAN 0
#else // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define SIMDUTF_IS_BIG_ENDIAN 1
#endif // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
97
98
#endif // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__
99
100
101
/**
102
 * At this point in time, SIMDUTF_IS_BIG_ENDIAN is defined.
103
 */
104
105
#ifdef _MSC_VER
106
#define SIMDUTF_VISUAL_STUDIO 1
107
/**
108
 * We want to differentiate carefully between
109
 * clang under visual studio and regular visual
110
 * studio.
111
 *
112
 * Under clang for Windows, we enable:
113
 *  * target pragmas so that part and only part of the
114
 *     code gets compiled for advanced instructions.
115
 *
116
 */
117
#ifdef __clang__
118
// clang under visual studio
119
#define SIMDUTF_CLANG_VISUAL_STUDIO 1
120
#else
121
// just regular visual studio (best guess)
122
#define SIMDUTF_REGULAR_VISUAL_STUDIO 1
123
#endif // __clang__
124
#endif // _MSC_VER
125
126
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
127
// https://en.wikipedia.org/wiki/C_alternative_tokens
128
// This header should have no effect, except maybe
129
// under Visual Studio.
130
#include <iso646.h>
131
#endif
132
133
#if defined(__x86_64__) || defined(_M_AMD64)
134
#define SIMDUTF_IS_X86_64 1
135
#elif defined(__aarch64__) || defined(_M_ARM64)
136
#define SIMDUTF_IS_ARM64 1
137
#elif defined(__PPC64__) || defined(_M_PPC64)
138
//#define SIMDUTF_IS_PPC64 1
139
// The simdutf library does not yet support SIMD acceleration under
140
// POWER processors. Please see https://github.com/lemire/simdutf/issues/51
141
#elif defined(__s390__)
142
// s390 IBM system. Big endian.
143
#elif (defined(__riscv) || defined(__riscv__)) && __riscv_xlen == 64
144
// RISC-V 64-bit
145
#define SIMDUTF_IS_RISCV64 1
146
147
#if __clang_major__ >= 19
148
// Does the compiler support target regions for RISC-V
149
#define SIMDUTF_HAS_RVV_TARGET_REGION 1
150
#endif
151
152
#if __riscv_v_intrinsic >= 11000 && !(__GNUC__ == 13 && __GNUC_MINOR__ == 2 && __GNUC_PATCHLEVEL__ == 0)
153
#define SIMDUTF_HAS_RVV_INTRINSICS 1
154
#endif
155
156
#define SIMDUTF_HAS_ZVBB_INTRINSICS 0 // there is currently no way to detect this
157
158
#if SIMDUTF_HAS_RVV_INTRINSICS && __riscv_vector && __riscv_v_min_vlen >= 128 && __riscv_v_elen >= 64
159
// RISC-V V extension
160
#define SIMDUTF_IS_RVV 1
161
#if SIMDUTF_HAS_ZVBB_INTRINSICS && __riscv_zvbb >= 1000000
162
// RISC-V Vector Basic Bit-manipulation
163
#define SIMDUTF_IS_ZVBB 1
164
#endif
165
#endif
166
167
#elif defined(__loongarch_lp64)
168
// LoongArch 64-bit
169
#else
170
// The simdutf library is designed
171
// for 64-bit processors and it seems that you are not
172
// compiling for a known 64-bit platform. Please
173
// use a 64-bit target such as x64 or 64-bit ARM for best performance.
174
#define SIMDUTF_IS_32BITS 1
175
176
// We do not support 32-bit platforms, but it can be
177
// handy to identify them.
178
#if defined(_M_IX86) || defined(__i386__)
179
#define SIMDUTF_IS_X86_32BITS 1
180
#elif defined(__arm__) || defined(_M_ARM)
181
#define SIMDUTF_IS_ARM_32BITS 1
182
#elif defined(__PPC__) || defined(_M_PPC)
183
#define SIMDUTF_IS_PPC_32BITS 1
184
#endif
185
186
#endif // defined(__x86_64__) || defined(_M_AMD64)
187
188
#ifdef SIMDUTF_IS_32BITS
189
#ifndef SIMDUTF_NO_PORTABILITY_WARNING
190
// In the future, we may want to warn users of 32-bit systems that
191
// the simdutf does not support accelerated kernels for such systems.
192
#endif // SIMDUTF_NO_PORTABILITY_WARNING
193
#endif // SIMDUTF_IS_32BITS
194
195
// this is almost standard?
196
#define SIMDUTF_STRINGIFY_IMPLEMENTATION_(a) #a
197
#define SIMDUTF_STRINGIFY(a) SIMDUTF_STRINGIFY_IMPLEMENTATION_(a)
198
199
// Our fast kernels require 64-bit systems.
200
//
201
// On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions.
202
// Furthermore, the number of SIMD registers is reduced.
203
//
204
// On 32-bit ARM, we would have smaller registers.
205
//
206
// The simdutf users should still have the fallback kernel. It is
207
// slower, but it should run everywhere.
208
209
//
210
// Enable valid runtime implementations, and select SIMDUTF_BUILTIN_IMPLEMENTATION
211
//
212
213
// We are going to use runtime dispatch.
214
#ifdef SIMDUTF_IS_X86_64
215
#ifdef __clang__
216
// clang does not have GCC push pop
217
// warning: clang attribute push can't be used within a namespace in clang up
218
// til 8.0 so SIMDUTF_TARGET_REGION and SIMDUTF_UNTARGET_REGION must be *outside* of a
219
// namespace.
220
#define SIMDUTF_TARGET_REGION(T)                                                       \
221
  _Pragma(SIMDUTF_STRINGIFY(                                                           \
222
      clang attribute push(__attribute__((target(T))), apply_to = function)))
223
#define SIMDUTF_UNTARGET_REGION _Pragma("clang attribute pop")
224
#elif defined(__GNUC__)
225
// GCC is easier
226
#define SIMDUTF_TARGET_REGION(T)                                                       \
227
  _Pragma("GCC push_options") _Pragma(SIMDUTF_STRINGIFY(GCC target(T)))
228
#define SIMDUTF_UNTARGET_REGION _Pragma("GCC pop_options")
229
#endif // clang then gcc
230
231
#endif // x86
232
233
// Default target region macros don't do anything.
234
#ifndef SIMDUTF_TARGET_REGION
235
#define SIMDUTF_TARGET_REGION(T)
236
#define SIMDUTF_UNTARGET_REGION
237
#endif
238
239
// Is threading enabled?
240
#if defined(_REENTRANT) || defined(_MT)
241
#ifndef SIMDUTF_THREADS_ENABLED
242
#define SIMDUTF_THREADS_ENABLED
243
#endif
244
#endif
245
246
// workaround for large stack sizes under -O0.
247
// https://github.com/simdutf/simdutf/issues/691
248
#ifdef __APPLE__
249
#ifndef __OPTIMIZE__
250
// Apple systems have small stack sizes in secondary threads.
251
// Lack of compiler optimization may generate high stack usage.
252
// Users may want to disable threads for safety, but only when
253
// in debug mode which we detect by the fact that the __OPTIMIZE__
254
// macro is not defined.
255
#undef SIMDUTF_THREADS_ENABLED
256
#endif
257
#endif
258
259
#ifdef SIMDUTF_VISUAL_STUDIO
260
// This is one case where we do not distinguish between
261
// regular visual studio and clang under visual studio.
262
// clang under Windows has _stricmp (like visual studio) but not strcasecmp (as clang normally has)
263
#define simdutf_strcasecmp _stricmp
264
#define simdutf_strncasecmp _strnicmp
265
#else
266
// The strcasecmp, strncasecmp, and strcasestr functions do not work with multibyte strings (e.g. UTF-8).
267
// So they are only useful for ASCII in our context.
268
// https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings
269
#define simdutf_strcasecmp strcasecmp
270
#define simdutf_strncasecmp strncasecmp
271
#endif
272
273
// SIMDUTF_UNREACHABLE() marks a point that control flow must never reach and
// SIMDUTF_ASSUME(COND) states a condition that must hold.
//
// When NDEBUG is defined they expand to compiler hints (__assume under Visual
// Studio, __builtin_unreachable elsewhere); otherwise they expand to assert so
// that violations are caught in debug builds.
//
// None of the expansions ends with a semicolon: the caller supplies it. This
// keeps every variant usable as a single statement, e.g. in
// `if (x) SIMDUTF_UNREACHABLE(); else ...`.
#ifdef NDEBUG

#ifdef SIMDUTF_VISUAL_STUDIO
#define SIMDUTF_UNREACHABLE() __assume(0)
#define SIMDUTF_ASSUME(COND) __assume(COND)
#else
#define SIMDUTF_UNREACHABLE() __builtin_unreachable()
#define SIMDUTF_ASSUME(COND) do { if (!(COND)) __builtin_unreachable(); } while (0)
#endif

#else // NDEBUG

#define SIMDUTF_UNREACHABLE() assert(0)
#define SIMDUTF_ASSUME(COND) assert(COND)

#endif // NDEBUG
289
290
291
#if defined(__GNUC__) && !defined(__clang__)
292
#if __GNUC__ >= 11
293
#define SIMDUTF_GCC11ORMORE 1
294
#endif //  __GNUC__ >= 11
295
#endif // defined(__GNUC__) && !defined(__clang__)
296
297
298
#endif // SIMDUTF_PORTABILITY_H
299
/* end file include/simdutf/portability.h */
300
/* begin file include/simdutf/avx512.h */
301
#ifndef SIMDUTF_AVX512_H_
302
#define SIMDUTF_AVX512_H_
303
304
/*
305
    It's possible to override AVX512 settings with cmake DCMAKE_CXX_FLAGS.
306
307
    All preprocessor directives have the form `SIMDUTF_HAS_AVX512{feature}`,
308
    where a feature is a code name for extensions.
309
310
    Please see the listing below to find which are supported.
311
*/
312
313
#ifndef SIMDUTF_HAS_AVX512F
314
# if defined(__AVX512F__) && __AVX512F__ == 1
315
#   define SIMDUTF_HAS_AVX512F 1
316
# endif
317
#endif
318
319
#ifndef SIMDUTF_HAS_AVX512DQ
320
# if defined(__AVX512DQ__) && __AVX512DQ__ == 1
321
#   define SIMDUTF_HAS_AVX512DQ 1
322
# endif
323
#endif
324
325
#ifndef SIMDUTF_HAS_AVX512IFMA
326
# if defined(__AVX512IFMA__) && __AVX512IFMA__ == 1
327
#   define SIMDUTF_HAS_AVX512IFMA 1
328
# endif
329
#endif
330
331
#ifndef SIMDUTF_HAS_AVX512CD
332
# if defined(__AVX512CD__) && __AVX512CD__ == 1
333
#   define SIMDUTF_HAS_AVX512CD 1
334
# endif
335
#endif
336
337
#ifndef SIMDUTF_HAS_AVX512BW
338
# if defined(__AVX512BW__) && __AVX512BW__ == 1
339
#   define SIMDUTF_HAS_AVX512BW 1
340
# endif
341
#endif
342
343
#ifndef SIMDUTF_HAS_AVX512VL
344
# if defined(__AVX512VL__) && __AVX512VL__ == 1
345
#   define SIMDUTF_HAS_AVX512VL 1
346
# endif
347
#endif
348
349
#ifndef SIMDUTF_HAS_AVX512VBMI
350
# if defined(__AVX512VBMI__) && __AVX512VBMI__ == 1
351
#   define SIMDUTF_HAS_AVX512VBMI 1
352
# endif
353
#endif
354
355
#ifndef SIMDUTF_HAS_AVX512VBMI2
356
# if defined(__AVX512VBMI2__) && __AVX512VBMI2__ == 1
357
#   define SIMDUTF_HAS_AVX512VBMI2 1
358
# endif
359
#endif
360
361
#ifndef SIMDUTF_HAS_AVX512VNNI
362
# if defined(__AVX512VNNI__) && __AVX512VNNI__ == 1
363
#   define SIMDUTF_HAS_AVX512VNNI 1
364
# endif
365
#endif
366
367
#ifndef SIMDUTF_HAS_AVX512BITALG
368
# if defined(__AVX512BITALG__) && __AVX512BITALG__ == 1
369
#   define SIMDUTF_HAS_AVX512BITALG 1
370
# endif
371
#endif
372
373
#ifndef SIMDUTF_HAS_AVX512VPOPCNTDQ
374
# if defined(__AVX512VPOPCNTDQ__) && __AVX512VPOPCNTDQ__ == 1
375
#   define SIMDUTF_HAS_AVX512VPOPCNTDQ 1
376
# endif
377
#endif
378
379
#endif // SIMDUTF_AVX512_H_
380
/* end file include/simdutf/avx512.h */
381
382
383
// Helpers for marking a region of code so that LLVM-MCA can analyse it.
//
//   SIMDUTF_BEGIN_DEBUG_BLOCK(name) / SIMDUTF_END_DEBUG_BLOCK(name)
//       emit the "# LLVM-MCA-BEGIN name" / "# LLVM-MCA-END name" assembler
//       comments (GNU-compatible compilers only; empty elsewhere).
//   SIMDUTF_DEBUG_BLOCK(name, block)
//       runs `block` surrounded by the two markers.
#if defined(__GNUC__)
  // Marks a block with a name so that MCA analysis can see it.
  #define SIMDUTF_BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
  #define SIMDUTF_END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
  // The inner macros must carry the SIMDUTF_ prefix: the unprefixed names are
  // not defined anywhere.
  #define SIMDUTF_DEBUG_BLOCK(name, block) SIMDUTF_BEGIN_DEBUG_BLOCK(name); block; SIMDUTF_END_DEBUG_BLOCK(name);
#else
  #define SIMDUTF_BEGIN_DEBUG_BLOCK(name)
  #define SIMDUTF_END_DEBUG_BLOCK(name)
  // Without marker support the wrapped code must still run; only the markers
  // are dropped.
  #define SIMDUTF_DEBUG_BLOCK(name, block) block;
#endif
393
394
// Align to N-byte boundary
395
#define SIMDUTF_ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1))
396
#define SIMDUTF_ROUNDDOWN_N(a, n) ((a) & ~((n)-1))
397
398
#define SIMDUTF_ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n)-1)) == 0)
399
400
#if defined(SIMDUTF_REGULAR_VISUAL_STUDIO)
401
402
  #define simdutf_really_inline __forceinline
403
  #define simdutf_never_inline __declspec(noinline)
404
405
  #define simdutf_unused
406
  #define simdutf_warn_unused
407
408
  #ifndef simdutf_likely
409
  #define simdutf_likely(x) x
410
  #endif
411
  #ifndef simdutf_unlikely
412
  #define simdutf_unlikely(x) x
413
  #endif
414
415
  #define SIMDUTF_PUSH_DISABLE_WARNINGS __pragma(warning( push ))
416
  #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS __pragma(warning( push, 0 ))
417
  #define SIMDUTF_DISABLE_VS_WARNING(WARNING_NUMBER) __pragma(warning( disable : WARNING_NUMBER ))
418
  // Get rid of Intellisense-only warnings (Code Analysis)
419
  // Though __has_include is C++17, it is supported in Visual Studio 2017 or better (_MSC_VER>=1910).
420
  #ifdef __has_include
421
  #if __has_include(<CppCoreCheck\Warnings.h>)
422
  #include <CppCoreCheck\Warnings.h>
423
  #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS)
424
  #endif
425
  #endif
426
427
  #ifndef SIMDUTF_DISABLE_UNDESIRED_WARNINGS
428
  #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
429
  #endif
430
431
  #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_VS_WARNING(4996)
432
  #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING
433
  #define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning( pop ))
434
435
#else // SIMDUTF_REGULAR_VISUAL_STUDIO
436
437
  #define simdutf_really_inline inline __attribute__((always_inline))
438
  #define simdutf_never_inline inline __attribute__((noinline))
439
440
  #define simdutf_unused __attribute__((unused))
441
  #define simdutf_warn_unused __attribute__((warn_unused_result))
442
443
  #ifndef simdutf_likely
444
  #define simdutf_likely(x) __builtin_expect(!!(x), 1)
445
  #endif
446
  #ifndef simdutf_unlikely
447
  #define simdutf_unlikely(x) __builtin_expect(!!(x), 0)
448
  #endif
449
450
  #define SIMDUTF_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push")
451
  // gcc doesn't seem to disable all warnings with all and extra, add warnings here as necessary
452
  #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS SIMDUTF_PUSH_DISABLE_WARNINGS \
453
    SIMDUTF_DISABLE_GCC_WARNING(-Weffc++) \
454
    SIMDUTF_DISABLE_GCC_WARNING(-Wall) \
455
    SIMDUTF_DISABLE_GCC_WARNING(-Wconversion) \
456
    SIMDUTF_DISABLE_GCC_WARNING(-Wextra) \
457
    SIMDUTF_DISABLE_GCC_WARNING(-Wattributes) \
458
    SIMDUTF_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \
459
    SIMDUTF_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \
460
    SIMDUTF_DISABLE_GCC_WARNING(-Wreturn-type) \
461
    SIMDUTF_DISABLE_GCC_WARNING(-Wshadow) \
462
    SIMDUTF_DISABLE_GCC_WARNING(-Wunused-parameter) \
463
    SIMDUTF_DISABLE_GCC_WARNING(-Wunused-variable)
464
  #define SIMDUTF_PRAGMA(P) _Pragma(#P)
465
  #define SIMDUTF_DISABLE_GCC_WARNING(WARNING) SIMDUTF_PRAGMA(GCC diagnostic ignored #WARNING)
466
  #if defined(SIMDUTF_CLANG_VISUAL_STUDIO)
467
  #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_GCC_WARNING(-Wmicrosoft-include)
468
  #else
469
  #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
470
  #endif
471
  #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wdeprecated-declarations)
472
  #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wstrict-overflow)
473
  #define SIMDUTF_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop")
474
475
476
477
#endif // MSC_VER
478
479
#ifndef SIMDUTF_DLLIMPORTEXPORT
480
    #if defined(SIMDUTF_VISUAL_STUDIO)
481
      /**
482
       * It does not matter here whether you are using
483
       * the regular visual studio or clang under visual
484
       * studio.
485
       */
486
      #if SIMDUTF_USING_LIBRARY
487
      #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport)
488
      #else
489
      #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport)
490
      #endif
491
    #else
492
      #define SIMDUTF_DLLIMPORTEXPORT
493
    #endif
494
#endif
495
496
/// If EXPR is an error, returns it.
497
#define SIMDUTF_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
498
499
500
#endif // SIMDUTF_COMMON_DEFS_H
501
/* end file include/simdutf/common_defs.h */
502
/* begin file include/simdutf/encoding_types.h */
503
#include <string>
504
505
namespace simdutf {

/**
 * Text encodings known to the library. Every enumerator other than
 * `unspecified` has a distinct single-bit value.
 */
enum encoding_type {
  unspecified = 0,
  UTF8 = 1 << 0,     // BOM 0xef 0xbb 0xbf
  UTF16_LE = 1 << 1, // BOM 0xff 0xfe
  UTF16_BE = 1 << 2, // BOM 0xfe 0xff
  UTF32_LE = 1 << 3, // BOM 0xff 0xfe 0x00 0x00
  UTF32_BE = 1 << 4, // BOM 0x00 0x00 0xfe 0xff
  Latin1 = 1 << 5
};

/**
 * Byte order selector: LITTLE is 0 and BIG is 1.
 */
enum endianness { LITTLE, BIG };

// Defined elsewhere. NOTE(review): presumably reports whether `e` is the byte
// order of the host system -- confirm against the definition.
bool match_system(endianness e);

// Defined elsewhere. NOTE(review): presumably returns a printable name for the
// given encoding -- confirm against the definition.
std::string to_string(encoding_type bom);

// Note that BOM for UTF8 is discouraged.
namespace BOM {

/**
 * Checks for a BOM. If none is found, returns unspecified.
 * @param byte          the string to process
 * @param length        the length of the string in code units
 * @return the corresponding encoding
 */
encoding_type check_bom(const uint8_t* byte, size_t length);
encoding_type check_bom(const char* byte, size_t length);

/**
 * Returns the size, in bytes, of the BOM for a given encoding type.
 * Note that UTF8 BOM are discouraged.
 * @param bom         the encoding type
 * @return the size in bytes of the corresponding BOM
 */
size_t bom_byte_size(encoding_type bom);

} // BOM namespace
} // simdutf namespace
549
/* end file include/simdutf/encoding_types.h */
550
/* begin file include/simdutf/error.h */
551
#ifndef SIMDUTF_ERROR_H
552
#define SIMDUTF_ERROR_H
553
namespace simdutf {
554
555
enum error_code {
556
  SUCCESS = 0,
557
  HEADER_BITS,  // Any byte must have fewer than 5 header bits.
558
  TOO_SHORT,    // The leading byte must be followed by N-1 continuation bytes, where N is the UTF-8 character length
559
                // This is also the error when the input is truncated.
560
  TOO_LONG,     // We either have too many consecutive continuation bytes or the string starts with a continuation byte.
561
  OVERLONG,     // The decoded character must be above U+7F for two-byte characters, U+7FF for three-byte characters,
562
                // and U+FFFF for four-byte characters.
563
  TOO_LARGE,    // The decoded character must be less than or equal to U+10FFFF,less than or equal than U+7F for ASCII OR less than equal than U+FF for Latin1
564
  SURROGATE,    // The decoded character must be not be in U+D800...DFFF (UTF-8 or UTF-32) OR
565
                // a high surrogate must be followed by a low surrogate and a low surrogate must be preceded by a high surrogate (UTF-16) OR
566
                // there must be no surrogate at all (Latin1)
567
  INVALID_BASE64_CHARACTER, // Found a character that cannot be part of a valid base64 string.
568
  BASE64_INPUT_REMAINDER, // The base64 input terminates with a single character, excluding padding (=).
569
  OTHER         // Not related to validation/transcoding.
570
};
571
572
struct result {
573
  error_code error;
574
  size_t count;     // In case of error, indicates the position of the error. In case of success, indicates the number of code units validated/written.
575
576
  simdutf_really_inline result();
577
578
  simdutf_really_inline result(error_code, size_t);
579
};
580
581
}
582
#endif
583
/* end file include/simdutf/error.h */
584
585
SIMDUTF_PUSH_DISABLE_WARNINGS
586
SIMDUTF_DISABLE_UNDESIRED_WARNINGS
587
588
// Public API
589
/* begin file include/simdutf/simdutf_version.h */
590
// /include/simdutf/simdutf_version.h automatically generated by release.py,
591
// do not change by hand
592
#ifndef SIMDUTF_SIMDUTF_VERSION_H
593
#define SIMDUTF_SIMDUTF_VERSION_H
594
595
/** The version of simdutf being used (major.minor.revision) */
596
132k
#define SIMDUTF_VERSION "5.0.0"
597
598
namespace simdutf {
/**
 * Numeric components of the simdutf version, in major.minor.revision order.
 */
enum {
  /** MAJOR component of major.minor.revision. */
  SIMDUTF_VERSION_MAJOR = 5,
  /** MINOR component of major.minor.revision. */
  SIMDUTF_VERSION_MINOR = 0,
  /** REVISION component of major.minor.revision. */
  SIMDUTF_VERSION_REVISION = 0
};
} // namespace simdutf
614
615
#endif // SIMDUTF_SIMDUTF_VERSION_H
616
/* end file include/simdutf/simdutf_version.h */
617
/* begin file include/simdutf/implementation.h */
618
#ifndef SIMDUTF_IMPLEMENTATION_H
619
#define SIMDUTF_IMPLEMENTATION_H
620
#include <string>
621
#if !defined(SIMDUTF_NO_THREADS)
622
#include <atomic>
623
#endif
624
#include <vector>
625
#include <tuple>
626
/* begin file include/simdutf/internal/isadetection.h */
627
/* From
628
https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
629
Highly modified.
630
631
Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
632
Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
633
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
634
Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
635
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
636
Copyright (c) 2011-2013 NYU                      (Clement Farabet)
637
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
638
Iain Melvin, Jason Weston) Copyright (c) 2006      Idiap Research Institute
639
(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
640
Samy Bengio, Johnny Mariethoz)
641
642
All rights reserved.
643
644
Redistribution and use in source and binary forms, with or without
645
modification, are permitted provided that the following conditions are met:
646
647
1. Redistributions of source code must retain the above copyright
648
   notice, this list of conditions and the following disclaimer.
649
650
2. Redistributions in binary form must reproduce the above copyright
651
   notice, this list of conditions and the following disclaimer in the
652
   documentation and/or other materials provided with the distribution.
653
654
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
655
America and IDIAP Research Institute nor the names of its contributors may be
656
   used to endorse or promote products derived from this software without
657
   specific prior written permission.
658
659
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
660
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
661
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
662
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
663
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
664
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
665
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
666
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
667
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
668
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
669
POSSIBILITY OF SUCH DAMAGE.
670
*/
671
672
#ifndef SIMDutf_INTERNAL_ISADETECTION_H
673
#define SIMDutf_INTERNAL_ISADETECTION_H
674
675
#include <cstdint>
676
#include <cstdlib>
677
#if defined(_MSC_VER)
678
#include <intrin.h>
679
#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
680
#include <cpuid.h>
681
#endif
682
683
684
namespace simdutf {
685
namespace internal {
686
687
/**
 * Bit flags identifying instruction-set extensions. The detection functions
 * OR these together into a single uint32_t, so every non-zero enumerator must
 * occupy its own bit.
 *
 * AVX512VPOPCNTDQ, RVV and ZVBB previously reused the values of AVX512CD,
 * AVX512BW and AVX512VL, which made those flags indistinguishable in a
 * combined mask. They now use the next free bits.
 */
enum instruction_set {
  DEFAULT = 0x0,
  NEON = 0x1,
  AVX2 = 0x4,
  SSE42 = 0x8,
  PCLMULQDQ = 0x10,
  BMI1 = 0x20,
  BMI2 = 0x40,
  ALTIVEC = 0x80,
  AVX512F = 0x100,
  AVX512DQ = 0x200,
  AVX512IFMA = 0x400,
  AVX512PF = 0x800,
  AVX512ER = 0x1000,
  AVX512CD = 0x2000,
  AVX512BW = 0x4000,
  AVX512VL = 0x8000,
  AVX512VBMI2 = 0x10000,
  AVX512VPOPCNTDQ = 0x20000, // was 0x2000, colliding with AVX512CD
  RVV = 0x40000,             // was 0x4000, colliding with AVX512BW
  ZVBB = 0x80000,            // was 0x8000, colliding with AVX512VL
};
709
710
#if defined(__PPC64__)
711
712
static inline uint32_t detect_supported_architectures() {
713
  return instruction_set::ALTIVEC;
714
}
715
716
#elif SIMDUTF_IS_RISCV64
717
718
#if defined(__linux__)
719
#include <unistd.h>
720
// We define these ourselves, for backwards compatibility
721
struct simdutf_riscv_hwprobe { int64_t key; uint64_t value; };
722
#define simdutf_riscv_hwprobe(...) syscall(258, __VA_ARGS__)
723
#define SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0 4
724
#define SIMDUTF_RISCV_HWPROBE_IMA_V    (1 << 2)
725
#define SIMDUTF_RISCV_HWPROBE_EXT_ZVBB (1 << 17)
726
#endif
727
728
static inline uint32_t detect_supported_architectures() {
729
  uint32_t host_isa = instruction_set::DEFAULT;
730
#if SIMDUTF_IS_RVV
731
  host_isa |= instruction_set::RVV;
732
#endif
733
#if SIMDUTF_IS_ZVBB
734
  host_isa |= instruction_set::ZVBB;
735
#endif
736
#if defined(__linux__)
737
  simdutf_riscv_hwprobe probes[] = { { SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0, 0 } };
738
  long ret = simdutf_riscv_hwprobe(&probes, sizeof probes/sizeof *probes, 0, nullptr, 0);
739
  if (ret == 0) {
740
    uint64_t extensions = probes[0].value;
741
    if (extensions & SIMDUTF_RISCV_HWPROBE_IMA_V)
742
      host_isa |= instruction_set::RVV;
743
    if (extensions & SIMDUTF_RISCV_HWPROBE_EXT_ZVBB)
744
      host_isa |= instruction_set::ZVBB;
745
  }
746
#endif
747
  return host_isa;
748
}
749
750
#elif defined(__aarch64__) || defined(_M_ARM64)
751
752
static inline uint32_t detect_supported_architectures() {
753
  return instruction_set::NEON;
754
}
755
756
#elif defined(__x86_64__) || defined(_M_AMD64) // x64
757
758
759
namespace {
// Bit masks applied to the registers returned by CPUID and to XCR0.
// Each mask is written out in hexadecimal with the bit position alongside.
namespace cpuid_bit {
    // Can be found on Intel ISA Reference for CPUID

    // EAX = 0x01
    constexpr uint32_t pclmulqdq = 0x00000002u; ///< @private bit  1 of ECX for EAX=0x1
    constexpr uint32_t sse42 = 0x00100000u;     ///< @private bit 20 of ECX for EAX=0x1
    constexpr uint32_t osxsave = 0x0C000000u;   ///< @private bits 26+27 of ECX for EAX=0x1

    // EAX = 0x7 (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf)
    // See: "Table 3-8. Information Returned by CPUID Instruction"
    namespace ebx {
      constexpr uint32_t bmi1 = 0x00000008u;       // bit 3
      constexpr uint32_t avx2 = 0x00000020u;       // bit 5
      constexpr uint32_t bmi2 = 0x00000100u;       // bit 8
      constexpr uint32_t avx512f = 0x00010000u;    // bit 16
      constexpr uint32_t avx512dq = 0x00020000u;   // bit 17
      constexpr uint32_t avx512ifma = 0x00200000u; // bit 21
      constexpr uint32_t avx512cd = 0x10000000u;   // bit 28
      constexpr uint32_t avx512bw = 0x40000000u;   // bit 30
      constexpr uint32_t avx512vl = 0x80000000u;   // bit 31
    }

    namespace ecx {
      constexpr uint32_t avx512vbmi = 0x00000002u;    // bit 1
      constexpr uint32_t avx512vbmi2 = 0x00000040u;   // bit 6
      constexpr uint32_t avx512vnni = 0x00000800u;    // bit 11
      constexpr uint32_t avx512bitalg = 0x00001000u;  // bit 12
      constexpr uint32_t avx512vpopcnt = 0x00004000u; // bit 14
    }
    namespace edx {
      constexpr uint32_t avx512vp2intersect = 0x00000100u; // bit 8
    }
    namespace xcr0_bit {
     constexpr uint64_t avx256_saved = 0x04; ///< @private bit 2 = AVX
     constexpr uint64_t avx512_saved = 0xE0; ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM
   }
  }
}
798
799
800
801
/**
 * Executes the x86 CPUID instruction.
 *
 * @param eax in: the leaf to query; out: the EAX value returned by CPUID
 * @param ebx out: the EBX value returned by CPUID
 * @param ecx in: the sub-leaf to query; out: the ECX value returned by CPUID
 * @param edx out: the EDX value returned by CPUID
 *
 * Three implementations are selected at compile time: the MSVC intrinsic,
 * the helpers from <cpuid.h> (when HAVE_GCC_GET_CPUID and USE_GCC_GET_CPUID
 * are both defined), or inline assembly.
 */
static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
                         uint32_t *edx) {
#if defined(_MSC_VER)
  int cpu_info[4];
  __cpuidex(cpu_info, *eax, *ecx);
  *eax = static_cast<uint32_t>(cpu_info[0]);
  *ebx = static_cast<uint32_t>(cpu_info[1]);
  *ecx = static_cast<uint32_t>(cpu_info[2]);
  *edx = static_cast<uint32_t>(cpu_info[3]);
#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
  // __get_cpuid() does not load the sub-leaf into ECX and leaves the outputs
  // untouched when the leaf is unsupported. Query the maximum leaf ourselves,
  // pass the sub-leaf explicitly, and report all-zero registers (no features)
  // when the leaf is not available.
  const uint32_t leaf = *eax;
  const uint32_t subleaf = *ecx;
  uint32_t a = 0, b = 0, c = 0, d = 0;
  const uint32_t max_leaf = __get_cpuid_max(leaf & 0x80000000u, nullptr);
  if (max_leaf != 0 && max_leaf >= leaf) {
    __cpuid_count(leaf, subleaf, a, b, c, d);
  }
  *eax = a;
  *ebx = b;
  *ecx = c;
  *edx = d;
#else
  // EAX and ECX are both inputs (leaf, sub-leaf) and outputs.
  uint32_t a = *eax, b, c = *ecx, d;
  asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
  *eax = a;
  *ebx = b;
  *ecx = c;
  *edx = d;
#endif
}
Unexecuted instantiation: node_buffer.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: node_builtins.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: node_metadata.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: string_bytes.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: main_thread_interface.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: node_string.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: encoding_binding.cc:simdutf::internal::cpuid(unsigned int*, unsigned int*, unsigned int*, unsigned int*)
822
823
0
/**
 * Reads extended control register 0 (XCR0) with the XGETBV instruction.
 *
 * @return the 64-bit register value; the inline-assembly path assembles it
 *         from the low half in EAX and the high half in EDX.
 */
static inline uint64_t xgetbv() {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t low_half, high_half;
  // ECX = 0 selects XCR0.
  asm volatile("xgetbv\n\t" : "=a" (low_half), "=d" (high_half) : "c" (0));
  const uint64_t upper = static_cast<uint64_t>(high_half) << 32;
  return upper | low_half;
#endif
}
Unexecuted instantiation: node_buffer.cc:simdutf::internal::xgetbv()
Unexecuted instantiation: node_builtins.cc:simdutf::internal::xgetbv()
Unexecuted instantiation: node_metadata.cc:simdutf::internal::xgetbv()
Unexecuted instantiation: string_bytes.cc:simdutf::internal::xgetbv()
Unexecuted instantiation: main_thread_interface.cc:simdutf::internal::xgetbv()
Unexecuted instantiation: node_string.cc:simdutf::internal::xgetbv()
Unexecuted instantiation: encoding_binding.cc:simdutf::internal::xgetbv()
832
833
0
// Queries CPUID (and XGETBV) and returns a bitmask of instruction_set flags
// describing what the host supports.
//
// The checks are ordered so that each early return reports only what has been
// established so far:
//   1. SSE4.2 and PCLMULQDQ come straight from CPUID leaf 1 (ECX).
//   2. If the OSXSAVE bit is not set, return immediately (xgetbv is not called).
//   3. If XCR0 does not have the avx256_saved bit(s), return before looking at
//      leaf 7, so AVX2/BMI1/BMI2 are not reported.
//   4. AVX-512 flags are only reported when all avx512_saved bits are set in XCR0.
static inline uint32_t detect_supported_architectures() {
834
0
  uint32_t eax;
835
0
  uint32_t ebx = 0;
836
0
  uint32_t ecx = 0;
837
0
  uint32_t edx = 0;
838
0
  uint32_t host_isa = 0x0;
839
0
840
0
  // ECX for EAX=0x1 (the feature bits tested below are read from ecx)
841
0
  eax = 0x1;
842
0
  cpuid(&eax, &ebx, &ecx, &edx);
843
0
844
0
  if (ecx & cpuid_bit::sse42) {
845
0
    host_isa |= instruction_set::SSE42;
846
0
  }
847
0
848
0
  if (ecx & cpuid_bit::pclmulqdq) {
849
0
    host_isa |= instruction_set::PCLMULQDQ;
850
0
  }
851
0
852
0
  // Without OSXSAVE we must not execute xgetbv; report what we have so far.
  if ((ecx & cpuid_bit::osxsave) != cpuid_bit::osxsave) {
853
0
    return host_isa;
854
0
  }
855
0
856
0
  // xgetbv for checking if the OS saves registers
857
0
  uint64_t xcr0 = xgetbv();
858
0
859
0
  // No avx256_saved bit in XCR0: skip every feature tested below.
  if ((xcr0 & cpuid_bit::xcr0_bit::avx256_saved) == 0) {
860
0
    return host_isa;
861
0
  }
862
0
  // EBX and ECX for EAX=0x7, sub-leaf 0
863
0
  eax = 0x7;
864
0
  ecx = 0x0; // Sub-leaf = 0
865
0
  cpuid(&eax, &ebx, &ecx, &edx);
866
0
  if (ebx & cpuid_bit::ebx::avx2) {
867
0
    host_isa |= instruction_set::AVX2;
868
0
  }
869
0
  if (ebx & cpuid_bit::ebx::bmi1) {
870
0
    host_isa |= instruction_set::BMI1;
871
0
  }
872
0
  if (ebx & cpuid_bit::ebx::bmi2) {
873
0
    host_isa |= instruction_set::BMI2;
874
0
  }
875
0
  // All avx512_saved bits must be set in XCR0 before any AVX-512 flag is reported.
  if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) == cpuid_bit::xcr0_bit::avx512_saved)) {
876
0
    return host_isa;
877
0
  }
878
0
  if (ebx & cpuid_bit::ebx::avx512f) {
879
0
    host_isa |= instruction_set::AVX512F;
880
0
  }
881
0
  if (ebx & cpuid_bit::ebx::avx512bw) {
882
0
    host_isa |= instruction_set::AVX512BW;
883
0
  }
884
0
  if (ebx & cpuid_bit::ebx::avx512cd) {
885
0
    host_isa |= instruction_set::AVX512CD;
886
0
  }
887
0
  if (ebx & cpuid_bit::ebx::avx512dq) {
888
0
    host_isa |= instruction_set::AVX512DQ;
889
0
  }
890
0
  if (ebx & cpuid_bit::ebx::avx512vl) {
891
0
    host_isa |= instruction_set::AVX512VL;
892
0
  }
893
0
  // The remaining AVX-512 feature bits are read from ECX of the same leaf.
  if (ecx & cpuid_bit::ecx::avx512vbmi2) {
894
0
    host_isa |= instruction_set::AVX512VBMI2;
895
0
  }
896
0
  if (ecx & cpuid_bit::ecx::avx512vpopcnt) {
897
0
    host_isa |= instruction_set::AVX512VPOPCNTDQ;
898
0
  }
899
0
  return host_isa;
900
0
}
Unexecuted instantiation: node_buffer.cc:simdutf::internal::detect_supported_architectures()
Unexecuted instantiation: node_builtins.cc:simdutf::internal::detect_supported_architectures()
Unexecuted instantiation: node_metadata.cc:simdutf::internal::detect_supported_architectures()
Unexecuted instantiation: string_bytes.cc:simdutf::internal::detect_supported_architectures()
Unexecuted instantiation: main_thread_interface.cc:simdutf::internal::detect_supported_architectures()
Unexecuted instantiation: node_string.cc:simdutf::internal::detect_supported_architectures()
Unexecuted instantiation: encoding_binding.cc:simdutf::internal::detect_supported_architectures()
901
#else // fallback
902
903
// includes 32-bit ARM.
904
// Fallback used when none of the platform-specific detection branches above
// applies: no runtime probing is done and instruction_set::DEFAULT is returned.
static inline uint32_t detect_supported_architectures() {
905
  return instruction_set::DEFAULT;
906
}
907
908
909
#endif // end SIMD extension detection code
910
911
} // namespace internal
912
} // namespace simdutf
913
914
#endif // SIMDutf_INTERNAL_ISADETECTION_H
915
/* end file include/simdutf/internal/isadetection.h */
916
917
918
namespace simdutf {
919
920
/**
921
 * Autodetect the encoding of the input, a single encoding is recommended.
922
 * E.g., the function might return simdutf::encoding_type::UTF8,
923
 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
924
 * simdutf::encoding_type::UTF32_LE.
925
 *
926
 * @param input the string to analyze.
927
 * @param length the length of the string in bytes.
928
 * @return the detected encoding type
929
 */
930
simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * input, size_t length) noexcept;
931
0
// Convenience overload for byte buffers typed as uint8_t: reinterprets the
// pointer as const char * and forwards to the overload declared above.
// Same parameters and return value as that overload.
simdutf_really_inline simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const uint8_t * input, size_t length) noexcept {
932
0
  return autodetect_encoding(reinterpret_cast<const char *>(input), length);
933
0
}
934
935
/**
936
 * Autodetect the possible encodings of the input in one pass.
937
 * E.g., if the input might be UTF-16LE or UTF-8, this function returns
938
 * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE).
939
 *
940
 * Overridden by each implementation.
941
 *
942
 * @param input the string to analyze.
943
 * @param length the length of the string in bytes.
944
 * @return the detected encoding type
945
 */
946
simdutf_warn_unused int detect_encodings(const char * input, size_t length) noexcept;
947
0
// Convenience overload for byte buffers typed as uint8_t: reinterprets the
// pointer as const char * and forwards to the overload declared above.
// Same parameters and return value as that overload.
simdutf_really_inline simdutf_warn_unused int detect_encodings(const uint8_t * input, size_t length) noexcept {
948
0
  return detect_encodings(reinterpret_cast<const char *>(input), length);
949
0
}
950
951
/**
952
 * Validate the UTF-8 string. This function may be best when you expect
953
 * the input to be almost always valid. Otherwise, consider using
954
 * validate_utf8_with_errors.
955
 *
956
 * Overridden by each implementation.
957
 *
958
 * @param buf the UTF-8 string to validate.
959
 * @param len the length of the string in bytes.
960
 * @return true if and only if the string is valid UTF-8.
961
 */
962
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
963
964
/**
965
 * Validate the UTF-8 string and stop on error.
966
 *
967
 * Overridden by each implementation.
968
 *
969
 * @param buf the UTF-8 string to validate.
970
 * @param len the length of the string in bytes.
971
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
972
 */
973
simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept;
974
975
/**
976
 * Validate the ASCII string.
977
 *
978
 * Overridden by each implementation.
979
 *
980
 * @param buf the ASCII string to validate.
981
 * @param len the length of the string in bytes.
982
 * @return true if and only if the string is valid ASCII.
983
 */
984
simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept;
985
986
/**
987
 * Validate the ASCII string and stop on error. It might be faster than
988
 * validate_ascii when an error is expected to occur early.
989
 *
990
 * Overridden by each implementation.
991
 *
992
 * @param buf the ASCII string to validate.
993
 * @param len the length of the string in bytes.
994
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
995
 */
996
simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept;
997
998
/**
999
 * Using native endianness; Validate the UTF-16 string.
1000
 * This function may be best when you expect the input to be almost always valid.
1001
 * Otherwise, consider using validate_utf16_with_errors.
1002
 *
1003
 * Overridden by each implementation.
1004
 *
1005
 * This function is not BOM-aware.
1006
 *
1007
 * @param buf the UTF-16 string to validate.
1008
 * @param len the length of the string in number of 2-byte code units (char16_t).
1009
 * @return true if and only if the string is valid UTF-16.
1010
 */
1011
simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) noexcept;
1012
1013
/**
1014
 * Validate the UTF-16LE string. This function may be best when you expect
1015
 * the input to be almost always valid. Otherwise, consider using
1016
 * validate_utf16le_with_errors.
1017
 *
1018
 * Overridden by each implementation.
1019
 *
1020
 * This function is not BOM-aware.
1021
 *
1022
 * @param buf the UTF-16LE string to validate.
1023
 * @param len the length of the string in number of 2-byte code units (char16_t).
1024
 * @return true if and only if the string is valid UTF-16LE.
1025
 */
1026
simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) noexcept;
1027
1028
/**
1029
 * Validate the UTF-16BE string. This function may be best when you expect
1030
 * the input to be almost always valid. Otherwise, consider using
1031
 * validate_utf16be_with_errors.
1032
 *
1033
 * Overridden by each implementation.
1034
 *
1035
 * This function is not BOM-aware.
1036
 *
1037
 * @param buf the UTF-16BE string to validate.
1038
 * @param len the length of the string in number of 2-byte code units (char16_t).
1039
 * @return true if and only if the string is valid UTF-16BE.
1040
 */
1041
simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) noexcept;
1042
1043
/**
1044
 * Using native endianness; Validate the UTF-16 string and stop on error.
1045
 * It might be faster than validate_utf16 when an error is expected to occur early.
1046
 *
1047
 * Overridden by each implementation.
1048
 *
1049
 * This function is not BOM-aware.
1050
 *
1051
 * @param buf the UTF-16 string to validate.
1052
 * @param len the length of the string in number of 2-byte code units (char16_t).
1053
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
1054
 */
1055
simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf, size_t len) noexcept;
1056
1057
/**
1058
 * Validate the UTF-16LE string and stop on error. It might be faster than
1059
 * validate_utf16le when an error is expected to occur early.
1060
 *
1061
 * Overridden by each implementation.
1062
 *
1063
 * This function is not BOM-aware.
1064
 *
1065
 * @param buf the UTF-16LE string to validate.
1066
 * @param len the length of the string in number of 2-byte code units (char16_t).
1067
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
1068
 */
1069
simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) noexcept;
1070
1071
/**
1072
 * Validate the UTF-16BE string and stop on error. It might be faster than
1073
 * validate_utf16be when an error is expected to occur early.
1074
 *
1075
 * Overridden by each implementation.
1076
 *
1077
 * This function is not BOM-aware.
1078
 *
1079
 * @param buf the UTF-16BE string to validate.
1080
 * @param len the length of the string in number of 2-byte code units (char16_t).
1081
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
1082
 */
1083
simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) noexcept;
1084
1085
/**
1086
 * Validate the UTF-32 string. This function may be best when you expect
1087
 * the input to be almost always valid. Otherwise, consider using
1088
 * validate_utf32_with_errors.
1089
 *
1090
 * Overridden by each implementation.
1091
 *
1092
 * This function is not BOM-aware.
1093
 *
1094
 * @param buf the UTF-32 string to validate.
1095
 * @param len the length of the string in number of 4-byte code units (char32_t).
1096
 * @return true if and only if the string is valid UTF-32.
1097
 */
1098
simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) noexcept;
1099
1100
/**
1101
 * Validate the UTF-32 string and stop on error. It might be faster than
1102
 * validate_utf32 when an error is expected to occur early.
1103
 *
1104
 * Overridden by each implementation.
1105
 *
1106
 * This function is not BOM-aware.
1107
 *
1108
 * @param buf the UTF-32 string to validate.
1109
 * @param len the length of the string in number of 4-byte code units (char32_t).
1110
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
1111
 */
1112
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) noexcept;
1113
1114
  /**
1115
   * Convert Latin1 string into UTF8 string.
1116
   *
1117
   * This function is suitable to work with inputs from untrusted sources.
1118
   *
1119
   * @param input         the Latin1 string to convert
1120
   * @param length        the length of the string in bytes
1121
   * @param utf8_output   the pointer to buffer that can hold conversion result
1122
   * @return the number of written char; 0 if conversion is not possible
1123
   */
1124
  simdutf_warn_unused size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) noexcept;
1125
1126
1127
    /**
1128
   * Convert Latin1 string into UTF-16LE string.
1129
   *
1130
   * This function is suitable to work with inputs from untrusted sources.
1131
   *
1132
   * @param input         the Latin1  string to convert
1133
   * @param length        the length of the string in bytes
1134
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
1135
   * @return the number of written char16_t; 0 if conversion is not possible
1136
   */
1137
  simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept;
1138
1139
  /**
1140
   * Convert Latin1 string into UTF-16BE string.
1141
   *
1142
   * This function is suitable to work with inputs from untrusted sources.
1143
   *
1144
   * @param input         the Latin1 string to convert
1145
   * @param length        the length of the string in bytes
1146
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
1147
   * @return the number of written char16_t; 0 if conversion is not possible
1148
   */
1149
  simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept;
1150
1151
  /**
1152
   * Convert Latin1 string into UTF-32 string.
1153
   *
1154
   * This function is suitable to work with inputs from untrusted sources.
1155
   *
1156
   * @param input         the Latin1 string to convert
1157
   * @param length        the length of the string in bytes
1158
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
1159
   * @return the number of written char32_t; 0 if conversion is not possible
1160
   */
1161
  simdutf_warn_unused size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept;
1162
1163
 /**
1164
   * Convert possibly broken UTF-8 string into latin1 string.
1165
   *
1166
   * During the conversion also validation of the input string is done.
1167
   * This function is suitable to work with inputs from untrusted sources.
1168
   *
1169
   * @param input         the UTF-8 string to convert
1170
   * @param length        the length of the string in bytes
1171
   * @param latin1_output  the pointer to buffer that can hold conversion result
1172
   * @return the number of written char; 0 if the input was not valid UTF-8 string
1173
   */
1174
  simdutf_warn_unused size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept;
1175
1176
/**
1177
 * Using native endianness, convert possibly broken UTF-8 string into a UTF-16 string.
1178
 *
1179
 * During the conversion also validation of the input string is done.
1180
 * This function is suitable to work with inputs from untrusted sources.
1181
 *
1182
 * @param input         the UTF-8 string to convert
1183
 * @param length        the length of the string in bytes
1184
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1185
 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
1186
 */
1187
simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept;
1188
1189
1190
/**
1191
 * Using native endianness, convert a Latin1 string into a UTF-16 string.
1192
 *
1193
 * @param input         the Latin1 string to convert
1194
 * @param length        the length of the string in bytes
1195
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1196
 * @return the number of written char16_t.
1197
 */
1198
simdutf_warn_unused size_t convert_latin1_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept;
1199
1200
/**
1201
 * Convert possibly broken UTF-8 string into UTF-16LE string.
1202
 *
1203
 * During the conversion also validation of the input string is done.
1204
 * This function is suitable to work with inputs from untrusted sources.
1205
 *
1206
 * @param input         the UTF-8 string to convert
1207
 * @param length        the length of the string in bytes
1208
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1209
 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
1210
 */
1211
simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept;
1212
1213
/**
1214
 * Convert possibly broken UTF-8 string into UTF-16BE string.
1215
 *
1216
 * During the conversion also validation of the input string is done.
1217
 * This function is suitable to work with inputs from untrusted sources.
1218
 *
1219
 * @param input         the UTF-8 string to convert
1220
 * @param length        the length of the string in bytes
1221
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1222
 * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
1223
 */
1224
simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept;
1225
1226
1227
  /**
1228
   * Convert possibly broken UTF-8 string into latin1 string with errors.
1229
   *
1230
   * During the conversion also validation of the input string is done.
1231
   * This function is suitable to work with inputs from untrusted sources.
1232
   *
1233
   * @param input         the UTF-8 string to convert
1234
   * @param length        the length of the string in bytes
1235
   * @param latin1_output  the pointer to buffer that can hold conversion result
1236
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
1237
   */
1238
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) noexcept;
1239
1240
/**
1241
 * Using native endianness, convert possibly broken UTF-8 string into UTF-16
1242
 * string and stop on error.
1243
 *
1244
 * During the conversion also validation of the input string is done.
1245
 * This function is suitable to work with inputs from untrusted sources.
1246
 *
1247
 * @param input         the UTF-8 string to convert
1248
 * @param length        the length of the string in bytes
1249
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1250
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful.
1251
 */
1252
simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept;
1253
1254
/**
1255
 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
1256
 *
1257
 * During the conversion also validation of the input string is done.
1258
 * This function is suitable to work with inputs from untrusted sources.
1259
 *
1260
 * @param input         the UTF-8 string to convert
1261
 * @param length        the length of the string in bytes
1262
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1263
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful.
1264
 */
1265
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept;
1266
1267
/**
1268
 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
1269
 *
1270
 * During the conversion also validation of the input string is done.
1271
 * This function is suitable to work with inputs from untrusted sources.
1272
 *
1273
 * @param input         the UTF-8 string to convert
1274
 * @param length        the length of the string in bytes
1275
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1276
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful.
1277
 */
1278
simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept;
1279
1280
/**
1281
 * Convert possibly broken UTF-8 string into UTF-32 string.
1282
 *
1283
 * During the conversion also validation of the input string is done.
1284
 * This function is suitable to work with inputs from untrusted sources.
1285
 *
1286
 * @param input         the UTF-8 string to convert
1287
 * @param length        the length of the string in bytes
1288
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
1289
 * @return the number of written char32_t; 0 if the input was not valid UTF-8 string
1290
 */
1291
simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept;
1292
1293
/**
1294
 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
1295
 *
1296
 * During the conversion also validation of the input string is done.
1297
 * This function is suitable to work with inputs from untrusted sources.
1298
 *
1299
 * @param input         the UTF-8 string to convert
1300
 * @param length        the length of the string in bytes
1301
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
1302
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful.
1303
 */
1304
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept;
1305
1306
    /**
1307
   * Convert valid UTF-8 string into latin1 string.
1308
   *
1309
   * This function assumes that the input string is valid UTF-8.
1310
   *
1311
   * This function is not BOM-aware.
1312
   *
1313
   * @param input         the UTF-8 string to convert
1314
   * @param length        the length of the string in bytes
1315
   * @param latin1_output  the pointer to buffer that can hold conversion result
1316
   * @return the number of written char; 0 if the input was not valid UTF-8 string
1317
   */
1318
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept;
1319
1320
1321
/**
1322
 * Using native endianness, convert valid UTF-8 string into a UTF-16 string.
1323
 *
1324
 * This function assumes that the input string is valid UTF-8.
1325
 *
1326
 * @param input         the UTF-8 string to convert
1327
 * @param length        the length of the string in bytes
1328
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1329
 * @return the number of written char16_t
1330
 */
1331
simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
1332
1333
/**
1334
 * Convert valid UTF-8 string into UTF-16LE string.
1335
 *
1336
 * This function assumes that the input string is valid UTF-8.
1337
 *
1338
 * @param input         the UTF-8 string to convert
1339
 * @param length        the length of the string in bytes
1340
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1341
 * @return the number of written char16_t
1342
 */
1343
simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
1344
1345
/**
1346
 * Convert valid UTF-8 string into UTF-16BE string.
1347
 *
1348
 * This function assumes that the input string is valid UTF-8.
1349
 *
1350
 * @param input         the UTF-8 string to convert
1351
 * @param length        the length of the string in bytes
1352
 * @param utf16_buffer  the pointer to buffer that can hold conversion result
1353
 * @return the number of written char16_t
1354
 */
1355
simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
1356
1357
/**
1358
 * Convert valid UTF-8 string into UTF-32 string.
1359
 *
1360
 * This function assumes that the input string is valid UTF-8.
1361
 *
1362
 * @param input         the UTF-8 string to convert
1363
 * @param length        the length of the string in bytes
1364
 * @param utf32_buffer  the pointer to buffer that can hold conversion result
1365
 * @return the number of written char32_t
1366
 */
1367
simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept;
1368
1369
1370
/**
1371
 * Return the number of bytes that this Latin1 string would require in UTF-8 format.
1372
 *
1373
 * @param input         the Latin1 string to convert
1374
 * @param length        the length of the string in bytes
1375
 * @return the number of bytes required to encode the Latin1 string as UTF-8
1376
 */
1377
simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) noexcept;
1378
1379
/**
1380
 * Compute the number of bytes that this UTF-8 string would require in Latin1 format.
1381
 *
1382
 * This function does not validate the input.
1383
 *
1384
 * This function is not BOM-aware.
1385
 *
1386
 * @param input         the UTF-8 string to convert
1387
 * @param length        the length of the string in bytes
1388
 * @return the number of bytes required to encode the UTF-8 string as Latin1
1389
 */
1390
simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) noexcept;
1391
1392
/**
1393
 * Compute the number of 2-byte code units that this UTF-8 string would require in UTF-16LE format.
1394
 *
1395
 * This function does not validate the input.
1396
 *
1397
 * This function is not BOM-aware.
1398
 *
1399
 * @param input         the UTF-8 string to process
1400
 * @param length        the length of the string in bytes
1401
 * @return the number of char16_t code units required to encode the UTF-8 string as UTF-16LE
1402
 */
1403
simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept;
1404
1405
/**
1406
 * Compute the number of 4-byte code units that this UTF-8 string would require in UTF-32 format.
1407
 *
1408
 * This function is equivalent to count_utf8
1409
 *
1410
 * This function does not validate the input.
1411
 *
1412
 * This function is not BOM-aware.
1413
 *
1414
 * @param input         the UTF-8 string to process
1415
 * @param length        the length of the string in bytes
1416
 * @return the number of char32_t code units required to encode the UTF-8 string as UTF-32
1417
 */
1418
simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept;
1419
1420
/**
1421
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8 string.
1422
 *
1423
 * During the conversion also validation of the input string is done.
1424
 * This function is suitable to work with inputs from untrusted sources.
1425
 *
1426
 * This function is not BOM-aware.
1427
 *
1428
 * @param input         the UTF-16 string to convert
1429
 * @param length        the length of the string in 2-byte code units (char16_t)
1430
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1431
 * @return number of written code units; 0 if input is not a valid UTF-16 string
1432
 */
1433
simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1434
1435
1436
1437
/**
1438
 * Using native endianness, convert possibly broken UTF-16 string into Latin1 string.
1439
 *
1440
 * During the conversion also validation of the input string is done.
1441
 * This function is suitable to work with inputs from untrusted sources.
1442
 *
1443
 * This function is not BOM-aware.
1444
 *
1445
 * @param input         the UTF-16 string to convert
1446
 * @param length        the length of the string in 2-byte code units (char16_t)
1447
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1448
 * @return number of written code units; 0 if input is not a valid UTF-16 string
1449
 */
1450
simdutf_warn_unused size_t convert_utf16_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1451
1452
/**
1453
 * Convert possibly broken UTF-16LE string into Latin1 string.
1454
 *
1455
 * During the conversion also validation of the input string is done.
1456
 * This function is suitable to work with inputs from untrusted sources.
1457
 *
1458
 * This function is not BOM-aware.
1459
 *
1460
 * @param input         the UTF-16LE string to convert
1461
 * @param length        the length of the string in 2-byte code units (char16_t)
1462
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1463
 * @return number of written code units; 0 if input is not a valid UTF-16LE string
1464
 */
1465
simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1466
1467
/**
1468
 * Convert possibly broken UTF-16BE string into Latin1 string.
1469
 *
1470
 * During the conversion also validation of the input string is done.
1471
 * This function is suitable to work with inputs from untrusted sources.
1472
 *
1473
 * This function is not BOM-aware.
1474
 *
1475
 * @param input         the UTF-16BE string to convert
1476
 * @param length        the length of the string in 2-byte code units (char16_t)
1477
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1478
 * @return number of written code units; 0 if input is not a valid UTF-16BE string
1479
 */
1480
simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1481
1482
1483
/**
1484
 * Convert possibly broken UTF-16LE string into UTF-8 string.
1485
 *
1486
 * During the conversion also validation of the input string is done.
1487
 * This function is suitable to work with inputs from untrusted sources.
1488
 *
1489
 * This function is not BOM-aware.
1490
 *
1491
 * @param input         the UTF-16LE string to convert
1492
 * @param length        the length of the string in 2-byte code units (char16_t)
1493
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1494
 * @return number of written code units; 0 if input is not a valid UTF-16LE string
1495
 */
1496
simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1497
1498
/**
1499
 * Convert possibly broken UTF-16BE string into UTF-8 string.
1500
 *
1501
 * During the conversion also validation of the input string is done.
1502
 * This function is suitable to work with inputs from untrusted sources.
1503
 *
1504
 * This function is not BOM-aware.
1505
 *
1506
 * @param input         the UTF-16BE string to convert
1507
 * @param length        the length of the string in 2-byte code units (char16_t)
1508
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1509
 * @return number of written code units; 0 if input is not a valid UTF-16BE string
1510
 */
1511
simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1512
1513
/**
1514
 * Using native endianness, convert possibly broken UTF-16 string into Latin1 string.
1515
 *
1516
 * During the conversion also validation of the input string is done.
1517
 * This function is suitable to work with inputs from untrusted sources.
1518
 * This function is not BOM-aware.
1519
 *
1520
 * @param input         the UTF-16 string to convert
1521
 * @param length        the length of the string in 2-byte code units (char16_t)
1522
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1523
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
1524
 */
1525
simdutf_warn_unused result convert_utf16_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1526
1527
/**
1528
 * Convert possibly broken UTF-16LE string into Latin1 string.
1529
 *
1530
 * During the conversion also validation of the input string is done.
1531
 * This function is suitable to work with inputs from untrusted sources.
1532
 * This function is not BOM-aware.
1533
 *
1534
 * @param input         the UTF-16LE string to convert
1535
 * @param length        the length of the string in 2-byte code units (char16_t)
1536
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1537
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
1538
 */
1539
simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1540
1541
/**
1542
 * Convert possibly broken UTF-16BE string into Latin1 string.
1543
 *
1544
 * During the conversion also validation of the input string is done.
1545
 * This function is suitable to work with inputs from untrusted sources.
1546
 * This function is not BOM-aware.
1547
 *
1548
 * @param input         the UTF-16BE string to convert
1549
 * @param length        the length of the string in 2-byte code units (char16_t)
1550
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1551
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
1552
 */
1553
simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1554
1555
1556
/**
1557
 * Using native endianness, convert possibly broken UTF-16 string into UTF-8 string and stop on error.
1558
 *
1559
 * During the conversion also validation of the input string is done.
1560
 * This function is suitable to work with inputs from untrusted sources.
1561
 *
1562
 * This function is not BOM-aware.
1563
 *
1564
 * @param input         the UTF-16 string to convert
1565
 * @param length        the length of the string in 2-byte code units (char16_t)
1566
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1567
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
1568
 */
1569
simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1570
1571
/**
1572
 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
1573
 *
1574
 * During the conversion also validation of the input string is done.
1575
 * This function is suitable to work with inputs from untrusted sources.
1576
 *
1577
 * This function is not BOM-aware.
1578
 *
1579
 * @param input         the UTF-16LE string to convert
1580
 * @param length        the length of the string in 2-byte code units (char16_t)
1581
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1582
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
1583
 */
1584
simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1585
1586
/**
1587
 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
1588
 *
1589
 * During the conversion also validation of the input string is done.
1590
 * This function is suitable to work with inputs from untrusted sources.
1591
 *
1592
 * This function is not BOM-aware.
1593
 *
1594
 * @param input         the UTF-16BE string to convert
1595
 * @param length        the length of the string in 2-byte code units (char16_t)
1596
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1597
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
1598
 */
1599
simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1600
1601
/**
1602
 * Using native endianness, convert valid UTF-16 string into UTF-8 string.
1603
 *
1604
 * This function assumes that the input string is valid UTF-16 (native endianness).
1605
 *
1606
 * This function is not BOM-aware.
1607
 *
1608
 * @param input         the UTF-16 string to convert
1609
 * @param length        the length of the string in 2-byte code units (char16_t)
1610
 * @param utf8_buffer   the pointer to buffer that can hold the conversion result
1611
 * @return number of written code units; 0 if conversion is not possible
1612
 */
1613
simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1614
1615
1616
/**
1617
 * Using native endianness, convert UTF-16 string into Latin1 string.
1618
 *
1619
 * This function assumes that the input string is valid UTF-16 (native endianness).
1620
 *
1621
 * This function is not BOM-aware.
1622
 *
1623
 * @param input         the UTF-16 string to convert
1624
 * @param length        the length of the string in 2-byte code units (char16_t)
1625
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1626
 * @return number of written code units; 0 if conversion is not possible
1627
 */
1628
simdutf_warn_unused size_t convert_valid_utf16_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1629
1630
/**
1631
 * Convert valid UTF-16LE string into Latin1 string.
1632
 *
1633
 * This function assumes that the input string is valid UTF-16LE.
1634
 *
1635
 * This function is not BOM-aware.
1636
 *
1637
 * @param input         the UTF-16LE string to convert
1638
 * @param length        the length of the string in 2-byte code units (char16_t)
1639
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1640
 * @return number of written code units; 0 if conversion is not possible
1641
 */
1642
simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1643
1644
/**
1645
 * Convert valid UTF-16BE string into Latin1 string.
1646
 *
1647
 * This function assumes that the input string is valid UTF-16BE.
1648
 *
1649
 * This function is not BOM-aware.
1650
 *
1651
 * @param input         the UTF-16BE string to convert
1652
 * @param length        the length of the string in 2-byte code units (char16_t)
1653
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1654
 * @return number of written code units; 0 if conversion is not possible
1655
 */
1656
simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
1657
1658
1659
/**
1660
 * Convert valid UTF-16LE string into UTF-8 string.
1661
 *
1662
 * This function assumes that the input string is valid UTF-16LE.
1663
 *
1664
 * This function is not BOM-aware.
1665
 *
1666
 * @param input         the UTF-16LE string to convert
1667
 * @param length        the length of the string in 2-byte code units (char16_t)
1668
 * @param utf8_buffer   the pointer to buffer that can hold the conversion result
1669
 * @return number of written code units; 0 if conversion is not possible
1670
 */
1671
simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1672
1673
/**
1674
 * Convert valid UTF-16BE string into UTF-8 string.
1675
 *
1676
 * This function assumes that the input string is valid UTF-16BE.
1677
 *
1678
 * This function is not BOM-aware.
1679
 *
1680
 * @param input         the UTF-16BE string to convert
1681
 * @param length        the length of the string in 2-byte code units (char16_t)
1682
 * @param utf8_buffer   the pointer to buffer that can hold the conversion result
1683
 * @return number of written code units; 0 if conversion is not possible
1684
 */
1685
simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
1686
1687
/**
1688
 * Using native endianness, convert possibly broken UTF-16 string into UTF-32 string.
1689
 *
1690
 * During the conversion also validation of the input string is done.
1691
 * This function is suitable to work with inputs from untrusted sources.
1692
 *
1693
 * This function is not BOM-aware.
1694
 *
1695
 * @param input         the UTF-16 string to convert
1696
 * @param length        the length of the string in 2-byte code units (char16_t)
1697
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1698
 * @return number of written code units; 0 if input is not a valid UTF-16 string
1699
 */
1700
simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1701
1702
/**
1703
 * Convert possibly broken UTF-16LE string into UTF-32 string.
1704
 *
1705
 * During the conversion also validation of the input string is done.
1706
 * This function is suitable to work with inputs from untrusted sources.
1707
 *
1708
 * This function is not BOM-aware.
1709
 *
1710
 * @param input         the UTF-16LE string to convert
1711
 * @param length        the length of the string in 2-byte code units (char16_t)
1712
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1713
 * @return number of written code units; 0 if input is not a valid UTF-16LE string
1714
 */
1715
simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1716
1717
/**
1718
 * Convert possibly broken UTF-16BE string into UTF-32 string.
1719
 *
1720
 * During the conversion also validation of the input string is done.
1721
 * This function is suitable to work with inputs from untrusted sources.
1722
 *
1723
 * This function is not BOM-aware.
1724
 *
1725
 * @param input         the UTF-16BE string to convert
1726
 * @param length        the length of the string in 2-byte code units (char16_t)
1727
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1728
 * @return number of written code units; 0 if input is not a valid UTF-16BE string
1729
 */
1730
simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1731
1732
/**
1733
 * Using native endianness, convert possibly broken UTF-16 string into
1734
 * UTF-32 string and stop on error.
1735
 *
1736
 * During the conversion also validation of the input string is done.
1737
 * This function is suitable to work with inputs from untrusted sources.
1738
 *
1739
 * This function is not BOM-aware.
1740
 *
1741
 * @param input         the UTF-16 string to convert
1742
 * @param length        the length of the string in 2-byte code units (char16_t)
1743
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1744
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful.
1745
 */
1746
simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1747
1748
/**
1749
 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
1750
 *
1751
 * During the conversion also validation of the input string is done.
1752
 * This function is suitable to work with inputs from untrusted sources.
1753
 *
1754
 * This function is not BOM-aware.
1755
 *
1756
 * @param input         the UTF-16LE string to convert
1757
 * @param length        the length of the string in 2-byte code units (char16_t)
1758
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1759
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful.
1760
 */
1761
simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1762
1763
/**
1764
 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
1765
 *
1766
 * During the conversion also validation of the input string is done.
1767
 * This function is suitable to work with inputs from untrusted sources.
1768
 *
1769
 * This function is not BOM-aware.
1770
 *
1771
 * @param input         the UTF-16BE string to convert
1772
 * @param length        the length of the string in 2-byte code units (char16_t)
1773
 * @param utf32_buffer   the pointer to buffer that can hold conversion result
1774
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful.
1775
 */
1776
simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1777
1778
/**
1779
 * Using native endianness, convert valid UTF-16 string into UTF-32 string.
1780
 *
1781
 * This function assumes that the input string is valid UTF-16 (native endianness).
1782
 *
1783
 * This function is not BOM-aware.
1784
 *
1785
 * @param input         the UTF-16 string to convert
1786
 * @param length        the length of the string in 2-byte code units (char16_t)
1787
 * @param utf32_buffer   the pointer to buffer that can hold the conversion result
1788
 * @return number of written code units; 0 if conversion is not possible
1789
 */
1790
simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1791
1792
/**
1793
 * Convert valid UTF-16LE string into UTF-32 string.
1794
 *
1795
 * This function assumes that the input string is valid UTF-16LE.
1796
 *
1797
 * This function is not BOM-aware.
1798
 *
1799
 * @param input         the UTF-16LE string to convert
1800
 * @param length        the length of the string in 2-byte code units (char16_t)
1801
 * @param utf32_buffer   the pointer to buffer that can hold the conversion result
1802
 * @return number of written code units; 0 if conversion is not possible
1803
 */
1804
simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1805
1806
/**
1807
 * Convert valid UTF-16BE string into UTF-32 string.
1808
 *
1809
 * This function assumes that the input string is valid UTF-16BE.
1810
 *
1811
 * This function is not BOM-aware.
1812
 *
1813
 * @param input         the UTF-16BE string to convert
1814
 * @param length        the length of the string in 2-byte code units (char16_t)
1815
 * @param utf32_buffer   the pointer to buffer that can hold the conversion result
1816
 * @return number of written code units; 0 if conversion is not possible
1817
 */
1818
simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
1819
1820
1821
/**
1822
 * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format.
1823
 *
1824
 * This function does not validate the input.
1825
 *
1826
 * This function is not BOM-aware.
1827
 *
1828
 * @param length        the length of the string in 2-byte code units (char16_t)
1829
 * @return the number of bytes required to encode the UTF-16 string as Latin1
1830
 */
1831
simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
1832
1833
1834
/**
1835
 * Using native endianness, compute the number of bytes that this UTF-16
1836
 * string would require in UTF-8 format.
1837
 *
1838
 * This function does not validate the input.
1839
 *
1840
 * @param input         the UTF-16 string to convert
1841
 * @param length        the length of the string in 2-byte code units (char16_t)
1842
 * @return the number of bytes required to encode the UTF-16 string as UTF-8
1843
 */
1844
simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept;
1845
1846
/**
1847
 * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format.
1848
 *
1849
 * This function does not validate the input.
1850
 *
1851
 * @param input         the UTF-16LE string to convert
1852
 * @param length        the length of the string in 2-byte code units (char16_t)
1853
 * @return the number of bytes required to encode the UTF-16LE string as UTF-8
1854
 */
1855
simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept;
1856
1857
/**
1858
 * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format.
1859
 *
1860
 * This function does not validate the input.
1861
 *
1862
 * @param input         the UTF-16BE string to convert
1863
 * @param length        the length of the string in 2-byte code units (char16_t)
1864
 * @return the number of bytes required to encode the UTF-16BE string as UTF-8
1865
 */
1866
simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept;
1867
1868
/**
1869
 * Convert possibly broken UTF-32 string into UTF-8 string.
1870
 *
1871
 * During the conversion also validation of the input string is done.
1872
 * This function is suitable to work with inputs from untrusted sources.
1873
 *
1874
 * This function is not BOM-aware.
1875
 *
1876
 * @param input         the UTF-32 string to convert
1877
 * @param length        the length of the string in 4-byte code units (char32_t)
1878
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1879
 * @return number of written code units; 0 if input is not a valid UTF-32 string
1880
 */
1881
simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept;
1882
1883
/**
1884
 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
1885
 *
1886
 * During the conversion also validation of the input string is done.
1887
 * This function is suitable to work with inputs from untrusted sources.
1888
 *
1889
 * This function is not BOM-aware.
1890
 *
1891
 * @param input         the UTF-32 string to convert
1892
 * @param length        the length of the string in 4-byte code units (char32_t)
1893
 * @param utf8_buffer   the pointer to buffer that can hold conversion result
1894
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
1895
 */
1896
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) noexcept;
1897
1898
/**
1899
 * Convert valid UTF-32 string into UTF-8 string.
1900
 *
1901
 * This function assumes that the input string is valid UTF-32.
1902
 *
1903
 * This function is not BOM-aware.
1904
 *
1905
 * @param input         the UTF-32 string to convert
1906
 * @param length        the length of the string in 4-byte code units (char32_t)
1907
 * @param utf8_buffer   the pointer to buffer that can hold the conversion result
1908
 * @return number of written code units; 0 if conversion is not possible
1909
 */
1910
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept;
1911
1912
/**
1913
 * Using native endianness, convert possibly broken UTF-32 string into a UTF-16 string.
1914
 *
1915
 * During the conversion also validation of the input string is done.
1916
 * This function is suitable to work with inputs from untrusted sources.
1917
 *
1918
 * This function is not BOM-aware.
1919
 *
1920
 * @param input         the UTF-32 string to convert
1921
 * @param length        the length of the string in 4-byte code units (char32_t)
1922
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
1923
 * @return number of written code units; 0 if input is not a valid UTF-32 string
1924
 */
1925
simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1926
1927
/**
1928
 * Convert possibly broken UTF-32 string into UTF-16LE string.
1929
 *
1930
 * During the conversion also validation of the input string is done.
1931
 * This function is suitable to work with inputs from untrusted sources.
1932
 *
1933
 * This function is not BOM-aware.
1934
 *
1935
 * @param input         the UTF-32 string to convert
1936
 * @param length        the length of the string in 4-byte code units (char32_t)
1937
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
1938
 * @return number of written code units; 0 if input is not a valid UTF-32 string
1939
 */
1940
simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
1941
1942
/**
1943
 * Convert possibly broken UTF-32 string into Latin1 string.
1944
 *
1945
 * During the conversion also validation of the input string is done.
1946
 * This function is suitable to work with inputs from untrusted sources.
1947
 *
1948
 * This function is not BOM-aware.
1949
 *
1950
 * @param input         the UTF-32 string to convert
1951
 * @param length        the length of the string in 4-byte code units (char32_t)
1952
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1953
 * @return number of written code units; 0 if input is not a valid UTF-32 string
1954
 */
1955
simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) noexcept;
1956
1957
1958
/**
1959
 * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
1960
 *
1961
 * During the conversion also validation of the input string is done.
1962
 * This function is suitable to work with inputs from untrusted sources.
1963
 *
1964
 * This function is not BOM-aware.
1965
 *
1966
 * @param input         the UTF-32 string to convert
1967
 * @param length        the length of the string in 4-byte code units (char32_t)
1968
 * @param latin1_buffer   the pointer to buffer that can hold conversion result
1969
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
1970
 */
1971
simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * input, size_t length, char* latin1_buffer) noexcept;
1972
1973
/**
1974
 * Convert valid UTF-32 string into Latin1 string.
1975
 *
1976
 * This function assumes that the input string is valid UTF-32.
1977
 *
1978
 * This function is not BOM-aware.
1979
 *
1980
 * @param input         the UTF-32 string to convert
1981
 * @param length        the length of the string in 4-byte code units (char32_t)
1982
 * @param latin1_buffer   the pointer to buffer that can hold the conversion result
1983
 * @return number of written code units; 0 if conversion is not possible
1984
 */
1985
simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) noexcept;
1986
1987
/**
1988
 * Convert possibly broken UTF-32 string into UTF-16BE string.
1989
 *
1990
 * During the conversion also validation of the input string is done.
1991
 * This function is suitable to work with inputs from untrusted sources.
1992
 *
1993
 * This function is not BOM-aware.
1994
 *
1995
 * @param input         the UTF-32 string to convert
1996
 * @param length        the length of the string in 4-byte code units (char32_t)
1997
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
1998
 * @return number of written code units; 0 if input is not a valid UTF-32 string
1999
 */
2000
simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
2001
2002
/**
2003
 * Using native endianness, convert possibly broken UTF-32 string into UTF-16
2004
 * string and stop on error.
2005
 *
2006
 * During the conversion also validation of the input string is done.
2007
 * This function is suitable to work with inputs from untrusted sources.
2008
 *
2009
 * This function is not BOM-aware.
2010
 *
2011
 * @param input         the UTF-32 string to convert
2012
 * @param length        the length of the string in 4-byte code units (char32_t)
2013
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2014
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful.
2015
 */
2016
simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
2017
2018
/**
2019
 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
2020
 *
2021
 * During the conversion also validation of the input string is done.
2022
 * This function is suitable to work with inputs from untrusted sources.
2023
 *
2024
 * This function is not BOM-aware.
2025
 *
2026
 * @param input         the UTF-32 string to convert
2027
 * @param length        the length of the string in 4-byte code units (char32_t)
2028
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2029
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful.
2030
 */
2031
simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
2032
2033
/**
2034
 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
2035
 *
2036
 * During the conversion also validation of the input string is done.
2037
 * This function is suitable to work with inputs from untrusted sources.
2038
 *
2039
 * This function is not BOM-aware.
2040
 *
2041
 * @param input         the UTF-32 string to convert
2042
 * @param length        the length of the string in 4-byte code units (char32_t)
2043
 * @param utf16_buffer   the pointer to buffer that can hold conversion result
2044
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful.
2045
 */
2046
simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
2047
2048
/**
2049
 * Using native endianness, convert valid UTF-32 string into a UTF-16 string.
2050
 *
2051
 * This function assumes that the input string is valid UTF-32.
2052
 *
2053
 * This function is not BOM-aware.
2054
 *
2055
 * @param input         the UTF-32 string to convert
2056
 * @param length        the length of the string in 4-byte code units (char32_t)
2057
 * @param utf16_buffer   the pointer to buffer that can hold the conversion result
2058
 * @return number of written code units; 0 if conversion is not possible
2059
 */
2060
simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
2061
2062
/**
2063
 * Convert valid UTF-32 string into UTF-16LE string.
2064
 *
2065
 * This function assumes that the input string is valid UTF-32.
2066
 *
2067
 * This function is not BOM-aware.
2068
 *
2069
 * @param input         the UTF-32 string to convert
2070
 * @param length        the length of the string in 4-byte code units (char32_t)
2071
 * @param utf16_buffer   the pointer to buffer that can hold the conversion result
2072
 * @return number of written code units; 0 if conversion is not possible
2073
 */
2074
simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
2075
2076
/**
2077
 * Convert valid UTF-32 string into UTF-16BE string.
2078
 *
2079
 * This function assumes that the input string is valid UTF-32.
2080
 *
2081
 * This function is not BOM-aware.
2082
 *
2083
 * @param input         the UTF-32 string to convert
2084
 * @param length        the length of the string in 4-byte code units (char32_t)
2085
 * @param utf16_buffer   the pointer to buffer that can hold the conversion result
2086
 * @return number of written code units; 0 if conversion is not possible
2087
 */
2088
simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
2089
2090
/**
2091
 * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or
2092
 * from UTF-16BE to UTF-16LE.
2093
 *
2094
 * This function does not validate the input.
2095
 *
2096
 * This function is not BOM-aware.
2097
 *
2098
 * @param input         the UTF-16 string to process
2099
 * @param length        the length of the string in 2-byte code units (char16_t)
2100
 * @param output        the pointer to buffer that can hold the conversion result
2101
 */
2102
void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept;
2103
2104
/**
2105
 * Compute the number of bytes that this UTF-32 string would require in UTF-8 format.
2106
 *
2107
 * This function does not validate the input.
2108
 *
2109
 * @param input         the UTF-32 string to convert
2110
 * @param length        the length of the string in 4-byte code units (char32_t)
2111
 * @return the number of bytes required to encode the UTF-32 string as UTF-8
2112
 */
2113
simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept;
2114
2115
/**
2116
 * Compute the number of two-byte code units that this UTF-32 string would require in UTF-16 format.
2117
 *
2118
 * This function does not validate the input.
2119
 *
2120
 * @param input         the UTF-32 string to convert
2121
 * @param length        the length of the string in 4-byte code units (char32_t)
2122
 * @return the number of two-byte code units (char16_t) required to encode the UTF-32 string as UTF-16
2123
 */
2124
simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept;
2125
2126
/**
2127
 * Using native endianness; compute the number of 4-byte code units (char32_t) that this UTF-16
2128
 * string would require in UTF-32 format.
2129
 *
2130
 * This function is equivalent to count_utf16.
2131
 *
2132
 * This function does not validate the input.
2133
 *
2134
 * This function is not BOM-aware.
2135
 *
2136
 * @param input         the UTF-16 string to convert
2137
 * @param length        the length of the string in 2-byte code units (char16_t)
2138
 * @return the number of 4-byte code units (char32_t) required to encode the UTF-16 string as UTF-32
2139
 */
2140
simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept;
2141
2142
/**
2143
 * Compute the number of 4-byte code units (char32_t) that this UTF-16LE string would require in UTF-32 format.
2144
 *
2145
 * This function is equivalent to count_utf16le.
2146
 *
2147
 * This function does not validate the input.
2148
 *
2149
 * This function is not BOM-aware.
2150
 *
2151
 * @param input         the UTF-16LE string to convert
2152
 * @param length        the length of the string in 2-byte code units (char16_t)
2153
 * @return the number of 4-byte code units (char32_t) required to encode the UTF-16LE string as UTF-32
2154
 */
2155
simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept;
2156
2157
/**
2158
 * Compute the number of 4-byte code units (char32_t) that this UTF-16BE string would require in UTF-32 format.
2159
 *
2160
 * This function is equivalent to count_utf16be.
2161
 *
2162
 * This function does not validate the input.
2163
 *
2164
 * This function is not BOM-aware.
2165
 *
2166
 * @param input         the UTF-16BE string to convert
2167
 * @param length        the length of the string in 2-byte code units (char16_t)
2168
 * @return the number of 4-byte code units (char32_t) required to encode the UTF-16BE string as UTF-32
2169
 */
2170
simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept;
2171
2172
/**
2173
 * Count the number of code points (characters) in the string assuming that
2174
 * it is valid.
2175
 *
2176
 * This function assumes that the input string is valid UTF-16 (native endianness).
2177
 *
2178
 * This function is not BOM-aware.
2179
 *
2180
 * @param input         the UTF-16 string to process
2181
 * @param length        the length of the string in 2-byte code units (char16_t)
2182
 * @return number of code points
2183
 */
2184
simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept;
2185
2186
/**
2187
 * Count the number of code points (characters) in the string assuming that
2188
 * it is valid.
2189
 *
2190
 * This function assumes that the input string is valid UTF-16LE.
2191
 *
2192
 * This function is not BOM-aware.
2193
 *
2194
 * @param input         the UTF-16LE string to process
2195
 * @param length        the length of the string in 2-byte code units (char16_t)
2196
 * @return number of code points
2197
 */
2198
simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept;
2199
2200
/**
2201
 * Count the number of code points (characters) in the string assuming that
2202
 * it is valid.
2203
 *
2204
 * This function assumes that the input string is valid UTF-16BE.
2205
 *
2206
 * This function is not BOM-aware.
2207
 *
2208
 * @param input         the UTF-16BE string to process
2209
 * @param length        the length of the string in 2-byte code units (char16_t)
2210
 * @return number of code points
2211
 */
2212
simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept;
2213
2214
/**
2215
 * Count the number of code points (characters) in the string assuming that
2216
 * it is valid.
2217
 *
2218
 * This function assumes that the input string is valid UTF-8.
2219
 *
2220
 * @param input         the UTF-8 string to process
2221
 * @param length        the length of the string in bytes
2222
 * @return number of code points
2223
 */
2224
simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept;
2225
2226
/**
2227
 * Given a valid UTF-8 string having a possibly truncated last character,
2228
 * this function checks the end of string. If the last character is truncated (or partial),
2229
 * then it returns a shorter length (shorter by 1 to 3 bytes) so that the short UTF-8
2230
 * strings only contain complete characters. If there is no truncated character,
2231
 * the original length is returned.
2232
 *
2233
 * This function assumes that the input string is valid UTF-8, but possibly truncated.
2234
 *
2235
 * @param input         the UTF-8 string to process
2236
 * @param length        the length of the string in bytes
2237
 * @return the length of the string in bytes, possibly shorter by 1 to 3 bytes
2238
 */
2239
simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length);
2240
2241
/**
2242
 * Given a valid UTF-16BE string having a possibly truncated last character,
2243
 * this function checks the end of string. If the last character is truncated (or partial),
2244
 * then it returns a shorter length (shorter by 1 unit) so that the short UTF-16BE
2245
 * strings only contain complete characters. If there is no truncated character,
2246
 * the original length is returned.
2247
 *
2248
 * This function assumes that the input string is valid UTF-16BE, but possibly truncated.
2249
 *
2250
 * @param input         the UTF-16BE string to process
2251
 * @param length        the length of the string in 2-byte code units (char16_t)
2252
 * @return the length of the string in 2-byte code units, possibly shorter by 1 unit
2253
 */
2254
simdutf_warn_unused size_t trim_partial_utf16be(const char16_t* input, size_t length);
2255
2256
/**
2257
 * Given a valid UTF-16LE string having a possibly truncated last character,
2258
 * this function checks the end of string. If the last character is truncated (or partial),
2259
 * then it returns a shorter length (shorter by 1 unit) so that the short UTF-16LE
2260
 * strings only contain complete characters. If there is no truncated character,
2261
 * the original length is returned.
2262
 *
2263
 * This function assumes that the input string is valid UTF-16LE, but possibly truncated.
2264
 *
2265
 * @param input         the UTF-16LE string to process
2266
 * @param length        the length of the string in 2-byte code units (char16_t)
2267
 * @return the length of the string in 2-byte code units, possibly shorter by 1 unit
2268
 */
2269
simdutf_warn_unused size_t trim_partial_utf16le(const char16_t* input, size_t length);
2270
2271
2272
/**
2273
 * Given a valid UTF-16 string having a possibly truncated last character,
2274
 * this function checks the end of string. If the last character is truncated (or partial),
2275
 * then it returns a shorter length (shorter by 1 unit) so that the short UTF-16
2276
 * strings only contain complete characters. If there is no truncated character,
2277
 * the original length is returned.
2278
 *
2279
 * This function assumes that the input string is valid UTF-16, but possibly truncated.
2280
 * We use the native endianness.
2281
 *
2282
 * @param input         the UTF-16 string to process
2283
 * @param length        the length of the string in 2-byte code units (char16_t)
2284
 * @return the length of the string in 2-byte code units, possibly shorter by 1 unit
2285
 */
2286
simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t length);
2287
2288
2289
/**
2290
 * Provide the maximal binary length in bytes given the base64 input.
2291
 * In general, if the input contains ASCII spaces, the result will be less than
2292
 * the maximum length.
2293
 *
2294
 * @param input         the base64 input to process
2295
 * @param length        the length of the base64 input in bytes
2296
 * @return maximal number of binary bytes
2297
 */
2298
simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept;
2299
2300
/**
2301
 * Convert a base64 input to a binary output.
2302
 *
2303
 * This function follows the WHATWG forgiving-base64 format, which means that it will
2304
 * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
2305
 * equal signs at the end) or an unpadded input (without any equal signs at the end).
2306
 *
2307
 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
2308
 *
2309
 * This function will fail in case of invalid input. There are two possible reasons for
2310
 * failure: the input contains a number of base64 characters that when divided by 4, leaves
2311
 * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
2312
 * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
2313
 *
2314
 * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
2315
 * If you fail to provide that much space, the function may cause a buffer overflow.
2316
 *
2317
 * @param input         the base64 string to process
2318
 * @param length        the length of the string in bytes
2319
 * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
2320
 * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in bytes) if any, or the number of bytes written if successful.
2321
 */
2322
simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) noexcept;
2323
2324
/**
2325
 * Provide the base64 length in bytes given the length of a binary input.
2326
 *
2327
 * @param length        the length of the input in bytes
2328
 * @return number of base64 bytes
2329
 */
2330
simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept;
2331
2332
/**
2333
 * Convert a binary input to a base64 output. The output is always padded with equal signs so that it is
2334
 * a multiple of 4 bytes long.
2335
 *
2336
 * This function always succeeds.
2337
 *
2338
 * @param input         the binary to process
2339
 * @param length        the length of the input in bytes
2340
 * @param output        the pointer to buffer that can hold the conversion result (should be at least base64_length_from_binary(length) bytes long)
2341
 * @return number of written bytes, will be equal to base64_length_from_binary(length)
2342
 */
2343
size_t binary_to_base64(const char * input, size_t length, char* output) noexcept;
2344
2345
/**
2346
 * An implementation of simdutf for a particular CPU architecture.
2347
 *
2348
 * Also used to maintain the currently active implementation. The active implementation is
2349
 * automatically initialized on first use to the most advanced implementation supported by the host.
2350
 */
2351
class implementation {
2352
public:
2353
2354
  /**
2355
   * The name of this implementation.
2356
   *
2357
   *     const implementation *impl = simdutf::active_implementation;
2358
   *     cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl;
2359
   *
2360
   * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
2361
   */
2362
0
  virtual const std::string &name() const { return _name; }
2363
2364
  /**
2365
   * The description of this implementation.
2366
   *
2367
   *     const implementation *impl = simdutf::active_implementation;
2368
   *     cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl;
2369
   *
2370
   * @return the description of the implementation
2371
   */
2372
0
  virtual const std::string &description() const { return _description; }
2373
2374
  /**
2375
   * The instruction sets this implementation is compiled against
2376
   * and the current CPU match. This function may poll the current CPU/system
2377
   * and should therefore not be called too often if performance is a concern.
2378
   *
2379
   *
2380
   * @return true if the implementation can be safely used on the current system (determined at runtime)
2381
   */
2382
  bool supported_by_runtime_system() const;
2383
2384
  /**
2385
   * This function will try to detect the encoding
2386
   * @param input the string to identify
2387
   * @param length the length of the string in bytes.
2388
   * @return the encoding type detected
2389
   */
2390
  virtual encoding_type autodetect_encoding(const char * input, size_t length) const noexcept;
2391
2392
  /**
2393
   * This function will try to detect the possible encodings in one pass
2394
   * @param input the string to identify
2395
   * @param length the length of the string in bytes.
2396
   * @return the encoding type detected
2397
   */
2398
  virtual int detect_encodings(const char * input, size_t length) const noexcept = 0;
2399
2400
  /**
2401
   * @private For internal implementation use
2402
   *
2403
   * The instruction sets this implementation is compiled against.
2404
   *
2405
   * @return a mask of all required `internal::instruction_set::` values
2406
   */
2407
0
  virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; }
2408
2409
2410
  /**
2411
   * Validate the UTF-8 string.
2412
   *
2413
   * Overridden by each implementation.
2414
   *
2415
   * @param buf the UTF-8 string to validate.
2416
   * @param len the length of the string in bytes.
2417
   * @return true if and only if the string is valid UTF-8.
2418
   */
2419
  simdutf_warn_unused virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0;
2420
2421
  /**
2422
   * Validate the UTF-8 string and stop on errors.
2423
   *
2424
   * Overridden by each implementation.
2425
   *
2426
   * @param buf the UTF-8 string to validate.
2427
   * @param len the length of the string in bytes.
2428
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
2429
   */
2430
  simdutf_warn_unused virtual result validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0;
2431
2432
  /**
2433
   * Validate the ASCII string.
2434
   *
2435
   * Overridden by each implementation.
2436
   *
2437
   * @param buf the ASCII string to validate.
2438
   * @param len the length of the string in bytes.
2439
   * @return true if and only if the string is valid ASCII.
2440
   */
2441
  simdutf_warn_unused virtual bool validate_ascii(const char *buf, size_t len) const noexcept = 0;
2442
2443
  /**
2444
   * Validate the ASCII string and stop on error.
2445
   *
2446
   * Overridden by each implementation.
2447
   *
2448
   * @param buf the ASCII string to validate.
2449
   * @param len the length of the string in bytes.
2450
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
2451
   */
2452
  simdutf_warn_unused virtual result validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0;
2453
2454
  /**
2455
   * Validate the UTF-16LE string. This function may be best when you expect
2456
   * the input to be almost always valid. Otherwise, consider using
2457
   * validate_utf16le_with_errors.
2458
   *
2459
   * Overridden by each implementation.
2460
   *
2461
   * This function is not BOM-aware.
2462
   *
2463
   * @param buf the UTF-16LE string to validate.
2464
   * @param len the length of the string in number of 2-byte code units (char16_t).
2465
   * @return true if and only if the string is valid UTF-16LE.
2466
   */
2467
  simdutf_warn_unused virtual bool validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0;
2468
2469
  /**
2470
   * Validate the UTF-16BE string. This function may be best when you expect
2471
   * the input to be almost always valid. Otherwise, consider using
2472
   * validate_utf16be_with_errors.
2473
   *
2474
   * Overridden by each implementation.
2475
   *
2476
   * This function is not BOM-aware.
2477
   *
2478
   * @param buf the UTF-16BE string to validate.
2479
   * @param len the length of the string in number of 2-byte code units (char16_t).
2480
   * @return true if and only if the string is valid UTF-16BE.
2481
   */
2482
  simdutf_warn_unused virtual bool validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0;
2483
2484
  /**
2485
   * Validate the UTF-16LE string and stop on error.  It might be faster than
2486
   * validate_utf16le when an error is expected to occur early.
2487
   *
2488
   * Overridden by each implementation.
2489
   *
2490
   * This function is not BOM-aware.
2491
   *
2492
   * @param buf the UTF-16LE string to validate.
2493
   * @param len the length of the string in number of 2-byte code units (char16_t).
2494
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
2495
   */
2496
  simdutf_warn_unused virtual result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept = 0;
2497
2498
  /**
2499
   * Validate the UTF-16BE string and stop on error. It might be faster than
2500
   * validate_utf16be when an error is expected to occur early.
2501
   *
2502
   * Overridden by each implementation.
2503
   *
2504
   * This function is not BOM-aware.
2505
   *
2506
   * @param buf the UTF-16BE string to validate.
2507
   * @param len the length of the string in number of 2-byte code units (char16_t).
2508
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
2509
   */
2510
  simdutf_warn_unused virtual result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept = 0;
2511
2512
  /**
2513
   * Validate the UTF-32 string.
2514
   *
2515
   * Overridden by each implementation.
2516
   *
2517
   * This function is not BOM-aware.
2518
   *
2519
   * @param buf the UTF-32 string to validate.
2520
   * @param len the length of the string in number of 4-byte code units (char32_t).
2521
   * @return true if and only if the string is valid UTF-32.
2522
   */
2523
  simdutf_warn_unused virtual bool validate_utf32(const char32_t *buf, size_t len) const noexcept = 0;
2524
2525
  /**
2526
   * Validate the UTF-32 string and stop on error.
2527
   *
2528
   * Overridden by each implementation.
2529
   *
2530
   * This function is not BOM-aware.
2531
   *
2532
   * @param buf the UTF-32 string to validate.
2533
   * @param len the length of the string in number of 4-byte code units (char32_t).
2534
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
2535
   */
2536
  simdutf_warn_unused virtual result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept = 0;
2537
2538
  /**
2539
   * Convert Latin1 string into UTF8 string.
2540
   *
2541
   * This function is suitable to work with inputs from untrusted sources.
2542
   *
2543
   * @param input         the Latin1 string to convert
2544
   * @param length        the length of the string in bytes
2545
   * @param utf8_output   the pointer to buffer that can hold conversion result
2546
   * @return the number of written char; 0 if conversion is not possible
2547
   */
2548
  simdutf_warn_unused virtual size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) const noexcept = 0;
2549
2550
2551
    /**
2552
   * Convert Latin1 string into UTF-16LE string.
2553
   *
2554
   * This function is suitable to work with inputs from untrusted sources.
2555
   *
2556
   * @param input         the Latin1 string to convert
2557
   * @param length        the length of the string in bytes
2558
   * @param utf16_output  the pointer to buffer that can hold conversion result
2559
   * @return the number of written char16_t; 0 if conversion is not possible
2560
   */
2561
  simdutf_warn_unused virtual size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2562
2563
  /**
2564
   * Convert Latin1 string into UTF-16BE string.
2565
   *
2566
   * This function is suitable to work with inputs from untrusted sources.
2567
   *
2568
   * @param input         the Latin1 string to convert
2569
   * @param length        the length of the string in bytes
2570
   * @param utf16_output  the pointer to buffer that can hold conversion result
2571
   * @return the number of written char16_t; 0 if conversion is not possible
2572
   */
2573
  simdutf_warn_unused virtual size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2574
2575
  /**
2576
   * Convert Latin1 string into UTF-32 string.
2577
   *
2578
   * This function is suitable to work with inputs from untrusted sources.
2579
   *
2580
   * @param input         the Latin1 string to convert
2581
   * @param length        the length of the string in bytes
2582
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
2583
   * @return the number of written char32_t; 0 if conversion is not possible
2584
   */
2585
  simdutf_warn_unused virtual size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2586
2587
 /**
2588
   * Convert possibly broken UTF-8 string into latin1 string.
2589
   *
2590
   * During the conversion also validation of the input string is done.
2591
   * This function is suitable to work with inputs from untrusted sources.
2592
   *
2593
   * @param input         the UTF-8 string to convert
2594
   * @param length        the length of the string in bytes
2595
   * @param latin1_output  the pointer to buffer that can hold conversion result
2596
   * @return the number of written char; 0 if the input was not valid UTF-8 string
2597
   */
2598
  simdutf_warn_unused virtual size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) const noexcept = 0;
2599
2600
  /**
2601
   * Convert possibly broken UTF-8 string into latin1 string with errors
2602
   *
2603
   * During the conversion also validation of the input string is done.
2604
   * This function is suitable to work with inputs from untrusted sources.
2605
   *
2606
   * @param input         the UTF-8 string to convert
2607
   * @param length        the length of the string in bytes
2608
   * @param latin1_output  the pointer to buffer that can hold conversion result
2609
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
2610
   */
2611
  simdutf_warn_unused virtual result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) const noexcept = 0;
2612
2613
    /**
2614
   * Convert valid UTF-8 string into latin1 string.
2615
   *
2616
   * This function assumes that the input string is valid UTF-8.
2617
   *
2618
   * This function is not BOM-aware.
2619
   *
2620
   * @param input         the UTF-8 string to convert
2621
   * @param length        the length of the string in bytes
2622
   * @param latin1_output  the pointer to buffer that can hold conversion result
2623
   * @return the number of written char; 0 if the input was not valid UTF-8 string
2624
   */
2625
  simdutf_warn_unused virtual size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) const noexcept = 0;
2626
2627
2628
  /**
2629
   * Convert possibly broken UTF-8 string into UTF-16LE string.
2630
   *
2631
   * During the conversion also validation of the input string is done.
2632
   * This function is suitable to work with inputs from untrusted sources.
2633
   *
2634
   * @param input         the UTF-8 string to convert
2635
   * @param length        the length of the string in bytes
2636
   * @param utf16_output  the pointer to buffer that can hold conversion result
2637
   * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
2638
   */
2639
  simdutf_warn_unused virtual size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2640
2641
  /**
2642
   * Convert possibly broken UTF-8 string into UTF-16BE string.
2643
   *
2644
   * During the conversion also validation of the input string is done.
2645
   * This function is suitable to work with inputs from untrusted sources.
2646
   *
2647
   * @param input         the UTF-8 string to convert
2648
   * @param length        the length of the string in bytes
2649
   * @param utf16_output  the pointer to buffer that can hold conversion result
2650
   * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
2651
   */
2652
  simdutf_warn_unused virtual size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2653
2654
  /**
2655
   * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
2656
   *
2657
   * During the conversion also validation of the input string is done.
2658
   * This function is suitable to work with inputs from untrusted sources.
2659
   *
2660
   * @param input         the UTF-8 string to convert
2661
   * @param length        the length of the string in bytes
2662
   * @param utf16_output  the pointer to buffer that can hold conversion result
2663
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
2664
   */
2665
  simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2666
2667
  /**
2668
   * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
2669
   *
2670
   * During the conversion also validation of the input string is done.
2671
   * This function is suitable to work with inputs from untrusted sources.
2672
   *
2673
   * @param input         the UTF-8 string to convert
2674
   * @param length        the length of the string in bytes
2675
   * @param utf16_output  the pointer to buffer that can hold conversion result
2676
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of code units validated if successful.
2677
   */
2678
  simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
2679
2680
  /**
2681
   * Convert possibly broken UTF-8 string into UTF-32 string.
2682
   *
2683
   * During the conversion also validation of the input string is done.
2684
   * This function is suitable to work with inputs from untrusted sources.
2685
   *
2686
   * @param input         the UTF-8 string to convert
2687
   * @param length        the length of the string in bytes
2688
   * @param utf32_output  the pointer to buffer that can hold conversion result
2689
   * @return the number of written char32_t; 0 if the input was not valid UTF-8 string
2690
   */
2691
  simdutf_warn_unused virtual size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0;
2692
2693
  /**
2694
   * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
2695
   *
2696
   * During the conversion also validation of the input string is done.
2697
   * This function is suitable to work with inputs from untrusted sources.
2698
   *
2699
   * @param input         the UTF-8 string to convert
2700
   * @param length        the length of the string in bytes
2701
   * @param utf32_output  the pointer to buffer that can hold conversion result
2702
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful.
2703
   */
2704
  simdutf_warn_unused virtual result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0;
2705
2706
  /**
2707
   * Convert valid UTF-8 string into UTF-16LE string.
2708
   *
2709
   * This function assumes that the input string is valid UTF-8.
2710
   *
2711
   * @param input         the UTF-8 string to convert
2712
   * @param length        the length of the string in bytes
2713
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
2714
   * @return the number of written char16_t
2715
   */
2716
  simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2717
2718
/**
2719
   * Convert valid UTF-8 string into UTF-16BE string.
2720
   *
2721
   * This function assumes that the input string is valid UTF-8.
2722
   *
2723
   * @param input         the UTF-8 string to convert
2724
   * @param length        the length of the string in bytes
2725
   * @param utf16_buffer  the pointer to buffer that can hold conversion result
2726
   * @return the number of written char16_t
2727
   */
2728
  simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
2729
2730
  /**
2731
   * Convert valid UTF-8 string into UTF-32 string.
2732
   *
2733
   * This function assumes that the input string is valid UTF-8.
2734
   *
2735
   * @param input         the UTF-8 string to convert
2736
   * @param length        the length of the string in bytes
2737
   * @param utf32_buffer  the pointer to buffer that can hold conversion result
2738
   * @return the number of written char32_t
2739
   */
2740
  simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2741
2742
  /**
2743
   * Compute the number of 2-byte code units that this UTF-8 string would require in UTF-16LE format.
2744
   *
2745
   * This function does not validate the input.
2746
   *
2747
   * @param input         the UTF-8 string to process
2748
   * @param length        the length of the string in bytes
2749
   * @return the number of char16_t code units required to encode the UTF-8 string as UTF-16LE
2750
   */
2751
  simdutf_warn_unused virtual size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept = 0;
2752
2753
   /**
2754
   * Compute the number of 4-byte code units that this UTF-8 string would require in UTF-32 format.
2755
   *
2756
   * This function is equivalent to count_utf8.
2757
   *
2758
   * This function does not validate the input.
2759
   *
2760
   * @param input         the UTF-8 string to process
2761
   * @param length        the length of the string in bytes
2762
   * @return the number of char32_t code units required to encode the UTF-8 string as UTF-32
2763
   */
2764
  simdutf_warn_unused virtual size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept = 0;
2765
2766
  /**
2767
   * Convert possibly broken UTF-16LE string into Latin1 string.
2768
   *
2769
   * During the conversion also validation of the input string is done.
2770
   * This function is suitable to work with inputs from untrusted sources.
2771
   *
2772
   * This function is not BOM-aware.
2773
   *
2774
   * @param input         the UTF-16LE string to convert
2775
   * @param length        the length of the string in 2-byte code units (char16_t)
2776
   * @param latin1_buffer   the pointer to buffer that can hold conversion result
2777
   * @return number of written code units; 0 if input is not a valid UTF-16LE string
2778
   */
2779
  simdutf_warn_unused virtual size_t convert_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
2780
2781
  /**
2782
   * Convert possibly broken UTF-16BE string into Latin1 string.
2783
   *
2784
   * During the conversion also validation of the input string is done.
2785
   * This function is suitable to work with inputs from untrusted sources.
2786
   *
2787
   * This function is not BOM-aware.
2788
   *
2789
   * @param input         the UTF-16BE string to convert
2790
   * @param length        the length of the string in 2-byte code units (char16_t)
2791
   * @param latin1_buffer   the pointer to buffer that can hold conversion result
2792
   * @return number of written code units; 0 if input is not a valid UTF-16BE string
2793
   */
2794
  simdutf_warn_unused virtual size_t convert_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
2795
2796
  /**
2797
   * Convert possibly broken UTF-16LE string into Latin1 string.
2798
   *
2799
   * During the conversion also validation of the input string is done.
2800
   * This function is suitable to work with inputs from untrusted sources.
2801
   * This function is not BOM-aware.
2802
   *
2803
   * @param input         the UTF-16LE string to convert
2804
   * @param length        the length of the string in 2-byte code units (char16_t)
2805
   * @param latin1_buffer   the pointer to buffer that can hold conversion result
2806
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
2807
   */
2808
  simdutf_warn_unused virtual result convert_utf16le_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
2809
2810
  /**
2811
   * Convert possibly broken UTF-16BE string into Latin1 string.
2812
   *
2813
   * During the conversion also validation of the input string is done.
2814
   * This function is suitable to work with inputs from untrusted sources.
2815
   * This function is not BOM-aware.
2816
   *
2817
   * @param input         the UTF-16BE string to convert
2818
   * @param length        the length of the string in 2-byte code units (char16_t)
2819
   * @param latin1_buffer   the pointer to buffer that can hold conversion result
2820
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
2821
   */
2822
  simdutf_warn_unused virtual result convert_utf16be_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
2823
2824
  /**
2825
   * Convert valid UTF-16LE string into Latin1 string.
2826
   *
2827
   * This function assumes that the input string is valid UTF-16LE.
2828
2829
   * This function is not BOM-aware.
2830
   *
2831
   * @param input         the UTF-16LE string to convert
2832
   * @param length        the length of the string in 2-byte code units (char16_t)
2833
   * @param latin1_buffer   the pointer to buffer that can hold conversion result
2834
   * @return number of written code units; 0 if conversion is not possible
2835
   */
2836
  simdutf_warn_unused virtual size_t convert_valid_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
2837
2838
  /**
2839
   * Convert valid UTF-16BE string into Latin1 string.
2840
   *
2841
   * This function assumes that the input string is valid UTF-16BE.
2842
   *
2843
   * This function is not BOM-aware.
2844
   *
2845
   * @param input         the UTF-16BE string to convert
2846
   * @param length        the length of the string in 2-byte code units (char16_t)
2847
   * @param latin1_buffer   the pointer to buffer that can hold conversion result
2848
   * @return number of written code units; 0 if conversion is not possible
2849
   */
2850
  simdutf_warn_unused virtual size_t convert_valid_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
2851
2852
  /**
2853
   * Convert possibly broken UTF-16LE string into UTF-8 string.
2854
   *
2855
   * During the conversion also validation of the input string is done.
2856
   * This function is suitable to work with inputs from untrusted sources.
2857
   *
2858
   * This function is not BOM-aware.
2859
   *
2860
   * @param input         the UTF-16LE string to convert
2861
   * @param length        the length of the string in 2-byte code units (char16_t)
2862
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
2863
   * @return number of written code units; 0 if input is not a valid UTF-16LE string
2864
   */
2865
  simdutf_warn_unused virtual size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2866
2867
  /**
2868
   * Convert possibly broken UTF-16BE string into UTF-8 string.
2869
   *
2870
   * During the conversion also validation of the input string is done.
2871
   * This function is suitable to work with inputs from untrusted sources.
2872
   *
2873
   * This function is not BOM-aware.
2874
   *
2875
   * @param input         the UTF-16BE string to convert
2876
   * @param length        the length of the string in 2-byte code units (char16_t)
2877
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
2878
   * @return number of written code units; 0 if input is not a valid UTF-16BE string
2879
   */
2880
  simdutf_warn_unused virtual size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2881
2882
  /**
2883
   * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
2884
   *
2885
   * During the conversion also validation of the input string is done.
2886
   * This function is suitable to work with inputs from untrusted sources.
2887
   *
2888
   * This function is not BOM-aware.
2889
   *
2890
   * @param input         the UTF-16LE string to convert
2891
   * @param length        the length of the string in 2-byte code units (char16_t)
2892
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
2893
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
2894
   */
2895
  simdutf_warn_unused virtual result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2896
2897
  /**
2898
   * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
2899
   *
2900
   * During the conversion also validation of the input string is done.
2901
   * This function is suitable to work with inputs from untrusted sources.
2902
   *
2903
   * This function is not BOM-aware.
2904
   *
2905
   * @param input         the UTF-16BE string to convert
2906
   * @param length        the length of the string in 2-byte code units (char16_t)
2907
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
2908
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
2909
   */
2910
  simdutf_warn_unused virtual result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2911
2912
  /**
2913
   * Convert valid UTF-16LE string into UTF-8 string.
2914
   *
2915
   * This function assumes that the input string is valid UTF-16LE.
2916
   *
2917
   * This function is not BOM-aware.
2918
   *
2919
   * @param input         the UTF-16LE string to convert
2920
   * @param length        the length of the string in 2-byte code units (char16_t)
2921
   * @param utf8_buffer   the pointer to buffer that can hold the conversion result
2922
   * @return number of written code units; 0 if conversion is not possible
2923
   */
2924
  simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2925
2926
  /**
2927
   * Convert valid UTF-16BE string into UTF-8 string.
2928
   *
2929
   * This function assumes that the input string is valid UTF-16BE.
2930
   *
2931
   * This function is not BOM-aware.
2932
   *
2933
   * @param input         the UTF-16BE string to convert
2934
   * @param length        the length of the string in 2-byte code units (char16_t)
2935
   * @param utf8_buffer   the pointer to buffer that can hold the conversion result
2936
   * @return number of written code units; 0 if conversion is not possible
2937
   */
2938
  simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
2939
2940
  /**
2941
   * Convert possibly broken UTF-16LE string into UTF-32 string.
2942
   *
2943
   * During the conversion also validation of the input string is done.
2944
   * This function is suitable to work with inputs from untrusted sources.
2945
   *
2946
   * This function is not BOM-aware.
2947
   *
2948
   * @param input         the UTF-16LE string to convert
2949
   * @param length        the length of the string in 2-byte code units (char16_t)
2950
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
2951
   * @return number of written code units; 0 if input is not a valid UTF-16LE string
2952
   */
2953
  simdutf_warn_unused virtual size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2954
2955
  /**
2956
   * Convert possibly broken UTF-16BE string into UTF-32 string.
2957
   *
2958
   * During the conversion also validation of the input string is done.
2959
   * This function is suitable to work with inputs from untrusted sources.
2960
   *
2961
   * This function is not BOM-aware.
2962
   *
2963
   * @param input         the UTF-16BE string to convert
2964
   * @param length        the length of the string in 2-byte code units (char16_t)
2965
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
2966
   * @return number of written code units; 0 if input is not a valid UTF-16BE string
2967
   */
2968
  simdutf_warn_unused virtual size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2969
2970
  /**
2971
   * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
2972
   *
2973
   * During the conversion also validation of the input string is done.
2974
   * This function is suitable to work with inputs from untrusted sources.
2975
   *
2976
   * This function is not BOM-aware.
2977
   *
2978
   * @param input         the UTF-16LE string to convert
2979
   * @param length        the length of the string in 2-byte code units (char16_t)
2980
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
2981
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful.
2982
   */
2983
  simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2984
2985
  /**
2986
   * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
2987
   *
2988
   * During the conversion also validation of the input string is done.
2989
   * This function is suitable to work with inputs from untrusted sources.
2990
   *
2991
   * This function is not BOM-aware.
2992
   *
2993
   * @param input         the UTF-16BE string to convert
2994
   * @param length        the length of the string in 2-byte code units (char16_t)
2995
   * @param utf32_buffer   the pointer to buffer that can hold conversion result
2996
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char32_t written if successful.
2997
   */
2998
  simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
2999
3000
  /**
3001
   * Convert valid UTF-16LE string into UTF-32 string.
3002
   *
3003
   * This function assumes that the input string is valid UTF-16LE.
3004
   *
3005
   * This function is not BOM-aware.
3006
   *
3007
   * @param input         the UTF-16LE string to convert
3008
   * @param length        the length of the string in 2-byte code units (char16_t)
3009
   * @param utf32_buffer   the pointer to buffer that can hold the conversion result
3010
   * @return number of written code units; 0 if conversion is not possible
3011
   */
3012
  simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
3013
3014
  /**
3015
   * Convert valid UTF-16BE string into UTF-32 string.
3016
   *
3017
   * This function assumes that the input string is valid UTF-16BE.
3018
   *
3019
   * This function is not BOM-aware.
3020
   *
3021
   * @param input         the UTF-16BE string to convert
3022
   * @param length        the length of the string in 2-byte code units (char16_t)
3023
   * @param utf32_buffer   the pointer to buffer that can hold the conversion result
3024
   * @return number of written code units; 0 if conversion is not possible
3025
   */
3026
  simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
3027
3028
  /**
3029
   * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format.
3030
   *
3031
   * This function does not validate the input.
3032
   *
3033
   * This function is not BOM-aware.
3034
   *
3035
   * @param input         the UTF-16LE string to convert
3036
   * @param length        the length of the string in 2-byte code units (char16_t)
3037
   * @return the number of bytes required to encode the UTF-16LE string as UTF-8
3038
   */
3039
  simdutf_warn_unused virtual size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0;
3040
3041
  /**
3042
   * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format.
3043
   *
3044
   * This function does not validate the input.
3045
   *
3046
   * This function is not BOM-aware.
3047
   *
3048
   * @param input         the UTF-16BE string to convert
3049
   * @param length        the length of the string in 2-byte code units (char16_t)
3050
   * @return the number of bytes required to encode the UTF-16BE string as UTF-8
3051
   */
3052
  simdutf_warn_unused virtual size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0;
3053
3054
  /**
3055
   * Convert possibly broken UTF-32 string into Latin1 string.
3056
   *
3057
   * During the conversion also validation of the input string is done.
3058
   * This function is suitable to work with inputs from untrusted sources.
3059
   *
3060
   * This function is not BOM-aware.
3061
   *
3062
   * @param input         the UTF-32 string to convert
3063
   * @param length        the length of the string in 4-byte code units (char32_t)
3064
   * @param latin1_buffer   the pointer to buffer that can hold conversion result
3065
   * @return number of written code units; 0 if input is not a valid UTF-32 string
3066
   */
3067
3068
  simdutf_warn_unused virtual size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
3069
3070
  /**
3071
   * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
3072
   *
3073
   * During the conversion also validation of the input string is done.
3074
   * This function is suitable to work with inputs from untrusted sources.
3075
   *
3076
   * This function is not BOM-aware.
3077
   *
3078
   * @param input         the UTF-32 string to convert
3079
   * @param length        the length of the string in 4-byte code units (char32_t)
3080
   * @param latin1_buffer   the pointer to buffer that can hold conversion result
3081
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
3082
   */
3083
3084
  simdutf_warn_unused virtual result convert_utf32_to_latin1_with_errors(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
3085
3086
  /**
3087
   * Convert valid UTF-32 string into Latin1 string.
3088
   *
3089
   * This function assumes that the input string is valid UTF-32.
3090
   *
3091
   * This function is not BOM-aware.
3092
   *
3093
   * @param input         the UTF-32 string to convert
3094
   * @param length        the length of the string in 4-byte code units (char32_t)
3095
   * @param latin1_buffer   the pointer to buffer that can hold the conversion result
3096
   * @return number of written code units; 0 if conversion is not possible
3097
   */
3098
  simdutf_warn_unused virtual size_t convert_valid_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
3099
3100
  /**
3101
   * Convert possibly broken UTF-32 string into UTF-8 string.
3102
   *
3103
   * During the conversion also validation of the input string is done.
3104
   * This function is suitable to work with inputs from untrusted sources.
3105
   *
3106
   * This function is not BOM-aware.
3107
   *
3108
   * @param input         the UTF-32 string to convert
3109
   * @param length        the length of the string in 4-byte code units (char32_t)
3110
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
3111
   * @return number of written code units; 0 if input is not a valid UTF-32 string
3112
   */
3113
  simdutf_warn_unused virtual size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
3114
3115
  /**
3116
   * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
3117
   *
3118
   * During the conversion also validation of the input string is done.
3119
   * This function is suitable to work with inputs from untrusted sources.
3120
   *
3121
   * This function is not BOM-aware.
3122
   *
3123
   * @param input         the UTF-32 string to convert
3124
   * @param length        the length of the string in 4-byte code units (char32_t)
3125
   * @param utf8_buffer   the pointer to buffer that can hold conversion result
3126
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char written if successful.
3127
   */
3128
  simdutf_warn_unused virtual result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
3129
3130
  /**
3131
   * Convert valid UTF-32 string into UTF-8 string.
3132
   *
3133
   * This function assumes that the input string is valid UTF-32.
3134
   *
3135
   * This function is not BOM-aware.
3136
   *
3137
   * @param input         the UTF-32 string to convert
3138
   * @param length        the length of the string in 4-byte code units (char32_t)
3139
   * @param utf8_buffer   the pointer to buffer that can hold the conversion result
3140
   * @return number of written code units; 0 if conversion is not possible
3141
   */
3142
  simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
3143
3144
3145
    /**
3146
   * Return the number of 2-byte code units (char16_t) that this Latin1 string would require in UTF-16 format.
3147
   *
3148
   * Only the length is taken as a parameter: the content of the string is not inspected.
3149
   *
3150
   * @param length        the length of the Latin1 string in bytes
3151
   * @return the number of char16_t code units required to encode the Latin1 string as UTF-16
3152
   */
3153
    simdutf_warn_unused virtual size_t utf16_length_from_latin1(size_t length) const noexcept = 0;
3154
3155
  /**
3156
   * Convert possibly broken UTF-32 string into UTF-16LE string.
3157
   *
3158
   * During the conversion also validation of the input string is done.
3159
   * This function is suitable to work with inputs from untrusted sources.
3160
   *
3161
   * This function is not BOM-aware.
3162
   *
3163
   * @param input         the UTF-32 string to convert
3164
   * @param length        the length of the string in 4-byte code units (char32_t)
3165
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
3166
   * @return number of written code units; 0 if input is not a valid UTF-32 string
3167
   */
3168
  simdutf_warn_unused virtual size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
3169
3170
  /**
3171
   * Convert possibly broken UTF-32 string into UTF-16BE string.
3172
   *
3173
   * During the conversion also validation of the input string is done.
3174
   * This function is suitable to work with inputs from untrusted sources.
3175
   *
3176
   * This function is not BOM-aware.
3177
   *
3178
   * @param input         the UTF-32 string to convert
3179
   * @param length        the length of the string in 4-byte code units (char32_t)
3180
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
3181
   * @return number of written code units; 0 if input is not a valid UTF-32 string
3182
   */
3183
  simdutf_warn_unused virtual size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
3184
3185
  /**
3186
   * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
3187
   *
3188
   * During the conversion also validation of the input string is done.
3189
   * This function is suitable to work with inputs from untrusted sources.
3190
   *
3191
   * This function is not BOM-aware.
3192
   *
3193
   * @param input         the UTF-32 string to convert
3194
   * @param length        the length of the string in 4-byte code units (char32_t)
3195
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
3196
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful.
3197
   */
3198
  simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
3199
3200
  /**
3201
   * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
3202
   *
3203
   * During the conversion also validation of the input string is done.
3204
   * This function is suitable to work with inputs from untrusted sources.
3205
   *
3206
   * This function is not BOM-aware.
3207
   *
3208
   * @param input         the UTF-32 string to convert
3209
   * @param length        the length of the string in 4-byte code units (char32_t)
3210
   * @param utf16_buffer   the pointer to buffer that can hold conversion result
3211
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in code units) if any, or the number of char16_t written if successful.
3212
   */
3213
  simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
3214
3215
  /**
3216
   * Convert valid UTF-32 string into UTF-16LE string.
3217
   *
3218
   * This function assumes that the input string is valid UTF-32.
3219
   *
3220
   * This function is not BOM-aware.
3221
   *
3222
   * @param input         the UTF-32 string to convert
3223
   * @param length        the length of the string in 4-byte code units (char32_t)
3224
   * @param utf16_buffer   the pointer to buffer that can hold the conversion result
3225
   * @return number of written code units; 0 if conversion is not possible
3226
   */
3227
  simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
3228
3229
  /**
3230
   * Convert valid UTF-32 string into UTF-16BE string.
3231
   *
3232
   * This function assumes that the input string is valid UTF-32.
3233
   *
3234
   * This function is not BOM-aware.
3235
   *
3236
   * @param input         the UTF-32 string to convert
3237
   * @param length        the length of the string in 4-byte code units (char32_t)
3238
   * @param utf16_buffer   the pointer to buffer that can hold the conversion result
3239
   * @return number of written code units; 0 if conversion is not possible
3240
   */
3241
  simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
3242
3243
  /**
3244
   * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or
3245
   * from UTF-16BE to UTF-16LE.
3246
   *
3247
   * This function does not validate the input.
3248
   *
3249
   * This function is not BOM-aware.
3250
   *
3251
   * @param input         the UTF-16 string to process
3252
   * @param length        the length of the string in 2-byte code units (char16_t)
3253
   * @param output        the pointer to buffer that can hold the conversion result
3254
   */
3255
  virtual void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept = 0;
3256
3257
 /**
3258
   * Return the number of bytes that this Latin1 string would require in UTF-8 format.
3259
   *
3260
   * @param input         the Latin1 string to convert
3261
   * @param length        the length of the string in bytes
3262
   * @return the number of bytes required to encode the Latin1 string as UTF-8
3263
   */
3264
    simdutf_warn_unused virtual size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept = 0;
3265
3266
  /**
3267
   * Compute the number of bytes that this UTF-32 string would require in UTF-8 format.
3268
   *
3269
   * This function does not validate the input.
3270
   *
3271
   * @param input         the UTF-32 string to convert
3272
   * @param length        the length of the string in 4-byte code units (char32_t)
3273
   * @return the number of bytes required to encode the UTF-32 string as UTF-8
3274
   */
3275
  simdutf_warn_unused virtual size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0;
3276
3277
  /**
3278
   * Compute the number of bytes that this UTF-32 string would require in Latin1 format.
3279
   *
3280
   * This function does not validate the input.
3281
   *
3282
   * @param length        the length of the string in 4-byte code units (char32_t)
3283
   * @return the number of bytes required to encode the UTF-32 string as Latin1
3284
   */
3285
  simdutf_warn_unused virtual size_t latin1_length_from_utf32(size_t length) const noexcept = 0;
3286
3287
  /**
3288
   * Compute the number of bytes that this UTF-8 string would require in Latin1 format.
3289
   *
3290
   * This function does not validate the input.
3291
   *
3292
   * @param input         the UTF-8 string to convert
3293
   * @param length        the length of the string in bytes
3294
   * @return the number of bytes required to encode the UTF-8 string as Latin1
3295
   */
3296
  simdutf_warn_unused virtual size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept = 0;
3297
3298
  /**
3299
   * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format.
3300
   *
3301
   * This function does not validate the input.
3302
   *
3303
   * This function is not BOM-aware.
3304
   * Only the length is taken as a parameter: the content of the string is not inspected.
3305
   *
3306
   * @param length        the length of the string in 2-byte code units (char16_t)
3307
   * @return the number of bytes required to encode the UTF-16LE/BE string as Latin1
3308
   */
3309
  simdutf_warn_unused virtual size_t latin1_length_from_utf16(size_t length) const noexcept = 0;
3310
3311
  /**
3312
   * Compute the number of two-byte code units that this UTF-32 string would require in UTF-16 format.
3313
   *
3314
   * This function does not validate the input.
3315
   *
3316
   * @param input         the UTF-32 string to convert
3317
   * @param length        the length of the string in 4-byte code units (char32_t)
3318
   * @return the number of char16_t code units required to encode the UTF-32 string as UTF-16
3319
   */
3320
  simdutf_warn_unused virtual size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0;
3321
3322
3323
    /**
3324
   * Return the number of 4-byte code units (char32_t) that this Latin1 string would require in UTF-32 format.
3325
   *
3326
   * This function does not validate the input: only the length is taken as a
3327
   * parameter, so the content of the string is not inspected.
3328
   *
3329
   * @param length        the length of the Latin1 string in bytes
3330
   * @return the number of char32_t code units required to encode the Latin1 string as UTF-32
3331
   */
3332
    simdutf_warn_unused virtual size_t utf32_length_from_latin1(size_t length) const noexcept = 0;
3333
3334
  /**
3335
   * Compute the number of 4-byte code units (char32_t) that this UTF-16LE string would require in UTF-32 format.
3336
   *
3337
   * This function is equivalent to count_utf16le.
3338
   *
3339
   * This function does not validate the input.
3340
   *
3341
   * This function is not BOM-aware.
3342
   *
3343
   * @param input         the UTF-16LE string to convert
3344
   * @param length        the length of the string in 2-byte code units (char16_t)
3345
   * @return the number of bytes required to encode the UTF-16LE string as UTF-32
3346
   */
3347
  simdutf_warn_unused virtual size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0;
3348
3349
  /*
3350
   * Compute the number of 4-byte code units (char32_t) that this UTF-16BE string would require in UTF-32 format.
3351
   *
3352
   * This function is equivalent to count_utf16be.
3353
   *
3354
   * This function does not validate the input.
3355
   *
3356
   * This function is not BOM-aware.
3357
   *
3358
   * @param input         the UTF-16BE string to convert
3359
   * @param length        the length of the string in 2-byte code units (char16_t)
3360
   * @return the number of 4-byte code units (char32_t) required to encode the UTF-16BE string as UTF-32
3361
   */
3362
  simdutf_warn_unused virtual size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0;
3363
3364
  /**
3365
   * Count the number of code points (characters) in the string assuming that
3366
   * it is valid.
3367
   *
3368
   * This function assumes that the input string is valid UTF-16LE.
3369
   *
3370
   * This function is not BOM-aware.
3371
   *
3372
   * @param input         the UTF-16LE string to process
3373
   * @param length        the length of the string in 2-byte code units (char16_t)
3374
   * @return number of code points
3375
   */
3376
  simdutf_warn_unused virtual size_t count_utf16le(const char16_t * input, size_t length) const noexcept = 0;
3377
3378
  /**
3379
   * Count the number of code points (characters) in the string assuming that
3380
   * it is valid.
3381
   *
3382
   * This function assumes that the input string is valid UTF-16BE.
3383
   *
3384
   * This function is not BOM-aware.
3385
   *
3386
   * @param input         the UTF-16BE string to process
3387
   * @param length        the length of the string in 2-byte code units (char16_t)
3388
   * @return number of code points
3389
   */
3390
  simdutf_warn_unused virtual size_t count_utf16be(const char16_t * input, size_t length) const noexcept = 0;
3391
3392
3393
  /**
3394
   * Count the number of code points (characters) in the string assuming that
3395
   * it is valid.
3396
   *
3397
   * This function assumes that the input string is valid UTF-8.
3398
   *
3399
   * @param input         the UTF-8 string to process
3400
   * @param length        the length of the string in bytes
3401
   * @return number of code points
3402
   */
3403
  simdutf_warn_unused virtual size_t count_utf8(const char * input, size_t length) const noexcept = 0;
3404
3405
  /**
3406
   * Provide the maximal binary length in bytes given the base64 input.
3407
   * In general, if the input contains ASCII spaces, the result will be less than
3408
   * the maximum length.
3409
   *
3410
   * @param input         the base64 input to process
3411
   * @param length        the length of the base64 input in bytes
3412
   * @return maximal number of binary bytes
3413
   */
3414
  simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept = 0;
3415
3416
  /**
3417
   * Convert a base64 input to a binary output.
3418
   *
3419
   * This function follows the WHATWG forgiving-base64 format, which means that it will
3420
   * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
3421
   * equal signs at the end) or an unpadded input (without any equal signs at the end).
3422
   *
3423
   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
3424
   *
3425
   * This function will fail in case of invalid input. There are two possible reasons for
3426
   * failure: the input contains a number of base64 characters that when divided by 4, leaves
3426
   * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
3428
   * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
3429
   *
3430
   * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
3431
   * If you fail to provide that much space, the function may cause a buffer overflow.
3432
   *
3433
   * @param input         the base64 string to process
3434
   * @param length        the length of the string in bytes
3435
   * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
3436
   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in bytes) if any, or the number of bytes written if successful.
3437
   */
3438
  simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output) const noexcept = 0;
3439
3440
  /**
3441
   * Provide the base64 length in bytes given the length of a binary input.
3442
   *
3443
   * @param length        the length of the input in bytes
3444
   * @return number of base64 bytes
3445
   */
3446
  simdutf_warn_unused virtual size_t base64_length_from_binary(size_t length) const noexcept = 0;
3447
3448
  /**
3449
   * Convert a binary input to a base64 output. The output is always padded with equal signs so that it is
3450
   * a multiple of 4 bytes long.
3451
   *
3452
   * This function always succeeds.
3453
   *
3454
   * @param input         the binary to process
3455
   * @param length        the length of the input in bytes
3456
   * @param output        the pointer to buffer that can hold the conversion result (should be at least base64_length_from_binary(length) bytes long)
3457
   * @return number of written bytes, will be equal to base64_length_from_binary(length)
3458
   */
3459
  virtual size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept = 0;
3460
3461
3462
protected:
3463
  /**
   * @private Construct an implementation with the given name and description. For subclasses.
   *
   * Both strings are received by value and then copied into the const members
   * _name and _description; the instruction-set value is stored unchanged.
   *
   * @param name                       the name stored for this implementation
   * @param description                the description stored for this implementation
   * @param required_instruction_sets  the value stored in _required_instruction_sets
   */
3464
  simdutf_really_inline implementation(
3465
    std::string name,
3466
    std::string description,
3467
    uint32_t required_instruction_sets
3468
  ) :
3469
    _name(name), // copy of the by-value parameter
3470
    _description(description), // copy of the by-value parameter
3471
    _required_instruction_sets(required_instruction_sets)
3472
0
  {
3473
0
  }
3474
  virtual ~implementation()=default;
3475
3476
private:
3477
  /**
3478
   * The name of this implementation.
3479
   */
3480
  const std::string _name;
3481
3482
  /**
3483
   * The description of this implementation.
3484
   */
3485
  const std::string _description;
3486
3487
  /**
3488
   * Instruction sets required for this implementation.
3489
   */
3490
  const uint32_t _required_instruction_sets;
3491
};
3492
3493
/** @private */
3494
namespace internal {
3495
3496
/**
3497
 * The list of available implementations compiled into simdutf.
3498
 */
3499
class available_implementation_list {
3500
public:
3501
  /** Construct the list of available implementations compiled into simdutf (the constructor body is empty) */
3502
0
  simdutf_really_inline available_implementation_list() {}
3503
  /** Number of implementations */
3504
  size_t size() const noexcept;
3505
  /** STL const begin() iterator */
3506
  const implementation * const *begin() const noexcept;
3507
  /** STL const end() iterator */
3508
  const implementation * const *end() const noexcept;
3509
3510
  /**
3511
   * Get the implementation with the given name.
3512
   *
3513
   * Case sensitive.
3514
   *
3515
   *     const implementation *impl = simdutf::get_available_implementations()["westmere"];
3516
   *     if (!impl) { exit(1); }
3517
   *     if (!impl->supported_by_runtime_system()) { exit(1); }
3518
   *     simdutf::get_active_implementation() = impl;
3519
   *
3520
   * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
3521
   * @return the implementation, or nullptr if no implementation in the list has that name.
3522
   */
3523
0
  const implementation * operator[](const std::string &name) const noexcept {
3524
0
    for (const implementation * impl : *this) {
3525
0
      if (impl->name() == name) { return impl; }
3526
0
    }
3527
0
    return nullptr;
3528
0
  }
3529
3530
  /**
3531
   * Detect the most advanced implementation supported by the current host.
3532
   *
3533
   * This is used to initialize the implementation on startup.
3534
   *
3535
   *     const implementation *impl = simdutf::get_available_implementations().detect_best_supported();
3536
   *     simdutf::get_active_implementation() = impl;
3537
   *
3538
   * @return the most advanced supported implementation for the current host, or an
3539
   *         implementation that returns UNSUPPORTED_ARCHITECTURE if there is no supported
3540
   *         implementation. Will never return nullptr.
3541
   */
3542
  const implementation *detect_best_supported() const noexcept;
3543
};
3544
3545
/**
 * Pointer holder whose storage is chosen at compile time: a plain T* when
 * SIMDUTF_NO_THREADS is defined, a std::atomic<T*> otherwise.
 *
 * It offers implicit conversion to the raw pointer, dereference and arrow
 * access, and assignment from a raw pointer. The const overloads expose the
 * pointee as const T.
 */
template<typename T>
3546
class atomic_ptr {
3547
public:
3548
  atomic_ptr(T *_ptr) : ptr{_ptr} {} // not explicit: a raw T* converts implicitly
3549
3550
#if defined(SIMDUTF_NO_THREADS)
3551
  // Plain-pointer storage: every accessor reads the member directly.
  operator const T*() const { return ptr; }
3552
  const T& operator*() const { return *ptr; }
3553
  const T* operator->() const { return ptr; }
3554
3555
  operator T*() { return ptr; }
3556
  T& operator*() { return *ptr; }
3557
  T* operator->() { return ptr; }
3558
  atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; }
3559
3560
#else
3561
  // std::atomic storage: load() is called without an argument, so it uses the
  // default memory order of std::atomic (std::memory_order_seq_cst).
  operator const T*() const { return ptr.load(); }
3562
  const T& operator*() const { return *ptr; } // *ptr goes through std::atomic's implicit conversion to T*, which loads the value
3563
  const T* operator->() const { return ptr.load(); }
3564
3565
  operator T*() { return ptr.load(); }
3566
  T& operator*() { return *ptr; } // same implicit atomic load as the const overload
3567
  T* operator->() { return ptr.load(); }
3568
  atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; } // std::atomic assignment performs an atomic store
3569
3570
#endif
3571
3572
private:
3573
#if defined(SIMDUTF_NO_THREADS)
3574
  T* ptr; // raw pointer, no synchronization
3575
#else
3576
  std::atomic<T*> ptr; // atomically loaded/stored pointer
3577
#endif
3578
};
3579
3580
class detect_best_supported_implementation_on_first_use;
3581
3582
} // namespace internal
3583
3584
/**
3585
 * The list of available implementations compiled into simdutf.
3586
 */
3587
extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations();
3588
3589
/**
3590
  * The active implementation.
3591
  *
3592
  * Automatically initialized on first use to the most advanced implementation supported by this hardware.
3593
  */
3594
extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation();
3595
3596
3597
} // namespace simdutf
3598
3599
#endif // SIMDUTF_IMPLEMENTATION_H
3600
/* end file include/simdutf/implementation.h */
3601
3602
3603
// Implementation-internal files (must be included before the implementations themselves, to keep
3604
// amalgamation working--otherwise, the first time a file is included, it might be put inside the
3605
// #ifdef SIMDUTF_IMPLEMENTATION_ARM64/FALLBACK/etc., which means the other implementations can't
3606
// compile unless that implementation is turned on).
3607
3608
3609
SIMDUTF_POP_DISABLE_WARNINGS
3610
3611
#endif // SIMDUTF_H
3612
/* end file include/simdutf.h */