/src/simdutf/src/scalar/utf32.h
Line | Count | Source |
1 | | #ifndef SIMDUTF_UTF32_H |
2 | | #define SIMDUTF_UTF32_H |
3 | | |
4 | | namespace simdutf { |
5 | | namespace scalar { |
6 | | namespace { |
7 | | namespace utf32 { |
8 | | |
9 | | inline simdutf_warn_unused bool validate(const char32_t *buf, |
10 | 14.1k | size_t len) noexcept { |
11 | 14.1k | const uint32_t *data = reinterpret_cast<const uint32_t *>(buf); |
12 | 14.1k | uint64_t pos = 0; |
13 | 129k | for (; pos < len; pos++) { |
14 | 115k | uint32_t word = data[pos]; |
15 | 115k | if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) { |
16 | 0 | return false; |
17 | 0 | } |
18 | 115k | } |
19 | 14.1k | return true; |
20 | 14.1k | } |
21 | | |
22 | | inline simdutf_warn_unused result validate_with_errors(const char32_t *buf, |
23 | 0 | size_t len) noexcept { |
24 | 0 | const uint32_t *data = reinterpret_cast<const uint32_t *>(buf); |
25 | 0 | size_t pos = 0; |
26 | 0 | for (; pos < len; pos++) { |
27 | 0 | uint32_t word = data[pos]; |
28 | 0 | if (word > 0x10FFFF) { |
29 | 0 | return result(error_code::TOO_LARGE, pos); |
30 | 0 | } |
31 | 0 | if (word >= 0xD800 && word <= 0xDFFF) { |
32 | 0 | return result(error_code::SURROGATE, pos); |
33 | 0 | } |
34 | 0 | } |
35 | 0 | return result(error_code::SUCCESS, pos); |
36 | 0 | } |
37 | | |
// Computes how many UTF-8 bytes are needed to encode buf[0..len).
//
// buf: pointer to `len` UTF-32 code units (never dereferenced when len is 0).
// len: number of code units to measure.
//
// Each code unit contributes 1 byte, plus one more for each of the thresholds
// 0x7F, 0x7FF and 0xFFFF that it exceeds (so 1 to 4 bytes). The input is not
// validated, and a leading BOM is counted like any other code unit.
inline size_t utf8_length_from_utf32(const char32_t *buf, size_t len) {
  // We are not BOM aware.
  size_t counter{0};
  for (size_t i = 0; i < len; i++) {
    // Read the element as char32_t and convert by value; this avoids
    // accessing char32_t storage through a uint32_t lvalue.
    const uint32_t word = static_cast<uint32_t>(buf[i]);
    // credit: @ttsugriy for the vectorizable approach
    counter++;                                      // ASCII
    counter += static_cast<size_t>(word > 0x7F);    // two-byte
    counter += static_cast<size_t>(word > 0x7FF);   // three-byte
    counter += static_cast<size_t>(word > 0xFFFF);  // four-bytes
  }
  return counter;
}
51 | | |
// Computes how many UTF-16 code units are needed to encode buf[0..len).
//
// buf: pointer to `len` UTF-32 code units (never dereferenced when len is 0).
// len: number of code units to measure.
//
// Each input code unit contributes one UTF-16 unit, plus a second one when
// its value exceeds 0xFFFF (surrogate pair). The input is not validated, and
// a leading BOM is counted like any other code unit.
inline size_t utf16_length_from_utf32(const char32_t *buf, size_t len) {
  // We are not BOM aware.
  size_t counter{0};
  for (size_t i = 0; i < len; i++) {
    // Read the element as char32_t and convert by value; this avoids
    // accessing char32_t storage through a uint32_t lvalue.
    const uint32_t word = static_cast<uint32_t>(buf[i]);
    counter++;                                      // non-surrogate word
    counter += static_cast<size_t>(word > 0xFFFF);  // surrogate pair
  }
  return counter;
}
62 | | |
63 | | } // namespace utf32 |
64 | | } // unnamed namespace |
65 | | } // namespace scalar |
66 | | } // namespace simdutf |
67 | | |
68 | | #endif |