/src/ruby/ext/json/simd/simd.h
Line | Count | Source |
1 | | #include "../json.h" |
2 | | |
/*
 * Identifies a SIMD backend. This is the value returned by
 * find_simd_implementation().
 */
typedef enum {
    SIMD_NONE = 0, /* no vectorized backend selected */
    SIMD_NEON = 1, /* ARM NEON backend */
    SIMD_SSE2 = 2  /* x86-64 SSE2 backend */
} SIMD_Implementation;
8 | | |
/*
 * Compilers without __has_builtin get a stub that always answers "no", so
 * the feature tests below (and later in this header) stay well-formed.
 * Note that after this point defined(__has_builtin) is always true.
 */
#if !defined(__has_builtin)
#  define __has_builtin(x) 0
#endif

/*
 * HAVE_BUILTIN_CTZLL is 1 when the count-trailing-zeros builtins may be used:
 *   - clang: only if __has_builtin reports __builtin_ctzll;
 *   - GCC (non-clang): version 4.3 or newer;
 *   - anything else: 0, which selects the portable loops.
 */
#if defined(__clang__) && __has_builtin(__builtin_ctzll)
#  define HAVE_BUILTIN_CTZLL 1
#elif !defined(__clang__) && defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
#  define HAVE_BUILTIN_CTZLL 1
#else
#  define HAVE_BUILTIN_CTZLL 0
#endif
24 | | |
/*
 * Counts the zero bits below the lowest set bit of a 64-bit value.
 *
 * `input` must be non-zero; this is asserted because the builtin is
 * undefined for 0. The portable fallback returns 0 for a zero input.
 */
static inline uint32_t trailing_zeros64(uint64_t input)
{
    JSON_ASSERT(input > 0); // __builtin_ctzll(0) is undefined behavior

#if HAVE_BUILTIN_CTZLL
    return __builtin_ctzll(input);
#else
    // Portable fallback: shift right until the lowest set bit reaches bit 0.
    uint32_t count = 0;
    for (uint64_t bits = input; bits != 0 && (bits & 1) == 0; bits >>= 1) {
        count++;
    }
    return count;
#endif
}
41 | | |
/*
 * Counts the zero bits below the lowest set bit of an `int`.
 *
 * `input` must be strictly positive; this is asserted because the builtin is
 * undefined for 0. The portable fallback returns 0 for any non-positive input.
 */
static inline int trailing_zeros(int input)
{
    JSON_ASSERT(input > 0); // __builtin_ctz(0) is undefined behavior

#if HAVE_BUILTIN_CTZLL
    // __builtin_ctz is gated on the same feature macro as __builtin_ctzll.
    return __builtin_ctz(input);
#else
    // Portable fallback: shift right until the lowest set bit reaches bit 0.
    int count = 0;
    for (int bits = input; bits > 0 && (bits & 1) == 0; bits >>= 1) {
        count++;
    }
    return count;
#endif
}
58 | | |
59 | | #ifdef JSON_ENABLE_SIMD |
60 | | |
61 | | #define SIMD_MINIMUM_THRESHOLD 4 |
62 | | |
63 | | ALWAYS_INLINE(static) void json_fast_memcpy16(char *dst, const char *src, size_t len) |
64 | | { |
65 | | RBIMPL_ASSERT_OR_ASSUME(len < 16); |
66 | | RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD); // 4 |
67 | | #if defined(__has_builtin) && __has_builtin(__builtin_memcpy) |
68 | | // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD (4) and vec_len-1 (15) bytes. |
69 | | // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy |
70 | | // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct |
71 | | // position in both copies. |
72 | | |
73 | | // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the |
74 | | // generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)), |
75 | | // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional |
76 | | // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch |
77 | | // plus two loads and stores generated when using __builtin_memcpy. |
78 | | if (len >= 8) { |
79 | | __builtin_memcpy(dst, src, 8); |
80 | | __builtin_memcpy(dst + len - 8, src + len - 8, 8); |
81 | | } else { |
82 | | __builtin_memcpy(dst, src, 4); |
83 | | __builtin_memcpy(dst + len - 4, src + len - 4, 4); |
84 | | } |
85 | | #else |
86 | | MEMCPY(dst, src, char, len); |
87 | | #endif |
88 | | } |
89 | | |
90 | | #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64) |
91 | | #include <arm_neon.h> |
92 | | |
93 | | #define FIND_SIMD_IMPLEMENTATION_DEFINED 1 |
94 | | static inline SIMD_Implementation find_simd_implementation(void) |
95 | | { |
96 | | return SIMD_NEON; |
97 | | } |
98 | | |
99 | | #define HAVE_SIMD 1 |
100 | | #define HAVE_SIMD_NEON 1 |
101 | | |
102 | | // See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon |
103 | | ALWAYS_INLINE(static) uint64_t neon_match_mask(uint8x16_t matches) |
104 | | { |
105 | | const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(matches), 4); |
106 | | const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0); |
107 | | return mask & 0x8888888888888888ull; |
108 | | } |
109 | | |
110 | | ALWAYS_INLINE(static) uint64_t compute_chunk_mask_neon(const char *ptr) |
111 | | { |
112 | | uint8x16_t chunk = vld1q_u8((const unsigned char *)ptr); |
113 | | |
114 | | // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33 |
115 | | // https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/ |
116 | | const uint8x16_t too_low_or_dbl_quote = vcltq_u8(veorq_u8(chunk, vdupq_n_u8(2)), vdupq_n_u8(33)); |
117 | | |
118 | | uint8x16_t has_backslash = vceqq_u8(chunk, vdupq_n_u8('\\')); |
119 | | uint8x16_t needs_escape = vorrq_u8(too_low_or_dbl_quote, has_backslash); |
120 | | return neon_match_mask(needs_escape); |
121 | | } |
122 | | |
123 | | ALWAYS_INLINE(static) int string_scan_simd_neon(const char **ptr, const char *end, uint64_t *mask) |
124 | | { |
125 | | while (*ptr + sizeof(uint8x16_t) <= end) { |
126 | | uint64_t chunk_mask = compute_chunk_mask_neon(*ptr); |
127 | | if (chunk_mask) { |
128 | | *mask = chunk_mask; |
129 | | return 1; |
130 | | } |
131 | | *ptr += sizeof(uint8x16_t); |
132 | | } |
133 | | return 0; |
134 | | } |
135 | | |
136 | | #endif /* ARM Neon Support.*/ |
137 | | |
138 | | #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) |
139 | | |
140 | | #ifdef HAVE_X86INTRIN_H |
141 | | #include <x86intrin.h> |
142 | | |
// Defined on x86-64 builds where <x86intrin.h> is available (HAVE_X86INTRIN_H).
// NOTE(review): presumably tested by the files including this header to
// enable the SSE2 code paths — confirm against the callers.
#define HAVE_SIMD 1
#define HAVE_SIMD_SSE2 1
145 | | |
146 | | #ifdef HAVE_CPUID_H |
// Suppresses the generic fallback definition of find_simd_implementation().
#define FIND_SIMD_IMPLEMENTATION_DEFINED 1

// Lets the SSE2 helpers be compiled for SSE2 on GCC/clang regardless of the
// translation unit's default target flags; expands to nothing elsewhere.
#if defined(__clang__) || defined(__GNUC__)
#  define TARGET_SSE2 __attribute__((target("sse2")))
#else
#  define TARGET_SSE2
#endif

// Unsigned per-byte comparisons built from _mm_max_epu8 and _mm_cmpeq_epi8.
// Each result lane is 0xFF when the relation holds and 0x00 otherwise.
// Being macros, they may evaluate an argument more than once.
#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8((a), (b)), (a))
#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8((b), (a))
#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8((a), (b)), _mm_set1_epi8(-1))
#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8((b), (a))
159 | | |
160 | | ALWAYS_INLINE(static) TARGET_SSE2 int compute_chunk_mask_sse2(const char *ptr) |
161 | | { |
162 | | __m128i chunk = _mm_loadu_si128((__m128i const*)ptr); |
163 | | // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33 |
164 | | // https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/ |
165 | | __m128i too_low_or_dbl_quote = _mm_cmplt_epu8(_mm_xor_si128(chunk, _mm_set1_epi8(2)), _mm_set1_epi8(33)); |
166 | | __m128i has_backslash = _mm_cmpeq_epi8(chunk, _mm_set1_epi8('\\')); |
167 | | __m128i needs_escape = _mm_or_si128(too_low_or_dbl_quote, has_backslash); |
168 | | return _mm_movemask_epi8(needs_escape); |
169 | | } |
170 | | |
171 | | ALWAYS_INLINE(static) TARGET_SSE2 int string_scan_simd_sse2(const char **ptr, const char *end, int *mask) |
172 | | { |
173 | | while (*ptr + sizeof(__m128i) <= end) { |
174 | | int chunk_mask = compute_chunk_mask_sse2(*ptr); |
175 | | if (chunk_mask) { |
176 | | *mask = chunk_mask; |
177 | | return 1; |
178 | | } |
179 | | *ptr += sizeof(__m128i); |
180 | | } |
181 | | |
182 | | return 0; |
183 | | } |
184 | | |
185 | | #include <cpuid.h> |
186 | | #endif /* HAVE_CPUID_H */ |
187 | | |
/*
 * Runtime selection of the SIMD backend on x86-64.
 *
 * Returns SIMD_SSE2 when the CPU reports SSE2 support, SIMD_NONE otherwise.
 *
 * Only compiled when FIND_SIMD_IMPLEMENTATION_DEFINED is set, which on this
 * branch happens together with the SSE2 helpers (under HAVE_CPUID_H). Without
 * that macro the generic fallback at the end of this header is compiled as
 * well, and an unguarded definition here would be a duplicate definition of
 * the same function.
 */
#ifdef FIND_SIMD_IMPLEMENTATION_DEFINED
static inline SIMD_Implementation find_simd_implementation(void)
{
    // TODO Revisit. I think the SSE version now only uses SSE2 instructions.
    if (__builtin_cpu_supports("sse2")) {
        return SIMD_SSE2;
    }

    return SIMD_NONE;
}
#endif /* FIND_SIMD_IMPLEMENTATION_DEFINED */
197 | | |
198 | | #endif /* HAVE_X86INTRIN_H */ |
199 | | #endif /* X86_64 Support */ |
200 | | |
201 | | #endif /* JSON_ENABLE_SIMD */ |
202 | | |
203 | | #ifndef FIND_SIMD_IMPLEMENTATION_DEFINED |
204 | | static inline SIMD_Implementation find_simd_implementation(void) |
205 | 0 | { |
206 | 0 | return SIMD_NONE; |
207 | 0 | } |
208 | | #endif |