Coverage Report

Created: 2026-06-03 06:22

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/ruby/ext/json/simd/simd.h
Line
Count
Source
1
#include "../json.h"
2
3
/*
 * Identifies the SIMD code path reported by find_simd_implementation().
 * The numeric values are the ones the compiler would assign implicitly.
 */
typedef enum {
    SIMD_NONE = 0, /* no SIMD path */
    SIMD_NEON = 1, /* ARM NEON path */
    SIMD_SSE2 = 2  /* x86-64 SSE2 path */
} SIMD_Implementation;
8
9
/*
 * Compilers that do not provide __has_builtin get a stub that reports every
 * builtin as unavailable, so `#if __has_builtin(...)` is always well-formed.
 */
#if !defined(__has_builtin)
# define __has_builtin(x) 0
#endif
12
13
/*
 * HAVE_BUILTIN_CTZLL is always defined, as 1 or 0:
 *   - clang:             1 only if __has_builtin(__builtin_ctzll) says so
 *   - GCC 4.3 or newer:  1
 *   - anything else:     0
 */
#if !defined(__clang__)
# if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
#   define HAVE_BUILTIN_CTZLL 1
# else
#   define HAVE_BUILTIN_CTZLL 0
# endif
#elif __has_builtin(__builtin_ctzll)
# define HAVE_BUILTIN_CTZLL 1
#else
# define HAVE_BUILTIN_CTZLL 0
#endif
24
25
/*
 * Count the zero bits below the lowest set bit of a 64-bit value.
 *
 * `input` is asserted to be non-zero: __builtin_ctzll(0) is undefined
 * behavior. The portable fallback returns 0 for a zero input.
 */
static inline uint32_t trailing_zeros64(uint64_t input)
{
    JSON_ASSERT(input > 0); // the builtin is undefined for 0

#if HAVE_BUILTIN_CTZLL
    return __builtin_ctzll(input);
#else
    // Portable path: shift right until the lowest bit is set.
    uint32_t count = 0;
    for (uint64_t bits = input; (bits & 1) == 0 && bits > 0; bits >>= 1) {
        count++;
    }
    return count;
#endif
}
41
42
/*
 * Count the zero bits below the lowest set bit of an int.
 *
 * `input` is asserted to be positive: __builtin_ctz(0) is undefined behavior.
 * The builtin path is selected by HAVE_BUILTIN_CTZLL, the same switch that
 * trailing_zeros64() uses. The portable fallback returns 0 for inputs <= 0.
 */
static inline int trailing_zeros(int input)
{
    JSON_ASSERT(input > 0); // the builtin is undefined for 0

#if HAVE_BUILTIN_CTZLL
    return __builtin_ctz(input);
#else
    // Portable path: shift right until the lowest bit is set.
    int count = 0;
    for (int bits = input; (bits & 1) == 0 && bits > 0; bits >>= 1) {
        count++;
    }
    return count;
#endif
}
58
59
#ifdef JSON_ENABLE_SIMD
60
61
#define SIMD_MINIMUM_THRESHOLD 4
62
63
ALWAYS_INLINE(static) void json_fast_memcpy16(char *dst, const char *src, size_t len)
64
{
65
    RBIMPL_ASSERT_OR_ASSUME(len < 16);
66
    RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD); // 4
67
#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
68
    // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD (4) and vec_len-1 (15) bytes.
69
    // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
70
    // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
71
    // position in both copies.
72
73
    // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
74
    // generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
75
    // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
76
    // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
77
    // plus two loads and stores generated when using __builtin_memcpy.
78
    if (len >= 8) {
79
        __builtin_memcpy(dst, src, 8);
80
        __builtin_memcpy(dst + len - 8, src + len - 8, 8);
81
    } else {
82
        __builtin_memcpy(dst, src, 4);
83
        __builtin_memcpy(dst + len - 4, src + len - 4, 4);
84
    }
85
#else
86
    MEMCPY(dst, src, char, len);
87
#endif
88
}
89
90
#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
91
#include <arm_neon.h>
92
93
#define FIND_SIMD_IMPLEMENTATION_DEFINED 1
94
static inline SIMD_Implementation find_simd_implementation(void)
95
{
96
    return SIMD_NEON;
97
}
98
99
#define HAVE_SIMD 1
100
#define HAVE_SIMD_NEON 1
101
102
// See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
103
ALWAYS_INLINE(static) uint64_t neon_match_mask(uint8x16_t matches)
104
{
105
    const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(matches), 4);
106
    const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0);
107
    return mask & 0x8888888888888888ull;
108
}
109
110
ALWAYS_INLINE(static) uint64_t compute_chunk_mask_neon(const char *ptr)
111
{
112
    uint8x16_t chunk = vld1q_u8((const unsigned char *)ptr);
113
114
    // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
115
    // https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/
116
    const uint8x16_t too_low_or_dbl_quote = vcltq_u8(veorq_u8(chunk, vdupq_n_u8(2)), vdupq_n_u8(33));
117
118
    uint8x16_t has_backslash = vceqq_u8(chunk, vdupq_n_u8('\\'));
119
    uint8x16_t needs_escape  = vorrq_u8(too_low_or_dbl_quote, has_backslash);
120
    return neon_match_mask(needs_escape);
121
}
122
123
ALWAYS_INLINE(static) int string_scan_simd_neon(const char **ptr, const char *end, uint64_t *mask)
124
{
125
    while (*ptr + sizeof(uint8x16_t) <= end) {
126
        uint64_t chunk_mask = compute_chunk_mask_neon(*ptr);
127
        if (chunk_mask) {
128
            *mask = chunk_mask;
129
            return 1;
130
        }
131
        *ptr += sizeof(uint8x16_t);
132
    }
133
    return 0;
134
}
135
136
#endif /* ARM Neon Support.*/
137
138
#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
139
140
#ifdef HAVE_X86INTRIN_H
141
#include <x86intrin.h>
142
143
#define HAVE_SIMD 1
144
#define HAVE_SIMD_SSE2 1
145
146
#ifdef HAVE_CPUID_H
147
#define FIND_SIMD_IMPLEMENTATION_DEFINED 1
148
149
/*
 * TARGET_SSE2 marks a function as compiled for the SSE2 target on clang and
 * GCC. It expands to nothing on other compilers.
 */
#if !defined(__clang__) && !defined(__GNUC__)
#define TARGET_SSE2
#else
#define TARGET_SSE2 __attribute__((target("sse2")))
#endif
154
155
/*
 * Unsigned per-byte comparisons built from the SSE2 unsigned max and the
 * equality compare. Arguments may be evaluated more than once, so pass
 * side-effect-free expressions.
 */
/* a >= b  <=>  max(a, b) == a */
#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8((a), (b)), (a))
/* a <= b  <=>  b >= a */
#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8((b), (a))
/* a > b   <=>  !(a <= b), by flipping every bit of the <= result */
#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8((a), (b)), _mm_set1_epi8(-1))
/* a < b   <=>  b > a */
#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8((b), (a))
159
160
// Load 16 (possibly unaligned) bytes from `ptr` and return the
// _mm_movemask_epi8() of the bytes that are below 32, equal to 34 ('"'), or
// equal to '\\'.
ALWAYS_INLINE(static) TARGET_SSE2 int compute_chunk_mask_sse2(const char *ptr)
{
    const __m128i bytes = _mm_loadu_si128((__m128i const*)ptr);

    const __m128i backslash_hits = _mm_cmpeq_epi8(bytes, _mm_set1_epi8('\\'));

    // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
    // https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/
    const __m128i flipped = _mm_xor_si128(bytes, _mm_set1_epi8(2));
    const __m128i ctrl_or_quote_hits = _mm_cmplt_epu8(flipped, _mm_set1_epi8(33));

    return _mm_movemask_epi8(_mm_or_si128(ctrl_or_quote_hits, backslash_hits));
}
170
171
// Scan [*ptr, end) in 16-byte chunks for a chunk with a non-zero
// compute_chunk_mask_sse2() result.
//
// ptr:  in/out cursor. Advanced by 16 for every chunk whose mask is zero.
//       When a match is found it is left at the start of the matching chunk.
// end:  one past the last readable byte.
// mask: out parameter. Written only when a match is found.
//
// Returns 1 if a chunk with a non-zero mask was found, 0 once fewer than
// 16 bytes remain.
ALWAYS_INLINE(static) TARGET_SSE2 int string_scan_simd_sse2(const char **ptr, const char *end, int *mask)
{
    const int chunk_len = (int)sizeof(__m128i);

    // Compare the remaining length rather than computing `*ptr + 16`.
    // With fewer than 16 bytes left, that sum would point beyond
    // one-past-the-end of the buffer, and merely forming such a pointer is
    // undefined behavior.
    while (end - *ptr >= chunk_len) {
        const int chunk_mask = compute_chunk_mask_sse2(*ptr);
        if (chunk_mask) {
            *mask = chunk_mask;
            return 1;
        }
        *ptr += chunk_len;
    }

    return 0;
}
184
185
#include <cpuid.h>
186
#endif /* HAVE_CPUID_H */
187
188
static inline SIMD_Implementation find_simd_implementation(void)
189
{
190
    // TODO Revisit. I think the SSE version now only uses SSE2 instructions.
191
    if (__builtin_cpu_supports("sse2")) {
192
        return SIMD_SSE2;
193
    }
194
195
    return SIMD_NONE;
196
}
197
198
#endif /* HAVE_X86INTRIN_H */
199
#endif /* X86_64 Support */
200
201
#endif /* JSON_ENABLE_SIMD */
202
203
#ifndef FIND_SIMD_IMPLEMENTATION_DEFINED
204
static inline SIMD_Implementation find_simd_implementation(void)
205
0
{
206
0
    return SIMD_NONE;
207
0
}
208
#endif