/src/ruby/ext/json/simd/simd.h

Source
#include "../json.h"

// Result type of find_simd_implementation(): names the SIMD code path to use.
typedef enum {
    SIMD_NONE, // no SIMD implementation selected
    SIMD_NEON, // ARM NEON implementation
    SIMD_SSE2  // x86-64 SSE2 implementation
} SIMD_Implementation;

#ifndef __has_builtin         // Optional of course.
  #define __has_builtin(x) 0  // Compatibility with non-clang compilers.
#endif

// HAVE_BUILTIN_CTZLL is always defined, to 1 or 0. It selects between the compiler's
// count-trailing-zeros builtins and the portable loop in trailing_zeros64() and
// trailing_zeros(). clang is probed with __has_builtin; GCC is assumed to provide the
// builtins from version 4.3 onwards; every other compiler gets 0.
#ifdef __clang__
# if __has_builtin(__builtin_ctzll)
#   define HAVE_BUILTIN_CTZLL 1
# else
#   define HAVE_BUILTIN_CTZLL 0
# endif
#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
# define HAVE_BUILTIN_CTZLL 1
#else
# define HAVE_BUILTIN_CTZLL 0
#endif

// Returns the number of consecutive zero bits in `input`, counted from bit 0,
// i.e. the index of its lowest set bit.
//
// `input` must be non-zero (asserted). When HAVE_BUILTIN_CTZLL is 0 the count
// is computed with a portable bit-by-bit loop instead of the compiler builtin.
static inline uint32_t trailing_zeros64(uint64_t input)
{
    JSON_ASSERT(input > 0); // __builtin_ctzll(0) is undefined behavior

#if HAVE_BUILTIN_CTZLL
    return (uint32_t)__builtin_ctzll(input);
#else
    uint32_t count = 0;
    // Drop low zero bits one at a time; the `bits > 0` guard stops a zero input from looping forever.
    for (uint64_t bits = input; bits > 0 && (bits & 1) == 0; bits >>= 1) {
        count++;
    }
    return count;
#endif
}

// `int` counterpart of trailing_zeros64(): returns the index of the lowest set
// bit of `input`.
//
// `input` must be strictly positive (asserted). The builtin path is gated by
// HAVE_BUILTIN_CTZLL, the same switch used for the 64-bit variant.
static inline int trailing_zeros(int input)
{
    JSON_ASSERT(input > 0); // __builtin_ctz(0) is undefined behavior

#if HAVE_BUILTIN_CTZLL
    return __builtin_ctz(input);
#else
    int count = 0;
    // Drop low zero bits one at a time; the `bits > 0` guard ends the loop for zero or negative values.
    for (int bits = input; bits > 0 && (bits & 1) == 0; bits >>= 1) {
        count++;
    }
    return count;
#endif
}

#ifdef JSON_ENABLE_SIMD

// Smallest length accepted by json_fast_memcpy16(), which asserts len >= this value.
#define SIMD_MINIMUM_THRESHOLD 4

// Copies `len` bytes from `src` to `dst` for short lengths only:
// SIMD_MINIMUM_THRESHOLD <= len < 16 (both bounds are asserted/assumed below).
//
// With __builtin_memcpy available the copy is done as two fixed-size, possibly
// overlapping copies (head and tail); otherwise it falls back to MEMCPY.
ALWAYS_INLINE(static) void json_fast_memcpy16(char *dst, const char *src, size_t len)
{
    RBIMPL_ASSERT_OR_ASSUME(len < 16);
    RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD); // 4
#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
    // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD (4) and vec_len-1 (15) bytes.
    // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
    // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
    // position in both copies.

    // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
    // generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
    // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
    // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
    // plus two loads and stores generated when using __builtin_memcpy.
    if (len >= 8) {
        // 8 <= len <= 15: head and tail 8-byte copies together cover every byte.
        __builtin_memcpy(dst, src, 8);
        __builtin_memcpy(dst + len - 8, src + len - 8, 8);
    } else {
        // 4 <= len <= 7: same scheme with 4-byte copies.
        __builtin_memcpy(dst, src, 4);
        __builtin_memcpy(dst + len - 4, src + len - 4, 4);
    }
#else
    MEMCPY(dst, src, char, len);
#endif
}

#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>

// Marks find_simd_implementation() as provided, which suppresses the SIMD_NONE
// fallback definition at the end of this header.
#define FIND_SIMD_IMPLEMENTATION_DEFINED 1
// ARM variant: always reports SIMD_NEON, with no runtime CPU detection.
static inline SIMD_Implementation find_simd_implementation(void)
{
    return SIMD_NEON;
}

// Feature macros for the NEON code below. They are not tested within this header;
// presumably the files that include it check them (verify against callers).
#define HAVE_SIMD 1
#define HAVE_SIMD_NEON 1

// Collapses a 16-byte comparison result into a 64-bit mask with one nibble per input byte.
//
// Each 16-bit lane is shifted right by 4 and narrowed to 8 bits, so every input byte
// contributes 4 bits to the 64-bit result. The final AND keeps only the top bit of each
// nibble, leaving at most one set bit per input byte.
// NOTE(review): assumes every byte of `matches` is 0x00 or 0xFF, as NEON compare intrinsics produce.
// See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
ALWAYS_INLINE(static) uint64_t neon_match_mask(uint8x16_t matches)
{
    const uint16x8_t lanes16 = vreinterpretq_u16_u8(matches);
    const uint8x8_t nibbles  = vshrn_n_u16(lanes16, 4);
    const uint64x1_t packed  = vreinterpret_u64_u8(nibbles);

    return vget_lane_u64(packed, 0) & 0x8888888888888888ull;
}

// Loads 16 bytes from `ptr` and returns the neon_match_mask() of the bytes that are
// below 32, equal to 34 ('"'), or equal to '\\'. The result is 0 when no byte matches.
// The caller must guarantee that 16 bytes are readable at `ptr`.
ALWAYS_INLINE(static) uint64_t compute_chunk_mask_neon(const char *ptr)
{
    const uint8x16_t bytes = vld1q_u8((const unsigned char *)ptr);

    // Backslash is detected with a plain equality compare.
    const uint8x16_t is_backslash = vceqq_u8(bytes, vdupq_n_u8('\\'));

    // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
    // https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/
    const uint8x16_t flipped          = veorq_u8(bytes, vdupq_n_u8(2));
    const uint8x16_t is_ctrl_or_quote = vcltq_u8(flipped, vdupq_n_u8(33));

    return neon_match_mask(vorrq_u8(is_ctrl_or_quote, is_backslash));
}

// Scans [*ptr, end) in 16-byte chunks for a byte flagged by compute_chunk_mask_neon().
//
// Returns 1 when a chunk contains a match: *mask receives that chunk's mask and
// *ptr is left pointing at the start of the chunk.
// Returns 0 once fewer than 16 bytes remain: *ptr has been advanced past every
// clean chunk and *mask is not written. The tail is left to the caller.
ALWAYS_INLINE(static) int string_scan_simd_neon(const char **ptr, const char *end, uint64_t *mask)
{
    const int chunk_len = (int)sizeof(uint8x16_t);

    // Compare the remaining length instead of computing `*ptr + 16`: with fewer than
    // 16 bytes left that sum would point more than one past the end of the buffer,
    // which is undefined behavior even if it is never dereferenced.
    while (end - *ptr >= chunk_len) {
        uint64_t chunk_mask = compute_chunk_mask_neon(*ptr);
        if (chunk_mask) {
            *mask = chunk_mask;
            return 1;
        }
        *ptr += chunk_len;
    }
    return 0;
}

#endif /* ARM Neon Support.*/

#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)

#ifdef HAVE_X86INTRIN_H
#include <x86intrin.h>

#define HAVE_SIMD 1
#define HAVE_SIMD_SSE2 1

#ifdef HAVE_CPUID_H
#define FIND_SIMD_IMPLEMENTATION_DEFINED 1

// TARGET_SSE2 decorates the SSE2 functions below. On clang and GCC it expands to
// the `target("sse2")` function attribute; on other compilers it expands to nothing.
#if defined(__clang__) || defined(__GNUC__)
#define TARGET_SSE2 __attribute__((target("sse2")))
#else
#define TARGET_SSE2
#endif

// Unsigned 8-bit lane comparisons built from _mm_max_epu8 and _mm_cmpeq_epi8:
//   ge(a, b): max(a, b) == a
//   le(a, b): ge(b, a)
//   gt(a, b): NOT le(a, b)   (XOR with all-ones)
//   lt(a, b): gt(b, a)
// These are macros, so an argument may be expanded (and evaluated) more than once.
#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a)
#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a)
#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1))
#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a)

// Loads 16 bytes from `ptr` (unaligned load) and returns the _mm_movemask_epi8() of
// the bytes that are below 32, equal to 34 ('"'), or equal to '\\'. The result is 0
// when no byte matches. The caller must guarantee that 16 bytes are readable at `ptr`.
ALWAYS_INLINE(static) TARGET_SSE2 int compute_chunk_mask_sse2(const char *ptr)
{
    const __m128i bytes = _mm_loadu_si128((__m128i const*)ptr);

    // Backslash is detected with a plain equality compare.
    const __m128i is_backslash = _mm_cmpeq_epi8(bytes, _mm_set1_epi8('\\'));

    // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
    // https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/
    const __m128i flipped          = _mm_xor_si128(bytes, _mm_set1_epi8(2));
    const __m128i is_ctrl_or_quote = _mm_cmplt_epu8(flipped, _mm_set1_epi8(33));

    return _mm_movemask_epi8(_mm_or_si128(is_ctrl_or_quote, is_backslash));
}

// Scans [*ptr, end) in 16-byte chunks for a byte flagged by compute_chunk_mask_sse2().
//
// Returns 1 when a chunk contains a match: *mask receives that chunk's mask and
// *ptr is left pointing at the start of the chunk.
// Returns 0 once fewer than 16 bytes remain: *ptr has been advanced past every
// clean chunk and *mask is not written. The tail is left to the caller.
ALWAYS_INLINE(static) TARGET_SSE2 int string_scan_simd_sse2(const char **ptr, const char *end, int *mask)
{
    const int chunk_len = (int)sizeof(__m128i);

    // Compare the remaining length instead of computing `*ptr + 16`: with fewer than
    // 16 bytes left that sum would point more than one past the end of the buffer,
    // which is undefined behavior even if it is never dereferenced.
    while (end - *ptr >= chunk_len) {
        int chunk_mask = compute_chunk_mask_sse2(*ptr);
        if (chunk_mask) {
            *mask = chunk_mask;
            return 1;
        }
        *ptr += chunk_len;
    }

    return 0;
}

#include <cpuid.h>

// x86-64 variant: asks the compiler runtime whether the CPU supports SSE2 and
// returns SIMD_SSE2 if so, SIMD_NONE otherwise.
//
// This definition must stay inside the HAVE_CPUID_H guard. That guard is where
// FIND_SIMD_IMPLEMENTATION_DEFINED is set and where the SSE2 kernels are defined.
// Outside it, a build with x86intrin.h but without cpuid.h would see both this
// definition and the SIMD_NONE fallback at the end of the header (a redefinition
// error), and would advertise SIMD_SSE2 without any SSE2 kernels being compiled.
static inline SIMD_Implementation find_simd_implementation(void)
{
    // TODO Revisit. I think the SSE version now only uses SSE2 instructions.
    if (__builtin_cpu_supports("sse2")) {
        return SIMD_SSE2;
    }

    return SIMD_NONE;
}

#endif /* HAVE_CPUID_H */

#endif /* HAVE_X86INTRIN_H */
#endif /* X86_64 Support */

#endif /* JSON_ENABLE_SIMD */

// Fallback: when no architecture-specific section above has set
// FIND_SIMD_IMPLEMENTATION_DEFINED, report that no SIMD implementation is available.
#ifndef FIND_SIMD_IMPLEMENTATION_DEFINED
static inline SIMD_Implementation find_simd_implementation(void)
{
    return SIMD_NONE;
}
#endif

Coverage Report

Created: 2026-06-03 06:22

Line	Count	Source
1		#include "../json.h"
2
3		typedef enum {
4		SIMD_NONE,
5		SIMD_NEON,
6		SIMD_SSE2
7		} SIMD_Implementation;
8
9		#ifndef __has_builtin // Optional of course.
10		#define __has_builtin(x) 0 // Compatibility with non-clang compilers.
11		#endif
12
13		#ifdef __clang__
14		# if __has_builtin(__builtin_ctzll)
15		# define HAVE_BUILTIN_CTZLL 1
16		# else
17		# define HAVE_BUILTIN_CTZLL 0
18		# endif
19		#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
20		# define HAVE_BUILTIN_CTZLL 1
21		#else
22		# define HAVE_BUILTIN_CTZLL 0
23		#endif
24
25		static inline uint32_t trailing_zeros64(uint64_t input)
26	2.81M	{
27	2.81M	JSON_ASSERT(input > 0); // __builtin_ctz(0) is undefined behavior
28
29	2.81M	#if HAVE_BUILTIN_CTZLL
30	2.81M	return __builtin_ctzll(input);
31		#else
32		uint32_t trailing_zeros = 0;
33		uint64_t temp = input;
34		while ((temp & 1) == 0 && temp > 0) {
35		trailing_zeros++;
36		temp >>= 1;
37		}
38		return trailing_zeros;
39		#endif
40	2.81M	}
41
42		static inline int trailing_zeros(int input)
43	0	{
44	0	JSON_ASSERT(input > 0); // __builtin_ctz(0) is undefined behavior
45	0
46	0	#if HAVE_BUILTIN_CTZLL
47	0	return __builtin_ctz(input);
48	0	#else
49	0	int trailing_zeros = 0;
50	0	int temp = input;
51	0	while ((temp & 1) == 0 && temp > 0) {
52	0	trailing_zeros++;
53	0	temp >>= 1;
54	0	}
55	0	return trailing_zeros;
56	0	#endif
57	0	}
58
59		#ifdef JSON_ENABLE_SIMD
60
61		#define SIMD_MINIMUM_THRESHOLD 4
62
63		ALWAYS_INLINE(static) void json_fast_memcpy16(char *dst, const char *src, size_t len)
64		{
65		RBIMPL_ASSERT_OR_ASSUME(len < 16);
66		RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD); // 4
67		#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
68		// If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD (4) and vec_len-1 (15) bytes.
69		// These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
70		// the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
71		// position in both copies.
72
73		// Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
74		// generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
75		// when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
76		// select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
77		// plus two loads and stores generated when using __builtin_memcpy.
78		if (len >= 8) {
79		__builtin_memcpy(dst, src, 8);
80		__builtin_memcpy(dst + len - 8, src + len - 8, 8);
81		} else {
82		__builtin_memcpy(dst, src, 4);
83		__builtin_memcpy(dst + len - 4, src + len - 4, 4);
84		}
85		#else
86		MEMCPY(dst, src, char, len);
87		#endif
88		}
89
90		#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
91		#include <arm_neon.h>
92
93		#define FIND_SIMD_IMPLEMENTATION_DEFINED 1
94		static inline SIMD_Implementation find_simd_implementation(void)
95		{
96		return SIMD_NEON;
97		}
98
99		#define HAVE_SIMD 1
100		#define HAVE_SIMD_NEON 1
101
102		// See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
103		ALWAYS_INLINE(static) uint64_t neon_match_mask(uint8x16_t matches)
104		{
105		const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(matches), 4);
106		const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0);
107		return mask & 0x8888888888888888ull;
108		}
109
110		ALWAYS_INLINE(static) uint64_t compute_chunk_mask_neon(const char *ptr)
111		{
112		uint8x16_t chunk = vld1q_u8((const unsigned char *)ptr);
113
114		// Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
115		// https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/
116		const uint8x16_t too_low_or_dbl_quote = vcltq_u8(veorq_u8(chunk, vdupq_n_u8(2)), vdupq_n_u8(33));
117
118		uint8x16_t has_backslash = vceqq_u8(chunk, vdupq_n_u8('\\'));
119		uint8x16_t needs_escape = vorrq_u8(too_low_or_dbl_quote, has_backslash);
120		return neon_match_mask(needs_escape);
121		}
122
123		ALWAYS_INLINE(static) int string_scan_simd_neon(const char **ptr, const char *end, uint64_t *mask)
124		{
125		while (*ptr + sizeof(uint8x16_t) <= end) {
126		uint64_t chunk_mask = compute_chunk_mask_neon(*ptr);
127		if (chunk_mask) {
128		*mask = chunk_mask;
129		return 1;
130		}
131		*ptr += sizeof(uint8x16_t);
132		}
133		return 0;
134		}
135
136		#endif /* ARM Neon Support.*/
137
138		#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
139
140		#ifdef HAVE_X86INTRIN_H
141		#include <x86intrin.h>
142
143		#define HAVE_SIMD 1
144		#define HAVE_SIMD_SSE2 1
145
146		#ifdef HAVE_CPUID_H
147		#define FIND_SIMD_IMPLEMENTATION_DEFINED 1
148
149		#if defined(__clang__) || defined(__GNUC__)
150		#define TARGET_SSE2 __attribute__((target("sse2")))
151		#else
152		#define TARGET_SSE2
153		#endif
154
155		#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a)
156		#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a)
157		#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1))
158		#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a)
159
160		ALWAYS_INLINE(static) TARGET_SSE2 int compute_chunk_mask_sse2(const char *ptr)
161		{
162		__m128i chunk = _mm_loadu_si128((__m128i const*)ptr);
163		// Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
164		// https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/
165		__m128i too_low_or_dbl_quote = _mm_cmplt_epu8(_mm_xor_si128(chunk, _mm_set1_epi8(2)), _mm_set1_epi8(33));
166		__m128i has_backslash = _mm_cmpeq_epi8(chunk, _mm_set1_epi8('\\'));
167		__m128i needs_escape = _mm_or_si128(too_low_or_dbl_quote, has_backslash);
168		return _mm_movemask_epi8(needs_escape);
169		}
170
171		ALWAYS_INLINE(static) TARGET_SSE2 int string_scan_simd_sse2(const char **ptr, const char *end, int *mask)
172		{
173		while (*ptr + sizeof(__m128i) <= end) {
174		int chunk_mask = compute_chunk_mask_sse2(*ptr);
175		if (chunk_mask) {
176		*mask = chunk_mask;
177		return 1;
178		}
179		*ptr += sizeof(__m128i);
180		}
181
182		return 0;
183		}
184
185		#include <cpuid.h>
186		#endif /* HAVE_CPUID_H */
187
188		static inline SIMD_Implementation find_simd_implementation(void)
189		{
190		// TODO Revisit. I think the SSE version now only uses SSE2 instructions.
191		if (__builtin_cpu_supports("sse2")) {
192		return SIMD_SSE2;
193		}
194
195		return SIMD_NONE;
196		}
197
198		#endif /* HAVE_X86INTRIN_H */
199		#endif /* X86_64 Support */
200
201		#endif /* JSON_ENABLE_SIMD */
202
203		#ifndef FIND_SIMD_IMPLEMENTATION_DEFINED
204		static inline SIMD_Implementation find_simd_implementation(void)
205	0	{
206	0	return SIMD_NONE;
207	0	}
208		#endif