/src/postgres/src/port/pg_popcount_avx512.c
/*-------------------------------------------------------------------------
 *
 * pg_popcount_avx512.c
 *	  Holds the AVX-512 pg_popcount() implementation.
 *
 * Copyright (c) 2024-2025, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/port/pg_popcount_avx512.c
 *
 *-------------------------------------------------------------------------
 */
#include "c.h"

#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK

#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
#include <cpuid.h>
#endif

#include <immintrin.h>

#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
#include <intrin.h>
#endif

#include "port/pg_bitutils.h"

/*
 * It's probably unlikely that TRY_POPCNT_X86_64 won't be set if we are able to
 * use AVX-512 intrinsics, but we check it anyway to be sure.  We piggy-back on
 * the function pointers that are only used when TRY_POPCNT_X86_64 is set.
 */
#ifdef TRY_POPCNT_X86_64

/*
 * Does CPUID say there's support for XSAVE instructions?
 */
static inline bool
xsave_available(void)
{
	unsigned int exx[4] = {0, 0, 0, 0};

#if defined(HAVE__GET_CPUID)
	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
#elif defined(HAVE__CPUID)
	__cpuid(exx, 1);
#else
#error cpuid instruction not available
#endif
	return (exx[2] & (1 << 27)) != 0;	/* osxsave */
}

/*
 * Does XGETBV say the ZMM registers are enabled?
 *
 * NB: Caller is responsible for verifying that xsave_available() returns true
 * before calling this.
 */
#ifdef HAVE_XSAVE_INTRINSICS
pg_attribute_target("xsave")
#endif
static inline bool
zmm_regs_available(void)
{
#ifdef HAVE_XSAVE_INTRINSICS
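	/*
	 * 0xe6 covers the SSE (bit 1), AVX (bit 2), and AVX-512 opmask/ZMM state
	 * components (bits 5-7) of XCR0, i.e., the OS saves and restores the
	 * full ZMM register state on context switches.
	 */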
	return (_xgetbv(0) & 0xe6) == 0xe6;
#else
	return false;
#endif
}

/*
 * Does CPUID say there's support for AVX-512 popcount and byte-and-word
 * instructions?
 */
static inline bool
avx512_popcnt_available(void)
{
	unsigned int exx[4] = {0, 0, 0, 0};

#if defined(HAVE__GET_CPUID_COUNT)
	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
#elif defined(HAVE__CPUIDEX)
	__cpuidex(exx, 7, 0);
#else
#error cpuid instruction not available
#endif
	return (exx[2] & (1 << 14)) != 0 &&	/* avx512-vpopcntdq */
		(exx[1] & (1 << 30)) != 0;	/* avx512-bw */
}

/*
 * Returns true if the CPU supports the instructions required for the AVX-512
 * pg_popcount() implementation.
 */
bool
pg_popcount_avx512_available(void)
{
	return xsave_available() &&
		zmm_regs_available() &&
		avx512_popcnt_available();
}
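
/*
 * The caller (pg_bitutils.c) uses this check at run time to decide which
 * implementation to install behind the pg_popcount() function pointers
 * mentioned above.  A minimal sketch of that dispatch (identifier names here
 * are illustrative, not necessarily the actual pg_bitutils.c symbols):
 *
 *		if (pg_popcount_avx512_available())
 *			pg_popcount_optimized = pg_popcount_avx512;
 *		else
 *			pg_popcount_optimized = pg_popcount_fast;
 */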

/*
 * pg_popcount_avx512
 *		Returns the number of 1-bits in buf
 */
pg_attribute_target("avx512vpopcntdq,avx512bw")
uint64
pg_popcount_avx512(const char *buf, int bytes)
{
	__m512i		val,
				cnt;
	__m512i		accum = _mm512_setzero_si512();
	const char *final;
	int			tail_idx;
	__mmask64	mask = ~UINT64CONST(0);

	/*
	 * Align buffer down to avoid double load overhead from unaligned access.
	 * Calculate a mask to ignore preceding bytes.  Find start offset of
	 * final iteration and ensure it is not empty.
	 */
	mask <<= ((uintptr_t) buf) % sizeof(__m512i);
	tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
	final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
	buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
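
	/*
	 * For example, with 64-byte __m512i, buf = 0x1007, and bytes = 100: the
	 * low 7 bits of "mask" are cleared, so the masked load of the aligned
	 * block at 0x1000 treats the 7 bytes preceding buf as zero; tail_idx
	 * becomes ((0x106a % 64) + 1) = 43, i.e., the final block starting at
	 * "final" = 0x1040 contributes only its first 43 bytes (0x1040..0x106a).
	 */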

	/*
	 * Iterate through all but the final iteration.  Starting from the second
	 * iteration, the mask is ignored.
	 */
	if (buf < final)
	{
		val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
		cnt = _mm512_popcnt_epi64(val);
		accum = _mm512_add_epi64(accum, cnt);

		buf += sizeof(__m512i);
		mask = ~UINT64CONST(0);

		for (; buf < final; buf += sizeof(__m512i))
		{
			val = _mm512_load_si512((const __m512i *) buf);
			cnt = _mm512_popcnt_epi64(val);
			accum = _mm512_add_epi64(accum, cnt);
		}
	}

	/* Final iteration needs to ignore bytes that are not within the length */
	mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));

	val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
	cnt = _mm512_popcnt_epi64(val);
	accum = _mm512_add_epi64(accum, cnt);

	return _mm512_reduce_add_epi64(accum);
}

/*
 * pg_popcount_masked_avx512
 *		Returns the number of 1-bits in buf after applying the mask to each
 *		byte
 */
pg_attribute_target("avx512vpopcntdq,avx512bw")
uint64
pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
{
	__m512i		val,
				vmasked,
				cnt;
	__m512i		accum = _mm512_setzero_si512();
	const char *final;
	int			tail_idx;
	__mmask64	bmask = ~UINT64CONST(0);
	const __m512i maskv = _mm512_set1_epi8(mask);
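
	/*
	 * maskv holds "mask" replicated into all 64 byte lanes, so each loaded
	 * byte can be ANDed with it before its bits are counted.
	 */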

	/*
	 * Align buffer down to avoid double load overhead from unaligned access.
	 * Calculate a mask to ignore preceding bytes.  Find start offset of
	 * final iteration and ensure it is not empty.
	 */
	bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
	tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
	final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
	buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);

	/*
	 * Iterate through all but the final iteration.  Starting from the second
	 * iteration, the mask is ignored.
	 */
	if (buf < final)
	{
		val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
		vmasked = _mm512_and_si512(val, maskv);
		cnt = _mm512_popcnt_epi64(vmasked);
		accum = _mm512_add_epi64(accum, cnt);

		buf += sizeof(__m512i);
		bmask = ~UINT64CONST(0);

		for (; buf < final; buf += sizeof(__m512i))
		{
			val = _mm512_load_si512((const __m512i *) buf);
			vmasked = _mm512_and_si512(val, maskv);
			cnt = _mm512_popcnt_epi64(vmasked);
			accum = _mm512_add_epi64(accum, cnt);
		}
	}

	/* Final iteration needs to ignore bytes that are not within the length */
	bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));

	val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
	vmasked = _mm512_and_si512(val, maskv);
	cnt = _mm512_popcnt_epi64(vmasked);
	accum = _mm512_add_epi64(accum, cnt);

	return _mm512_reduce_add_epi64(accum);
}

#endif							/* TRY_POPCNT_X86_64 */
#endif							/* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */