Coverage Report

Created: 2025-07-03 06:49

/src/postgres/src/port/pg_popcount_avx512.c
 Line  Count  Source
    1         /*-------------------------------------------------------------------------
    2          *
    3          * pg_popcount_avx512.c
    4          *    Holds the AVX-512 pg_popcount() implementation.
    5          *
    6          * Copyright (c) 2024-2025, PostgreSQL Global Development Group
    7          *
    8          * IDENTIFICATION
    9          *    src/port/pg_popcount_avx512.c
   10          *
   11          *-------------------------------------------------------------------------
   12          */
   13         #include "c.h"
   14
   15         #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
   16
   17         #if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
   18         #include <cpuid.h>
   19         #endif
   20
   21         #include <immintrin.h>
   22
   23         #if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
   24         #include <intrin.h>
   25         #endif
   26
   27         #include "port/pg_bitutils.h"
   28
   29         /*
   30          * It's probably unlikely that TRY_POPCNT_X86_64 won't be set if we are able to
   31          * use AVX-512 intrinsics, but we check it anyway to be sure.  We piggy-back on
   32          * the function pointers that are only used when TRY_POPCNT_X86_64 is set.
   33          */
   34         #ifdef TRY_POPCNT_X86_64
   35
   36         /*
   37          * Does CPUID say there's support for XSAVE instructions?
   38          */
   39         static inline bool
   40         xsave_available(void)
   41      0  {
   42      0    unsigned int exx[4] = {0, 0, 0, 0};
   43
   44      0  #if defined(HAVE__GET_CPUID)
   45      0    __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
   46         #elif defined(HAVE__CPUID)
   47           __cpuid(exx, 1);
   48         #else
   49         #error cpuid instruction not available
   50         #endif
   51      0    return (exx[2] & (1 << 27)) != 0; /* osxsave */
   52      0  }
   53
   54         /*
   55          * Does XGETBV say the ZMM registers are enabled?
   56          *
   57          * NB: Caller is responsible for verifying that xsave_available() returns true
   58          * before calling this.
   59          */
   60         #ifdef HAVE_XSAVE_INTRINSICS
   61         pg_attribute_target("xsave")
   62         #endif
   63         static inline bool
   64         zmm_regs_available(void)
   65      0  {
   66      0  #ifdef HAVE_XSAVE_INTRINSICS
   67      0    return (_xgetbv(0) & 0xe6) == 0xe6;
   68         #else
   69           return false;
   70         #endif
   71      0  }
   72
   73         /*
   74          * Does CPUID say there's support for AVX-512 popcount and byte-and-word
   75          * instructions?
   76          */
   77         static inline bool
   78         avx512_popcnt_available(void)
   79      0  {
   80      0    unsigned int exx[4] = {0, 0, 0, 0};
   81
   82      0  #if defined(HAVE__GET_CPUID_COUNT)
   83      0    __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
   84         #elif defined(HAVE__CPUIDEX)
   85           __cpuidex(exx, 7, 0);
   86         #else
   87         #error cpuid instruction not available
   88         #endif
   89      0    return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
   90      0      (exx[1] & (1 << 30)) != 0; /* avx512-bw */
   91      0  }
   92
   93         /*
   94          * Returns true if the CPU supports the instructions required for the AVX-512
   95          * pg_popcount() implementation.
   96          */
   97         bool
   98         pg_popcount_avx512_available(void)
   99      0  {
  100      0    return xsave_available() &&
  101      0      zmm_regs_available() &&
  102      0      avx512_popcnt_available();
  103      0  }
  104
  105         /*
  106          * pg_popcount_avx512
  107          *    Returns the number of 1-bits in buf
  108          */
  109         pg_attribute_target("avx512vpopcntdq,avx512bw")
  110         uint64
  111         pg_popcount_avx512(const char *buf, int bytes)
  112      0  {
  113      0    __m512i   val,
  114      0          cnt;
  115      0    __m512i   accum = _mm512_setzero_si512();
  116      0    const char *final;
  117      0    int     tail_idx;
  118      0    __mmask64 mask = ~UINT64CONST(0);
  119
  120           /*
  121            * Align buffer down to avoid double load overhead from unaligned access.
  122            * Calculate a mask to ignore preceding bytes.  Find start offset of final
  123            * iteration and ensure it is not empty.
  124            */
  125      0    mask <<= ((uintptr_t) buf) % sizeof(__m512i);
  126      0    tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
  127      0    final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
  128      0    buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
  129
  130           /*
  131            * Iterate through all but the final iteration.  Starting from the second
  132            * iteration, the mask is ignored.
  133            */
  134      0    if (buf < final)
  135      0    {
  136      0      val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
  137      0      cnt = _mm512_popcnt_epi64(val);
  138      0      accum = _mm512_add_epi64(accum, cnt);
  139
  140      0      buf += sizeof(__m512i);
  141      0      mask = ~UINT64CONST(0);
  142
  143      0      for (; buf < final; buf += sizeof(__m512i))
  144      0      {
  145      0        val = _mm512_load_si512((const __m512i *) buf);
  146      0        cnt = _mm512_popcnt_epi64(val);
  147      0        accum = _mm512_add_epi64(accum, cnt);
  148      0      }
  149      0    }
  150
  151           /* Final iteration needs to ignore bytes that are not within the length */
  152      0    mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
  153
  154      0    val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
  155      0    cnt = _mm512_popcnt_epi64(val);
  156      0    accum = _mm512_add_epi64(accum, cnt);
  157
  158      0    return _mm512_reduce_add_epi64(accum);
  159      0  }
  160
  161         /*
  162          * pg_popcount_masked_avx512
  163          *    Returns the number of 1-bits in buf after applying the mask to each byte
  164          */
  165         pg_attribute_target("avx512vpopcntdq,avx512bw")
  166         uint64
  167         pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
  168      0  {
  169      0    __m512i   val,
  170      0          vmasked,
  171      0          cnt;
  172      0    __m512i   accum = _mm512_setzero_si512();
  173      0    const char *final;
  174      0    int     tail_idx;
  175      0    __mmask64 bmask = ~UINT64CONST(0);
  176      0    const __m512i maskv = _mm512_set1_epi8(mask);
  177
  178           /*
  179            * Align buffer down to avoid double load overhead from unaligned access.
  180            * Calculate a mask to ignore preceding bytes.  Find start offset of final
  181            * iteration and ensure it is not empty.
  182            */
  183      0    bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
  184      0    tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
  185      0    final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
  186      0    buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
  187
  188           /*
  189            * Iterate through all but the final iteration.  Starting from the second
  190            * iteration, the mask is ignored.
  191            */
  192      0    if (buf < final)
  193      0    {
  194      0      val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
  195      0      vmasked = _mm512_and_si512(val, maskv);
  196      0      cnt = _mm512_popcnt_epi64(vmasked);
  197      0      accum = _mm512_add_epi64(accum, cnt);
  198
  199      0      buf += sizeof(__m512i);
  200      0      bmask = ~UINT64CONST(0);
  201
  202      0      for (; buf < final; buf += sizeof(__m512i))
  203      0      {
  204      0        val = _mm512_load_si512((const __m512i *) buf);
  205      0        vmasked = _mm512_and_si512(val, maskv);
  206      0        cnt = _mm512_popcnt_epi64(vmasked);
  207      0        accum = _mm512_add_epi64(accum, cnt);
  208      0      }
  209      0    }
  210
  211           /* Final iteration needs to ignore bytes that are not within the length */
  212      0    bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
  213
  214      0    val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
  215      0    vmasked = _mm512_and_si512(val, maskv);
  216      0    cnt = _mm512_popcnt_epi64(vmasked);
  217      0    accum = _mm512_add_epi64(accum, cnt);
  218
  219      0    return _mm512_reduce_add_epi64(accum);
  220      0  }
  221
  222         #endif              /* TRY_POPCNT_X86_64 */
  223         #endif              /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
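
Note: the following is an illustrative sketch, not part of the coverage listing above. It demonstrates the head/tail mask arithmetic that pg_popcount_avx512() and pg_popcount_masked_avx512() use to handle unaligned buffers, using plain 64-bit integers instead of AVX-512 registers. The starting address (0x1005) and length (70) are hypothetical example values, not anything from the report.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t vecsize = 64;		/* stands in for sizeof(__m512i) */
	uintptr_t	addr = 0x1005;			/* hypothetical unaligned buffer start */
	uint64_t	bytes = 70;				/* hypothetical buffer length */
	uint64_t	mask = ~UINT64_C(0);
	uint64_t	tail_idx;

	/* Ignore the bytes that precede addr within its aligned 64-byte block. */
	mask <<= addr % vecsize;			/* clears the low 5 bits in this example */

	/* One-past offset of the last valid byte within its 64-byte block. */
	tail_idx = ((addr + bytes - 1) % vecsize) + 1;	/* 11 in this example */

	printf("head mask = 0x%016" PRIx64 "\n", mask);
	printf("tail mask = 0x%016" PRIx64 "\n",
		   ~UINT64_C(0) >> (vecsize - tail_idx));
	return 0;
}

With a 64-byte vector width, a buffer starting 5 bytes into an aligned block has the low 5 bits of its load mask cleared, and a buffer whose last byte falls at offset 10 of its block keeps only the low 11 bits in the final iteration, matching the tail_idx computation in the source above.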