Coverage Report

Created: 2025-06-16 07:00

/src/libdeflate/lib/x86/adler32_template.h
Line
Count
Source
1
/*
2
 * x86/adler32_template.h - template for vectorized Adler-32 implementations
3
 *
4
 * Copyright 2016 Eric Biggers
5
 *
6
 * Permission is hereby granted, free of charge, to any person
7
 * obtaining a copy of this software and associated documentation
8
 * files (the "Software"), to deal in the Software without
9
 * restriction, including without limitation the rights to use,
10
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
11
 * copies of the Software, and to permit persons to whom the
12
 * Software is furnished to do so, subject to the following
13
 * conditions:
14
 *
15
 * The above copyright notice and this permission notice shall be
16
 * included in all copies or substantial portions of the Software.
17
 *
18
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
 * OTHER DEALINGS IN THE SOFTWARE.
26
 */
27
28
/*
29
 * This file is a "template" for instantiating Adler-32 functions for x86.
30
 * The "parameters" are:
31
 *
32
 * SUFFIX:
33
 *  Name suffix to append to all instantiated functions.
34
 * ATTRIBUTES:
35
 *  Target function attributes to use.  Must satisfy the dependencies of the
36
 *  other parameters as follows:
37
 *     VL=16 && USE_VNNI=0 && USE_AVX512=0: at least sse2
38
 *     VL=32 && USE_VNNI=0 && USE_AVX512=0: at least avx2
39
 *     VL=32 && USE_VNNI=1 && USE_AVX512=0: at least avx2,avxvnni
40
 *     VL=32 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vl,avx512vnni
41
 *     VL=64 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vnni
42
 *     (Other combinations are not useful and have not been tested.)
43
 * VL:
44
 *  Vector length in bytes.  Must be 16, 32, or 64.
45
 * USE_VNNI:
46
 *  If 1, use the VNNI dot product based algorithm.
47
 *  If 0, use the legacy SSE2 and AVX2 compatible algorithm.
48
 * USE_AVX512:
49
 *  If 1, take advantage of AVX-512 features such as masking.  This doesn't
50
 *  enable the use of 512-bit vectors; the vector length is controlled by
51
 *  VL.  If 0, assume that the CPU might not support AVX-512.
52
 */
53
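Note: adler32.c (not shown in this report) instantiates this template once per target by defining the parameters above and then including the header; ADD_SUFFIX() pastes SUFFIX onto each function name, which is where instantiation names such as adler32_x86_avx2 below come from. The following is only an illustrative sketch of that pattern; the ATTRIBUTES value is a placeholder, not the macro the real build uses.

/* Hypothetical instantiation sketch, matching the VL=32/no-VNNI row above. */
#define SUFFIX      _avx2
#define ATTRIBUTES  __attribute__((target("avx2")))  /* placeholder attribute */
#define VL          32
#define USE_VNNI    0
#define USE_AVX512  0
#include "adler32_template.h"   /* generates adler32_x86_avx2(); the template
                                   #undefs its parameters at the end */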
54
#if VL == 16
55
0
#  define vec_t     __m128i
56
#  define mask_t    u16
57
#  define LOG2_VL   4
58
#  define VADD8(a, b)   _mm_add_epi8((a), (b))
59
0
#  define VADD16(a, b)    _mm_add_epi16((a), (b))
60
0
#  define VADD32(a, b)    _mm_add_epi32((a), (b))
61
#  if USE_AVX512
62
#    define VDPBUSD(a, b, c)  _mm_dpbusd_epi32((a), (b), (c))
63
#  else
64
#    define VDPBUSD(a, b, c)  _mm_dpbusd_avx_epi32((a), (b), (c))
65
#  endif
66
0
#  define VLOAD(p)    _mm_load_si128((const void *)(p))
67
0
#  define VLOADU(p)   _mm_loadu_si128((const void *)(p))
68
#  define VMADD16(a, b)   _mm_madd_epi16((a), (b))
69
#  define VMASKZ_LOADU(mask, p) _mm_maskz_loadu_epi8((mask), (p))
70
#  define VMULLO32(a, b)  _mm_mullo_epi32((a), (b))
71
#  define VSAD8(a, b)   _mm_sad_epu8((a), (b))
72
#  define VSET1_8(a)    _mm_set1_epi8(a)
73
#  define VSET1_32(a)   _mm_set1_epi32(a)
74
0
#  define VSETZERO()    _mm_setzero_si128()
75
#  define VSLL32(a, b)    _mm_slli_epi32((a), (b))
76
#  define VUNPACKLO8(a, b)  _mm_unpacklo_epi8((a), (b))
77
#  define VUNPACKHI8(a, b)  _mm_unpackhi_epi8((a), (b))
78
#elif VL == 32
79
602k
#  define vec_t     __m256i
80
#  define mask_t    u32
81
#  define LOG2_VL   5
82
0
#  define VADD8(a, b)   _mm256_add_epi8((a), (b))
83
1.14M
#  define VADD16(a, b)    _mm256_add_epi16((a), (b))
84
578k
#  define VADD32(a, b)    _mm256_add_epi32((a), (b))
85
#  if USE_AVX512
86
0
#    define VDPBUSD(a, b, c)  _mm256_dpbusd_epi32((a), (b), (c))
87
#  else
88
0
#    define VDPBUSD(a, b, c)  _mm256_dpbusd_avx_epi32((a), (b), (c))
89
#  endif
90
4.16k
#  define VLOAD(p)    _mm256_load_si256((const void *)(p))
91
574k
#  define VLOADU(p)   _mm256_loadu_si256((const void *)(p))
92
#  define VMADD16(a, b)   _mm256_madd_epi16((a), (b))
93
0
#  define VMASKZ_LOADU(mask, p) _mm256_maskz_loadu_epi8((mask), (p))
94
#  define VMULLO32(a, b)  _mm256_mullo_epi32((a), (b))
95
#  define VSAD8(a, b)   _mm256_sad_epu8((a), (b))
96
0
#  define VSET1_8(a)    _mm256_set1_epi8(a)
97
#  define VSET1_32(a)   _mm256_set1_epi32(a)
98
1.04k
#  define VSETZERO()    _mm256_setzero_si256()
99
#  define VSLL32(a, b)    _mm256_slli_epi32((a), (b))
100
#  define VUNPACKLO8(a, b)  _mm256_unpacklo_epi8((a), (b))
101
#  define VUNPACKHI8(a, b)  _mm256_unpackhi_epi8((a), (b))
102
#elif VL == 64
103
0
#  define vec_t     __m512i
104
#  define mask_t    u64
105
#  define LOG2_VL   6
106
0
#  define VADD8(a, b)   _mm512_add_epi8((a), (b))
107
#  define VADD16(a, b)    _mm512_add_epi16((a), (b))
108
0
#  define VADD32(a, b)    _mm512_add_epi32((a), (b))
109
0
#  define VDPBUSD(a, b, c)  _mm512_dpbusd_epi32((a), (b), (c))
110
0
#  define VLOAD(p)    _mm512_load_si512((const void *)(p))
111
0
#  define VLOADU(p)   _mm512_loadu_si512((const void *)(p))
112
#  define VMADD16(a, b)   _mm512_madd_epi16((a), (b))
113
0
#  define VMASKZ_LOADU(mask, p) _mm512_maskz_loadu_epi8((mask), (p))
114
#  define VMULLO32(a, b)  _mm512_mullo_epi32((a), (b))
115
#  define VSAD8(a, b)   _mm512_sad_epu8((a), (b))
116
0
#  define VSET1_8(a)    _mm512_set1_epi8(a)
117
#  define VSET1_32(a)   _mm512_set1_epi32(a)
118
0
#  define VSETZERO()    _mm512_setzero_si512()
119
#  define VSLL32(a, b)    _mm512_slli_epi32((a), (b))
120
#  define VUNPACKLO8(a, b)  _mm512_unpacklo_epi8((a), (b))
121
#  define VUNPACKHI8(a, b)  _mm512_unpackhi_epi8((a), (b))
122
#else
123
#  error "unsupported vector length"
124
#endif
125
126
0
#define VADD32_3X(a, b, c)  VADD32(VADD32((a), (b)), (c))
127
0
#define VADD32_4X(a, b, c, d) VADD32(VADD32((a), (b)), VADD32((c), (d)))
128
3.90k
#define VADD32_5X(a, b, c, d, e) VADD32((a), VADD32_4X((b), (c), (d), (e)))
129
#define VADD32_7X(a, b, c, d, e, f, g)  \
130
0
  VADD32(VADD32_3X((a), (b), (c)), VADD32_4X((d), (e), (f), (g)))
131
132
/* Sum the 32-bit elements of v_s1 and add them to s1, and likewise for s2. */
133
#undef reduce_to_32bits
134
static forceinline ATTRIBUTES void
135
ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p)
136
3.90k
{
137
3.90k
  __m128i v_s1_128, v_s2_128;
138
#if VL == 16
139
  {
140
    v_s1_128 = v_s1;
141
    v_s2_128 = v_s2;
142
  }
143
#else
144
  {
145
    __m256i v_s1_256, v_s2_256;
146
  #if VL == 32
147
    v_s1_256 = v_s1;
148
    v_s2_256 = v_s2;
149
  #else
150
    /* Reduce 512 bits to 256 bits. */
151
    v_s1_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s1, 0),
152
              _mm512_extracti64x4_epi64(v_s1, 1));
153
    v_s2_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s2, 0),
154
              _mm512_extracti64x4_epi64(v_s2, 1));
155
  #endif
156
    /* Reduce 256 bits to 128 bits. */
157
    v_s1_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s1_256, 0),
158
           _mm256_extracti128_si256(v_s1_256, 1));
159
    v_s2_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s2_256, 0),
160
           _mm256_extracti128_si256(v_s2_256, 1));
161
  }
162
#endif
163
164
  /*
165
   * Reduce 128 bits to 32 bits.
166
   *
167
   * If the bytes were summed into v_s1 using psadbw + paddd, then ignore
168
   * the odd-indexed elements of v_s1_128 since they are zero.
169
   */
170
#if USE_VNNI
171
  v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x31));
172
#endif
173
3.90k
  v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x31));
174
3.90k
  v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x02));
175
3.90k
  v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x02));
176
177
3.90k
  *s1_p += (u32)_mm_cvtsi128_si32(v_s1_128);
178
3.90k
  *s2_p += (u32)_mm_cvtsi128_si32(v_s2_128);
179
3.90k
}
Unexecuted instantiation: adler32.c:reduce_to_32bits_avx512_vl512_vnni
Unexecuted instantiation: adler32.c:reduce_to_32bits_avx512_vl256_vnni
Unexecuted instantiation: adler32.c:reduce_to_32bits_avx2_vnni
adler32.c:reduce_to_32bits_avx2
Line
Count
Source
136
3.90k
{
137
3.90k
  __m128i v_s1_128, v_s2_128;
138
#if VL == 16
139
  {
140
    v_s1_128 = v_s1;
141
    v_s2_128 = v_s2;
142
  }
143
#else
144
3.90k
  {
145
3.90k
    __m256i v_s1_256, v_s2_256;
146
3.90k
  #if VL == 32
147
3.90k
    v_s1_256 = v_s1;
148
3.90k
    v_s2_256 = v_s2;
149
  #else
150
    /* Reduce 512 bits to 256 bits. */
151
    v_s1_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s1, 0),
152
              _mm512_extracti64x4_epi64(v_s1, 1));
153
    v_s2_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s2, 0),
154
              _mm512_extracti64x4_epi64(v_s2, 1));
155
  #endif
156
    /* Reduce 256 bits to 128 bits. */
157
3.90k
    v_s1_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s1_256, 0),
158
3.90k
           _mm256_extracti128_si256(v_s1_256, 1));
159
3.90k
    v_s2_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s2_256, 0),
160
3.90k
           _mm256_extracti128_si256(v_s2_256, 1));
161
3.90k
  }
162
3.90k
#endif
163
164
  /*
165
   * Reduce 128 bits to 32 bits.
166
   *
167
   * If the bytes were summed into v_s1 using psadbw + paddd, then ignore
168
   * the odd-indexed elements of v_s1_128 since they are zero.
169
   */
170
#if USE_VNNI
171
  v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x31));
172
#endif
173
3.90k
  v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x31));
174
3.90k
  v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x02));
175
3.90k
  v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x02));
176
177
3.90k
  *s1_p += (u32)_mm_cvtsi128_si32(v_s1_128);
178
3.90k
  *s2_p += (u32)_mm_cvtsi128_si32(v_s2_128);
179
3.90k
}
Unexecuted instantiation: adler32.c:reduce_to_32bits_sse2
180
3.90k
#define reduce_to_32bits  ADD_SUFFIX(reduce_to_32bits)
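Note: the net effect of reduce_to_32bits() above is a horizontal sum of the 32-bit lanes of v_s1 and v_s2 into the scalar counters. A minimal scalar sketch of that effect, assuming the lanes were copied into plain arrays (an assumption made only for illustration):

#include <stddef.h>
#include <stdint.h>

/* Scalar model of the horizontal reduction: add every 32-bit lane of the two
 * vectors into *s1_p and *s2_p.  In the psadbw-based (non-VNNI) path the
 * odd-indexed lanes of the s1 vector are already zero, so summing all lanes
 * is still equivalent. */
static void reduce_to_32bits_model(const uint32_t *s1_lanes, const uint32_t *s2_lanes,
                                   size_t num_lanes, uint32_t *s1_p, uint32_t *s2_p)
{
	size_t i;

	for (i = 0; i < num_lanes; i++) {
		*s1_p += s1_lanes[i];
		*s2_p += s2_lanes[i];
	}
}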
181
182
static ATTRIBUTES u32
183
ADD_SUFFIX(adler32_x86)(u32 adler, const u8 *p, size_t len)
184
1.04k
{
185
#if USE_VNNI
186
  /* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */
187
  static const u8 _aligned_attribute(VL) raw_mults[VL] = {
188
  #if VL == 64
189
    64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
190
    48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
191
  #endif
192
  #if VL >= 32
193
    32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
194
  #endif
195
    16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,
196
  };
197
0
  const vec_t ones = VSET1_8(1);
198
#else
199
  /*
200
   * This contains the 16-bit values [2*VL, 2*VL - 1, 2*VL - 2, ..., 1].
201
   * For VL==32 the ordering is weird because it has to match the way that
202
   * vpunpcklbw and vpunpckhbw work on 128-bit lanes separately.
203
   */
204
  static const u16 _aligned_attribute(VL) raw_mults[4][VL / 2] = {
205
  #if VL == 16
206
    { 32, 31, 30, 29, 28, 27, 26, 25 },
207
    { 24, 23, 22, 21, 20, 19, 18, 17 },
208
    { 16, 15, 14, 13, 12, 11, 10, 9  },
209
    { 8,  7,  6,  5,  4,  3,  2,  1  },
210
  #elif VL == 32
211
    { 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 },
212
    { 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 },
213
    { 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10,  9 },
214
    { 24, 23, 22, 21, 20, 19, 18, 17,  8,  7,  6,  5,  4,  3,  2,  1 },
215
  #else
216
  #  error "unsupported parameters"
217
  #endif
218
  };
219
1.04k
  const vec_t mults_a = VLOAD(raw_mults[0]);
220
1.04k
  const vec_t mults_b = VLOAD(raw_mults[1]);
221
1.04k
  const vec_t mults_c = VLOAD(raw_mults[2]);
222
1.04k
  const vec_t mults_d = VLOAD(raw_mults[3]);
223
#endif
224
1.04k
  const vec_t zeroes = VSETZERO();
225
1.04k
  u32 s1 = adler & 0xFFFF;
226
1.04k
  u32 s2 = adler >> 16;
227
228
  /*
229
   * If the length is large and the pointer is misaligned, align it.
230
   * For smaller lengths, just take the misaligned load penalty.
231
   */
232
1.04k
  if (unlikely(len > 65536 && ((uintptr_t)p & (VL-1)))) {
233
656
    do {
234
656
      s1 += *p++;
235
656
      s2 += s1;
236
656
      len--;
237
656
    } while ((uintptr_t)p & (VL-1));
238
41
    s1 %= DIVISOR;
239
41
    s2 %= DIVISOR;
240
41
  }
241
242
#if USE_VNNI
243
  /*
244
   * This is Adler-32 using the vpdpbusd instruction from AVX512VNNI or
245
   * AVX-VNNI.  vpdpbusd multiplies the unsigned bytes of one vector by
246
   * the signed bytes of another vector and adds the sums in groups of 4
247
   * to the 32-bit elements of a third vector.  We use it in two ways:
248
   * multiplying the data bytes by a sequence like 64,63,62,...,1 for
249
   * calculating part of s2, and multiplying the data bytes by an all-ones
250
   * sequence 1,1,1,...,1 for calculating s1 and part of s2.  The all-ones
251
   * trick seems to be faster than the alternative of vpsadbw + vpaddd.
252
   */
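Note: as a concrete reading of the comment above, each 32-bit element produced by vpdpbusd behaves like the scalar sketch below (an illustration of the instruction's semantics, not libdeflate code):

#include <stdint.h>

/* One 32-bit element of VDPBUSD: accumulate the dot product of four unsigned
 * data bytes with four signed multiplier bytes.  The real instruction does
 * this for every 32-bit element of the vector at once, without saturation. */
static inline uint32_t dpbusd_element_model(uint32_t acc, const uint8_t u[4], const int8_t s[4])
{
	int i;

	for (i = 0; i < 4; i++)
		acc += (uint32_t)((int32_t)u[i] * (int32_t)s[i]);
	return acc;
}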
253
0
  while (len) {
254
    /*
255
     * Calculate the length of the next data chunk such that s1 and
256
     * s2 are guaranteed to not exceed UINT32_MAX.
257
     */
258
0
    size_t n = MIN(len, MAX_CHUNK_LEN & ~(4*VL - 1));
259
0
    vec_t mults = VLOAD(raw_mults);
260
0
    vec_t v_s1 = zeroes;
261
0
    vec_t v_s2 = zeroes;
262
263
    s2 += s1 * n;
264
    len -= n;
265
266
0
    if (n >= 4*VL) {
267
0
      vec_t v_s1_b = zeroes;
268
0
      vec_t v_s1_c = zeroes;
269
0
      vec_t v_s1_d = zeroes;
270
0
      vec_t v_s2_b = zeroes;
271
0
      vec_t v_s2_c = zeroes;
272
0
      vec_t v_s2_d = zeroes;
273
0
      vec_t v_s1_sums   = zeroes;
274
0
      vec_t v_s1_sums_b = zeroes;
275
0
      vec_t v_s1_sums_c = zeroes;
276
0
      vec_t v_s1_sums_d = zeroes;
277
0
      vec_t tmp0, tmp1;
278
279
0
      do {
280
0
        vec_t data_a = VLOADU(p + 0*VL);
281
0
        vec_t data_b = VLOADU(p + 1*VL);
282
0
        vec_t data_c = VLOADU(p + 2*VL);
283
0
        vec_t data_d = VLOADU(p + 3*VL);
284
285
        /*
286
         * Workaround for gcc bug where it generates
287
         * unnecessary move instructions
288
         * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892)
289
         */
290
      #if GCC_PREREQ(1, 0)
291
        __asm__("" : "+v" (data_a), "+v" (data_b),
292
               "+v" (data_c), "+v" (data_d));
293
      #endif
294
295
0
        v_s2   = VDPBUSD(v_s2,   data_a, mults);
296
0
        v_s2_b = VDPBUSD(v_s2_b, data_b, mults);
297
0
        v_s2_c = VDPBUSD(v_s2_c, data_c, mults);
298
0
        v_s2_d = VDPBUSD(v_s2_d, data_d, mults);
299
300
0
        v_s1_sums   = VADD32(v_s1_sums,   v_s1);
301
0
        v_s1_sums_b = VADD32(v_s1_sums_b, v_s1_b);
302
0
        v_s1_sums_c = VADD32(v_s1_sums_c, v_s1_c);
303
0
        v_s1_sums_d = VADD32(v_s1_sums_d, v_s1_d);
304
305
0
        v_s1   = VDPBUSD(v_s1,   data_a, ones);
306
0
        v_s1_b = VDPBUSD(v_s1_b, data_b, ones);
307
0
        v_s1_c = VDPBUSD(v_s1_c, data_c, ones);
308
0
        v_s1_d = VDPBUSD(v_s1_d, data_d, ones);
309
310
        /* Same gcc bug workaround.  See above */
311
      #if GCC_PREREQ(1, 0) && !defined(ARCH_X86_32)
312
        __asm__("" : "+v" (v_s2), "+v" (v_s2_b),
313
               "+v" (v_s2_c), "+v" (v_s2_d),
314
               "+v" (v_s1_sums),
315
               "+v" (v_s1_sums_b),
316
               "+v" (v_s1_sums_c),
317
               "+v" (v_s1_sums_d),
318
               "+v" (v_s1), "+v" (v_s1_b),
319
               "+v" (v_s1_c), "+v" (v_s1_d));
320
      #endif
321
0
        p += 4*VL;
322
0
        n -= 4*VL;
323
0
      } while (n >= 4*VL);
324
325
      /*
326
       * Reduce into v_s1 and v_s2 as follows:
327
       *
328
       * v_s2 = v_s2 + v_s2_b + v_s2_c + v_s2_d +
329
       *    (4*VL)*(v_s1_sums   + v_s1_sums_b +
330
       *      v_s1_sums_c + v_s1_sums_d) +
331
       *    (3*VL)*v_s1 + (2*VL)*v_s1_b + VL*v_s1_c
332
       * v_s1 = v_s1 + v_s1_b + v_s1_c + v_s1_d
333
       */
334
0
      tmp0 = VADD32(v_s1, v_s1_b);
335
0
      tmp1 = VADD32(v_s1, v_s1_c);
336
0
      v_s1_sums = VADD32_4X(v_s1_sums, v_s1_sums_b,
337
                v_s1_sums_c, v_s1_sums_d);
338
0
      v_s1 = VADD32_3X(tmp0, v_s1_c, v_s1_d);
339
0
      v_s2 = VADD32_7X(VSLL32(v_s1_sums, LOG2_VL + 2),
340
0
           VSLL32(tmp0, LOG2_VL + 1),
341
0
           VSLL32(tmp1, LOG2_VL),
342
0
           v_s2, v_s2_b, v_s2_c, v_s2_d);
343
0
    }
344
345
    /* Process the last 0 <= n < 4*VL bytes of the chunk. */
346
0
    if (n >= 2*VL) {
347
0
      const vec_t data_a = VLOADU(p + 0*VL);
348
0
      const vec_t data_b = VLOADU(p + 1*VL);
349
350
0
      v_s2 = VADD32(v_s2, VSLL32(v_s1, LOG2_VL + 1));
351
0
      v_s1 = VDPBUSD(v_s1, data_a, ones);
352
0
      v_s1 = VDPBUSD(v_s1, data_b, ones);
353
0
      v_s2 = VDPBUSD(v_s2, data_a, VSET1_8(VL));
354
0
      v_s2 = VDPBUSD(v_s2, data_a, mults);
355
0
      v_s2 = VDPBUSD(v_s2, data_b, mults);
356
0
      p += 2*VL;
357
0
      n -= 2*VL;
358
0
    }
359
0
    if (n) {
360
      /* Process the last 0 < n < 2*VL bytes of the chunk. */
361
0
      vec_t data;
362
363
0
      v_s2 = VADD32(v_s2, VMULLO32(v_s1, VSET1_32(n)));
364
365
0
      mults = VADD8(mults, VSET1_8((int)n - VL));
366
0
      if (n > VL) {
367
0
        data = VLOADU(p);
368
0
        v_s1 = VDPBUSD(v_s1, data, ones);
369
0
        v_s2 = VDPBUSD(v_s2, data, mults);
370
0
        p += VL;
371
0
        n -= VL;
372
0
        mults = VADD8(mults, VSET1_8(-VL));
373
0
      }
374
      /*
375
       * Process the last 0 < n <= VL bytes of the chunk.
376
       * Utilize a masked load if it's available.
377
       */
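Note: a worked example of the mask expression used just below, with values chosen purely for illustration:

/* With VL = 32 (so mask_t is u32) and n = 5:
 *   (u32)-1 >> (VL - n) = 0xFFFFFFFF >> 27 = 0x1F
 * i.e. only the low 5 byte lanes are enabled, so VMASKZ_LOADU() loads those
 * 5 bytes and zero-fills the remaining lanes. */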
378
    #if USE_AVX512
379
0
      data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p);
380
    #else
381
      data = zeroes;
382
      memcpy(&data, p, n);
383
    #endif
384
0
      v_s1 = VDPBUSD(v_s1, data, ones);
385
0
      v_s2 = VDPBUSD(v_s2, data, mults);
386
0
      p += n;
387
0
    }
388
389
0
    reduce_to_32bits(v_s1, v_s2, &s1, &s2);
390
0
    s1 %= DIVISOR;
391
0
    s2 %= DIVISOR;
392
0
  }
393
#else /* USE_VNNI */
394
  /*
395
   * This is Adler-32 for SSE2 and AVX2.
396
   *
397
   * To horizontally sum bytes, use psadbw + paddd, where one of the
398
   * arguments to psadbw is all-zeroes.
399
   *
400
   * For the s2 contribution from (2*VL - i)*data[i] for each of the 2*VL
401
   * bytes of each iteration of the inner loop, use punpck{l,h}bw + paddw
402
   * to sum, for each i across iterations, byte i into a corresponding
403
   * 16-bit counter in v_byte_sums_*.  After the inner loop, use pmaddwd
404
   * to multiply each counter by (2*VL - i), then add the products to s2.
405
   *
406
   * An alternative implementation would use pmaddubsw and pmaddwd in the
407
   * inner loop to do (2*VL - i)*data[i] directly and add the products in
408
   * groups of 4 to 32-bit counters.  However, on average that approach
409
   * seems to be slower than the current approach which delays the
410
   * multiplications.  Also, pmaddubsw requires SSSE3; the current
411
   * approach keeps the implementation aligned between SSE2 and AVX2.
412
   *
413
   * The inner loop processes 2*VL bytes per iteration.  Increasing this
414
   * to 4*VL doesn't seem to be helpful here.
415
   */
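Note: the delayed-multiplication bookkeeping described above can be restated in scalar form. A hedged sketch with invented names, keeping the per-position counters in a plain array instead of the v_byte_sums_* vectors; the reduction mod DIVISOR is left to the caller, as in the real code:

#include <stddef.h>
#include <stdint.h>

/* Scalar model of one chunk of `blocks` blocks of two_vl (= 2*VL) bytes:
 * accumulate per-position byte sums during the loop and apply the
 * (2*VL - i) weights only once at the end, mirroring pmaddwd. */
static void adler32_nonvnni_chunk_model(const uint8_t *p, size_t blocks, size_t two_vl,
                                        uint32_t *s1, uint32_t *s2)
{
	uint32_t chunk_s1 = 0;          /* models v_s1 (psadbw + paddd) */
	uint32_t s1_sums = 0;           /* models v_s1_sums */
	uint32_t byte_sums[64] = {0};   /* models v_byte_sums_*; assumes two_vl <= 64 */
	size_t b, i;

	*s2 += *s1 * (uint32_t)(blocks * two_vl);  /* initial s1 term, added up front */
	for (b = 0; b < blocks; b++) {
		s1_sums += chunk_s1;               /* v_s1_sums = VADD32(v_s1_sums, v_s1) */
		for (i = 0; i < two_vl; i++) {
			byte_sums[i] += p[b * two_vl + i];  /* punpck{l,h}bw + paddw */
			chunk_s1 += p[b * two_vl + i];      /* psadbw + paddd */
		}
	}
	/* Deferred weights: (2*VL)*s1_sums + sum_i (2*VL - i)*byte_sums[i]. */
	*s2 += (uint32_t)two_vl * s1_sums;
	for (i = 0; i < two_vl; i++)
		*s2 += (uint32_t)(two_vl - i) * byte_sums[i];
	*s1 += chunk_s1;
}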
416
4.97k
  while (len) {
417
    /*
418
     * Calculate the length of the next data chunk such that s1 and
419
     * s2 are guaranteed to not exceed UINT32_MAX, and every
420
     * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX.
421
     * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are
422
     * used with pmaddwd which does signed multiplication.  In the
423
     * SSE2 case this limits chunks to 4096 bytes instead of 5536.
424
     */
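Note: for concreteness, the bound computed on the next lines works out as follows, assuming MAX_CHUNK_LEN is the usual Adler-32 chunk limit of 5552 bytes (it is defined in adler32.c and not shown in this report):

/* INT16_MAX / UINT8_MAX = 32767 / 255 = 128 (integer division), so:
 *   SSE2 (VL = 16): MIN(2*16*128, MAX_CHUNK_LEN) & ~31 = MIN(4096, 5552) & ~31 = 4096
 *   AVX2 (VL = 32): MIN(2*32*128, MAX_CHUNK_LEN) & ~63 = MIN(8192, 5552) & ~63 = 5504 */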
425
3.93k
    size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX),
426
3.93k
          MAX_CHUNK_LEN) & ~(2*VL - 1));
427
3.93k
    len -= n;
428
429
3.93k
    if (n >= 2*VL) {
430
3.90k
      vec_t v_s1 = zeroes;
431
3.90k
      vec_t v_s1_sums = zeroes;
432
3.90k
      vec_t v_byte_sums_a = zeroes;
433
3.90k
      vec_t v_byte_sums_b = zeroes;
434
3.90k
      vec_t v_byte_sums_c = zeroes;
435
3.90k
      vec_t v_byte_sums_d = zeroes;
436
3.90k
      vec_t v_s2;
437
438
3.90k
      s2 += s1 * (n & ~(2*VL - 1));
439
440
287k
      do {
441
287k
        vec_t data_a = VLOADU(p + 0*VL);
442
287k
        vec_t data_b = VLOADU(p + 1*VL);
443
444
287k
        v_s1_sums = VADD32(v_s1_sums, v_s1);
445
287k
        v_byte_sums_a = VADD16(v_byte_sums_a,
446
287k
                   VUNPACKLO8(data_a, zeroes));
447
287k
        v_byte_sums_b = VADD16(v_byte_sums_b,
448
287k
                   VUNPACKHI8(data_a, zeroes));
449
287k
        v_byte_sums_c = VADD16(v_byte_sums_c,
450
287k
                   VUNPACKLO8(data_b, zeroes));
451
287k
        v_byte_sums_d = VADD16(v_byte_sums_d,
452
287k
                   VUNPACKHI8(data_b, zeroes));
453
287k
        v_s1 = VADD32(v_s1,
454
287k
                VADD32(VSAD8(data_a, zeroes),
455
287k
                 VSAD8(data_b, zeroes)));
456
        /*
457
         * Workaround for gcc bug where it generates
458
         * unnecessary move instructions
459
         * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892)
460
         */
461
      #if GCC_PREREQ(1, 0)
462
        __asm__("" : "+x" (v_s1), "+x" (v_s1_sums),
463
               "+x" (v_byte_sums_a),
464
               "+x" (v_byte_sums_b),
465
               "+x" (v_byte_sums_c),
466
               "+x" (v_byte_sums_d));
467
      #endif
468
287k
        p += 2*VL;
469
287k
        n -= 2*VL;
470
287k
      } while (n >= 2*VL);
471
472
      /*
473
       * Calculate v_s2 as (2*VL)*v_s1_sums +
474
       * [2*VL, 2*VL - 1, 2*VL - 2, ..., 1] * v_byte_sums.
475
       * Then update s1 and s2 from v_s1 and v_s2.
476
       */
477
3.90k
      v_s2 = VADD32_5X(VSLL32(v_s1_sums, LOG2_VL + 1),
478
3.90k
           VMADD16(v_byte_sums_a, mults_a),
479
3.90k
           VMADD16(v_byte_sums_b, mults_b),
480
3.90k
           VMADD16(v_byte_sums_c, mults_c),
481
3.90k
           VMADD16(v_byte_sums_d, mults_d));
482
3.90k
      reduce_to_32bits(v_s1, v_s2, &s1, &s2);
483
3.90k
    }
484
    /*
485
     * Process the last 0 <= n < 2*VL bytes of the chunk using
486
     * scalar instructions and reduce s1 and s2 mod DIVISOR.
487
     */
488
3.93k
    ADLER32_CHUNK(s1, s2, p, n);
489
3.93k
  }
490
#endif /* !USE_VNNI */
491
1.04k
  return (s2 << 16) | s1;
492
1.04k
}
Unexecuted instantiation: adler32.c:adler32_x86_avx512_vl512_vnni
Unexecuted instantiation: adler32.c:adler32_x86_avx512_vl256_vnni
Unexecuted instantiation: adler32.c:adler32_x86_avx2_vnni
adler32.c:adler32_x86_avx2
Line
Count
Source
184
1.04k
{
185
#if USE_VNNI
186
  /* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */
187
  static const u8 _aligned_attribute(VL) raw_mults[VL] = {
188
  #if VL == 64
189
    64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
190
    48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
191
  #endif
192
  #if VL >= 32
193
    32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
194
  #endif
195
    16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,
196
  };
197
  const vec_t ones = VSET1_8(1);
198
#else
199
  /*
200
   * This contains the 16-bit values [2*VL, 2*VL - 1, 2*VL - 2, ..., 1].
201
   * For VL==32 the ordering is weird because it has to match the way that
202
   * vpunpcklbw and vpunpckhbw work on 128-bit lanes separately.
203
   */
204
1.04k
  static const u16 _aligned_attribute(VL) raw_mults[4][VL / 2] = {
205
  #if VL == 16
206
    { 32, 31, 30, 29, 28, 27, 26, 25 },
207
    { 24, 23, 22, 21, 20, 19, 18, 17 },
208
    { 16, 15, 14, 13, 12, 11, 10, 9  },
209
    { 8,  7,  6,  5,  4,  3,  2,  1  },
210
  #elif VL == 32
211
    { 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 },
212
1.04k
    { 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 },
213
1.04k
    { 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10,  9 },
214
1.04k
    { 24, 23, 22, 21, 20, 19, 18, 17,  8,  7,  6,  5,  4,  3,  2,  1 },
215
  #else
216
  #  error "unsupported parameters"
217
  #endif
218
1.04k
  };
219
1.04k
  const vec_t mults_a = VLOAD(raw_mults[0]);
220
1.04k
  const vec_t mults_b = VLOAD(raw_mults[1]);
221
1.04k
  const vec_t mults_c = VLOAD(raw_mults[2]);
222
1.04k
  const vec_t mults_d = VLOAD(raw_mults[3]);
223
1.04k
#endif
224
1.04k
  const vec_t zeroes = VSETZERO();
225
1.04k
  u32 s1 = adler & 0xFFFF;
226
1.04k
  u32 s2 = adler >> 16;
227
228
  /*
229
   * If the length is large and the pointer is misaligned, align it.
230
   * For smaller lengths, just take the misaligned load penalty.
231
   */
232
1.04k
  if (unlikely(len > 65536 && ((uintptr_t)p & (VL-1)))) {
233
656
    do {
234
656
      s1 += *p++;
235
656
      s2 += s1;
236
656
      len--;
237
656
    } while ((uintptr_t)p & (VL-1));
238
41
    s1 %= DIVISOR;
239
41
    s2 %= DIVISOR;
240
41
  }
241
242
#if USE_VNNI
243
  /*
244
   * This is Adler-32 using the vpdpbusd instruction from AVX512VNNI or
245
   * AVX-VNNI.  vpdpbusd multiplies the unsigned bytes of one vector by
246
   * the signed bytes of another vector and adds the sums in groups of 4
247
   * to the 32-bit elements of a third vector.  We use it in two ways:
248
   * multiplying the data bytes by a sequence like 64,63,62,...,1 for
249
   * calculating part of s2, and multiplying the data bytes by an all-ones
250
   * sequence 1,1,1,...,1 for calculating s1 and part of s2.  The all-ones
251
   * trick seems to be faster than the alternative of vpsadbw + vpaddd.
252
   */
253
  while (len) {
254
    /*
255
     * Calculate the length of the next data chunk such that s1 and
256
     * s2 are guaranteed to not exceed UINT32_MAX.
257
     */
258
    size_t n = MIN(len, MAX_CHUNK_LEN & ~(4*VL - 1));
259
    vec_t mults = VLOAD(raw_mults);
260
    vec_t v_s1 = zeroes;
261
    vec_t v_s2 = zeroes;
262
263
    s2 += s1 * n;
264
    len -= n;
265
266
    if (n >= 4*VL) {
267
      vec_t v_s1_b = zeroes;
268
      vec_t v_s1_c = zeroes;
269
      vec_t v_s1_d = zeroes;
270
      vec_t v_s2_b = zeroes;
271
      vec_t v_s2_c = zeroes;
272
      vec_t v_s2_d = zeroes;
273
      vec_t v_s1_sums   = zeroes;
274
      vec_t v_s1_sums_b = zeroes;
275
      vec_t v_s1_sums_c = zeroes;
276
      vec_t v_s1_sums_d = zeroes;
277
      vec_t tmp0, tmp1;
278
279
      do {
280
        vec_t data_a = VLOADU(p + 0*VL);
281
        vec_t data_b = VLOADU(p + 1*VL);
282
        vec_t data_c = VLOADU(p + 2*VL);
283
        vec_t data_d = VLOADU(p + 3*VL);
284
285
        /*
286
         * Workaround for gcc bug where it generates
287
         * unnecessary move instructions
288
         * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892)
289
         */
290
      #if GCC_PREREQ(1, 0)
291
        __asm__("" : "+v" (data_a), "+v" (data_b),
292
               "+v" (data_c), "+v" (data_d));
293
      #endif
294
295
        v_s2   = VDPBUSD(v_s2,   data_a, mults);
296
        v_s2_b = VDPBUSD(v_s2_b, data_b, mults);
297
        v_s2_c = VDPBUSD(v_s2_c, data_c, mults);
298
        v_s2_d = VDPBUSD(v_s2_d, data_d, mults);
299
300
        v_s1_sums   = VADD32(v_s1_sums,   v_s1);
301
        v_s1_sums_b = VADD32(v_s1_sums_b, v_s1_b);
302
        v_s1_sums_c = VADD32(v_s1_sums_c, v_s1_c);
303
        v_s1_sums_d = VADD32(v_s1_sums_d, v_s1_d);
304
305
        v_s1   = VDPBUSD(v_s1,   data_a, ones);
306
        v_s1_b = VDPBUSD(v_s1_b, data_b, ones);
307
        v_s1_c = VDPBUSD(v_s1_c, data_c, ones);
308
        v_s1_d = VDPBUSD(v_s1_d, data_d, ones);
309
310
        /* Same gcc bug workaround.  See above */
311
      #if GCC_PREREQ(1, 0) && !defined(ARCH_X86_32)
312
        __asm__("" : "+v" (v_s2), "+v" (v_s2_b),
313
               "+v" (v_s2_c), "+v" (v_s2_d),
314
               "+v" (v_s1_sums),
315
               "+v" (v_s1_sums_b),
316
               "+v" (v_s1_sums_c),
317
               "+v" (v_s1_sums_d),
318
               "+v" (v_s1), "+v" (v_s1_b),
319
               "+v" (v_s1_c), "+v" (v_s1_d));
320
      #endif
321
        p += 4*VL;
322
        n -= 4*VL;
323
      } while (n >= 4*VL);
324
325
      /*
326
       * Reduce into v_s1 and v_s2 as follows:
327
       *
328
       * v_s2 = v_s2 + v_s2_b + v_s2_c + v_s2_d +
329
       *    (4*VL)*(v_s1_sums   + v_s1_sums_b +
330
       *      v_s1_sums_c + v_s1_sums_d) +
331
       *    (3*VL)*v_s1 + (2*VL)*v_s1_b + VL*v_s1_c
332
       * v_s1 = v_s1 + v_s1_b + v_s1_c + v_s1_d
333
       */
334
      tmp0 = VADD32(v_s1, v_s1_b);
335
      tmp1 = VADD32(v_s1, v_s1_c);
336
      v_s1_sums = VADD32_4X(v_s1_sums, v_s1_sums_b,
337
                v_s1_sums_c, v_s1_sums_d);
338
      v_s1 = VADD32_3X(tmp0, v_s1_c, v_s1_d);
339
      v_s2 = VADD32_7X(VSLL32(v_s1_sums, LOG2_VL + 2),
340
           VSLL32(tmp0, LOG2_VL + 1),
341
           VSLL32(tmp1, LOG2_VL),
342
           v_s2, v_s2_b, v_s2_c, v_s2_d);
343
    }
344
345
    /* Process the last 0 <= n < 4*VL bytes of the chunk. */
346
    if (n >= 2*VL) {
347
      const vec_t data_a = VLOADU(p + 0*VL);
348
      const vec_t data_b = VLOADU(p + 1*VL);
349
350
      v_s2 = VADD32(v_s2, VSLL32(v_s1, LOG2_VL + 1));
351
      v_s1 = VDPBUSD(v_s1, data_a, ones);
352
      v_s1 = VDPBUSD(v_s1, data_b, ones);
353
      v_s2 = VDPBUSD(v_s2, data_a, VSET1_8(VL));
354
      v_s2 = VDPBUSD(v_s2, data_a, mults);
355
      v_s2 = VDPBUSD(v_s2, data_b, mults);
356
      p += 2*VL;
357
      n -= 2*VL;
358
    }
359
    if (n) {
360
      /* Process the last 0 < n < 2*VL bytes of the chunk. */
361
      vec_t data;
362
363
      v_s2 = VADD32(v_s2, VMULLO32(v_s1, VSET1_32(n)));
364
365
      mults = VADD8(mults, VSET1_8((int)n - VL));
366
      if (n > VL) {
367
        data = VLOADU(p);
368
        v_s1 = VDPBUSD(v_s1, data, ones);
369
        v_s2 = VDPBUSD(v_s2, data, mults);
370
        p += VL;
371
        n -= VL;
372
        mults = VADD8(mults, VSET1_8(-VL));
373
      }
374
      /*
375
       * Process the last 0 < n <= VL bytes of the chunk.
376
       * Utilize a masked load if it's available.
377
       */
378
    #if USE_AVX512
379
      data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p);
380
    #else
381
      data = zeroes;
382
      memcpy(&data, p, n);
383
    #endif
384
      v_s1 = VDPBUSD(v_s1, data, ones);
385
      v_s2 = VDPBUSD(v_s2, data, mults);
386
      p += n;
387
    }
388
389
    reduce_to_32bits(v_s1, v_s2, &s1, &s2);
390
    s1 %= DIVISOR;
391
    s2 %= DIVISOR;
392
  }
393
#else /* USE_VNNI */
394
  /*
395
   * This is Adler-32 for SSE2 and AVX2.
396
   *
397
   * To horizontally sum bytes, use psadbw + paddd, where one of the
398
   * arguments to psadbw is all-zeroes.
399
   *
400
   * For the s2 contribution from (2*VL - i)*data[i] for each of the 2*VL
401
   * bytes of each iteration of the inner loop, use punpck{l,h}bw + paddw
402
   * to sum, for each i across iterations, byte i into a corresponding
403
   * 16-bit counter in v_byte_sums_*.  After the inner loop, use pmaddwd
404
   * to multiply each counter by (2*VL - i), then add the products to s2.
405
   *
406
   * An alternative implementation would use pmaddubsw and pmaddwd in the
407
   * inner loop to do (2*VL - i)*data[i] directly and add the products in
408
   * groups of 4 to 32-bit counters.  However, on average that approach
409
   * seems to be slower than the current approach which delays the
410
   * multiplications.  Also, pmaddubsw requires SSSE3; the current
411
   * approach keeps the implementation aligned between SSE2 and AVX2.
412
   *
413
   * The inner loop processes 2*VL bytes per iteration.  Increasing this
414
   * to 4*VL doesn't seem to be helpful here.
415
   */
416
4.97k
  while (len) {
417
    /*
418
     * Calculate the length of the next data chunk such that s1 and
419
     * s2 are guaranteed to not exceed UINT32_MAX, and every
420
     * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX.
421
     * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are
422
     * used with pmaddwd which does signed multiplication.  In the
423
     * SSE2 case this limits chunks to 4096 bytes instead of 5536.
424
     */
425
3.93k
    size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX),
426
3.93k
          MAX_CHUNK_LEN) & ~(2*VL - 1));
427
3.93k
    len -= n;
428
429
3.93k
    if (n >= 2*VL) {
430
3.90k
      vec_t v_s1 = zeroes;
431
3.90k
      vec_t v_s1_sums = zeroes;
432
3.90k
      vec_t v_byte_sums_a = zeroes;
433
3.90k
      vec_t v_byte_sums_b = zeroes;
434
3.90k
      vec_t v_byte_sums_c = zeroes;
435
3.90k
      vec_t v_byte_sums_d = zeroes;
436
3.90k
      vec_t v_s2;
437
438
3.90k
      s2 += s1 * (n & ~(2*VL - 1));
439
440
287k
      do {
441
287k
        vec_t data_a = VLOADU(p + 0*VL);
442
287k
        vec_t data_b = VLOADU(p + 1*VL);
443
444
287k
        v_s1_sums = VADD32(v_s1_sums, v_s1);
445
287k
        v_byte_sums_a = VADD16(v_byte_sums_a,
446
287k
                   VUNPACKLO8(data_a, zeroes));
447
287k
        v_byte_sums_b = VADD16(v_byte_sums_b,
448
287k
                   VUNPACKHI8(data_a, zeroes));
449
287k
        v_byte_sums_c = VADD16(v_byte_sums_c,
450
287k
                   VUNPACKLO8(data_b, zeroes));
451
287k
        v_byte_sums_d = VADD16(v_byte_sums_d,
452
287k
                   VUNPACKHI8(data_b, zeroes));
453
287k
        v_s1 = VADD32(v_s1,
454
287k
                VADD32(VSAD8(data_a, zeroes),
455
287k
                 VSAD8(data_b, zeroes)));
456
        /*
457
         * Workaround for gcc bug where it generates
458
         * unnecessary move instructions
459
         * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892)
460
         */
461
      #if GCC_PREREQ(1, 0)
462
        __asm__("" : "+x" (v_s1), "+x" (v_s1_sums),
463
               "+x" (v_byte_sums_a),
464
               "+x" (v_byte_sums_b),
465
               "+x" (v_byte_sums_c),
466
               "+x" (v_byte_sums_d));
467
      #endif
468
287k
        p += 2*VL;
469
287k
        n -= 2*VL;
470
287k
      } while (n >= 2*VL);
471
472
      /*
473
       * Calculate v_s2 as (2*VL)*v_s1_sums +
474
       * [2*VL, 2*VL - 1, 2*VL - 2, ..., 1] * v_byte_sums.
475
       * Then update s1 and s2 from v_s1 and v_s2.
476
       */
477
3.90k
      v_s2 = VADD32_5X(VSLL32(v_s1_sums, LOG2_VL + 1),
478
3.90k
           VMADD16(v_byte_sums_a, mults_a),
479
3.90k
           VMADD16(v_byte_sums_b, mults_b),
480
3.90k
           VMADD16(v_byte_sums_c, mults_c),
481
3.90k
           VMADD16(v_byte_sums_d, mults_d));
482
3.90k
      reduce_to_32bits(v_s1, v_s2, &s1, &s2);
483
3.90k
    }
484
    /*
485
     * Process the last 0 <= n < 2*VL bytes of the chunk using
486
     * scalar instructions and reduce s1 and s2 mod DIVISOR.
487
     */
488
3.93k
    ADLER32_CHUNK(s1, s2, p, n);
489
3.93k
  }
490
1.04k
#endif /* !USE_VNNI */
491
1.04k
  return (s2 << 16) | s1;
492
1.04k
}
Unexecuted instantiation: adler32.c:adler32_x86_sse2
493
494
#undef vec_t
495
#undef mask_t
496
#undef LOG2_VL
497
#undef VADD8
498
#undef VADD16
499
#undef VADD32
500
#undef VDPBUSD
501
#undef VLOAD
502
#undef VLOADU
503
#undef VMADD16
504
#undef VMASKZ_LOADU
505
#undef VMULLO32
506
#undef VSAD8
507
#undef VSET1_8
508
#undef VSET1_32
509
#undef VSETZERO
510
#undef VSLL32
511
#undef VUNPACKLO8
512
#undef VUNPACKHI8
513
514
#undef SUFFIX
515
#undef ATTRIBUTES
516
#undef VL
517
#undef USE_VNNI
518
#undef USE_AVX512