Coverage Report

Created: 2024-07-27 06:20

/src/c-blosc2/blosc/shuffle-avx2.c
Line
Count
Source
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
#include "shuffle-avx2.h"
12
#include "shuffle-generic.h"
13
#include <stdlib.h>
14
15
/* Make sure AVX2 is available for the compilation target and compiler. */
16
#if defined(__AVX2__)
17
18
#include <immintrin.h>
19
20
#include <stdint.h>
21
22
/* The following is useful for debugging purposes */
23
#if 0
24
#include <stdio.h>
25
#include <string.h>
26
27
static void printymm(__m256i ymm0)
28
{
29
  uint8_t buf[32];
30
31
  ((__m256i *)buf)[0] = ymm0;
32
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
33
          buf[0], buf[1], buf[2], buf[3],
34
          buf[4], buf[5], buf[6], buf[7],
35
          buf[8], buf[9], buf[10], buf[11],
36
          buf[12], buf[13], buf[14], buf[15],
37
          buf[16], buf[17], buf[18], buf[19],
38
          buf[20], buf[21], buf[22], buf[23],
39
          buf[24], buf[25], buf[26], buf[27],
40
          buf[28], buf[29], buf[30], buf[31]);
41
}
42
#endif
43
44
/* GCC doesn't include the split load/store intrinsics
45
   needed for the tiled shuffle, so define them here. */
46
#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC)
47
static inline __m256i
48
__attribute__((__always_inline__))
49
_mm256_loadu2_m128i(const __m128i* const hiaddr, const __m128i* const loaddr) {
50
  return _mm256_inserti128_si256(
51
      _mm256_castsi128_si256(_mm_loadu_si128(loaddr)), _mm_loadu_si128(hiaddr), 1);
52
}
53
54
static inline void
55
__attribute__((__always_inline__))
56
_mm256_storeu2_m128i(__m128i* const hiaddr, __m128i* const loaddr, const __m256i a) {
57
  _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
58
  _mm_storeu_si128(hiaddr, _mm256_extracti128_si256(a, 1));
59
}
60
#endif  /* defined(__GNUC__) */
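A minimal usage sketch for the split load/store helpers above (illustrative only; it relies on the includes earlier in this file and, on non-GCC compilers, on the intrinsics of the same name supplied by the toolchain; the buffer names are hypothetical):

/* Gather two independent 16-byte halves into one YMM register, then
   scatter them back out to two independent destinations. */
static void split_loadstore_example(uint8_t* dst_hi, uint8_t* dst_lo,
                                    const uint8_t* src_hi, const uint8_t* src_lo) {
  /* The lower 128 bits come from loaddr, the upper 128 bits from hiaddr. */
  const __m256i v = _mm256_loadu2_m128i((const __m128i*)src_hi,
                                        (const __m128i*)src_lo);
  _mm256_storeu2_m128i((__m128i*)dst_hi, (__m128i*)dst_lo, v);
}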
61
62
/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
63
static void
64
shuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
65
0
              const int32_t vectorizable_elements, const int32_t total_elements) {
66
0
  static const int32_t bytesoftype = 2;
67
0
  int32_t j;
68
0
  int k;
69
0
  __m256i ymm0[2], ymm1[2];
70
71
  /* Create the shuffle mask.
72
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
73
     most to least significant (i.e., their order is reversed when compared to
74
     loading the mask from an array). */
75
0
  const __m256i shmask = _mm256_set_epi8(
76
0
      0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
77
0
      0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00,
78
0
      0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
79
0
      0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00);
80
81
0
  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
82
    /* Fetch 32 elements (64 bytes) then transpose bytes, words and double words. */
83
0
    for (k = 0; k < 2; k++) {
84
0
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
85
0
      ymm1[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
86
0
    }
87
88
0
    ymm0[0] = _mm256_permute4x64_epi64(ymm1[0], 0xd8);
89
0
    ymm0[1] = _mm256_permute4x64_epi64(ymm1[1], 0x8d);
90
91
0
    ymm1[0] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0xf0);
92
0
    ymm0[1] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0x0f);
93
0
    ymm1[1] = _mm256_permute4x64_epi64(ymm0[1], 0x4e);
94
95
    /* Store the result vectors */
96
0
    uint8_t* const dest_for_jth_element = dest + j;
97
0
    for (k = 0; k < 2; k++) {
98
0
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm1[k]);
99
0
    }
100
0
  }
101
0
}
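The NOTE about argument ordering in the 'set' intrinsics can be made concrete by building the same mask from an in-memory array; a minimal sketch (illustrative only, relying on the includes above; the names are hypothetical):

/* The 32 mask bytes of shuffle2_avx2 in their in-memory order (byte 0 first).
   The _mm256_set_epi8 call above lists these same bytes from most to least
   significant, i.e. in the reverse order. */
static const uint8_t shmask2_bytes_example[32] = {
    0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
    0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f,
    0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
    0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f};

static __m256i load_shmask2_example(void) {
  /* Yields the same vector as the _mm256_set_epi8 call in shuffle2_avx2. */
  return _mm256_loadu_si256((const __m256i*)shmask2_bytes_example);
}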
102
103
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
104
static void
105
shuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
106
0
              const int32_t vectorizable_elements, const int32_t total_elements) {
107
0
  static const int32_t bytesoftype = 4;
108
0
  int32_t i;
109
0
  int j;
110
0
  __m256i ymm0[4], ymm1[4];
111
112
  /* Create the shuffle mask.
113
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
114
     most to least significant (i.e., their order is reversed when compared to
115
     loading the mask from an array). */
116
0
  const __m256i mask = _mm256_set_epi32(
117
0
      0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
118
119
0
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
120
    /* Fetch 32 elements (128 bytes) then transpose bytes and words. */
121
0
    for (j = 0; j < 4; j++) {
122
0
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i))));
123
0
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8);
124
0
      ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d);
125
0
      ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]);
126
0
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x04e);
127
0
      ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]);
128
0
    }
129
    /* Transpose double words */
130
0
    for (j = 0; j < 2; j++) {
131
0
      ymm1[j * 2] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
132
0
      ymm1[j * 2 + 1] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
133
0
    }
134
    /* Transpose quad words */
135
0
    for (j = 0; j < 2; j++) {
136
0
      ymm0[j * 2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j + 2]);
137
0
      ymm0[j * 2 + 1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j + 2]);
138
0
    }
139
0
    for (j = 0; j < 4; j++) {
140
0
      ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask);
141
0
    }
142
    /* Store the result vectors */
143
0
    uint8_t* const dest_for_ith_element = dest + i;
144
0
    for (j = 0; j < 4; j++) {
145
0
      _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]);
146
0
    }
147
0
  }
148
0
}
149
150
/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
151
static void
152
shuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
153
0
              const int32_t vectorizable_elements, const int32_t total_elements) {
154
0
  static const int32_t bytesoftype = 8;
155
0
  int32_t j;
156
0
  int k, l;
157
0
  __m256i ymm0[8], ymm1[8];
158
159
0
  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
160
    /* Fetch 32 elements (256 bytes) then transpose bytes. */
161
0
    for (k = 0; k < 8; k++) {
162
0
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
163
0
      ymm1[k] = _mm256_shuffle_epi32(ymm0[k], 0x4e);
164
0
      ymm1[k] = _mm256_unpacklo_epi8(ymm0[k], ymm1[k]);
165
0
    }
166
    /* Transpose words */
167
0
    for (k = 0, l = 0; k < 4; k++, l += 2) {
168
0
      ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 1]);
169
0
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 1]);
170
0
    }
171
    /* Transpose double words */
172
0
    for (k = 0, l = 0; k < 4; k++, l++) {
173
0
      if (k == 2) l += 2;
174
0
      ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 2]);
175
0
      ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 2]);
176
0
    }
177
    /* Transpose quad words */
178
0
    for (k = 0; k < 4; k++) {
179
0
      ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 4]);
180
0
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 4]);
181
0
    }
182
0
    for (k = 0; k < 8; k++) {
183
0
      ymm1[k] = _mm256_permute4x64_epi64(ymm0[k], 0x72);
184
0
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xD8);
185
0
      ymm0[k] = _mm256_unpacklo_epi16(ymm0[k], ymm1[k]);
186
0
    }
187
    /* Store the result vectors */
188
0
    uint8_t* const dest_for_jth_element = dest + j;
189
0
    for (k = 0; k < 8; k++) {
190
0
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
191
0
    }
192
0
  }
193
0
}
194
195
/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
196
static void
197
shuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
198
0
               const int32_t vectorizable_elements, const int32_t total_elements) {
199
0
  static const int32_t bytesoftype = 16;
200
0
  int32_t j;
201
0
  int k, l;
202
0
  __m256i ymm0[16], ymm1[16];
203
204
  /* Create the shuffle mask.
205
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
206
     most to least significant (i.e., their order is reversed when compared to
207
     loading the mask from an array). */
208
0
  const __m256i shmask = _mm256_set_epi8(
209
0
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
210
0
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
211
0
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
212
0
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);
213
214
0
  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
215
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
216
0
    for (k = 0; k < 16; k++) {
217
0
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
218
0
    }
219
    /* Transpose bytes */
220
0
    for (k = 0, l = 0; k < 8; k++, l += 2) {
221
0
      ymm1[k * 2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l + 1]);
222
0
      ymm1[k * 2 + 1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l + 1]);
223
0
    }
224
    /* Transpose words */
225
0
    for (k = 0, l = -2; k < 8; k++, l++) {
226
0
      if ((k % 2) == 0) l += 2;
227
0
      ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 2]);
228
0
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 2]);
229
0
    }
230
    /* Transpose double words */
231
0
    for (k = 0, l = -4; k < 8; k++, l++) {
232
0
      if ((k % 4) == 0) l += 4;
233
0
      ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 4]);
234
0
      ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 4]);
235
0
    }
236
    /* Transpose quad words */
237
0
    for (k = 0; k < 8; k++) {
238
0
      ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 8]);
239
0
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 8]);
240
0
    }
241
0
    for (k = 0; k < 16; k++) {
242
0
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
243
0
      ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
244
0
    }
245
    /* Store the result vectors */
246
0
    uint8_t* const dest_for_jth_element = dest + j;
247
0
    for (k = 0; k < 16; k++) {
248
0
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
249
0
    }
250
0
  }
251
0
}
252
253
/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */
254
static void
255
shuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
256
0
                     const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) {
257
0
  int32_t j;
258
0
  int k, l;
259
0
  __m256i ymm0[16], ymm1[16];
260
261
0
  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));
262
0
  const int32_t vecs_rem = (int32_t)vecs_per_el.rem;
263
264
  /* Create the shuffle mask.
265
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
266
     most to least significant (i.e., their order is reversed when compared to
267
     loading the mask from an array). */
268
0
  const __m256i shmask = _mm256_set_epi8(
269
0
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
270
0
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
271
0
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
272
0
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);
273
274
0
  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
275
    /* Advance the offset into the type by the vector size (in bytes), unless this is
276
    the initial iteration and the type size is not a multiple of the vector size.
277
    In that case, only advance by the number of bytes necessary so that the number
278
    of remaining bytes in the type will be a multiple of the vector size. */
279
0
    int32_t offset_into_type;
280
0
    for (offset_into_type = 0; offset_into_type < bytesoftype;
281
0
         offset_into_type += (offset_into_type == 0 && vecs_rem > 0 ? vecs_rem : (int32_t)sizeof(__m128i))) {
282
283
      /* Fetch elements in groups of 512 bytes */
284
0
      const uint8_t* const src_with_offset = src + offset_into_type;
285
0
      for (k = 0; k < 16; k++) {
286
0
        ymm0[k] = _mm256_loadu2_m128i(
287
0
            (__m128i*)(src_with_offset + (j + (2 * k) + 1) * bytesoftype),
288
0
            (__m128i*)(src_with_offset + (j + (2 * k)) * bytesoftype));
289
0
      }
290
      /* Transpose bytes */
291
0
      for (k = 0, l = 0; k < 8; k++, l += 2) {
292
0
        ymm1[k * 2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l + 1]);
293
0
        ymm1[k * 2 + 1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l + 1]);
294
0
      }
295
      /* Transpose words */
296
0
      for (k = 0, l = -2; k < 8; k++, l++) {
297
0
        if ((k % 2) == 0) l += 2;
298
0
        ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 2]);
299
0
        ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 2]);
300
0
      }
301
      /* Transpose double words */
302
0
      for (k = 0, l = -4; k < 8; k++, l++) {
303
0
        if ((k % 4) == 0) l += 4;
304
0
        ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 4]);
305
0
        ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 4]);
306
0
      }
307
      /* Transpose quad words */
308
0
      for (k = 0; k < 8; k++) {
309
0
        ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 8]);
310
0
        ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 8]);
311
0
      }
312
0
      for (k = 0; k < 16; k++) {
313
0
        ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
314
0
        ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
315
0
      }
316
      /* Store the result vectors */
317
0
      uint8_t* const dest_for_jth_element = dest + j;
318
0
      for (k = 0; k < 16; k++) {
319
0
        _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), ymm0[k]);
320
0
      }
321
0
    }
322
0
  }
323
0
}
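To make the offset_into_type stepping in shuffle16_tiled_avx2 above concrete, here is a small worked example with an assumed (hypothetical) type size:

/* Assume bytesoftype = 20, which is not a multiple of sizeof(__m128i) = 16:
     lldiv(20, 16)  ->  quot = 1, rem = 4, so vecs_rem = 4
     offset_into_type visits 0, then 0 + 4 = 4 (the first step advances by
     vecs_rem), then 4 + 16 = 20, which terminates the loop.
   After the first step, 20 - 4 = 16 bytes of the type remain, a whole
   vector, exactly as the comment inside the routine requires. */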
324
325
/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
326
static void
327
unshuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
328
0
                const int32_t vectorizable_elements, const int32_t total_elements) {
329
0
  static const int32_t bytesoftype = 2;
330
0
  int32_t i;
331
0
  int j;
332
0
  __m256i ymm0[2], ymm1[2];
333
334
0
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
335
    /* Load 32 elements (64 bytes) into 2 YMM registers. */
336
0
    const uint8_t* const src_for_ith_element = src + i;
337
0
    for (j = 0; j < 2; j++) {
338
0
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
339
0
    }
340
    /* Shuffle bytes */
341
0
    for (j = 0; j < 2; j++) {
342
0
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
343
0
    }
344
    /* Compute the low 64 bytes */
345
0
    ymm1[0] = _mm256_unpacklo_epi8(ymm0[0], ymm0[1]);
346
    /* Compute the hi 64 bytes */
347
0
    ymm1[1] = _mm256_unpackhi_epi8(ymm0[0], ymm0[1]);
348
    /* Store the result vectors in proper order */
349
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
350
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[1]);
351
0
  }
352
0
}
353
354
/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
355
static void
356
unshuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
357
23
                const int32_t vectorizable_elements, const int32_t total_elements) {
358
23
  static const int32_t bytesoftype = 4;
359
23
  int32_t i;
360
23
  int j;
361
23
  __m256i ymm0[4], ymm1[4];
362
363
18.2k
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
364
    /* Load 32 elements (128 bytes) into 4 YMM registers. */
365
18.2k
    const uint8_t* const src_for_ith_element = src + i;
366
91.0k
    for (j = 0; j < 4; j++) {
367
72.8k
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
368
72.8k
    }
369
    /* Shuffle bytes */
370
54.6k
    for (j = 0; j < 2; j++) {
371
      /* Compute the low 64 bytes */
372
36.4k
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
373
      /* Compute the hi 64 bytes */
374
36.4k
      ymm1[2 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
375
36.4k
    }
376
    /* Shuffle 2-byte words */
377
54.6k
    for (j = 0; j < 2; j++) {
378
      /* Compute the low 64 bytes */
379
36.4k
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
380
      /* Compute the hi 64 bytes */
381
36.4k
      ymm0[2 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
382
36.4k
    }
383
18.2k
    ymm1[0] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x20);
384
18.2k
    ymm1[1] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x20);
385
18.2k
    ymm1[2] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x31);
386
18.2k
    ymm1[3] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x31);
387
388
    /* Store the result vectors in proper order */
389
91.0k
    for (j = 0; j < 4; j++) {
390
72.8k
      _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (j * sizeof(__m256i))), ymm1[j]);
391
72.8k
    }
392
18.2k
  }
393
23
}
394
395
/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
396
static void
397
unshuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
398
0
                const int32_t vectorizable_elements, const int32_t total_elements) {
399
0
  static const int32_t bytesoftype = 8;
400
0
  int32_t i;
401
0
  int j;
402
0
  __m256i ymm0[8], ymm1[8];
403
404
0
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
405
    /* Fetch 32 elements (256 bytes) into 8 YMM registers. */
406
0
    const uint8_t* const src_for_ith_element = src + i;
407
0
    for (j = 0; j < 8; j++) {
408
0
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
409
0
    }
410
    /* Shuffle bytes */
411
0
    for (j = 0; j < 4; j++) {
412
      /* Compute the low 32 bytes */
413
0
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
414
      /* Compute the hi 32 bytes */
415
0
      ymm1[4 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
416
0
    }
417
    /* Shuffle words */
418
0
    for (j = 0; j < 4; j++) {
419
      /* Compute the low 32 bytes */
420
0
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
421
      /* Compute the hi 32 bytes */
422
0
      ymm0[4 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
423
0
    }
424
0
    for (j = 0; j < 8; j++) {
425
0
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
426
0
    }
427
428
    /* Shuffle 4-byte dwords */
429
0
    for (j = 0; j < 4; j++) {
430
      /* Compute the low 32 bytes */
431
0
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
432
      /* Compute the hi 32 bytes */
433
0
      ymm1[4 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
434
0
    }
435
436
    /* Store the result vectors in proper order */
437
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
438
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[2]);
439
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[1]);
440
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[3]);
441
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[4]);
442
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[6]);
443
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[5]);
444
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
445
0
  }
446
0
}
447
448
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
449
static void
450
unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
451
0
                 const int32_t vectorizable_elements, const int32_t total_elements) {
452
0
  static const int32_t bytesoftype = 16;
453
0
  int32_t i;
454
0
  int j;
455
0
  __m256i ymm0[16], ymm1[16];
456
457
0
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
458
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
459
0
    const uint8_t* const src_for_ith_element = src + i;
460
0
    for (j = 0; j < 16; j++) {
461
0
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
462
0
    }
463
464
    /* Shuffle bytes */
465
0
    for (j = 0; j < 8; j++) {
466
      /* Compute the low 32 bytes */
467
0
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
468
      /* Compute the hi 32 bytes */
469
0
      ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
470
0
    }
471
    /* Shuffle 2-byte words */
472
0
    for (j = 0; j < 8; j++) {
473
      /* Compute the low 32 bytes */
474
0
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
475
      /* Compute the hi 32 bytes */
476
0
      ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
477
0
    }
478
    /* Shuffle 4-byte dwords */
479
0
    for (j = 0; j < 8; j++) {
480
      /* Compute the low 32 bytes */
481
0
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
482
      /* Compute the hi 32 bytes */
483
0
      ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
484
0
    }
485
486
    /* Shuffle 8-byte qwords */
487
0
    for (j = 0; j < 8; j++) {
488
      /* Compute the low 32 bytes */
489
0
      ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
490
      /* Compute the hi 32 bytes */
491
0
      ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
492
0
    }
493
494
0
    for (j = 0; j < 8; j++) {
495
0
      ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20);
496
0
      ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31);
497
0
    }
498
499
    /* Store the result vectors in proper order */
500
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
501
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]);
502
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]);
503
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]);
504
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]);
505
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]);
506
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]);
507
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
508
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]);
509
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]);
510
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]);
511
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]);
512
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]);
513
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]);
514
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]);
515
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]);
516
0
  }
517
0
}
518
519
/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
520
static void
521
unshuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
522
0
                       const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) {
523
0
  int32_t i;
524
0
  int j;
525
0
  __m256i ymm0[16], ymm1[16];
526
527
0
  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));
528
0
  const int32_t vecs_rem = (int32_t)vecs_per_el.rem;
529
530
  /* The unshuffle loops are inverted (compared to shuffle16_tiled_avx2)
531
     to optimize cache utilization. */
532
0
  int32_t offset_into_type;
533
0
  for (offset_into_type = 0; offset_into_type < bytesoftype;
534
0
       offset_into_type += (offset_into_type == 0 && vecs_rem > 0 ? vecs_rem : (int32_t)sizeof(__m128i))) {
535
0
    for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
536
      /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */
537
0
      const uint8_t* const src_for_ith_element = src + i;
538
0
      for (j = 0; j < 16; j++) {
539
0
        ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (total_elements * (offset_into_type + j))));
540
0
      }
541
542
      /* Shuffle bytes */
543
0
      for (j = 0; j < 8; j++) {
544
        /* Compute the low 32 bytes */
545
0
        ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
546
        /* Compute the hi 32 bytes */
547
0
        ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
548
0
      }
549
      /* Shuffle 2-byte words */
550
0
      for (j = 0; j < 8; j++) {
551
        /* Compute the low 32 bytes */
552
0
        ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
553
        /* Compute the hi 32 bytes */
554
0
        ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
555
0
      }
556
      /* Shuffle 4-byte dwords */
557
0
      for (j = 0; j < 8; j++) {
558
        /* Compute the low 32 bytes */
559
0
        ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
560
        /* Compute the hi 32 bytes */
561
0
        ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
562
0
      }
563
564
      /* Shuffle 8-byte qwords */
565
0
      for (j = 0; j < 8; j++) {
566
        /* Compute the low 32 bytes */
567
0
        ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
568
        /* Compute the hi 32 bytes */
569
0
        ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
570
0
      }
571
572
0
      for (j = 0; j < 8; j++) {
573
0
        ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20);
574
0
        ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31);
575
0
      }
576
577
      /* Store the result vectors in proper order */
578
0
      uint8_t* const dest_with_offset = dest + offset_into_type;
579
0
      _mm256_storeu2_m128i(
580
0
          (__m128i*)(dest_with_offset + (i + 0x01) * bytesoftype),
581
0
          (__m128i*)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]);
582
0
      _mm256_storeu2_m128i(
583
0
          (__m128i*)(dest_with_offset + (i + 0x03) * bytesoftype),
584
0
          (__m128i*)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]);
585
0
      _mm256_storeu2_m128i(
586
0
          (__m128i*)(dest_with_offset + (i + 0x05) * bytesoftype),
587
0
          (__m128i*)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]);
588
0
      _mm256_storeu2_m128i(
589
0
          (__m128i*)(dest_with_offset + (i + 0x07) * bytesoftype),
590
0
          (__m128i*)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]);
591
0
      _mm256_storeu2_m128i(
592
0
          (__m128i*)(dest_with_offset + (i + 0x09) * bytesoftype),
593
0
          (__m128i*)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]);
594
0
      _mm256_storeu2_m128i(
595
0
          (__m128i*)(dest_with_offset + (i + 0x0b) * bytesoftype),
596
0
          (__m128i*)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]);
597
0
      _mm256_storeu2_m128i(
598
0
          (__m128i*)(dest_with_offset + (i + 0x0d) * bytesoftype),
599
0
          (__m128i*)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]);
600
0
      _mm256_storeu2_m128i(
601
0
          (__m128i*)(dest_with_offset + (i + 0x0f) * bytesoftype),
602
0
          (__m128i*)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]);
603
0
      _mm256_storeu2_m128i(
604
0
          (__m128i*)(dest_with_offset + (i + 0x11) * bytesoftype),
605
0
          (__m128i*)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]);
606
0
      _mm256_storeu2_m128i(
607
0
          (__m128i*)(dest_with_offset + (i + 0x13) * bytesoftype),
608
0
          (__m128i*)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]);
609
0
      _mm256_storeu2_m128i(
610
0
          (__m128i*)(dest_with_offset + (i + 0x15) * bytesoftype),
611
0
          (__m128i*)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]);
612
0
      _mm256_storeu2_m128i(
613
0
          (__m128i*)(dest_with_offset + (i + 0x17) * bytesoftype),
614
0
          (__m128i*)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]);
615
0
      _mm256_storeu2_m128i(
616
0
          (__m128i*)(dest_with_offset + (i + 0x19) * bytesoftype),
617
0
          (__m128i*)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]);
618
0
      _mm256_storeu2_m128i(
619
0
          (__m128i*)(dest_with_offset + (i + 0x1b) * bytesoftype),
620
0
          (__m128i*)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]);
621
0
      _mm256_storeu2_m128i(
622
0
          (__m128i*)(dest_with_offset + (i + 0x1d) * bytesoftype),
623
0
          (__m128i*)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]);
624
0
      _mm256_storeu2_m128i(
625
0
          (__m128i*)(dest_with_offset + (i + 0x1f) * bytesoftype),
626
0
          (__m128i*)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]);
627
0
    }
628
0
  }
629
0
}
630
631
/* Shuffle a block.  This can never fail. */
632
void
633
shuffle_avx2(const int32_t bytesoftype, const int32_t blocksize,
634
0
             const uint8_t *_src, uint8_t *_dest) {
635
0
  const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m256i);
636
637
  /* If the block size is too small to be vectorized,
638
     use the generic implementation. */
639
0
  if (blocksize < vectorized_chunk_size) {
640
0
    shuffle_generic(bytesoftype, blocksize, _src, _dest);
641
0
    return;
642
0
  }
643
644
  /* If the blocksize is not a multiple of both the typesize and
645
     the vector size, round the blocksize down to the largest value
646
     which is a multiple of both. The vectorized shuffle can be
647
     used for that portion of the data, and the naive implementation
648
     can be used for the remaining portion. */
649
0
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
650
651
0
  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
652
0
  const int32_t total_elements = blocksize / bytesoftype;
653
654
  /* Optimized shuffle implementations */
655
0
  switch (bytesoftype) {
656
0
    case 2:
657
0
      shuffle2_avx2(_dest, _src, vectorizable_elements, total_elements);
658
0
      break;
659
0
    case 4:
660
0
      shuffle4_avx2(_dest, _src, vectorizable_elements, total_elements);
661
0
      break;
662
0
    case 8:
663
0
      shuffle8_avx2(_dest, _src, vectorizable_elements, total_elements);
664
0
      break;
665
0
    case 16:
666
0
      shuffle16_avx2(_dest, _src, vectorizable_elements, total_elements);
667
0
      break;
668
0
    default:
669
      /* For types larger than 16 bytes, use the AVX2 tiled shuffle. */
670
0
      if (bytesoftype > (int32_t)sizeof(__m128i)) {
671
0
        shuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
672
0
      }
673
0
      else {
674
        /* Non-optimized shuffle */
675
0
        shuffle_generic(bytesoftype, blocksize, _src, _dest);
676
        /* The non-optimized function covers the whole buffer,
677
           so we're done processing here. */
678
0
        return;
679
0
      }
680
0
  }
681
682
  /* If the buffer had any bytes at the end which couldn't be handled
683
     by the vectorized implementations, use the non-optimized version
684
     to finish them up. */
685
0
  if (vectorizable_bytes < blocksize) {
686
0
    shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
687
0
  }
688
0
}
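A brief worked example of the rounding above, using illustrative values that are not taken from this report:

/* Illustrative numbers for the rounding in shuffle_avx2():
     bytesoftype           = 4
     vectorized_chunk_size = 4 * sizeof(__m256i) = 4 * 32      = 128
     blocksize             = 1000
     vectorizable_bytes    = 1000 - (1000 % 128)               = 896
     vectorizable_elements = 896 / 4                           = 224
     total_elements        = 1000 / 4                          = 250
   The AVX2 kernel shuffles the first 896 bytes; shuffle_generic_inline()
   finishes the remaining 104 bytes. */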
689
690
/* Unshuffle a block.  This can never fail. */
691
void
692
unshuffle_avx2(const int32_t bytesoftype, const int32_t blocksize,
693
23
               const uint8_t *_src, uint8_t *_dest) {
694
23
  const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m256i);
695
696
  /* If the block size is too small to be vectorized,
697
     use the generic implementation. */
698
23
  if (blocksize < vectorized_chunk_size) {
699
0
    unshuffle_generic(bytesoftype, blocksize, _src, _dest);
700
0
    return;
701
0
  }
702
703
  /* If the blocksize is not a multiple of both the typesize and
704
     the vector size, round the blocksize down to the next value
705
     which is a multiple of both. The vectorized unshuffle can be
706
     used for that portion of the data, and the naive implementation
707
     can be used for the remaining portion. */
708
23
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
709
710
23
  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
711
23
  const int32_t total_elements = blocksize / bytesoftype;
712
713
  /* Optimized unshuffle implementations */
714
23
  switch (bytesoftype) {
715
0
    case 2:
716
0
      unshuffle2_avx2(_dest, _src, vectorizable_elements, total_elements);
717
0
      break;
718
23
    case 4:
719
23
      unshuffle4_avx2(_dest, _src, vectorizable_elements, total_elements);
720
23
      break;
721
0
    case 8:
722
0
      unshuffle8_avx2(_dest, _src, vectorizable_elements, total_elements);
723
0
      break;
724
0
    case 16:
725
0
      unshuffle16_avx2(_dest, _src, vectorizable_elements, total_elements);
726
0
      break;
727
0
    default:
728
      /* For types larger than 16 bytes, use the AVX2 tiled unshuffle. */
729
0
      if (bytesoftype > (int32_t)sizeof(__m128i)) {
730
0
        unshuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
731
0
      }
732
0
      else {
733
        /* Non-optimized unshuffle */
734
0
        unshuffle_generic(bytesoftype, blocksize, _src, _dest);
735
        /* The non-optimized function covers the whole buffer,
736
           so we're done processing here. */
737
0
        return;
738
0
      }
739
23
  }
740
741
  /* If the buffer had any bytes at the end which couldn't be handled
742
     by the vectorized implementations, use the non-optimized version
743
     to finish them up. */
744
23
  if (vectorizable_bytes < blocksize) {
745
11
    unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
746
11
  }
747
23
}
748
749
const bool is_shuffle_avx2 = true;
750
751
#else
752
753
const bool is_shuffle_avx2 = false;
754
755
void shuffle_avx2(const int32_t bytesoftype, const int32_t blocksize,
756
                  const uint8_t *_src, uint8_t *_dest) {
757
  abort();
758
}
759
760
void unshuffle_avx2(const int32_t bytesoftype, const int32_t blocksize,
761
                    const uint8_t *_src, uint8_t *_dest) {
762
  abort();
763
}
764
765
#endif /* defined(__AVX2__) */
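A hypothetical round-trip check for the two entry points above (a sketch, not part of the library's test suite): it assumes the translation unit is built with AVX2 enabled, that the CPU supports AVX2, and that shuffle-avx2.h declares shuffle_avx2() and unshuffle_avx2(); real callers normally go through blosc2's runtime dispatcher instead of calling these directly.

#include <stdint.h>
#include <string.h>

#include "shuffle-avx2.h"  /* shuffle_avx2() / unshuffle_avx2() */

/* Shuffle a block of 4-byte elements and undo it; returns 1 when the
   round trip reproduces the input, 0 otherwise. */
int avx2_shuffle_roundtrip_example(void) {
  enum { TYPESIZE = 4, BLOCKSIZE = 4096 };  /* 4096 is a multiple of 4 * 32 */
  static uint8_t src[BLOCKSIZE], shuffled[BLOCKSIZE], restored[BLOCKSIZE];

  for (int i = 0; i < BLOCKSIZE; i++)
    src[i] = (uint8_t)(i * 7 + 1);

  shuffle_avx2(TYPESIZE, BLOCKSIZE, src, shuffled);
  unshuffle_avx2(TYPESIZE, BLOCKSIZE, shuffled, restored);
  return memcmp(src, restored, BLOCKSIZE) == 0;
}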