Coverage Report

Created: 2025-07-11 06:49

/src/c-blosc2/blosc/shuffle-avx2.c
Source listing follows (per-line execution counts omitted: every instrumented line in this file reported a count of 0, i.e. none of the AVX2 shuffle/unshuffle routines were exercised in this run).
/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
  https://blosc.org
  License: BSD 3-Clause (see LICENSE.txt)

  See LICENSE.txt for details about copyright and rights to use.
**********************************************************************/

#include "shuffle-avx2.h"
#include "shuffle-generic.h"
#include <stdlib.h>

/* Make sure AVX2 is available for the compilation target and compiler. */
#if defined(__AVX2__)

#include <immintrin.h>

#include <stdint.h>

/* The next is useful for debugging purposes */
#if 0
#include <stdio.h>
#include <string.h>

static void printymm32(__m256i ymm0)
{
  uint32_t buf[8];

  ((__m256i *)buf)[0] = ymm0;
  fprintf(stderr, "%x,%x,%x,%x,%x,%x,%x,%x\n",
          buf[0], buf[1], buf[2], buf[3],
          buf[4], buf[5], buf[6], buf[7]);
}

static void printymm(__m256i ymm0)
{
  uint8_t buf[32];

  ((__m256i *)buf)[0] = ymm0;
  fprintf(stderr, "%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
          buf[0], buf[1], buf[2], buf[3],
          buf[4], buf[5], buf[6], buf[7],
          buf[8], buf[9], buf[10], buf[11],
          buf[12], buf[13], buf[14], buf[15],
          buf[16], buf[17], buf[18], buf[19],
          buf[20], buf[21], buf[22], buf[23],
          buf[24], buf[25], buf[26], buf[27],
          buf[28], buf[29], buf[30], buf[31]);
}
#endif

/* GCC doesn't include the split load/store intrinsics
   needed for the tiled shuffle, so define them here. */
#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC)
static inline __m256i
__attribute__((__always_inline__))
_mm256_loadu2_m128i(const __m128i* const hiaddr, const __m128i* const loaddr) {
  return _mm256_inserti128_si256(
      _mm256_castsi128_si256(_mm_loadu_si128(loaddr)), _mm_loadu_si128(hiaddr), 1);
}

static inline void
__attribute__((__always_inline__))
_mm256_storeu2_m128i(__m128i* const hiaddr, __m128i* const loaddr, const __m256i a) {
  _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
  _mm_storeu_si128(hiaddr, _mm256_extracti128_si256(a, 1));
}
#endif  /* defined(__GNUC__) */
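For reference, the two helpers above only combine or split the 128-bit halves of a YMM register at two independent (possibly non-adjacent) addresses. A minimal standalone sketch of that behaviour, expanding the same intrinsics the helper bodies use (compile with AVX2 enabled, e.g. -mavx2; the buffer contents are arbitrary illustration):

#include <immintrin.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void) {
  uint8_t lo[16], hi[16], out[32];
  for (int i = 0; i < 16; i++) { lo[i] = (uint8_t)i; hi[i] = (uint8_t)(0x80 + i); }

  /* Equivalent of _mm256_loadu2_m128i(hi, lo): low lane <- lo, high lane <- hi. */
  __m256i v = _mm256_inserti128_si256(
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)lo)),
      _mm_loadu_si128((const __m128i*)hi), 1);
  _mm256_storeu_si256((__m256i*)out, v);
  printf("out[0]=%#x out[16]=%#x\n", out[0], out[16]);  /* prints 0 and 0x80 */

  /* Equivalent of _mm256_storeu2_m128i(hi2, lo2, v): split the lanes back out. */
  uint8_t lo2[16], hi2[16];
  _mm_storeu_si128((__m128i*)lo2, _mm256_castsi256_si128(v));
  _mm_storeu_si128((__m128i*)hi2, _mm256_extracti128_si256(v, 1));
  printf("round trip ok: %d\n", memcmp(lo, lo2, 16) == 0 && memcmp(hi, hi2, 16) == 0);
  return 0;
}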
/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void
shuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
              const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 2;
  int32_t j;
  int k;
  __m256i ymm0[2], ymm1[2];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
      0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
      0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00,
      0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
      0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00);

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (64 bytes) then transpose bytes, words and double words. */
    for (k = 0; k < 2; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
      ymm1[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
    }

    ymm0[0] = _mm256_permute4x64_epi64(ymm1[0], 0xd8);
    ymm0[1] = _mm256_permute4x64_epi64(ymm1[1], 0x8d);

    ymm1[0] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0xf0);
    ymm0[1] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0x0f);
    ymm1[1] = _mm256_permute4x64_epi64(ymm0[1], 0x4e);

    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 2; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm1[k]);
    }
  }
}
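The NOTE about argument ordering can be checked directly: the mask built with _mm256_set_epi8 above is bit-for-bit identical to loading the same 32 bytes from an array written in memory (ascending) order. A standalone sketch, assuming only AVX2:

#include <immintrin.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void) {
  /* Same mask as in shuffle2_avx2, given as 'set' arguments (most significant first). */
  const __m256i set_mask = _mm256_set_epi8(
      0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
      0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00,
      0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
      0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00);

  /* The same 32 bytes in memory order (least significant byte first). */
  const uint8_t mem_mask[32] = {
      0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
      0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f,
      0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
      0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f};
  const __m256i loaded_mask = _mm256_loadu_si256((const __m256i*)mem_mask);

  uint8_t a[32], b[32];
  _mm256_storeu_si256((__m256i*)a, set_mask);
  _mm256_storeu_si256((__m256i*)b, loaded_mask);
  printf("masks identical: %d\n", memcmp(a, b, 32) == 0);  /* prints 1 */
  return 0;
}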
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void
shuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
              const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 4;
  int32_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i mask = _mm256_set_epi32(
      0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (128 bytes) then transpose bytes and words. */
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i))));
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8);
      ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d);
      ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]);
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x04e);
      ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]);
    }
    /* Transpose double words */
    for (j = 0; j < 2; j++) {
      ymm1[j * 2] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
      ymm1[j * 2 + 1] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
    }
    /* Transpose quad words */
    for (j = 0; j < 2; j++) {
      ymm0[j * 2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j + 2]);
      ymm0[j * 2 + 1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j + 2]);
    }
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_ith_element = dest + i;
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]);
    }
  }
}
/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
static void
shuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
              const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 8;
  int32_t j;
  int k, l;
  __m256i ymm0[8], ymm1[8];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (256 bytes) then transpose bytes. */
    for (k = 0; k < 8; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
      ymm1[k] = _mm256_shuffle_epi32(ymm0[k], 0x4e);
      ymm1[k] = _mm256_unpacklo_epi8(ymm0[k], ymm1[k]);
    }
    /* Transpose words */
    for (k = 0, l = 0; k < 4; k++, l += 2) {
      ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 1]);
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 1]);
    }
    /* Transpose double words */
    for (k = 0, l = 0; k < 4; k++, l++) {
      if (k == 2) l += 2;
      ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 2]);
      ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 2]);
    }
    /* Transpose quad words */
    for (k = 0; k < 4; k++) {
      ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 4]);
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 4]);
    }
    for (k = 0; k < 8; k++) {
      ymm1[k] = _mm256_permute4x64_epi64(ymm0[k], 0x72);
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xD8);
      ymm0[k] = _mm256_unpacklo_epi16(ymm0[k], ymm1[k]);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 8; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
    }
  }
}
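Conceptually, shuffle2/4/8/16_avx2 are SIMD implementations of the same byte transposition that the generic fallback in shuffle-generic.h performs one byte at a time: byte k of element i ends up at position k * total_elements + i of the output. A scalar sketch of that mapping (illustrative only; shuffle_scalar_ref is a hypothetical name, not the library's shuffle_generic):

#include <stdio.h>
#include <stdint.h>

/* Scalar reference: gather byte k of every element into the k-th contiguous
   stream of the output. */
static void shuffle_scalar_ref(int32_t bytesoftype, int32_t nelems,
                               const uint8_t *src, uint8_t *dest) {
  for (int32_t i = 0; i < nelems; i++)
    for (int32_t k = 0; k < bytesoftype; k++)
      dest[k * nelems + i] = src[i * bytesoftype + k];
}

int main(void) {
  /* Four 8-byte elements: after shuffling, the four lowest bytes come first. */
  uint8_t src[32], dest[32];
  for (int i = 0; i < 32; i++) src[i] = (uint8_t)i;
  shuffle_scalar_ref(8, 4, src, dest);
  printf("%d %d %d %d\n", dest[0], dest[1], dest[2], dest[3]);  /* 0 8 16 24 */
  return 0;
}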
/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
static void
shuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
               const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 16;
  int32_t j;
  int k, l;
  __m256i ymm0[16], ymm1[16];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    for (k = 0; k < 16; k++) {
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
    }
    /* Transpose bytes */
    for (k = 0, l = 0; k < 8; k++, l += 2) {
      ymm1[k * 2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l + 1]);
      ymm1[k * 2 + 1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l + 1]);
    }
    /* Transpose words */
    for (k = 0, l = -2; k < 8; k++, l++) {
      if ((k % 2) == 0) l += 2;
      ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 2]);
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 2]);
    }
    /* Transpose double words */
    for (k = 0, l = -4; k < 8; k++, l++) {
      if ((k % 4) == 0) l += 4;
      ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 4]);
      ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 4]);
    }
    /* Transpose quad words */
    for (k = 0; k < 8; k++) {
      ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 8]);
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 8]);
    }
    for (k = 0; k < 16; k++) {
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
      ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_jth_element = dest + j;
    for (k = 0; k < 16; k++) {
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
    }
  }
}
/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */
static void
shuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
                     const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) {
  int32_t j;
  int k, l;
  __m256i ymm0[16], ymm1[16];

  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));
  const int32_t vecs_rem = (int32_t)vecs_per_el.rem;

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i shmask = _mm256_set_epi8(
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);

  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
    /* Advance the offset into the type by the vector size (in bytes), unless this is
       the initial iteration and the type size is not a multiple of the vector size.
       In that case, only advance by the number of bytes necessary so that the number
       of remaining bytes in the type will be a multiple of the vector size. */
    int32_t offset_into_type;
    for (offset_into_type = 0; offset_into_type < bytesoftype;
         offset_into_type += (offset_into_type == 0 && vecs_rem > 0 ? vecs_rem : (int32_t)sizeof(__m128i))) {

      /* Fetch elements in groups of 512 bytes */
      const uint8_t* const src_with_offset = src + offset_into_type;
      for (k = 0; k < 16; k++) {
        ymm0[k] = _mm256_loadu2_m128i(
            (__m128i*)(src_with_offset + (j + (2 * k) + 1) * bytesoftype),
            (__m128i*)(src_with_offset + (j + (2 * k)) * bytesoftype));
      }
      /* Transpose bytes */
      for (k = 0, l = 0; k < 8; k++, l += 2) {
        ymm1[k * 2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l + 1]);
        ymm1[k * 2 + 1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l + 1]);
      }
      /* Transpose words */
      for (k = 0, l = -2; k < 8; k++, l++) {
        if ((k % 2) == 0) l += 2;
        ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 2]);
        ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 2]);
      }
      /* Transpose double words */
      for (k = 0, l = -4; k < 8; k++, l++) {
        if ((k % 4) == 0) l += 4;
        ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 4]);
        ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 4]);
      }
      /* Transpose quad words */
      for (k = 0; k < 8; k++) {
        ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 8]);
        ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 8]);
      }
      for (k = 0; k < 16; k++) {
        ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
        ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
      }
      /* Store the result vectors */
      uint8_t* const dest_for_jth_element = dest + j;
      for (k = 0; k < 16; k++) {
        _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), ymm0[k]);
      }
    }
  }
}
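The offset_into_type stepping described in the comment above is easiest to see with a concrete type size. For example (illustrative numbers only), bytesoftype = 24 gives a remainder of 8, so the tile offsets visited are 0 and then 8, each covering a full 16-byte lane; bytesoftype = 32 simply gives 0 and 16. A standalone sketch of just that loop arithmetic:

#include <stdio.h>
#include <stdint.h>

int main(void) {
  const int32_t typesizes[] = {24, 32, 40};
  for (int t = 0; t < 3; t++) {
    const int32_t bytesoftype = typesizes[t];
    const int32_t vecs_rem = bytesoftype % 16;  /* same remainder as lldiv(...) in the file */
    printf("bytesoftype=%d offsets:", (int)bytesoftype);
    for (int32_t off = 0; off < bytesoftype;
         off += (off == 0 && vecs_rem > 0 ? vecs_rem : 16))
      printf(" %d", (int)off);   /* each offset covers type bytes [off, off + 16) */
    printf("\n");
  }
  /* Prints: "0 8", "0 16", and "0 8 24" respectively. */
  return 0;
}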
/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
static void
unshuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
                const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 2;
  int32_t i;
  int j;
  __m256i ymm0[2], ymm1[2];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Load 32 elements (64 bytes) into 2 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 2; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
    }
    /* Compute the low 64 bytes */
    ymm1[0] = _mm256_unpacklo_epi8(ymm0[0], ymm0[1]);
    /* Compute the hi 64 bytes */
    ymm1[1] = _mm256_unpackhi_epi8(ymm0[0], ymm0[1]);
    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[1]);
  }
}
/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
static void
unshuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
                const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 4;
  int32_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Load 32 elements (128 bytes) into 4 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      /* Compute the low 64 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 64 bytes */
      ymm1[2 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 2; j++) {
      /* Compute the low 64 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
      /* Compute the hi 64 bytes */
      ymm0[2 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
    }
    ymm1[0] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x20);
    ymm1[1] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x20);
    ymm1[2] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x31);
    ymm1[3] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x31);

    /* Store the result vectors in proper order */
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (j * sizeof(__m256i))), ymm1[j]);
    }
  }
}
/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
static void
unshuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
                const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 8;
  int32_t i;
  int j;
  __m256i ymm0[8], ymm1[8];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (256 bytes) into 8 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 8; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm1[4 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
    }
    /* Shuffle words */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm0[4 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
    }
    for (j = 0; j < 8; j++) {
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
    }

    /* Shuffle 4-byte dwords */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm1[4 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
    }

    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
  }
}
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
static void
unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
                 const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 16;
  int32_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 16; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }

    /* Shuffle bytes */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
    }

    /* Shuffle 8-byte qwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
    }

    for (j = 0; j < 8; j++) {
      ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20);
      ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31);
    }

    /* Store the result vectors in proper order */
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]);
  }
}
/* Routine optimized for unshuffling a buffer for a type size of 12 bytes.
 * Based on the 16-byte implementation. */
static void
unshuffle12_avx2(uint8_t* const dest, const uint8_t* const src,
    const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 12;
  int32_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  __m256i permute = _mm256_set_epi32(0, 0, 6, 5, 4, 2, 1, 0);
  __m256i store_mask = _mm256_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
  int32_t jump = 2 * bytesoftype;
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (384 bytes) into 12 YMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < bytesoftype; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Initialize the last 4 registers (128 bytes) to zero */
    for (j = bytesoftype; j < 16; j++) {
      ymm0[j] = _mm256_setzero_si256();
    }

    /* Shuffle bytes */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
    }

    /* Shuffle 8-byte qwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
    }

    for (j = 0; j < 8; j++) {
      ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20);
      ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31);
      ymm1[j] = _mm256_permutevar8x32_epi32(ymm1[j], permute);
      ymm1[j + 8] = _mm256_permutevar8x32_epi32(ymm1[j + 8], permute);
    }
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * jump)), ymm1[0]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * jump)), ymm1[4]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * jump)), ymm1[2]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * jump)), ymm1[6]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * jump)), ymm1[1]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * jump)), ymm1[5]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * jump)), ymm1[3]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * jump)), ymm1[7]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * jump)), ymm1[8]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * jump)), ymm1[12]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * jump)), ymm1[10]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * jump)), ymm1[14]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * jump)), ymm1[9]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * jump)), ymm1[13]);
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * jump)), ymm1[11]);
    _mm256_maskstore_epi32((int *)(dest + (i * bytesoftype) + (15 * jump)), store_mask, ymm1[15]);
  }
}
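The final _mm256_maskstore_epi32 above writes only the dword lanes whose mask element has its most significant bit set, here the low six lanes (24 bytes, i.e. two 12-byte elements), so the last store cannot run past the end of the unshuffled block. A standalone sketch of that masked-store behaviour (compile with AVX2 enabled):

#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>

int main(void) {
  int32_t dst[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
  const __m256i data = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  /* Low six dword lanes enabled (sign bit set), top two disabled: 24 bytes written. */
  const __m256i store_mask = _mm256_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF,
                                              0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
  _mm256_maskstore_epi32((int *)dst, store_mask, data);
  for (int i = 0; i < 8; i++) printf("%d ", (int)dst[i]);  /* 0 1 2 3 4 5 -1 -1 */
  printf("\n");
  return 0;
}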

/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
static void
unshuffle16_tiled_avx2(const uint8_t *dest, const uint8_t* const src,
                       const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) {
  int32_t i;
  int j;
  __m256i ymm0[16], ymm1[16];

  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));
  const int32_t vecs_rem = (int32_t)vecs_per_el.rem;

  /* The unshuffle loops are inverted (compared to shuffle16_tiled_avx2)
     to optimize cache utilization. */
  int32_t offset_into_type;
  for (offset_into_type = 0; offset_into_type < bytesoftype;
       offset_into_type += (offset_into_type == 0 && vecs_rem > 0 ? vecs_rem : (int32_t)sizeof(__m128i))) {
    for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
      /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */
      const uint8_t* const src_for_ith_element = src + i;
      for (j = 0; j < 16; j++) {
        ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (total_elements * (offset_into_type + j))));
      }

      /* Shuffle bytes */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
        /* Compute the hi 32 bytes */
        ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
      }
      /* Shuffle 2-byte words */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
        /* Compute the hi 32 bytes */
        ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
      }
      /* Shuffle 4-byte dwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
        /* Compute the hi 32 bytes */
        ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
      }

      /* Shuffle 8-byte qwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
        /* Compute the hi 32 bytes */
        ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
      }

      for (j = 0; j < 8; j++) {
        ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20);
        ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31);
      }

      /* Store the result vectors in proper order */
      const uint8_t* const dest_with_offset = dest + offset_into_type;
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x01) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x03) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x05) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x07) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x09) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x0b) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x0d) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x0f) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x11) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x13) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x15) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x17) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x19) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x1b) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x1d) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]);
      _mm256_storeu2_m128i(
          (__m128i*)(dest_with_offset + (i + 0x1f) * bytesoftype),
          (__m128i*)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]);
    }
  }
}
/* Shuffle a block.  This can never fail. */
void
shuffle_avx2(const int32_t bytesoftype, const int32_t blocksize,
             const uint8_t *_src, uint8_t *_dest) {
  const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m256i);

  /* If the block size is too small to be vectorized,
     use the generic implementation. */
  if (blocksize < vectorized_chunk_size) {
    shuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* If the blocksize is not a multiple of both the typesize and
     the vector size, round the blocksize down to the next value
     which is a multiple of both. The vectorized shuffle can be
     used for that portion of the data, and the naive implementation
     can be used for the remaining portion. */
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);

  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const int32_t total_elements = blocksize / bytesoftype;

  /* Optimized shuffle implementations */
  switch (bytesoftype) {
    case 2:
      shuffle2_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 4:
      shuffle4_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 8:
      shuffle8_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 16:
      shuffle16_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    default:
      /* For types larger than 16 bytes, use the AVX2 tiled shuffle. */
      if (bytesoftype > (int32_t)sizeof(__m128i)) {
        shuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
      }
      else {
        /* Non-optimized shuffle */
        shuffle_generic(bytesoftype, blocksize, _src, _dest);
        /* The non-optimized function covers the whole buffer,
           so we're done processing here. */
        return;
      }
  }

  /* If the buffer had any bytes at the end which couldn't be handled
     by the vectorized implementations, use the non-optimized version
     to finish them up. */
  if (vectorizable_bytes < blocksize) {
    shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}
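The split between the vectorized and the generic path above is pure integer arithmetic; for instance (hypothetical sizes), bytesoftype = 4 and blocksize = 1000 give a vectorized_chunk_size of 128, so 896 bytes (224 elements) go through shuffle4_avx2 and the trailing 104 bytes fall through to shuffle_generic_inline. A sketch of just that bookkeeping:

#include <stdio.h>
#include <stdint.h>

int main(void) {
  const int32_t bytesoftype = 4, blocksize = 1000;        /* hypothetical sizes */
  const int32_t vectorized_chunk_size = bytesoftype * 32; /* 32 == sizeof(__m256i) */
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const int32_t total_elements = blocksize / bytesoftype;
  printf("chunk=%d vec_bytes=%d vec_elems=%d total_elems=%d leftover=%d\n",
         (int)vectorized_chunk_size, (int)vectorizable_bytes, (int)vectorizable_elements,
         (int)total_elements, (int)(blocksize - vectorizable_bytes));
  /* Prints: chunk=128 vec_bytes=896 vec_elems=224 total_elems=250 leftover=104 */
  return 0;
}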
/* Unshuffle a block.  This can never fail. */
void
unshuffle_avx2(const int32_t bytesoftype, const int32_t blocksize,
               const uint8_t *_src, uint8_t *_dest) {
  const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m256i);

  /* If the block size is too small to be vectorized,
     use the generic implementation. */
  if (blocksize < vectorized_chunk_size) {
    unshuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* If the blocksize is not a multiple of both the typesize and
     the vector size, round the blocksize down to the next value
     which is a multiple of both. The vectorized unshuffle can be
     used for that portion of the data, and the naive implementation
     can be used for the remaining portion. */
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);

  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const int32_t total_elements = blocksize / bytesoftype;

  /* Optimized unshuffle implementations */
  switch (bytesoftype) {
    case 2:
      unshuffle2_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 4:
      unshuffle4_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 8:
      unshuffle8_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 12:
      unshuffle12_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 16:
      unshuffle16_avx2(_dest, _src, vectorizable_elements, total_elements);
      break;
    default:
      /* For types larger than 16 bytes, use the AVX2 tiled unshuffle. */
      if (bytesoftype > (int32_t)sizeof(__m128i)) {
        unshuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
      }
      else {
        /* Non-optimized unshuffle */
        unshuffle_generic(bytesoftype, blocksize, _src, _dest);
        /* The non-optimized function covers the whole buffer,
           so we're done processing here. */
        return;
      }
  }

  /* If the buffer had any bytes at the end which couldn't be handled
     by the vectorized implementations, use the non-optimized version
     to finish them up. */
  if (vectorizable_bytes < blocksize) {
    unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}
const bool is_shuffle_avx2 = true;

#else

const bool is_shuffle_avx2 = false;

void shuffle_avx2(const int32_t bytesoftype, const int32_t blocksize,
                  const uint8_t *_src, uint8_t *_dest) {
  abort();
}

void unshuffle_avx2(const int32_t bytesoftype, const int32_t blocksize,
                    const uint8_t *_src, uint8_t *_dest) {
  abort();
}

#endif /* defined(__AVX2__) */
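None of the lines above were executed in this run. A minimal round-trip harness along the following lines would exercise most of them, assuming it is compiled with AVX2 enabled and linked against the blosc2 shuffle sources so that the shuffle_avx2()/unshuffle_avx2() declarations from shuffle-avx2.h are available (the block size and type sizes below are illustrative choices, not values required by the library):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#include "shuffle-avx2.h"  /* assumed to declare shuffle_avx2() / unshuffle_avx2() */

int main(void) {
  /* Type sizes chosen to hit the 2/4/8/12/16-byte kernels plus the tiled (>16) path.
     4800 is a common multiple of all of them, so no partial elements are involved. */
  const int32_t typesizes[] = {2, 4, 8, 12, 16, 24};
  const int32_t blocksize = 4800;
  uint8_t *src = malloc(blocksize), *tmp = malloc(blocksize), *out = malloc(blocksize);
  if (!src || !tmp || !out) return 1;
  for (int32_t i = 0; i < blocksize; i++) src[i] = (uint8_t)(i * 7 + 3);

  for (size_t t = 0; t < sizeof(typesizes) / sizeof(typesizes[0]); t++) {
    shuffle_avx2(typesizes[t], blocksize, src, tmp);
    unshuffle_avx2(typesizes[t], blocksize, tmp, out);
    printf("typesize %2d: round trip %s\n", (int)typesizes[t],
           memcmp(src, out, blocksize) == 0 ? "ok" : "FAILED");
  }
  free(src); free(tmp); free(out);
  return 0;
}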