Coverage Report

Created: 2026-02-22 06:30

/src/c-blosc2/blosc/shuffle-avx2.c
Line
Count
Source
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
#include "shuffle-avx2.h"
12
#include "shuffle-generic.h"
13
#include <stdlib.h>
14
15
/* Make sure AVX2 is available for the compilation target and compiler. */
16
#if defined(__AVX2__)
17
18
#include <immintrin.h>
19
20
#include <stdint.h>
21
22
/* The following is useful for debugging purposes. */
23
#if 0
24
#include <stdio.h>
25
#include <string.h>
26
27
static void printymm32(__m256i ymm0)
28
{
29
  uint32_t buf[8];
30
31
  ((__m256i *)buf)[0] = ymm0;
32
  fprintf(stderr, "%x,%x,%x,%x,%x,%x,%x,%x\n",
33
          buf[0], buf[1], buf[2], buf[3],
34
          buf[4], buf[5], buf[6], buf[7]);
35
}
36
37
static void printymm(__m256i ymm0)
38
{
39
  uint8_t buf[32];
40
41
  ((__m256i *)buf)[0] = ymm0;
42
  fprintf(stderr, "%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
43
          buf[0], buf[1], buf[2], buf[3],
44
          buf[4], buf[5], buf[6], buf[7],
45
          buf[8], buf[9], buf[10], buf[11],
46
          buf[12], buf[13], buf[14], buf[15],
47
          buf[16], buf[17], buf[18], buf[19],
48
          buf[20], buf[21], buf[22], buf[23],
49
          buf[24], buf[25], buf[26], buf[27],
50
          buf[28], buf[29], buf[30], buf[31]);
51
}
52
#endif
53
54
/* GCC doesn't include the split load/store intrinsics
55
   needed for the tiled shuffle, so define them here. */
56
#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC)
57
static inline __m256i
58
__attribute__((__always_inline__))
59
_mm256_loadu2_m128i(const __m128i* const hiaddr, const __m128i* const loaddr) {
60
  return _mm256_inserti128_si256(
61
      _mm256_castsi128_si256(_mm_loadu_si128(loaddr)), _mm_loadu_si128(hiaddr), 1);
62
}
63
64
static inline void
65
__attribute__((__always_inline__))
66
_mm256_storeu2_m128i(__m128i* const hiaddr, __m128i* const loaddr, const __m256i a) {
67
  _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
68
  _mm_storeu_si128(hiaddr, _mm256_extracti128_si256(a, 1));
69
}
70
#endif  /* defined(__GNUC__) */
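
A quick way to sanity-check the two helpers above is to round-trip a 32-byte buffer split into unaligned 16-byte halves; a minimal sketch (the function and buffer names are illustrative, not part of the library):

#include <immintrin.h>
#include <stdint.h>

/* Round-trip two unaligned 16-byte halves through the split helpers.
   After the call, dst[0..31] equals src[0..31]. */
static void split_roundtrip(const uint8_t* src, uint8_t* dst) {
  /* bytes 16..31 go to the high lane, bytes 0..15 to the low lane */
  __m256i v = _mm256_loadu2_m128i((const __m128i*)(src + 16),
                                  (const __m128i*)src);
  /* store the lanes back in the same split layout */
  _mm256_storeu2_m128i((__m128i*)(dst + 16), (__m128i*)dst, v);
}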
71
72
/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
73
static void
74
shuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
75
0
              const int32_t vectorizable_elements, const int32_t total_elements) {
76
0
  static const int32_t bytesoftype = 2;
77
0
  int32_t j;
78
0
  int k;
79
0
  __m256i ymm0[2], ymm1[2];
80
81
  /* Create the shuffle mask.
82
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
83
     most to least significant (i.e., their order is reversed when compared to
84
     loading the mask from an array). */
85
0
  const __m256i shmask = _mm256_set_epi8(
86
0
      0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
87
0
      0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00,
88
0
      0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
89
0
      0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00);
90
91
0
  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
92
    /* Fetch 32 elements (64 bytes) then transpose bytes, words and double words. */
93
0
    for (k = 0; k < 2; k++) {
94
0
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
95
0
      ymm1[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
96
0
    }
97
98
0
    ymm0[0] = _mm256_permute4x64_epi64(ymm1[0], 0xd8);
99
0
    ymm0[1] = _mm256_permute4x64_epi64(ymm1[1], 0x8d);
100
101
0
    ymm1[0] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0xf0);
102
0
    ymm0[1] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0x0f);
103
0
    ymm1[1] = _mm256_permute4x64_epi64(ymm0[1], 0x4e);
104
105
    /* Store the result vectors */
106
0
    uint8_t* const dest_for_jth_element = dest + j;
107
0
    for (k = 0; k < 2; k++) {
108
0
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm1[k]);
109
0
    }
110
0
  }
111
0
}
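
For reference, the transposition that shuffle2_avx2 vectorizes is just a scatter of each element's bytes into per-byte planes; a scalar sketch with the same argument conventions (illustrative only, not part of the library):

#include <stdint.h>

/* Scalar equivalent of shuffle2: byte k of element j lands in plane k. */
static void shuffle2_scalar(uint8_t* const dest, const uint8_t* const src,
                            const int32_t vectorizable_elements,
                            const int32_t total_elements) {
  for (int32_t j = 0; j < vectorizable_elements; j++) {
    dest[0 * total_elements + j] = src[j * 2 + 0];  /* plane of low bytes */
    dest[1 * total_elements + j] = src[j * 2 + 1];  /* plane of high bytes */
  }
}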
112
113
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
114
static void
115
shuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
116
2.82k
              const int32_t vectorizable_elements, const int32_t total_elements) {
117
2.82k
  static const int32_t bytesoftype = 4;
118
2.82k
  int32_t i;
119
2.82k
  int j;
120
2.82k
  __m256i ymm0[4], ymm1[4];
121
122
  /* Create the shuffle mask.
123
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
124
     most to least significant (i.e., their order is reversed when compared to
125
     loading the mask from an array). */
126
2.82k
  const __m256i mask = _mm256_set_epi32(
127
2.82k
      0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
128
129
20.1k
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
130
    /* Fetch 32 elements (128 bytes) then transpose bytes and words. */
131
86.6k
    for (j = 0; j < 4; j++) {
132
69.3k
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i))));
133
69.3k
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8);
134
69.3k
      ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d);
135
69.3k
      ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]);
136
69.3k
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x4e);
137
69.3k
      ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]);
138
69.3k
    }
139
    /* Transpose double words */
140
52.0k
    for (j = 0; j < 2; j++) {
141
34.6k
      ymm1[j * 2] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
142
34.6k
      ymm1[j * 2 + 1] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
143
34.6k
    }
144
    /* Transpose quad words */
145
52.0k
    for (j = 0; j < 2; j++) {
146
34.6k
      ymm0[j * 2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j + 2]);
147
34.6k
      ymm0[j * 2 + 1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j + 2]);
148
34.6k
    }
149
86.6k
    for (j = 0; j < 4; j++) {
150
69.3k
      ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask);
151
69.3k
    }
152
    /* Store the result vectors */
153
17.3k
    uint8_t* const dest_for_ith_element = dest + i;
154
86.6k
    for (j = 0; j < 4; j++) {
155
69.3k
      _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]);
156
69.3k
    }
157
17.3k
  }
158
2.82k
}
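
The NOTE about the 'set' intrinsics can be made concrete: listing the arguments from most to least significant is equivalent to loading the reversed array from memory. A small sketch using the mask from shuffle4_avx2 above (names are illustrative):

#include <immintrin.h>
#include <stdint.h>

/* Both values hold the same permutation mask. */
static __m256i make_mask(void) {
  static const int32_t mask_mem[8] =
      {0x00, 0x04, 0x01, 0x05, 0x02, 0x06, 0x03, 0x07};
  __m256i from_set = _mm256_set_epi32(
      0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);  /* msb first */
  __m256i from_mem = _mm256_loadu_si256((const __m256i*)mask_mem);
  (void)from_mem;  /* identical bits to from_set */
  return from_set;
}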
159
160
/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
161
static void
162
shuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
163
0
              const int32_t vectorizable_elements, const int32_t total_elements) {
164
0
  static const int32_t bytesoftype = 8;
165
0
  int32_t j;
166
0
  int k, l;
167
0
  __m256i ymm0[8], ymm1[8];
168
169
0
  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
170
    /* Fetch 32 elements (256 bytes) then transpose bytes. */
171
0
    for (k = 0; k < 8; k++) {
172
0
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
173
0
      ymm1[k] = _mm256_shuffle_epi32(ymm0[k], 0x4e);
174
0
      ymm1[k] = _mm256_unpacklo_epi8(ymm0[k], ymm1[k]);
175
0
    }
176
    /* Transpose words */
177
0
    for (k = 0, l = 0; k < 4; k++, l += 2) {
178
0
      ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 1]);
179
0
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 1]);
180
0
    }
181
    /* Transpose double words */
182
0
    for (k = 0, l = 0; k < 4; k++, l++) {
183
0
      if (k == 2) l += 2;
184
0
      ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 2]);
185
0
      ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 2]);
186
0
    }
187
    /* Transpose quad words */
188
0
    for (k = 0; k < 4; k++) {
189
0
      ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 4]);
190
0
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 4]);
191
0
    }
192
0
    for (k = 0; k < 8; k++) {
193
0
      ymm1[k] = _mm256_permute4x64_epi64(ymm0[k], 0x72);
194
0
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
195
0
      ymm0[k] = _mm256_unpacklo_epi16(ymm0[k], ymm1[k]);
196
0
    }
197
    /* Store the result vectors */
198
0
    uint8_t* const dest_for_jth_element = dest + j;
199
0
    for (k = 0; k < 8; k++) {
200
0
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
201
0
    }
202
0
  }
203
0
}
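
The permute4x64 immediates above pack four 2-bit source-lane indices, least significant pair first; a tiny decoder makes the two constants readable (an illustrative helper, not part of the library):

#include <stdio.h>

/* Print which source qword feeds each destination qword. */
static void decode_permute4x64_imm(unsigned imm) {
  printf("dst qwords 0..3 take src lanes %u %u %u %u\n",
         imm & 3u, (imm >> 2) & 3u, (imm >> 4) & 3u, (imm >> 6) & 3u);
}
/* decode_permute4x64_imm(0xd8) -> 0 2 1 3
   decode_permute4x64_imm(0x72) -> 2 0 3 1 */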
204
205
/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
206
static void
207
shuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
208
406
               const int32_t vectorizable_elements, const int32_t total_elements) {
209
406
  static const int32_t bytesoftype = 16;
210
406
  int32_t j;
211
406
  int k, l;
212
406
  __m256i ymm0[16], ymm1[16];
213
214
  /* Create the shuffle mask.
215
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
216
     most to least significant (i.e., their order is reversed when compared to
217
     loading the mask from an array). */
218
406
  const __m256i shmask = _mm256_set_epi8(
219
406
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
220
406
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
221
406
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
222
406
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);
223
224
1.64k
  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
225
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
226
21.0k
    for (k = 0; k < 16; k++) {
227
19.7k
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
228
19.7k
    }
229
    /* Transpose bytes */
230
11.1k
    for (k = 0, l = 0; k < 8; k++, l += 2) {
231
9.89k
      ymm1[k * 2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l + 1]);
232
9.89k
      ymm1[k * 2 + 1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l + 1]);
233
9.89k
    }
234
    /* Transpose words */
235
11.1k
    for (k = 0, l = -2; k < 8; k++, l++) {
236
9.89k
      if ((k % 2) == 0) l += 2;
237
9.89k
      ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 2]);
238
9.89k
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 2]);
239
9.89k
    }
240
    /* Transpose double words */
241
11.1k
    for (k = 0, l = -4; k < 8; k++, l++) {
242
9.89k
      if ((k % 4) == 0) l += 4;
243
9.89k
      ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 4]);
244
9.89k
      ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 4]);
245
9.89k
    }
246
    /* Transpose quad words */
247
11.1k
    for (k = 0; k < 8; k++) {
248
9.89k
      ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 8]);
249
9.89k
      ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 8]);
250
9.89k
    }
251
21.0k
    for (k = 0; k < 16; k++) {
252
19.7k
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
253
19.7k
      ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
254
19.7k
    }
255
    /* Store the result vectors */
256
1.23k
    uint8_t* const dest_for_jth_element = dest + j;
257
21.0k
    for (k = 0; k < 16; k++) {
258
19.7k
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
259
19.7k
    }
260
1.23k
  }
261
406
}
262
263
/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */
264
static void
265
shuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
266
2.16k
                     const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) {
267
2.16k
  int32_t j;
268
2.16k
  int k, l;
269
2.16k
  __m256i ymm0[16], ymm1[16];
270
271
2.16k
  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));
272
2.16k
  const int32_t vecs_rem = (int32_t)vecs_per_el.rem;
273
274
  /* Create the shuffle mask.
275
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
276
     most to least significant (i.e., their order is reversed when compared to
277
     loading the mask from an array). */
278
2.16k
  const __m256i shmask = _mm256_set_epi8(
279
2.16k
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
280
2.16k
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
281
2.16k
      0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
282
2.16k
      0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);
283
284
7.44k
  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
285
    /* Advance the offset into the type by the vector size (in bytes), unless this is
286
    the initial iteration and the type size is not a multiple of the vector size.
287
    In that case, only advance by the number of bytes necessary so that the number
288
    of remaining bytes in the type will be a multiple of the vector size. */
289
5.28k
    int32_t offset_into_type;
290
29.6k
    for (offset_into_type = 0; offset_into_type < bytesoftype;
291
24.3k
         offset_into_type += (offset_into_type == 0 && vecs_rem > 0 ? vecs_rem : (int32_t)sizeof(__m128i))) {
292
293
      /* Fetch elements in groups of 512 bytes */
294
24.3k
      const uint8_t* const src_with_offset = src + offset_into_type;
295
413k
      for (k = 0; k < 16; k++) {
296
389k
        ymm0[k] = _mm256_loadu2_m128i(
297
389k
            (__m128i*)(src_with_offset + (j + (2 * k) + 1) * bytesoftype),
298
389k
            (__m128i*)(src_with_offset + (j + (2 * k)) * bytesoftype));
299
389k
      }
300
      /* Transpose bytes */
301
219k
      for (k = 0, l = 0; k < 8; k++, l += 2) {
302
194k
        ymm1[k * 2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l + 1]);
303
194k
        ymm1[k * 2 + 1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l + 1]);
304
194k
      }
305
      /* Transpose words */
306
219k
      for (k = 0, l = -2; k < 8; k++, l++) {
307
194k
        if ((k % 2) == 0) l += 2;
308
194k
        ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 2]);
309
194k
        ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 2]);
310
194k
      }
311
      /* Transpose double words */
312
219k
      for (k = 0, l = -4; k < 8; k++, l++) {
313
194k
        if ((k % 4) == 0) l += 4;
314
194k
        ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 4]);
315
194k
        ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 4]);
316
194k
      }
317
      /* Transpose quad words */
318
219k
      for (k = 0; k < 8; k++) {
319
194k
        ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 8]);
320
194k
        ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 8]);
321
194k
      }
322
413k
      for (k = 0; k < 16; k++) {
323
389k
        ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
324
389k
        ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
325
389k
      }
326
      /* Store the result vectors */
327
24.3k
      uint8_t* const dest_for_jth_element = dest + j;
328
413k
      for (k = 0; k < 16; k++) {
329
389k
        _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), ymm0[k]);
330
389k
      }
331
24.3k
    }
332
5.28k
  }
333
2.16k
}
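
The offset_into_type stepping above is easiest to see with a concrete type size that is not a multiple of the vector size; a standalone sketch, assuming bytesoftype = 24 as an example value:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
  const int bytesoftype = 24;  /* example: one 16-byte slice + 8 remainder */
  const lldiv_t vecs_per_el = lldiv(bytesoftype, 16);
  const int vecs_rem = (int)vecs_per_el.rem;
  /* The first step advances by the remainder (8), so every later slice is
     a full 16 bytes: the offsets visited are 0, then 8. */
  for (int off = 0; off < bytesoftype;
       off += (off == 0 && vecs_rem > 0 ? vecs_rem : 16)) {
    printf("offset_into_type = %d\n", off);
  }
  return 0;
}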
334
335
/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
336
static void
337
unshuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
338
0
                const int32_t vectorizable_elements, const int32_t total_elements) {
339
0
  static const int32_t bytesoftype = 2;
340
0
  int32_t i;
341
0
  int j;
342
0
  __m256i ymm0[2], ymm1[2];
343
344
0
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
345
    /* Load 32 elements (64 bytes) into 2 YMM registers. */
346
0
    const uint8_t* const src_for_ith_element = src + i;
347
0
    for (j = 0; j < 2; j++) {
348
0
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
349
0
    }
350
    /* Shuffle bytes */
351
0
    for (j = 0; j < 2; j++) {
352
0
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
353
0
    }
354
    /* Compute the low 32 bytes */
355
0
    ymm1[0] = _mm256_unpacklo_epi8(ymm0[0], ymm0[1]);
356
    /* Compute the hi 32 bytes */
357
0
    ymm1[1] = _mm256_unpackhi_epi8(ymm0[0], ymm0[1]);
358
    /* Store the result vectors in proper order */
359
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
360
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[1]);
361
0
  }
362
0
}
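
The inverse mapping gathers one byte from each plane back into each element; a scalar sketch of what unshuffle2_avx2 computes (illustrative only, not part of the library):

#include <stdint.h>

/* Scalar equivalent of unshuffle2: element i is rebuilt from the planes. */
static void unshuffle2_scalar(uint8_t* const dest, const uint8_t* const src,
                              const int32_t vectorizable_elements,
                              const int32_t total_elements) {
  for (int32_t i = 0; i < vectorizable_elements; i++) {
    dest[i * 2 + 0] = src[0 * total_elements + i];
    dest[i * 2 + 1] = src[1 * total_elements + i];
  }
}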
363
364
/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
365
static void
366
unshuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
367
872
                const int32_t vectorizable_elements, const int32_t total_elements) {
368
872
  static const int32_t bytesoftype = 4;
369
872
  int32_t i;
370
872
  int j;
371
872
  __m256i ymm0[4], ymm1[4];
372
373
9.63k
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
374
    /* Load 32 elements (128 bytes) into 4 YMM registers. */
375
8.76k
    const uint8_t* const src_for_ith_element = src + i;
376
43.8k
    for (j = 0; j < 4; j++) {
377
35.0k
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
378
35.0k
    }
379
    /* Shuffle bytes */
380
26.2k
    for (j = 0; j < 2; j++) {
381
      /* Compute the low 32 bytes */
382
17.5k
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
383
      /* Compute the hi 32 bytes */
384
17.5k
      ymm1[2 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
385
17.5k
    }
386
    /* Shuffle 2-byte words */
387
26.2k
    for (j = 0; j < 2; j++) {
388
      /* Compute the low 32 bytes */
389
17.5k
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
390
      /* Compute the hi 32 bytes */
391
17.5k
      ymm0[2 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
392
17.5k
    }
393
8.76k
    ymm1[0] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x20);
394
8.76k
    ymm1[1] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x20);
395
8.76k
    ymm1[2] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x31);
396
8.76k
    ymm1[3] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x31);
397
398
    /* Store the result vectors in proper order */
399
43.8k
    for (j = 0; j < 4; j++) {
400
35.0k
      _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (j * sizeof(__m256i))), ymm1[j]);
401
35.0k
    }
402
8.76k
  }
403
872
}
404
405
/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
406
static void
407
unshuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
408
0
                const int32_t vectorizable_elements, const int32_t total_elements) {
409
0
  static const int32_t bytesoftype = 8;
410
0
  int32_t i;
411
0
  int j;
412
0
  __m256i ymm0[8], ymm1[8];
413
414
0
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
415
    /* Fetch 32 elements (256 bytes) into 8 YMM registers. */
416
0
    const uint8_t* const src_for_ith_element = src + i;
417
0
    for (j = 0; j < 8; j++) {
418
0
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
419
0
    }
420
    /* Shuffle bytes */
421
0
    for (j = 0; j < 4; j++) {
422
      /* Compute the low 32 bytes */
423
0
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
424
      /* Compute the hi 32 bytes */
425
0
      ymm1[4 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
426
0
    }
427
    /* Shuffle words */
428
0
    for (j = 0; j < 4; j++) {
429
      /* Compute the low 32 bytes */
430
0
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
431
      /* Compute the hi 32 bytes */
432
0
      ymm0[4 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
433
0
    }
434
0
    for (j = 0; j < 8; j++) {
435
0
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
436
0
    }
437
438
    /* Shuffle 4-byte dwords */
439
0
    for (j = 0; j < 4; j++) {
440
      /* Compute the low 32 bytes */
441
0
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
442
      /* Compute the hi 32 bytes */
443
0
      ymm1[4 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
444
0
    }
445
446
    /* Store the result vectors in proper order */
447
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
448
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[2]);
449
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[1]);
450
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[3]);
451
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[4]);
452
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[6]);
453
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[5]);
454
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
455
0
  }
456
0
}
457
458
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
459
static void
460
unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
461
268
                 const int32_t vectorizable_elements, const int32_t total_elements) {
462
268
  static const int32_t bytesoftype = 16;
463
268
  int32_t i;
464
268
  int j;
465
268
  __m256i ymm0[16], ymm1[16];
466
467
1.24k
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
468
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
469
973
    const uint8_t* const src_for_ith_element = src + i;
470
16.5k
    for (j = 0; j < 16; j++) {
471
15.5k
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
472
15.5k
    }
473
474
    /* Shuffle bytes */
475
8.75k
    for (j = 0; j < 8; j++) {
476
      /* Compute the low 32 bytes */
477
7.78k
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
478
      /* Compute the hi 32 bytes */
479
7.78k
      ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
480
7.78k
    }
481
    /* Shuffle 2-byte words */
482
8.75k
    for (j = 0; j < 8; j++) {
483
      /* Compute the low 32 bytes */
484
7.78k
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
485
      /* Compute the hi 32 bytes */
486
7.78k
      ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
487
7.78k
    }
488
    /* Shuffle 4-byte dwords */
489
8.75k
    for (j = 0; j < 8; j++) {
490
      /* Compute the low 32 bytes */
491
7.78k
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
492
      /* Compute the hi 32 bytes */
493
7.78k
      ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
494
7.78k
    }
495
496
    /* Shuffle 8-byte qwords */
497
8.75k
    for (j = 0; j < 8; j++) {
498
      /* Compute the low 32 bytes */
499
7.78k
      ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
500
      /* Compute the hi 32 bytes */
501
7.78k
      ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
502
7.78k
    }
503
504
8.75k
    for (j = 0; j < 8; j++) {
505
7.78k
      ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20);
506
7.78k
      ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31);
507
7.78k
    }
508
509
    /* Store the result vectors in proper order */
510
973
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
511
973
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]);
512
973
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]);
513
973
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]);
514
973
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]);
515
973
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]);
516
973
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]);
517
973
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
518
973
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]);
519
973
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]);
520
973
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]);
521
973
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]);
522
973
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]);
523
973
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]);
524
973
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]);
525
973
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]);
526
973
  }
527
268
}
528
529
/* Routine optimized for unshuffling a buffer for a type size of 12 bytes.
530
 * Based on the 16-byte implementation. */
531
static void
532
unshuffle12_avx2(uint8_t* const dest, const uint8_t* const src,
533
0
    const int32_t vectorizable_elements, const int32_t total_elements) {
534
0
  static const int32_t bytesoftype = 12;
535
0
  int32_t i;
536
0
  int j;
537
0
  __m256i ymm0[16], ymm1[16];
538
539
0
  const __m256i permute = _mm256_set_epi32(0, 0, 6, 5, 4, 2, 1, 0);
540
0
  __m256i store_mask = _mm256_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
541
0
  const int32_t jump = 2 * bytesoftype;
542
0
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
543
    /* Fetch 32 elements (384 bytes) into 12 YMM registers. */
544
0
    const uint8_t* const src_for_ith_element = src + i;
545
0
    for (j = 0; j < bytesoftype; j++) {
546
0
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
547
0
    }
548
    /* Initialize the last 4 registers (128 bytes) to zero */
549
0
    for (j = bytesoftype; j < 16; j++) {
550
0
      ymm0[j] = _mm256_setzero_si256();
551
0
    }
552
553
    /* Shuffle bytes */
554
0
    for (j = 0; j < 8; j++) {
555
      /* Compute the low 32 bytes */
556
0
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
557
      /* Compute the hi 32 bytes */
558
0
      ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
559
0
    }
560
    /* Shuffle 2-byte words */
561
0
    for (j = 0; j < 8; j++) {
562
      /* Compute the low 32 bytes */
563
0
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
564
      /* Compute the hi 32 bytes */
565
0
      ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
566
0
    }
567
    /* Shuffle 4-byte dwords */
568
0
    for (j = 0; j < 8; j++) {
569
      /* Compute the low 32 bytes */
570
0
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
571
      /* Compute the hi 32 bytes */
572
0
      ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
573
0
    }
574
575
    /* Shuffle 8-byte qwords */
576
0
    for (j = 0; j < 8; j++) {
577
      /* Compute the low 32 bytes */
578
0
      ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
579
      /* Compute the hi 32 bytes */
580
0
      ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
581
0
    }
582
583
584
0
    for (j = 0; j < 8; j++) {
585
0
      ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20);
586
0
      ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31);
587
0
      ymm1[j] = _mm256_permutevar8x32_epi32(ymm1[j], permute);
588
0
      ymm1[j+8] = _mm256_permutevar8x32_epi32(ymm1[j+8], permute);
589
590
591
0
    }
592
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * jump)), ymm1[0]);
593
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * jump)), ymm1[4]);
594
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * jump)), ymm1[2]);
595
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * jump)), ymm1[6]);
596
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * jump)), ymm1[1]);
597
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * jump)), ymm1[5]);
598
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * jump)), ymm1[3]);
599
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * jump)), ymm1[7]);
600
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * jump)), ymm1[8]);
601
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * jump)), ymm1[12]);
602
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * jump)), ymm1[10]);
603
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * jump)), ymm1[14]);
604
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * jump)), ymm1[9]);
605
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * jump)), ymm1[13]);
606
0
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * jump)), ymm1[11]);
607
0
    _mm256_maskstore_epi32((int *)(dest + (i * bytesoftype) + (15 * jump)), store_mask, ymm1[15]);
608
0
  }
609
0
}
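
The closing _mm256_maskstore_epi32 writes only the dwords whose mask lane has the sign bit set, here the low six (24 bytes, i.e. two 12-byte elements), so the final store cannot run past the end of dest. A memcpy-based equivalent of that partial store (an illustrative sketch under the same store_mask):

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

/* Same effect as the masked store with store_mask above: write the low
   24 bytes of v, leave the final 8 bytes of the destination untouched. */
static void store_low_24(uint8_t* dst, __m256i v) {
  uint8_t tmp[32];
  _mm256_storeu_si256((__m256i*)tmp, v);
  memcpy(dst, tmp, 24);
}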
610
/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
611
static void
612
unshuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
613
806
                       const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) {
614
806
  int32_t i;
615
806
  int j;
616
806
  __m256i ymm0[16], ymm1[16];
617
618
806
  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));
619
806
  const int32_t vecs_rem = (int32_t)vecs_per_el.rem;
620
621
  /* The unshuffle loops are inverted (compared to shuffle16_tiled_avx2)
622
     to optimize cache utilization. */
623
806
  int32_t offset_into_type;
624
4.33k
  for (offset_into_type = 0; offset_into_type < bytesoftype;
625
3.52k
       offset_into_type += (offset_into_type == 0 && vecs_rem > 0 ? vecs_rem : (int32_t)sizeof(__m128i))) {
626
11.0k
    for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
627
      /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */
628
7.57k
      const uint8_t* const src_for_ith_element = src + i;
629
128k
      for (j = 0; j < 16; j++) {
630
121k
        ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (total_elements * (offset_into_type + j))));
631
121k
      }
632
633
      /* Shuffle bytes */
634
68.1k
      for (j = 0; j < 8; j++) {
635
        /* Compute the low 32 bytes */
636
60.5k
        ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
637
        /* Compute the hi 32 bytes */
638
60.5k
        ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
639
60.5k
      }
640
      /* Shuffle 2-byte words */
641
68.1k
      for (j = 0; j < 8; j++) {
642
        /* Compute the low 32 bytes */
643
60.5k
        ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
644
        /* Compute the hi 32 bytes */
645
60.5k
        ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
646
60.5k
      }
647
      /* Shuffle 4-byte dwords */
648
68.1k
      for (j = 0; j < 8; j++) {
649
        /* Compute the low 32 bytes */
650
60.5k
        ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
651
        /* Compute the hi 32 bytes */
652
60.5k
        ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
653
60.5k
      }
654
655
      /* Shuffle 8-byte qwords */
656
68.1k
      for (j = 0; j < 8; j++) {
657
        /* Compute the low 32 bytes */
658
60.5k
        ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
659
        /* Compute the hi 32 bytes */
660
60.5k
        ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
661
60.5k
      }
662
663
68.1k
      for (j = 0; j < 8; j++) {
664
60.5k
        ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20);
665
60.5k
        ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31);
666
60.5k
      }
667
668
      /* Store the result vectors in proper order */
669
7.57k
      uint8_t* const dest_with_offset = dest + offset_into_type;
670
7.57k
      _mm256_storeu2_m128i(
671
7.57k
          (__m128i*)(dest_with_offset + (i + 0x01) * bytesoftype),
672
7.57k
          (__m128i*)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]);
673
7.57k
      _mm256_storeu2_m128i(
674
7.57k
          (__m128i*)(dest_with_offset + (i + 0x03) * bytesoftype),
675
7.57k
          (__m128i*)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]);
676
7.57k
      _mm256_storeu2_m128i(
677
7.57k
          (__m128i*)(dest_with_offset + (i + 0x05) * bytesoftype),
678
7.57k
          (__m128i*)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]);
679
7.57k
      _mm256_storeu2_m128i(
680
7.57k
          (__m128i*)(dest_with_offset + (i + 0x07) * bytesoftype),
681
7.57k
          (__m128i*)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]);
682
7.57k
      _mm256_storeu2_m128i(
683
7.57k
          (__m128i*)(dest_with_offset + (i + 0x09) * bytesoftype),
684
7.57k
          (__m128i*)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]);
685
7.57k
      _mm256_storeu2_m128i(
686
7.57k
          (__m128i*)(dest_with_offset + (i + 0x0b) * bytesoftype),
687
7.57k
          (__m128i*)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]);
688
7.57k
      _mm256_storeu2_m128i(
689
7.57k
          (__m128i*)(dest_with_offset + (i + 0x0d) * bytesoftype),
690
7.57k
          (__m128i*)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]);
691
7.57k
      _mm256_storeu2_m128i(
692
7.57k
          (__m128i*)(dest_with_offset + (i + 0x0f) * bytesoftype),
693
7.57k
          (__m128i*)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]);
694
7.57k
      _mm256_storeu2_m128i(
695
7.57k
          (__m128i*)(dest_with_offset + (i + 0x11) * bytesoftype),
696
7.57k
          (__m128i*)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]);
697
7.57k
      _mm256_storeu2_m128i(
698
7.57k
          (__m128i*)(dest_with_offset + (i + 0x13) * bytesoftype),
699
7.57k
          (__m128i*)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]);
700
7.57k
      _mm256_storeu2_m128i(
701
7.57k
          (__m128i*)(dest_with_offset + (i + 0x15) * bytesoftype),
702
7.57k
          (__m128i*)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]);
703
7.57k
      _mm256_storeu2_m128i(
704
7.57k
          (__m128i*)(dest_with_offset + (i + 0x17) * bytesoftype),
705
7.57k
          (__m128i*)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]);
706
7.57k
      _mm256_storeu2_m128i(
707
7.57k
          (__m128i*)(dest_with_offset + (i + 0x19) * bytesoftype),
708
7.57k
          (__m128i*)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]);
709
7.57k
      _mm256_storeu2_m128i(
710
7.57k
          (__m128i*)(dest_with_offset + (i + 0x1b) * bytesoftype),
711
7.57k
          (__m128i*)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]);
712
7.57k
      _mm256_storeu2_m128i(
713
7.57k
          (__m128i*)(dest_with_offset + (i + 0x1d) * bytesoftype),
714
7.57k
          (__m128i*)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]);
715
7.57k
      _mm256_storeu2_m128i(
716
7.57k
          (__m128i*)(dest_with_offset + (i + 0x1f) * bytesoftype),
717
7.57k
          (__m128i*)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]);
718
7.57k
    }
719
3.52k
  }
720
806
}
721
722
/* Shuffle a block.  This can never fail. */
723
void
724
shuffle_avx2(const int32_t bytesoftype, const int32_t blocksize,
725
21.1k
             const uint8_t *_src, uint8_t *_dest) {
726
21.1k
  const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m256i);
727
728
  /* If the block size is too small to be vectorized,
729
     use the generic implementation. */
730
21.1k
  if (blocksize < vectorized_chunk_size) {
731
3.28k
    shuffle_generic(bytesoftype, blocksize, _src, _dest);
732
3.28k
    return;
733
3.28k
  }
734
735
  /* If the blocksize is not a multiple of both the typesize and
736
     the vector size, round the blocksize down to the next value
737
     which is a multiple of both. The vectorized shuffle can be
738
     used for that portion of the data, and the naive implementation
739
     can be used for the remaining portion. */
740
17.8k
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
741
742
17.8k
  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
743
17.8k
  const int32_t total_elements = blocksize / bytesoftype;
744
745
  /* Optimized shuffle implementations */
746
17.8k
  switch (bytesoftype) {
747
0
    case 2:
748
0
      shuffle2_avx2(_dest, _src, vectorizable_elements, total_elements);
749
0
      break;
750
2.82k
    case 4:
751
2.82k
      shuffle4_avx2(_dest, _src, vectorizable_elements, total_elements);
752
2.82k
      break;
753
0
    case 8:
754
0
      shuffle8_avx2(_dest, _src, vectorizable_elements, total_elements);
755
0
      break;
756
406
    case 16:
757
406
      shuffle16_avx2(_dest, _src, vectorizable_elements, total_elements);
758
406
      break;
759
14.6k
    default:
760
      /* For types larger than 16 bytes, use the AVX2 tiled shuffle. */
761
14.6k
      if (bytesoftype > (int32_t)sizeof(__m128i)) {
762
2.16k
        shuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
763
2.16k
      }
764
12.4k
      else {
765
        /* Non-optimized shuffle */
766
12.4k
        shuffle_generic(bytesoftype, blocksize, _src, _dest);
767
        /* The non-optimized function covers the whole buffer,
768
           so we're done processing here. */
769
12.4k
        return;
770
12.4k
      }
771
17.8k
  }
772
773
  /* If the buffer had any bytes at the end which couldn't be handled
774
     by the vectorized implementations, use the non-optimized version
775
     to finish them up. */
776
5.39k
  if (vectorizable_bytes < blocksize) {
777
1.96k
    shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
778
1.96k
  }
779
5.39k
}
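
The sizing arithmetic at the top of shuffle_avx2 determines how much of the block each path handles; a worked sketch for bytesoftype = 4 and blocksize = 1000 (example values, not from the report):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int32_t bytesoftype = 4, blocksize = 1000;
  const int32_t chunk = bytesoftype * 32;  /* sizeof(__m256i) == 32 */
  const int32_t vectorizable_bytes = blocksize - (blocksize % chunk);
  /* Prints: chunk=128 vec_bytes=896 vec_elems=224 total_elems=250.
     shuffle4_avx2 covers the first 896 bytes; shuffle_generic_inline
     finishes the trailing 104 bytes. */
  printf("chunk=%d vec_bytes=%d vec_elems=%d total_elems=%d\n",
         chunk, vectorizable_bytes, vectorizable_bytes / bytesoftype,
         blocksize / bytesoftype);
  return 0;
}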
780
781
/* Unshuffle a block.  This can never fail. */
782
void
783
unshuffle_avx2(const int32_t bytesoftype, const int32_t blocksize,
784
5.89k
               const uint8_t *_src, uint8_t *_dest) {
785
5.89k
  const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m256i);
786
787
  /* If the block size is too small to be vectorized,
788
     use the generic implementation. */
789
5.89k
  if (blocksize < vectorized_chunk_size) {
790
534
    unshuffle_generic(bytesoftype, blocksize, _src, _dest);
791
534
    return;
792
534
  }
793
794
  /* If the blocksize is not a multiple of both the typesize and
795
     the vector size, round the blocksize down to the next value
796
     which is a multiple of both. The vectorized unshuffle can be
797
     used for that portion of the data, and the naive implementation
798
     can be used for the remaining portion. */
799
5.36k
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
800
801
5.36k
  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
802
5.36k
  const int32_t total_elements = blocksize / bytesoftype;
803
804
  /* Optimized unshuffle implementations */
805
5.36k
  switch (bytesoftype) {
806
0
    case 2:
807
0
      unshuffle2_avx2(_dest, _src, vectorizable_elements, total_elements);
808
0
      break;
809
872
    case 4:
810
872
      unshuffle4_avx2(_dest, _src, vectorizable_elements, total_elements);
811
872
      break;
812
0
    case 8:
813
0
      unshuffle8_avx2(_dest, _src, vectorizable_elements, total_elements);
814
0
      break;
815
0
    case 12:
816
0
      unshuffle12_avx2(_dest, _src, vectorizable_elements, total_elements);
817
0
      break;
818
268
    case 16:
819
268
      unshuffle16_avx2(_dest, _src, vectorizable_elements, total_elements);
820
268
      break;
821
4.22k
    default:
822
      /* For types larger than 16 bytes, use the AVX2 tiled unshuffle. */
823
4.22k
      if (bytesoftype > (int32_t)sizeof(__m128i)) {
824
806
        unshuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
825
806
      }
826
3.41k
      else {
827
        /* Non-optimized unshuffle */
828
3.41k
        unshuffle_generic(bytesoftype, blocksize, _src, _dest);
829
        /* The non-optimized function covers the whole buffer,
830
           so we're done processing here. */
831
3.41k
        return;
832
3.41k
      }
833
5.36k
  }
834
835
  /* If the buffer had any bytes at the end which couldn't be handled
836
     by the vectorized implementations, use the non-optimized version
837
     to finish them up. */
838
1.94k
  if (vectorizable_bytes < blocksize) {
839
620
    unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
840
620
  }
841
1.94k
}
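
A shuffle followed by an unshuffle of the same block must reproduce the input; a minimal round-trip sketch, assuming these internal functions are reachable from the test via shuffle-avx2.h (a real test would normally go through the public blosc2 API):

#include "shuffle-avx2.h"

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static void roundtrip(const int32_t bytesoftype, const int32_t blocksize) {
  uint8_t* src = malloc(blocksize);
  uint8_t* mid = malloc(blocksize);
  uint8_t* out = malloc(blocksize);
  for (int32_t i = 0; i < blocksize; i++)
    src[i] = (uint8_t)(i * 131);  /* arbitrary test pattern */
  shuffle_avx2(bytesoftype, blocksize, src, mid);
  unshuffle_avx2(bytesoftype, blocksize, mid, out);
  assert(memcmp(src, out, blocksize) == 0);
  free(src); free(mid); free(out);
}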
842
843
const bool is_shuffle_avx2 = true;
844
845
#else
846
847
const bool is_shuffle_avx2 = false;
848
849
void shuffle_avx2(const int32_t bytesoftype, const int32_t blocksize,
850
                  const uint8_t *_src, uint8_t *_dest) {
851
  abort();
852
}
853
854
void unshuffle_avx2(const int32_t bytesoftype, const int32_t blocksize,
855
                    const uint8_t *_src, uint8_t *_dest) {
856
  abort();
857
}
858
859
#endif /* defined(__AVX2__) */