Coverage Report

Created: 2026-01-10 06:55

/src/c-blosc/blosc/shuffle-avx2.c
Line | Count | Source
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Author: Francesc Alted <francesc@blosc.org>
5
6
  See LICENSE.txt for details about copyright and rights to use.
7
**********************************************************************/
8
9
#include "shuffle-generic.h"
10
#include "shuffle-avx2.h"
11
12
/* Define dummy functions if AVX2 is not available for the compilation target and compiler. */
13
#if !defined(__AVX2__)
14
#include <stdlib.h>
15
16
void
17
blosc_internal_shuffle_avx2(const size_t bytesoftype, const size_t blocksize,
18
                            const uint8_t* const _src, uint8_t* const _dest) {
19
  abort();
20
}
21
22
void
23
blosc_internal_unshuffle_avx2(const size_t bytesoftype, const size_t blocksize,
24
                              const uint8_t* const _src, uint8_t* const _dest) {
25
  abort();
26
}
27
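These stubs exist only so the symbols are always defined; they abort() if ever called, so callers must route to the AVX2 kernels only after a runtime CPU check. In c-blosc that selection happens outside this file (in shuffle.c); the sketch below is a minimal, hypothetical guard assuming a GCC/Clang toolchain and the declarations already pulled in from shuffle-avx2.h and shuffle-generic.h.

/* Hypothetical caller-side guard (not part of this file): pick the AVX2
 * kernel only when the running CPU supports AVX2, otherwise fall back to
 * the portable implementation, so the abort() stubs are never reached. */
typedef void (*shuffle_fn)(const size_t bytesoftype, const size_t blocksize,
                           const uint8_t* const src, uint8_t* const dest);

static shuffle_fn
pick_shuffle_impl(void)
{
#if defined(__GNUC__) || defined(__clang__)
  if (__builtin_cpu_supports("avx2"))
    return blosc_internal_shuffle_avx2;
#endif
  return blosc_internal_shuffle_generic;  /* portable scalar fallback */
}
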
28
#else /* defined(__AVX2__) */
29
30
#include <immintrin.h>
31
32
33
/* The next is useful for debugging purposes */
34
#if 0
35
#include <stdio.h>
36
#include <string.h>
37
38
static void printymm(__m256i ymm0)
39
{
40
  uint8_t buf[32];
41
42
  ((__m256i *)buf)[0] = ymm0;
43
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
44
          buf[0], buf[1], buf[2], buf[3],
45
          buf[4], buf[5], buf[6], buf[7],
46
          buf[8], buf[9], buf[10], buf[11],
47
          buf[12], buf[13], buf[14], buf[15],
48
          buf[16], buf[17], buf[18], buf[19],
49
          buf[20], buf[21], buf[22], buf[23],
50
          buf[24], buf[25], buf[26], buf[27],
51
          buf[28], buf[29], buf[30], buf[31]);
52
}
53
#endif
54
55
/* GCC doesn't include the split load/store intrinsics
56
   needed for the tiled shuffle, so define them here. */
57
#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC)
58
static inline __m256i
59
__attribute__((__always_inline__))
60
_mm256_loadu2_m128i(const __m128i* const hiaddr, const __m128i* const loaddr)
61
{
62
  return _mm256_inserti128_si256(
63
    _mm256_castsi128_si256(_mm_loadu_si128(loaddr)), _mm_loadu_si128(hiaddr), 1);
64
}
65
66
static inline void
67
__attribute__((__always_inline__))
68
_mm256_storeu2_m128i(__m128i* const hiaddr, __m128i* const loaddr, const __m256i a)
69
{
70
  _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
71
  _mm_storeu_si128(hiaddr, _mm256_extracti128_si256(a, 1));
72
}
73
#endif  /* defined(__GNUC__) */
74
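The pair above reproduces the _mm256_loadu2_m128i / _mm256_storeu2_m128i intrinsics that the other supported compilers (Clang, ICC) already provide: one 256-bit register is assembled from, or scattered to, two unrelated unaligned 128-bit locations. A minimal illustration of the semantics (buffer names are hypothetical):

/* Gather two unrelated 16-byte rows into one YMM register, then scatter
 * the two halves back out.  Equivalent to two pairs of 128-bit copies. */
static void
copy_two_rows(uint8_t* dst_lo, uint8_t* dst_hi,
              const uint8_t* src_lo, const uint8_t* src_hi)
{
  const __m256i v = _mm256_loadu2_m128i((const __m128i*)src_hi,
                                        (const __m128i*)src_lo);
  _mm256_storeu2_m128i((__m128i*)dst_hi, (__m128i*)dst_lo, v);
}
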
75
/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
76
static void
77
shuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
78
  const size_t vectorizable_elements, const size_t total_elements)
79
0
{
80
0
  static const size_t bytesoftype = 2;
81
0
  size_t j;
82
0
  int k;
83
0
  __m256i ymm0[2], ymm1[2];
84
85
  /* Create the shuffle mask.
86
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
87
     most to least significant (i.e., their order is reversed when compared to
88
     loading the mask from an array). */
89
0
  const __m256i shmask = _mm256_set_epi8(
90
0
    0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
91
0
    0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00,
92
0
    0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
93
0
    0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00);
94
95
0
  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
96
    /* Fetch 32 elements (64 bytes) then transpose bytes, words and double words. */
97
0
    for (k = 0; k < 2; k++) {
98
0
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
99
0
      ymm1[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
100
0
    }
101
102
0
    ymm0[0] = _mm256_permute4x64_epi64(ymm1[0], 0xd8);
103
0
    ymm0[1] = _mm256_permute4x64_epi64(ymm1[1], 0x8d);
104
105
0
    ymm1[0] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0xf0);
106
0
    ymm0[1] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0x0f);
107
0
    ymm1[1] = _mm256_permute4x64_epi64(ymm0[1], 0x4e);
108
109
    /* Store the result vectors */
110
0
    uint8_t* const dest_for_jth_element = dest + j;
111
0
    for (k = 0; k < 2; k++) {
112
0
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm1[k]);
113
0
    }
114
0
  }
115
0
}
116
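All of the fixed-size kernels in this file compute the same result as a simple scalar byte transpose: byte k of element i is written to plane k of the destination. A scalar sketch of the 2-byte case, assuming the headers already included above (this is essentially what the generic fallback does for the vectorized region):

/* Scalar reference for shuffle2_avx2: dest holds bytesoftype planes of
 * total_elements bytes each; only the first vectorizable_elements columns
 * are produced by the vectorized kernel. */
static void
shuffle2_scalar(uint8_t* const dest, const uint8_t* const src,
                const size_t vectorizable_elements, const size_t total_elements)
{
  size_t i, k;
  for (i = 0; i < vectorizable_elements; i++) {
    for (k = 0; k < 2; k++) {
      dest[(k * total_elements) + i] = src[(i * 2) + k];
    }
  }
}
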
117
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
118
static void
119
shuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
120
  const size_t vectorizable_elements, const size_t total_elements)
121
0
{
122
0
  static const size_t bytesoftype = 4;
123
0
  size_t i;
124
0
  int j;
125
0
  __m256i ymm0[4], ymm1[4];
126
127
  /* Create the shuffle mask.
128
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
129
     most to least significant (i.e., their order is reversed when compared to
130
     loading the mask from an array). */
131
0
  const __m256i mask = _mm256_set_epi32(
132
0
    0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
133
134
0
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
135
    /* Fetch 32 elements (128 bytes) then transpose bytes and words. */
136
0
    for (j = 0; j < 4; j++) {
137
0
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i))));
138
0
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8);
139
0
      ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d);
140
0
      ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]);
141
0
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x04e);
142
0
      ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]);
143
0
    }
144
    /* Transpose double words */
145
0
    for (j = 0; j < 2; j++) {
146
0
      ymm1[j*2] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
147
0
      ymm1[j*2+1] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
148
0
    }
149
    /* Transpose quad words */
150
0
    for (j = 0; j < 2; j++) {
151
0
      ymm0[j*2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j+2]);
152
0
      ymm0[j*2+1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j+2]);
153
0
    }
154
0
    for (j = 0; j < 4; j++) {
155
0
      ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask);
156
0
    }
157
    /* Store the result vectors */
158
0
    uint8_t* const dest_for_ith_element = dest + i;
159
0
    for (j = 0; j < 4; j++) {
160
0
      _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]);
161
0
    }
162
0
  }
163
0
}
164
165
/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
166
static void
167
shuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
168
  const size_t vectorizable_elements, const size_t total_elements)
169
0
{
170
0
  static const size_t bytesoftype = 8;
171
0
  size_t j;
172
0
  int k, l;
173
0
  __m256i ymm0[8], ymm1[8];
174
175
0
  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
176
    /* Fetch 32 elements (256 bytes) then transpose bytes. */
177
0
    for (k = 0; k < 8; k++) {
178
0
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
179
0
      ymm1[k] = _mm256_shuffle_epi32(ymm0[k], 0x4e);
180
0
      ymm1[k] = _mm256_unpacklo_epi8(ymm0[k], ymm1[k]);
181
0
    }
182
    /* Transpose words */
183
0
    for (k = 0, l = 0; k < 4; k++, l +=2) {
184
0
      ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+1]);
185
0
      ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+1]);
186
0
    }
187
    /* Transpose double words */
188
0
    for (k = 0, l = 0; k < 4; k++, l++) {
189
0
      if (k == 2) l += 2;
190
0
      ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+2]);
191
0
      ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+2]);
192
0
    }
193
    /* Transpose quad words */
194
0
    for (k = 0; k < 4; k++) {
195
0
      ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+4]);
196
0
      ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+4]);
197
0
    }
198
0
    for(k = 0; k < 8; k++) {
199
0
      ymm1[k] = _mm256_permute4x64_epi64(ymm0[k], 0x72);
200
0
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xD8);
201
0
      ymm0[k] = _mm256_unpacklo_epi16(ymm0[k], ymm1[k]);
202
0
    }
203
    /* Store the result vectors */
204
0
    uint8_t* const dest_for_jth_element = dest + j;
205
0
    for (k = 0; k < 8; k++) {
206
0
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
207
0
    }
208
0
  }
209
0
}
210
211
/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
212
static void
213
shuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
214
  const size_t vectorizable_elements, const size_t total_elements)
215
0
{
216
0
  static const size_t bytesoftype = 16;
217
0
  size_t j;
218
0
  int k, l;
219
0
  __m256i ymm0[16], ymm1[16];
220
221
  /* Create the shuffle mask.
222
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
223
     most to least significant (i.e., their order is reversed when compared to
224
     loading the mask from an array). */
225
0
  const __m256i shmask = _mm256_set_epi8(
226
0
    0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
227
0
    0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
228
0
    0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
229
0
    0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);
230
231
0
  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
232
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
233
0
    for (k = 0; k < 16; k++) {
234
0
      ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i))));
235
0
    }
236
    /* Transpose bytes */
237
0
    for (k = 0, l = 0; k < 8; k++, l +=2) {
238
0
      ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]);
239
0
      ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]);
240
0
    }
241
    /* Transpose words */
242
0
    for (k = 0, l = -2; k < 8; k++, l++) {
243
0
      if ((k%2) == 0) l += 2;
244
0
      ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]);
245
0
      ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]);
246
0
    }
247
    /* Transpose double words */
248
0
    for (k = 0, l = -4; k < 8; k++, l++) {
249
0
      if ((k%4) == 0) l += 4;
250
0
      ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]);
251
0
      ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]);
252
0
    }
253
    /* Transpose quad words */
254
0
    for (k = 0; k < 8; k++) {
255
0
      ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]);
256
0
      ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]);
257
0
    }
258
0
    for (k = 0; k < 16; k++) {
259
0
      ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
260
0
      ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
261
0
    }
262
    /* Store the result vectors */
263
0
    uint8_t* const dest_for_jth_element = dest + j;
264
0
    for (k = 0; k < 16; k++) {
265
0
      _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]);
266
0
    }
267
0
  }
268
0
}
269
270
/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */
271
static void
272
shuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
273
  const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype)
274
0
{
275
0
  size_t j;
276
0
  int k, l;
277
0
  __m256i ymm0[16], ymm1[16];
278
279
0
  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));
280
281
  /* Create the shuffle mask.
282
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
283
     most to least significant (i.e., their order is reversed when compared to
284
     loading the mask from an array). */
285
0
  const __m256i shmask = _mm256_set_epi8(
286
0
    0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
287
0
    0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
288
0
    0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
289
0
    0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);
290
291
0
  for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) {
292
    /* Advance the offset into the type by the vector size (in bytes), unless this is
293
    the initial iteration and the type size is not a multiple of the vector size.
294
    In that case, only advance by the number of bytes necessary so that the number
295
    of remaining bytes in the type will be a multiple of the vector size. */
296
0
    size_t offset_into_type;
297
0
    for (offset_into_type = 0; offset_into_type < bytesoftype;
298
0
      offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0 ? vecs_per_el.rem : sizeof(__m128i))) {
299
300
      /* Fetch elements in groups of 512 bytes */
301
0
      const uint8_t* const src_with_offset = src + offset_into_type;
302
0
      for (k = 0; k < 16; k++) {
303
0
        ymm0[k] = _mm256_loadu2_m128i(
304
0
          (__m128i*)(src_with_offset + (j + (2 * k) + 1) * bytesoftype),
305
0
          (__m128i*)(src_with_offset + (j + (2 * k)) * bytesoftype));
306
0
      }
307
      /* Transpose bytes */
308
0
      for (k = 0, l = 0; k < 8; k++, l +=2) {
309
0
        ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]);
310
0
        ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]);
311
0
      }
312
      /* Transpose words */
313
0
      for (k = 0, l = -2; k < 8; k++, l++) {
314
0
        if ((k%2) == 0) l += 2;
315
0
        ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]);
316
0
        ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]);
317
0
      }
318
      /* Transpose double words */
319
0
      for (k = 0, l = -4; k < 8; k++, l++) {
320
0
        if ((k%4) == 0) l += 4;
321
0
        ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]);
322
0
        ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]);
323
0
      }
324
      /* Transpose quad words */
325
0
      for (k = 0; k < 8; k++) {
326
0
        ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]);
327
0
        ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]);
328
0
      }
329
0
      for (k = 0; k < 16; k++) {
330
0
        ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
331
0
        ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
332
0
      }
333
      /* Store the result vectors */
334
0
      uint8_t* const dest_for_jth_element = dest + j;
335
0
      for (k = 0; k < 16; k++) {
336
0
        _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), ymm0[k]);
337
0
      }
338
0
    }
339
0
  }
340
0
}
341
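The offset_into_type stepping above only matters when bytesoftype is not a multiple of 16. As a worked example for a hypothetical 24-byte type: lldiv(24, 16) yields quot = 1, rem = 8, so the loop visits offsets 0 and 8 and stops at 24; the two passes cover bytes 0..15 and 8..23 of every element, and the overlapping bytes 8..15 are simply recomputed with identical results. The stepping can be reproduced in isolation:

/* Stand-alone sketch of the tile-offset sequence for one hypothetical
 * element size (24 bytes); prints 0 and then 8. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
  const size_t bytesoftype = 24;   /* hypothetical, not a multiple of 16 */
  const lldiv_t vecs_per_el = lldiv(bytesoftype, 16);
  size_t offset_into_type;
  for (offset_into_type = 0; offset_into_type < bytesoftype;
       offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0
                            ? vecs_per_el.rem : 16)) {
    printf("%zu\n", offset_into_type);
  }
  return 0;
}
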
342
/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
343
static void
344
unshuffle2_avx2(uint8_t* const dest, const uint8_t* const src,
345
  const size_t vectorizable_elements, const size_t total_elements)
346
434
{
347
434
  static const size_t bytesoftype = 2;
348
434
  size_t i;
349
434
  int j;
350
434
  __m256i ymm0[2], ymm1[2];
351
352
49.9k
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
353
    /* Load 32 elements (64 bytes) into 2 YMM registers. */
354
49.5k
    const uint8_t* const src_for_ith_element = src + i;
355
148k
    for (j = 0; j < 2; j++) {
356
99.1k
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
357
99.1k
    }
358
    /* Shuffle bytes */
359
148k
    for (j = 0; j < 2; j++) {
360
99.1k
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
361
99.1k
    }
362
    /* Compute the low 64 bytes */
363
49.5k
    ymm1[0] = _mm256_unpacklo_epi8(ymm0[0], ymm0[1]);
364
    /* Compute the hi 64 bytes */
365
49.5k
    ymm1[1] = _mm256_unpackhi_epi8(ymm0[0], ymm0[1]);
366
    /* Store the result vectors in proper order */
367
49.5k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
368
49.5k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[1]);
369
49.5k
  }
370
434
}
371
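Unshuffling is the exact inverse of the transpose sketched earlier: byte i of plane k is written back into byte k of element i. A scalar sketch for an arbitrary type size, again assuming the headers included above (roughly what the generic fallback does):

/* Scalar reference for the unshuffle kernels: gather each element's bytes
 * back out of the bytesoftype planes of total_elements bytes each. */
static void
unshuffle_scalar(uint8_t* const dest, const uint8_t* const src,
                 const size_t bytesoftype,
                 const size_t vectorizable_elements, const size_t total_elements)
{
  size_t i, k;
  for (i = 0; i < vectorizable_elements; i++) {
    for (k = 0; k < bytesoftype; k++) {
      dest[(i * bytesoftype) + k] = src[(k * total_elements) + i];
    }
  }
}
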
372
/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
373
static void
374
unshuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
375
  const size_t vectorizable_elements, const size_t total_elements)
376
227
{
377
227
  static const size_t bytesoftype = 4;
378
227
  size_t i;
379
227
  int j;
380
227
  __m256i ymm0[4], ymm1[4];
381
382
11.9k
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
383
    /* Load 32 elements (128 bytes) into 4 YMM registers. */
384
11.7k
    const uint8_t* const src_for_ith_element = src + i;
385
58.5k
    for (j = 0; j < 4; j++) {
386
46.8k
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
387
46.8k
    }
388
    /* Shuffle bytes */
389
35.1k
    for (j = 0; j < 2; j++) {
390
      /* Compute the low 64 bytes */
391
23.4k
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
392
      /* Compute the hi 64 bytes */
393
23.4k
      ymm1[2+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
394
23.4k
    }
395
    /* Shuffle 2-byte words */
396
35.1k
    for (j = 0; j < 2; j++) {
397
      /* Compute the low 64 bytes */
398
23.4k
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
399
      /* Compute the hi 64 bytes */
400
23.4k
      ymm0[2+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
401
23.4k
    }
402
11.7k
    ymm1[0] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x20);
403
11.7k
    ymm1[1] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x20);
404
11.7k
    ymm1[2] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x31);
405
11.7k
    ymm1[3] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x31);
406
407
    /* Store the result vectors in proper order */
408
58.5k
    for (j = 0; j < 4; j++) {
409
46.8k
      _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (j * sizeof(__m256i))), ymm1[j]);
410
46.8k
    }
411
11.7k
  }
412
227
}
413
414
/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
415
static void
416
unshuffle8_avx2(uint8_t* const dest, const uint8_t* const src,
417
  const size_t vectorizable_elements, const size_t total_elements)
418
267
{
419
267
  static const size_t bytesoftype = 8;
420
267
  size_t i;
421
267
  int j;
422
267
  __m256i ymm0[8], ymm1[8];
423
424
2.17k
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
425
    /* Fetch 32 elements (256 bytes) into 8 YMM registers. */
426
1.90k
    const uint8_t* const src_for_ith_element = src + i;
427
17.1k
    for (j = 0; j < 8; j++) {
428
15.2k
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
429
15.2k
    }
430
    /* Shuffle bytes */
431
9.53k
    for (j = 0; j < 4; j++) {
432
      /* Compute the low 32 bytes */
433
7.62k
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
434
      /* Compute the hi 32 bytes */
435
7.62k
      ymm1[4+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
436
7.62k
    }
437
    /* Shuffle words */
438
9.53k
    for (j = 0; j < 4; j++) {
439
      /* Compute the low 32 bytes */
440
7.62k
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
441
      /* Compute the hi 32 bytes */
442
7.62k
      ymm0[4+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
443
7.62k
    }
444
17.1k
    for (j = 0; j < 8; j++) {
445
15.2k
      ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
446
15.2k
    }
447
448
    /* Shuffle 4-byte dwords */
449
9.53k
    for (j = 0; j < 4; j++) {
450
      /* Compute the low 32 bytes */
451
7.62k
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
452
      /* Compute the hi 32 bytes */
453
7.62k
      ymm1[4+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
454
7.62k
    }
455
456
    /* Store the result vectors in proper order */
457
1.90k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
458
1.90k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[2]);
459
1.90k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[1]);
460
1.90k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[3]);
461
1.90k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[4]);
462
1.90k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[6]);
463
1.90k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[5]);
464
1.90k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
465
1.90k
  }
466
267
}
467
468
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
469
static void
470
unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src,
471
  const size_t vectorizable_elements, const size_t total_elements)
472
359
{
473
359
  static const size_t bytesoftype = 16;
474
359
  size_t i;
475
359
  int j;
476
359
  __m256i ymm0[16], ymm1[16];
477
478
4.47k
  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
479
    /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
480
4.11k
    const uint8_t* const src_for_ith_element = src + i;
481
69.8k
    for (j = 0; j < 16; j++) {
482
65.7k
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements)));
483
65.7k
    }
484
485
    /* Shuffle bytes */
486
36.9k
    for (j = 0; j < 8; j++) {
487
      /* Compute the low 32 bytes */
488
32.8k
      ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
489
      /* Compute the hi 32 bytes */
490
32.8k
      ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
491
32.8k
    }
492
    /* Shuffle 2-byte words */
493
36.9k
    for (j = 0; j < 8; j++) {
494
      /* Compute the low 32 bytes */
495
32.8k
      ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
496
      /* Compute the hi 32 bytes */
497
32.8k
      ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
498
32.8k
    }
499
    /* Shuffle 4-byte dwords */
500
36.9k
    for (j = 0; j < 8; j++) {
501
      /* Compute the low 32 bytes */
502
32.8k
      ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
503
      /* Compute the hi 32 bytes */
504
32.8k
      ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
505
32.8k
    }
506
507
    /* Shuffle 8-byte qwords */
508
36.9k
    for (j = 0; j < 8; j++) {
509
      /* Compute the low 32 bytes */
510
32.8k
      ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]);
511
      /* Compute the hi 32 bytes */
512
32.8k
      ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]);
513
32.8k
    }
514
515
36.9k
    for (j = 0; j < 8; j++) {
516
32.8k
      ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20);
517
32.8k
      ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31);
518
32.8k
    }
519
520
    /* Store the result vectors in proper order */
521
4.11k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]);
522
4.11k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]);
523
4.11k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]);
524
4.11k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]);
525
4.11k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]);
526
4.11k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]);
527
4.11k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]);
528
4.11k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]);
529
4.11k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]);
530
4.11k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]);
531
4.11k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]);
532
4.11k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]);
533
4.11k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]);
534
4.11k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]);
535
4.11k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]);
536
4.11k
    _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]);
537
4.11k
  }
538
359
}
539
540
/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
541
static void
542
unshuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src,
543
  const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype)
544
590
{
545
590
  size_t i;
546
590
  int j;
547
590
  __m256i ymm0[16], ymm1[16];
548
549
590
  const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i));
550
551
  /* The unshuffle loops are inverted (compared to shuffle16_tiled_avx2)
552
     to optimize cache utilization. */
553
590
  size_t offset_into_type;
554
2.31k
  for (offset_into_type = 0; offset_into_type < bytesoftype;
555
1.72k
    offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0 ? vecs_per_el.rem : sizeof(__m128i))) {
556
6.83k
    for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
557
      /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */
558
5.10k
      const uint8_t* const src_for_ith_element = src + i;
559
86.8k
      for (j = 0; j < 16; j++) {
560
81.6k
        ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (total_elements * (offset_into_type + j))));
561
81.6k
      }
562
563
      /* Shuffle bytes */
564
45.9k
      for (j = 0; j < 8; j++) {
565
        /* Compute the low 32 bytes */
566
40.8k
        ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]);
567
        /* Compute the hi 32 bytes */
568
40.8k
        ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]);
569
40.8k
      }
570
      /* Shuffle 2-byte words */
571
45.9k
      for (j = 0; j < 8; j++) {
572
        /* Compute the low 32 bytes */
573
40.8k
        ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]);
574
        /* Compute the hi 32 bytes */
575
40.8k
        ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]);
576
40.8k
      }
577
      /* Shuffle 4-byte dwords */
578
45.9k
      for (j = 0; j < 8; j++) {
579
        /* Compute the low 32 bytes */
580
40.8k
        ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
581
        /* Compute the hi 32 bytes */
582
40.8k
        ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
583
40.8k
      }
584
585
      /* Shuffle 8-byte qwords */
586
45.9k
      for (j = 0; j < 8; j++) {
587
        /* Compute the low 32 bytes */
588
40.8k
        ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]);
589
        /* Compute the hi 32 bytes */
590
40.8k
        ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]);
591
40.8k
      }
592
593
45.9k
      for (j = 0; j < 8; j++) {
594
40.8k
        ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20);
595
40.8k
        ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31);
596
40.8k
      }
597
598
      /* Store the result vectors in proper order */
599
5.10k
      const uint8_t* const dest_with_offset = dest + offset_into_type;
600
5.10k
      _mm256_storeu2_m128i(
601
5.10k
        (__m128i*)(dest_with_offset + (i + 0x01) * bytesoftype),
602
5.10k
        (__m128i*)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]);
603
5.10k
      _mm256_storeu2_m128i(
604
5.10k
        (__m128i*)(dest_with_offset + (i + 0x03) * bytesoftype),
605
5.10k
        (__m128i*)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]);
606
5.10k
      _mm256_storeu2_m128i(
607
5.10k
        (__m128i*)(dest_with_offset + (i + 0x05) * bytesoftype),
608
5.10k
        (__m128i*)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]);
609
5.10k
      _mm256_storeu2_m128i(
610
5.10k
        (__m128i*)(dest_with_offset + (i + 0x07) * bytesoftype),
611
5.10k
        (__m128i*)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]);
612
5.10k
      _mm256_storeu2_m128i(
613
5.10k
        (__m128i*)(dest_with_offset + (i + 0x09) * bytesoftype),
614
5.10k
        (__m128i*)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]);
615
5.10k
      _mm256_storeu2_m128i(
616
5.10k
        (__m128i*)(dest_with_offset + (i + 0x0b) * bytesoftype),
617
5.10k
        (__m128i*)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]);
618
5.10k
      _mm256_storeu2_m128i(
619
5.10k
        (__m128i*)(dest_with_offset + (i + 0x0d) * bytesoftype),
620
5.10k
        (__m128i*)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]);
621
5.10k
      _mm256_storeu2_m128i(
622
5.10k
        (__m128i*)(dest_with_offset + (i + 0x0f) * bytesoftype),
623
5.10k
        (__m128i*)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]);
624
5.10k
      _mm256_storeu2_m128i(
625
5.10k
        (__m128i*)(dest_with_offset + (i + 0x11) * bytesoftype),
626
5.10k
        (__m128i*)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]);
627
5.10k
      _mm256_storeu2_m128i(
628
5.10k
        (__m128i*)(dest_with_offset + (i + 0x13) * bytesoftype),
629
5.10k
        (__m128i*)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]);
630
5.10k
      _mm256_storeu2_m128i(
631
5.10k
        (__m128i*)(dest_with_offset + (i + 0x15) * bytesoftype),
632
5.10k
        (__m128i*)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]);
633
5.10k
      _mm256_storeu2_m128i(
634
5.10k
        (__m128i*)(dest_with_offset + (i + 0x17) * bytesoftype),
635
5.10k
        (__m128i*)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]);
636
5.10k
      _mm256_storeu2_m128i(
637
5.10k
        (__m128i*)(dest_with_offset + (i + 0x19) * bytesoftype),
638
5.10k
        (__m128i*)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]);
639
5.10k
      _mm256_storeu2_m128i(
640
5.10k
        (__m128i*)(dest_with_offset + (i + 0x1b) * bytesoftype),
641
5.10k
        (__m128i*)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]);
642
5.10k
      _mm256_storeu2_m128i(
643
5.10k
        (__m128i*)(dest_with_offset + (i + 0x1d) * bytesoftype),
644
5.10k
        (__m128i*)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]);
645
5.10k
      _mm256_storeu2_m128i(
646
5.10k
        (__m128i*)(dest_with_offset + (i + 0x1f) * bytesoftype),
647
5.10k
        (__m128i*)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]);
648
5.10k
    }
649
1.72k
  }
650
590
}
651
652
/* Shuffle a block.  This can never fail. */
653
void
654
blosc_internal_shuffle_avx2(const size_t bytesoftype, const size_t blocksize,
655
0
                            const uint8_t* const _src, uint8_t* const _dest) {
656
0
  const size_t vectorized_chunk_size = bytesoftype * sizeof(__m256i);
657
658
  /* If the block size is too small to be vectorized,
659
     use the generic implementation. */
660
0
  if (blocksize < vectorized_chunk_size) {
661
0
    blosc_internal_shuffle_generic(bytesoftype, blocksize, _src, _dest);
662
0
    return;
663
0
  }
664
665
  /* If the blocksize is not a multiple of both the typesize and
666
     the vector size, round the blocksize down to the next value
667
     which is a multiple of both. The vectorized shuffle can be
668
     used for that portion of the data, and the naive implementation
669
     can be used for the remaining portion. */
670
0
  const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
671
672
0
  const size_t vectorizable_elements = vectorizable_bytes / bytesoftype;
673
0
  const size_t total_elements = blocksize / bytesoftype;
674
675
  /* Optimized shuffle implementations */
676
0
  switch (bytesoftype)
677
0
  {
678
0
  case 2:
679
0
    shuffle2_avx2(_dest, _src, vectorizable_elements, total_elements);
680
0
    break;
681
0
  case 4:
682
0
    shuffle4_avx2(_dest, _src, vectorizable_elements, total_elements);
683
0
    break;
684
0
  case 8:
685
0
    shuffle8_avx2(_dest, _src, vectorizable_elements, total_elements);
686
0
    break;
687
0
  case 16:
688
0
    shuffle16_avx2(_dest, _src, vectorizable_elements, total_elements);
689
0
    break;
690
0
  default:
691
    /* For types larger than 16 bytes, use the AVX2 tiled shuffle. */
692
0
    if (bytesoftype > sizeof(__m128i)) {
693
0
      shuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
694
0
    }
695
0
    else {
696
      /* Non-optimized shuffle */
697
0
      blosc_internal_shuffle_generic(bytesoftype, blocksize, _src, _dest);
698
      /* The non-optimized function covers the whole buffer,
699
         so we're done processing here. */
700
0
      return;
701
0
    }
702
0
  }
703
704
  /* If the buffer had any bytes at the end which couldn't be handled
705
     by the vectorized implementations, use the non-optimized version
706
     to finish them up. */
707
0
  if (vectorizable_bytes < blocksize) {
708
0
    shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
709
0
  }
710
0
}
711
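To make the rounding above concrete with hypothetical numbers: for bytesoftype = 4 and blocksize = 1000, vectorized_chunk_size is 4 * 32 = 128, so vectorizable_bytes = 1000 - (1000 % 128) = 896, vectorizable_elements = 224 and total_elements = 250; shuffle4_avx2 handles the first 896 bytes and shuffle_generic_inline finishes the remaining 104. Because shuffle and unshuffle are inverses for the same bytesoftype and blocksize, a quick self-check might look like the sketch below (assumes an AVX2-capable CPU and the declarations from shuffle-avx2.h):

/* Hypothetical round-trip check: shuffle a deliberately non-vector-aligned
 * block, unshuffle it, and verify the original bytes come back. */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static void
shuffle_roundtrip_check(void)
{
  enum { BLOCKSIZE = 1000, TYPESIZE = 4 };   /* 1000 = 7 * 128 + 104 */
  uint8_t src[BLOCKSIZE], shuffled[BLOCKSIZE], restored[BLOCKSIZE];
  int i;

  for (i = 0; i < BLOCKSIZE; i++)
    src[i] = (uint8_t)(i * 7 + 1);

  blosc_internal_shuffle_avx2(TYPESIZE, BLOCKSIZE, src, shuffled);
  blosc_internal_unshuffle_avx2(TYPESIZE, BLOCKSIZE, shuffled, restored);

  assert(memcmp(src, restored, BLOCKSIZE) == 0);
}
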
712
/* Unshuffle a block.  This can never fail. */
713
void
714
blosc_internal_unshuffle_avx2(const size_t bytesoftype, const size_t blocksize,
715
5.20k
                              const uint8_t* const _src, uint8_t* const _dest) {
716
5.20k
  const size_t vectorized_chunk_size = bytesoftype * sizeof(__m256i);
717
718
  /* If the block size is too small to be vectorized,
719
     use the generic implementation. */
720
5.20k
  if (blocksize < vectorized_chunk_size) {
721
3.11k
    blosc_internal_unshuffle_generic(bytesoftype, blocksize, _src, _dest);
722
3.11k
    return;
723
3.11k
  }
724
725
  /* If the blocksize is not a multiple of both the typesize and
726
     the vector size, round the blocksize down to the next value
727
     which is a multiple of both. The vectorized unshuffle can be
728
     used for that portion of the data, and the naive implementation
729
     can be used for the remaining portion. */
730
2.09k
  const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
731
732
2.09k
  const size_t vectorizable_elements = vectorizable_bytes / bytesoftype;
733
2.09k
  const size_t total_elements = blocksize / bytesoftype;
734
735
  /* Optimized unshuffle implementations */
736
2.09k
  switch (bytesoftype)
737
2.09k
  {
738
434
  case 2:
739
434
    unshuffle2_avx2(_dest, _src, vectorizable_elements, total_elements);
740
434
    break;
741
227
  case 4:
742
227
    unshuffle4_avx2(_dest, _src, vectorizable_elements, total_elements);
743
227
    break;
744
267
  case 8:
745
267
    unshuffle8_avx2(_dest, _src, vectorizable_elements, total_elements);
746
267
    break;
747
359
  case 16:
748
359
    unshuffle16_avx2(_dest, _src, vectorizable_elements, total_elements);
749
359
    break;
750
805
  default:
751
    /* For types larger than 16 bytes, use the AVX2 tiled unshuffle. */
752
805
    if (bytesoftype > sizeof(__m128i)) {
753
590
      unshuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
754
590
    }
755
215
    else {
756
      /* Non-optimized unshuffle */
757
215
      blosc_internal_unshuffle_generic(bytesoftype, blocksize, _src, _dest);
758
      /* The non-optimized function covers the whole buffer,
759
         so we're done processing here. */
760
215
      return;
761
215
    }
762
2.09k
  }
763
764
  /* If the buffer had any bytes at the end which couldn't be handled
765
     by the vectorized implementations, use the non-optimized version
766
     to finish them up. */
767
1.87k
  if (vectorizable_bytes < blocksize) {
768
902
    unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
769
902
  }
770
1.87k
}
771
772
#endif /* !defined(__AVX2__) */