Coverage Report

Created: 2023-12-08 06:59

/src/c-blosc/blosc/shuffle-sse2.c
All executable lines in this file report an execution count of 0; the file was not exercised by the run summarized here.

Line|Count|Source
   1|     |/*********************************************************************
   2|     |  Blosc - Blocked Shuffling and Compression Library
   3|     |
   4|     |  Author: Francesc Alted <francesc@blosc.org>
   5|     |
   6|     |  See LICENSE.txt for details about copyright and rights to use.
   7|     |**********************************************************************/
   8|     |
   9|     |#include "shuffle-generic.h"
  10|     |#include "shuffle-sse2.h"
  11|     |
  12|     |/* Define dummy functions if SSE2 is not available for the compilation target and compiler. */
  13|     |#if !defined(__SSE2__)
  14|     |
  15|     |void
  16|     |blosc_internal_shuffle_sse2(const size_t bytesoftype, const size_t blocksize,
  17|     |                            const uint8_t* const _src, uint8_t* const _dest) {
  18|     |  abort();
  19|     |}
  20|     |
  21|     |void
  22|     |blosc_internal_unshuffle_sse2(const size_t bytesoftype, const size_t blocksize,
  23|     |                              const uint8_t* const _src, uint8_t* const _dest) {
  24|     |  abort();
  25|     |}
  26|     |
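Note: the two stubs above exist only so that the SSE2 entry points still link when this translation unit is built without SSE2 support; a correctly configured build is expected never to call them. As a hedged illustration (not code from the Blosc sources, and not necessarily how Blosc's own dispatcher is wired), a caller could guard the SSE2 path at runtime with a compiler builtin:

  /* Hypothetical wrapper for illustration only; the blosc_internal_* functions
     are the real entry points declared in the headers included above. */
  static void shuffle_dispatch_example(size_t typesize, size_t blocksize,
                                       const uint8_t* src, uint8_t* dest) {
  #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    if (__builtin_cpu_supports("sse2")) {   /* runtime CPU feature check (GCC/Clang, x86) */
      blosc_internal_shuffle_sse2(typesize, blocksize, src, dest);
      return;
    }
  #endif
    blosc_internal_shuffle_generic(typesize, blocksize, src, dest);  /* portable fallback */
  }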
  27|     |# else /* defined(__SSE2__) */
  28|     |
  29|     |#include <emmintrin.h>
  30|     |
  31|     |
  32|     |/* The next is useful for debugging purposes */
  33|     |#if 0
  34|     |#include <stdio.h>
  35|     |#include <string.h>
  36|     |
  37|     |static void printxmm(__m128i xmm0)
  38|     |{
  39|     |  uint8_t buf[16];
  40|     |
  41|     |  ((__m128i *)buf)[0] = xmm0;
  42|     |  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
  43|     |          buf[0], buf[1], buf[2], buf[3],
  44|     |          buf[4], buf[5], buf[6], buf[7],
  45|     |          buf[8], buf[9], buf[10], buf[11],
  46|     |          buf[12], buf[13], buf[14], buf[15]);
  47|     |}
  48|     |#endif
  49|     |
  50|     |
  51|     |/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
  52|     |static void
  53|     |shuffle2_sse2(uint8_t* const dest, const uint8_t* const src,
  54|     |  const size_t vectorizable_elements, const size_t total_elements)
  55|    0|{
  56|    0|  static const size_t bytesoftype = 2;
  57|    0|  size_t j;
  58|    0|  int k;
  59|    0|  uint8_t* dest_for_jth_element;
  60|    0|  __m128i xmm0[2], xmm1[2];
  61|     |
  62|    0|  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
  63|     |    /* Fetch 16 elements (32 bytes) then transpose bytes, words and double words. */
  64|    0|    for (k = 0; k < 2; k++) {
  65|    0|      xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
  66|    0|      xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8);
  67|    0|      xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8);
  68|    0|      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
  69|    0|      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
  70|    0|      xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
  71|    0|      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
  72|    0|      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
  73|    0|      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
  74|    0|      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
  75|    0|    }
  76|     |    /* Transpose quad words */
  77|    0|    for (k = 0; k < 1; k++) {
  78|    0|      xmm1[k*2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k+1]);
  79|    0|      xmm1[k*2+1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k+1]);
  80|    0|    }
  81|     |    /* Store the result vectors */
  82|    0|    dest_for_jth_element = dest + j;
  83|    0|    for (k = 0; k < 2; k++) {
  84|    0|      _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm1[k]);
  85|    0|    }
  86|    0|  }
  87|    0|}
  88|     |
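For orientation, the transform implemented by shuffle2_sse2 above (and, with a different type size, by the other shuffleN_sse2 routines) is a byte transposition: byte k of element e is gathered into output stream k. A scalar sketch of the same transform under the same argument convention (illustrative only, not part of the measured file):

  static void shuffle2_scalar(uint8_t* const dest, const uint8_t* const src,
                              const size_t vectorizable_elements, const size_t total_elements) {
    size_t e, k;
    for (e = 0; e < vectorizable_elements; e++) {
      for (k = 0; k < 2; k++) {
        /* byte k of element e goes to stream k, position e */
        dest[k * total_elements + e] = src[e * 2 + k];
      }
    }
  }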
  89|     |/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
  90|     |static void
  91|     |shuffle4_sse2(uint8_t* const dest, const uint8_t* const src,
  92|     |  const size_t vectorizable_elements, const size_t total_elements)
  93|    0|{
  94|    0|  static const size_t bytesoftype = 4;
  95|    0|  size_t i;
  96|    0|  int j;
  97|    0|  uint8_t* dest_for_ith_element;
  98|    0|  __m128i xmm0[4], xmm1[4];
  99|     |
 100|    0|  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
 101|     |    /* Fetch 16 elements (64 bytes) then transpose bytes and words. */
 102|    0|    for (j = 0; j < 4; j++) {
 103|    0|      xmm0[j] = _mm_loadu_si128((__m128i*)(src + (i * bytesoftype) + (j * sizeof(__m128i))));
 104|    0|      xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0xd8);
 105|    0|      xmm0[j] = _mm_shuffle_epi32(xmm0[j], 0x8d);
 106|    0|      xmm0[j] = _mm_unpacklo_epi8(xmm1[j], xmm0[j]);
 107|    0|      xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0x04e);
 108|    0|      xmm0[j] = _mm_unpacklo_epi16(xmm0[j], xmm1[j]);
 109|    0|    }
 110|     |    /* Transpose double words */
 111|    0|    for (j = 0; j < 2; j++) {
 112|    0|      xmm1[j*2] = _mm_unpacklo_epi32(xmm0[j*2], xmm0[j*2+1]);
 113|    0|      xmm1[j*2+1] = _mm_unpackhi_epi32(xmm0[j*2], xmm0[j*2+1]);
 114|    0|    }
 115|     |    /* Transpose quad words */
 116|    0|    for (j = 0; j < 2; j++) {
 117|    0|      xmm0[j*2] = _mm_unpacklo_epi64(xmm1[j], xmm1[j+2]);
 118|    0|      xmm0[j*2+1] = _mm_unpackhi_epi64(xmm1[j], xmm1[j+2]);
 119|    0|    }
 120|     |    /* Store the result vectors */
 121|    0|    dest_for_ith_element = dest + i;
 122|    0|    for (j = 0; j < 4; j++) {
 123|    0|      _mm_storeu_si128((__m128i*)(dest_for_ith_element + (j * total_elements)), xmm0[j]);
 124|    0|    }
 125|    0|  }
 126|    0|}
 127|     |
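The magic immediates above are ordinary _mm_shuffle_epi32-style selectors: 0xd8 (binary 11 01 10 00) picks lanes 0, 2, 1, 3 (evens then odds), 0x4e picks lanes 2, 3, 0, 1 (swaps the two 64-bit halves), and 0x8d picks lanes 1, 3, 0, 2. A small stand-alone check of the first two, for illustration only:

  #include <assert.h>
  #include <emmintrin.h>

  int main(void) {
    __m128i v = _mm_set_epi32(3, 2, 1, 0);                 /* lanes: v[0]=0, v[1]=1, v[2]=2, v[3]=3 */
    __m128i evens_then_odds = _mm_shuffle_epi32(v, 0xd8);  /* lanes become 0, 2, 1, 3 */
    __m128i swapped_halves  = _mm_shuffle_epi32(v, 0x4e);  /* lanes become 2, 3, 0, 1 */
    assert(_mm_cvtsi128_si32(evens_then_odds) == 0);                               /* lane 0 */
    assert(_mm_cvtsi128_si32(_mm_shuffle_epi32(evens_then_odds, 0x55)) == 2);      /* lane 1 */
    assert(_mm_cvtsi128_si32(swapped_halves) == 2);                                /* lane 0 */
    return 0;
  }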
 128|     |/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
 129|     |static void
 130|     |shuffle8_sse2(uint8_t* const dest, const uint8_t* const src,
 131|     |  const size_t vectorizable_elements, const size_t total_elements)
 132|    0|{
 133|    0|  static const size_t bytesoftype = 8;
 134|    0|  size_t j;
 135|    0|  int k, l;
 136|    0|  uint8_t* dest_for_jth_element;
 137|    0|  __m128i xmm0[8], xmm1[8];
 138|     |
 139|    0|  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
 140|     |    /* Fetch 16 elements (128 bytes) then transpose bytes. */
 141|    0|    for (k = 0; k < 8; k++) {
 142|    0|      xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
 143|    0|      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
 144|    0|      xmm1[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
 145|    0|    }
 146|     |    /* Transpose words */
 147|    0|    for (k = 0, l = 0; k < 4; k++, l +=2) {
 148|    0|      xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+1]);
 149|    0|      xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+1]);
 150|    0|    }
 151|     |    /* Transpose double words */
 152|    0|    for (k = 0, l = 0; k < 4; k++, l++) {
 153|    0|      if (k == 2) l += 2;
 154|    0|      xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+2]);
 155|    0|      xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+2]);
 156|    0|    }
 157|     |    /* Transpose quad words */
 158|    0|    for (k = 0; k < 4; k++) {
 159|    0|      xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+4]);
 160|    0|      xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+4]);
 161|    0|    }
 162|     |    /* Store the result vectors */
 163|    0|    dest_for_jth_element = dest + j;
 164|    0|    for (k = 0; k < 8; k++) {
 165|    0|      _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]);
 166|    0|    }
 167|    0|  }
 168|    0|}
 169|     |
 170|     |/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
 171|     |static void
 172|     |shuffle16_sse2(uint8_t* const dest, const uint8_t* const src,
 173|     |  const size_t vectorizable_elements, const size_t total_elements)
 174|    0|{
 175|    0|  static const size_t bytesoftype = 16;
 176|    0|  size_t j;
 177|    0|  int k, l;
 178|    0|  uint8_t* dest_for_jth_element;
 179|    0|  __m128i xmm0[16], xmm1[16];
 180|     |
 181|    0|  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
 182|     |    /* Fetch 16 elements (256 bytes). */
 183|    0|    for (k = 0; k < 16; k++) {
 184|    0|      xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
 185|    0|    }
 186|     |    /* Transpose bytes */
 187|    0|    for (k = 0, l = 0; k < 8; k++, l +=2) {
 188|    0|      xmm1[k*2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l+1]);
 189|    0|      xmm1[k*2+1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l+1]);
 190|    0|    }
 191|     |    /* Transpose words */
 192|    0|    for (k = 0, l = -2; k < 8; k++, l++) {
 193|    0|      if ((k%2) == 0) l += 2;
 194|    0|      xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+2]);
 195|    0|      xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+2]);
 196|    0|    }
 197|     |    /* Transpose double words */
 198|    0|    for (k = 0, l = -4; k < 8; k++, l++) {
 199|    0|      if ((k%4) == 0) l += 4;
 200|    0|      xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+4]);
 201|    0|      xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+4]);
 202|    0|    }
 203|     |    /* Transpose quad words */
 204|    0|    for (k = 0; k < 8; k++) {
 205|    0|      xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+8]);
 206|    0|      xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+8]);
 207|    0|    }
 208|     |    /* Store the result vectors */
 209|    0|    dest_for_jth_element = dest + j;
 210|    0|    for (k = 0; k < 16; k++) {
 211|    0|      _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]);
 212|    0|    }
 213|    0|  }
 214|    0|}
 215|     |
 216|     |/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */
 217|     |static void
 218|     |shuffle16_tiled_sse2(uint8_t* const dest, const uint8_t* const src,
 219|     |  const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype)
 220|    0|{
 221|    0|  size_t j;
 222|    0|  const size_t vecs_per_el_rem = bytesoftype % sizeof(__m128i);
 223|    0|  int k, l;
 224|    0|  uint8_t* dest_for_jth_element;
 225|    0|  __m128i xmm0[16], xmm1[16];
 226|     |
 227|    0|  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
 228|     |    /* Advance the offset into the type by the vector size (in bytes), unless this is
 229|     |    the initial iteration and the type size is not a multiple of the vector size.
 230|     |    In that case, only advance by the number of bytes necessary so that the number
 231|     |    of remaining bytes in the type will be a multiple of the vector size. */
 232|    0|    size_t offset_into_type;
 233|    0|    for (offset_into_type = 0; offset_into_type < bytesoftype;
 234|    0|      offset_into_type += (offset_into_type == 0 && vecs_per_el_rem > 0 ? vecs_per_el_rem : sizeof(__m128i))) {
 235|     |
 236|     |      /* Fetch elements in groups of 256 bytes */
 237|    0|      const uint8_t* const src_with_offset = src + offset_into_type;
 238|    0|      for (k = 0; k < 16; k++) {
 239|    0|        xmm0[k] = _mm_loadu_si128((__m128i*)(src_with_offset + (j + k) * bytesoftype));
 240|    0|      }
 241|     |      /* Transpose bytes */
 242|    0|      for (k = 0, l = 0; k < 8; k++, l +=2) {
 243|    0|        xmm1[k*2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l+1]);
 244|    0|        xmm1[k*2+1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l+1]);
 245|    0|      }
 246|     |      /* Transpose words */
 247|    0|      for (k = 0, l = -2; k < 8; k++, l++) {
 248|    0|        if ((k%2) == 0) l += 2;
 249|    0|        xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+2]);
 250|    0|        xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+2]);
 251|    0|      }
 252|     |      /* Transpose double words */
 253|    0|      for (k = 0, l = -4; k < 8; k++, l++) {
 254|    0|        if ((k%4) == 0) l += 4;
 255|    0|        xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+4]);
 256|    0|        xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+4]);
 257|    0|      }
 258|     |      /* Transpose quad words */
 259|    0|      for (k = 0; k < 8; k++) {
 260|    0|        xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+8]);
 261|    0|        xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+8]);
 262|    0|      }
 263|     |      /* Store the result vectors */
 264|    0|      dest_for_jth_element = dest + j;
 265|    0|      for (k = 0; k < 16; k++) {
 266|    0|        _mm_storeu_si128((__m128i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), xmm0[k]);
 267|    0|      }
 268|    0|    }
 269|    0|  }
 270|    0|}
 271|     |
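Worked example of the offset stepping described in the comment at lines 228-231: for bytesoftype = 24, vecs_per_el_rem = 24 % 16 = 8, so offset_into_type takes the values 0 and then 0 + 8 = 8 (the next increment, 8 + 16 = 24, ends the loop). Each group of 16 elements is therefore processed in two 16-byte passes covering bytes [0, 16) and [8, 24) of the type, so every pass handles a full vector's worth of bytes.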
 272|     |/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
 273|     |static void
 274|     |unshuffle2_sse2(uint8_t* const dest, const uint8_t* const src,
 275|     |  const size_t vectorizable_elements, const size_t total_elements)
 276|    0|{
 277|    0|  static const size_t bytesoftype = 2;
 278|    0|  size_t i;
 279|    0|  int j;
 280|    0|  __m128i xmm0[2], xmm1[2];
 281|     |
 282|    0|  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
 283|     |    /* Load 16 elements (32 bytes) into 2 XMM registers. */
 284|    0|    const uint8_t* const src_for_ith_element = src + i;
 285|    0|    for (j = 0; j < 2; j++) {
 286|    0|      xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
 287|    0|    }
 288|     |    /* Shuffle bytes */
 289|     |    /* Compute the low 32 bytes */
 290|    0|    xmm1[0] = _mm_unpacklo_epi8(xmm0[0], xmm0[1]);
 291|     |    /* Compute the hi 32 bytes */
 292|    0|    xmm1[1] = _mm_unpackhi_epi8(xmm0[0], xmm0[1]);
 293|     |    /* Store the result vectors in proper order */
 294|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]);
 295|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[1]);
 296|    0|  }
 297|    0|}
 298|     |
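unshuffle2_sse2 above inverts the transform shown earlier: byte stream k is scattered back into byte k of each element. The scalar equivalent under the same argument convention (an illustrative sketch, not part of the measured file):

  static void unshuffle2_scalar(uint8_t* const dest, const uint8_t* const src,
                                const size_t vectorizable_elements, const size_t total_elements) {
    size_t e, k;
    for (e = 0; e < vectorizable_elements; e++) {
      for (k = 0; k < 2; k++) {
        /* position e of stream k goes back to byte k of element e */
        dest[e * 2 + k] = src[k * total_elements + e];
      }
    }
  }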
 299|     |/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
 300|     |static void
 301|     |unshuffle4_sse2(uint8_t* const dest, const uint8_t* const src,
 302|     |  const size_t vectorizable_elements, const size_t total_elements)
 303|    0|{
 304|    0|  static const size_t bytesoftype = 4;
 305|    0|  size_t i;
 306|    0|  int j;
 307|    0|  __m128i xmm0[4], xmm1[4];
 308|     |
 309|    0|  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
 310|     |    /* Load 16 elements (64 bytes) into 4 XMM registers. */
 311|    0|    const uint8_t* const src_for_ith_element = src + i;
 312|    0|    for (j = 0; j < 4; j++) {
 313|    0|      xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
 314|    0|    }
 315|     |    /* Shuffle bytes */
 316|    0|    for (j = 0; j < 2; j++) {
 317|     |      /* Compute the low 32 bytes */
 318|    0|      xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]);
 319|     |      /* Compute the hi 32 bytes */
 320|    0|      xmm1[2+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]);
 321|    0|    }
 322|     |    /* Shuffle 2-byte words */
 323|    0|    for (j = 0; j < 2; j++) {
 324|     |      /* Compute the low 32 bytes */
 325|    0|      xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]);
 326|     |      /* Compute the hi 32 bytes */
 327|    0|      xmm0[2+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]);
 328|    0|    }
 329|     |    /* Store the result vectors in proper order */
 330|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm0[0]);
 331|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm0[2]);
 332|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm0[1]);
 333|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm0[3]);
 334|    0|  }
 335|    0|}
 336|     |
 337|     |/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
 338|     |static void
 339|     |unshuffle8_sse2(uint8_t* const dest, const uint8_t* const src,
 340|     |  const size_t vectorizable_elements, const size_t total_elements)
 341|    0|{
 342|    0|  static const size_t bytesoftype = 8;
 343|    0|  size_t i;
 344|    0|  int j;
 345|    0|  __m128i xmm0[8], xmm1[8];
 346|     |
 347|    0|  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
 348|     |    /* Load 16 elements (128 bytes) into 8 XMM registers. */
 349|    0|    const uint8_t* const src_for_ith_element = src + i;
 350|    0|    for (j = 0; j < 8; j++) {
 351|    0|      xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
 352|    0|    }
 353|     |    /* Shuffle bytes */
 354|    0|    for (j = 0; j < 4; j++) {
 355|     |      /* Compute the low 32 bytes */
 356|    0|      xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]);
 357|     |      /* Compute the hi 32 bytes */
 358|    0|      xmm1[4+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]);
 359|    0|    }
 360|     |    /* Shuffle 2-byte words */
 361|    0|    for (j = 0; j < 4; j++) {
 362|     |      /* Compute the low 32 bytes */
 363|    0|      xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]);
 364|     |      /* Compute the hi 32 bytes */
 365|    0|      xmm0[4+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]);
 366|    0|    }
 367|     |    /* Shuffle 4-byte dwords */
 368|    0|    for (j = 0; j < 4; j++) {
 369|     |      /* Compute the low 32 bytes */
 370|    0|      xmm1[j] = _mm_unpacklo_epi32(xmm0[j*2], xmm0[j*2+1]);
 371|     |      /* Compute the hi 32 bytes */
 372|    0|      xmm1[4+j] = _mm_unpackhi_epi32(xmm0[j*2], xmm0[j*2+1]);
 373|    0|    }
 374|     |    /* Store the result vectors in proper order */
 375|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]);
 376|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[4]);
 377|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[2]);
 378|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[6]);
 379|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), xmm1[1]);
 380|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[5]);
 381|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[3]);
 382|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[7]);
 383|    0|  }
 384|    0|}
 385|     |
 386|     |/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
 387|     |static void
 388|     |unshuffle16_sse2(uint8_t* const dest, const uint8_t* const src,
 389|     |  const size_t vectorizable_elements, const size_t total_elements)
 390|    0|{
 391|    0|  static const size_t bytesoftype = 16;
 392|    0|  size_t i;
 393|    0|  int j;
 394|    0|  __m128i xmm1[16], xmm2[16];
 395|     |
 396|    0|  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
 397|     |    /* Load 16 elements (256 bytes) into 16 XMM registers. */
 398|    0|    const uint8_t* const src_for_ith_element = src + i;
 399|    0|    for (j = 0; j < 16; j++) {
 400|    0|      xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
 401|    0|    }
 402|     |    /* Shuffle bytes */
 403|    0|    for (j = 0; j < 8; j++) {
 404|     |      /* Compute the low 32 bytes */
 405|    0|      xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]);
 406|     |      /* Compute the hi 32 bytes */
 407|    0|      xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]);
 408|    0|    }
 409|     |    /* Shuffle 2-byte words */
 410|    0|    for (j = 0; j < 8; j++) {
 411|     |      /* Compute the low 32 bytes */
 412|    0|      xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]);
 413|     |      /* Compute the hi 32 bytes */
 414|    0|      xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]);
 415|    0|    }
 416|     |    /* Shuffle 4-byte dwords */
 417|    0|    for (j = 0; j < 8; j++) {
 418|     |      /* Compute the low 32 bytes */
 419|    0|      xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]);
 420|     |      /* Compute the hi 32 bytes */
 421|    0|      xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]);
 422|    0|    }
 423|     |    /* Shuffle 8-byte qwords */
 424|    0|    for (j = 0; j < 8; j++) {
 425|     |      /* Compute the low 32 bytes */
 426|    0|      xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]);
 427|     |      /* Compute the hi 32 bytes */
 428|    0|      xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]);
 429|    0|    }
 430|     |
 431|     |    /* Store the result vectors in proper order */
 432|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]);
 433|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[8]);
 434|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[4]);
 435|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[12]);
 436|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), xmm1[2]);
 437|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[10]);
 438|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[6]);
 439|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[14]);
 440|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (8 * sizeof(__m128i))), xmm1[1]);
 441|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (9 * sizeof(__m128i))), xmm1[9]);
 442|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (10 * sizeof(__m128i))), xmm1[5]);
 443|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (11 * sizeof(__m128i))), xmm1[13]);
 444|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (12 * sizeof(__m128i))), xmm1[3]);
 445|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (13 * sizeof(__m128i))), xmm1[11]);
 446|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (14 * sizeof(__m128i))), xmm1[7]);
 447|    0|    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (15 * sizeof(__m128i))), xmm1[15]);
 448|    0|  }
 449|    0|}
 450|     |
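The "proper order" in the stores above is the bit-reversed index order that falls out of the log2(typesize) rounds of unpacking: for the 16-byte case, destination block p is taken from xmm1[bitreverse4(p)], which is why the sequence runs 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15; the 8-byte routine's order 0, 4, 2, 6, 1, 5, 3, 7 is the 3-bit analogue.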
 451|     |/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
 452|     |static void
 453|     |unshuffle16_tiled_sse2(uint8_t* const dest, const uint8_t* const orig,
 454|     |  const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype)
 455|    0|{
 456|    0|  size_t i;
 457|    0|  const size_t vecs_per_el_rem = bytesoftype % sizeof(__m128i);
 458|     |
 459|    0|  int j;
 460|    0|  uint8_t* dest_with_offset;
 461|    0|  __m128i xmm1[16], xmm2[16];
 462|     |
 463|     |  /* The unshuffle loops are inverted (compared to shuffle16_tiled_sse2)
 464|     |     to optimize cache utilization. */
 465|    0|  size_t offset_into_type;
 466|    0|  for (offset_into_type = 0; offset_into_type < bytesoftype;
 467|    0|    offset_into_type += (offset_into_type == 0 && vecs_per_el_rem > 0 ? vecs_per_el_rem : sizeof(__m128i))) {
 468|    0|    for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
 469|     |      /* Load the first 128 bytes in 16 XMM registers */
 470|    0|      const uint8_t* const src_for_ith_element = orig + i;
 471|    0|      for (j = 0; j < 16; j++) {
 472|    0|        xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (total_elements * (offset_into_type + j))));
 473|    0|      }
 474|     |      /* Shuffle bytes */
 475|    0|      for (j = 0; j < 8; j++) {
 476|     |        /* Compute the low 32 bytes */
 477|    0|        xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]);
 478|     |        /* Compute the hi 32 bytes */
 479|    0|        xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]);
 480|    0|      }
 481|     |      /* Shuffle 2-byte words */
 482|    0|      for (j = 0; j < 8; j++) {
 483|     |        /* Compute the low 32 bytes */
 484|    0|        xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]);
 485|     |        /* Compute the hi 32 bytes */
 486|    0|        xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]);
 487|    0|      }
 488|     |      /* Shuffle 4-byte dwords */
 489|    0|      for (j = 0; j < 8; j++) {
 490|     |        /* Compute the low 32 bytes */
 491|    0|        xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]);
 492|     |        /* Compute the hi 32 bytes */
 493|    0|        xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]);
 494|    0|      }
 495|     |      /* Shuffle 8-byte qwords */
 496|    0|      for (j = 0; j < 8; j++) {
 497|     |        /* Compute the low 32 bytes */
 498|    0|        xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]);
 499|     |        /* Compute the hi 32 bytes */
 500|    0|        xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]);
 501|    0|      }
 502|     |
 503|     |      /* Store the result vectors in proper order */
 504|    0|      dest_with_offset = dest + offset_into_type;
 505|    0|      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 0) * bytesoftype), xmm1[0]);
 506|    0|      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 1) * bytesoftype), xmm1[8]);
 507|    0|      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 2) * bytesoftype), xmm1[4]);
 508|    0|      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 3) * bytesoftype), xmm1[12]);
 509|    0|      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 4) * bytesoftype), xmm1[2]);
 510|    0|      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 5) * bytesoftype), xmm1[10]);
 511|    0|      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 6) * bytesoftype), xmm1[6]);
 512|    0|      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 7) * bytesoftype), xmm1[14]);
 513|    0|      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 8) * bytesoftype), xmm1[1]);
 514|    0|      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 9) * bytesoftype), xmm1[9]);
 515|    0|      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 10) * bytesoftype), xmm1[5]);
 516|    0|      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 11) * bytesoftype), xmm1[13]);
 517|    0|      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 12) * bytesoftype), xmm1[3]);
 518|    0|      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 13) * bytesoftype), xmm1[11]);
 519|    0|      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 14) * bytesoftype), xmm1[7]);
 520|    0|      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 15) * bytesoftype), xmm1[15]);
 521|    0|    }
 522|    0|  }
 523|    0|}
 524|     |
 525|     |/* Shuffle a block.  This can never fail. */
 526|     |void
 527|     |blosc_internal_shuffle_sse2(const size_t bytesoftype, const size_t blocksize,
 528|    0|                            const uint8_t* const _src, uint8_t* const _dest) {
 529|    0|  const size_t vectorized_chunk_size = bytesoftype * sizeof(__m128i);
 530|     |  /* If the blocksize is not a multiple of both the typesize and
 531|     |     the vector size, round the blocksize down to the next value
 532|     |     which is a multiple of both. The vectorized shuffle can be
 533|     |     used for that portion of the data, and the naive implementation
 534|     |     can be used for the remaining portion. */
 535|    0|  const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
 536|    0|  const size_t vectorizable_elements = vectorizable_bytes / bytesoftype;
 537|    0|  const size_t total_elements = blocksize / bytesoftype;
 538|     |
 539|     |  /* If the block size is too small to be vectorized,
 540|     |     use the generic implementation. */
 541|    0|  if (blocksize < vectorized_chunk_size) {
 542|    0|    blosc_internal_shuffle_generic(bytesoftype, blocksize, _src, _dest);
 543|    0|    return;
 544|    0|  }
 545|     |
 546|     |  /* Optimized shuffle implementations */
 547|    0|  switch (bytesoftype)
 548|    0|  {
 549|    0|  case 2:
 550|    0|    shuffle2_sse2(_dest, _src, vectorizable_elements, total_elements);
 551|    0|    break;
 552|    0|  case 4:
 553|    0|    shuffle4_sse2(_dest, _src, vectorizable_elements, total_elements);
 554|    0|    break;
 555|    0|  case 8:
 556|    0|    shuffle8_sse2(_dest, _src, vectorizable_elements, total_elements);
 557|    0|    break;
 558|    0|  case 16:
 559|    0|    shuffle16_sse2(_dest, _src, vectorizable_elements, total_elements);
 560|    0|    break;
 561|    0|  default:
 562|    0|    if (bytesoftype > sizeof(__m128i)) {
 563|    0|      shuffle16_tiled_sse2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
 564|    0|    }
 565|    0|    else {
 566|     |      /* Non-optimized shuffle */
 567|    0|      blosc_internal_shuffle_generic(bytesoftype, blocksize, _src, _dest);
 568|     |      /* The non-optimized function covers the whole buffer,
 569|     |         so we're done processing here. */
 570|    0|      return;
 571|    0|    }
 572|    0|  }
 573|     |
 574|     |  /* If the buffer had any bytes at the end which couldn't be handled
 575|     |     by the vectorized implementations, use the non-optimized version
 576|     |     to finish them up. */
 577|    0|  if (vectorizable_bytes < blocksize) {
 578|    0|    shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
 579|    0|  }
 580|    0|}
 581|     |
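Worked example of the setup arithmetic above: with bytesoftype = 4 and blocksize = 1000, vectorized_chunk_size = 4 * 16 = 64, vectorizable_bytes = 1000 - (1000 % 64) = 960, vectorizable_elements = 960 / 4 = 240 and total_elements = 1000 / 4 = 250; shuffle4_sse2 handles the first 960 bytes and shuffle_generic_inline finishes the trailing 40 bytes.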
 582|     |/* Unshuffle a block.  This can never fail. */
 583|     |void
 584|     |blosc_internal_unshuffle_sse2(const size_t bytesoftype, const size_t blocksize,
 585|    0|                              const uint8_t* const _src, uint8_t* const _dest) {
 586|    0|  const size_t vectorized_chunk_size = bytesoftype * sizeof(__m128i);
 587|     |  /* If the blocksize is not a multiple of both the typesize and
 588|     |     the vector size, round the blocksize down to the next value
 589|     |     which is a multiple of both. The vectorized unshuffle can be
 590|     |     used for that portion of the data, and the naive implementation
 591|     |     can be used for the remaining portion. */
 592|    0|  const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
 593|    0|  const size_t vectorizable_elements = vectorizable_bytes / bytesoftype;
 594|    0|  const size_t total_elements = blocksize / bytesoftype;
 595|     |
 596|     |
 597|     |  /* If the block size is too small to be vectorized,
 598|     |     use the generic implementation. */
 599|    0|  if (blocksize < vectorized_chunk_size) {
 600|    0|    blosc_internal_unshuffle_generic(bytesoftype, blocksize, _src, _dest);
 601|    0|    return;
 602|    0|  }
 603|     |
 604|     |  /* Optimized unshuffle implementations */
 605|    0|  switch (bytesoftype)
 606|    0|  {
 607|    0|  case 2:
 608|    0|    unshuffle2_sse2(_dest, _src, vectorizable_elements, total_elements);
 609|    0|    break;
 610|    0|  case 4:
 611|    0|    unshuffle4_sse2(_dest, _src, vectorizable_elements, total_elements);
 612|    0|    break;
 613|    0|  case 8:
 614|    0|    unshuffle8_sse2(_dest, _src, vectorizable_elements, total_elements);
 615|    0|    break;
 616|    0|  case 16:
 617|    0|    unshuffle16_sse2(_dest, _src, vectorizable_elements, total_elements);
 618|    0|    break;
 619|    0|  default:
 620|    0|    if (bytesoftype > sizeof(__m128i)) {
 621|    0|      unshuffle16_tiled_sse2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
 622|    0|    }
 623|    0|    else {
 624|     |      /* Non-optimized unshuffle */
 625|    0|      blosc_internal_unshuffle_generic(bytesoftype, blocksize, _src, _dest);
 626|     |      /* The non-optimized function covers the whole buffer,
 627|     |         so we're done processing here. */
 628|    0|      return;
 629|    0|    }
 630|    0|  }
 631|     |
 632|     |  /* If the buffer had any bytes at the end which couldn't be handled
 633|     |     by the vectorized implementations, use the non-optimized version
 634|     |     to finish them up. */
 635|    0|  if (vectorizable_bytes < blocksize) {
 636|    0|    unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
 637|    0|  }
 638|    0|}
 639|     |
 640|     |#endif /* !defined(__SSE2__) */
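Given that every executable line above reports a count of 0, a minimal harness along the following lines would be enough to exercise both entry points (hypothetical test code, not part of the measured file; it assumes an SSE2-enabled build in which shuffle-sse2.h declares the two functions):

  #include <assert.h>
  #include <stdint.h>
  #include <string.h>
  #include "shuffle-sse2.h"

  int main(void) {
    enum { TYPESIZE = 4, BLOCKSIZE = 4096 };
    static uint8_t src[BLOCKSIZE], shuffled[BLOCKSIZE], restored[BLOCKSIZE];
    int i;
    for (i = 0; i < BLOCKSIZE; i++)
      src[i] = (uint8_t)(i * 131 + 7);              /* arbitrary but deterministic pattern */

    blosc_internal_shuffle_sse2(TYPESIZE, BLOCKSIZE, src, shuffled);
    blosc_internal_unshuffle_sse2(TYPESIZE, BLOCKSIZE, shuffled, restored);

    assert(memcmp(src, restored, BLOCKSIZE) == 0);  /* shuffle followed by unshuffle must round-trip */
    return 0;
  }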