Coverage Report

Created: 2024-07-27 06:19

/src/c-blosc2/blosc/shuffle-sse2.c
Source listing (the report's per-line Line/Count columns are omitted below; every instrumented line in this file reported an execution count of 0, i.e. none of the SSE2 routines were exercised by this run).
/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
  https://blosc.org
  License: BSD 3-Clause (see LICENSE.txt)

  See LICENSE.txt for details about copyright and rights to use.
**********************************************************************/

#include "shuffle-sse2.h"
#include "shuffle-generic.h"
#include <stdlib.h>

/* Make sure SSE2 is available for the compilation target and compiler. */
#if defined(__SSE2__)

#include <emmintrin.h>

#include <stdint.h>

/* The next is useful for debugging purposes */
#if 0
#include <stdio.h>
#include <string.h>

static void printxmm(__m128i xmm0)
{
  uint8_t buf[16];

  ((__m128i *)buf)[0] = xmm0;
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
          buf[0], buf[1], buf[2], buf[3],
          buf[4], buf[5], buf[6], buf[7],
          buf[8], buf[9], buf[10], buf[11],
          buf[12], buf[13], buf[14], buf[15]);
}
#endif

/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void
shuffle2_sse2(uint8_t* const dest, const uint8_t* const src,
              const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 2;
  int32_t j;
  int k;
  uint8_t* dest_for_jth_element;
  __m128i xmm0[2], xmm1[2];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
    /* Fetch 16 elements (32 bytes) then transpose bytes, words and double words. */
    for (k = 0; k < 2; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
      xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
    }
    /* Transpose quad words */
    for (k = 0; k < 1; k++) {
      xmm1[k * 2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k + 1]);
      xmm1[k * 2 + 1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k + 1]);
    }
    /* Store the result vectors */
    dest_for_jth_element = dest + j;
    for (k = 0; k < 2; k++) {
      _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm1[k]);
    }
  }
}

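For reference, the transform shuffle2_sse2 implements is the same one the generic shuffle performs for a 2-byte type: byte 0 of every element is gathered into the first output plane and byte 1 into the second. A minimal scalar sketch of that mapping (editorial illustration only, not part of the library; the name shuffle2_scalar_ref is hypothetical):

#include <stdint.h>

/* Scalar reference: gather byte b of every 2-byte element into plane b. */
static void shuffle2_scalar_ref(uint8_t* dest, const uint8_t* src,
                                int32_t total_elements) {
  for (int32_t i = 0; i < total_elements; i++) {
    dest[i] = src[2 * i];                      /* low bytes  -> first plane  */
    dest[total_elements + i] = src[2 * i + 1]; /* high bytes -> second plane */
  }
}

The SSE2 routine above produces exactly this layout for the vectorizable portion, sixteen elements per iteration.
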
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void
shuffle4_sse2(uint8_t* const dest, const uint8_t* const src,
              const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 4;
  int32_t i;
  int j;
  uint8_t* dest_for_ith_element;
  __m128i xmm0[4], xmm1[4];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
    /* Fetch 16 elements (64 bytes) then transpose bytes and words. */
    for (j = 0; j < 4; j++) {
      xmm0[j] = _mm_loadu_si128((__m128i*)(src + (i * bytesoftype) + (j * sizeof(__m128i))));
      xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0xd8);
      xmm0[j] = _mm_shuffle_epi32(xmm0[j], 0x8d);
      xmm0[j] = _mm_unpacklo_epi8(xmm1[j], xmm0[j]);
      xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0x04e);
      xmm0[j] = _mm_unpacklo_epi16(xmm0[j], xmm1[j]);
    }
    /* Transpose double words */
    for (j = 0; j < 2; j++) {
      xmm1[j * 2] = _mm_unpacklo_epi32(xmm0[j * 2], xmm0[j * 2 + 1]);
      xmm1[j * 2 + 1] = _mm_unpackhi_epi32(xmm0[j * 2], xmm0[j * 2 + 1]);
    }
    /* Transpose quad words */
    for (j = 0; j < 2; j++) {
      xmm0[j * 2] = _mm_unpacklo_epi64(xmm1[j], xmm1[j + 2]);
      xmm0[j * 2 + 1] = _mm_unpackhi_epi64(xmm1[j], xmm1[j + 2]);
    }
    /* Store the result vectors */
    dest_for_ith_element = dest + i;
    for (j = 0; j < 4; j++) {
      _mm_storeu_si128((__m128i*)(dest_for_ith_element + (j * total_elements)), xmm0[j]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
static void
shuffle8_sse2(uint8_t* const dest, const uint8_t* const src,
              const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 8;
  int32_t j;
  int k, l;
  uint8_t* dest_for_jth_element;
  __m128i xmm0[8], xmm1[8];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
    /* Fetch 16 elements (128 bytes) then transpose bytes. */
    for (k = 0; k < 8; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm1[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
    }
    /* Transpose words */
    for (k = 0, l = 0; k < 4; k++, l += 2) {
      xmm0[k * 2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l + 1]);
      xmm0[k * 2 + 1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l + 1]);
    }
    /* Transpose double words */
    for (k = 0, l = 0; k < 4; k++, l++) {
      if (k == 2) l += 2;
      xmm1[k * 2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l + 2]);
      xmm1[k * 2 + 1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l + 2]);
    }
    /* Transpose quad words */
    for (k = 0; k < 4; k++) {
      xmm0[k * 2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k + 4]);
      xmm0[k * 2 + 1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k + 4]);
    }
    /* Store the result vectors */
    dest_for_jth_element = dest + j;
    for (k = 0; k < 8; k++) {
      _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
static void
shuffle16_sse2(uint8_t* const dest, const uint8_t* const src,
               const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 16;
  int32_t j;
  int k, l;
  uint8_t* dest_for_jth_element;
  __m128i xmm0[16], xmm1[16];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
    /* Fetch 16 elements (256 bytes). */
    for (k = 0; k < 16; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
    }
    /* Transpose bytes */
    for (k = 0, l = 0; k < 8; k++, l += 2) {
      xmm1[k * 2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l + 1]);
      xmm1[k * 2 + 1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l + 1]);
    }
    /* Transpose words */
    for (k = 0, l = -2; k < 8; k++, l++) {
      if ((k % 2) == 0) l += 2;
      xmm0[k * 2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l + 2]);
      xmm0[k * 2 + 1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l + 2]);
    }
    /* Transpose double words */
    for (k = 0, l = -4; k < 8; k++, l++) {
      if ((k % 4) == 0) l += 4;
      xmm1[k * 2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l + 4]);
      xmm1[k * 2 + 1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l + 4]);
    }
    /* Transpose quad words */
    for (k = 0; k < 8; k++) {
      xmm0[k * 2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k + 8]);
      xmm0[k * 2 + 1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k + 8]);
    }
    /* Store the result vectors */
    dest_for_jth_element = dest + j;
    for (k = 0; k < 16; k++) {
      _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */
static void
shuffle16_tiled_sse2(uint8_t* const dest, const uint8_t* const src,
                     const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) {
  int32_t j;
  const int32_t vecs_per_el_rem = bytesoftype % (int32_t)sizeof(__m128i);
  int k, l;
  uint8_t* dest_for_jth_element;
  __m128i xmm0[16], xmm1[16];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
    /* Advance the offset into the type by the vector size (in bytes), unless this is
       the initial iteration and the type size is not a multiple of the vector size.
       In that case, only advance by the number of bytes necessary so that the number
       of remaining bytes in the type will be a multiple of the vector size. */
    int32_t offset_into_type;
    for (offset_into_type = 0; offset_into_type < bytesoftype;
         offset_into_type += (offset_into_type == 0 &&
                              vecs_per_el_rem > 0 ? vecs_per_el_rem : (int32_t)sizeof(__m128i))) {

      /* Fetch elements in groups of 256 bytes */
      const uint8_t* const src_with_offset = src + offset_into_type;
      for (k = 0; k < 16; k++) {
        xmm0[k] = _mm_loadu_si128((__m128i*)(src_with_offset + (j + k) * bytesoftype));
      }
      /* Transpose bytes */
      for (k = 0, l = 0; k < 8; k++, l += 2) {
        xmm1[k * 2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l + 1]);
        xmm1[k * 2 + 1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l + 1]);
      }
      /* Transpose words */
      for (k = 0, l = -2; k < 8; k++, l++) {
        if ((k % 2) == 0) l += 2;
        xmm0[k * 2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l + 2]);
        xmm0[k * 2 + 1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l + 2]);
      }
      /* Transpose double words */
      for (k = 0, l = -4; k < 8; k++, l++) {
        if ((k % 4) == 0) l += 4;
        xmm1[k * 2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l + 4]);
        xmm1[k * 2 + 1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l + 4]);
      }
      /* Transpose quad words */
      for (k = 0; k < 8; k++) {
        xmm0[k * 2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k + 8]);
        xmm0[k * 2 + 1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k + 8]);
      }
      /* Store the result vectors */
      dest_for_jth_element = dest + j;
      for (k = 0; k < 16; k++) {
        _mm_storeu_si128((__m128i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), xmm0[k]);
      }
    }
  }
}

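As a concrete illustration of the offset loop inside shuffle16_tiled_sse2: for a hypothetical bytesoftype of 24, vecs_per_el_rem is 24 % 16 = 8, so offset_into_type takes the values 0 and 8, and the 16-byte loads starting at offset 8 cover the remaining bytes of each element. A small standalone sketch of just that progression (editorial illustration only, not part of shuffle-sse2.c; trace_tile_offsets is a hypothetical helper):

#include <stdint.h>
#include <stdio.h>

/* Trace how offset_into_type advances for a given typesize, mirroring the
   loop condition used above (16 == sizeof(__m128i)). */
static void trace_tile_offsets(int32_t bytesoftype) {
  const int32_t vec_size = 16;
  const int32_t vecs_per_el_rem = bytesoftype % vec_size;
  for (int32_t offset_into_type = 0; offset_into_type < bytesoftype;
       offset_into_type += (offset_into_type == 0 && vecs_per_el_rem > 0
                            ? vecs_per_el_rem : vec_size)) {
    printf("load 16 bytes at offset %d within each element\n", (int)offset_into_type);
  }
}

/* trace_tile_offsets(24) prints offsets 0 and 8; trace_tile_offsets(40) prints 0, 8, 24. */
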
/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
static void
unshuffle2_sse2(uint8_t* const dest, const uint8_t* const src,
                const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 2;
  int32_t i;
  int j;
  __m128i xmm0[2], xmm1[2];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
    /* Load 16 elements (32 bytes) into 2 XMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 2; j++) {
      xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    /* Compute the low 32 bytes */
    xmm1[0] = _mm_unpacklo_epi8(xmm0[0], xmm0[1]);
    /* Compute the hi 32 bytes */
    xmm1[1] = _mm_unpackhi_epi8(xmm0[0], xmm0[1]);
    /* Store the result vectors in proper order */
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[1]);
  }
}

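unshuffle2_sse2 is the inverse of the 2-byte shuffle above: it re-interleaves the two byte planes back into 2-byte elements. A scalar reference of the same mapping (editorial illustration only; unshuffle2_scalar_ref is hypothetical):

#include <stdint.h>

/* Scalar reference: rebuild each 2-byte element from its two byte planes. */
static void unshuffle2_scalar_ref(uint8_t* dest, const uint8_t* src,
                                  int32_t total_elements) {
  for (int32_t i = 0; i < total_elements; i++) {
    dest[2 * i] = src[i];                      /* byte 0 from the first plane  */
    dest[2 * i + 1] = src[total_elements + i]; /* byte 1 from the second plane */
  }
}
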
/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
static void
unshuffle4_sse2(uint8_t* const dest, const uint8_t* const src,
                const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 4;
  int32_t i;
  int j;
  __m128i xmm0[4], xmm1[4];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
    /* Load 16 elements (64 bytes) into 4 XMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 4; j++) {
      xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      /* Compute the low 32 bytes */
      xmm1[j] = _mm_unpacklo_epi8(xmm0[j * 2], xmm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      xmm1[2 + j] = _mm_unpackhi_epi8(xmm0[j * 2], xmm0[j * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 2; j++) {
      /* Compute the low 32 bytes */
      xmm0[j] = _mm_unpacklo_epi16(xmm1[j * 2], xmm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      xmm0[2 + j] = _mm_unpackhi_epi16(xmm1[j * 2], xmm1[j * 2 + 1]);
    }
    /* Store the result vectors in proper order */
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm0[0]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm0[2]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm0[1]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm0[3]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
static void
unshuffle8_sse2(uint8_t* const dest, const uint8_t* const src,
                const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 8;
  int32_t i;
  int j;
  __m128i xmm0[8], xmm1[8];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
    /* Load 16 elements (128 bytes) into 8 XMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 8; j++) {
      xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      xmm1[j] = _mm_unpacklo_epi8(xmm0[j * 2], xmm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      xmm1[4 + j] = _mm_unpackhi_epi8(xmm0[j * 2], xmm0[j * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      xmm0[j] = _mm_unpacklo_epi16(xmm1[j * 2], xmm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      xmm0[4 + j] = _mm_unpackhi_epi16(xmm1[j * 2], xmm1[j * 2 + 1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      xmm1[j] = _mm_unpacklo_epi32(xmm0[j * 2], xmm0[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      xmm1[4 + j] = _mm_unpackhi_epi32(xmm0[j * 2], xmm0[j * 2 + 1]);
    }
    /* Store the result vectors in proper order */
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[4]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[2]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[6]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), xmm1[1]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[5]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[3]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[7]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
static void
unshuffle16_sse2(uint8_t* const dest, const uint8_t* const src,
                 const int32_t vectorizable_elements, const int32_t total_elements) {
  static const int32_t bytesoftype = 16;
  int32_t i;
  int j;
  __m128i xmm1[16], xmm2[16];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
    /* Load 16 elements (256 bytes) into 16 XMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 16; j++) {
      xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      xmm2[j] = _mm_unpacklo_epi8(xmm1[j * 2], xmm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      xmm2[8 + j] = _mm_unpackhi_epi8(xmm1[j * 2], xmm1[j * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      xmm1[j] = _mm_unpacklo_epi16(xmm2[j * 2], xmm2[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      xmm1[8 + j] = _mm_unpackhi_epi16(xmm2[j * 2], xmm2[j * 2 + 1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      xmm2[j] = _mm_unpacklo_epi32(xmm1[j * 2], xmm1[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      xmm2[8 + j] = _mm_unpackhi_epi32(xmm1[j * 2], xmm1[j * 2 + 1]);
    }
    /* Shuffle 8-byte qwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      xmm1[j] = _mm_unpacklo_epi64(xmm2[j * 2], xmm2[j * 2 + 1]);
      /* Compute the hi 32 bytes */
      xmm1[8 + j] = _mm_unpackhi_epi64(xmm2[j * 2], xmm2[j * 2 + 1]);
    }

    /* Store the result vectors in proper order */
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[8]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[4]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[12]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), xmm1[2]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[10]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[6]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[14]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (8 * sizeof(__m128i))), xmm1[1]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (9 * sizeof(__m128i))), xmm1[9]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (10 * sizeof(__m128i))), xmm1[5]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (11 * sizeof(__m128i))), xmm1[13]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (12 * sizeof(__m128i))), xmm1[3]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (13 * sizeof(__m128i))), xmm1[11]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (14 * sizeof(__m128i))), xmm1[7]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (15 * sizeof(__m128i))), xmm1[15]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
static void
unshuffle16_tiled_sse2(uint8_t* const dest, const uint8_t* const orig,
                       const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) {
  int32_t i;
  const int32_t vecs_per_el_rem = bytesoftype % (int32_t)sizeof(__m128i);

  int j;
  uint8_t* dest_with_offset;
  __m128i xmm1[16], xmm2[16];

  /* The unshuffle loops are inverted (compared to shuffle16_tiled_sse2)
     to optimize cache utilization. */
  int32_t offset_into_type;
  for (offset_into_type = 0; offset_into_type < bytesoftype;
       offset_into_type += (offset_into_type == 0 &&
           vecs_per_el_rem > 0 ? vecs_per_el_rem : (int32_t)sizeof(__m128i))) {
    for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
      /* Load the next 256 bytes into 16 XMM registers */
      const uint8_t* const src_for_ith_element = orig + i;
      for (j = 0; j < 16; j++) {
        xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (total_elements * (offset_into_type + j))));
      }
      /* Shuffle bytes */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        xmm2[j] = _mm_unpacklo_epi8(xmm1[j * 2], xmm1[j * 2 + 1]);
        /* Compute the hi 32 bytes */
        xmm2[8 + j] = _mm_unpackhi_epi8(xmm1[j * 2], xmm1[j * 2 + 1]);
      }
      /* Shuffle 2-byte words */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        xmm1[j] = _mm_unpacklo_epi16(xmm2[j * 2], xmm2[j * 2 + 1]);
        /* Compute the hi 32 bytes */
        xmm1[8 + j] = _mm_unpackhi_epi16(xmm2[j * 2], xmm2[j * 2 + 1]);
      }
      /* Shuffle 4-byte dwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        xmm2[j] = _mm_unpacklo_epi32(xmm1[j * 2], xmm1[j * 2 + 1]);
        /* Compute the hi 32 bytes */
        xmm2[8 + j] = _mm_unpackhi_epi32(xmm1[j * 2], xmm1[j * 2 + 1]);
      }
      /* Shuffle 8-byte qwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        xmm1[j] = _mm_unpacklo_epi64(xmm2[j * 2], xmm2[j * 2 + 1]);
        /* Compute the hi 32 bytes */
        xmm1[8 + j] = _mm_unpackhi_epi64(xmm2[j * 2], xmm2[j * 2 + 1]);
      }

      /* Store the result vectors in proper order */
      dest_with_offset = dest + offset_into_type;
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 0) * bytesoftype), xmm1[0]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 1) * bytesoftype), xmm1[8]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 2) * bytesoftype), xmm1[4]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 3) * bytesoftype), xmm1[12]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 4) * bytesoftype), xmm1[2]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 5) * bytesoftype), xmm1[10]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 6) * bytesoftype), xmm1[6]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 7) * bytesoftype), xmm1[14]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 8) * bytesoftype), xmm1[1]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 9) * bytesoftype), xmm1[9]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 10) * bytesoftype), xmm1[5]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 11) * bytesoftype), xmm1[13]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 12) * bytesoftype), xmm1[3]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 13) * bytesoftype), xmm1[11]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 14) * bytesoftype), xmm1[7]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 15) * bytesoftype), xmm1[15]);
    }
  }
}

/* Shuffle a block.  This can never fail. */
void
shuffle_sse2(const int32_t bytesoftype, const int32_t blocksize,
             const uint8_t *_src, uint8_t *_dest) {
  const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m128i);
  /* If the blocksize is not a multiple of both the typesize and
     the vector size, round the blocksize down to the next value
     which is a multiple of both. The vectorized shuffle can be
     used for that portion of the data, and the naive implementation
     can be used for the remaining portion. */
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const int32_t total_elements = blocksize / bytesoftype;

  /* If the block size is too small to be vectorized,
     use the generic implementation. */
  if (blocksize < vectorized_chunk_size) {
    shuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* Optimized shuffle implementations */
  switch (bytesoftype) {
    case 2:
      shuffle2_sse2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 4:
      shuffle4_sse2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 8:
      shuffle8_sse2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 16:
      shuffle16_sse2(_dest, _src, vectorizable_elements, total_elements);
      break;
    default:
      if (bytesoftype > (int32_t)sizeof(__m128i)) {
        shuffle16_tiled_sse2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
      }
      else {
        /* Non-optimized shuffle */
        shuffle_generic(bytesoftype, blocksize, _src, _dest);
        /* The non-optimized function covers the whole buffer,
           so we're done processing here. */
        return;
      }
  }

  /* If the buffer had any bytes at the end which couldn't be handled
     by the vectorized implementations, use the non-optimized version
     to finish them up. */
  if (vectorizable_bytes < blocksize) {
    shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}

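A minimal usage sketch of the two public entry points (editorial illustration only; it assumes the prototypes declared in shuffle-sse2.h and picks arbitrary, hypothetical sizes):

#include <stdint.h>
#include <string.h>
#include <assert.h>
#include "shuffle-sse2.h"

/* Shuffle a block and unshuffle it back, then check the round trip.
   The blocksize does not need to be a multiple of bytesoftype * sizeof(__m128i);
   the dispatchers finish any tail with the generic (non-vectorized) code. */
static void roundtrip_example(void) {
  enum { BYTESOFTYPE = 8, BLOCKSIZE = 1000 };  /* hypothetical sizes */
  uint8_t src[BLOCKSIZE], shuffled[BLOCKSIZE], restored[BLOCKSIZE];

  for (int i = 0; i < BLOCKSIZE; i++)
    src[i] = (uint8_t)i;

  shuffle_sse2(BYTESOFTYPE, BLOCKSIZE, src, shuffled);
  unshuffle_sse2(BYTESOFTYPE, BLOCKSIZE, shuffled, restored);

  assert(memcmp(src, restored, BLOCKSIZE) == 0);
}
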
/* Unshuffle a block.  This can never fail. */
void
unshuffle_sse2(const int32_t bytesoftype, const int32_t blocksize,
               const uint8_t *_src, uint8_t *_dest) {
  const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m128i);
  /* If the blocksize is not a multiple of both the typesize and
     the vector size, round the blocksize down to the next value
     which is a multiple of both. The vectorized unshuffle can be
     used for that portion of the data, and the naive implementation
     can be used for the remaining portion. */
  const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
  const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const int32_t total_elements = blocksize / bytesoftype;

  /* If the block size is too small to be vectorized,
     use the generic implementation. */
  if (blocksize < vectorized_chunk_size) {
    unshuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* Optimized unshuffle implementations */
  switch (bytesoftype) {
    case 2:
      unshuffle2_sse2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 4:
      unshuffle4_sse2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 8:
      unshuffle8_sse2(_dest, _src, vectorizable_elements, total_elements);
      break;
    case 16:
      unshuffle16_sse2(_dest, _src, vectorizable_elements, total_elements);
      break;
    default:
      if (bytesoftype > (int32_t)sizeof(__m128i)) {
        unshuffle16_tiled_sse2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
      }
      else {
        /* Non-optimized unshuffle */
        unshuffle_generic(bytesoftype, blocksize, _src, _dest);
        /* The non-optimized function covers the whole buffer,
           so we're done processing here. */
        return;
      }
  }

  /* If the buffer had any bytes at the end which couldn't be handled
     by the vectorized implementations, use the non-optimized version
     to finish them up. */
  if (vectorizable_bytes < blocksize) {
    unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}

const bool is_shuffle_sse2 = true;

#else /* defined(__SSE2__) */

const bool is_shuffle_sse2 = false;

void shuffle_sse2(const int32_t bytesoftype, const int32_t blocksize,
                  const uint8_t *_src, uint8_t *_dest) {
  abort();
}

void unshuffle_sse2(const int32_t bytesoftype, const int32_t blocksize,
                    const uint8_t *_src, uint8_t *_dest) {
  abort();
}

#endif /* defined(__SSE2__) */