Coverage Report

Created: 2025-11-09 06:55

/src/c-blosc2/blosc/shuffle-sse2.c
Line | Count | Source
   1 |       | /*********************************************************************
   2 |       |   Blosc - Blocked Shuffling and Compression Library
   3 |       |
   4 |       |   Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
   5 |       |   https://blosc.org
   6 |       |   License: BSD 3-Clause (see LICENSE.txt)
   7 |       |
   8 |       |   See LICENSE.txt for details about copyright and rights to use.
   9 |       | **********************************************************************/
  10 |       |
  11 |       | #include "shuffle-sse2.h"
  12 |       | #include "shuffle-generic.h"
  13 |       | #include <stdlib.h>
  14 |       |
  15 |       | /* Make sure SSE2 is available for the compilation target and compiler. */
  16 |       | #if defined(__SSE2__)
  17 |       |
  18 |       | #include <emmintrin.h>
  19 |       |
  20 |       | #include <stdint.h>
  21 |       |
  22 |       | /* The next is useful for debugging purposes */
  23 |       | #if 0
  24 |       | #include <stdio.h>
  25 |       | #include <string.h>
  26 |       |
  27 |       | static void printxmm(__m128i xmm0)
  28 |       | {
  29 |       |   uint8_t buf[16];
  30 |       |
  31 |       |   ((__m128i *)buf)[0] = xmm0;
  32 |       |   printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
  33 |       |           buf[0], buf[1], buf[2], buf[3],
  34 |       |           buf[4], buf[5], buf[6], buf[7],
  35 |       |           buf[8], buf[9], buf[10], buf[11],
  36 |       |           buf[12], buf[13], buf[14], buf[15]);
  37 |       | }
  38 |       | #endif
  39 |       |
  40 |       |
  41 |       | /* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
  42 |       | static void
  43 |       | shuffle2_sse2(uint8_t* const dest, const uint8_t* const src,
  44 |     0 |               const int32_t vectorizable_elements, const int32_t total_elements) {
  45 |     0 |   static const int32_t bytesoftype = 2;
  46 |     0 |   int32_t j;
  47 |     0 |   int k;
  48 |     0 |   uint8_t* dest_for_jth_element;
  49 |     0 |   __m128i xmm0[2], xmm1[2];
  50 |       |
  51 |     0 |   for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
  52 |       |     /* Fetch 16 elements (32 bytes) then transpose bytes, words and double words. */
  53 |     0 |     for (k = 0; k < 2; k++) {
  54 |     0 |       xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
  55 |     0 |       xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8);
  56 |     0 |       xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8);
  57 |     0 |       xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
  58 |     0 |       xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
  59 |     0 |       xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
  60 |     0 |       xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
  61 |     0 |       xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
  62 |     0 |       xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
  63 |     0 |       xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
  64 |     0 |     }
  65 |       |     /* Transpose quad words */
  66 |     0 |     for (k = 0; k < 1; k++) {
  67 |     0 |       xmm1[k * 2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k + 1]);
  68 |     0 |       xmm1[k * 2 + 1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k + 1]);
  69 |     0 |     }
  70 |       |     /* Store the result vectors */
  71 |     0 |     dest_for_jth_element = dest + j;
  72 |     0 |     for (k = 0; k < 2; k++) {
  73 |     0 |       _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm1[k]);
  74 |     0 |     }
  75 |     0 |   }
  76 |     0 | }
  77 |       |
  78 |       | /* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
  79 |       | static void
  80 |       | shuffle4_sse2(uint8_t* const dest, const uint8_t* const src,
  81 |     0 |               const int32_t vectorizable_elements, const int32_t total_elements) {
  82 |     0 |   static const int32_t bytesoftype = 4;
  83 |     0 |   int32_t i;
  84 |     0 |   int j;
  85 |     0 |   uint8_t* dest_for_ith_element;
  86 |     0 |   __m128i xmm0[4], xmm1[4];
  87 |       |
  88 |     0 |   for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
  89 |       |     /* Fetch 16 elements (64 bytes) then transpose bytes and words. */
  90 |     0 |     for (j = 0; j < 4; j++) {
  91 |     0 |       xmm0[j] = _mm_loadu_si128((__m128i*)(src + (i * bytesoftype) + (j * sizeof(__m128i))));
  92 |     0 |       xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0xd8);
  93 |     0 |       xmm0[j] = _mm_shuffle_epi32(xmm0[j], 0x8d);
  94 |     0 |       xmm0[j] = _mm_unpacklo_epi8(xmm1[j], xmm0[j]);
  95 |     0 |       xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0x04e);
  96 |     0 |       xmm0[j] = _mm_unpacklo_epi16(xmm0[j], xmm1[j]);
  97 |     0 |     }
  98 |       |     /* Transpose double words */
  99 |     0 |     for (j = 0; j < 2; j++) {
 100 |     0 |       xmm1[j * 2] = _mm_unpacklo_epi32(xmm0[j * 2], xmm0[j * 2 + 1]);
 101 |     0 |       xmm1[j * 2 + 1] = _mm_unpackhi_epi32(xmm0[j * 2], xmm0[j * 2 + 1]);
 102 |     0 |     }
 103 |       |     /* Transpose quad words */
 104 |     0 |     for (j = 0; j < 2; j++) {
 105 |     0 |       xmm0[j * 2] = _mm_unpacklo_epi64(xmm1[j], xmm1[j + 2]);
 106 |     0 |       xmm0[j * 2 + 1] = _mm_unpackhi_epi64(xmm1[j], xmm1[j + 2]);
 107 |     0 |     }
 108 |       |     /* Store the result vectors */
 109 |     0 |     dest_for_ith_element = dest + i;
 110 |     0 |     for (j = 0; j < 4; j++) {
 111 |     0 |       _mm_storeu_si128((__m128i*)(dest_for_ith_element + (j * total_elements)), xmm0[j]);
 112 |     0 |     }
 113 |     0 |   }
 114 |     0 | }
 115 |       |
 116 |       | /* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
 117 |       | static void
 118 |       | shuffle8_sse2(uint8_t* const dest, const uint8_t* const src,
 119 |     0 |               const int32_t vectorizable_elements, const int32_t total_elements) {
 120 |     0 |   static const int32_t bytesoftype = 8;
 121 |     0 |   int32_t j;
 122 |     0 |   int k, l;
 123 |     0 |   uint8_t* dest_for_jth_element;
 124 |     0 |   __m128i xmm0[8], xmm1[8];
 125 |       |
 126 |     0 |   for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
 127 |       |     /* Fetch 16 elements (128 bytes) then transpose bytes. */
 128 |     0 |     for (k = 0; k < 8; k++) {
 129 |     0 |       xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
 130 |     0 |       xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
 131 |     0 |       xmm1[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
 132 |     0 |     }
 133 |       |     /* Transpose words */
 134 |     0 |     for (k = 0, l = 0; k < 4; k++, l += 2) {
 135 |     0 |       xmm0[k * 2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l + 1]);
 136 |     0 |       xmm0[k * 2 + 1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l + 1]);
 137 |     0 |     }
 138 |       |     /* Transpose double words */
 139 |     0 |     for (k = 0, l = 0; k < 4; k++, l++) {
 140 |     0 |       if (k == 2) l += 2;
 141 |     0 |       xmm1[k * 2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l + 2]);
 142 |     0 |       xmm1[k * 2 + 1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l + 2]);
 143 |     0 |     }
 144 |       |     /* Transpose quad words */
 145 |     0 |     for (k = 0; k < 4; k++) {
 146 |     0 |       xmm0[k * 2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k + 4]);
 147 |     0 |       xmm0[k * 2 + 1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k + 4]);
 148 |     0 |     }
 149 |       |     /* Store the result vectors */
 150 |     0 |     dest_for_jth_element = dest + j;
 151 |     0 |     for (k = 0; k < 8; k++) {
 152 |     0 |       _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]);
 153 |     0 |     }
 154 |     0 |   }
 155 |     0 | }
 156 |       |
 157 |       | /* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
 158 |       | static void
 159 |       | shuffle16_sse2(uint8_t* const dest, const uint8_t* const src,
 160 |     0 |                const int32_t vectorizable_elements, const int32_t total_elements) {
 161 |     0 |   static const int32_t bytesoftype = 16;
 162 |     0 |   int32_t j;
 163 |     0 |   int k, l;
 164 |     0 |   uint8_t* dest_for_jth_element;
 165 |     0 |   __m128i xmm0[16], xmm1[16];
 166 |       |
 167 |     0 |   for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
 168 |       |     /* Fetch 16 elements (256 bytes). */
 169 |     0 |     for (k = 0; k < 16; k++) {
 170 |     0 |       xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
 171 |     0 |     }
 172 |       |     /* Transpose bytes */
 173 |     0 |     for (k = 0, l = 0; k < 8; k++, l += 2) {
 174 |     0 |       xmm1[k * 2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l + 1]);
 175 |     0 |       xmm1[k * 2 + 1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l + 1]);
 176 |     0 |     }
 177 |       |     /* Transpose words */
 178 |     0 |     for (k = 0, l = -2; k < 8; k++, l++) {
 179 |     0 |       if ((k % 2) == 0) l += 2;
 180 |     0 |       xmm0[k * 2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l + 2]);
 181 |     0 |       xmm0[k * 2 + 1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l + 2]);
 182 |     0 |     }
 183 |       |     /* Transpose double words */
 184 |     0 |     for (k = 0, l = -4; k < 8; k++, l++) {
 185 |     0 |       if ((k % 4) == 0) l += 4;
 186 |     0 |       xmm1[k * 2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l + 4]);
 187 |     0 |       xmm1[k * 2 + 1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l + 4]);
 188 |     0 |     }
 189 |       |     /* Transpose quad words */
 190 |     0 |     for (k = 0; k < 8; k++) {
 191 |     0 |       xmm0[k * 2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k + 8]);
 192 |     0 |       xmm0[k * 2 + 1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k + 8]);
 193 |     0 |     }
 194 |       |     /* Store the result vectors */
 195 |     0 |     dest_for_jth_element = dest + j;
 196 |     0 |     for (k = 0; k < 16; k++) {
 197 |     0 |       _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]);
 198 |     0 |     }
 199 |     0 |   }
 200 |     0 | }
 201 |       |
 202 |       | /* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */
 203 |       | static void
 204 |       | shuffle16_tiled_sse2(uint8_t* const dest, const uint8_t* const src,
 205 |     0 |                      const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) {
 206 |     0 |   int32_t j;
 207 |     0 |   const int32_t vecs_per_el_rem = bytesoftype % (int32_t)sizeof(__m128i);
 208 |     0 |   int k, l;
 209 |     0 |   uint8_t* dest_for_jth_element;
 210 |     0 |   __m128i xmm0[16], xmm1[16];
 211 |       |
 212 |     0 |   for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
 213 |       |     /* Advance the offset into the type by the vector size (in bytes), unless this is
 214 |       |     the initial iteration and the type size is not a multiple of the vector size.
 215 |       |     In that case, only advance by the number of bytes necessary so that the number
 216 |       |     of remaining bytes in the type will be a multiple of the vector size. */
 217 |     0 |     int32_t offset_into_type;
 218 |     0 |     for (offset_into_type = 0; offset_into_type < bytesoftype;
 219 |     0 |          offset_into_type += (offset_into_type == 0 &&
 220 |     0 |                               vecs_per_el_rem > 0 ? vecs_per_el_rem : (int32_t)sizeof(__m128i))) {
 221 |       |
 222 |       |       /* Fetch elements in groups of 256 bytes */
 223 |     0 |       const uint8_t* const src_with_offset = src + offset_into_type;
 224 |     0 |       for (k = 0; k < 16; k++) {
 225 |     0 |         xmm0[k] = _mm_loadu_si128((__m128i*)(src_with_offset + (j + k) * bytesoftype));
 226 |     0 |       }
 227 |       |       /* Transpose bytes */
 228 |     0 |       for (k = 0, l = 0; k < 8; k++, l += 2) {
 229 |     0 |         xmm1[k * 2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l + 1]);
 230 |     0 |         xmm1[k * 2 + 1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l + 1]);
 231 |     0 |       }
 232 |       |       /* Transpose words */
 233 |     0 |       for (k = 0, l = -2; k < 8; k++, l++) {
 234 |     0 |         if ((k % 2) == 0) l += 2;
 235 |     0 |         xmm0[k * 2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l + 2]);
 236 |     0 |         xmm0[k * 2 + 1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l + 2]);
 237 |     0 |       }
 238 |       |       /* Transpose double words */
 239 |     0 |       for (k = 0, l = -4; k < 8; k++, l++) {
 240 |     0 |         if ((k % 4) == 0) l += 4;
 241 |     0 |         xmm1[k * 2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l + 4]);
 242 |     0 |         xmm1[k * 2 + 1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l + 4]);
 243 |     0 |       }
 244 |       |       /* Transpose quad words */
 245 |     0 |       for (k = 0; k < 8; k++) {
 246 |     0 |         xmm0[k * 2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k + 8]);
 247 |     0 |         xmm0[k * 2 + 1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k + 8]);
 248 |     0 |       }
 249 |       |       /* Store the result vectors */
 250 |     0 |       dest_for_jth_element = dest + j;
 251 |     0 |       for (k = 0; k < 16; k++) {
 252 |     0 |         _mm_storeu_si128((__m128i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), xmm0[k]);
 253 |     0 |       }
 254 |     0 |     }
 255 |     0 |   }
 256 |     0 | }
 257 |       |
 258 |       | /* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
 259 |       | static void
 260 |       | unshuffle2_sse2(uint8_t* const dest, const uint8_t* const src,
 261 |     0 |                 const int32_t vectorizable_elements, const int32_t total_elements) {
 262 |     0 |   static const int32_t bytesoftype = 2;
 263 |     0 |   int32_t i;
 264 |     0 |   int j;
 265 |     0 |   __m128i xmm0[2], xmm1[2];
 266 |       |
 267 |     0 |   for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
 268 |       |     /* Load 16 elements (32 bytes) into 2 XMM registers. */
 269 |     0 |     const uint8_t* const src_for_ith_element = src + i;
 270 |     0 |     for (j = 0; j < 2; j++) {
 271 |     0 |       xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
 272 |     0 |     }
 273 |       |     /* Shuffle bytes */
 274 |       |     /* Compute the low 32 bytes */
 275 |     0 |     xmm1[0] = _mm_unpacklo_epi8(xmm0[0], xmm0[1]);
 276 |       |     /* Compute the hi 32 bytes */
 277 |     0 |     xmm1[1] = _mm_unpackhi_epi8(xmm0[0], xmm0[1]);
 278 |       |     /* Store the result vectors in proper order */
 279 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]);
 280 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[1]);
 281 |     0 |   }
 282 |     0 | }
 283 |       |
 284 |       | /* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
 285 |       | static void
 286 |       | unshuffle4_sse2(uint8_t* const dest, const uint8_t* const src,
 287 |     0 |                 const int32_t vectorizable_elements, const int32_t total_elements) {
 288 |     0 |   static const int32_t bytesoftype = 4;
 289 |     0 |   int32_t i;
 290 |     0 |   int j;
 291 |     0 |   __m128i xmm0[4], xmm1[4];
 292 |       |
 293 |     0 |   for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
 294 |       |     /* Load 16 elements (64 bytes) into 4 XMM registers. */
 295 |     0 |     const uint8_t* const src_for_ith_element = src + i;
 296 |     0 |     for (j = 0; j < 4; j++) {
 297 |     0 |       xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
 298 |     0 |     }
 299 |       |     /* Shuffle bytes */
 300 |     0 |     for (j = 0; j < 2; j++) {
 301 |       |       /* Compute the low 32 bytes */
 302 |     0 |       xmm1[j] = _mm_unpacklo_epi8(xmm0[j * 2], xmm0[j * 2 + 1]);
 303 |       |       /* Compute the hi 32 bytes */
 304 |     0 |       xmm1[2 + j] = _mm_unpackhi_epi8(xmm0[j * 2], xmm0[j * 2 + 1]);
 305 |     0 |     }
 306 |       |     /* Shuffle 2-byte words */
 307 |     0 |     for (j = 0; j < 2; j++) {
 308 |       |       /* Compute the low 32 bytes */
 309 |     0 |       xmm0[j] = _mm_unpacklo_epi16(xmm1[j * 2], xmm1[j * 2 + 1]);
 310 |       |       /* Compute the hi 32 bytes */
 311 |     0 |       xmm0[2 + j] = _mm_unpackhi_epi16(xmm1[j * 2], xmm1[j * 2 + 1]);
 312 |     0 |     }
 313 |       |     /* Store the result vectors in proper order */
 314 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm0[0]);
 315 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm0[2]);
 316 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm0[1]);
 317 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm0[3]);
 318 |     0 |   }
 319 |     0 | }
 320 |       |
 321 |       | /* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
 322 |       | static void
 323 |       | unshuffle8_sse2(uint8_t* const dest, const uint8_t* const src,
 324 |     0 |                 const int32_t vectorizable_elements, const int32_t total_elements) {
 325 |     0 |   static const int32_t bytesoftype = 8;
 326 |     0 |   int32_t i;
 327 |     0 |   int j;
 328 |     0 |   __m128i xmm0[8], xmm1[8];
 329 |       |
 330 |     0 |   for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
 331 |       |     /* Load 16 elements (128 bytes) into 8 XMM registers. */
 332 |     0 |     const uint8_t* const src_for_ith_element = src + i;
 333 |     0 |     for (j = 0; j < 8; j++) {
 334 |     0 |       xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
 335 |     0 |     }
 336 |       |     /* Shuffle bytes */
 337 |     0 |     for (j = 0; j < 4; j++) {
 338 |       |       /* Compute the low 32 bytes */
 339 |     0 |       xmm1[j] = _mm_unpacklo_epi8(xmm0[j * 2], xmm0[j * 2 + 1]);
 340 |       |       /* Compute the hi 32 bytes */
 341 |     0 |       xmm1[4 + j] = _mm_unpackhi_epi8(xmm0[j * 2], xmm0[j * 2 + 1]);
 342 |     0 |     }
 343 |       |     /* Shuffle 2-byte words */
 344 |     0 |     for (j = 0; j < 4; j++) {
 345 |       |       /* Compute the low 32 bytes */
 346 |     0 |       xmm0[j] = _mm_unpacklo_epi16(xmm1[j * 2], xmm1[j * 2 + 1]);
 347 |       |       /* Compute the hi 32 bytes */
 348 |     0 |       xmm0[4 + j] = _mm_unpackhi_epi16(xmm1[j * 2], xmm1[j * 2 + 1]);
 349 |     0 |     }
 350 |       |     /* Shuffle 4-byte dwords */
 351 |     0 |     for (j = 0; j < 4; j++) {
 352 |       |       /* Compute the low 32 bytes */
 353 |     0 |       xmm1[j] = _mm_unpacklo_epi32(xmm0[j * 2], xmm0[j * 2 + 1]);
 354 |       |       /* Compute the hi 32 bytes */
 355 |     0 |       xmm1[4 + j] = _mm_unpackhi_epi32(xmm0[j * 2], xmm0[j * 2 + 1]);
 356 |     0 |     }
 357 |       |     /* Store the result vectors in proper order */
 358 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]);
 359 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[4]);
 360 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[2]);
 361 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[6]);
 362 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), xmm1[1]);
 363 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[5]);
 364 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[3]);
 365 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[7]);
 366 |     0 |   }
 367 |     0 | }
 368 |       |
 369 |       | /* Routine optimized for unshuffling a buffer for a type size of 12 bytes. */
 370 |       | /* Based on the 16-byte implementation */
 371 |       | static void
 372 |       | unshuffle12_sse2(uint8_t* const dest, const uint8_t* const src,
 373 |     0 |                  const int32_t vectorizable_elements, const int32_t total_elements) {
 374 |     0 |   static const int32_t bytesoftype = 12;
 375 |     0 |   int32_t i;
 376 |     0 |   int j;
 377 |     0 |   __m128i xmm1[16], xmm2[16];
 378 |       |
 379 |     0 |   __m128i mask = _mm_set_epi8( 0x0, 0x0, 0x0, 0x0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff);
 380 |       |
 381 |     0 |   for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
 382 |       |     /* Load 12 elements (192 bytes) into 12 XMM registers. */
 383 |     0 |     const uint8_t* const src_for_ith_element = src + i;
 384 |     0 |     for (j = 0; j < bytesoftype; j++) {
 385 |     0 |       xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
 386 |     0 |     }
 387 |       |     /* Initialize the last 4 registers (64 bytes) to null */
 388 |     0 |     for (j = bytesoftype; j < 16; j++) {
 389 |     0 |       xmm1[j] = _mm_setzero_si128();
 390 |     0 |     }
 391 |       |     /* Shuffle bytes */
 392 |     0 |     for (j = 0; j < 8; j++) {
 393 |       |       /* Compute the low 32 bytes */
 394 |     0 |       xmm2[j] = _mm_unpacklo_epi8(xmm1[j * 2], xmm1[j * 2 + 1]);
 395 |       |       /* Compute the hi 32 bytes */
 396 |     0 |       xmm2[8 + j] = _mm_unpackhi_epi8(xmm1[j * 2], xmm1[j * 2 + 1]);
 397 |     0 |     }
 398 |       |     /* Shuffle 2-byte words */
 399 |     0 |     for (j = 0; j < 8; j++) {
 400 |       |       /* Compute the low 32 bytes */
 401 |     0 |       xmm1[j] = _mm_unpacklo_epi16(xmm2[j * 2], xmm2[j * 2 + 1]);
 402 |       |       /* Compute the hi 32 bytes */
 403 |     0 |       xmm1[8 + j] = _mm_unpackhi_epi16(xmm2[j * 2], xmm2[j * 2 + 1]);
 404 |     0 |     }
 405 |       |     /* Shuffle 4-byte dwords */
 406 |     0 |     for (j = 0; j < 8; j++) {
 407 |       |       /* Compute the low 32 bytes */
 408 |     0 |       xmm2[j] = _mm_unpacklo_epi32(xmm1[j * 2], xmm1[j * 2 + 1]);
 409 |       |       /* Compute the hi 32 bytes */
 410 |     0 |       xmm2[8 + j] = _mm_unpackhi_epi32(xmm1[j * 2], xmm1[j * 2 + 1]);
 411 |     0 |     }
 412 |       |     /* Shuffle 8-byte qwords */
 413 |     0 |     for (j = 0; j < 8; j++) {
 414 |       |       /* Compute the low 32 bytes */
 415 |     0 |       xmm1[j] = _mm_unpacklo_epi64(xmm2[j * 2], xmm2[j * 2 + 1]);
 416 |       |       /* Compute the hi 32 bytes */
 417 |     0 |       xmm1[8 + j] = _mm_unpackhi_epi64(xmm2[j * 2], xmm2[j * 2 + 1]);
 418 |     0 |     }
 419 |       |
 420 |       |
 421 |       |     /* Store the result vectors in proper order */
 422 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * 12)), xmm1[0]);
 423 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * 12)), xmm1[8]);
 424 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * 12)), xmm1[4]);
 425 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * 12)), xmm1[12]);
 426 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * 12)), xmm1[2]);
 427 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * 12)), xmm1[10]);
 428 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * 12)), xmm1[6]);
 429 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * 12)), xmm1[14]);
 430 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (8 * 12)), xmm1[1]);
 431 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (9 * 12)), xmm1[9]);
 432 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (10 * 12)), xmm1[5]);
 433 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (11 * 12)), xmm1[13]);
 434 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (12 * 12)), xmm1[3]);
 435 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (13 * 12)), xmm1[11]);
 436 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (14 * 12)), xmm1[7]);
 437 |     0 |     _mm_maskmoveu_si128(xmm1[15], mask, (char *)(dest + (i * bytesoftype) + (15 * 12)));
 438 |     0 |   }
 439 |     0 | }
 440 |       |
 441 |       | /* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
 442 |       | static void
 443 |       | unshuffle16_sse2(uint8_t* const dest, const uint8_t* const src,
 444 |     0 |                  const int32_t vectorizable_elements, const int32_t total_elements) {
 445 |     0 |   static const int32_t bytesoftype = 16;
 446 |     0 |   int32_t i;
 447 |     0 |   int j;
 448 |     0 |   __m128i xmm1[16], xmm2[16];
 449 |       |
 450 |     0 |   for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
 451 |       |     /* Load 16 elements (256 bytes) into 16 XMM registers. */
 452 |     0 |     const uint8_t* const src_for_ith_element = src + i;
 453 |     0 |     for (j = 0; j < 16; j++) {
 454 |     0 |       xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
 455 |     0 |     }
 456 |       |     /* Shuffle bytes */
 457 |     0 |     for (j = 0; j < 8; j++) {
 458 |       |       /* Compute the low 32 bytes */
 459 |     0 |       xmm2[j] = _mm_unpacklo_epi8(xmm1[j * 2], xmm1[j * 2 + 1]);
 460 |       |       /* Compute the hi 32 bytes */
 461 |     0 |       xmm2[8 + j] = _mm_unpackhi_epi8(xmm1[j * 2], xmm1[j * 2 + 1]);
 462 |     0 |     }
 463 |       |     /* Shuffle 2-byte words */
 464 |     0 |     for (j = 0; j < 8; j++) {
 465 |       |       /* Compute the low 32 bytes */
 466 |     0 |       xmm1[j] = _mm_unpacklo_epi16(xmm2[j * 2], xmm2[j * 2 + 1]);
 467 |       |       /* Compute the hi 32 bytes */
 468 |     0 |       xmm1[8 + j] = _mm_unpackhi_epi16(xmm2[j * 2], xmm2[j * 2 + 1]);
 469 |     0 |     }
 470 |       |     /* Shuffle 4-byte dwords */
 471 |     0 |     for (j = 0; j < 8; j++) {
 472 |       |       /* Compute the low 32 bytes */
 473 |     0 |       xmm2[j] = _mm_unpacklo_epi32(xmm1[j * 2], xmm1[j * 2 + 1]);
 474 |       |       /* Compute the hi 32 bytes */
 475 |     0 |       xmm2[8 + j] = _mm_unpackhi_epi32(xmm1[j * 2], xmm1[j * 2 + 1]);
 476 |     0 |     }
 477 |       |     /* Shuffle 8-byte qwords */
 478 |     0 |     for (j = 0; j < 8; j++) {
 479 |       |       /* Compute the low 32 bytes */
 480 |     0 |       xmm1[j] = _mm_unpacklo_epi64(xmm2[j * 2], xmm2[j * 2 + 1]);
 481 |       |       /* Compute the hi 32 bytes */
 482 |     0 |       xmm1[8 + j] = _mm_unpackhi_epi64(xmm2[j * 2], xmm2[j * 2 + 1]);
 483 |     0 |     }
 484 |       |
 485 |       |     /* Store the result vectors in proper order */
 486 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]);
 487 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[8]);
 488 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[4]);
 489 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[12]);
 490 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), xmm1[2]);
 491 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[10]);
 492 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[6]);
 493 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[14]);
 494 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (8 * sizeof(__m128i))), xmm1[1]);
 495 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (9 * sizeof(__m128i))), xmm1[9]);
 496 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (10 * sizeof(__m128i))), xmm1[5]);
 497 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (11 * sizeof(__m128i))), xmm1[13]);
 498 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (12 * sizeof(__m128i))), xmm1[3]);
 499 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (13 * sizeof(__m128i))), xmm1[11]);
 500 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (14 * sizeof(__m128i))), xmm1[7]);
 501 |     0 |     _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (15 * sizeof(__m128i))), xmm1[15]);
 502 |     0 |   }
 503 |     0 | }
 504 |       |
 505 |       | /* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
 506 |       | static void
 507 |       | unshuffle16_tiled_sse2(uint8_t* const dest, const uint8_t* const orig,
 508 |     0 |                        const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) {
 509 |     0 |   int32_t i;
 510 |     0 |   const int32_t vecs_per_el_rem = bytesoftype % (int32_t)sizeof(__m128i);
 511 |       |
 512 |     0 |   int j;
 513 |     0 |   uint8_t* dest_with_offset;
 514 |     0 |   __m128i xmm1[16], xmm2[16];
 515 |       |
 516 |       |   /* The unshuffle loops are inverted (compared to shuffle_tiled16_sse2)
 517 |       |      to optimize cache utilization. */
 518 |     0 |   int32_t offset_into_type;
 519 |     0 |   for (offset_into_type = 0; offset_into_type < bytesoftype;
 520 |     0 |        offset_into_type += (offset_into_type == 0 &&
 521 |     0 |            vecs_per_el_rem > 0 ? vecs_per_el_rem : (int32_t)sizeof(__m128i))) {
 522 |     0 |     for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
 523 |       |       /* Load the first 128 bytes in 16 XMM registers */
 524 |     0 |       const uint8_t* const src_for_ith_element = orig + i;
 525 |     0 |       for (j = 0; j < 16; j++) {
 526 |     0 |         xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (total_elements * (offset_into_type + j))));
 527 |     0 |       }
 528 |       |       /* Shuffle bytes */
 529 |     0 |       for (j = 0; j < 8; j++) {
 530 |       |         /* Compute the low 32 bytes */
 531 |     0 |         xmm2[j] = _mm_unpacklo_epi8(xmm1[j * 2], xmm1[j * 2 + 1]);
 532 |       |         /* Compute the hi 32 bytes */
 533 |     0 |         xmm2[8 + j] = _mm_unpackhi_epi8(xmm1[j * 2], xmm1[j * 2 + 1]);
 534 |     0 |       }
 535 |       |       /* Shuffle 2-byte words */
 536 |     0 |       for (j = 0; j < 8; j++) {
 537 |       |         /* Compute the low 32 bytes */
 538 |     0 |         xmm1[j] = _mm_unpacklo_epi16(xmm2[j * 2], xmm2[j * 2 + 1]);
 539 |       |         /* Compute the hi 32 bytes */
 540 |     0 |         xmm1[8 + j] = _mm_unpackhi_epi16(xmm2[j * 2], xmm2[j * 2 + 1]);
 541 |     0 |       }
 542 |       |       /* Shuffle 4-byte dwords */
 543 |     0 |       for (j = 0; j < 8; j++) {
 544 |       |         /* Compute the low 32 bytes */
 545 |     0 |         xmm2[j] = _mm_unpacklo_epi32(xmm1[j * 2], xmm1[j * 2 + 1]);
 546 |       |         /* Compute the hi 32 bytes */
 547 |     0 |         xmm2[8 + j] = _mm_unpackhi_epi32(xmm1[j * 2], xmm1[j * 2 + 1]);
 548 |     0 |       }
 549 |       |       /* Shuffle 8-byte qwords */
 550 |     0 |       for (j = 0; j < 8; j++) {
 551 |       |         /* Compute the low 32 bytes */
 552 |     0 |         xmm1[j] = _mm_unpacklo_epi64(xmm2[j * 2], xmm2[j * 2 + 1]);
 553 |       |         /* Compute the hi 32 bytes */
 554 |     0 |         xmm1[8 + j] = _mm_unpackhi_epi64(xmm2[j * 2], xmm2[j * 2 + 1]);
 555 |     0 |       }
 556 |       |
 557 |       |       /* Store the result vectors in proper order */
 558 |     0 |       dest_with_offset = dest + offset_into_type;
 559 |     0 |       _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 0) * bytesoftype), xmm1[0]);
 560 |     0 |       _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 1) * bytesoftype), xmm1[8]);
 561 |     0 |       _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 2) * bytesoftype), xmm1[4]);
 562 |     0 |       _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 3) * bytesoftype), xmm1[12]);
 563 |     0 |       _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 4) * bytesoftype), xmm1[2]);
 564 |     0 |       _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 5) * bytesoftype), xmm1[10]);
 565 |     0 |       _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 6) * bytesoftype), xmm1[6]);
 566 |     0 |       _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 7) * bytesoftype), xmm1[14]);
 567 |     0 |       _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 8) * bytesoftype), xmm1[1]);
 568 |     0 |       _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 9) * bytesoftype), xmm1[9]);
 569 |     0 |       _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 10) * bytesoftype), xmm1[5]);
 570 |     0 |       _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 11) * bytesoftype), xmm1[13]);
 571 |     0 |       _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 12) * bytesoftype), xmm1[3]);
 572 |     0 |       _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 13) * bytesoftype), xmm1[11]);
 573 |     0 |       _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 14) * bytesoftype), xmm1[7]);
 574 |     0 |       _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 15) * bytesoftype), xmm1[15]);
 575 |     0 |     }
 576 |     0 |   }
 577 |     0 | }
 578 |       |
 579 |       | /* Shuffle a block.  This can never fail. */
 580 |       | void
 581 |       | shuffle_sse2(const int32_t bytesoftype, const int32_t blocksize,
 582 |     0 |              const uint8_t *_src, uint8_t *_dest) {
 583 |     0 |   const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m128i);
 584 |       |   /* If the blocksize is not a multiple of both the typesize and
 585 |       |      the vector size, round the blocksize down to the next value
 586 |       |      which is a multiple of both. The vectorized shuffle can be
 587 |       |      used for that portion of the data, and the naive implementation
 588 |       |      can be used for the remaining portion. */
 589 |     0 |   const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
 590 |     0 |   const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
 591 |     0 |   const int32_t total_elements = blocksize / bytesoftype;
 592 |       |
 593 |       |   /* If the block size is too small to be vectorized,
 594 |       |      use the generic implementation. */
 595 |     0 |   if (blocksize < vectorized_chunk_size) {
 596 |     0 |     shuffle_generic(bytesoftype, blocksize, _src, _dest);
 597 |     0 |     return;
 598 |     0 |   }
 599 |       |
 600 |       |   /* Optimized shuffle implementations */
 601 |     0 |   switch (bytesoftype) {
 602 |     0 |     case 2:
 603 |     0 |       shuffle2_sse2(_dest, _src, vectorizable_elements, total_elements);
 604 |     0 |       break;
 605 |     0 |     case 4:
 606 |     0 |       shuffle4_sse2(_dest, _src, vectorizable_elements, total_elements);
 607 |     0 |       break;
 608 |     0 |     case 8:
 609 |     0 |       shuffle8_sse2(_dest, _src, vectorizable_elements, total_elements);
 610 |     0 |       break;
 611 |     0 |     case 16:
 612 |     0 |       shuffle16_sse2(_dest, _src, vectorizable_elements, total_elements);
 613 |     0 |       break;
 614 |     0 |     default:
 615 |     0 |       if (bytesoftype > (int32_t)sizeof(__m128i)) {
 616 |     0 |         shuffle16_tiled_sse2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
 617 |     0 |       }
 618 |     0 |       else {
 619 |       |         /* Non-optimized shuffle */
 620 |     0 |         shuffle_generic(bytesoftype, blocksize, _src, _dest);
 621 |       |         /* The non-optimized function covers the whole buffer,
 622 |       |            so we're done processing here. */
 623 |     0 |         return;
 624 |     0 |       }
 625 |     0 |   }
 626 |       |
 627 |       |   /* If the buffer had any bytes at the end which couldn't be handled
 628 |       |      by the vectorized implementations, use the non-optimized version
 629 |       |      to finish them up. */
 630 |     0 |   if (vectorizable_bytes < blocksize) {
 631 |     0 |     shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
 632 |     0 |   }
 633 |     0 | }
 634 |       |
 635 |       | /* Unshuffle a block.  This can never fail. */
 636 |       | void
 637 |       | unshuffle_sse2(const int32_t bytesoftype, const int32_t blocksize,
 638 |     0 |                const uint8_t *_src, uint8_t *_dest) {
 639 |     0 |   const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m128i);
 640 |       |   /* If the blocksize is not a multiple of both the typesize and
 641 |       |      the vector size, round the blocksize down to the next value
 642 |       |      which is a multiple of both. The vectorized unshuffle can be
 643 |       |      used for that portion of the data, and the naive implementation
 644 |       |      can be used for the remaining portion. */
 645 |     0 |   const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
 646 |     0 |   const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype;
 647 |     0 |   const int32_t total_elements = blocksize / bytesoftype;
 648 |       |
 649 |       |   /* If the block size is too small to be vectorized,
 650 |       |      use the generic implementation. */
 651 |     0 |   if (blocksize < vectorized_chunk_size) {
 652 |     0 |     unshuffle_generic(bytesoftype, blocksize, _src, _dest);
 653 |     0 |     return;
 654 |     0 |   }
 655 |       |
 656 |       |   /* Optimized unshuffle implementations */
 657 |     0 |   switch (bytesoftype) {
 658 |     0 |     case 2:
 659 |     0 |       unshuffle2_sse2(_dest, _src, vectorizable_elements, total_elements);
 660 |     0 |       break;
 661 |     0 |     case 4:
 662 |     0 |       unshuffle4_sse2(_dest, _src, vectorizable_elements, total_elements);
 663 |     0 |       break;
 664 |     0 |     case 8:
 665 |     0 |       unshuffle8_sse2(_dest, _src, vectorizable_elements, total_elements);
 666 |     0 |       break;
 667 |     0 |     case 12:
 668 |     0 |       unshuffle12_sse2(_dest, _src, vectorizable_elements, total_elements);
 669 |     0 |       break;
 670 |     0 |     case 16:
 671 |     0 |       unshuffle16_sse2(_dest, _src, vectorizable_elements, total_elements);
 672 |     0 |       break;
 673 |     0 |     default:
 674 |     0 |       if (bytesoftype > (int32_t)sizeof(__m128i)) {
 675 |     0 |         unshuffle16_tiled_sse2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
 676 |     0 |       }
 677 |     0 |       else {
 678 |       |         /* Non-optimized unshuffle */
 679 |     0 |         unshuffle_generic(bytesoftype, blocksize, _src, _dest);
 680 |       |         /* The non-optimized function covers the whole buffer,
 681 |       |            so we're done processing here. */
 682 |     0 |         return;
 683 |     0 |       }
 684 |     0 |   }
 685 |       |
 686 |       |   /* If the buffer had any bytes at the end which couldn't be handled
 687 |       |      by the vectorized implementations, use the non-optimized version
 688 |       |      to finish them up. */
 689 |     0 |   if (vectorizable_bytes < blocksize) {
 690 |     0 |     unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
 691 |     0 |   }
 692 |     0 | }
 693 |       |
 694 |       | const bool is_shuffle_sse2 = true;
 695 |       |
 696 |       | #else /* defined(__SSE2__) */
 697 |       |
 698 |       | const bool is_shuffle_sse2 = false;
 699 |       |
 700 |       | void shuffle_sse2(const int32_t bytesoftype, const int32_t blocksize,
 701 |       |                   const uint8_t *_src, uint8_t *_dest) {
 702 |       |   abort();
 703 |       | }
 704 |       |
 705 |       | void unshuffle_sse2(const int32_t bytesoftype, const int32_t blocksize,
 706 |       |                     const uint8_t *_src, uint8_t *_dest) {
 707 |       |   abort();
 708 |       | }
 709 |       |
 710 |       | #endif /* defined(__SSE2__) */
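
Every instrumented line above reports a count of 0: the run behind this report never executed any of the SSE2 shuffle or unshuffle routines in this file. A round-trip driver along the following lines would exercise the 4-byte dispatch path (shuffle_sse2 -> shuffle4_sse2, then unshuffle_sse2 -> unshuffle4_sse2). This is only a sketch, assuming an SSE2-enabled build of the c-blosc2 shuffle sources where these entry points are reachable through "shuffle-sse2.h"; the buffer sizes chosen here are illustrative, not part of the report.

/* Sketch of a coverage driver for the routines listed above (assumptions noted
   in the lead-in: SSE2 build, linked against the blosc2 shuffle sources). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "shuffle-sse2.h"

int main(void) {
  enum { TYPESIZE = 4, BLOCKSIZE = 4096 };   /* 4-byte typesize selects shuffle4_sse2 */
  uint8_t src[BLOCKSIZE], shuffled[BLOCKSIZE], restored[BLOCKSIZE];

  /* Fill the source block with a deterministic pattern. */
  for (int i = 0; i < BLOCKSIZE; i++)
    src[i] = (uint8_t)(i * 7 + 3);

  /* Shuffle and then unshuffle; the pair must reproduce the original block. */
  shuffle_sse2(TYPESIZE, BLOCKSIZE, src, shuffled);
  unshuffle_sse2(TYPESIZE, BLOCKSIZE, shuffled, restored);

  if (memcmp(src, restored, BLOCKSIZE) != 0) {
    fprintf(stderr, "round-trip mismatch\n");
    return 1;
  }
  printf("round-trip OK\n");
  return 0;
}

Repeating the same round trip with typesizes 2, 8, 12, 16, and a value above 16 would reach the remaining switch cases (including the tiled variants), and choosing a blocksize that is not a multiple of typesize * 16 would also cover the generic tail handling.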