Coverage Report

Created: 2024-09-08 06:37

/src/c-blosc2/blosc/bitshuffle-sse2.c
Line
Count
Source (jump to first uncovered line)
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
/*********************************************************************
12
  Bitshuffle - Filter for improving compression of typed binary data.
13
14
  Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
15
  Website: https://github.com/kiyo-masui/bitshuffle
16
17
  Note: Adapted for c-blosc by Francesc Alted.
18
19
  See LICENSES/BITSHUFFLE.txt file for details about copyright and
20
  rights to use.
21
**********************************************************************/
22
23
24
#include "bitshuffle-sse2.h"
25
#include "bitshuffle-generic.h"
26
#include <stdlib.h>
27
28
/* Make sure SSE2 is available for the compilation target and compiler. */
29
#if defined(__SSE2__)
30
31
#include <emmintrin.h>
32
33
/* The next is useful for debugging purposes */
34
#if 0
35
#include <stdio.h>
36
#include <string.h>
37
38
39
static void printxmm(__m128i xmm0)
40
{
41
  uint8_t buf[32];
42
43
  ((__m128i *)buf)[0] = xmm0;
44
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
45
          buf[0], buf[1], buf[2], buf[3],
46
          buf[4], buf[5], buf[6], buf[7],
47
          buf[8], buf[9], buf[10], buf[11],
48
          buf[12], buf[13], buf[14], buf[15]);
49
}
50
#endif
51
52
53
/* ---- Worker code that requires SSE2. Intel Pentium 4 (2000) and later. ---- */
54
55
56
/* Transpose bytes within elements for 16 bit elements. */
57
0
int64_t bshuf_trans_byte_elem_SSE_16(const void* in, void* out, const size_t size) {
58
59
0
  size_t ii;
60
0
  const char *in_b = (const char*) in;
61
0
  char *out_b = (char*) out;
62
0
  __m128i a0, b0, a1, b1;
63
64
0
  for (ii=0; ii + 15 < size; ii += 16) {
65
0
    a0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 0*16]);
66
0
    b0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 1*16]);
67
68
0
    a1 = _mm_unpacklo_epi8(a0, b0);
69
0
    b1 = _mm_unpackhi_epi8(a0, b0);
70
71
0
    a0 = _mm_unpacklo_epi8(a1, b1);
72
0
    b0 = _mm_unpackhi_epi8(a1, b1);
73
74
0
    a1 = _mm_unpacklo_epi8(a0, b0);
75
0
    b1 = _mm_unpackhi_epi8(a0, b0);
76
77
0
    a0 = _mm_unpacklo_epi8(a1, b1);
78
0
    b0 = _mm_unpackhi_epi8(a1, b1);
79
80
0
    _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
81
0
    _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
82
0
  }
83
0
  return bshuf_trans_byte_elem_remainder(in, out, size, 2,
84
0
                                         size - size % 16);
85
0
}
86
87
88
/* Transpose bytes within elements for 32 bit elements. */
89
0
int64_t bshuf_trans_byte_elem_SSE_32(const void* in, void* out, const size_t size) {
90
91
0
  size_t ii;
92
0
  const char *in_b;
93
0
  char *out_b;
94
0
  in_b = (const char*) in;
95
0
  out_b = (char*) out;
96
0
  __m128i a0, b0, c0, d0, a1, b1, c1, d1;
97
98
0
  for (ii=0; ii + 15 < size; ii += 16) {
99
0
    a0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 0*16]);
100
0
    b0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 1*16]);
101
0
    c0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 2*16]);
102
0
    d0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 3*16]);
103
104
0
    a1 = _mm_unpacklo_epi8(a0, b0);
105
0
    b1 = _mm_unpackhi_epi8(a0, b0);
106
0
    c1 = _mm_unpacklo_epi8(c0, d0);
107
0
    d1 = _mm_unpackhi_epi8(c0, d0);
108
109
0
    a0 = _mm_unpacklo_epi8(a1, b1);
110
0
    b0 = _mm_unpackhi_epi8(a1, b1);
111
0
    c0 = _mm_unpacklo_epi8(c1, d1);
112
0
    d0 = _mm_unpackhi_epi8(c1, d1);
113
114
0
    a1 = _mm_unpacklo_epi8(a0, b0);
115
0
    b1 = _mm_unpackhi_epi8(a0, b0);
116
0
    c1 = _mm_unpacklo_epi8(c0, d0);
117
0
    d1 = _mm_unpackhi_epi8(c0, d0);
118
119
0
    a0 = _mm_unpacklo_epi64(a1, c1);
120
0
    b0 = _mm_unpackhi_epi64(a1, c1);
121
0
    c0 = _mm_unpacklo_epi64(b1, d1);
122
0
    d0 = _mm_unpackhi_epi64(b1, d1);
123
124
0
    _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
125
0
    _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
126
0
    _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
127
0
    _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
128
0
  }
129
0
  return bshuf_trans_byte_elem_remainder(in, out, size, 4,
130
0
                                         size - size % 16);
131
0
}
132
133
134
/* Transpose bytes within elements for 64 bit elements. */
135
0
int64_t bshuf_trans_byte_elem_SSE_64(const void* in, void* out, const size_t size) {
136
137
0
  size_t ii;
138
0
  const char* in_b = (const char*) in;
139
0
  char* out_b = (char*) out;
140
0
  __m128i a0, b0, c0, d0, e0, f0, g0, h0;
141
0
  __m128i a1, b1, c1, d1, e1, f1, g1, h1;
142
143
0
  for (ii=0; ii + 15 < size; ii += 16) {
144
0
    a0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 0*16]);
145
0
    b0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 1*16]);
146
0
    c0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 2*16]);
147
0
    d0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 3*16]);
148
0
    e0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 4*16]);
149
0
    f0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 5*16]);
150
0
    g0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 6*16]);
151
0
    h0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 7*16]);
152
153
0
    a1 = _mm_unpacklo_epi8(a0, b0);
154
0
    b1 = _mm_unpackhi_epi8(a0, b0);
155
0
    c1 = _mm_unpacklo_epi8(c0, d0);
156
0
    d1 = _mm_unpackhi_epi8(c0, d0);
157
0
    e1 = _mm_unpacklo_epi8(e0, f0);
158
0
    f1 = _mm_unpackhi_epi8(e0, f0);
159
0
    g1 = _mm_unpacklo_epi8(g0, h0);
160
0
    h1 = _mm_unpackhi_epi8(g0, h0);
161
162
0
    a0 = _mm_unpacklo_epi8(a1, b1);
163
0
    b0 = _mm_unpackhi_epi8(a1, b1);
164
0
    c0 = _mm_unpacklo_epi8(c1, d1);
165
0
    d0 = _mm_unpackhi_epi8(c1, d1);
166
0
    e0 = _mm_unpacklo_epi8(e1, f1);
167
0
    f0 = _mm_unpackhi_epi8(e1, f1);
168
0
    g0 = _mm_unpacklo_epi8(g1, h1);
169
0
    h0 = _mm_unpackhi_epi8(g1, h1);
170
171
0
    a1 = _mm_unpacklo_epi32(a0, c0);
172
0
    b1 = _mm_unpackhi_epi32(a0, c0);
173
0
    c1 = _mm_unpacklo_epi32(b0, d0);
174
0
    d1 = _mm_unpackhi_epi32(b0, d0);
175
0
    e1 = _mm_unpacklo_epi32(e0, g0);
176
0
    f1 = _mm_unpackhi_epi32(e0, g0);
177
0
    g1 = _mm_unpacklo_epi32(f0, h0);
178
0
    h1 = _mm_unpackhi_epi32(f0, h0);
179
180
0
    a0 = _mm_unpacklo_epi64(a1, e1);
181
0
    b0 = _mm_unpackhi_epi64(a1, e1);
182
0
    c0 = _mm_unpacklo_epi64(b1, f1);
183
0
    d0 = _mm_unpackhi_epi64(b1, f1);
184
0
    e0 = _mm_unpacklo_epi64(c1, g1);
185
0
    f0 = _mm_unpackhi_epi64(c1, g1);
186
0
    g0 = _mm_unpacklo_epi64(d1, h1);
187
0
    h0 = _mm_unpackhi_epi64(d1, h1);
188
189
0
    _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
190
0
    _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
191
0
    _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
192
0
    _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
193
0
    _mm_storeu_si128((__m128i *) &out_b[4*size + ii], e0);
194
0
    _mm_storeu_si128((__m128i *) &out_b[5*size + ii], f0);
195
0
    _mm_storeu_si128((__m128i *) &out_b[6*size + ii], g0);
196
0
    _mm_storeu_si128((__m128i *) &out_b[7*size + ii], h0);
197
0
  }
198
0
  return bshuf_trans_byte_elem_remainder(in, out, size, 8,
199
0
                                         size - size % 16);
200
0
}
201
202
203
/* Transpose bytes within elements using the best SSE algorithm available. */
204
int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
205
107k
                                  const size_t elem_size) {
206
207
107k
  int64_t count;
208
209
  // Trivial cases: power of 2 bytes.
210
107k
  switch (elem_size) {
211
107k
    case 1:
212
107k
      count = bshuf_copy(in, out, size, elem_size);
213
107k
      return count;
214
0
    case 2:
215
0
      count = bshuf_trans_byte_elem_SSE_16(in, out, size);
216
0
      return count;
217
0
    case 4:
218
0
      count = bshuf_trans_byte_elem_SSE_32(in, out, size);
219
0
      return count;
220
0
    case 8:
221
0
      count = bshuf_trans_byte_elem_SSE_64(in, out, size);
222
0
      return count;
223
107k
  }
224
225
  // Worst case: odd number of bytes. Turns out that this is faster for
226
  // (odd * 2) byte elements as well (hence % 4).
227
0
  if (elem_size % 4) {
228
0
    count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
229
0
    return count;
230
0
  }
231
232
  // Multiple of power of 2: transpose hierarchically.
233
0
  {
234
0
    size_t nchunk_elem;
235
0
    void* tmp_buf = malloc(size * elem_size);
236
0
    if (tmp_buf == NULL) return -1;
237
238
0
    if ((elem_size % 8) == 0) {
239
0
      nchunk_elem = elem_size / 8;
240
0
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t);
241
0
      count = bshuf_trans_byte_elem_SSE_64(out, tmp_buf,
242
0
                                           size * nchunk_elem);
243
0
      bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size);
244
0
    } else if ((elem_size % 4) == 0) {
245
0
      nchunk_elem = elem_size / 4;
246
0
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t);
247
0
      count = bshuf_trans_byte_elem_SSE_32(out, tmp_buf,
248
0
                                           size * nchunk_elem);
249
0
      bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
250
0
    } else {
251
      // Not used since scalar algorithm is faster.
252
0
      nchunk_elem = elem_size / 2;
253
0
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
254
0
      count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf,
255
0
                                           size * nchunk_elem);
256
0
      bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size);
257
0
    }
258
259
0
    free(tmp_buf);
260
0
    return count;
261
0
  }
262
0
}
263
264
265
/* Transpose bits within bytes. */
266
int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size,
267
0
                                 const size_t elem_size) {
268
269
0
  size_t ii, kk;
270
0
  const char* in_b = (const char*) in;
271
0
  char* out_b = (char*) out;
272
0
  uint16_t* out_ui16;
273
274
0
  int64_t count;
275
276
0
  size_t nbyte = elem_size * size;
277
278
0
  CHECK_MULT_EIGHT(nbyte);
279
280
0
  __m128i xmm;
281
0
  int32_t bt;
282
283
0
  for (ii = 0; ii + 15 < nbyte; ii += 16) {
284
0
    xmm = _mm_loadu_si128((__m128i *) &in_b[ii]);
285
0
    for (kk = 0; kk < 8; kk++) {
286
0
      bt = _mm_movemask_epi8(xmm);
287
0
      xmm = _mm_slli_epi16(xmm, 1);
288
0
      out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
289
0
      *out_ui16 = bt;
290
0
    }
291
0
  }
292
0
  count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
293
0
                                         nbyte - nbyte % 16);
294
0
  return count;
295
0
}
296
297
298
/* Transpose bits within elements. */
299
int64_t bshuf_trans_bit_elem_SSE(const void* in, void* out, const size_t size,
300
0
                                 const size_t elem_size) {
301
302
0
  int64_t count;
303
304
0
  CHECK_MULT_EIGHT(size);
305
306
0
  void* tmp_buf = malloc(size * elem_size);
307
0
  if (tmp_buf == NULL) return -1;
308
309
0
  count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
310
0
  CHECK_ERR_FREE(count, tmp_buf);
311
0
  count = bshuf_trans_bit_byte_SSE(out, tmp_buf, size, elem_size);
312
0
  CHECK_ERR_FREE(count, tmp_buf);
313
0
  count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
314
315
0
  free(tmp_buf);
316
317
0
  return count;
318
0
}
319
320
321
/* For data organized into a row for each bit (8 * elem_size rows), transpose
322
 * the bytes. */
323
int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size,
324
21.8k
                                    const size_t elem_size) {
325
326
21.8k
  size_t ii, jj;
327
21.8k
  const char* in_b = (const char*) in;
328
21.8k
  char* out_b = (char*) out;
329
330
21.8k
  CHECK_MULT_EIGHT(size);
331
332
21.8k
  size_t nrows = 8 * elem_size;
333
21.8k
  size_t nbyte_row = size / 8;
334
335
21.8k
  __m128i a0, b0, c0, d0, e0, f0, g0, h0;
336
21.8k
  __m128i a1, b1, c1, d1, e1, f1, g1, h1;
337
21.8k
  __m128 *as, *bs, *cs, *ds, *es, *fs, *gs, *hs;
338
339
43.7k
  for (ii = 0; ii + 7 < nrows; ii += 8) {
340
701k
    for (jj = 0; jj + 15 < nbyte_row; jj += 16) {
341
679k
      a0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 0)*nbyte_row + jj]);
342
679k
      b0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 1)*nbyte_row + jj]);
343
679k
      c0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 2)*nbyte_row + jj]);
344
679k
      d0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 3)*nbyte_row + jj]);
345
679k
      e0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 4)*nbyte_row + jj]);
346
679k
      f0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 5)*nbyte_row + jj]);
347
679k
      g0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 6)*nbyte_row + jj]);
348
679k
      h0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 7)*nbyte_row + jj]);
349
350
351
679k
      a1 = _mm_unpacklo_epi8(a0, b0);
352
679k
      b1 = _mm_unpacklo_epi8(c0, d0);
353
679k
      c1 = _mm_unpacklo_epi8(e0, f0);
354
679k
      d1 = _mm_unpacklo_epi8(g0, h0);
355
679k
      e1 = _mm_unpackhi_epi8(a0, b0);
356
679k
      f1 = _mm_unpackhi_epi8(c0, d0);
357
679k
      g1 = _mm_unpackhi_epi8(e0, f0);
358
679k
      h1 = _mm_unpackhi_epi8(g0, h0);
359
360
361
679k
      a0 = _mm_unpacklo_epi16(a1, b1);
362
679k
      b0 = _mm_unpacklo_epi16(c1, d1);
363
679k
      c0 = _mm_unpackhi_epi16(a1, b1);
364
679k
      d0 = _mm_unpackhi_epi16(c1, d1);
365
366
679k
      e0 = _mm_unpacklo_epi16(e1, f1);
367
679k
      f0 = _mm_unpacklo_epi16(g1, h1);
368
679k
      g0 = _mm_unpackhi_epi16(e1, f1);
369
679k
      h0 = _mm_unpackhi_epi16(g1, h1);
370
371
372
679k
      a1 = _mm_unpacklo_epi32(a0, b0);
373
679k
      b1 = _mm_unpackhi_epi32(a0, b0);
374
375
679k
      c1 = _mm_unpacklo_epi32(c0, d0);
376
679k
      d1 = _mm_unpackhi_epi32(c0, d0);
377
378
679k
      e1 = _mm_unpacklo_epi32(e0, f0);
379
679k
      f1 = _mm_unpackhi_epi32(e0, f0);
380
381
679k
      g1 = _mm_unpacklo_epi32(g0, h0);
382
679k
      h1 = _mm_unpackhi_epi32(g0, h0);
383
384
      // We don't have a storeh instruction for integers, so interpret
385
      // as a float. Have a storel (_mm_storel_epi64).
386
679k
      as = (__m128 *) &a1;
387
679k
      bs = (__m128 *) &b1;
388
679k
      cs = (__m128 *) &c1;
389
679k
      ds = (__m128 *) &d1;
390
679k
      es = (__m128 *) &e1;
391
679k
      fs = (__m128 *) &f1;
392
679k
      gs = (__m128 *) &g1;
393
679k
      hs = (__m128 *) &h1;
394
395
679k
      _mm_storel_pi((__m64 *) &out_b[(jj + 0) * nrows + ii], *as);
396
679k
      _mm_storel_pi((__m64 *) &out_b[(jj + 2) * nrows + ii], *bs);
397
679k
      _mm_storel_pi((__m64 *) &out_b[(jj + 4) * nrows + ii], *cs);
398
679k
      _mm_storel_pi((__m64 *) &out_b[(jj + 6) * nrows + ii], *ds);
399
679k
      _mm_storel_pi((__m64 *) &out_b[(jj + 8) * nrows + ii], *es);
400
679k
      _mm_storel_pi((__m64 *) &out_b[(jj + 10) * nrows + ii], *fs);
401
679k
      _mm_storel_pi((__m64 *) &out_b[(jj + 12) * nrows + ii], *gs);
402
679k
      _mm_storel_pi((__m64 *) &out_b[(jj + 14) * nrows + ii], *hs);
403
404
679k
      _mm_storeh_pi((__m64 *) &out_b[(jj + 1) * nrows + ii], *as);
405
679k
      _mm_storeh_pi((__m64 *) &out_b[(jj + 3) * nrows + ii], *bs);
406
679k
      _mm_storeh_pi((__m64 *) &out_b[(jj + 5) * nrows + ii], *cs);
407
679k
      _mm_storeh_pi((__m64 *) &out_b[(jj + 7) * nrows + ii], *ds);
408
679k
      _mm_storeh_pi((__m64 *) &out_b[(jj + 9) * nrows + ii], *es);
409
679k
      _mm_storeh_pi((__m64 *) &out_b[(jj + 11) * nrows + ii], *fs);
410
679k
      _mm_storeh_pi((__m64 *) &out_b[(jj + 13) * nrows + ii], *gs);
411
679k
      _mm_storeh_pi((__m64 *) &out_b[(jj + 15) * nrows + ii], *hs);
412
679k
    }
413
26.0k
    for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) {
414
4.22k
      out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj];
415
4.22k
      out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj];
416
4.22k
      out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj];
417
4.22k
      out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj];
418
4.22k
      out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj];
419
4.22k
      out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj];
420
4.22k
      out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj];
421
4.22k
      out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj];
422
4.22k
    }
423
21.8k
  }
424
21.8k
  return size * elem_size;
425
21.8k
}
426
427
428
/* Shuffle bits within the bytes of eight element blocks. */
429
int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t size,
430
21.8k
                                        const size_t elem_size) {
431
432
21.8k
  CHECK_MULT_EIGHT(size);
433
434
  // With a bit of care, this could be written such that it is
435
  // in_buf = out_buf safe.
436
21.8k
  const char* in_b = (const char*) in;
437
21.8k
  uint16_t* out_ui16 = (uint16_t*) out;
438
439
21.8k
  size_t ii, jj, kk;
440
21.8k
  size_t nbyte = elem_size * size;
441
442
21.8k
  __m128i xmm;
443
21.8k
  int32_t bt;
444
445
21.8k
  if (elem_size % 2) {
446
21.8k
    bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
447
21.8k
  } else {
448
0
    for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
449
0
         ii += 8 * elem_size) {
450
0
      for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
451
0
        xmm = _mm_loadu_si128((__m128i *) &in_b[ii + jj]);
452
0
        for (kk = 0; kk < 8; kk++) {
453
0
          bt = _mm_movemask_epi8(xmm);
454
0
          xmm = _mm_slli_epi16(xmm, 1);
455
0
          size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
456
0
          out_ui16[ind / 2] = bt;
457
0
        }
458
0
      }
459
0
    }
460
0
  }
461
21.8k
  return size * elem_size;
462
21.8k
}
463
464
465
/* Untranspose bits within elements. */
466
int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
467
0
                                   const size_t elem_size) {
468
469
0
  int64_t count;
470
471
0
  CHECK_MULT_EIGHT(size);
472
473
0
  void* tmp_buf = malloc(size * elem_size);
474
0
  if (tmp_buf == NULL) return -1;
475
476
0
  count = bshuf_trans_byte_bitrow_SSE(in, tmp_buf, size, elem_size);
477
0
  CHECK_ERR_FREE(count, tmp_buf);
478
0
  count =  bshuf_shuffle_bit_eightelem_SSE(tmp_buf, out, size, elem_size);
479
480
0
  free(tmp_buf);
481
482
0
  return count;
483
0
}
484
485
const bool is_bshuf_SSE = true;
486
487
#else /* defined(__SSE2__) */
488
489
const bool is_bshuf_SSE = false;
490
491
int64_t
492
bshuf_trans_bit_elem_SSE(const void* in, void* out, const size_t size,
493
                         const size_t elem_size) {
494
  abort();
495
}
496
497
int64_t
498
bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
499
                           const size_t elem_size) {
500
  abort();
501
}
502
503
#endif /* defined(__SSE2__) */