Coverage Report

Created: 2023-12-08 06:59

/src/c-blosc/blosc/bitshuffle-sse2.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Bitshuffle - Filter for improving compression of typed binary data.
3
 *
4
 * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
5
 * Website: http://www.github.com/kiyo-masui/bitshuffle
6
 * Created: 2014
7
 *
8
 * Note: Adapted for c-blosc by Francesc Alted.
9
 *
10
 * See LICENSES/BITSHUFFLE.txt file for details about copyright and
11
 * rights to use.
12
 *
13
 */
14
15
#include "bitshuffle-generic.h"
16
#include "bitshuffle-sse2.h"
17
18
/* Define dummy functions if SSE2 is not available for the compilation target and compiler. */
19
#if !defined(__SSE2__)
20
#include <stdlib.h>
21
22
/* Stub used when SSE2 is unavailable at compile time.  Must never be
 * reached at runtime: callers are expected to gate on SSE2 support. */
int64_t blosc_internal_bshuf_trans_byte_elem_sse2(void* in, void* out, const size_t size,
                                                  const size_t elem_size, void* tmp_buf) {
    /* Silence -Wunused-parameter; this stub only aborts. */
    (void)in; (void)out; (void)size; (void)elem_size; (void)tmp_buf;
    abort();
}
26
27
/* Stub used when SSE2 is unavailable at compile time.  Must never be
 * reached at runtime: callers are expected to gate on SSE2 support. */
int64_t blosc_internal_bshuf_untrans_bit_elem_sse2(void* in, void* out, const size_t size,
                                           const size_t elem_size, void* tmp_buf) {
    /* Silence -Wunused-parameter; this stub only aborts. */
    (void)in; (void)out; (void)size; (void)elem_size; (void)tmp_buf;
    abort();
}
31
32
#else /* defined(__SSE2__) */
33
34
#include <emmintrin.h>
35
36
/* The next is useful for debugging purposes */
37
#if 0
38
#include <stdio.h>
39
#include <string.h>
40
41
42
static void printxmm(__m128i xmm0)
43
{
44
  uint8_t buf[32];
45
46
  ((__m128i *)buf)[0] = xmm0;
47
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
48
          buf[0], buf[1], buf[2], buf[3],
49
          buf[4], buf[5], buf[6], buf[7],
50
          buf[8], buf[9], buf[10], buf[11],
51
          buf[12], buf[13], buf[14], buf[15]);
52
}
53
#endif
54
55
56
/* ---- Worker code that requires SSE2. Intel Pentium 4 (2000) and later. ---- */
57
58
/* Transpose bytes within elements for 16 bit elements. */
59
0
static int64_t bshuf_trans_byte_elem_SSE_16(void* in, void* out, const size_t size) {
    /* Gather the low bytes of all 16-bit elements into the first output row
     * and the high bytes into the second (rows of `size` bytes each). */

    char* src = (char*) in;
    char* dst = (char*) out;
    __m128i x, y, t;
    size_t ii;
    int round;

    /* 16 elements (32 input bytes) per iteration. */
    for (ii = 0; ii + 15 < size; ii += 16) {
        x = _mm_loadu_si128((__m128i *) &src[2*ii]);
        y = _mm_loadu_si128((__m128i *) &src[2*ii + 16]);

        /* Four identical interleave rounds separate the two byte planes;
         * each round replaces (x, y) with (unpacklo(x, y), unpackhi(x, y)). */
        for (round = 0; round < 4; round++) {
            t = _mm_unpacklo_epi8(x, y);
            y = _mm_unpackhi_epi8(x, y);
            x = t;
        }

        _mm_storeu_si128((__m128i *) &dst[ii], x);
        _mm_storeu_si128((__m128i *) &dst[size + ii], y);
    }
    /* Scalar pass over the trailing size % 16 elements. */
    return blosc_internal_bshuf_trans_byte_elem_remainder(in, out, size, 2,
            size - size % 16);
}
88
89
90
/* Transpose bytes within elements for 32 bit elements. */
91
0
static int64_t bshuf_trans_byte_elem_SSE_32(void* in, void* out, const size_t size) {
    /* Gather byte k of every 32-bit element into contiguous output row k
     * (rows of `size` bytes).  Processes 16 elements (64 input bytes) per
     * iteration; the order of the unpack rounds below is load-bearing. */

    char* in_b = (char*) in;
    char* out_b = (char*) out;
    __m128i a0, b0, c0, d0, a1, b1, c1, d1;
    size_t ii;

    for (ii=0; ii + 15 < size; ii += 16) {
        /* Load 16 consecutive 4-byte elements into four registers. */
        a0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 0*16]);
        b0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 1*16]);
        c0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 2*16]);
        d0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 3*16]);

        /* Three rounds of byte interleaving sort bytes by their position
         * within the element. */
        a1 = _mm_unpacklo_epi8(a0, b0);
        b1 = _mm_unpackhi_epi8(a0, b0);
        c1 = _mm_unpacklo_epi8(c0, d0);
        d1 = _mm_unpackhi_epi8(c0, d0);

        a0 = _mm_unpacklo_epi8(a1, b1);
        b0 = _mm_unpackhi_epi8(a1, b1);
        c0 = _mm_unpacklo_epi8(c1, d1);
        d0 = _mm_unpackhi_epi8(c1, d1);

        a1 = _mm_unpacklo_epi8(a0, b0);
        b1 = _mm_unpackhi_epi8(a0, b0);
        c1 = _mm_unpacklo_epi8(c0, d0);
        d1 = _mm_unpackhi_epi8(c0, d0);

        /* Recombine 64-bit halves so each register holds one full byte
         * plane of the 16 elements. */
        a0 = _mm_unpacklo_epi64(a1, c1);
        b0 = _mm_unpackhi_epi64(a1, c1);
        c0 = _mm_unpacklo_epi64(b1, d1);
        d0 = _mm_unpackhi_epi64(b1, d1);

        /* Row k of the output receives byte k of every element. */
        _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
        _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
        _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
        _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
    }
    /* Scalar pass over the trailing size % 16 elements. */
    return blosc_internal_bshuf_trans_byte_elem_remainder(in, out, size, 4,
            size - size % 16);
}
132
133
134
/* Transpose bytes within elements for 64 bit elements. */
135
0
static int64_t bshuf_trans_byte_elem_SSE_64(void* in, void* out, const size_t size) {
    /* Gather byte k of every 64-bit element into contiguous output row k
     * (rows of `size` bytes).  Processes 16 elements (128 input bytes) per
     * iteration; the order of the unpack rounds below is load-bearing. */

    char* in_b = (char*) in;
    char* out_b = (char*) out;
    __m128i a0, b0, c0, d0, e0, f0, g0, h0;
    __m128i a1, b1, c1, d1, e1, f1, g1, h1;
    size_t ii;

    for (ii=0; ii + 15 < size; ii += 16) {
        /* Load 16 consecutive 8-byte elements into eight registers. */
        a0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 0*16]);
        b0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 1*16]);
        c0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 2*16]);
        d0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 3*16]);
        e0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 4*16]);
        f0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 5*16]);
        g0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 6*16]);
        h0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 7*16]);

        /* Two rounds of byte interleaving... */
        a1 = _mm_unpacklo_epi8(a0, b0);
        b1 = _mm_unpackhi_epi8(a0, b0);
        c1 = _mm_unpacklo_epi8(c0, d0);
        d1 = _mm_unpackhi_epi8(c0, d0);
        e1 = _mm_unpacklo_epi8(e0, f0);
        f1 = _mm_unpackhi_epi8(e0, f0);
        g1 = _mm_unpacklo_epi8(g0, h0);
        h1 = _mm_unpackhi_epi8(g0, h0);

        a0 = _mm_unpacklo_epi8(a1, b1);
        b0 = _mm_unpackhi_epi8(a1, b1);
        c0 = _mm_unpacklo_epi8(c1, d1);
        d0 = _mm_unpackhi_epi8(c1, d1);
        e0 = _mm_unpacklo_epi8(e1, f1);
        f0 = _mm_unpackhi_epi8(e1, f1);
        g0 = _mm_unpacklo_epi8(g1, h1);
        h0 = _mm_unpackhi_epi8(g1, h1);

        /* ...then a 32-bit round... */
        a1 = _mm_unpacklo_epi32(a0, c0);
        b1 = _mm_unpackhi_epi32(a0, c0);
        c1 = _mm_unpacklo_epi32(b0, d0);
        d1 = _mm_unpackhi_epi32(b0, d0);
        e1 = _mm_unpacklo_epi32(e0, g0);
        f1 = _mm_unpackhi_epi32(e0, g0);
        g1 = _mm_unpacklo_epi32(f0, h0);
        h1 = _mm_unpackhi_epi32(f0, h0);

        /* ...and a 64-bit round complete the byte-plane separation. */
        a0 = _mm_unpacklo_epi64(a1, e1);
        b0 = _mm_unpackhi_epi64(a1, e1);
        c0 = _mm_unpacklo_epi64(b1, f1);
        d0 = _mm_unpackhi_epi64(b1, f1);
        e0 = _mm_unpacklo_epi64(c1, g1);
        f0 = _mm_unpackhi_epi64(c1, g1);
        g0 = _mm_unpacklo_epi64(d1, h1);
        h0 = _mm_unpackhi_epi64(d1, h1);

        /* Row k of the output receives byte k of every element. */
        _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
        _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
        _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
        _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
        _mm_storeu_si128((__m128i *) &out_b[4*size + ii], e0);
        _mm_storeu_si128((__m128i *) &out_b[5*size + ii], f0);
        _mm_storeu_si128((__m128i *) &out_b[6*size + ii], g0);
        _mm_storeu_si128((__m128i *) &out_b[7*size + ii], h0);
    }
    /* Scalar pass over the trailing size % 16 elements. */
    return blosc_internal_bshuf_trans_byte_elem_remainder(in, out, size, 8,
            size - size % 16);
}
201
202
203
/* Memory copy with bshuf call signature. */
204
/* Plain memory copy exposed with the standard bshuf call signature.
 * Returns the number of bytes copied (size * elem_size). */
static int64_t bshuf_copy(void* in, void* out, const size_t size,
                          const size_t elem_size) {

    const size_t nbytes = size * elem_size;

    memcpy(out, in, nbytes);
    return (int64_t) nbytes;
}
213
214
215
/* Transpose bytes within elements using best SSE algorithm available. */
216
int64_t blosc_internal_bshuf_trans_byte_elem_sse2(void* in, void* out, const size_t size,
217
53.8k
                                                  const size_t elem_size, void* tmp_buf) {
218
219
53.8k
    int64_t count;
220
221
    /*  Trivial cases: power of 2 bytes. */
222
53.8k
    switch (elem_size) {
223
53.8k
        case 1:
224
53.8k
            count = bshuf_copy(in, out, size, elem_size);
225
53.8k
            return count;
226
0
        case 2:
227
0
            count = bshuf_trans_byte_elem_SSE_16(in, out, size);
228
0
            return count;
229
0
        case 4:
230
0
            count = bshuf_trans_byte_elem_SSE_32(in, out, size);
231
0
            return count;
232
0
        case 8:
233
0
            count = bshuf_trans_byte_elem_SSE_64(in, out, size);
234
0
            return count;
235
53.8k
    }
236
237
    /*  Worst case: odd number of bytes. Turns out that this is faster for */
238
    /*  (odd * 2) byte elements as well (hence % 4). */
239
0
    if (elem_size % 4) {
240
0
        count = blosc_internal_bshuf_trans_byte_elem_scal(in, out, size, elem_size);
241
0
        return count;
242
0
    }
243
244
    /*  Multiple of power of 2: transpose hierarchically. */
245
0
    {
246
0
        size_t nchunk_elem;
247
248
0
        if ((elem_size % 8) == 0) {
249
0
            nchunk_elem = elem_size / 8;
250
0
            TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t);
251
0
            count = bshuf_trans_byte_elem_SSE_64(out, tmp_buf,
252
0
                    size * nchunk_elem);
253
0
            blosc_internal_bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size);
254
0
        } else if ((elem_size % 4) == 0) {
255
0
            nchunk_elem = elem_size / 4;
256
0
            TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t);
257
0
            count = bshuf_trans_byte_elem_SSE_32(out, tmp_buf,
258
0
                    size * nchunk_elem);
259
0
            blosc_internal_bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
260
0
        } else {
261
            /*  Not used since scalar algorithm is faster. */
262
0
            nchunk_elem = elem_size / 2;
263
0
            TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
264
0
            count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf,
265
0
                    size * nchunk_elem);
266
0
            blosc_internal_bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size);
267
0
        }
268
269
0
        return count;
270
0
    }
271
0
}
272
273
274
/* Transpose bits within bytes. */
275
static int64_t bshuf_trans_bit_byte_sse2(void* in, void* out, const size_t size,
                                         const size_t elem_size) {
    /* Transpose the bits within each byte: bit b of every input byte is
     * collected into output row b (rows of nbyte / 8 bytes). */

    char* in_b = (char*) in;
    char* out_b = (char*) out;
    uint16_t* out_ui16;
    int64_t count;
    size_t nbyte = elem_size * size;
    __m128i xmm;
    int32_t bt;
    size_t ii, kk;

    CHECK_MULT_EIGHT(nbyte);

    /* Process 16 input bytes per iteration. */
    for (ii = 0; ii + 15 < nbyte; ii += 16) {
        xmm = _mm_loadu_si128((__m128i *) &in_b[ii]);
        for (kk = 0; kk < 8; kk++) {
            /* Gather the current MSB of all 16 bytes into a 16-bit mask. */
            bt = _mm_movemask_epi8(xmm);
            /* Shift left so the next-lower bit becomes the MSB. */
            xmm = _mm_slli_epi16(xmm, 1);
            /* NOTE(review): this 16-bit store may land on an odd address for
             * some (kk, nbyte) combinations; assumes the target tolerates
             * unaligned uint16_t access (true on x86) — confirm if porting. */
            out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
            *out_ui16 = bt;
        }
    }
    /* Scalar pass over the trailing nbyte % 16 bytes. */
    count = blosc_internal_bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
            nbyte - nbyte % 16);
    return count;
}
302
303
304
/* Transpose bits within elements. */
305
int64_t blosc_internal_bshuf_trans_bit_elem_sse2(void* in, void* out, const size_t size,
          const size_t elem_size, void* tmp_buf) {
    /* Full bitshuffle: run the three transpose stages in sequence, using
     * `tmp_buf` to ping-pong intermediate results with `out`. */

    int64_t err;

    CHECK_MULT_EIGHT(size);

    /* Stage 1: group bytes by their position within the element. */
    err = blosc_internal_bshuf_trans_byte_elem_sse2(in, out, size, elem_size, tmp_buf);
    CHECK_ERR(err);
    /* Stage 2: transpose the bits of every byte. */
    err = bshuf_trans_bit_byte_sse2(out, tmp_buf, size, elem_size);
    CHECK_ERR(err);
    /* Stage 3: regroup the eight bit-rows into their final layout. */
    err = blosc_internal_bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);

    return err;
}
320
321
322
/* For data organized into a row for each bit (8 * elem_size rows), transpose
323
 * the bytes. */
324
int64_t blosc_internal_bshuf_trans_byte_bitrow_sse2(void* in, void* out, const size_t size,
             const size_t elem_size) {
    /* Transpose a matrix of nrows x nbyte_row bytes (8 * elem_size bit-rows)
     * using 8x16 SIMD tiles, with a scalar fix-up for the ragged columns. */

    char* in_b = (char*) in;
    char* out_b = (char*) out;
    size_t nrows = 8 * elem_size;
    size_t nbyte_row = size / 8;
    size_t ii, jj;

    __m128i a0, b0, c0, d0, e0, f0, g0, h0;
    __m128i a1, b1, c1, d1, e1, f1, g1, h1;
    __m128 *as, *bs, *cs, *ds, *es, *fs, *gs, *hs;

    CHECK_MULT_EIGHT(size);

    /* One tile covers 8 rows x 16 columns of the byte matrix. */
    for (ii = 0; ii + 7 < nrows; ii += 8) {
        for (jj = 0; jj + 15 < nbyte_row; jj += 16) {
            /* Load 16 bytes from each of the 8 rows. */
            a0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 0)*nbyte_row + jj]);
            b0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 1)*nbyte_row + jj]);
            c0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 2)*nbyte_row + jj]);
            d0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 3)*nbyte_row + jj]);
            e0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 4)*nbyte_row + jj]);
            f0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 5)*nbyte_row + jj]);
            g0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 6)*nbyte_row + jj]);
            h0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 7)*nbyte_row + jj]);


            /* Interleave at byte granularity... */
            a1 = _mm_unpacklo_epi8(a0, b0);
            b1 = _mm_unpacklo_epi8(c0, d0);
            c1 = _mm_unpacklo_epi8(e0, f0);
            d1 = _mm_unpacklo_epi8(g0, h0);
            e1 = _mm_unpackhi_epi8(a0, b0);
            f1 = _mm_unpackhi_epi8(c0, d0);
            g1 = _mm_unpackhi_epi8(e0, f0);
            h1 = _mm_unpackhi_epi8(g0, h0);


            /* ...then 16-bit granularity... */
            a0 = _mm_unpacklo_epi16(a1, b1);
            b0 = _mm_unpacklo_epi16(c1, d1);
            c0 = _mm_unpackhi_epi16(a1, b1);
            d0 = _mm_unpackhi_epi16(c1, d1);

            e0 = _mm_unpacklo_epi16(e1, f1);
            f0 = _mm_unpacklo_epi16(g1, h1);
            g0 = _mm_unpackhi_epi16(e1, f1);
            h0 = _mm_unpackhi_epi16(g1, h1);


            /* ...then 32-bit granularity: each register now holds two
             * transposed 8-byte output columns. */
            a1 = _mm_unpacklo_epi32(a0, b0);
            b1 = _mm_unpackhi_epi32(a0, b0);

            c1 = _mm_unpacklo_epi32(c0, d0);
            d1 = _mm_unpackhi_epi32(c0, d0);

            e1 = _mm_unpacklo_epi32(e0, f0);
            f1 = _mm_unpackhi_epi32(e0, f0);

            g1 = _mm_unpacklo_epi32(g0, h0);
            h1 = _mm_unpackhi_epi32(g0, h0);

            /*  We don't have a storeh instruction for integers, so interpret */
            /*  as a float. Have a storel (_mm_storel_epi64). */
            as = (__m128 *) &a1;
            bs = (__m128 *) &b1;
            cs = (__m128 *) &c1;
            ds = (__m128 *) &d1;
            es = (__m128 *) &e1;
            fs = (__m128 *) &f1;
            gs = (__m128 *) &g1;
            hs = (__m128 *) &h1;

            /* Low 64 bits -> even output columns... */
            _mm_storel_pi((__m64 *) &out_b[(jj + 0) * nrows + ii], *as);
            _mm_storel_pi((__m64 *) &out_b[(jj + 2) * nrows + ii], *bs);
            _mm_storel_pi((__m64 *) &out_b[(jj + 4) * nrows + ii], *cs);
            _mm_storel_pi((__m64 *) &out_b[(jj + 6) * nrows + ii], *ds);
            _mm_storel_pi((__m64 *) &out_b[(jj + 8) * nrows + ii], *es);
            _mm_storel_pi((__m64 *) &out_b[(jj + 10) * nrows + ii], *fs);
            _mm_storel_pi((__m64 *) &out_b[(jj + 12) * nrows + ii], *gs);
            _mm_storel_pi((__m64 *) &out_b[(jj + 14) * nrows + ii], *hs);

            /* ...high 64 bits -> odd output columns. */
            _mm_storeh_pi((__m64 *) &out_b[(jj + 1) * nrows + ii], *as);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 3) * nrows + ii], *bs);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 5) * nrows + ii], *cs);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 7) * nrows + ii], *ds);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 9) * nrows + ii], *es);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 11) * nrows + ii], *fs);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 13) * nrows + ii], *gs);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 15) * nrows + ii], *hs);
        }
        /* Scalar transpose of the remaining nbyte_row % 16 columns. */
        for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) {
            out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj];
            out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj];
            out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj];
            out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj];
            out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj];
            out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj];
            out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj];
            out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj];
        }
    }
    return size * elem_size;
}
426
427
428
/* Shuffle bits within the bytes of eight element blocks. */
429
int64_t blosc_internal_bshuf_shuffle_bit_eightelem_sse2(void* in, void* out, const size_t size,
           const size_t elem_size) {
    /*  With a bit of care, this could be written such that it is */
    /*  in_buf = out_buf safe. */
    char* in_b = (char*) in;
    uint16_t* out_ui16 = (uint16_t*) out;

    size_t nbyte = elem_size * size;

    __m128i xmm;
    int32_t bt;
    size_t ii, jj, kk;
    size_t ind;

    CHECK_MULT_EIGHT(size);

    if (elem_size % 2) {
        /* Odd element sizes would make the 16-byte vector loads straddle
         * block boundaries; use the scalar implementation instead. */
        blosc_internal_bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
    } else {
        /* ii walks blocks of eight elements (8 * elem_size bytes each);
         * jj walks 16-byte vectors within the block. */
        for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
                ii += 8 * elem_size) {
            for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
                xmm = _mm_loadu_si128((__m128i *) &in_b[ii + jj]);
                for (kk = 0; kk < 8; kk++) {
                    /* Gather the current MSB of all 16 bytes, then shift
                     * the next-lower bit into MSB position. */
                    bt = _mm_movemask_epi8(xmm);
                    xmm = _mm_slli_epi16(xmm, 1);
                    /* Destination byte index for the 16-bit mask of bit
                     * (7 - kk); written through the uint16_t view of out. */
                    ind = (ii + jj / 8 + (7 - kk) * elem_size);
                    out_ui16[ind / 2] = bt;
                }
            }
        }
    }
    return size * elem_size;
}
463
464
465
/* Untranspose bits within elements. */
466
int64_t blosc_internal_bshuf_untrans_bit_elem_sse2(void* in, void* out, const size_t size,
            const size_t elem_size, void* tmp_buf) {
    /* Inverse bitshuffle: undo the two transpose stages in reverse order,
     * staging the intermediate result in `tmp_buf`. */

    int64_t err;

    CHECK_MULT_EIGHT(size);

    /* First undo the byte/bit-row transpose... */
    err = blosc_internal_bshuf_trans_byte_bitrow_sse2(in, tmp_buf, size, elem_size);
    CHECK_ERR(err);
    /* ...then restore the bit order within each eight-element block. */
    err = blosc_internal_bshuf_shuffle_bit_eightelem_sse2(tmp_buf, out, size, elem_size);

    return err;
}
479
480
#endif /* !defined(__SSE2__) */