/src/c-blosc2/blosc/bitshuffle-sse2.c
/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Copyright (c) 2021 Blosc Development Team <blosc@blosc.org>
  https://blosc.org
  License: BSD 3-Clause (see LICENSE.txt)

  See LICENSE.txt for details about copyright and rights to use.
**********************************************************************/

/*********************************************************************
  Bitshuffle - Filter for improving compression of typed binary data.

  Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
  Website: https://github.com/kiyo-masui/bitshuffle

  Note: Adapted for c-blosc by Francesc Alted.

  See LICENSES/BITSHUFFLE.txt file for details about copyright and
  rights to use.
**********************************************************************/


#include "bitshuffle-sse2.h"
#include "bitshuffle-generic.h"
#include <stdlib.h>

/* Make sure SSE2 is available for the compilation target and compiler. */
#if defined(__SSE2__)

#include <emmintrin.h>

/* The following is useful for debugging purposes. */
#if 0
#include <stdio.h>
#include <string.h>


static void printxmm(__m128i xmm0)
{
  uint8_t buf[16];

  /* Use an unaligned store; buf has no 16-byte alignment guarantee. */
  _mm_storeu_si128((__m128i *) buf, xmm0);
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
         buf[0], buf[1], buf[2], buf[3],
         buf[4], buf[5], buf[6], buf[7],
         buf[8], buf[9], buf[10], buf[11],
         buf[12], buf[13], buf[14], buf[15]);
}
#endif


/* ---- Worker code that requires SSE2: Intel Pentium 4 (2000) and later. ---- */


/* Transpose bytes within elements for 16 bit elements. */
int64_t bshuf_trans_byte_elem_SSE_16(const void* in, void* out, const size_t size) {

  size_t ii;
  const char *in_b = (const char*) in;
  char *out_b = (char*) out;
  __m128i a0, b0, a1, b1;

  for (ii = 0; ii + 15 < size; ii += 16) {
    a0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 0*16]);
    b0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 1*16]);

    a1 = _mm_unpacklo_epi8(a0, b0);
    b1 = _mm_unpackhi_epi8(a0, b0);

    a0 = _mm_unpacklo_epi8(a1, b1);
    b0 = _mm_unpackhi_epi8(a1, b1);

    a1 = _mm_unpacklo_epi8(a0, b0);
    b1 = _mm_unpackhi_epi8(a0, b0);

    a0 = _mm_unpacklo_epi8(a1, b1);
    b0 = _mm_unpackhi_epi8(a1, b1);

    _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
    _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
  }
  return bshuf_trans_byte_elem_remainder(in, out, size, 2,
                                         size - size % 16);
}
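
/* Note: each unpacklo/unpackhi pair above is a perfect riffle of the 32
 * loaded bytes; four riffles send byte 0 of all 16 elements to a0 and
 * byte 1 to b0, i.e. a full 2 x 16 byte transpose. */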


/* Transpose bytes within elements for 32 bit elements. */
int64_t bshuf_trans_byte_elem_SSE_32(const void* in, void* out, const size_t size) {

  size_t ii;
  const char *in_b = (const char*) in;
  char *out_b = (char*) out;
  __m128i a0, b0, c0, d0, a1, b1, c1, d1;

  for (ii = 0; ii + 15 < size; ii += 16) {
    a0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 0*16]);
    b0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 1*16]);
    c0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 2*16]);
    d0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 3*16]);

    a1 = _mm_unpacklo_epi8(a0, b0);
    b1 = _mm_unpackhi_epi8(a0, b0);
    c1 = _mm_unpacklo_epi8(c0, d0);
    d1 = _mm_unpackhi_epi8(c0, d0);

    a0 = _mm_unpacklo_epi8(a1, b1);
    b0 = _mm_unpackhi_epi8(a1, b1);
    c0 = _mm_unpacklo_epi8(c1, d1);
    d0 = _mm_unpackhi_epi8(c1, d1);

    a1 = _mm_unpacklo_epi8(a0, b0);
    b1 = _mm_unpackhi_epi8(a0, b0);
    c1 = _mm_unpacklo_epi8(c0, d0);
    d1 = _mm_unpackhi_epi8(c0, d0);

    a0 = _mm_unpacklo_epi64(a1, c1);
    b0 = _mm_unpackhi_epi64(a1, c1);
    c0 = _mm_unpacklo_epi64(b1, d1);
    d0 = _mm_unpackhi_epi64(b1, d1);

    _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
    _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
    _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
    _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
  }
  return bshuf_trans_byte_elem_remainder(in, out, size, 4,
                                         size - size % 16);
}
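
/* Note: three rounds of byte interleaves plus a final round of 64-bit
 * interleaves complete the 4 x 16 byte transpose; after the stores, output
 * row r holds byte r of each of the 16 input elements. */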


/* Transpose bytes within elements for 64 bit elements. */
int64_t bshuf_trans_byte_elem_SSE_64(const void* in, void* out, const size_t size) {

  size_t ii;
  const char* in_b = (const char*) in;
  char* out_b = (char*) out;
  __m128i a0, b0, c0, d0, e0, f0, g0, h0;
  __m128i a1, b1, c1, d1, e1, f1, g1, h1;

  for (ii = 0; ii + 15 < size; ii += 16) {
    a0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 0*16]);
    b0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 1*16]);
    c0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 2*16]);
    d0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 3*16]);
    e0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 4*16]);
    f0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 5*16]);
    g0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 6*16]);
    h0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 7*16]);

    a1 = _mm_unpacklo_epi8(a0, b0);
    b1 = _mm_unpackhi_epi8(a0, b0);
    c1 = _mm_unpacklo_epi8(c0, d0);
    d1 = _mm_unpackhi_epi8(c0, d0);
    e1 = _mm_unpacklo_epi8(e0, f0);
    f1 = _mm_unpackhi_epi8(e0, f0);
    g1 = _mm_unpacklo_epi8(g0, h0);
    h1 = _mm_unpackhi_epi8(g0, h0);

    a0 = _mm_unpacklo_epi8(a1, b1);
    b0 = _mm_unpackhi_epi8(a1, b1);
    c0 = _mm_unpacklo_epi8(c1, d1);
    d0 = _mm_unpackhi_epi8(c1, d1);
    e0 = _mm_unpacklo_epi8(e1, f1);
    f0 = _mm_unpackhi_epi8(e1, f1);
    g0 = _mm_unpacklo_epi8(g1, h1);
    h0 = _mm_unpackhi_epi8(g1, h1);

    a1 = _mm_unpacklo_epi32(a0, c0);
    b1 = _mm_unpackhi_epi32(a0, c0);
    c1 = _mm_unpacklo_epi32(b0, d0);
    d1 = _mm_unpackhi_epi32(b0, d0);
    e1 = _mm_unpacklo_epi32(e0, g0);
    f1 = _mm_unpackhi_epi32(e0, g0);
    g1 = _mm_unpacklo_epi32(f0, h0);
    h1 = _mm_unpackhi_epi32(f0, h0);

    a0 = _mm_unpacklo_epi64(a1, e1);
    b0 = _mm_unpackhi_epi64(a1, e1);
    c0 = _mm_unpacklo_epi64(b1, f1);
    d0 = _mm_unpackhi_epi64(b1, f1);
    e0 = _mm_unpacklo_epi64(c1, g1);
    f0 = _mm_unpackhi_epi64(c1, g1);
    g0 = _mm_unpacklo_epi64(d1, h1);
    h0 = _mm_unpackhi_epi64(d1, h1);

    _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
    _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
    _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
    _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
    _mm_storeu_si128((__m128i *) &out_b[4*size + ii], e0);
    _mm_storeu_si128((__m128i *) &out_b[5*size + ii], f0);
    _mm_storeu_si128((__m128i *) &out_b[6*size + ii], g0);
    _mm_storeu_si128((__m128i *) &out_b[7*size + ii], h0);
  }
  return bshuf_trans_byte_elem_remainder(in, out, size, 8,
                                         size - size % 16);
}
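
/* Note: the 8 x 16 byte transpose above uses two rounds of byte interleaves,
 * one round of 32-bit interleaves, and one round of 64-bit interleaves, so
 * that output row r ends up holding byte r of each of the 16 input
 * elements. */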


/* Transpose bytes within elements using the best SSE algorithm available. */
int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
                                  const size_t elem_size) {

  int64_t count;

  // Trivial cases: power of 2 bytes.
  switch (elem_size) {
    case 1:
      count = bshuf_copy(in, out, size, elem_size);
      return count;
    case 2:
      count = bshuf_trans_byte_elem_SSE_16(in, out, size);
      return count;
    case 4:
      count = bshuf_trans_byte_elem_SSE_32(in, out, size);
      return count;
    case 8:
      count = bshuf_trans_byte_elem_SSE_64(in, out, size);
      return count;
  }

  // Worst case: odd number of bytes. Turns out that this is faster for
  // (odd * 2) byte elements as well (hence % 4).
  if (elem_size % 4) {
    count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
    return count;
  }

  // Multiple of a power of 2: transpose hierarchically.
  {
    size_t nchunk_elem;
    void* tmp_buf = malloc(size * elem_size);
    if (tmp_buf == NULL) return -1;

    if ((elem_size % 8) == 0) {
      nchunk_elem = elem_size / 8;
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t);
      count = bshuf_trans_byte_elem_SSE_64(out, tmp_buf,
                                           size * nchunk_elem);
      bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size);
    } else if ((elem_size % 4) == 0) {
      nchunk_elem = elem_size / 4;
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t);
      count = bshuf_trans_byte_elem_SSE_32(out, tmp_buf,
                                           size * nchunk_elem);
      bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
    } else {
      // Not used since the scalar algorithm is faster.
      nchunk_elem = elem_size / 2;
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
      count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf,
                                           size * nchunk_elem);
      bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size);
    }

    free(tmp_buf);
    return count;
  }
}
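
/* A worked example of the hierarchical path (values illustrative, assuming
 * TRANS_ELEM_TYPE and bshuf_trans_elem behave as in bitshuffle-generic.h):
 * for elem_size == 24, nchunk_elem == 3 and each element is viewed as three
 * int64_t words. The words are first regrouped by position, the 8-byte SSE
 * kernel then transposes bytes within every word, and bshuf_trans_elem
 * finally gathers the results back into element order. */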


/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size,
                                 const size_t elem_size) {

  size_t ii, kk;
  const char* in_b = (const char*) in;
  char* out_b = (char*) out;
  uint16_t* out_ui16;

  int64_t count;

  size_t nbyte = elem_size * size;

  CHECK_MULT_EIGHT(nbyte);

  __m128i xmm;
  int32_t bt;

  for (ii = 0; ii + 15 < nbyte; ii += 16) {
    xmm = _mm_loadu_si128((__m128i *) &in_b[ii]);
    for (kk = 0; kk < 8; kk++) {
      bt = _mm_movemask_epi8(xmm);
      xmm = _mm_slli_epi16(xmm, 1);
      out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
      *out_ui16 = (uint16_t) bt;
    }
  }
  count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
                                         nbyte - nbyte % 16);
  return count;
}
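
/* How the inner loop works: _mm_movemask_epi8 packs the most significant
 * bit of each of the 16 bytes into a 16-bit mask, so iteration kk extracts
 * bit plane (7 - kk) and writes it to the corresponding output row. SSE2
 * has no 8-bit shift, but _mm_slli_epi16 is safe here: a bit carried across
 * a byte boundary lands in bit 0 of the upper byte and cannot climb back up
 * to a byte's MSB within the remaining iterations. */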


/* Transpose bits within elements. */
int64_t bshuf_trans_bit_elem_SSE(const void* in, void* out, const size_t size,
                                 const size_t elem_size) {

  int64_t count;

  CHECK_MULT_EIGHT(size);

  void* tmp_buf = malloc(size * elem_size);
  if (tmp_buf == NULL) return -1;

  count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
  CHECK_ERR_FREE(count, tmp_buf);
  count = bshuf_trans_bit_byte_SSE(out, tmp_buf, size, elem_size);
  CHECK_ERR_FREE(count, tmp_buf);
  count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);

  free(tmp_buf);

  return count;
}
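
/* The full bit transpose is composed of three cheaper passes: group bytes
 * by their position within an element, split every byte row into its eight
 * bit planes, and let bshuf_trans_bitrow_eight regroup the bit rows into
 * the final output order. */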


/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes. */
int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size,
                                    const size_t elem_size) {

  size_t ii, jj;
  const char* in_b = (const char*) in;
  char* out_b = (char*) out;

  CHECK_MULT_EIGHT(size);

  size_t nrows = 8 * elem_size;
  size_t nbyte_row = size / 8;

  __m128i a0, b0, c0, d0, e0, f0, g0, h0;
  __m128i a1, b1, c1, d1, e1, f1, g1, h1;
  __m128 *as, *bs, *cs, *ds, *es, *fs, *gs, *hs;

  for (ii = 0; ii + 7 < nrows; ii += 8) {
    for (jj = 0; jj + 15 < nbyte_row; jj += 16) {
      a0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 0)*nbyte_row + jj]);
      b0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 1)*nbyte_row + jj]);
      c0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 2)*nbyte_row + jj]);
      d0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 3)*nbyte_row + jj]);
      e0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 4)*nbyte_row + jj]);
      f0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 5)*nbyte_row + jj]);
      g0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 6)*nbyte_row + jj]);
      h0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 7)*nbyte_row + jj]);

      a1 = _mm_unpacklo_epi8(a0, b0);
      b1 = _mm_unpacklo_epi8(c0, d0);
      c1 = _mm_unpacklo_epi8(e0, f0);
      d1 = _mm_unpacklo_epi8(g0, h0);
      e1 = _mm_unpackhi_epi8(a0, b0);
      f1 = _mm_unpackhi_epi8(c0, d0);
      g1 = _mm_unpackhi_epi8(e0, f0);
      h1 = _mm_unpackhi_epi8(g0, h0);

      a0 = _mm_unpacklo_epi16(a1, b1);
      b0 = _mm_unpacklo_epi16(c1, d1);
      c0 = _mm_unpackhi_epi16(a1, b1);
      d0 = _mm_unpackhi_epi16(c1, d1);

      e0 = _mm_unpacklo_epi16(e1, f1);
      f0 = _mm_unpacklo_epi16(g1, h1);
      g0 = _mm_unpackhi_epi16(e1, f1);
      h0 = _mm_unpackhi_epi16(g1, h1);

      a1 = _mm_unpacklo_epi32(a0, b0);
      b1 = _mm_unpackhi_epi32(a0, b0);

      c1 = _mm_unpacklo_epi32(c0, d0);
      d1 = _mm_unpackhi_epi32(c0, d0);

      e1 = _mm_unpacklo_epi32(e0, f0);
      f1 = _mm_unpackhi_epi32(e0, f0);

      g1 = _mm_unpacklo_epi32(g0, h0);
      h1 = _mm_unpackhi_epi32(g0, h0);

      // SSE2 has no integer store for the high 64 bits of a register, so
      // reinterpret as float and use _mm_storeh_pi; for the low half an
      // integer store does exist (_mm_storel_epi64).
      as = (__m128 *) &a1;
      bs = (__m128 *) &b1;
      cs = (__m128 *) &c1;
      ds = (__m128 *) &d1;
      es = (__m128 *) &e1;
      fs = (__m128 *) &f1;
      gs = (__m128 *) &g1;
      hs = (__m128 *) &h1;

      _mm_storel_pi((__m64 *) &out_b[(jj + 0) * nrows + ii], *as);
      _mm_storel_pi((__m64 *) &out_b[(jj + 2) * nrows + ii], *bs);
      _mm_storel_pi((__m64 *) &out_b[(jj + 4) * nrows + ii], *cs);
      _mm_storel_pi((__m64 *) &out_b[(jj + 6) * nrows + ii], *ds);
      _mm_storel_pi((__m64 *) &out_b[(jj + 8) * nrows + ii], *es);
      _mm_storel_pi((__m64 *) &out_b[(jj + 10) * nrows + ii], *fs);
      _mm_storel_pi((__m64 *) &out_b[(jj + 12) * nrows + ii], *gs);
      _mm_storel_pi((__m64 *) &out_b[(jj + 14) * nrows + ii], *hs);

      _mm_storeh_pi((__m64 *) &out_b[(jj + 1) * nrows + ii], *as);
      _mm_storeh_pi((__m64 *) &out_b[(jj + 3) * nrows + ii], *bs);
      _mm_storeh_pi((__m64 *) &out_b[(jj + 5) * nrows + ii], *cs);
      _mm_storeh_pi((__m64 *) &out_b[(jj + 7) * nrows + ii], *ds);
      _mm_storeh_pi((__m64 *) &out_b[(jj + 9) * nrows + ii], *es);
      _mm_storeh_pi((__m64 *) &out_b[(jj + 11) * nrows + ii], *fs);
      _mm_storeh_pi((__m64 *) &out_b[(jj + 13) * nrows + ii], *gs);
      _mm_storeh_pi((__m64 *) &out_b[(jj + 15) * nrows + ii], *hs);
    }
    for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj++) {
      out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj];
      out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj];
      out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj];
      out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj];
      out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj];
      out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj];
      out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj];
      out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj];
    }
  }
  return size * elem_size;
}
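
/* Note: each inner iteration transposes one 8-row by 16-column byte tile;
 * the epi8/epi16/epi32 interleave cascade produces sixteen 8-byte groups,
 * one per output column, which the storel/storeh pairs scatter into rows of
 * the output. The trailing scalar loop covers the last nbyte_row % 16
 * columns. */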


/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t size,
                                        const size_t elem_size) {

  CHECK_MULT_EIGHT(size);

  // With a bit of care, this could be written such that it is
  // in_buf == out_buf safe.
  const char* in_b = (const char*) in;
  uint16_t* out_ui16 = (uint16_t*) out;

  size_t ii, jj, kk;
  size_t nbyte = elem_size * size;

  __m128i xmm;
  int32_t bt;

  if (elem_size % 2) {
    bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
  } else {
    for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
         ii += 8 * elem_size) {
      for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
        xmm = _mm_loadu_si128((__m128i *) &in_b[ii + jj]);
        for (kk = 0; kk < 8; kk++) {
          bt = _mm_movemask_epi8(xmm);
          xmm = _mm_slli_epi16(xmm, 1);
          size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
          out_ui16[ind / 2] = (uint16_t) bt;
        }
      }
    }
  }
  return size * elem_size;
}
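
/* This reuses the movemask bit-plane trick from bshuf_trans_bit_byte_SSE,
 * applied independently within each block of eight elements. The scalar
 * fallback for odd elem_size avoids 16-bit stores at odd byte offsets: ind
 * is only guaranteed to be even when elem_size is even. */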


/* Untranspose bits within elements. */
int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
                                   const size_t elem_size) {

  int64_t count;

  CHECK_MULT_EIGHT(size);

  void* tmp_buf = malloc(size * elem_size);
  if (tmp_buf == NULL) return -1;

  count = bshuf_trans_byte_bitrow_SSE(in, tmp_buf, size, elem_size);
  CHECK_ERR_FREE(count, tmp_buf);
  count = bshuf_shuffle_bit_eightelem_SSE(tmp_buf, out, size, elem_size);

  free(tmp_buf);

  return count;
}
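
/* A minimal round-trip sketch, kept under #if 0 like the debug helper above
 * (illustrative and not compiled; the buffer length and element type are
 * assumptions). Note that size counts elements, not bytes, and must be a
 * multiple of 8. */
#if 0
static void example_roundtrip(void)
{
  float data[64], shuffled[64], restored[64];
  int i;

  for (i = 0; i < 64; i++)
    data[i] = (float) i;

  /* Forward bitshuffle: gathers bits of equal significance together. */
  bshuf_trans_bit_elem_SSE(data, shuffled, 64, sizeof(float));
  /* Inverse bitshuffle: restores the original element layout. */
  bshuf_untrans_bit_elem_SSE(shuffled, restored, 64, sizeof(float));
}
#endif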

const bool is_bshuf_SSE = true;

#else /* defined(__SSE2__) */

const bool is_bshuf_SSE = false;

int64_t
bshuf_trans_bit_elem_SSE(const void* in, void* out, const size_t size,
                         const size_t elem_size) {
  abort();
}

int64_t
bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
                           const size_t elem_size) {
  abort();
}

#endif /* defined(__SSE2__) */