/src/c-blosc/blosc/shuffle-sse2.c
Line | Count | Source |
1 | | /********************************************************************* |
2 | | Blosc - Blocked Shuffling and Compression Library |
3 | | |
4 | | Author: Francesc Alted <francesc@blosc.org> |
5 | | |
6 | | See LICENSE.txt for details about copyright and rights to use. |
7 | | **********************************************************************/ |
8 | | |
9 | | #include "shuffle-generic.h" |
10 | | #include "shuffle-sse2.h" |
11 | | |
12 | | /* Define dummy functions if SSE2 is not available for the compilation target and compiler. */ |
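/* These stubs only keep the symbols defined; the runtime dispatcher is
   expected never to select the SSE2 implementation when it was not compiled
   in, so reaching either of them aborts. */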
13 | | #if !defined(__SSE2__) |
14 | | |
15 | | void |
16 | | blosc_internal_shuffle_sse2(const size_t bytesoftype, const size_t blocksize, |
17 | | const uint8_t* const _src, uint8_t* const _dest) { |
18 | | abort(); |
19 | | } |
20 | | |
21 | | void |
22 | | blosc_internal_unshuffle_sse2(const size_t bytesoftype, const size_t blocksize, |
23 | | const uint8_t* const _src, uint8_t* const _dest) { |
24 | | abort(); |
25 | | } |
26 | | |
27 | | # else /* defined(__SSE2__) */ |
28 | | |
29 | | #include <emmintrin.h> |
30 | | |
31 | | |
32 | | /* The following is useful for debugging purposes */ |
33 | | #if 0 |
34 | | #include <stdio.h> |
35 | | #include <string.h> |
36 | | |
37 | | static void printxmm(__m128i xmm0) |
38 | | { |
39 | | uint8_t buf[16]; |
40 | | |
41 | | ((__m128i *)buf)[0] = xmm0; |
42 | | printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n", |
43 | | buf[0], buf[1], buf[2], buf[3], |
44 | | buf[4], buf[5], buf[6], buf[7], |
45 | | buf[8], buf[9], buf[10], buf[11], |
46 | | buf[12], buf[13], buf[14], buf[15]); |
47 | | } |
48 | | #endif |
49 | | |
50 | | |
51 | | /* Routine optimized for shuffling a buffer for a type size of 2 bytes. */ |
52 | | static void |
53 | | shuffle2_sse2(uint8_t* const dest, const uint8_t* const src, |
54 | | const size_t vectorizable_elements, const size_t total_elements) |
55 | 0 | { |
56 | 0 | static const size_t bytesoftype = 2; |
57 | 0 | size_t j; |
58 | 0 | int k; |
59 | 0 | uint8_t* dest_for_jth_element; |
60 | 0 | __m128i xmm0[2], xmm1[2]; |
61 | |
62 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { |
63 | | /* Fetch 16 elements (32 bytes) then transpose bytes, words and double words. */ |
64 | 0 | for (k = 0; k < 2; k++) { |
65 | 0 | xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i)))); |
66 | 0 | xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8); |
67 | 0 | xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8); |
68 | 0 | xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); |
69 | 0 | xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); |
70 | 0 | xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]); |
71 | 0 | xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); |
72 | 0 | xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); |
73 | 0 | xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]); |
74 | 0 | xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); |
75 | 0 | } |
76 | | /* Transpose quad words */ |
77 | 0 | for (k = 0; k < 1; k++) { |
78 | 0 | xmm1[k*2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k+1]); |
79 | 0 | xmm1[k*2+1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k+1]); |
80 | 0 | } |
81 | | /* Store the result vectors */ |
82 | 0 | dest_for_jth_element = dest + j; |
83 | 0 | for (k = 0; k < 2; k++) { |
84 | 0 | _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm1[k]); |
85 | 0 | } |
86 | 0 | } |
87 | 0 | } |
88 | | |
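#if 0
/* Purely illustrative (not part of the library): a scalar sketch of the layout
   that the SSE2 shuffle kernels in this file produce -- byte b of element e in
   src ends up at dest[b * total_elements + e].  The kernels only fill the first
   vectorizable_elements columns; the generic routine finishes the tail. */
static void shuffle_scalar_sketch(uint8_t* const dest, const uint8_t* const src,
                                  const size_t nelems, const size_t bytesoftype)
{
  size_t e, b;
  for (e = 0; e < nelems; e++) {
    for (b = 0; b < bytesoftype; b++) {
      dest[b * nelems + e] = src[e * bytesoftype + b];
    }
  }
}
#endif
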
89 | | /* Routine optimized for shuffling a buffer for a type size of 4 bytes. */ |
90 | | static void |
91 | | shuffle4_sse2(uint8_t* const dest, const uint8_t* const src, |
92 | | const size_t vectorizable_elements, const size_t total_elements) |
93 | 0 | { |
94 | 0 | static const size_t bytesoftype = 4; |
95 | 0 | size_t i; |
96 | 0 | int j; |
97 | 0 | uint8_t* dest_for_ith_element; |
98 | 0 | __m128i xmm0[4], xmm1[4]; |
99 | |
100 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
101 | | /* Fetch 16 elements (64 bytes) then transpose bytes and words. */ |
102 | 0 | for (j = 0; j < 4; j++) { |
103 | 0 | xmm0[j] = _mm_loadu_si128((__m128i*)(src + (i * bytesoftype) + (j * sizeof(__m128i)))); |
104 | 0 | xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0xd8); |
105 | 0 | xmm0[j] = _mm_shuffle_epi32(xmm0[j], 0x8d); |
106 | 0 | xmm0[j] = _mm_unpacklo_epi8(xmm1[j], xmm0[j]); |
107 | 0 | xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0x04e); |
108 | 0 | xmm0[j] = _mm_unpacklo_epi16(xmm0[j], xmm1[j]); |
109 | 0 | } |
110 | | /* Transpose double words */ |
111 | 0 | for (j = 0; j < 2; j++) { |
112 | 0 | xmm1[j*2] = _mm_unpacklo_epi32(xmm0[j*2], xmm0[j*2+1]); |
113 | 0 | xmm1[j*2+1] = _mm_unpackhi_epi32(xmm0[j*2], xmm0[j*2+1]); |
114 | 0 | } |
115 | | /* Transpose quad words */ |
116 | 0 | for (j = 0; j < 2; j++) { |
117 | 0 | xmm0[j*2] = _mm_unpacklo_epi64(xmm1[j], xmm1[j+2]); |
118 | 0 | xmm0[j*2+1] = _mm_unpackhi_epi64(xmm1[j], xmm1[j+2]); |
119 | 0 | } |
120 | | /* Store the result vectors */ |
121 | 0 | dest_for_ith_element = dest + i; |
122 | 0 | for (j = 0; j < 4; j++) { |
123 | 0 | _mm_storeu_si128((__m128i*)(dest_for_ith_element + (j * total_elements)), xmm0[j]); |
124 | 0 | } |
125 | 0 | } |
126 | 0 | } |
127 | | |
128 | | /* Routine optimized for shuffling a buffer for a type size of 8 bytes. */ |
129 | | static void |
130 | | shuffle8_sse2(uint8_t* const dest, const uint8_t* const src, |
131 | | const size_t vectorizable_elements, const size_t total_elements) |
132 | 0 | { |
133 | 0 | static const size_t bytesoftype = 8; |
134 | 0 | size_t j; |
135 | 0 | int k, l; |
136 | 0 | uint8_t* dest_for_jth_element; |
137 | 0 | __m128i xmm0[8], xmm1[8]; |
138 | |
139 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { |
140 | | /* Fetch 16 elements (128 bytes) then transpose bytes. */ |
141 | 0 | for (k = 0; k < 8; k++) { |
142 | 0 | xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i)))); |
143 | 0 | xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); |
144 | 0 | xmm1[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]); |
145 | 0 | } |
146 | | /* Transpose words */ |
147 | 0 | for (k = 0, l = 0; k < 4; k++, l +=2) { |
148 | 0 | xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+1]); |
149 | 0 | xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+1]); |
150 | 0 | } |
151 | | /* Transpose double words */ |
152 | 0 | for (k = 0, l = 0; k < 4; k++, l++) { |
153 | 0 | if (k == 2) l += 2; |
154 | 0 | xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+2]); |
155 | 0 | xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+2]); |
156 | 0 | } |
157 | | /* Transpose quad words */ |
158 | 0 | for (k = 0; k < 4; k++) { |
159 | 0 | xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+4]); |
160 | 0 | xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+4]); |
161 | 0 | } |
162 | | /* Store the result vectors */ |
163 | 0 | dest_for_jth_element = dest + j; |
164 | 0 | for (k = 0; k < 8; k++) { |
165 | 0 | _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]); |
166 | 0 | } |
167 | 0 | } |
168 | 0 | } |
169 | | |
170 | | /* Routine optimized for shuffling a buffer for a type size of 16 bytes. */ |
171 | | static void |
172 | | shuffle16_sse2(uint8_t* const dest, const uint8_t* const src, |
173 | | const size_t vectorizable_elements, const size_t total_elements) |
174 | 0 | { |
175 | 0 | static const size_t bytesoftype = 16; |
176 | 0 | size_t j; |
177 | 0 | int k, l; |
178 | 0 | uint8_t* dest_for_jth_element; |
179 | 0 | __m128i xmm0[16], xmm1[16]; |
180 | |
181 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { |
182 | | /* Fetch 16 elements (256 bytes). */ |
183 | 0 | for (k = 0; k < 16; k++) { |
184 | 0 | xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i)))); |
185 | 0 | } |
186 | | /* Transpose bytes */ |
187 | 0 | for (k = 0, l = 0; k < 8; k++, l +=2) { |
188 | 0 | xmm1[k*2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l+1]); |
189 | 0 | xmm1[k*2+1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l+1]); |
190 | 0 | } |
191 | | /* Transpose words */ |
192 | 0 | for (k = 0, l = -2; k < 8; k++, l++) { |
193 | 0 | if ((k%2) == 0) l += 2; |
194 | 0 | xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+2]); |
195 | 0 | xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+2]); |
196 | 0 | } |
197 | | /* Transpose double words */ |
198 | 0 | for (k = 0, l = -4; k < 8; k++, l++) { |
199 | 0 | if ((k%4) == 0) l += 4; |
200 | 0 | xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+4]); |
201 | 0 | xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+4]); |
202 | 0 | } |
203 | | /* Transpose quad words */ |
204 | 0 | for (k = 0; k < 8; k++) { |
205 | 0 | xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+8]); |
206 | 0 | xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+8]); |
207 | 0 | } |
208 | | /* Store the result vectors */ |
209 | 0 | dest_for_jth_element = dest + j; |
210 | 0 | for (k = 0; k < 16; k++) { |
211 | 0 | _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]); |
212 | 0 | } |
213 | 0 | } |
214 | 0 | } |
215 | | |
216 | | /* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */ |
217 | | static void |
218 | | shuffle16_tiled_sse2(uint8_t* const dest, const uint8_t* const src, |
219 | | const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype) |
220 | 0 | { |
221 | 0 | size_t j; |
222 | 0 | const size_t vecs_per_el_rem = bytesoftype % sizeof(__m128i); |
223 | 0 | int k, l; |
224 | 0 | uint8_t* dest_for_jth_element; |
225 | 0 | __m128i xmm0[16], xmm1[16]; |
226 | |
227 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { |
228 | | /* Advance the offset into the type by the vector size (in bytes), unless this is |
229 | | the initial iteration and the type size is not a multiple of the vector size. |
230 | | In that case, only advance by the number of bytes necessary so that the number |
231 | | of remaining bytes in the type will be a multiple of the vector size. */ |
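      /* For example, with bytesoftype == 20 the remainder is 4, so offset_into_type
         takes the values 0 and then 4; byte positions 4..15 of each element are
         simply processed (and stored) twice with identical data. */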
232 | 0 | size_t offset_into_type; |
233 | 0 | for (offset_into_type = 0; offset_into_type < bytesoftype; |
234 | 0 | offset_into_type += (offset_into_type == 0 && vecs_per_el_rem > 0 ? vecs_per_el_rem : sizeof(__m128i))) { |
235 | | |
236 | | /* Fetch elements in groups of 256 bytes */ |
237 | 0 | const uint8_t* const src_with_offset = src + offset_into_type; |
238 | 0 | for (k = 0; k < 16; k++) { |
239 | 0 | xmm0[k] = _mm_loadu_si128((__m128i*)(src_with_offset + (j + k) * bytesoftype)); |
240 | 0 | } |
241 | | /* Transpose bytes */ |
242 | 0 | for (k = 0, l = 0; k < 8; k++, l +=2) { |
243 | 0 | xmm1[k*2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l+1]); |
244 | 0 | xmm1[k*2+1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l+1]); |
245 | 0 | } |
246 | | /* Transpose words */ |
247 | 0 | for (k = 0, l = -2; k < 8; k++, l++) { |
248 | 0 | if ((k%2) == 0) l += 2; |
249 | 0 | xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+2]); |
250 | 0 | xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+2]); |
251 | 0 | } |
252 | | /* Transpose double words */ |
253 | 0 | for (k = 0, l = -4; k < 8; k++, l++) { |
254 | 0 | if ((k%4) == 0) l += 4; |
255 | 0 | xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+4]); |
256 | 0 | xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+4]); |
257 | 0 | } |
258 | | /* Transpose quad words */ |
259 | 0 | for (k = 0; k < 8; k++) { |
260 | 0 | xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+8]); |
261 | 0 | xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+8]); |
262 | 0 | } |
263 | | /* Store the result vectors */ |
264 | 0 | dest_for_jth_element = dest + j; |
265 | 0 | for (k = 0; k < 16; k++) { |
266 | 0 | _mm_storeu_si128((__m128i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), xmm0[k]); |
267 | 0 | } |
268 | 0 | } |
269 | 0 | } |
270 | 0 | } |
271 | | |
272 | | /* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */ |
273 | | static void |
274 | | unshuffle2_sse2(uint8_t* const dest, const uint8_t* const src, |
275 | | const size_t vectorizable_elements, const size_t total_elements) |
276 | 0 | { |
277 | 0 | static const size_t bytesoftype = 2; |
278 | 0 | size_t i; |
279 | 0 | int j; |
280 | 0 | __m128i xmm0[2], xmm1[2]; |
281 | |
282 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
283 | | /* Load 16 elements (32 bytes) into 2 XMM registers. */ |
284 | 0 | const uint8_t* const src_for_ith_element = src + i; |
285 | 0 | for (j = 0; j < 2; j++) { |
286 | 0 | xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); |
287 | 0 | } |
288 | | /* Shuffle bytes */ |
289 | | /* Compute the low 32 bytes */ |
290 | 0 | xmm1[0] = _mm_unpacklo_epi8(xmm0[0], xmm0[1]); |
291 | | /* Compute the hi 32 bytes */ |
292 | 0 | xmm1[1] = _mm_unpackhi_epi8(xmm0[0], xmm0[1]); |
293 | | /* Store the result vectors in proper order */ |
294 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]); |
295 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[1]); |
296 | 0 | } |
297 | 0 | } |
298 | | |
299 | | /* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */ |
300 | | static void |
301 | | unshuffle4_sse2(uint8_t* const dest, const uint8_t* const src, |
302 | | const size_t vectorizable_elements, const size_t total_elements) |
303 | 0 | { |
304 | 0 | static const size_t bytesoftype = 4; |
305 | 0 | size_t i; |
306 | 0 | int j; |
307 | 0 | __m128i xmm0[4], xmm1[4]; |
308 | |
309 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
310 | | /* Load 16 elements (64 bytes) into 4 XMM registers. */ |
311 | 0 | const uint8_t* const src_for_ith_element = src + i; |
312 | 0 | for (j = 0; j < 4; j++) { |
313 | 0 | xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); |
314 | 0 | } |
315 | | /* Shuffle bytes */ |
316 | 0 | for (j = 0; j < 2; j++) { |
317 | | /* Compute the low 32 bytes */ |
318 | 0 | xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]); |
319 | | /* Compute the hi 32 bytes */ |
320 | 0 | xmm1[2+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]); |
321 | 0 | } |
322 | | /* Shuffle 2-byte words */ |
323 | 0 | for (j = 0; j < 2; j++) { |
324 | | /* Compute the low 32 bytes */ |
325 | 0 | xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]); |
326 | | /* Compute the hi 32 bytes */ |
327 | 0 | xmm0[2+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]); |
328 | 0 | } |
329 | | /* Store the result vectors in proper order */ |
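    /* Note: each round of low/high unpacking leaves the result registers in
       bit-reversed order (here 0, 2, 1, 3), hence the reordered stores below.
       The 8- and 16-byte kernels further down follow the same pattern. */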
330 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm0[0]); |
331 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm0[2]); |
332 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm0[1]); |
333 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm0[3]); |
334 | 0 | } |
335 | 0 | } |
336 | | |
337 | | /* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */ |
338 | | static void |
339 | | unshuffle8_sse2(uint8_t* const dest, const uint8_t* const src, |
340 | | const size_t vectorizable_elements, const size_t total_elements) |
341 | 0 | { |
342 | 0 | static const size_t bytesoftype = 8; |
343 | 0 | size_t i; |
344 | 0 | int j; |
345 | 0 | __m128i xmm0[8], xmm1[8]; |
346 | |
347 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
348 | | /* Load 16 elements (128 bytes) into 8 XMM registers. */ |
349 | 0 | const uint8_t* const src_for_ith_element = src + i; |
350 | 0 | for (j = 0; j < 8; j++) { |
351 | 0 | xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); |
352 | 0 | } |
353 | | /* Shuffle bytes */ |
354 | 0 | for (j = 0; j < 4; j++) { |
355 | | /* Compute the low 32 bytes */ |
356 | 0 | xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]); |
357 | | /* Compute the hi 32 bytes */ |
358 | 0 | xmm1[4+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]); |
359 | 0 | } |
360 | | /* Shuffle 2-byte words */ |
361 | 0 | for (j = 0; j < 4; j++) { |
362 | | /* Compute the low 32 bytes */ |
363 | 0 | xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]); |
364 | | /* Compute the hi 32 bytes */ |
365 | 0 | xmm0[4+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]); |
366 | 0 | } |
367 | | /* Shuffle 4-byte dwords */ |
368 | 0 | for (j = 0; j < 4; j++) { |
369 | | /* Compute the low 32 bytes */ |
370 | 0 | xmm1[j] = _mm_unpacklo_epi32(xmm0[j*2], xmm0[j*2+1]); |
371 | | /* Compute the hi 32 bytes */ |
372 | 0 | xmm1[4+j] = _mm_unpackhi_epi32(xmm0[j*2], xmm0[j*2+1]); |
373 | 0 | } |
374 | | /* Store the result vectors in proper order */ |
375 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]); |
376 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[4]); |
377 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[2]); |
378 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[6]); |
379 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), xmm1[1]); |
380 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[5]); |
381 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[3]); |
382 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[7]); |
383 | 0 | } |
384 | 0 | } |
385 | | |
386 | | /* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */ |
387 | | static void |
388 | | unshuffle16_sse2(uint8_t* const dest, const uint8_t* const src, |
389 | | const size_t vectorizable_elements, const size_t total_elements) |
390 | 0 | { |
391 | 0 | static const size_t bytesoftype = 16; |
392 | 0 | size_t i; |
393 | 0 | int j; |
394 | 0 | __m128i xmm1[16], xmm2[16]; |
395 | |
396 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
397 | | /* Load 16 elements (256 bytes) into 16 XMM registers. */ |
398 | 0 | const uint8_t* const src_for_ith_element = src + i; |
399 | 0 | for (j = 0; j < 16; j++) { |
400 | 0 | xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); |
401 | 0 | } |
402 | | /* Shuffle bytes */ |
403 | 0 | for (j = 0; j < 8; j++) { |
404 | | /* Compute the low 32 bytes */ |
405 | 0 | xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]); |
406 | | /* Compute the hi 32 bytes */ |
407 | 0 | xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]); |
408 | 0 | } |
409 | | /* Shuffle 2-byte words */ |
410 | 0 | for (j = 0; j < 8; j++) { |
411 | | /* Compute the low 32 bytes */ |
412 | 0 | xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]); |
413 | | /* Compute the hi 32 bytes */ |
414 | 0 | xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]); |
415 | 0 | } |
416 | | /* Shuffle 4-byte dwords */ |
417 | 0 | for (j = 0; j < 8; j++) { |
418 | | /* Compute the low 32 bytes */ |
419 | 0 | xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]); |
420 | | /* Compute the hi 32 bytes */ |
421 | 0 | xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]); |
422 | 0 | } |
423 | | /* Shuffle 8-byte qwords */ |
424 | 0 | for (j = 0; j < 8; j++) { |
425 | | /* Compute the low 32 bytes */ |
426 | 0 | xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]); |
427 | | /* Compute the hi 32 bytes */ |
428 | 0 | xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]); |
429 | 0 | } |
430 | | |
431 | | /* Store the result vectors in proper order */ |
432 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]); |
433 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[8]); |
434 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[4]); |
435 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[12]); |
436 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), xmm1[2]); |
437 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[10]); |
438 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[6]); |
439 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[14]); |
440 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (8 * sizeof(__m128i))), xmm1[1]); |
441 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (9 * sizeof(__m128i))), xmm1[9]); |
442 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (10 * sizeof(__m128i))), xmm1[5]); |
443 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (11 * sizeof(__m128i))), xmm1[13]); |
444 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (12 * sizeof(__m128i))), xmm1[3]); |
445 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (13 * sizeof(__m128i))), xmm1[11]); |
446 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (14 * sizeof(__m128i))), xmm1[7]); |
447 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (15 * sizeof(__m128i))), xmm1[15]); |
448 | 0 | } |
449 | 0 | } |
450 | | |
451 | | /* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */ |
452 | | static void |
453 | | unshuffle16_tiled_sse2(uint8_t* const dest, const uint8_t* const orig, |
454 | | const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype) |
455 | 0 | { |
456 | 0 | size_t i; |
457 | 0 | const size_t vecs_per_el_rem = bytesoftype % sizeof(__m128i); |
458 | |
459 | 0 | int j; |
460 | 0 | uint8_t* dest_with_offset; |
461 | 0 | __m128i xmm1[16], xmm2[16]; |
462 | | |
463 | | /* The unshuffle loops are inverted (compared to shuffle16_tiled_sse2) |
464 | | to optimize cache utilization. */ |
465 | 0 | size_t offset_into_type; |
466 | 0 | for (offset_into_type = 0; offset_into_type < bytesoftype; |
467 | 0 | offset_into_type += (offset_into_type == 0 && vecs_per_el_rem > 0 ? vecs_per_el_rem : sizeof(__m128i))) { |
468 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
469 | | /* Load 256 bytes (16 vectors) into 16 XMM registers */ |
470 | 0 | const uint8_t* const src_for_ith_element = orig + i; |
471 | 0 | for (j = 0; j < 16; j++) { |
472 | 0 | xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (total_elements * (offset_into_type + j)))); |
473 | 0 | } |
474 | | /* Shuffle bytes */ |
475 | 0 | for (j = 0; j < 8; j++) { |
476 | | /* Compute the low 32 bytes */ |
477 | 0 | xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]); |
478 | | /* Compute the hi 32 bytes */ |
479 | 0 | xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]); |
480 | 0 | } |
481 | | /* Shuffle 2-byte words */ |
482 | 0 | for (j = 0; j < 8; j++) { |
483 | | /* Compute the low 32 bytes */ |
484 | 0 | xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]); |
485 | | /* Compute the hi 32 bytes */ |
486 | 0 | xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]); |
487 | 0 | } |
488 | | /* Shuffle 4-byte dwords */ |
489 | 0 | for (j = 0; j < 8; j++) { |
490 | | /* Compute the low 32 bytes */ |
491 | 0 | xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]); |
492 | | /* Compute the hi 32 bytes */ |
493 | 0 | xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]); |
494 | 0 | } |
495 | | /* Shuffle 8-byte qwords */ |
496 | 0 | for (j = 0; j < 8; j++) { |
497 | | /* Compute the low 32 bytes */ |
498 | 0 | xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]); |
499 | | /* Compute the hi 32 bytes */ |
500 | 0 | xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]); |
501 | 0 | } |
502 | | |
503 | | /* Store the result vectors in proper order */ |
504 | 0 | dest_with_offset = dest + offset_into_type; |
505 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 0) * bytesoftype), xmm1[0]); |
506 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 1) * bytesoftype), xmm1[8]); |
507 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 2) * bytesoftype), xmm1[4]); |
508 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 3) * bytesoftype), xmm1[12]); |
509 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 4) * bytesoftype), xmm1[2]); |
510 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 5) * bytesoftype), xmm1[10]); |
511 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 6) * bytesoftype), xmm1[6]); |
512 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 7) * bytesoftype), xmm1[14]); |
513 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 8) * bytesoftype), xmm1[1]); |
514 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 9) * bytesoftype), xmm1[9]); |
515 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 10) * bytesoftype), xmm1[5]); |
516 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 11) * bytesoftype), xmm1[13]); |
517 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 12) * bytesoftype), xmm1[3]); |
518 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 13) * bytesoftype), xmm1[11]); |
519 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 14) * bytesoftype), xmm1[7]); |
520 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 15) * bytesoftype), xmm1[15]); |
521 | 0 | } |
522 | 0 | } |
523 | 0 | } |
524 | | |
525 | | /* Shuffle a block. This can never fail. */ |
526 | | void |
527 | | blosc_internal_shuffle_sse2(const size_t bytesoftype, const size_t blocksize, |
528 | 0 | const uint8_t* const _src, uint8_t* const _dest) { |
529 | 0 | const size_t vectorized_chunk_size = bytesoftype * sizeof(__m128i); |
530 | | /* If the blocksize is not a multiple of both the typesize and |
531 | | the vector size, round the blocksize down to the next value |
532 | | which is a multiple of both. The vectorized shuffle can be |
533 | | used for that portion of the data, and the naive implementation |
534 | | can be used for the remaining portion. */ |
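  /* For example, with bytesoftype == 4 the vectorized chunk is 64 bytes, so a
     100-byte block splits into 64 vectorizable bytes (16 elements) plus a
     36-byte tail that shuffle_generic_inline() handles at the end. */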
535 | 0 | const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size); |
536 | 0 | const size_t vectorizable_elements = vectorizable_bytes / bytesoftype; |
537 | 0 | const size_t total_elements = blocksize / bytesoftype; |
538 | | |
539 | | /* If the block size is too small to be vectorized, |
540 | | use the generic implementation. */ |
541 | 0 | if (blocksize < vectorized_chunk_size) { |
542 | 0 | blosc_internal_shuffle_generic(bytesoftype, blocksize, _src, _dest); |
543 | 0 | return; |
544 | 0 | } |
545 | | |
546 | | /* Optimized shuffle implementations */ |
547 | 0 | switch (bytesoftype) |
548 | 0 | { |
549 | 0 | case 2: |
550 | 0 | shuffle2_sse2(_dest, _src, vectorizable_elements, total_elements); |
551 | 0 | break; |
552 | 0 | case 4: |
553 | 0 | shuffle4_sse2(_dest, _src, vectorizable_elements, total_elements); |
554 | 0 | break; |
555 | 0 | case 8: |
556 | 0 | shuffle8_sse2(_dest, _src, vectorizable_elements, total_elements); |
557 | 0 | break; |
558 | 0 | case 16: |
559 | 0 | shuffle16_sse2(_dest, _src, vectorizable_elements, total_elements); |
560 | 0 | break; |
561 | 0 | default: |
562 | 0 | if (bytesoftype > sizeof(__m128i)) { |
563 | 0 | shuffle16_tiled_sse2(_dest, _src, vectorizable_elements, total_elements, bytesoftype); |
564 | 0 | } |
565 | 0 | else { |
566 | | /* Non-optimized shuffle */ |
567 | 0 | blosc_internal_shuffle_generic(bytesoftype, blocksize, _src, _dest); |
568 | | /* The non-optimized function covers the whole buffer, |
569 | | so we're done processing here. */ |
570 | 0 | return; |
571 | 0 | } |
572 | 0 | } |
573 | | |
574 | | /* If the buffer had any bytes at the end which couldn't be handled |
575 | | by the vectorized implementations, use the non-optimized version |
576 | | to finish them up. */ |
577 | 0 | if (vectorizable_bytes < blocksize) { |
578 | 0 | shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest); |
579 | 0 | } |
580 | 0 | } |
581 | | |
582 | | /* Unshuffle a block. This can never fail. */ |
583 | | void |
584 | | blosc_internal_unshuffle_sse2(const size_t bytesoftype, const size_t blocksize, |
585 | 0 | const uint8_t* const _src, uint8_t* const _dest) { |
586 | 0 | const size_t vectorized_chunk_size = bytesoftype * sizeof(__m128i); |
587 | | /* If the blocksize is not a multiple of both the typesize and |
588 | | the vector size, round the blocksize down to the next value |
589 | | which is a multiple of both. The vectorized unshuffle can be |
590 | | used for that portion of the data, and the naive implementation |
591 | | can be used for the remaining portion. */ |
592 | 0 | const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size); |
593 | 0 | const size_t vectorizable_elements = vectorizable_bytes / bytesoftype; |
594 | 0 | const size_t total_elements = blocksize / bytesoftype; |
595 | | |
596 | | |
597 | | /* If the block size is too small to be vectorized, |
598 | | use the generic implementation. */ |
599 | 0 | if (blocksize < vectorized_chunk_size) { |
600 | 0 | blosc_internal_unshuffle_generic(bytesoftype, blocksize, _src, _dest); |
601 | 0 | return; |
602 | 0 | } |
603 | | |
604 | | /* Optimized unshuffle implementations */ |
605 | 0 | switch (bytesoftype) |
606 | 0 | { |
607 | 0 | case 2: |
608 | 0 | unshuffle2_sse2(_dest, _src, vectorizable_elements, total_elements); |
609 | 0 | break; |
610 | 0 | case 4: |
611 | 0 | unshuffle4_sse2(_dest, _src, vectorizable_elements, total_elements); |
612 | 0 | break; |
613 | 0 | case 8: |
614 | 0 | unshuffle8_sse2(_dest, _src, vectorizable_elements, total_elements); |
615 | 0 | break; |
616 | 0 | case 16: |
617 | 0 | unshuffle16_sse2(_dest, _src, vectorizable_elements, total_elements); |
618 | 0 | break; |
619 | 0 | default: |
620 | 0 | if (bytesoftype > sizeof(__m128i)) { |
621 | 0 | unshuffle16_tiled_sse2(_dest, _src, vectorizable_elements, total_elements, bytesoftype); |
622 | 0 | } |
623 | 0 | else { |
624 | | /* Non-optimized unshuffle */ |
625 | 0 | blosc_internal_unshuffle_generic(bytesoftype, blocksize, _src, _dest); |
626 | | /* The non-optimized function covers the whole buffer, |
627 | | so we're done processing here. */ |
628 | 0 | return; |
629 | 0 | } |
630 | 0 | } |
631 | | |
632 | | /* If the buffer had any bytes at the end which couldn't be handled |
633 | | by the vectorized implementations, use the non-optimized version |
634 | | to finish them up. */ |
635 | 0 | if (vectorizable_bytes < blocksize) { |
636 | 0 | unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest); |
637 | 0 | } |
638 | 0 | } |
639 | | |
640 | | #endif /* !defined(__SSE2__) */ |
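
A minimal round-trip sketch (not part of the library) showing how the two entry points above might be exercised; it assumes SSE2 is enabled at compile time and that shuffle-generic.c is linked in for the scalar tail handling:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "shuffle-sse2.h"   /* declares the two entry points above */

int main(void) {
  enum { TYPESIZE = 4, BLOCKSIZE = 256 };          /* 64 elements of 4 bytes */
  uint8_t src[BLOCKSIZE], shuffled[BLOCKSIZE], restored[BLOCKSIZE];
  size_t i;

  for (i = 0; i < BLOCKSIZE; i++)
    src[i] = (uint8_t)(i * 7);                     /* arbitrary test pattern */

  blosc_internal_shuffle_sse2(TYPESIZE, BLOCKSIZE, src, shuffled);
  blosc_internal_unshuffle_sse2(TYPESIZE, BLOCKSIZE, shuffled, restored);

  printf("round trip %s\n",
         memcmp(src, restored, BLOCKSIZE) == 0 ? "ok" : "FAILED");
  return 0;
}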