/src/c-blosc2/blosc/shuffle-sse2.c
Line | Count | Source |
1 | | /********************************************************************* |
2 | | Blosc - Blocked Shuffling and Compression Library |
3 | | |
4 | | Copyright (c) 2021 Blosc Development Team <blosc@blosc.org> |
5 | | https://blosc.org |
6 | | License: BSD 3-Clause (see LICENSE.txt) |
7 | | |
8 | | See LICENSE.txt for details about copyright and rights to use. |
9 | | **********************************************************************/ |
10 | | |
11 | | #include "shuffle-sse2.h" |
12 | | #include "shuffle-generic.h" |
13 | | #include <stdlib.h> |
14 | | |
15 | | /* Make sure SSE2 is available for the compilation target and compiler. */ |
16 | | #if defined(__SSE2__) |
17 | | |
18 | | #include <emmintrin.h> |
19 | | |
20 | | #include <stdint.h> |
21 | | |
22 | | /* The following is useful for debugging purposes */ |
23 | | #if 0 |
24 | | #include <stdio.h> |
25 | | #include <string.h> |
26 | | |
27 | | static void printxmm(__m128i xmm0) |
28 | | { |
29 | | uint8_t buf[16]; |
30 | | |
31 | | ((__m128i *)buf)[0] = xmm0; |
32 | | printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n", |
33 | | buf[0], buf[1], buf[2], buf[3], |
34 | | buf[4], buf[5], buf[6], buf[7], |
35 | | buf[8], buf[9], buf[10], buf[11], |
36 | | buf[12], buf[13], buf[14], buf[15]); |
37 | | } |
38 | | #endif |
39 | | |
40 | | |
41 | | /* Routine optimized for shuffling a buffer for a type size of 2 bytes. */ |
42 | | static void |
43 | | shuffle2_sse2(uint8_t* const dest, const uint8_t* const src, |
44 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
45 | 0 | static const int32_t bytesoftype = 2; |
46 | 0 | int32_t j; |
47 | 0 | int k; |
48 | 0 | uint8_t* dest_for_jth_element; |
49 | 0 | __m128i xmm0[2], xmm1[2]; |
50 | |
51 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { |
52 | | /* Fetch 16 elements (32 bytes) then transpose bytes, words and double words. */ |
53 | 0 | for (k = 0; k < 2; k++) { |
54 | 0 | xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i)))); |
55 | 0 | xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8); |
56 | 0 | xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8); |
57 | 0 | xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); |
58 | 0 | xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); |
59 | 0 | xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]); |
60 | 0 | xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); |
61 | 0 | xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); |
62 | 0 | xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]); |
63 | 0 | xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); |
64 | 0 | } |
65 | | /* Transpose quad words */ |
66 | 0 | for (k = 0; k < 1; k++) { |
67 | 0 | xmm1[k * 2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k + 1]); |
68 | 0 | xmm1[k * 2 + 1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k + 1]); |
69 | 0 | } |
70 | | /* Store the result vectors */ |
71 | 0 | dest_for_jth_element = dest + j; |
72 | 0 | for (k = 0; k < 2; k++) { |
73 | 0 | _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm1[k]); |
74 | 0 | } |
75 | 0 | } |
76 | 0 | } |
77 | | |
78 | | /* Routine optimized for shuffling a buffer for a type size of 4 bytes. */ |
79 | | static void |
80 | | shuffle4_sse2(uint8_t* const dest, const uint8_t* const src, |
81 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
82 | 0 | static const int32_t bytesoftype = 4; |
83 | 0 | int32_t i; |
84 | 0 | int j; |
85 | 0 | uint8_t* dest_for_ith_element; |
86 | 0 | __m128i xmm0[4], xmm1[4]; |
87 | |
88 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
89 | | /* Fetch 16 elements (64 bytes) then transpose bytes and words. */ |
90 | 0 | for (j = 0; j < 4; j++) { |
91 | 0 | xmm0[j] = _mm_loadu_si128((__m128i*)(src + (i * bytesoftype) + (j * sizeof(__m128i)))); |
92 | 0 | xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0xd8); |
93 | 0 | xmm0[j] = _mm_shuffle_epi32(xmm0[j], 0x8d); |
94 | 0 | xmm0[j] = _mm_unpacklo_epi8(xmm1[j], xmm0[j]); |
95 | 0 | xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0x04e); |
96 | 0 | xmm0[j] = _mm_unpacklo_epi16(xmm0[j], xmm1[j]); |
97 | 0 | } |
98 | | /* Transpose double words */ |
99 | 0 | for (j = 0; j < 2; j++) { |
100 | 0 | xmm1[j * 2] = _mm_unpacklo_epi32(xmm0[j * 2], xmm0[j * 2 + 1]); |
101 | 0 | xmm1[j * 2 + 1] = _mm_unpackhi_epi32(xmm0[j * 2], xmm0[j * 2 + 1]); |
102 | 0 | } |
103 | | /* Transpose quad words */ |
104 | 0 | for (j = 0; j < 2; j++) { |
105 | 0 | xmm0[j * 2] = _mm_unpacklo_epi64(xmm1[j], xmm1[j + 2]); |
106 | 0 | xmm0[j * 2 + 1] = _mm_unpackhi_epi64(xmm1[j], xmm1[j + 2]); |
107 | 0 | } |
108 | | /* Store the result vectors */ |
109 | 0 | dest_for_ith_element = dest + i; |
110 | 0 | for (j = 0; j < 4; j++) { |
111 | 0 | _mm_storeu_si128((__m128i*)(dest_for_ith_element + (j * total_elements)), xmm0[j]); |
112 | 0 | } |
113 | 0 | } |
114 | 0 | } |
115 | | |
116 | | /* Routine optimized for shuffling a buffer for a type size of 8 bytes. */ |
117 | | static void |
118 | | shuffle8_sse2(uint8_t* const dest, const uint8_t* const src, |
119 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
120 | 0 | static const int32_t bytesoftype = 8; |
121 | 0 | int32_t j; |
122 | 0 | int k, l; |
123 | 0 | uint8_t* dest_for_jth_element; |
124 | 0 | __m128i xmm0[8], xmm1[8]; |
125 | |
126 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { |
127 | | /* Fetch 16 elements (128 bytes) then transpose bytes. */ |
128 | 0 | for (k = 0; k < 8; k++) { |
129 | 0 | xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i)))); |
130 | 0 | xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); |
131 | 0 | xmm1[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]); |
132 | 0 | } |
133 | | /* Transpose words */ |
134 | 0 | for (k = 0, l = 0; k < 4; k++, l += 2) { |
135 | 0 | xmm0[k * 2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l + 1]); |
136 | 0 | xmm0[k * 2 + 1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l + 1]); |
137 | 0 | } |
138 | | /* Transpose double words */ |
139 | 0 | for (k = 0, l = 0; k < 4; k++, l++) { |
140 | 0 | if (k == 2) l += 2; |
141 | 0 | xmm1[k * 2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l + 2]); |
142 | 0 | xmm1[k * 2 + 1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l + 2]); |
143 | 0 | } |
144 | | /* Transpose quad words */ |
145 | 0 | for (k = 0; k < 4; k++) { |
146 | 0 | xmm0[k * 2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k + 4]); |
147 | 0 | xmm0[k * 2 + 1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k + 4]); |
148 | 0 | } |
149 | | /* Store the result vectors */ |
150 | 0 | dest_for_jth_element = dest + j; |
151 | 0 | for (k = 0; k < 8; k++) { |
152 | 0 | _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]); |
153 | 0 | } |
154 | 0 | } |
155 | 0 | } |
156 | | |
157 | | /* Routine optimized for shuffling a buffer for a type size of 16 bytes. */ |
158 | | static void |
159 | | shuffle16_sse2(uint8_t* const dest, const uint8_t* const src, |
160 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
161 | 0 | static const int32_t bytesoftype = 16; |
162 | 0 | int32_t j; |
163 | 0 | int k, l; |
164 | 0 | uint8_t* dest_for_jth_element; |
165 | 0 | __m128i xmm0[16], xmm1[16]; |
166 | |
167 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { |
168 | | /* Fetch 16 elements (256 bytes). */ |
169 | 0 | for (k = 0; k < 16; k++) { |
170 | 0 | xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i)))); |
171 | 0 | } |
172 | | /* Transpose bytes */ |
173 | 0 | for (k = 0, l = 0; k < 8; k++, l += 2) { |
174 | 0 | xmm1[k * 2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l + 1]); |
175 | 0 | xmm1[k * 2 + 1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l + 1]); |
176 | 0 | } |
177 | | /* Transpose words */ |
178 | 0 | for (k = 0, l = -2; k < 8; k++, l++) { |
179 | 0 | if ((k % 2) == 0) l += 2; |
180 | 0 | xmm0[k * 2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l + 2]); |
181 | 0 | xmm0[k * 2 + 1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l + 2]); |
182 | 0 | } |
183 | | /* Transpose double words */ |
184 | 0 | for (k = 0, l = -4; k < 8; k++, l++) { |
185 | 0 | if ((k % 4) == 0) l += 4; |
186 | 0 | xmm1[k * 2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l + 4]); |
187 | 0 | xmm1[k * 2 + 1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l + 4]); |
188 | 0 | } |
189 | | /* Transpose quad words */ |
190 | 0 | for (k = 0; k < 8; k++) { |
191 | 0 | xmm0[k * 2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k + 8]); |
192 | 0 | xmm0[k * 2 + 1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k + 8]); |
193 | 0 | } |
194 | | /* Store the result vectors */ |
195 | 0 | dest_for_jth_element = dest + j; |
196 | 0 | for (k = 0; k < 16; k++) { |
197 | 0 | _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]); |
198 | 0 | } |
199 | 0 | } |
200 | 0 | } |
201 | | |
202 | | /* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */ |
203 | | static void |
204 | | shuffle16_tiled_sse2(uint8_t* const dest, const uint8_t* const src, |
205 | 0 | const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) { |
206 | 0 | int32_t j; |
207 | 0 | const int32_t vecs_per_el_rem = bytesoftype % (int32_t)sizeof(__m128i); |
208 | 0 | int k, l; |
209 | 0 | uint8_t* dest_for_jth_element; |
210 | 0 | __m128i xmm0[16], xmm1[16]; |
211 | |
212 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { |
213 | | /* Advance the offset into the type by the vector size (in bytes), unless this is |
214 | | the initial iteration and the type size is not a multiple of the vector size. |
215 | | In that case, only advance by the number of bytes necessary so that the number |
216 | | of remaining bytes in the type will be a multiple of the vector size. */ |
217 | 0 | int32_t offset_into_type; |
218 | 0 | for (offset_into_type = 0; offset_into_type < bytesoftype; |
219 | 0 | offset_into_type += (offset_into_type == 0 && |
220 | 0 | vecs_per_el_rem > 0 ? vecs_per_el_rem : (int32_t)sizeof(__m128i))) { |
221 | | |
222 | | /* Fetch elements in groups of 256 bytes */ |
223 | 0 | const uint8_t* const src_with_offset = src + offset_into_type; |
224 | 0 | for (k = 0; k < 16; k++) { |
225 | 0 | xmm0[k] = _mm_loadu_si128((__m128i*)(src_with_offset + (j + k) * bytesoftype)); |
226 | 0 | } |
227 | | /* Transpose bytes */ |
228 | 0 | for (k = 0, l = 0; k < 8; k++, l += 2) { |
229 | 0 | xmm1[k * 2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l + 1]); |
230 | 0 | xmm1[k * 2 + 1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l + 1]); |
231 | 0 | } |
232 | | /* Transpose words */ |
233 | 0 | for (k = 0, l = -2; k < 8; k++, l++) { |
234 | 0 | if ((k % 2) == 0) l += 2; |
235 | 0 | xmm0[k * 2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l + 2]); |
236 | 0 | xmm0[k * 2 + 1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l + 2]); |
237 | 0 | } |
238 | | /* Transpose double words */ |
239 | 0 | for (k = 0, l = -4; k < 8; k++, l++) { |
240 | 0 | if ((k % 4) == 0) l += 4; |
241 | 0 | xmm1[k * 2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l + 4]); |
242 | 0 | xmm1[k * 2 + 1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l + 4]); |
243 | 0 | } |
244 | | /* Transpose quad words */ |
245 | 0 | for (k = 0; k < 8; k++) { |
246 | 0 | xmm0[k * 2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k + 8]); |
247 | 0 | xmm0[k * 2 + 1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k + 8]); |
248 | 0 | } |
249 | | /* Store the result vectors */ |
250 | 0 | dest_for_jth_element = dest + j; |
251 | 0 | for (k = 0; k < 16; k++) { |
252 | 0 | _mm_storeu_si128((__m128i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), xmm0[k]); |
253 | 0 | } |
254 | 0 | } |
255 | 0 | } |
256 | 0 | } |
257 | | |
258 | | /* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */ |
259 | | static void |
260 | | unshuffle2_sse2(uint8_t* const dest, const uint8_t* const src, |
261 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
262 | 0 | static const int32_t bytesoftype = 2; |
263 | 0 | int32_t i; |
264 | 0 | int j; |
265 | 0 | __m128i xmm0[2], xmm1[2]; |
266 | |
267 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
268 | | /* Load 16 elements (32 bytes) into 2 XMM registers. */ |
269 | 0 | const uint8_t* const src_for_ith_element = src + i; |
270 | 0 | for (j = 0; j < 2; j++) { |
271 | 0 | xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); |
272 | 0 | } |
273 | | /* Shuffle bytes */ |
274 | | /* Compute the low 32 bytes */ |
275 | 0 | xmm1[0] = _mm_unpacklo_epi8(xmm0[0], xmm0[1]); |
276 | | /* Compute the hi 32 bytes */ |
277 | 0 | xmm1[1] = _mm_unpackhi_epi8(xmm0[0], xmm0[1]); |
278 | | /* Store the result vectors in proper order */ |
279 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]); |
280 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[1]); |
281 | 0 | } |
282 | 0 | } |
283 | | |
284 | | /* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */ |
285 | | static void |
286 | | unshuffle4_sse2(uint8_t* const dest, const uint8_t* const src, |
287 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
288 | 0 | static const int32_t bytesoftype = 4; |
289 | 0 | int32_t i; |
290 | 0 | int j; |
291 | 0 | __m128i xmm0[4], xmm1[4]; |
292 | |
293 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
294 | | /* Load 16 elements (64 bytes) into 4 XMM registers. */ |
295 | 0 | const uint8_t* const src_for_ith_element = src + i; |
296 | 0 | for (j = 0; j < 4; j++) { |
297 | 0 | xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); |
298 | 0 | } |
299 | | /* Shuffle bytes */ |
300 | 0 | for (j = 0; j < 2; j++) { |
301 | | /* Compute the low 32 bytes */ |
302 | 0 | xmm1[j] = _mm_unpacklo_epi8(xmm0[j * 2], xmm0[j * 2 + 1]); |
303 | | /* Compute the hi 32 bytes */ |
304 | 0 | xmm1[2 + j] = _mm_unpackhi_epi8(xmm0[j * 2], xmm0[j * 2 + 1]); |
305 | 0 | } |
306 | | /* Shuffle 2-byte words */ |
307 | 0 | for (j = 0; j < 2; j++) { |
308 | | /* Compute the low 32 bytes */ |
309 | 0 | xmm0[j] = _mm_unpacklo_epi16(xmm1[j * 2], xmm1[j * 2 + 1]); |
310 | | /* Compute the hi 32 bytes */ |
311 | 0 | xmm0[2 + j] = _mm_unpackhi_epi16(xmm1[j * 2], xmm1[j * 2 + 1]); |
312 | 0 | } |
313 | | /* Store the result vectors in proper order */ |
314 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm0[0]); |
315 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm0[2]); |
316 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm0[1]); |
317 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm0[3]); |
318 | 0 | } |
319 | 0 | } |
320 | | |
321 | | /* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */ |
322 | | static void |
323 | | unshuffle8_sse2(uint8_t* const dest, const uint8_t* const src, |
324 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
325 | 0 | static const int32_t bytesoftype = 8; |
326 | 0 | int32_t i; |
327 | 0 | int j; |
328 | 0 | __m128i xmm0[8], xmm1[8]; |
329 | |
330 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
331 | | /* Load 16 elements (128 bytes) into 8 XMM registers. */ |
332 | 0 | const uint8_t* const src_for_ith_element = src + i; |
333 | 0 | for (j = 0; j < 8; j++) { |
334 | 0 | xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); |
335 | 0 | } |
336 | | /* Shuffle bytes */ |
337 | 0 | for (j = 0; j < 4; j++) { |
338 | | /* Compute the low 32 bytes */ |
339 | 0 | xmm1[j] = _mm_unpacklo_epi8(xmm0[j * 2], xmm0[j * 2 + 1]); |
340 | | /* Compute the hi 32 bytes */ |
341 | 0 | xmm1[4 + j] = _mm_unpackhi_epi8(xmm0[j * 2], xmm0[j * 2 + 1]); |
342 | 0 | } |
343 | | /* Shuffle 2-byte words */ |
344 | 0 | for (j = 0; j < 4; j++) { |
345 | | /* Compute the low 32 bytes */ |
346 | 0 | xmm0[j] = _mm_unpacklo_epi16(xmm1[j * 2], xmm1[j * 2 + 1]); |
347 | | /* Compute the hi 32 bytes */ |
348 | 0 | xmm0[4 + j] = _mm_unpackhi_epi16(xmm1[j * 2], xmm1[j * 2 + 1]); |
349 | 0 | } |
350 | | /* Shuffle 4-byte dwords */ |
351 | 0 | for (j = 0; j < 4; j++) { |
352 | | /* Compute the low 32 bytes */ |
353 | 0 | xmm1[j] = _mm_unpacklo_epi32(xmm0[j * 2], xmm0[j * 2 + 1]); |
354 | | /* Compute the hi 32 bytes */ |
355 | 0 | xmm1[4 + j] = _mm_unpackhi_epi32(xmm0[j * 2], xmm0[j * 2 + 1]); |
356 | 0 | } |
357 | | /* Store the result vectors in proper order */ |
358 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]); |
359 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[4]); |
360 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[2]); |
361 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[6]); |
362 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), xmm1[1]); |
363 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[5]); |
364 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[3]); |
365 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[7]); |
366 | 0 | } |
367 | 0 | } |
368 | | |
369 | | /* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */ |
370 | | static void |
371 | | unshuffle16_sse2(uint8_t* const dest, const uint8_t* const src, |
372 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
373 | 0 | static const int32_t bytesoftype = 16; |
374 | 0 | int32_t i; |
375 | 0 | int j; |
376 | 0 | __m128i xmm1[16], xmm2[16]; |
377 | |
378 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
379 | | /* Load 16 elements (256 bytes) into 16 XMM registers. */ |
380 | 0 | const uint8_t* const src_for_ith_element = src + i; |
381 | 0 | for (j = 0; j < 16; j++) { |
382 | 0 | xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); |
383 | 0 | } |
384 | | /* Shuffle bytes */ |
385 | 0 | for (j = 0; j < 8; j++) { |
386 | | /* Compute the low 32 bytes */ |
387 | 0 | xmm2[j] = _mm_unpacklo_epi8(xmm1[j * 2], xmm1[j * 2 + 1]); |
388 | | /* Compute the hi 32 bytes */ |
389 | 0 | xmm2[8 + j] = _mm_unpackhi_epi8(xmm1[j * 2], xmm1[j * 2 + 1]); |
390 | 0 | } |
391 | | /* Shuffle 2-byte words */ |
392 | 0 | for (j = 0; j < 8; j++) { |
393 | | /* Compute the low 32 bytes */ |
394 | 0 | xmm1[j] = _mm_unpacklo_epi16(xmm2[j * 2], xmm2[j * 2 + 1]); |
395 | | /* Compute the hi 32 bytes */ |
396 | 0 | xmm1[8 + j] = _mm_unpackhi_epi16(xmm2[j * 2], xmm2[j * 2 + 1]); |
397 | 0 | } |
398 | | /* Shuffle 4-byte dwords */ |
399 | 0 | for (j = 0; j < 8; j++) { |
400 | | /* Compute the low 32 bytes */ |
401 | 0 | xmm2[j] = _mm_unpacklo_epi32(xmm1[j * 2], xmm1[j * 2 + 1]); |
402 | | /* Compute the hi 32 bytes */ |
403 | 0 | xmm2[8 + j] = _mm_unpackhi_epi32(xmm1[j * 2], xmm1[j * 2 + 1]); |
404 | 0 | } |
405 | | /* Shuffle 8-byte qwords */ |
406 | 0 | for (j = 0; j < 8; j++) { |
407 | | /* Compute the low 32 bytes */ |
408 | 0 | xmm1[j] = _mm_unpacklo_epi64(xmm2[j * 2], xmm2[j * 2 + 1]); |
409 | | /* Compute the hi 32 bytes */ |
410 | 0 | xmm1[8 + j] = _mm_unpackhi_epi64(xmm2[j * 2], xmm2[j * 2 + 1]); |
411 | 0 | } |
412 | | |
413 | | /* Store the result vectors in proper order */ |
414 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]); |
415 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[8]); |
416 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[4]); |
417 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[12]); |
418 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), xmm1[2]); |
419 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[10]); |
420 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[6]); |
421 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[14]); |
422 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (8 * sizeof(__m128i))), xmm1[1]); |
423 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (9 * sizeof(__m128i))), xmm1[9]); |
424 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (10 * sizeof(__m128i))), xmm1[5]); |
425 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (11 * sizeof(__m128i))), xmm1[13]); |
426 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (12 * sizeof(__m128i))), xmm1[3]); |
427 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (13 * sizeof(__m128i))), xmm1[11]); |
428 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (14 * sizeof(__m128i))), xmm1[7]); |
429 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (15 * sizeof(__m128i))), xmm1[15]); |
430 | 0 | } |
431 | 0 | } |
432 | | |
433 | | /* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */ |
434 | | static void |
435 | | unshuffle16_tiled_sse2(uint8_t* const dest, const uint8_t* const orig, |
436 | 0 | const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) { |
437 | 0 | int32_t i; |
438 | 0 | const int32_t vecs_per_el_rem = bytesoftype % (int32_t)sizeof(__m128i); |
439 | |
440 | 0 | int j; |
441 | 0 | uint8_t* dest_with_offset; |
442 | 0 | __m128i xmm1[16], xmm2[16]; |
443 | | |
444 | | /* The unshuffle loops are inverted (compared to shuffle16_tiled_sse2) |
445 | | to optimize cache utilization. */ |
446 | 0 | int32_t offset_into_type; |
447 | 0 | for (offset_into_type = 0; offset_into_type < bytesoftype; |
448 | 0 | offset_into_type += (offset_into_type == 0 && |
449 | 0 | vecs_per_el_rem > 0 ? vecs_per_el_rem : (int32_t)sizeof(__m128i))) { |
450 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
451 | | /* Load 16 elements (256 bytes) into 16 XMM registers */ |
452 | 0 | const uint8_t* const src_for_ith_element = orig + i; |
453 | 0 | for (j = 0; j < 16; j++) { |
454 | 0 | xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (total_elements * (offset_into_type + j)))); |
455 | 0 | } |
456 | | /* Shuffle bytes */ |
457 | 0 | for (j = 0; j < 8; j++) { |
458 | | /* Compute the low 32 bytes */ |
459 | 0 | xmm2[j] = _mm_unpacklo_epi8(xmm1[j * 2], xmm1[j * 2 + 1]); |
460 | | /* Compute the hi 32 bytes */ |
461 | 0 | xmm2[8 + j] = _mm_unpackhi_epi8(xmm1[j * 2], xmm1[j * 2 + 1]); |
462 | 0 | } |
463 | | /* Shuffle 2-byte words */ |
464 | 0 | for (j = 0; j < 8; j++) { |
465 | | /* Compute the low 32 bytes */ |
466 | 0 | xmm1[j] = _mm_unpacklo_epi16(xmm2[j * 2], xmm2[j * 2 + 1]); |
467 | | /* Compute the hi 32 bytes */ |
468 | 0 | xmm1[8 + j] = _mm_unpackhi_epi16(xmm2[j * 2], xmm2[j * 2 + 1]); |
469 | 0 | } |
470 | | /* Shuffle 4-byte dwords */ |
471 | 0 | for (j = 0; j < 8; j++) { |
472 | | /* Compute the low 32 bytes */ |
473 | 0 | xmm2[j] = _mm_unpacklo_epi32(xmm1[j * 2], xmm1[j * 2 + 1]); |
474 | | /* Compute the hi 32 bytes */ |
475 | 0 | xmm2[8 + j] = _mm_unpackhi_epi32(xmm1[j * 2], xmm1[j * 2 + 1]); |
476 | 0 | } |
477 | | /* Shuffle 8-byte qwords */ |
478 | 0 | for (j = 0; j < 8; j++) { |
479 | | /* Compute the low 32 bytes */ |
480 | 0 | xmm1[j] = _mm_unpacklo_epi64(xmm2[j * 2], xmm2[j * 2 + 1]); |
481 | | /* Compute the hi 32 bytes */ |
482 | 0 | xmm1[8 + j] = _mm_unpackhi_epi64(xmm2[j * 2], xmm2[j * 2 + 1]); |
483 | 0 | } |
484 | | |
485 | | /* Store the result vectors in proper order */ |
486 | 0 | dest_with_offset = dest + offset_into_type; |
487 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 0) * bytesoftype), xmm1[0]); |
488 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 1) * bytesoftype), xmm1[8]); |
489 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 2) * bytesoftype), xmm1[4]); |
490 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 3) * bytesoftype), xmm1[12]); |
491 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 4) * bytesoftype), xmm1[2]); |
492 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 5) * bytesoftype), xmm1[10]); |
493 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 6) * bytesoftype), xmm1[6]); |
494 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 7) * bytesoftype), xmm1[14]); |
495 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 8) * bytesoftype), xmm1[1]); |
496 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 9) * bytesoftype), xmm1[9]); |
497 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 10) * bytesoftype), xmm1[5]); |
498 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 11) * bytesoftype), xmm1[13]); |
499 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 12) * bytesoftype), xmm1[3]); |
500 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 13) * bytesoftype), xmm1[11]); |
501 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 14) * bytesoftype), xmm1[7]); |
502 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 15) * bytesoftype), xmm1[15]); |
503 | 0 | } |
504 | 0 | } |
505 | 0 | } |
506 | | |
507 | | /* Shuffle a block. This can never fail. */ |
508 | | void |
509 | | shuffle_sse2(const int32_t bytesoftype, const int32_t blocksize, |
510 | 0 | const uint8_t *_src, uint8_t *_dest) { |
511 | 0 | const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m128i); |
512 | | /* If the blocksize is not a multiple of both the typesize and |
513 | | the vector size, round the blocksize down to the next value |
514 | | which is a multiple of both. The vectorized shuffle can be |
515 | | used for that portion of the data, and the naive implementation |
516 | | can be used for the remaining portion. */ |
517 | 0 | const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size); |
518 | 0 | const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype; |
519 | 0 | const int32_t total_elements = blocksize / bytesoftype; |
520 | | |
521 | | /* If the block size is too small to be vectorized, |
522 | | use the generic implementation. */ |
523 | 0 | if (blocksize < vectorized_chunk_size) { |
524 | 0 | shuffle_generic(bytesoftype, blocksize, _src, _dest); |
525 | 0 | return; |
526 | 0 | } |
527 | | |
528 | | /* Optimized shuffle implementations */ |
529 | 0 | switch (bytesoftype) { |
530 | 0 | case 2: |
531 | 0 | shuffle2_sse2(_dest, _src, vectorizable_elements, total_elements); |
532 | 0 | break; |
533 | 0 | case 4: |
534 | 0 | shuffle4_sse2(_dest, _src, vectorizable_elements, total_elements); |
535 | 0 | break; |
536 | 0 | case 8: |
537 | 0 | shuffle8_sse2(_dest, _src, vectorizable_elements, total_elements); |
538 | 0 | break; |
539 | 0 | case 16: |
540 | 0 | shuffle16_sse2(_dest, _src, vectorizable_elements, total_elements); |
541 | 0 | break; |
542 | 0 | default: |
543 | 0 | if (bytesoftype > (int32_t)sizeof(__m128i)) { |
544 | 0 | shuffle16_tiled_sse2(_dest, _src, vectorizable_elements, total_elements, bytesoftype); |
545 | 0 | } |
546 | 0 | else { |
547 | | /* Non-optimized shuffle */ |
548 | 0 | shuffle_generic(bytesoftype, blocksize, _src, _dest); |
549 | | /* The non-optimized function covers the whole buffer, |
550 | | so we're done processing here. */ |
551 | 0 | return; |
552 | 0 | } |
553 | 0 | } |
554 | | |
555 | | /* If the buffer had any bytes at the end which couldn't be handled |
556 | | by the vectorized implementations, use the non-optimized version |
557 | | to finish them up. */ |
558 | 0 | if (vectorizable_bytes < blocksize) { |
559 | 0 | shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest); |
560 | 0 | } |
561 | 0 | } |
562 | | |
563 | | /* Unshuffle a block. This can never fail. */ |
564 | | void |
565 | | unshuffle_sse2(const int32_t bytesoftype, const int32_t blocksize, |
566 | 0 | const uint8_t *_src, uint8_t *_dest) { |
567 | 0 | const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m128i); |
568 | | /* If the blocksize is not a multiple of both the typesize and |
569 | | the vector size, round the blocksize down to the next value |
570 | | which is a multiple of both. The vectorized unshuffle can be |
571 | | used for that portion of the data, and the naive implementation |
572 | | can be used for the remaining portion. */ |
573 | 0 | const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size); |
574 | 0 | const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype; |
575 | 0 | const int32_t total_elements = blocksize / bytesoftype; |
576 | | |
577 | | /* If the block size is too small to be vectorized, |
578 | | use the generic implementation. */ |
579 | 0 | if (blocksize < vectorized_chunk_size) { |
580 | 0 | unshuffle_generic(bytesoftype, blocksize, _src, _dest); |
581 | 0 | return; |
582 | 0 | } |
583 | | |
584 | | /* Optimized unshuffle implementations */ |
585 | 0 | switch (bytesoftype) { |
586 | 0 | case 2: |
587 | 0 | unshuffle2_sse2(_dest, _src, vectorizable_elements, total_elements); |
588 | 0 | break; |
589 | 0 | case 4: |
590 | 0 | unshuffle4_sse2(_dest, _src, vectorizable_elements, total_elements); |
591 | 0 | break; |
592 | 0 | case 8: |
593 | 0 | unshuffle8_sse2(_dest, _src, vectorizable_elements, total_elements); |
594 | 0 | break; |
595 | 0 | case 16: |
596 | 0 | unshuffle16_sse2(_dest, _src, vectorizable_elements, total_elements); |
597 | 0 | break; |
598 | 0 | default: |
599 | 0 | if (bytesoftype > (int32_t)sizeof(__m128i)) { |
600 | 0 | unshuffle16_tiled_sse2(_dest, _src, vectorizable_elements, total_elements, bytesoftype); |
601 | 0 | } |
602 | 0 | else { |
603 | | /* Non-optimized unshuffle */ |
604 | 0 | unshuffle_generic(bytesoftype, blocksize, _src, _dest); |
605 | | /* The non-optimized function covers the whole buffer, |
606 | | so we're done processing here. */ |
607 | 0 | return; |
608 | 0 | } |
609 | 0 | } |
610 | | |
611 | | /* If the buffer had any bytes at the end which couldn't be handled |
612 | | by the vectorized implementations, use the non-optimized version |
613 | | to finish them up. */ |
614 | 0 | if (vectorizable_bytes < blocksize) { |
615 | 0 | unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest); |
616 | 0 | } |
617 | 0 | } |
618 | | |
619 | | const bool is_shuffle_sse2 = true; |
620 | | |
621 | | #else /* defined(__SSE2__) */ |
622 | | |
623 | | const bool is_shuffle_sse2 = false; |
624 | | |
625 | | void shuffle_sse2(const int32_t bytesoftype, const int32_t blocksize, |
626 | | const uint8_t *_src, uint8_t *_dest) { |
627 | | abort(); |
628 | | } |
629 | | |
630 | | void unshuffle_sse2(const int32_t bytesoftype, const int32_t blocksize, |
631 | | const uint8_t *_src, uint8_t *_dest) { |
632 | | abort(); |
633 | | } |
634 | | |
635 | | #endif /* defined(__SSE2__) */ |