/src/c-blosc2/blosc/shuffle-sse2.c
Line | Count | Source |
1 | | /********************************************************************* |
2 | | Blosc - Blocked Shuffling and Compression Library |
3 | | |
4 | | Copyright (c) 2021 Blosc Development Team <blosc@blosc.org> |
5 | | https://blosc.org |
6 | | License: BSD 3-Clause (see LICENSE.txt) |
7 | | |
8 | | See LICENSE.txt for details about copyright and rights to use. |
9 | | **********************************************************************/ |
10 | | |
11 | | #include "shuffle-sse2.h" |
12 | | #include "shuffle-generic.h" |
13 | | #include <stdlib.h> |
14 | | |
15 | | /* Make sure SSE2 is available for the compilation target and compiler. */ |
16 | | #if defined(__SSE2__) |
17 | | |
18 | | #include <emmintrin.h> |
19 | | |
20 | | #include <stdint.h> |
21 | | |
22 | | /* The next is useful for debugging purposes */ |
23 | | #if 0 |
24 | | #include <stdio.h> |
25 | | #include <string.h> |
26 | | |
27 | | static void printxmm(__m128i xmm0) |
28 | | { |
29 | | uint8_t buf[16]; |
30 | | |
31 | | ((__m128i *)buf)[0] = xmm0; |
32 | | printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n", |
33 | | buf[0], buf[1], buf[2], buf[3], |
34 | | buf[4], buf[5], buf[6], buf[7], |
35 | | buf[8], buf[9], buf[10], buf[11], |
36 | | buf[12], buf[13], buf[14], buf[15]); |
37 | | } |
38 | | #endif |
39 | | |
40 | | |
41 | | /* Routine optimized for shuffling a buffer for a type size of 2 bytes. */ |
42 | | static void |
43 | | shuffle2_sse2(uint8_t* const dest, const uint8_t* const src, |
44 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
45 | 0 | static const int32_t bytesoftype = 2; |
46 | 0 | int32_t j; |
47 | 0 | int k; |
48 | 0 | uint8_t* dest_for_jth_element; |
49 | 0 | __m128i xmm0[2], xmm1[2]; |
50 | |
51 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { |
52 | | /* Fetch 16 elements (32 bytes) then transpose bytes, words and double words. */ |
53 | 0 | for (k = 0; k < 2; k++) { |
54 | 0 | xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i)))); |
55 | 0 | xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8); |
56 | 0 | xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8); |
57 | 0 | xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); |
58 | 0 | xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); |
59 | 0 | xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]); |
60 | 0 | xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); |
61 | 0 | xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); |
62 | 0 | xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]); |
63 | 0 | xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8); |
64 | 0 | } |
65 | | /* Transpose quad words */ |
66 | 0 | for (k = 0; k < 1; k++) { |
67 | 0 | xmm1[k * 2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k + 1]); |
68 | 0 | xmm1[k * 2 + 1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k + 1]); |
69 | 0 | } |
70 | | /* Store the result vectors */ |
71 | 0 | dest_for_jth_element = dest + j; |
72 | 0 | for (k = 0; k < 2; k++) { |
73 | 0 | _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm1[k]); |
74 | 0 | } |
75 | 0 | } |
76 | 0 | } |
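For reference, the transform that shuffle2_sse2 and the other shuffleN_sse2 routines vectorize is the plain byte transpose sketched below. This is a hypothetical scalar helper (equivalent in spirit to shuffle_generic), shown only to make the intent of the SIMD transposes explicit; it is not part of this file.

/* Scalar sketch of the shuffle transform: byte b of element j lands in
   byte-plane b at position j. The SSE2 routines above produce the same
   layout 16 elements at a time. */
static void shuffle_scalar_sketch(uint8_t* dest, const uint8_t* src,
                                  int32_t elements, int32_t bytesoftype) {
  for (int32_t j = 0; j < elements; j++) {
    for (int32_t b = 0; b < bytesoftype; b++) {
      dest[b * elements + j] = src[j * bytesoftype + b];
    }
  }
}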
77 | | |
78 | | /* Routine optimized for shuffling a buffer for a type size of 4 bytes. */ |
79 | | static void |
80 | | shuffle4_sse2(uint8_t* const dest, const uint8_t* const src, |
81 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
82 | 0 | static const int32_t bytesoftype = 4; |
83 | 0 | int32_t i; |
84 | 0 | int j; |
85 | 0 | uint8_t* dest_for_ith_element; |
86 | 0 | __m128i xmm0[4], xmm1[4]; |
87 | |
88 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
89 | | /* Fetch 16 elements (64 bytes) then transpose bytes and words. */ |
90 | 0 | for (j = 0; j < 4; j++) { |
91 | 0 | xmm0[j] = _mm_loadu_si128((__m128i*)(src + (i * bytesoftype) + (j * sizeof(__m128i)))); |
92 | 0 | xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0xd8); |
93 | 0 | xmm0[j] = _mm_shuffle_epi32(xmm0[j], 0x8d); |
94 | 0 | xmm0[j] = _mm_unpacklo_epi8(xmm1[j], xmm0[j]); |
95 | 0 | xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0x04e); |
96 | 0 | xmm0[j] = _mm_unpacklo_epi16(xmm0[j], xmm1[j]); |
97 | 0 | } |
98 | | /* Transpose double words */ |
99 | 0 | for (j = 0; j < 2; j++) { |
100 | 0 | xmm1[j * 2] = _mm_unpacklo_epi32(xmm0[j * 2], xmm0[j * 2 + 1]); |
101 | 0 | xmm1[j * 2 + 1] = _mm_unpackhi_epi32(xmm0[j * 2], xmm0[j * 2 + 1]); |
102 | 0 | } |
103 | | /* Transpose quad words */ |
104 | 0 | for (j = 0; j < 2; j++) { |
105 | 0 | xmm0[j * 2] = _mm_unpacklo_epi64(xmm1[j], xmm1[j + 2]); |
106 | 0 | xmm0[j * 2 + 1] = _mm_unpackhi_epi64(xmm1[j], xmm1[j + 2]); |
107 | 0 | } |
108 | | /* Store the result vectors */ |
109 | 0 | dest_for_ith_element = dest + i; |
110 | 0 | for (j = 0; j < 4; j++) { |
111 | 0 | _mm_storeu_si128((__m128i*)(dest_for_ith_element + (j * total_elements)), xmm0[j]); |
112 | 0 | } |
113 | 0 | } |
114 | 0 | } |
115 | | |
116 | | /* Routine optimized for shuffling a buffer for a type size of 8 bytes. */ |
117 | | static void |
118 | | shuffle8_sse2(uint8_t* const dest, const uint8_t* const src, |
119 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
120 | 0 | static const int32_t bytesoftype = 8; |
121 | 0 | int32_t j; |
122 | 0 | int k, l; |
123 | 0 | uint8_t* dest_for_jth_element; |
124 | 0 | __m128i xmm0[8], xmm1[8]; |
125 | |
126 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { |
127 | | /* Fetch 16 elements (128 bytes) then transpose bytes. */ |
128 | 0 | for (k = 0; k < 8; k++) { |
129 | 0 | xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i)))); |
130 | 0 | xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e); |
131 | 0 | xmm1[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]); |
132 | 0 | } |
133 | | /* Transpose words */ |
134 | 0 | for (k = 0, l = 0; k < 4; k++, l += 2) { |
135 | 0 | xmm0[k * 2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l + 1]); |
136 | 0 | xmm0[k * 2 + 1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l + 1]); |
137 | 0 | } |
138 | | /* Transpose double words */ |
139 | 0 | for (k = 0, l = 0; k < 4; k++, l++) { |
140 | 0 | if (k == 2) l += 2; |
141 | 0 | xmm1[k * 2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l + 2]); |
142 | 0 | xmm1[k * 2 + 1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l + 2]); |
143 | 0 | } |
144 | | /* Transpose quad words */ |
145 | 0 | for (k = 0; k < 4; k++) { |
146 | 0 | xmm0[k * 2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k + 4]); |
147 | 0 | xmm0[k * 2 + 1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k + 4]); |
148 | 0 | } |
149 | | /* Store the result vectors */ |
150 | 0 | dest_for_jth_element = dest + j; |
151 | 0 | for (k = 0; k < 8; k++) { |
152 | 0 | _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]); |
153 | 0 | } |
154 | 0 | } |
155 | 0 | } |
156 | | |
157 | | /* Routine optimized for shuffling a buffer for a type size of 16 bytes. */ |
158 | | static void |
159 | | shuffle16_sse2(uint8_t* const dest, const uint8_t* const src, |
160 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
161 | 0 | static const int32_t bytesoftype = 16; |
162 | 0 | int32_t j; |
163 | 0 | int k, l; |
164 | 0 | uint8_t* dest_for_jth_element; |
165 | 0 | __m128i xmm0[16], xmm1[16]; |
166 | |
167 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { |
168 | | /* Fetch 16 elements (256 bytes). */ |
169 | 0 | for (k = 0; k < 16; k++) { |
170 | 0 | xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i)))); |
171 | 0 | } |
172 | | /* Transpose bytes */ |
173 | 0 | for (k = 0, l = 0; k < 8; k++, l += 2) { |
174 | 0 | xmm1[k * 2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l + 1]); |
175 | 0 | xmm1[k * 2 + 1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l + 1]); |
176 | 0 | } |
177 | | /* Transpose words */ |
178 | 0 | for (k = 0, l = -2; k < 8; k++, l++) { |
179 | 0 | if ((k % 2) == 0) l += 2; |
180 | 0 | xmm0[k * 2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l + 2]); |
181 | 0 | xmm0[k * 2 + 1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l + 2]); |
182 | 0 | } |
183 | | /* Transpose double words */ |
184 | 0 | for (k = 0, l = -4; k < 8; k++, l++) { |
185 | 0 | if ((k % 4) == 0) l += 4; |
186 | 0 | xmm1[k * 2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l + 4]); |
187 | 0 | xmm1[k * 2 + 1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l + 4]); |
188 | 0 | } |
189 | | /* Transpose quad words */ |
190 | 0 | for (k = 0; k < 8; k++) { |
191 | 0 | xmm0[k * 2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k + 8]); |
192 | 0 | xmm0[k * 2 + 1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k + 8]); |
193 | 0 | } |
194 | | /* Store the result vectors */ |
195 | 0 | dest_for_jth_element = dest + j; |
196 | 0 | for (k = 0; k < 16; k++) { |
197 | 0 | _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]); |
198 | 0 | } |
199 | 0 | } |
200 | 0 | } |
201 | | |
202 | | /* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */ |
203 | | static void |
204 | | shuffle16_tiled_sse2(uint8_t* const dest, const uint8_t* const src, |
205 | 0 | const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) { |
206 | 0 | int32_t j; |
207 | 0 | const int32_t vecs_per_el_rem = bytesoftype % (int32_t)sizeof(__m128i); |
208 | 0 | int k, l; |
209 | 0 | uint8_t* dest_for_jth_element; |
210 | 0 | __m128i xmm0[16], xmm1[16]; |
211 | |
212 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { |
213 | | /* Advance the offset into the type by the vector size (in bytes), unless this is |
214 | | the initial iteration and the type size is not a multiple of the vector size. |
215 | | In that case, only advance by the number of bytes necessary so that the number |
216 | | of remaining bytes in the type will be a multiple of the vector size. */ |
217 | 0 | int32_t offset_into_type; |
218 | 0 | for (offset_into_type = 0; offset_into_type < bytesoftype; |
219 | 0 | offset_into_type += (offset_into_type == 0 && |
220 | 0 | vecs_per_el_rem > 0 ? vecs_per_el_rem : (int32_t)sizeof(__m128i))) { |
221 | | |
222 | | /* Fetch elements in groups of 256 bytes */ |
223 | 0 | const uint8_t* const src_with_offset = src + offset_into_type; |
224 | 0 | for (k = 0; k < 16; k++) { |
225 | 0 | xmm0[k] = _mm_loadu_si128((__m128i*)(src_with_offset + (j + k) * bytesoftype)); |
226 | 0 | } |
227 | | /* Transpose bytes */ |
228 | 0 | for (k = 0, l = 0; k < 8; k++, l += 2) { |
229 | 0 | xmm1[k * 2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l + 1]); |
230 | 0 | xmm1[k * 2 + 1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l + 1]); |
231 | 0 | } |
232 | | /* Transpose words */ |
233 | 0 | for (k = 0, l = -2; k < 8; k++, l++) { |
234 | 0 | if ((k % 2) == 0) l += 2; |
235 | 0 | xmm0[k * 2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l + 2]); |
236 | 0 | xmm0[k * 2 + 1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l + 2]); |
237 | 0 | } |
238 | | /* Transpose double words */ |
239 | 0 | for (k = 0, l = -4; k < 8; k++, l++) { |
240 | 0 | if ((k % 4) == 0) l += 4; |
241 | 0 | xmm1[k * 2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l + 4]); |
242 | 0 | xmm1[k * 2 + 1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l + 4]); |
243 | 0 | } |
244 | | /* Transpose quad words */ |
245 | 0 | for (k = 0; k < 8; k++) { |
246 | 0 | xmm0[k * 2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k + 8]); |
247 | 0 | xmm0[k * 2 + 1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k + 8]); |
248 | 0 | } |
249 | | /* Store the result vectors */ |
250 | 0 | dest_for_jth_element = dest + j; |
251 | 0 | for (k = 0; k < 16; k++) { |
252 | 0 | _mm_storeu_si128((__m128i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), xmm0[k]); |
253 | 0 | } |
254 | 0 | } |
255 | 0 | } |
256 | 0 | } |
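To illustrate the offset_into_type logic above, take a hypothetical bytesoftype of 20 (the value is illustrative only, not from the library):

/* Hypothetical walk-through for bytesoftype = 20:
     vecs_per_el_rem = 20 % 16 = 4
     pass 1: offset_into_type = 0  -> byte positions  0..15 of each element
     pass 2: offset_into_type = 4  -> byte positions  4..19 of each element
   Positions 4..15 are written twice with identical data, so every pass can
   use full 16-byte loads and stores without stepping outside the element. */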
257 | | |
258 | | /* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */ |
259 | | static void |
260 | | unshuffle2_sse2(uint8_t* const dest, const uint8_t* const src, |
261 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
262 | 0 | static const int32_t bytesoftype = 2; |
263 | 0 | int32_t i; |
264 | 0 | int j; |
265 | 0 | __m128i xmm0[2], xmm1[2]; |
266 | |
267 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
268 | | /* Load 16 elements (32 bytes) into 2 XMM registers. */ |
269 | 0 | const uint8_t* const src_for_ith_element = src + i; |
270 | 0 | for (j = 0; j < 2; j++) { |
271 | 0 | xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); |
272 | 0 | } |
273 | | /* Shuffle bytes */ |
274 | | /* Compute the low 32 bytes */ |
275 | 0 | xmm1[0] = _mm_unpacklo_epi8(xmm0[0], xmm0[1]); |
276 | | /* Compute the hi 32 bytes */ |
277 | 0 | xmm1[1] = _mm_unpackhi_epi8(xmm0[0], xmm0[1]); |
278 | | /* Store the result vectors in proper order */ |
279 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]); |
280 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[1]); |
281 | 0 | } |
282 | 0 | } |
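The unshuffleN_sse2 routines implement the inverse transform; a hypothetical scalar sketch (equivalent in spirit to unshuffle_generic, not part of this file) is:

/* Scalar sketch of the unshuffle transform: byte-plane b, position j goes
   back to byte b of element j. */
static void unshuffle_scalar_sketch(uint8_t* dest, const uint8_t* src,
                                    int32_t elements, int32_t bytesoftype) {
  for (int32_t j = 0; j < elements; j++) {
    for (int32_t b = 0; b < bytesoftype; b++) {
      dest[j * bytesoftype + b] = src[b * elements + j];
    }
  }
}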
283 | | |
284 | | /* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */ |
285 | | static void |
286 | | unshuffle4_sse2(uint8_t* const dest, const uint8_t* const src, |
287 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
288 | 0 | static const int32_t bytesoftype = 4; |
289 | 0 | int32_t i; |
290 | 0 | int j; |
291 | 0 | __m128i xmm0[4], xmm1[4]; |
292 | |
293 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
294 | | /* Load 16 elements (64 bytes) into 4 XMM registers. */ |
295 | 0 | const uint8_t* const src_for_ith_element = src + i; |
296 | 0 | for (j = 0; j < 4; j++) { |
297 | 0 | xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); |
298 | 0 | } |
299 | | /* Shuffle bytes */ |
300 | 0 | for (j = 0; j < 2; j++) { |
301 | | /* Compute the low 32 bytes */ |
302 | 0 | xmm1[j] = _mm_unpacklo_epi8(xmm0[j * 2], xmm0[j * 2 + 1]); |
303 | | /* Compute the hi 32 bytes */ |
304 | 0 | xmm1[2 + j] = _mm_unpackhi_epi8(xmm0[j * 2], xmm0[j * 2 + 1]); |
305 | 0 | } |
306 | | /* Shuffle 2-byte words */ |
307 | 0 | for (j = 0; j < 2; j++) { |
308 | | /* Compute the low 32 bytes */ |
309 | 0 | xmm0[j] = _mm_unpacklo_epi16(xmm1[j * 2], xmm1[j * 2 + 1]); |
310 | | /* Compute the hi 32 bytes */ |
311 | 0 | xmm0[2 + j] = _mm_unpackhi_epi16(xmm1[j * 2], xmm1[j * 2 + 1]); |
312 | 0 | } |
313 | | /* Store the result vectors in proper order */ |
314 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm0[0]); |
315 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm0[2]); |
316 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm0[1]); |
317 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm0[3]); |
318 | 0 | } |
319 | 0 | } |
320 | | |
321 | | /* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */ |
322 | | static void |
323 | | unshuffle8_sse2(uint8_t* const dest, const uint8_t* const src, |
324 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
325 | 0 | static const int32_t bytesoftype = 8; |
326 | 0 | int32_t i; |
327 | 0 | int j; |
328 | 0 | __m128i xmm0[8], xmm1[8]; |
329 | |
330 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
331 | | /* Load 16 elements (128 bytes) into 8 XMM registers. */ |
332 | 0 | const uint8_t* const src_for_ith_element = src + i; |
333 | 0 | for (j = 0; j < 8; j++) { |
334 | 0 | xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); |
335 | 0 | } |
336 | | /* Shuffle bytes */ |
337 | 0 | for (j = 0; j < 4; j++) { |
338 | | /* Compute the low 32 bytes */ |
339 | 0 | xmm1[j] = _mm_unpacklo_epi8(xmm0[j * 2], xmm0[j * 2 + 1]); |
340 | | /* Compute the hi 32 bytes */ |
341 | 0 | xmm1[4 + j] = _mm_unpackhi_epi8(xmm0[j * 2], xmm0[j * 2 + 1]); |
342 | 0 | } |
343 | | /* Shuffle 2-byte words */ |
344 | 0 | for (j = 0; j < 4; j++) { |
345 | | /* Compute the low 32 bytes */ |
346 | 0 | xmm0[j] = _mm_unpacklo_epi16(xmm1[j * 2], xmm1[j * 2 + 1]); |
347 | | /* Compute the hi 32 bytes */ |
348 | 0 | xmm0[4 + j] = _mm_unpackhi_epi16(xmm1[j * 2], xmm1[j * 2 + 1]); |
349 | 0 | } |
350 | | /* Shuffle 4-byte dwords */ |
351 | 0 | for (j = 0; j < 4; j++) { |
352 | | /* Compute the low 32 bytes */ |
353 | 0 | xmm1[j] = _mm_unpacklo_epi32(xmm0[j * 2], xmm0[j * 2 + 1]); |
354 | | /* Compute the hi 32 bytes */ |
355 | 0 | xmm1[4 + j] = _mm_unpackhi_epi32(xmm0[j * 2], xmm0[j * 2 + 1]); |
356 | 0 | } |
357 | | /* Store the result vectors in proper order */ |
358 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]); |
359 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[4]); |
360 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[2]); |
361 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[6]); |
362 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), xmm1[1]); |
363 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[5]); |
364 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[3]); |
365 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[7]); |
366 | 0 | } |
367 | 0 | } |
368 | | |
369 | | /* Routine optimized for unshuffling a buffer for a type size of 12 bytes. */ |
370 | | /* Based on the 16-byte implementation */ |
371 | | static void |
372 | | unshuffle12_sse2(uint8_t* const dest, const uint8_t* const src, |
373 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
374 | 0 | static const int32_t bytesoftype = 12; |
375 | 0 | int32_t i; |
376 | 0 | int j; |
377 | 0 | __m128i xmm1[16], xmm2[16]; |
378 | |
379 | 0 | __m128i mask = _mm_set_epi8( 0x0, 0x0, 0x0, 0x0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff); |
380 | |
381 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
382 | | /* Load 16 elements (192 bytes) into 12 XMM registers. */ |
383 | 0 | const uint8_t* const src_for_ith_element = src + i; |
384 | 0 | for (j = 0; j < bytesoftype; j++) { |
385 | 0 | xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); |
386 | 0 | } |
387 | | /* Initialize the last 4 registers (64 bytes) to null */ |
388 | 0 | for (j = bytesoftype; j < 16; j++) { |
389 | 0 | xmm1[j] = _mm_setzero_si128(); |
390 | 0 | } |
391 | | /* Shuffle bytes */ |
392 | 0 | for (j = 0; j < 8; j++) { |
393 | | /* Compute the low 32 bytes */ |
394 | 0 | xmm2[j] = _mm_unpacklo_epi8(xmm1[j * 2], xmm1[j * 2 + 1]); |
395 | | /* Compute the hi 32 bytes */ |
396 | 0 | xmm2[8 + j] = _mm_unpackhi_epi8(xmm1[j * 2], xmm1[j * 2 + 1]); |
397 | 0 | } |
398 | | /* Shuffle 2-byte words */ |
399 | 0 | for (j = 0; j < 8; j++) { |
400 | | /* Compute the low 32 bytes */ |
401 | 0 | xmm1[j] = _mm_unpacklo_epi16(xmm2[j * 2], xmm2[j * 2 + 1]); |
402 | | /* Compute the hi 32 bytes */ |
403 | 0 | xmm1[8 + j] = _mm_unpackhi_epi16(xmm2[j * 2], xmm2[j * 2 + 1]); |
404 | 0 | } |
405 | | /* Shuffle 4-byte dwords */ |
406 | 0 | for (j = 0; j < 8; j++) { |
407 | | /* Compute the low 32 bytes */ |
408 | 0 | xmm2[j] = _mm_unpacklo_epi32(xmm1[j * 2], xmm1[j * 2 + 1]); |
409 | | /* Compute the hi 32 bytes */ |
410 | 0 | xmm2[8 + j] = _mm_unpackhi_epi32(xmm1[j * 2], xmm1[j * 2 + 1]); |
411 | 0 | } |
412 | | /* Shuffle 8-byte qwords */ |
413 | 0 | for (j = 0; j < 8; j++) { |
414 | | /* Compute the low 32 bytes */ |
415 | 0 | xmm1[j] = _mm_unpacklo_epi64(xmm2[j * 2], xmm2[j * 2 + 1]); |
416 | | /* Compute the hi 32 bytes */ |
417 | 0 | xmm1[8 + j] = _mm_unpackhi_epi64(xmm2[j * 2], xmm2[j * 2 + 1]); |
418 | 0 | } |
419 | | |
420 | | |
421 | | /* Store the result vectors in proper order */ |
422 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * 12)), xmm1[0]); |
423 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * 12)), xmm1[8]); |
424 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * 12)), xmm1[4]); |
425 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * 12)), xmm1[12]); |
426 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * 12)), xmm1[2]); |
427 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * 12)), xmm1[10]); |
428 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * 12)), xmm1[6]); |
429 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * 12)), xmm1[14]); |
430 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (8 * 12)), xmm1[1]); |
431 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (9 * 12)), xmm1[9]); |
432 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (10 * 12)), xmm1[5]); |
433 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (11 * 12)), xmm1[13]); |
434 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (12 * 12)), xmm1[3]); |
435 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (13 * 12)), xmm1[11]); |
436 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (14 * 12)), xmm1[7]); |
437 | 0 | _mm_maskmoveu_si128(xmm1[15], mask, (char *)(dest + (i * bytesoftype) + (15 * 12))); |
438 | 0 | } |
439 | 0 | } |
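A note on the masked store that closes each iteration above: the output stride is 12 bytes per element, so a full 16-byte store of the last vector would run past the 192-byte tile, and _mm_maskmoveu_si128 writes only the bytes selected by mask.

/* Bounds of the last store in each 192-byte tile (typesize 12):
     start = 15 * 12 = 180; a full store would end at 180 + 16 = 196 > 192.
   The mask selects the low 12 bytes, so only valid output bytes are written. */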
440 | | |
441 | | /* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */ |
442 | | static void |
443 | | unshuffle16_sse2(uint8_t* const dest, const uint8_t* const src, |
444 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
445 | 0 | static const int32_t bytesoftype = 16; |
446 | 0 | int32_t i; |
447 | 0 | int j; |
448 | 0 | __m128i xmm1[16], xmm2[16]; |
449 | |
450 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
451 | | /* Load 16 elements (256 bytes) into 16 XMM registers. */ |
452 | 0 | const uint8_t* const src_for_ith_element = src + i; |
453 | 0 | for (j = 0; j < 16; j++) { |
454 | 0 | xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); |
455 | 0 | } |
456 | | /* Shuffle bytes */ |
457 | 0 | for (j = 0; j < 8; j++) { |
458 | | /* Compute the low 32 bytes */ |
459 | 0 | xmm2[j] = _mm_unpacklo_epi8(xmm1[j * 2], xmm1[j * 2 + 1]); |
460 | | /* Compute the hi 32 bytes */ |
461 | 0 | xmm2[8 + j] = _mm_unpackhi_epi8(xmm1[j * 2], xmm1[j * 2 + 1]); |
462 | 0 | } |
463 | | /* Shuffle 2-byte words */ |
464 | 0 | for (j = 0; j < 8; j++) { |
465 | | /* Compute the low 32 bytes */ |
466 | 0 | xmm1[j] = _mm_unpacklo_epi16(xmm2[j * 2], xmm2[j * 2 + 1]); |
467 | | /* Compute the hi 32 bytes */ |
468 | 0 | xmm1[8 + j] = _mm_unpackhi_epi16(xmm2[j * 2], xmm2[j * 2 + 1]); |
469 | 0 | } |
470 | | /* Shuffle 4-byte dwords */ |
471 | 0 | for (j = 0; j < 8; j++) { |
472 | | /* Compute the low 32 bytes */ |
473 | 0 | xmm2[j] = _mm_unpacklo_epi32(xmm1[j * 2], xmm1[j * 2 + 1]); |
474 | | /* Compute the hi 32 bytes */ |
475 | 0 | xmm2[8 + j] = _mm_unpackhi_epi32(xmm1[j * 2], xmm1[j * 2 + 1]); |
476 | 0 | } |
477 | | /* Shuffle 8-byte qwords */ |
478 | 0 | for (j = 0; j < 8; j++) { |
479 | | /* Compute the low 32 bytes */ |
480 | 0 | xmm1[j] = _mm_unpacklo_epi64(xmm2[j * 2], xmm2[j * 2 + 1]); |
481 | | /* Compute the hi 32 bytes */ |
482 | 0 | xmm1[8 + j] = _mm_unpackhi_epi64(xmm2[j * 2], xmm2[j * 2 + 1]); |
483 | 0 | } |
484 | | |
485 | | /* Store the result vectors in proper order */ |
486 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]); |
487 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[8]); |
488 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[4]); |
489 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[12]); |
490 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), xmm1[2]); |
491 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[10]); |
492 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[6]); |
493 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[14]); |
494 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (8 * sizeof(__m128i))), xmm1[1]); |
495 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (9 * sizeof(__m128i))), xmm1[9]); |
496 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (10 * sizeof(__m128i))), xmm1[5]); |
497 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (11 * sizeof(__m128i))), xmm1[13]); |
498 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (12 * sizeof(__m128i))), xmm1[3]); |
499 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (13 * sizeof(__m128i))), xmm1[11]); |
500 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (14 * sizeof(__m128i))), xmm1[7]); |
501 | 0 | _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (15 * sizeof(__m128i))), xmm1[15]); |
502 | 0 | } |
503 | 0 | } |
504 | | |
505 | | /* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */ |
506 | | static void |
507 | | unshuffle16_tiled_sse2(uint8_t* const dest, const uint8_t* const orig, |
508 | 0 | const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) { |
509 | 0 | int32_t i; |
510 | 0 | const int32_t vecs_per_el_rem = bytesoftype % (int32_t)sizeof(__m128i); |
511 | |
512 | 0 | int j; |
513 | 0 | uint8_t* dest_with_offset; |
514 | 0 | __m128i xmm1[16], xmm2[16]; |
515 | | |
516 | | /* The unshuffle loops are inverted (compared to shuffle16_tiled_sse2) |
517 | | to optimize cache utilization. */ |
518 | 0 | int32_t offset_into_type; |
519 | 0 | for (offset_into_type = 0; offset_into_type < bytesoftype; |
520 | 0 | offset_into_type += (offset_into_type == 0 && |
521 | 0 | vecs_per_el_rem > 0 ? vecs_per_el_rem : (int32_t)sizeof(__m128i))) { |
522 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { |
523 | | /* Load 256 bytes (one 16-byte vector per byte plane) into 16 XMM registers */ |
524 | 0 | const uint8_t* const src_for_ith_element = orig + i; |
525 | 0 | for (j = 0; j < 16; j++) { |
526 | 0 | xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (total_elements * (offset_into_type + j)))); |
527 | 0 | } |
528 | | /* Shuffle bytes */ |
529 | 0 | for (j = 0; j < 8; j++) { |
530 | | /* Compute the low 32 bytes */ |
531 | 0 | xmm2[j] = _mm_unpacklo_epi8(xmm1[j * 2], xmm1[j * 2 + 1]); |
532 | | /* Compute the hi 32 bytes */ |
533 | 0 | xmm2[8 + j] = _mm_unpackhi_epi8(xmm1[j * 2], xmm1[j * 2 + 1]); |
534 | 0 | } |
535 | | /* Shuffle 2-byte words */ |
536 | 0 | for (j = 0; j < 8; j++) { |
537 | | /* Compute the low 32 bytes */ |
538 | 0 | xmm1[j] = _mm_unpacklo_epi16(xmm2[j * 2], xmm2[j * 2 + 1]); |
539 | | /* Compute the hi 32 bytes */ |
540 | 0 | xmm1[8 + j] = _mm_unpackhi_epi16(xmm2[j * 2], xmm2[j * 2 + 1]); |
541 | 0 | } |
542 | | /* Shuffle 4-byte dwords */ |
543 | 0 | for (j = 0; j < 8; j++) { |
544 | | /* Compute the low 32 bytes */ |
545 | 0 | xmm2[j] = _mm_unpacklo_epi32(xmm1[j * 2], xmm1[j * 2 + 1]); |
546 | | /* Compute the hi 32 bytes */ |
547 | 0 | xmm2[8 + j] = _mm_unpackhi_epi32(xmm1[j * 2], xmm1[j * 2 + 1]); |
548 | 0 | } |
549 | | /* Shuffle 8-byte qwords */ |
550 | 0 | for (j = 0; j < 8; j++) { |
551 | | /* Compute the low 32 bytes */ |
552 | 0 | xmm1[j] = _mm_unpacklo_epi64(xmm2[j * 2], xmm2[j * 2 + 1]); |
553 | | /* Compute the hi 32 bytes */ |
554 | 0 | xmm1[8 + j] = _mm_unpackhi_epi64(xmm2[j * 2], xmm2[j * 2 + 1]); |
555 | 0 | } |
556 | | |
557 | | /* Store the result vectors in proper order */ |
558 | 0 | dest_with_offset = dest + offset_into_type; |
559 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 0) * bytesoftype), xmm1[0]); |
560 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 1) * bytesoftype), xmm1[8]); |
561 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 2) * bytesoftype), xmm1[4]); |
562 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 3) * bytesoftype), xmm1[12]); |
563 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 4) * bytesoftype), xmm1[2]); |
564 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 5) * bytesoftype), xmm1[10]); |
565 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 6) * bytesoftype), xmm1[6]); |
566 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 7) * bytesoftype), xmm1[14]); |
567 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 8) * bytesoftype), xmm1[1]); |
568 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 9) * bytesoftype), xmm1[9]); |
569 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 10) * bytesoftype), xmm1[5]); |
570 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 11) * bytesoftype), xmm1[13]); |
571 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 12) * bytesoftype), xmm1[3]); |
572 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 13) * bytesoftype), xmm1[11]); |
573 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 14) * bytesoftype), xmm1[7]); |
574 | 0 | _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 15) * bytesoftype), xmm1[15]); |
575 | 0 | } |
576 | 0 | } |
577 | 0 | } |
578 | | |
579 | | /* Shuffle a block. This can never fail. */ |
580 | | void |
581 | | shuffle_sse2(const int32_t bytesoftype, const int32_t blocksize, |
582 | 0 | const uint8_t *_src, uint8_t *_dest) { |
583 | 0 | const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m128i); |
584 | | /* If the blocksize is not a multiple of both the typesize and |
585 | | the vector size, round the blocksize down to the next value |
586 | | which is a multiple of both. The vectorized shuffle can be |
587 | | used for that portion of the data, and the naive implementation |
588 | | can be used for the remaining portion. */ |
589 | 0 | const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size); |
590 | 0 | const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype; |
591 | 0 | const int32_t total_elements = blocksize / bytesoftype; |
592 | | |
593 | | /* If the block size is too small to be vectorized, |
594 | | use the generic implementation. */ |
595 | 0 | if (blocksize < vectorized_chunk_size) { |
596 | 0 | shuffle_generic(bytesoftype, blocksize, _src, _dest); |
597 | 0 | return; |
598 | 0 | } |
599 | | |
600 | | /* Optimized shuffle implementations */ |
601 | 0 | switch (bytesoftype) { |
602 | 0 | case 2: |
603 | 0 | shuffle2_sse2(_dest, _src, vectorizable_elements, total_elements); |
604 | 0 | break; |
605 | 0 | case 4: |
606 | 0 | shuffle4_sse2(_dest, _src, vectorizable_elements, total_elements); |
607 | 0 | break; |
608 | 0 | case 8: |
609 | 0 | shuffle8_sse2(_dest, _src, vectorizable_elements, total_elements); |
610 | 0 | break; |
611 | 0 | case 16: |
612 | 0 | shuffle16_sse2(_dest, _src, vectorizable_elements, total_elements); |
613 | 0 | break; |
614 | 0 | default: |
615 | 0 | if (bytesoftype > (int32_t)sizeof(__m128i)) { |
616 | 0 | shuffle16_tiled_sse2(_dest, _src, vectorizable_elements, total_elements, bytesoftype); |
617 | 0 | } |
618 | 0 | else { |
619 | | /* Non-optimized shuffle */ |
620 | 0 | shuffle_generic(bytesoftype, blocksize, _src, _dest); |
621 | | /* The non-optimized function covers the whole buffer, |
622 | | so we're done processing here. */ |
623 | 0 | return; |
624 | 0 | } |
625 | 0 | } |
626 | | |
627 | | /* If the buffer had any bytes at the end which couldn't be handled |
628 | | by the vectorized implementations, use the non-optimized version |
629 | | to finish them up. */ |
630 | 0 | if (vectorizable_bytes < blocksize) { |
631 | 0 | shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest); |
632 | 0 | } |
633 | 0 | } |
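A worked example of the rounding performed above, with illustrative values only:

/* Illustration: bytesoftype = 4, blocksize = 1000
     vectorized_chunk_size = 4 * 16 = 64
     vectorizable_bytes    = 1000 - (1000 % 64) = 960
     vectorizable_elements = 960 / 4  = 240
     total_elements        = 1000 / 4 = 250
   shuffle4_sse2 covers bytes [0, 960); shuffle_generic_inline finishes
   the remaining 40 bytes. */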
634 | | |
635 | | /* Unshuffle a block. This can never fail. */ |
636 | | void |
637 | | unshuffle_sse2(const int32_t bytesoftype, const int32_t blocksize, |
638 | 0 | const uint8_t *_src, uint8_t *_dest) { |
639 | 0 | const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m128i); |
640 | | /* If the blocksize is not a multiple of both the typesize and |
641 | | the vector size, round the blocksize down to the next value |
642 | | which is a multiple of both. The vectorized unshuffle can be |
643 | | used for that portion of the data, and the naive implementation |
644 | | can be used for the remaining portion. */ |
645 | 0 | const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size); |
646 | 0 | const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype; |
647 | 0 | const int32_t total_elements = blocksize / bytesoftype; |
648 | | |
649 | | /* If the block size is too small to be vectorized, |
650 | | use the generic implementation. */ |
651 | 0 | if (blocksize < vectorized_chunk_size) { |
652 | 0 | unshuffle_generic(bytesoftype, blocksize, _src, _dest); |
653 | 0 | return; |
654 | 0 | } |
655 | | |
656 | | /* Optimized unshuffle implementations */ |
657 | 0 | switch (bytesoftype) { |
658 | 0 | case 2: |
659 | 0 | unshuffle2_sse2(_dest, _src, vectorizable_elements, total_elements); |
660 | 0 | break; |
661 | 0 | case 4: |
662 | 0 | unshuffle4_sse2(_dest, _src, vectorizable_elements, total_elements); |
663 | 0 | break; |
664 | 0 | case 8: |
665 | 0 | unshuffle8_sse2(_dest, _src, vectorizable_elements, total_elements); |
666 | 0 | break; |
667 | 0 | case 12: |
668 | 0 | unshuffle12_sse2(_dest, _src, vectorizable_elements, total_elements); |
669 | 0 | break; |
670 | 0 | case 16: |
671 | 0 | unshuffle16_sse2(_dest, _src, vectorizable_elements, total_elements); |
672 | 0 | break; |
673 | 0 | default: |
674 | 0 | if (bytesoftype > (int32_t)sizeof(__m128i)) { |
675 | 0 | unshuffle16_tiled_sse2(_dest, _src, vectorizable_elements, total_elements, bytesoftype); |
676 | 0 | } |
677 | 0 | else { |
678 | | /* Non-optimized unshuffle */ |
679 | 0 | unshuffle_generic(bytesoftype, blocksize, _src, _dest); |
680 | | /* The non-optimized function covers the whole buffer, |
681 | | so we're done processing here. */ |
682 | 0 | return; |
683 | 0 | } |
684 | 0 | } |
685 | | |
686 | | /* If the buffer had any bytes at the end which couldn't be handled |
687 | | by the vectorized implementations, use the non-optimized version |
688 | | to finish them up. */ |
689 | 0 | if (vectorizable_bytes < blocksize) { |
690 | 0 | unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest); |
691 | 0 | } |
692 | 0 | } |
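A minimal round-trip sketch of how these two entry points fit together (hypothetical test code, not part of the library; assumes the declarations from shuffle-sse2.h and <string.h> for memcmp):

/* Shuffling and then unshuffling a block must reproduce the original bytes. */
static int roundtrip_sketch(void) {
  enum { TYPESIZE = 8, BLOCKSIZE = 8 * 1024 };
  static uint8_t src[BLOCKSIZE], shuffled[BLOCKSIZE], restored[BLOCKSIZE];
  for (int i = 0; i < BLOCKSIZE; i++) src[i] = (uint8_t)(i * 31 + 7);
  shuffle_sse2(TYPESIZE, BLOCKSIZE, src, shuffled);
  unshuffle_sse2(TYPESIZE, BLOCKSIZE, shuffled, restored);
  return memcmp(src, restored, BLOCKSIZE) == 0;  /* 1 on success */
}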
693 | | |
694 | | const bool is_shuffle_sse2 = true; |
695 | | |
696 | | #else /* defined(__SSE2__) */ |
697 | | |
698 | | const bool is_shuffle_sse2 = false; |
699 | | |
700 | | void shuffle_sse2(const int32_t bytesoftype, const int32_t blocksize, |
701 | | const uint8_t *_src, uint8_t *_dest) { |
702 | | abort(); |
703 | | } |
704 | | |
705 | | void unshuffle_sse2(const int32_t bytesoftype, const int32_t blocksize, |
706 | | const uint8_t *_src, uint8_t *_dest) { |
707 | | abort(); |
708 | | } |
709 | | |
710 | | #endif /* defined(__SSE2__) */ |