/src/c-blosc/blosc/shuffle-avx2.c
Line | Count | Source |
1 | | /********************************************************************* |
2 | | Blosc - Blocked Shuffling and Compression Library |
3 | | |
4 | | Author: Francesc Alted <francesc@blosc.org> |
5 | | |
6 | | See LICENSE.txt for details about copyright and rights to use. |
7 | | **********************************************************************/ |
8 | | |
9 | | #include "shuffle-generic.h" |
10 | | #include "shuffle-avx2.h" |
11 | | |
12 | | /* Define dummy functions if AVX2 is not available for the compilation target and compiler. */ |
13 | | #if !defined(__AVX2__) |
14 | | #include <stdlib.h> |
15 | | |
16 | | void |
17 | | blosc_internal_shuffle_avx2(const size_t bytesoftype, const size_t blocksize, |
18 | | const uint8_t* const _src, uint8_t* const _dest) { |
19 | | abort(); |
20 | | } |
21 | | |
22 | | void |
23 | | blosc_internal_unshuffle_avx2(const size_t bytesoftype, const size_t blocksize, |
24 | | const uint8_t* const _src, uint8_t* const _dest) { |
25 | | abort(); |
26 | | } |
27 | | |
28 | | #else /* defined(__AVX2__) */ |
29 | | |
30 | | #include <immintrin.h> |
31 | | |
32 | | |
33 | | /* The next is useful for debugging purposes */ |
34 | | #if 0 |
35 | | #include <stdio.h> |
36 | | #include <string.h> |
37 | | |
38 | | static void printymm(__m256i ymm0) |
39 | | { |
40 | | uint8_t buf[32]; |
41 | | |
42 | | ((__m256i *)buf)[0] = ymm0; |
43 | | printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n", |
44 | | buf[0], buf[1], buf[2], buf[3], |
45 | | buf[4], buf[5], buf[6], buf[7], |
46 | | buf[8], buf[9], buf[10], buf[11], |
47 | | buf[12], buf[13], buf[14], buf[15], |
48 | | buf[16], buf[17], buf[18], buf[19], |
49 | | buf[20], buf[21], buf[22], buf[23], |
50 | | buf[24], buf[25], buf[26], buf[27], |
51 | | buf[28], buf[29], buf[30], buf[31]); |
52 | | } |
53 | | #endif |
54 | | |
55 | | /* GCC doesn't include the split load/store intrinsics |
56 | | needed for the tiled shuffle, so define them here. */ |
57 | | #if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) |
58 | | static inline __m256i |
59 | | __attribute__((__always_inline__)) |
60 | | _mm256_loadu2_m128i(const __m128i* const hiaddr, const __m128i* const loaddr) |
61 | | { |
62 | | return _mm256_inserti128_si256( |
63 | | _mm256_castsi128_si256(_mm_loadu_si128(loaddr)), _mm_loadu_si128(hiaddr), 1); |
64 | | } |
65 | | |
66 | | static inline void |
67 | | __attribute__((__always_inline__)) |
68 | | _mm256_storeu2_m128i(__m128i* const hiaddr, __m128i* const loaddr, const __m256i a) |
69 | | { |
70 | | _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a)); |
71 | | _mm_storeu_si128(hiaddr, _mm256_extracti128_si256(a, 1)); |
72 | | } |
73 | | #endif /* defined(__GNUC__) */ |
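The shim above reproduces the split load/store intrinsics that the preprocessor guard assumes Clang and ICC already provide: one 256-bit register is assembled from, or scattered back to, two independent unaligned 16-byte locations. A minimal round-trip sketch of that behaviour follows; it is not part of the library and assumes an AVX2 target plus a compiler that exposes these intrinsics (or the shim above pasted in for older GCC).

    /* Round-trip sketch for the split load/store intrinsics (compile with -mavx2). */
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
      uint8_t lo[16], hi[16], out_lo[16], out_hi[16];
      for (int i = 0; i < 16; i++) { lo[i] = (uint8_t)i; hi[i] = (uint8_t)(i + 16); }

      /* The hi half lands in bits 255..128, the lo half in bits 127..0. */
      const __m256i v = _mm256_loadu2_m128i((const __m128i*)hi, (const __m128i*)lo);
      _mm256_storeu2_m128i((__m128i*)out_hi, (__m128i*)out_lo, v);

      printf("round trip ok: %d\n",
             memcmp(lo, out_lo, 16) == 0 && memcmp(hi, out_hi, 16) == 0);
      return 0;
    }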
74 | | |
75 | | /* Routine optimized for shuffling a buffer for a type size of 2 bytes. */ |
76 | | static void |
77 | | shuffle2_avx2(uint8_t* const dest, const uint8_t* const src, |
78 | | const size_t vectorizable_elements, const size_t total_elements) |
79 | 0 | { |
80 | 0 | static const size_t bytesoftype = 2; |
81 | 0 | size_t j; |
82 | 0 | int k; |
83 | 0 | __m256i ymm0[2], ymm1[2]; |
84 | | |
85 | | /* Create the shuffle mask. |
86 | | NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from |
87 | | most to least significant (i.e., their order is reversed when compared to |
88 | | loading the mask from an array). */ |
89 | 0 | const __m256i shmask = _mm256_set_epi8( |
90 | 0 | 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, |
91 | 0 | 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00, |
92 | 0 | 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, |
93 | 0 | 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00); |
94 | |
95 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { |
96 | | /* Fetch 32 elements (64 bytes) then transpose bytes, words and double words. */ |
97 | 0 | for (k = 0; k < 2; k++) { |
98 | 0 | ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); |
99 | 0 | ymm1[k] = _mm256_shuffle_epi8(ymm0[k], shmask); |
100 | 0 | } |
101 | |
102 | 0 | ymm0[0] = _mm256_permute4x64_epi64(ymm1[0], 0xd8); |
103 | 0 | ymm0[1] = _mm256_permute4x64_epi64(ymm1[1], 0x8d); |
104 | |
105 | 0 | ymm1[0] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0xf0); |
106 | 0 | ymm0[1] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0x0f); |
107 | 0 | ymm1[1] = _mm256_permute4x64_epi64(ymm0[1], 0x4e); |
108 | | |
109 | | /* Store the result vectors */ |
110 | 0 | uint8_t* const dest_for_jth_element = dest + j; |
111 | 0 | for (k = 0; k < 2; k++) { |
112 | 0 | _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm1[k]); |
113 | 0 | } |
114 | 0 | } |
115 | 0 | } |
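The NOTE about argument ordering above applies to every mask in this file: _mm256_set_epi8 takes its 32 arguments from the most significant byte down to byte 0, so the argument list is the in-memory byte order reversed. A small sketch (not part of the library) that builds the 2-byte shuffle mask both ways and checks that they agree:

    /* The shuffle2 mask written in memory order and in 'set' (reversed) order. */
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
      /* Memory order: byte 0 first. */
      static const uint8_t mask_bytes[32] = {
        0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
        0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f,
        0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
        0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f };
      const __m256i from_memory = _mm256_loadu_si256((const __m256i*)mask_bytes);

      /* Argument order: most significant byte first (the reverse of the above). */
      const __m256i from_set = _mm256_set_epi8(
        0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
        0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00,
        0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
        0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00);

      uint8_t a[32], b[32];
      _mm256_storeu_si256((__m256i*)a, from_memory);
      _mm256_storeu_si256((__m256i*)b, from_set);
      printf("masks match: %d\n", memcmp(a, b, 32) == 0);
      return 0;
    }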
116 | | |
117 | | /* Routine optimized for shuffling a buffer for a type size of 4 bytes. */ |
118 | | static void |
119 | | shuffle4_avx2(uint8_t* const dest, const uint8_t* const src, |
120 | | const size_t vectorizable_elements, const size_t total_elements) |
121 | 0 | { |
122 | 0 | static const size_t bytesoftype = 4; |
123 | 0 | size_t i; |
124 | 0 | int j; |
125 | 0 | __m256i ymm0[4], ymm1[4]; |
126 | | |
127 | | /* Create the shuffle mask. |
128 | | NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from |
129 | | most to least significant (i.e., their order is reversed when compared to |
130 | | loading the mask from an array). */ |
131 | 0 | const __m256i mask = _mm256_set_epi32( |
132 | 0 | 0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); |
133 | |
134 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
135 | | /* Fetch 32 elements (128 bytes) then transpose bytes and words. */ |
136 | 0 | for (j = 0; j < 4; j++) { |
137 | 0 | ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i)))); |
138 | 0 | ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8); |
139 | 0 | ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d); |
140 | 0 | ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]); |
141 | 0 | ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x04e); |
142 | 0 | ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]); |
143 | 0 | } |
144 | | /* Transpose double words */ |
145 | 0 | for (j = 0; j < 2; j++) { |
146 | 0 | ymm1[j*2] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]); |
147 | 0 | ymm1[j*2+1] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]); |
148 | 0 | } |
149 | | /* Transpose quad words */ |
150 | 0 | for (j = 0; j < 2; j++) { |
151 | 0 | ymm0[j*2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j+2]); |
152 | 0 | ymm0[j*2+1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j+2]); |
153 | 0 | } |
154 | 0 | for (j = 0; j < 4; j++) { |
155 | 0 | ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask); |
156 | 0 | } |
157 | | /* Store the result vectors */ |
158 | 0 | uint8_t* const dest_for_ith_element = dest + i; |
159 | 0 | for (j = 0; j < 4; j++) { |
160 | 0 | _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]); |
161 | 0 | } |
162 | 0 | } |
163 | 0 | } |
164 | | |
165 | | /* Routine optimized for shuffling a buffer for a type size of 8 bytes. */ |
166 | | static void |
167 | | shuffle8_avx2(uint8_t* const dest, const uint8_t* const src, |
168 | | const size_t vectorizable_elements, const size_t total_elements) |
169 | 0 | { |
170 | 0 | static const size_t bytesoftype = 8; |
171 | 0 | size_t j; |
172 | 0 | int k, l; |
173 | 0 | __m256i ymm0[8], ymm1[8]; |
174 | |
175 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { |
176 | | /* Fetch 32 elements (256 bytes) then transpose bytes. */ |
177 | 0 | for (k = 0; k < 8; k++) { |
178 | 0 | ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); |
179 | 0 | ymm1[k] = _mm256_shuffle_epi32(ymm0[k], 0x4e); |
180 | 0 | ymm1[k] = _mm256_unpacklo_epi8(ymm0[k], ymm1[k]); |
181 | 0 | } |
182 | | /* Transpose words */ |
183 | 0 | for (k = 0, l = 0; k < 4; k++, l +=2) { |
184 | 0 | ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+1]); |
185 | 0 | ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+1]); |
186 | 0 | } |
187 | | /* Transpose double words */ |
188 | 0 | for (k = 0, l = 0; k < 4; k++, l++) { |
189 | 0 | if (k == 2) l += 2; |
190 | 0 | ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+2]); |
191 | 0 | ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+2]); |
192 | 0 | } |
193 | | /* Transpose quad words */ |
194 | 0 | for (k = 0; k < 4; k++) { |
195 | 0 | ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+4]); |
196 | 0 | ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+4]); |
197 | 0 | } |
198 | 0 | for(k = 0; k < 8; k++) { |
199 | 0 | ymm1[k] = _mm256_permute4x64_epi64(ymm0[k], 0x72); |
200 | 0 | ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xD8); |
201 | 0 | ymm0[k] = _mm256_unpacklo_epi16(ymm0[k], ymm1[k]); |
202 | 0 | } |
203 | | /* Store the result vectors */ |
204 | 0 | uint8_t* const dest_for_jth_element = dest + j; |
205 | 0 | for (k = 0; k < 8; k++) { |
206 | 0 | _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]); |
207 | 0 | } |
208 | 0 | } |
209 | 0 | } |
210 | | |
211 | | /* Routine optimized for shuffling a buffer for a type size of 16 bytes. */ |
212 | | static void |
213 | | shuffle16_avx2(uint8_t* const dest, const uint8_t* const src, |
214 | | const size_t vectorizable_elements, const size_t total_elements) |
215 | 0 | { |
216 | 0 | static const size_t bytesoftype = 16; |
217 | 0 | size_t j; |
218 | 0 | int k, l; |
219 | 0 | __m256i ymm0[16], ymm1[16]; |
220 | | |
221 | | /* Create the shuffle mask. |
222 | | NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from |
223 | | most to least significant (i.e., their order is reversed when compared to |
224 | | loading the mask from an array). */ |
225 | 0 | const __m256i shmask = _mm256_set_epi8( |
226 | 0 | 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, |
227 | 0 | 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00, |
228 | 0 | 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, |
229 | 0 | 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00); |
230 | |
231 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { |
232 | | /* Fetch 32 elements (512 bytes) into 16 YMM registers. */ |
233 | 0 | for (k = 0; k < 16; k++) { |
234 | 0 | ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); |
235 | 0 | } |
236 | | /* Transpose bytes */ |
237 | 0 | for (k = 0, l = 0; k < 8; k++, l +=2) { |
238 | 0 | ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]); |
239 | 0 | ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]); |
240 | 0 | } |
241 | | /* Transpose words */ |
242 | 0 | for (k = 0, l = -2; k < 8; k++, l++) { |
243 | 0 | if ((k%2) == 0) l += 2; |
244 | 0 | ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]); |
245 | 0 | ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]); |
246 | 0 | } |
247 | | /* Transpose double words */ |
248 | 0 | for (k = 0, l = -4; k < 8; k++, l++) { |
249 | 0 | if ((k%4) == 0) l += 4; |
250 | 0 | ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]); |
251 | 0 | ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]); |
252 | 0 | } |
253 | | /* Transpose quad words */ |
254 | 0 | for (k = 0; k < 8; k++) { |
255 | 0 | ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]); |
256 | 0 | ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]); |
257 | 0 | } |
258 | 0 | for (k = 0; k < 16; k++) { |
259 | 0 | ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8); |
260 | 0 | ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask); |
261 | 0 | } |
262 | | /* Store the result vectors */ |
263 | 0 | uint8_t* const dest_for_jth_element = dest + j; |
264 | 0 | for (k = 0; k < 16; k++) { |
265 | 0 | _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]); |
266 | 0 | } |
267 | 0 | } |
268 | 0 | } |
269 | | |
270 | | /* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */ |
271 | | static void |
272 | | shuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src, |
273 | | const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype) |
274 | 0 | { |
275 | 0 | size_t j; |
276 | 0 | int k, l; |
277 | 0 | __m256i ymm0[16], ymm1[16]; |
278 | |
279 | 0 | const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i)); |
280 | | |
281 | | /* Create the shuffle mask. |
282 | | NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from |
283 | | most to least significant (i.e., their order is reversed when compared to |
284 | | loading the mask from an array). */ |
285 | 0 | const __m256i shmask = _mm256_set_epi8( |
286 | 0 | 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, |
287 | 0 | 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00, |
288 | 0 | 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, |
289 | 0 | 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00); |
290 | |
291 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { |
292 | | /* Advance the offset into the type by the vector size (in bytes), unless this is |
293 | | the initial iteration and the type size is not a multiple of the vector size. |
294 | | In that case, only advance by the number of bytes necessary so that the number |
295 | | of remaining bytes in the type will be a multiple of the vector size. */ |
296 | 0 | size_t offset_into_type; |
297 | 0 | for (offset_into_type = 0; offset_into_type < bytesoftype; |
298 | 0 | offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0 ? vecs_per_el.rem : sizeof(__m128i))) { |
299 | | |
300 | | /* Fetch elements in groups of 512 bytes */ |
301 | 0 | const uint8_t* const src_with_offset = src + offset_into_type; |
302 | 0 | for (k = 0; k < 16; k++) { |
303 | 0 | ymm0[k] = _mm256_loadu2_m128i( |
304 | 0 | (__m128i*)(src_with_offset + (j + (2 * k) + 1) * bytesoftype), |
305 | 0 | (__m128i*)(src_with_offset + (j + (2 * k)) * bytesoftype)); |
306 | 0 | } |
307 | | /* Transpose bytes */ |
308 | 0 | for (k = 0, l = 0; k < 8; k++, l +=2) { |
309 | 0 | ymm1[k*2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l+1]); |
310 | 0 | ymm1[k*2+1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l+1]); |
311 | 0 | } |
312 | | /* Transpose words */ |
313 | 0 | for (k = 0, l = -2; k < 8; k++, l++) { |
314 | 0 | if ((k%2) == 0) l += 2; |
315 | 0 | ymm0[k*2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l+2]); |
316 | 0 | ymm0[k*2+1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l+2]); |
317 | 0 | } |
318 | | /* Transpose double words */ |
319 | 0 | for (k = 0, l = -4; k < 8; k++, l++) { |
320 | 0 | if ((k%4) == 0) l += 4; |
321 | 0 | ymm1[k*2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l+4]); |
322 | 0 | ymm1[k*2+1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l+4]); |
323 | 0 | } |
324 | | /* Transpose quad words */ |
325 | 0 | for (k = 0; k < 8; k++) { |
326 | 0 | ymm0[k*2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k+8]); |
327 | 0 | ymm0[k*2+1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k+8]); |
328 | 0 | } |
329 | 0 | for (k = 0; k < 16; k++) { |
330 | 0 | ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8); |
331 | 0 | ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask); |
332 | 0 | } |
333 | | /* Store the result vectors */ |
334 | 0 | uint8_t* const dest_for_jth_element = dest + j; |
335 | 0 | for (k = 0; k < 16; k++) { |
336 | 0 | _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), ymm0[k]); |
337 | 0 | } |
338 | 0 | } |
339 | 0 | } |
340 | 0 | } |
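The offset_into_type loop above takes a shorter first step whenever the type size is not a multiple of sizeof(__m128i), so that every later 16-byte tile sits on a fixed 16-byte grid inside the type. A standalone sketch of that arithmetic (not part of the library; bytesoftype == 24 is only an example value, and with it the first two tiles overlap at type bytes 8..15, which are simply produced twice with identical contents):

    /* Which 16-byte tiles of the type the loop visits. */
    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
      const size_t bytesoftype = 24;   /* example: any type size > 16 */
      const lldiv_t vecs_per_el = lldiv((long long)bytesoftype, 16);  /* 16 == sizeof(__m128i) */
      size_t offset;
      for (offset = 0; offset < bytesoftype;
           offset += (offset == 0 && vecs_per_el.rem > 0 ? (size_t)vecs_per_el.rem : 16)) {
        printf("tile covers type bytes [%zu, %zu)\n", offset, offset + 16);
      }
      return 0;   /* prints [0, 16) and [8, 24) for bytesoftype == 24 */
    }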
341 | | |
342 | | /* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */ |
343 | | static void |
344 | | unshuffle2_avx2(uint8_t* const dest, const uint8_t* const src, |
345 | | const size_t vectorizable_elements, const size_t total_elements) |
346 | 434 | { |
347 | 434 | static const size_t bytesoftype = 2; |
348 | 434 | size_t i; |
349 | 434 | int j; |
350 | 434 | __m256i ymm0[2], ymm1[2]; |
351 | | |
352 | 49.9k | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
353 | | /* Load 32 elements (64 bytes) into 2 YMM registers. */ |
354 | 49.5k | const uint8_t* const src_for_ith_element = src + i; |
355 | 148k | for (j = 0; j < 2; j++) { |
356 | 99.1k | ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); |
357 | 99.1k | } |
358 | | /* Shuffle bytes */ |
359 | 148k | for (j = 0; j < 2; j++) { |
360 | 99.1k | ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8); |
361 | 99.1k | } |
362 | | /* Compute the low 32 bytes */ |
363 | 49.5k | ymm1[0] = _mm256_unpacklo_epi8(ymm0[0], ymm0[1]); |
364 | | /* Compute the hi 32 bytes */ |
365 | 49.5k | ymm1[1] = _mm256_unpackhi_epi8(ymm0[0], ymm0[1]); |
366 | | /* Store the result vectors in proper order */ |
367 | 49.5k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]); |
368 | 49.5k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[1]); |
369 | 49.5k | } |
370 | 434 | } |
371 | | |
372 | | /* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */ |
373 | | static void |
374 | | unshuffle4_avx2(uint8_t* const dest, const uint8_t* const src, |
375 | | const size_t vectorizable_elements, const size_t total_elements) |
376 | 227 | { |
377 | 227 | static const size_t bytesoftype = 4; |
378 | 227 | size_t i; |
379 | 227 | int j; |
380 | 227 | __m256i ymm0[4], ymm1[4]; |
381 | | |
382 | 11.9k | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
383 | | /* Load 32 elements (128 bytes) into 4 YMM registers. */ |
384 | 11.7k | const uint8_t* const src_for_ith_element = src + i; |
385 | 58.5k | for (j = 0; j < 4; j++) { |
386 | 46.8k | ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); |
387 | 46.8k | } |
388 | | /* Shuffle bytes */ |
389 | 35.1k | for (j = 0; j < 2; j++) { |
390 | | /* Compute the low 64 bytes */ |
391 | 23.4k | ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]); |
392 | | /* Compute the hi 64 bytes */ |
393 | 23.4k | ymm1[2+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]); |
394 | 23.4k | } |
395 | | /* Shuffle 2-byte words */ |
396 | 35.1k | for (j = 0; j < 2; j++) { |
397 | | /* Compute the low 64 bytes */ |
398 | 23.4k | ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]); |
399 | | /* Compute the hi 64 bytes */ |
400 | 23.4k | ymm0[2+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]); |
401 | 23.4k | } |
402 | 11.7k | ymm1[0] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x20); |
403 | 11.7k | ymm1[1] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x20); |
404 | 11.7k | ymm1[2] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x31); |
405 | 11.7k | ymm1[3] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x31); |
406 | | |
407 | | /* Store the result vectors in proper order */ |
408 | 58.5k | for (j = 0; j < 4; j++) { |
409 | 46.8k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (j * sizeof(__m256i))), ymm1[j]); |
410 | 46.8k | } |
411 | 11.7k | } |
412 | 227 | } |
413 | | |
414 | | /* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */ |
415 | | static void |
416 | | unshuffle8_avx2(uint8_t* const dest, const uint8_t* const src, |
417 | | const size_t vectorizable_elements, const size_t total_elements) |
418 | 267 | { |
419 | 267 | static const size_t bytesoftype = 8; |
420 | 267 | size_t i; |
421 | 267 | int j; |
422 | 267 | __m256i ymm0[8], ymm1[8]; |
423 | | |
424 | 2.17k | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
425 | | /* Fetch 32 elements (256 bytes) into 8 YMM registers. */ |
426 | 1.90k | const uint8_t* const src_for_ith_element = src + i; |
427 | 17.1k | for (j = 0; j < 8; j++) { |
428 | 15.2k | ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); |
429 | 15.2k | } |
430 | | /* Shuffle bytes */ |
431 | 9.53k | for (j = 0; j < 4; j++) { |
432 | | /* Compute the low 32 bytes */ |
433 | 7.62k | ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]); |
434 | | /* Compute the hi 32 bytes */ |
435 | 7.62k | ymm1[4+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]); |
436 | 7.62k | } |
437 | | /* Shuffle words */ |
438 | 9.53k | for (j = 0; j < 4; j++) { |
439 | | /* Compute the low 32 bytes */ |
440 | 7.62k | ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]); |
441 | | /* Compute the hi 32 bytes */ |
442 | 7.62k | ymm0[4+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]); |
443 | 7.62k | } |
444 | 17.1k | for (j = 0; j < 8; j++) { |
445 | 15.2k | ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8); |
446 | 15.2k | } |
447 | | |
448 | | /* Shuffle 4-byte dwords */ |
449 | 9.53k | for (j = 0; j < 4; j++) { |
450 | | /* Compute the low 32 bytes */ |
451 | 7.62k | ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]); |
452 | | /* Compute the hi 32 bytes */ |
453 | 7.62k | ymm1[4+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]); |
454 | 7.62k | } |
455 | | |
456 | | /* Store the result vectors in proper order */ |
457 | 1.90k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]); |
458 | 1.90k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[2]); |
459 | 1.90k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[1]); |
460 | 1.90k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[3]); |
461 | 1.90k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[4]); |
462 | 1.90k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[6]); |
463 | 1.90k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[5]); |
464 | 1.90k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]); |
465 | 1.90k | } |
466 | 267 | } |
467 | | |
468 | | /* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */ |
469 | | static void |
470 | | unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src, |
471 | | const size_t vectorizable_elements, const size_t total_elements) |
472 | 359 | { |
473 | 359 | static const size_t bytesoftype = 16; |
474 | 359 | size_t i; |
475 | 359 | int j; |
476 | 359 | __m256i ymm0[16], ymm1[16]; |
477 | | |
478 | 4.47k | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
479 | | /* Fetch 32 elements (512 bytes) into 16 YMM registers. */ |
480 | 4.11k | const uint8_t* const src_for_ith_element = src + i; |
481 | 69.8k | for (j = 0; j < 16; j++) { |
482 | 65.7k | ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); |
483 | 65.7k | } |
484 | | |
485 | | /* Shuffle bytes */ |
486 | 36.9k | for (j = 0; j < 8; j++) { |
487 | | /* Compute the low 32 bytes */ |
488 | 32.8k | ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]); |
489 | | /* Compute the hi 32 bytes */ |
490 | 32.8k | ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]); |
491 | 32.8k | } |
492 | | /* Shuffle 2-byte words */ |
493 | 36.9k | for (j = 0; j < 8; j++) { |
494 | | /* Compute the low 32 bytes */ |
495 | 32.8k | ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]); |
496 | | /* Compute the hi 32 bytes */ |
497 | 32.8k | ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]); |
498 | 32.8k | } |
499 | | /* Shuffle 4-byte dwords */ |
500 | 36.9k | for (j = 0; j < 8; j++) { |
501 | | /* Compute the low 32 bytes */ |
502 | 32.8k | ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]); |
503 | | /* Compute the hi 32 bytes */ |
504 | 32.8k | ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]); |
505 | 32.8k | } |
506 | | |
507 | | /* Shuffle 8-byte qwords */ |
508 | 36.9k | for (j = 0; j < 8; j++) { |
509 | | /* Compute the low 32 bytes */ |
510 | 32.8k | ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]); |
511 | | /* Compute the hi 32 bytes */ |
512 | 32.8k | ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]); |
513 | 32.8k | } |
514 | | |
515 | 36.9k | for (j = 0; j < 8; j++) { |
516 | 32.8k | ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20); |
517 | 32.8k | ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31); |
518 | 32.8k | } |
519 | | |
520 | | /* Store the result vectors in proper order */ |
521 | 4.11k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]); |
522 | 4.11k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]); |
523 | 4.11k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]); |
524 | 4.11k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]); |
525 | 4.11k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]); |
526 | 4.11k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]); |
527 | 4.11k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]); |
528 | 4.11k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]); |
529 | 4.11k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]); |
530 | 4.11k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]); |
531 | 4.11k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]); |
532 | 4.11k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]); |
533 | 4.11k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]); |
534 | 4.11k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]); |
535 | 4.11k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]); |
536 | 4.11k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]); |
537 | 4.11k | } |
538 | 359 | } |
539 | | |
540 | | /* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */ |
541 | | static void |
542 | | unshuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src, |
543 | | const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype) |
544 | 590 | { |
545 | 590 | size_t i; |
546 | 590 | int j; |
547 | 590 | __m256i ymm0[16], ymm1[16]; |
548 | | |
549 | 590 | const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i)); |
550 | | |
551 | | /* The unshuffle loops are inverted (compared to shuffle16_tiled_avx2) |
552 | | to optimize cache utilization. */ |
553 | 590 | size_t offset_into_type; |
554 | 2.31k | for (offset_into_type = 0; offset_into_type < bytesoftype; |
555 | 1.72k | offset_into_type += (offset_into_type == 0 && vecs_per_el.rem > 0 ? vecs_per_el.rem : sizeof(__m128i))) { |
556 | 6.83k | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
557 | | /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */ |
558 | 5.10k | const uint8_t* const src_for_ith_element = src + i; |
559 | 86.8k | for (j = 0; j < 16; j++) { |
560 | 81.6k | ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (total_elements * (offset_into_type + j)))); |
561 | 81.6k | } |
562 | | |
563 | | /* Shuffle bytes */ |
564 | 45.9k | for (j = 0; j < 8; j++) { |
565 | | /* Compute the low 32 bytes */ |
566 | 40.8k | ymm1[j] = _mm256_unpacklo_epi8(ymm0[j*2], ymm0[j*2+1]); |
567 | | /* Compute the hi 32 bytes */ |
568 | 40.8k | ymm1[8+j] = _mm256_unpackhi_epi8(ymm0[j*2], ymm0[j*2+1]); |
569 | 40.8k | } |
570 | | /* Shuffle 2-byte words */ |
571 | 45.9k | for (j = 0; j < 8; j++) { |
572 | | /* Compute the low 32 bytes */ |
573 | 40.8k | ymm0[j] = _mm256_unpacklo_epi16(ymm1[j*2], ymm1[j*2+1]); |
574 | | /* Compute the hi 32 bytes */ |
575 | 40.8k | ymm0[8+j] = _mm256_unpackhi_epi16(ymm1[j*2], ymm1[j*2+1]); |
576 | 40.8k | } |
577 | | /* Shuffle 4-byte dwords */ |
578 | 45.9k | for (j = 0; j < 8; j++) { |
579 | | /* Compute the low 32 bytes */ |
580 | 40.8k | ymm1[j] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]); |
581 | | /* Compute the hi 32 bytes */ |
582 | 40.8k | ymm1[8+j] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]); |
583 | 40.8k | } |
584 | | |
585 | | /* Shuffle 8-byte qwords */ |
586 | 45.9k | for (j = 0; j < 8; j++) { |
587 | | /* Compute the low 32 bytes */ |
588 | 40.8k | ymm0[j] = _mm256_unpacklo_epi64(ymm1[j*2], ymm1[j*2+1]); |
589 | | /* Compute the hi 32 bytes */ |
590 | 40.8k | ymm0[8+j] = _mm256_unpackhi_epi64(ymm1[j*2], ymm1[j*2+1]); |
591 | 40.8k | } |
592 | | |
593 | 45.9k | for (j = 0; j < 8; j++) { |
594 | 40.8k | ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x20); |
595 | 40.8k | ymm1[j+8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j+8], 0x31); |
596 | 40.8k | } |
597 | | |
598 | | /* Store the result vectors in proper order */ |
599 | 5.10k | const uint8_t* const dest_with_offset = dest + offset_into_type; |
600 | 5.10k | _mm256_storeu2_m128i( |
601 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x01) * bytesoftype), |
602 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]); |
603 | 5.10k | _mm256_storeu2_m128i( |
604 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x03) * bytesoftype), |
605 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]); |
606 | 5.10k | _mm256_storeu2_m128i( |
607 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x05) * bytesoftype), |
608 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]); |
609 | 5.10k | _mm256_storeu2_m128i( |
610 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x07) * bytesoftype), |
611 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]); |
612 | 5.10k | _mm256_storeu2_m128i( |
613 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x09) * bytesoftype), |
614 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]); |
615 | 5.10k | _mm256_storeu2_m128i( |
616 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x0b) * bytesoftype), |
617 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]); |
618 | 5.10k | _mm256_storeu2_m128i( |
619 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x0d) * bytesoftype), |
620 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]); |
621 | 5.10k | _mm256_storeu2_m128i( |
622 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x0f) * bytesoftype), |
623 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]); |
624 | 5.10k | _mm256_storeu2_m128i( |
625 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x11) * bytesoftype), |
626 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]); |
627 | 5.10k | _mm256_storeu2_m128i( |
628 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x13) * bytesoftype), |
629 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]); |
630 | 5.10k | _mm256_storeu2_m128i( |
631 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x15) * bytesoftype), |
632 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]); |
633 | 5.10k | _mm256_storeu2_m128i( |
634 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x17) * bytesoftype), |
635 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]); |
636 | 5.10k | _mm256_storeu2_m128i( |
637 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x19) * bytesoftype), |
638 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]); |
639 | 5.10k | _mm256_storeu2_m128i( |
640 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x1b) * bytesoftype), |
641 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]); |
642 | 5.10k | _mm256_storeu2_m128i( |
643 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x1d) * bytesoftype), |
644 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]); |
645 | 5.10k | _mm256_storeu2_m128i( |
646 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x1f) * bytesoftype), |
647 | 5.10k | (__m128i*)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]); |
648 | 5.10k | } |
649 | 1.72k | } |
650 | 590 | } |
651 | | |
652 | | /* Shuffle a block. This can never fail. */ |
653 | | void |
654 | | blosc_internal_shuffle_avx2(const size_t bytesoftype, const size_t blocksize, |
655 | 0 | const uint8_t* const _src, uint8_t* const _dest) { |
656 | 0 | const size_t vectorized_chunk_size = bytesoftype * sizeof(__m256i); |
657 | | |
658 | | /* If the block size is too small to be vectorized, |
659 | | use the generic implementation. */ |
660 | 0 | if (blocksize < vectorized_chunk_size) { |
661 | 0 | blosc_internal_shuffle_generic(bytesoftype, blocksize, _src, _dest); |
662 | 0 | return; |
663 | 0 | } |
664 | | |
665 | | /* If the blocksize is not a multiple of both the typesize and |
666 | | the vector size, round the blocksize down to the next value |
667 | | which is a multiple of both. The vectorized shuffle can be |
668 | | used for that portion of the data, and the naive implementation |
669 | | can be used for the remaining portion. */ |
670 | 0 | const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size); |
671 | |
672 | 0 | const size_t vectorizable_elements = vectorizable_bytes / bytesoftype; |
673 | 0 | const size_t total_elements = blocksize / bytesoftype; |
674 | | |
675 | | /* Optimized shuffle implementations */ |
676 | 0 | switch (bytesoftype) |
677 | 0 | { |
678 | 0 | case 2: |
679 | 0 | shuffle2_avx2(_dest, _src, vectorizable_elements, total_elements); |
680 | 0 | break; |
681 | 0 | case 4: |
682 | 0 | shuffle4_avx2(_dest, _src, vectorizable_elements, total_elements); |
683 | 0 | break; |
684 | 0 | case 8: |
685 | 0 | shuffle8_avx2(_dest, _src, vectorizable_elements, total_elements); |
686 | 0 | break; |
687 | 0 | case 16: |
688 | 0 | shuffle16_avx2(_dest, _src, vectorizable_elements, total_elements); |
689 | 0 | break; |
690 | 0 | default: |
691 | | /* For types larger than 16 bytes, use the AVX2 tiled shuffle. */ |
692 | 0 | if (bytesoftype > sizeof(__m128i)) { |
693 | 0 | shuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype); |
694 | 0 | } |
695 | 0 | else { |
696 | | /* Non-optimized shuffle */ |
697 | 0 | blosc_internal_shuffle_generic(bytesoftype, blocksize, _src, _dest); |
698 | | /* The non-optimized function covers the whole buffer, |
699 | | so we're done processing here. */ |
700 | 0 | return; |
701 | 0 | } |
702 | 0 | } |
703 | | |
704 | | /* If the buffer had any bytes at the end which couldn't be handled |
705 | | by the vectorized implementations, use the non-optimized version |
706 | | to finish them up. */ |
707 | 0 | if (vectorizable_bytes < blocksize) { |
708 | 0 | shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest); |
709 | 0 | } |
710 | 0 | } |
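A worked example of the block-splitting arithmetic in blosc_internal_shuffle_avx2 above, with values chosen purely for illustration: bytesoftype == 4 and blocksize == 1000 give a vectorized chunk size of 128 bytes, so 896 bytes (224 elements) go through shuffle4_avx2 and the trailing 104 bytes are finished by shuffle_generic_inline.

    /* The dispatcher's split between vectorized and leftover bytes. */
    #include <stdio.h>

    int main(void) {
      const size_t bytesoftype = 4, blocksize = 1000;   /* example values */
      const size_t chunk = bytesoftype * 32;            /* 32 == sizeof(__m256i) */
      const size_t vectorizable_bytes = blocksize - (blocksize % chunk);
      printf("vectorizable: %zu bytes (%zu elements), leftover: %zu bytes\n",
             vectorizable_bytes, vectorizable_bytes / bytesoftype,
             blocksize - vectorizable_bytes);   /* 896 bytes, 224 elements, 104 left */
      return 0;
    }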
711 | | |
712 | | /* Unshuffle a block. This can never fail. */ |
713 | | void |
714 | | blosc_internal_unshuffle_avx2(const size_t bytesoftype, const size_t blocksize, |
715 | 5.20k | const uint8_t* const _src, uint8_t* const _dest) { |
716 | 5.20k | const size_t vectorized_chunk_size = bytesoftype * sizeof(__m256i); |
717 | | |
718 | | /* If the block size is too small to be vectorized, |
719 | | use the generic implementation. */ |
720 | 5.20k | if (blocksize < vectorized_chunk_size) { |
721 | 3.11k | blosc_internal_unshuffle_generic(bytesoftype, blocksize, _src, _dest); |
722 | 3.11k | return; |
723 | 3.11k | } |
724 | | |
725 | | /* If the blocksize is not a multiple of both the typesize and |
726 | | the vector size, round the blocksize down to the next value |
727 | | which is a multiple of both. The vectorized unshuffle can be |
728 | | used for that portion of the data, and the naive implementation |
729 | | can be used for the remaining portion. */ |
730 | 2.09k | const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size); |
731 | | |
732 | 2.09k | const size_t vectorizable_elements = vectorizable_bytes / bytesoftype; |
733 | 2.09k | const size_t total_elements = blocksize / bytesoftype; |
734 | | |
735 | | /* Optimized unshuffle implementations */ |
736 | 2.09k | switch (bytesoftype) |
737 | 2.09k | { |
738 | 434 | case 2: |
739 | 434 | unshuffle2_avx2(_dest, _src, vectorizable_elements, total_elements); |
740 | 434 | break; |
741 | 227 | case 4: |
742 | 227 | unshuffle4_avx2(_dest, _src, vectorizable_elements, total_elements); |
743 | 227 | break; |
744 | 267 | case 8: |
745 | 267 | unshuffle8_avx2(_dest, _src, vectorizable_elements, total_elements); |
746 | 267 | break; |
747 | 359 | case 16: |
748 | 359 | unshuffle16_avx2(_dest, _src, vectorizable_elements, total_elements); |
749 | 359 | break; |
750 | 805 | default: |
751 | | /* For types larger than 16 bytes, use the AVX2 tiled unshuffle. */ |
752 | 805 | if (bytesoftype > sizeof(__m128i)) { |
753 | 590 | unshuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype); |
754 | 590 | } |
755 | 215 | else { |
756 | | /* Non-optimized unshuffle */ |
757 | 215 | blosc_internal_unshuffle_generic(bytesoftype, blocksize, _src, _dest); |
758 | | /* The non-optimized function covers the whole buffer, |
759 | | so we're done processing here. */ |
760 | 215 | return; |
761 | 215 | } |
762 | 2.09k | } |
763 | | |
764 | | /* If the buffer had any bytes at the end which couldn't be handled |
765 | | by the vectorized implementations, use the non-optimized version |
766 | | to finish them up. */ |
767 | 1.87k | if (vectorizable_bytes < blocksize) { |
768 | 902 | unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest); |
769 | 902 | } |
770 | 1.87k | } |
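For reference, the transformation all of these kernels vectorize is a plain byte transposition: byte b of every element is gathered into stream b on shuffle and scattered back on unshuffle. A scalar sketch with hypothetical helper names (the library's actual fallback is blosc_internal_shuffle_generic / blosc_internal_unshuffle_generic from shuffle-generic.h):

    /* Scalar reference for the (un)shuffle byte transposition; not library code. */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void shuffle_scalar(size_t ts, size_t n, const uint8_t* src, uint8_t* dst) {
      for (size_t i = 0; i < n; i++)        /* n elements of ts bytes each */
        for (size_t b = 0; b < ts; b++)
          dst[b * n + i] = src[i * ts + b]; /* gather byte b into stream b */
    }

    static void unshuffle_scalar(size_t ts, size_t n, const uint8_t* src, uint8_t* dst) {
      for (size_t i = 0; i < n; i++)
        for (size_t b = 0; b < ts; b++)
          dst[i * ts + b] = src[b * n + i]; /* scatter stream b back into elements */
    }

    int main(void) {
      enum { TS = 4, N = 8 };
      uint8_t in[TS * N], mid[TS * N], out[TS * N];
      for (int i = 0; i < TS * N; i++) in[i] = (uint8_t)i;
      shuffle_scalar(TS, N, in, mid);
      unshuffle_scalar(TS, N, mid, out);
      printf("round trip ok: %d\n", memcmp(in, out, sizeof(in)) == 0);
      return 0;
    }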
771 | | |
772 | | #endif /* !defined(__AVX2__) */ |