/src/c-blosc2/blosc/shuffle-avx2.c
Line | Count | Source
1 | | /********************************************************************* |
2 | | Blosc - Blocked Shuffling and Compression Library |
3 | | |
4 | | Copyright (c) 2021 Blosc Development Team <blosc@blosc.org> |
5 | | https://blosc.org |
6 | | License: BSD 3-Clause (see LICENSE.txt) |
7 | | |
8 | | See LICENSE.txt for details about copyright and rights to use. |
9 | | **********************************************************************/ |
10 | | |
11 | | #include "shuffle-avx2.h" |
12 | | #include "shuffle-generic.h" |
13 | | #include <stdlib.h> |
14 | | |
15 | | /* Make sure AVX2 is available for the compilation target and compiler. */ |
16 | | #if defined(__AVX2__) |
17 | | |
18 | | #include <immintrin.h> |
19 | | |
20 | | #include <stdint.h> |
21 | | |
22 | | /* The following is useful for debugging purposes */ |
23 | | #if 0 |
24 | | #include <stdio.h> |
25 | | #include <string.h> |
26 | | |
27 | | static void printymm(__m256i ymm0) |
28 | | { |
29 | | uint8_t buf[32]; |
30 | | |
31 | | ((__m256i *)buf)[0] = ymm0; |
32 | | printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n", |
33 | | buf[0], buf[1], buf[2], buf[3], |
34 | | buf[4], buf[5], buf[6], buf[7], |
35 | | buf[8], buf[9], buf[10], buf[11], |
36 | | buf[12], buf[13], buf[14], buf[15], |
37 | | buf[16], buf[17], buf[18], buf[19], |
38 | | buf[20], buf[21], buf[22], buf[23], |
39 | | buf[24], buf[25], buf[26], buf[27], |
40 | | buf[28], buf[29], buf[30], buf[31]); |
41 | | } |
42 | | #endif |
43 | | |
44 | | /* GCC doesn't include the split load/store intrinsics |
45 | | needed for the tiled shuffle, so define them here. */ |
46 | | #if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) |
47 | | static inline __m256i |
48 | | __attribute__((__always_inline__)) |
49 | | _mm256_loadu2_m128i(const __m128i* const hiaddr, const __m128i* const loaddr) { |
50 | | return _mm256_inserti128_si256( |
51 | | _mm256_castsi128_si256(_mm_loadu_si128(loaddr)), _mm_loadu_si128(hiaddr), 1); |
52 | | } |
53 | | |
54 | | static inline void |
55 | | __attribute__((__always_inline__)) |
56 | | _mm256_storeu2_m128i(__m128i* const hiaddr, __m128i* const loaddr, const __m256i a) { |
57 | | _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a)); |
58 | | _mm_storeu_si128(hiaddr, _mm256_extracti128_si256(a, 1)); |
59 | | } |
60 | | #endif /* defined(__GNUC__) */ |
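For reference, here is a minimal scalar sketch of what the two fallback helpers above are expected to do (illustration only; the helper names are hypothetical and memcpy stands in for the SSE intrinsics): the split load gathers two unaligned 16-byte halves into one 256-bit value, and the split store scatters the low/high halves back out.

#include <stdint.h>
#include <string.h>

/* loadu2: low 128 bits come from loaddr, high 128 bits from hiaddr. */
void loadu2_m128i_sketch(uint8_t out32[32], const void* hiaddr, const void* loaddr) {
  memcpy(out32, loaddr, 16);
  memcpy(out32 + 16, hiaddr, 16);
}

/* storeu2: low 128 bits go to loaddr, high 128 bits to hiaddr. */
void storeu2_m128i_sketch(void* hiaddr, void* loaddr, const uint8_t in32[32]) {
  memcpy(loaddr, in32, 16);
  memcpy(hiaddr, in32 + 16, 16);
}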
61 | | |
62 | | /* Routine optimized for shuffling a buffer for a type size of 2 bytes. */ |
63 | | static void |
64 | | shuffle2_avx2(uint8_t* const dest, const uint8_t* const src, |
65 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
66 | 0 | static const int32_t bytesoftype = 2; |
67 | 0 | int32_t j; |
68 | 0 | int k; |
69 | 0 | __m256i ymm0[2], ymm1[2]; |
70 | | |
71 | | /* Create the shuffle mask. |
72 | | NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from |
73 | | most to least significant (i.e., their order is reversed when compared to |
74 | | loading the mask from an array). */ |
75 | 0 | const __m256i shmask = _mm256_set_epi8( |
76 | 0 | 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, |
77 | 0 | 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00, |
78 | 0 | 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, |
79 | 0 | 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00); |
80 | |
81 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { |
82 | | /* Fetch 32 elements (64 bytes) then transpose bytes, words and double words. */ |
83 | 0 | for (k = 0; k < 2; k++) { |
84 | 0 | ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); |
85 | 0 | ymm1[k] = _mm256_shuffle_epi8(ymm0[k], shmask); |
86 | 0 | } |
87 | |
88 | 0 | ymm0[0] = _mm256_permute4x64_epi64(ymm1[0], 0xd8); |
89 | 0 | ymm0[1] = _mm256_permute4x64_epi64(ymm1[1], 0x8d); |
90 | |
91 | 0 | ymm1[0] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0xf0); |
92 | 0 | ymm0[1] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0x0f); |
93 | 0 | ymm1[1] = _mm256_permute4x64_epi64(ymm0[1], 0x4e); |
94 | | |
95 | | /* Store the result vectors */ |
96 | 0 | uint8_t* const dest_for_jth_element = dest + j; |
97 | 0 | for (k = 0; k < 2; k++) { |
98 | 0 | _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm1[k]); |
99 | 0 | } |
100 | 0 | } |
101 | 0 | } |
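As a reference for what shuffle2_avx2 computes over the vectorizable portion, here is a scalar sketch of the same data movement (hypothetical helper, illustration only): byte k of element j lands at dest[k * total_elements + j].

#include <stdint.h>

void shuffle2_scalar_sketch(uint8_t* dest, const uint8_t* src,
                            int32_t vectorizable_elements, int32_t total_elements) {
  for (int32_t j = 0; j < vectorizable_elements; j++) {
    dest[0 * total_elements + j] = src[j * 2 + 0];  /* first byte of each element */
    dest[1 * total_elements + j] = src[j * 2 + 1];  /* second byte of each element */
  }
}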
102 | | |
103 | | /* Routine optimized for shuffling a buffer for a type size of 4 bytes. */ |
104 | | static void |
105 | | shuffle4_avx2(uint8_t* const dest, const uint8_t* const src, |
106 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
107 | 0 | static const int32_t bytesoftype = 4; |
108 | 0 | int32_t i; |
109 | 0 | int j; |
110 | 0 | __m256i ymm0[4], ymm1[4]; |
111 | | |
112 | | /* Create the shuffle mask. |
113 | | NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from |
114 | | most to least significant (i.e., their order is reversed when compared to |
115 | | loading the mask from an array). */ |
116 | 0 | const __m256i mask = _mm256_set_epi32( |
117 | 0 | 0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); |
118 | |
119 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
120 | | /* Fetch 32 elements (128 bytes) then transpose bytes and words. */ |
121 | 0 | for (j = 0; j < 4; j++) { |
122 | 0 | ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i)))); |
123 | 0 | ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8); |
124 | 0 | ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d); |
125 | 0 | ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]); |
126 | 0 | ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x04e); |
127 | 0 | ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]); |
128 | 0 | } |
129 | | /* Transpose double words */ |
130 | 0 | for (j = 0; j < 2; j++) { |
131 | 0 | ymm1[j * 2] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
132 | 0 | ymm1[j * 2 + 1] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
133 | 0 | } |
134 | | /* Transpose quad words */ |
135 | 0 | for (j = 0; j < 2; j++) { |
136 | 0 | ymm0[j * 2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j + 2]); |
137 | 0 | ymm0[j * 2 + 1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j + 2]); |
138 | 0 | } |
139 | 0 | for (j = 0; j < 4; j++) { |
140 | 0 | ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask); |
141 | 0 | } |
142 | | /* Store the result vectors */ |
143 | 0 | uint8_t* const dest_for_ith_element = dest + i; |
144 | 0 | for (j = 0; j < 4; j++) { |
145 | 0 | _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]); |
146 | 0 | } |
147 | 0 | } |
148 | 0 | } |
149 | | |
150 | | /* Routine optimized for shuffling a buffer for a type size of 8 bytes. */ |
151 | | static void |
152 | | shuffle8_avx2(uint8_t* const dest, const uint8_t* const src, |
153 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
154 | 0 | static const int32_t bytesoftype = 8; |
155 | 0 | int32_t j; |
156 | 0 | int k, l; |
157 | 0 | __m256i ymm0[8], ymm1[8]; |
158 | |
159 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { |
160 | | /* Fetch 32 elements (256 bytes) then transpose bytes. */ |
161 | 0 | for (k = 0; k < 8; k++) { |
162 | 0 | ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); |
163 | 0 | ymm1[k] = _mm256_shuffle_epi32(ymm0[k], 0x4e); |
164 | 0 | ymm1[k] = _mm256_unpacklo_epi8(ymm0[k], ymm1[k]); |
165 | 0 | } |
166 | | /* Transpose words */ |
167 | 0 | for (k = 0, l = 0; k < 4; k++, l += 2) { |
168 | 0 | ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 1]); |
169 | 0 | ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 1]); |
170 | 0 | } |
171 | | /* Transpose double words */ |
172 | 0 | for (k = 0, l = 0; k < 4; k++, l++) { |
173 | 0 | if (k == 2) l += 2; |
174 | 0 | ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 2]); |
175 | 0 | ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 2]); |
176 | 0 | } |
177 | | /* Transpose quad words */ |
178 | 0 | for (k = 0; k < 4; k++) { |
179 | 0 | ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 4]); |
180 | 0 | ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 4]); |
181 | 0 | } |
182 | 0 | for (k = 0; k < 8; k++) { |
183 | 0 | ymm1[k] = _mm256_permute4x64_epi64(ymm0[k], 0x72); |
184 | 0 | ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xD8); |
185 | 0 | ymm0[k] = _mm256_unpacklo_epi16(ymm0[k], ymm1[k]); |
186 | 0 | } |
187 | | /* Store the result vectors */ |
188 | 0 | uint8_t* const dest_for_jth_element = dest + j; |
189 | 0 | for (k = 0; k < 8; k++) { |
190 | 0 | _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]); |
191 | 0 | } |
192 | 0 | } |
193 | 0 | } |
194 | | |
195 | | /* Routine optimized for shuffling a buffer for a type size of 16 bytes. */ |
196 | | static void |
197 | | shuffle16_avx2(uint8_t* const dest, const uint8_t* const src, |
198 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
199 | 0 | static const int32_t bytesoftype = 16; |
200 | 0 | int32_t j; |
201 | 0 | int k, l; |
202 | 0 | __m256i ymm0[16], ymm1[16]; |
203 | | |
204 | | /* Create the shuffle mask. |
205 | | NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from |
206 | | most to least significant (i.e., their order is reversed when compared to |
207 | | loading the mask from an array). */ |
208 | 0 | const __m256i shmask = _mm256_set_epi8( |
209 | 0 | 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, |
210 | 0 | 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00, |
211 | 0 | 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, |
212 | 0 | 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00); |
213 | |
214 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { |
215 | | /* Fetch 32 elements (512 bytes) into 16 YMM registers. */ |
216 | 0 | for (k = 0; k < 16; k++) { |
217 | 0 | ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); |
218 | 0 | } |
219 | | /* Transpose bytes */ |
220 | 0 | for (k = 0, l = 0; k < 8; k++, l += 2) { |
221 | 0 | ymm1[k * 2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l + 1]); |
222 | 0 | ymm1[k * 2 + 1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l + 1]); |
223 | 0 | } |
224 | | /* Transpose words */ |
225 | 0 | for (k = 0, l = -2; k < 8; k++, l++) { |
226 | 0 | if ((k % 2) == 0) l += 2; |
227 | 0 | ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 2]); |
228 | 0 | ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 2]); |
229 | 0 | } |
230 | | /* Transpose double words */ |
231 | 0 | for (k = 0, l = -4; k < 8; k++, l++) { |
232 | 0 | if ((k % 4) == 0) l += 4; |
233 | 0 | ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 4]); |
234 | 0 | ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 4]); |
235 | 0 | } |
236 | | /* Transpose quad words */ |
237 | 0 | for (k = 0; k < 8; k++) { |
238 | 0 | ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 8]); |
239 | 0 | ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 8]); |
240 | 0 | } |
241 | 0 | for (k = 0; k < 16; k++) { |
242 | 0 | ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8); |
243 | 0 | ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask); |
244 | 0 | } |
245 | | /* Store the result vectors */ |
246 | 0 | uint8_t* const dest_for_jth_element = dest + j; |
247 | 0 | for (k = 0; k < 16; k++) { |
248 | 0 | _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]); |
249 | 0 | } |
250 | 0 | } |
251 | 0 | } |
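A note on the final fixup above: _mm256_shuffle_epi8 gathers bytes independently within each 128-bit lane, so with the shmask defined earlier it interleaves the low and high 8 bytes of each lane. A scalar model of that per-lane gather follows (illustration only; no zeroing branch is needed because no mask byte has bit 7 set).

#include <stdint.h>

/* out[i] = in[mask[i]] within one 16-byte lane; mask listed in memory order. */
void shuffle_epi8_lane_sketch(uint8_t out[16], const uint8_t in[16]) {
  static const uint8_t mask[16] = {
    0x00, 0x08, 0x01, 0x09, 0x02, 0x0a, 0x03, 0x0b,
    0x04, 0x0c, 0x05, 0x0d, 0x06, 0x0e, 0x07, 0x0f
  };
  for (int i = 0; i < 16; i++)
    out[i] = in[mask[i]];
}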
252 | | |
253 | | /* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */ |
254 | | static void |
255 | | shuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src, |
256 | 0 | const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) { |
257 | 0 | int32_t j; |
258 | 0 | int k, l; |
259 | 0 | __m256i ymm0[16], ymm1[16]; |
260 | |
261 | 0 | const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i)); |
262 | 0 | const int32_t vecs_rem = (int32_t)vecs_per_el.rem; |
263 | | |
264 | | /* Create the shuffle mask. |
265 | | NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from |
266 | | most to least significant (i.e., their order is reversed when compared to |
267 | | loading the mask from an array). */ |
268 | 0 | const __m256i shmask = _mm256_set_epi8( |
269 | 0 | 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, |
270 | 0 | 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00, |
271 | 0 | 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, |
272 | 0 | 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00); |
273 | |
274 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { |
275 | | /* Advance the offset into the type by the vector size (in bytes), unless this is |
276 | | the initial iteration and the type size is not a multiple of the vector size. |
277 | | In that case, only advance by the number of bytes necessary so that the number |
278 | | of remaining bytes in the type will be a multiple of the vector size. */ |
279 | 0 | int32_t offset_into_type; |
280 | 0 | for (offset_into_type = 0; offset_into_type < bytesoftype; |
281 | 0 | offset_into_type += (offset_into_type == 0 && vecs_rem > 0 ? vecs_rem : (int32_t)sizeof(__m128i))) { |
282 | | |
283 | | /* Fetch elements in groups of 512 bytes */ |
284 | 0 | const uint8_t* const src_with_offset = src + offset_into_type; |
285 | 0 | for (k = 0; k < 16; k++) { |
286 | 0 | ymm0[k] = _mm256_loadu2_m128i( |
287 | 0 | (__m128i*)(src_with_offset + (j + (2 * k) + 1) * bytesoftype), |
288 | 0 | (__m128i*)(src_with_offset + (j + (2 * k)) * bytesoftype)); |
289 | 0 | } |
290 | | /* Transpose bytes */ |
291 | 0 | for (k = 0, l = 0; k < 8; k++, l += 2) { |
292 | 0 | ymm1[k * 2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l + 1]); |
293 | 0 | ymm1[k * 2 + 1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l + 1]); |
294 | 0 | } |
295 | | /* Transpose words */ |
296 | 0 | for (k = 0, l = -2; k < 8; k++, l++) { |
297 | 0 | if ((k % 2) == 0) l += 2; |
298 | 0 | ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 2]); |
299 | 0 | ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 2]); |
300 | 0 | } |
301 | | /* Transpose double words */ |
302 | 0 | for (k = 0, l = -4; k < 8; k++, l++) { |
303 | 0 | if ((k % 4) == 0) l += 4; |
304 | 0 | ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 4]); |
305 | 0 | ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 4]); |
306 | 0 | } |
307 | | /* Transpose quad words */ |
308 | 0 | for (k = 0; k < 8; k++) { |
309 | 0 | ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 8]); |
310 | 0 | ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 8]); |
311 | 0 | } |
312 | 0 | for (k = 0; k < 16; k++) { |
313 | 0 | ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8); |
314 | 0 | ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask); |
315 | 0 | } |
316 | | /* Store the result vectors */ |
317 | 0 | uint8_t* const dest_for_jth_element = dest + j; |
318 | 0 | for (k = 0; k < 16; k++) { |
319 | 0 | _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), ymm0[k]); |
320 | 0 | } |
321 | 0 | } |
322 | 0 | } |
323 | 0 | } |
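To make the offset stepping concrete, here is a small sketch (illustration only) that enumerates the offsets the loop above visits. For a hypothetical bytesoftype of 24, vecs_rem is 8, so the loop processes the 16-byte windows starting at offsets 0 and 8; the bytes in the overlap (8..15) are simply rewritten to the same destination rows.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  const int32_t bytesoftype = 24;  /* example typesize larger than 16 */
  const lldiv_t vecs_per_el = lldiv(bytesoftype, 16);
  const int32_t vecs_rem = (int32_t)vecs_per_el.rem;
  for (int32_t off = 0; off < bytesoftype;
       off += (off == 0 && vecs_rem > 0 ? vecs_rem : 16)) {
    /* For bytesoftype = 24 this prints the windows 0..15 and 8..23. */
    printf("process bytes %d..%d of each element\n", (int)off, (int)(off + 15));
  }
  return 0;
}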
324 | | |
325 | | /* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */ |
326 | | static void |
327 | | unshuffle2_avx2(uint8_t* const dest, const uint8_t* const src, |
328 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
329 | 0 | static const int32_t bytesoftype = 2; |
330 | 0 | int32_t i; |
331 | 0 | int j; |
332 | 0 | __m256i ymm0[2], ymm1[2]; |
333 | |
334 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
335 | | /* Load 32 elements (64 bytes) into 2 YMM registers. */ |
336 | 0 | const uint8_t* const src_for_ith_element = src + i; |
337 | 0 | for (j = 0; j < 2; j++) { |
338 | 0 | ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); |
339 | 0 | } |
340 | | /* Shuffle bytes */ |
341 | 0 | for (j = 0; j < 2; j++) { |
342 | 0 | ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8); |
343 | 0 | } |
344 | | /* Compute the low 64 bytes */ |
345 | 0 | ymm1[0] = _mm256_unpacklo_epi8(ymm0[0], ymm0[1]); |
346 | | /* Compute the hi 64 bytes */ |
347 | 0 | ymm1[1] = _mm256_unpackhi_epi8(ymm0[0], ymm0[1]); |
348 | | /* Store the result vectors in proper order */ |
349 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]); |
350 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[1]); |
351 | 0 | } |
352 | 0 | } |
353 | | |
354 | | /* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */ |
355 | | static void |
356 | | unshuffle4_avx2(uint8_t* const dest, const uint8_t* const src, |
357 | 23 | const int32_t vectorizable_elements, const int32_t total_elements) { |
358 | 23 | static const int32_t bytesoftype = 4; |
359 | 23 | int32_t i; |
360 | 23 | int j; |
361 | 23 | __m256i ymm0[4], ymm1[4]; |
362 | | |
363 | 18.2k | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
364 | | /* Load 32 elements (128 bytes) into 4 YMM registers. */ |
365 | 18.2k | const uint8_t* const src_for_ith_element = src + i; |
366 | 91.0k | for (j = 0; j < 4; j++) { |
367 | 72.8k | ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); |
368 | 72.8k | } |
369 | | /* Shuffle bytes */ |
370 | 54.6k | for (j = 0; j < 2; j++) { |
371 | | /* Compute the low 64 bytes */ |
372 | 36.4k | ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
373 | | /* Compute the hi 64 bytes */ |
374 | 36.4k | ymm1[2 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
375 | 36.4k | } |
376 | | /* Shuffle 2-byte words */ |
377 | 54.6k | for (j = 0; j < 2; j++) { |
378 | | /* Compute the low 64 bytes */ |
379 | 36.4k | ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
380 | | /* Compute the hi 64 bytes */ |
381 | 36.4k | ymm0[2 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
382 | 36.4k | } |
383 | 18.2k | ymm1[0] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x20); |
384 | 18.2k | ymm1[1] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x20); |
385 | 18.2k | ymm1[2] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x31); |
386 | 18.2k | ymm1[3] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x31); |
387 | | |
388 | | /* Store the result vectors in proper order */ |
389 | 91.0k | for (j = 0; j < 4; j++) { |
390 | 72.8k | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (j * sizeof(__m256i))), ymm1[j]); |
391 | 72.8k | } |
392 | 18.2k | } |
393 | 23 | } |
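Since unshuffle4_avx2 is the kernel actually exercised in this report, a scalar sketch of the inverse transform it implements may help (hypothetical helper, illustration only): the shuffled buffer stores byte j of element i at src[j * total_elements + i], and unshuffling moves it back to dest[i * 4 + j].

#include <stdint.h>

void unshuffle4_scalar_sketch(uint8_t* dest, const uint8_t* src,
                              int32_t vectorizable_elements, int32_t total_elements) {
  for (int32_t i = 0; i < vectorizable_elements; i++) {
    for (int j = 0; j < 4; j++) {
      dest[i * 4 + j] = src[j * total_elements + i];
    }
  }
}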
394 | | |
395 | | /* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */ |
396 | | static void |
397 | | unshuffle8_avx2(uint8_t* const dest, const uint8_t* const src, |
398 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
399 | 0 | static const int32_t bytesoftype = 8; |
400 | 0 | int32_t i; |
401 | 0 | int j; |
402 | 0 | __m256i ymm0[8], ymm1[8]; |
403 | |
404 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
405 | | /* Fetch 32 elements (256 bytes) into 8 YMM registers. */ |
406 | 0 | const uint8_t* const src_for_ith_element = src + i; |
407 | 0 | for (j = 0; j < 8; j++) { |
408 | 0 | ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); |
409 | 0 | } |
410 | | /* Shuffle bytes */ |
411 | 0 | for (j = 0; j < 4; j++) { |
412 | | /* Compute the low 32 bytes */ |
413 | 0 | ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
414 | | /* Compute the hi 32 bytes */ |
415 | 0 | ymm1[4 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
416 | 0 | } |
417 | | /* Shuffle words */ |
418 | 0 | for (j = 0; j < 4; j++) { |
419 | | /* Compute the low 32 bytes */ |
420 | 0 | ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
421 | | /* Compute the hi 32 bytes */ |
422 | 0 | ymm0[4 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
423 | 0 | } |
424 | 0 | for (j = 0; j < 8; j++) { |
425 | 0 | ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8); |
426 | 0 | } |
427 | | |
428 | | /* Shuffle 4-byte dwords */ |
429 | 0 | for (j = 0; j < 4; j++) { |
430 | | /* Compute the low 32 bytes */ |
431 | 0 | ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
432 | | /* Compute the hi 32 bytes */ |
433 | 0 | ymm1[4 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
434 | 0 | } |
435 | | |
436 | | /* Store the result vectors in proper order */ |
437 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]); |
438 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[2]); |
439 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[1]); |
440 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[3]); |
441 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[4]); |
442 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[6]); |
443 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[5]); |
444 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]); |
445 | 0 | } |
446 | 0 | } |
447 | | |
448 | | /* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */ |
449 | | static void |
450 | | unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src, |
451 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
452 | 0 | static const int32_t bytesoftype = 16; |
453 | 0 | int32_t i; |
454 | 0 | int j; |
455 | 0 | __m256i ymm0[16], ymm1[16]; |
456 | |
457 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
458 | | /* Fetch 32 elements (512 bytes) into 16 YMM registers. */ |
459 | 0 | const uint8_t* const src_for_ith_element = src + i; |
460 | 0 | for (j = 0; j < 16; j++) { |
461 | 0 | ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); |
462 | 0 | } |
463 | | |
464 | | /* Shuffle bytes */ |
465 | 0 | for (j = 0; j < 8; j++) { |
466 | | /* Compute the low 32 bytes */ |
467 | 0 | ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
468 | | /* Compute the hi 32 bytes */ |
469 | 0 | ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
470 | 0 | } |
471 | | /* Shuffle 2-byte words */ |
472 | 0 | for (j = 0; j < 8; j++) { |
473 | | /* Compute the low 32 bytes */ |
474 | 0 | ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
475 | | /* Compute the hi 32 bytes */ |
476 | 0 | ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
477 | 0 | } |
478 | | /* Shuffle 4-byte dwords */ |
479 | 0 | for (j = 0; j < 8; j++) { |
480 | | /* Compute the low 32 bytes */ |
481 | 0 | ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
482 | | /* Compute the hi 32 bytes */ |
483 | 0 | ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
484 | 0 | } |
485 | | |
486 | | /* Shuffle 8-byte qwords */ |
487 | 0 | for (j = 0; j < 8; j++) { |
488 | | /* Compute the low 32 bytes */ |
489 | 0 | ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]); |
490 | | /* Compute the hi 32 bytes */ |
491 | 0 | ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]); |
492 | 0 | } |
493 | |
494 | 0 | for (j = 0; j < 8; j++) { |
495 | 0 | ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20); |
496 | 0 | ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31); |
497 | 0 | } |
498 | | |
499 | | /* Store the result vectors in proper order */ |
500 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]); |
501 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]); |
502 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]); |
503 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]); |
504 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]); |
505 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]); |
506 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]); |
507 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]); |
508 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]); |
509 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]); |
510 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]); |
511 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]); |
512 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]); |
513 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]); |
514 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]); |
515 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]); |
516 | 0 | } |
517 | 0 | } |
518 | | |
519 | | /* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */ |
520 | | static void |
521 | | unshuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src, |
522 | 0 | const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) { |
523 | 0 | int32_t i; |
524 | 0 | int j; |
525 | 0 | __m256i ymm0[16], ymm1[16]; |
526 | |
527 | 0 | const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i)); |
528 | 0 | const int32_t vecs_rem = (int32_t)vecs_per_el.rem; |
529 | | |
530 | | /* The unshuffle loops are inverted (compared to shuffle16_tiled_avx2) |
531 | | to optimize cache utilization. */ |
532 | 0 | int32_t offset_into_type; |
533 | 0 | for (offset_into_type = 0; offset_into_type < bytesoftype; |
534 | 0 | offset_into_type += (offset_into_type == 0 && vecs_rem > 0 ? vecs_rem : (int32_t)sizeof(__m128i))) { |
535 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
536 | | /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */ |
537 | 0 | const uint8_t* const src_for_ith_element = src + i; |
538 | 0 | for (j = 0; j < 16; j++) { |
539 | 0 | ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (total_elements * (offset_into_type + j)))); |
540 | 0 | } |
541 | | |
542 | | /* Shuffle bytes */ |
543 | 0 | for (j = 0; j < 8; j++) { |
544 | | /* Compute the low 32 bytes */ |
545 | 0 | ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
546 | | /* Compute the hi 32 bytes */ |
547 | 0 | ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
548 | 0 | } |
549 | | /* Shuffle 2-byte words */ |
550 | 0 | for (j = 0; j < 8; j++) { |
551 | | /* Compute the low 32 bytes */ |
552 | 0 | ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
553 | | /* Compute the hi 32 bytes */ |
554 | 0 | ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
555 | 0 | } |
556 | | /* Shuffle 4-byte dwords */ |
557 | 0 | for (j = 0; j < 8; j++) { |
558 | | /* Compute the low 32 bytes */ |
559 | 0 | ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
560 | | /* Compute the hi 32 bytes */ |
561 | 0 | ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
562 | 0 | } |
563 | | |
564 | | /* Shuffle 8-byte qwords */ |
565 | 0 | for (j = 0; j < 8; j++) { |
566 | | /* Compute the low 32 bytes */ |
567 | 0 | ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]); |
568 | | /* Compute the hi 32 bytes */ |
569 | 0 | ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]); |
570 | 0 | } |
571 | |
572 | 0 | for (j = 0; j < 8; j++) { |
573 | 0 | ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20); |
574 | 0 | ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31); |
575 | 0 | } |
576 | | |
577 | | /* Store the result vectors in proper order */ |
578 | 0 | uint8_t* const dest_with_offset = dest + offset_into_type; |
579 | 0 | _mm256_storeu2_m128i( |
580 | 0 | (__m128i*)(dest_with_offset + (i + 0x01) * bytesoftype), |
581 | 0 | (__m128i*)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]); |
582 | 0 | _mm256_storeu2_m128i( |
583 | 0 | (__m128i*)(dest_with_offset + (i + 0x03) * bytesoftype), |
584 | 0 | (__m128i*)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]); |
585 | 0 | _mm256_storeu2_m128i( |
586 | 0 | (__m128i*)(dest_with_offset + (i + 0x05) * bytesoftype), |
587 | 0 | (__m128i*)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]); |
588 | 0 | _mm256_storeu2_m128i( |
589 | 0 | (__m128i*)(dest_with_offset + (i + 0x07) * bytesoftype), |
590 | 0 | (__m128i*)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]); |
591 | 0 | _mm256_storeu2_m128i( |
592 | 0 | (__m128i*)(dest_with_offset + (i + 0x09) * bytesoftype), |
593 | 0 | (__m128i*)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]); |
594 | 0 | _mm256_storeu2_m128i( |
595 | 0 | (__m128i*)(dest_with_offset + (i + 0x0b) * bytesoftype), |
596 | 0 | (__m128i*)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]); |
597 | 0 | _mm256_storeu2_m128i( |
598 | 0 | (__m128i*)(dest_with_offset + (i + 0x0d) * bytesoftype), |
599 | 0 | (__m128i*)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]); |
600 | 0 | _mm256_storeu2_m128i( |
601 | 0 | (__m128i*)(dest_with_offset + (i + 0x0f) * bytesoftype), |
602 | 0 | (__m128i*)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]); |
603 | 0 | _mm256_storeu2_m128i( |
604 | 0 | (__m128i*)(dest_with_offset + (i + 0x11) * bytesoftype), |
605 | 0 | (__m128i*)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]); |
606 | 0 | _mm256_storeu2_m128i( |
607 | 0 | (__m128i*)(dest_with_offset + (i + 0x13) * bytesoftype), |
608 | 0 | (__m128i*)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]); |
609 | 0 | _mm256_storeu2_m128i( |
610 | 0 | (__m128i*)(dest_with_offset + (i + 0x15) * bytesoftype), |
611 | 0 | (__m128i*)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]); |
612 | 0 | _mm256_storeu2_m128i( |
613 | 0 | (__m128i*)(dest_with_offset + (i + 0x17) * bytesoftype), |
614 | 0 | (__m128i*)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]); |
615 | 0 | _mm256_storeu2_m128i( |
616 | 0 | (__m128i*)(dest_with_offset + (i + 0x19) * bytesoftype), |
617 | 0 | (__m128i*)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]); |
618 | 0 | _mm256_storeu2_m128i( |
619 | 0 | (__m128i*)(dest_with_offset + (i + 0x1b) * bytesoftype), |
620 | 0 | (__m128i*)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]); |
621 | 0 | _mm256_storeu2_m128i( |
622 | 0 | (__m128i*)(dest_with_offset + (i + 0x1d) * bytesoftype), |
623 | 0 | (__m128i*)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]); |
624 | 0 | _mm256_storeu2_m128i( |
625 | 0 | (__m128i*)(dest_with_offset + (i + 0x1f) * bytesoftype), |
626 | 0 | (__m128i*)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]); |
627 | 0 | } |
628 | 0 | } |
629 | 0 | } |
630 | | |
631 | | /* Shuffle a block. This can never fail. */ |
632 | | void |
633 | | shuffle_avx2(const int32_t bytesoftype, const int32_t blocksize, |
634 | 0 | const uint8_t *_src, uint8_t *_dest) { |
635 | 0 | const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m256i); |
636 | | |
637 | | /* If the block size is too small to be vectorized, |
638 | | use the generic implementation. */ |
639 | 0 | if (blocksize < vectorized_chunk_size) { |
640 | 0 | shuffle_generic(bytesoftype, blocksize, _src, _dest); |
641 | 0 | return; |
642 | 0 | } |
643 | | |
644 | | /* If the blocksize is not a multiple of both the typesize and |
645 | | the vector size, round the blocksize down to the next value |
646 | | which is a multiple of both. The vectorized shuffle can be |
647 | | used for that portion of the data, and the naive implementation |
648 | | can be used for the remaining portion. */ |
649 | 0 | const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size); |
650 | |
651 | 0 | const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype; |
652 | 0 | const int32_t total_elements = blocksize / bytesoftype; |
653 | | |
654 | | /* Optimized shuffle implementations */ |
655 | 0 | switch (bytesoftype) { |
656 | 0 | case 2: |
657 | 0 | shuffle2_avx2(_dest, _src, vectorizable_elements, total_elements); |
658 | 0 | break; |
659 | 0 | case 4: |
660 | 0 | shuffle4_avx2(_dest, _src, vectorizable_elements, total_elements); |
661 | 0 | break; |
662 | 0 | case 8: |
663 | 0 | shuffle8_avx2(_dest, _src, vectorizable_elements, total_elements); |
664 | 0 | break; |
665 | 0 | case 16: |
666 | 0 | shuffle16_avx2(_dest, _src, vectorizable_elements, total_elements); |
667 | 0 | break; |
668 | 0 | default: |
669 | | /* For types larger than 16 bytes, use the AVX2 tiled shuffle. */ |
670 | 0 | if (bytesoftype > (int32_t)sizeof(__m128i)) { |
671 | 0 | shuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype); |
672 | 0 | } |
673 | 0 | else { |
674 | | /* Non-optimized shuffle */ |
675 | 0 | shuffle_generic(bytesoftype, blocksize, _src, _dest); |
676 | | /* The non-optimized function covers the whole buffer, |
677 | | so we're done processing here. */ |
678 | 0 | return; |
679 | 0 | } |
680 | 0 | } |
681 | | |
682 | | /* If the buffer had any bytes at the end which couldn't be handled |
683 | | by the vectorized implementations, use the non-optimized version |
684 | | to finish them up. */ |
685 | 0 | if (vectorizable_bytes < blocksize) { |
686 | 0 | shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest); |
687 | 0 | } |
688 | 0 | } |
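As a concrete illustration of the split arithmetic above (numbers chosen arbitrarily): with bytesoftype = 4 and blocksize = 1000, vectorized_chunk_size = 4 * 32 = 128, so vectorizable_bytes = 1000 - (1000 % 128) = 896, vectorizable_elements = 224 and total_elements = 250; the AVX2 kernel shuffles the first 896 bytes and shuffle_generic_inline finishes the remaining 104.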
689 | | |
690 | | /* Unshuffle a block. This can never fail. */ |
691 | | void |
692 | | unshuffle_avx2(const int32_t bytesoftype, const int32_t blocksize, |
693 | 23 | const uint8_t *_src, uint8_t *_dest) { |
694 | 23 | const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m256i); |
695 | | |
696 | | /* If the block size is too small to be vectorized, |
697 | | use the generic implementation. */ |
698 | 23 | if (blocksize < vectorized_chunk_size) { |
699 | 0 | unshuffle_generic(bytesoftype, blocksize, _src, _dest); |
700 | 0 | return; |
701 | 0 | } |
702 | | |
703 | | /* If the blocksize is not a multiple of both the typesize and |
704 | | the vector size, round the blocksize down to the next value |
705 | | which is a multiple of both. The vectorized unshuffle can be |
706 | | used for that portion of the data, and the naive implementation |
707 | | can be used for the remaining portion. */ |
708 | 23 | const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size); |
709 | | |
710 | 23 | const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype; |
711 | 23 | const int32_t total_elements = blocksize / bytesoftype; |
712 | | |
713 | | /* Optimized unshuffle implementations */ |
714 | 23 | switch (bytesoftype) { |
715 | 0 | case 2: |
716 | 0 | unshuffle2_avx2(_dest, _src, vectorizable_elements, total_elements); |
717 | 0 | break; |
718 | 23 | case 4: |
719 | 23 | unshuffle4_avx2(_dest, _src, vectorizable_elements, total_elements); |
720 | 23 | break; |
721 | 0 | case 8: |
722 | 0 | unshuffle8_avx2(_dest, _src, vectorizable_elements, total_elements); |
723 | 0 | break; |
724 | 0 | case 16: |
725 | 0 | unshuffle16_avx2(_dest, _src, vectorizable_elements, total_elements); |
726 | 0 | break; |
727 | 0 | default: |
728 | | /* For types larger than 16 bytes, use the AVX2 tiled unshuffle. */ |
729 | 0 | if (bytesoftype > (int32_t)sizeof(__m128i)) { |
730 | 0 | unshuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype); |
731 | 0 | } |
732 | 0 | else { |
733 | | /* Non-optimized unshuffle */ |
734 | 0 | unshuffle_generic(bytesoftype, blocksize, _src, _dest); |
735 | | /* The non-optimized function covers the whole buffer, |
736 | | so we're done processing here. */ |
737 | 0 | return; |
738 | 0 | } |
739 | 23 | } |
740 | | |
741 | | /* If the buffer had any bytes at the end which couldn't be handled |
742 | | by the vectorized implementations, use the non-optimized version |
743 | | to finish them up. */ |
744 | 23 | if (vectorizable_bytes < blocksize) { |
745 | 11 | unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest); |
746 | 11 | } |
747 | 23 | } |
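A minimal round-trip sketch of how these two entry points could be exercised in isolation (assumes an AVX2-capable build with access to the library's internal shuffle-avx2.h header; error handling omitted):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "shuffle-avx2.h"

int main(void) {
  const int32_t typesize = 4;
  const int32_t blocksize = 4096;  /* multiple of 4 * sizeof(__m256i), so fully vectorizable */
  uint8_t* src = malloc(blocksize);
  uint8_t* shuffled = malloc(blocksize);
  uint8_t* roundtrip = malloc(blocksize);
  for (int32_t i = 0; i < blocksize; i++)
    src[i] = (uint8_t)(i * 7);

  shuffle_avx2(typesize, blocksize, src, shuffled);
  unshuffle_avx2(typesize, blocksize, shuffled, roundtrip);
  assert(memcmp(src, roundtrip, blocksize) == 0);  /* unshuffle inverts shuffle */

  free(src); free(shuffled); free(roundtrip);
  return 0;
}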
748 | | |
749 | | const bool is_shuffle_avx2 = true; |
750 | | |
751 | | #else |
752 | | |
753 | | const bool is_shuffle_avx2 = false; |
754 | | |
755 | | void shuffle_avx2(const int32_t bytesoftype, const int32_t blocksize, |
756 | | const uint8_t *_src, uint8_t *_dest) { |
757 | | abort(); |
758 | | } |
759 | | |
760 | | void unshuffle_avx2(const int32_t bytesoftype, const int32_t blocksize, |
761 | | const uint8_t *_src, uint8_t *_dest) { |
762 | | abort(); |
763 | | } |
764 | | |
765 | | #endif /* defined(__AVX2__) */ |