/src/c-blosc2/blosc/shuffle-avx2.c
Line | Count | Source |
1 | | /********************************************************************* |
2 | | Blosc - Blocked Shuffling and Compression Library |
3 | | |
4 | | Copyright (c) 2021 Blosc Development Team <blosc@blosc.org> |
5 | | https://blosc.org |
6 | | License: BSD 3-Clause (see LICENSE.txt) |
7 | | |
8 | | See LICENSE.txt for details about copyright and rights to use. |
9 | | **********************************************************************/ |
10 | | |
11 | | #include "shuffle-avx2.h" |
12 | | #include "shuffle-generic.h" |
13 | | #include <stdlib.h> |
14 | | |
15 | | /* Make sure AVX2 is available for the compilation target and compiler. */ |
16 | | #if defined(__AVX2__) |
17 | | |
18 | | #include <immintrin.h> |
19 | | |
20 | | #include <stdint.h> |
21 | | |
22 | | /* The following routines are useful for debugging purposes. */ |
23 | | #if 0 |
24 | | #include <stdio.h> |
25 | | #include <string.h> |
26 | | |
27 | | static void printymm32(__m256i ymm0) |
28 | | { |
29 | | uint32_t buf[8]; |
30 | | |
31 | | ((__m256i *)buf)[0] = ymm0; |
32 | | fprintf(stderr, "%x,%x,%x,%x,%x,%x,%x,%x\n", |
33 | | buf[0], buf[1], buf[2], buf[3], |
34 | | buf[4], buf[5], buf[6], buf[7]); |
35 | | } |
36 | | |
37 | | static void printymm(__m256i ymm0) |
38 | | { |
39 | | uint8_t buf[32]; |
40 | | |
41 | | ((__m256i *)buf)[0] = ymm0; |
42 | | fprintf(stderr, "%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n", |
43 | | buf[0], buf[1], buf[2], buf[3], |
44 | | buf[4], buf[5], buf[6], buf[7], |
45 | | buf[8], buf[9], buf[10], buf[11], |
46 | | buf[12], buf[13], buf[14], buf[15], |
47 | | buf[16], buf[17], buf[18], buf[19], |
48 | | buf[20], buf[21], buf[22], buf[23], |
49 | | buf[24], buf[25], buf[26], buf[27], |
50 | | buf[28], buf[29], buf[30], buf[31]); |
51 | | } |
52 | | #endif |
53 | | |
54 | | /* GCC doesn't include the split load/store intrinsics |
55 | | needed for the tiled shuffle, so define them here. */ |
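 | | /* (Each helper loads or stores the two unaligned 128-bit halves of a |
 | | 256-bit vector at two independent addresses.) */ |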
56 | | #if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) |
57 | | static inline __m256i |
58 | | __attribute__((__always_inline__)) |
59 | | _mm256_loadu2_m128i(const __m128i* const hiaddr, const __m128i* const loaddr) { |
60 | | return _mm256_inserti128_si256( |
61 | | _mm256_castsi128_si256(_mm_loadu_si128(loaddr)), _mm_loadu_si128(hiaddr), 1); |
62 | | } |
63 | | |
64 | | static inline void |
65 | | __attribute__((__always_inline__)) |
66 | | _mm256_storeu2_m128i(__m128i* const hiaddr, __m128i* const loaddr, const __m256i a) { |
67 | | _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a)); |
68 | | _mm_storeu_si128(hiaddr, _mm256_extracti128_si256(a, 1)); |
69 | | } |
70 | | #endif /* defined(__GNUC__) */ |
71 | | |
72 | | /* Routine optimized for shuffling a buffer for a type size of 2 bytes. */ |
73 | | static void |
74 | | shuffle2_avx2(uint8_t* const dest, const uint8_t* const src, |
75 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
76 | 0 | static const int32_t bytesoftype = 2; |
77 | 0 | int32_t j; |
78 | 0 | int k; |
79 | 0 | __m256i ymm0[2], ymm1[2]; |
80 | | |
81 | | /* Create the shuffle mask. |
82 | | NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from |
83 | | most to least significant (i.e., their order is reversed when compared to |
84 | | loading the mask from an array). */ |
85 | 0 | const __m256i shmask = _mm256_set_epi8( |
86 | 0 | 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, |
87 | 0 | 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00, |
88 | 0 | 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, |
89 | 0 | 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00); |
90 | |
91 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { |
92 | | /* Fetch 32 elements (64 bytes) then transpose bytes, words and double words. */ |
93 | 0 | for (k = 0; k < 2; k++) { |
94 | 0 | ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); |
95 | 0 | ymm1[k] = _mm256_shuffle_epi8(ymm0[k], shmask); |
96 | 0 | } |
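 | | /* At this point each 128-bit lane of ymm1[k] holds the low bytes of its 8 |
 | | elements followed by their high bytes. The cross-lane permutes and blends |
 | | below gather all 32 low bytes into ymm1[0] and all 32 high bytes into |
 | | ymm1[1], i.e. one full byte plane per register. */ |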
97 | |
98 | 0 | ymm0[0] = _mm256_permute4x64_epi64(ymm1[0], 0xd8); |
99 | 0 | ymm0[1] = _mm256_permute4x64_epi64(ymm1[1], 0x8d); |
100 | |
101 | 0 | ymm1[0] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0xf0); |
102 | 0 | ymm0[1] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0x0f); |
103 | 0 | ymm1[1] = _mm256_permute4x64_epi64(ymm0[1], 0x4e); |
104 | | |
105 | | /* Store the result vectors */ |
106 | 0 | uint8_t* const dest_for_jth_element = dest + j; |
107 | 0 | for (k = 0; k < 2; k++) { |
108 | 0 | _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm1[k]); |
109 | 0 | } |
110 | 0 | } |
111 | 0 | } |
112 | | |
113 | | /* Routine optimized for shuffling a buffer for a type size of 4 bytes. */ |
114 | | static void |
115 | | shuffle4_avx2(uint8_t* const dest, const uint8_t* const src, |
116 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
117 | 0 | static const int32_t bytesoftype = 4; |
118 | 0 | int32_t i; |
119 | 0 | int j; |
120 | 0 | __m256i ymm0[4], ymm1[4]; |
121 | | |
122 | | /* Create the shuffle mask. |
123 | | NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from |
124 | | most to least significant (i.e., their order is reversed when compared to |
125 | | loading the mask from an array). */ |
126 | 0 | const __m256i mask = _mm256_set_epi32( |
127 | 0 | 0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); |
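 | | /* 'mask' gathers dwords 0,4,1,5,2,6,3,7, i.e. it interleaves the two 128-bit |
 | | lanes; it is applied after the lane-local unpacks below so that each output |
 | | register ends up holding one contiguous byte plane. */ |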
128 | |
129 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
130 | | /* Fetch 32 elements (128 bytes) then transpose bytes and words. */ |
131 | 0 | for (j = 0; j < 4; j++) { |
132 | 0 | ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i)))); |
133 | 0 | ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8); |
134 | 0 | ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d); |
135 | 0 | ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]); |
136 | 0 | ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x4e); |
137 | 0 | ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]); |
138 | 0 | } |
139 | | /* Transpose double words */ |
140 | 0 | for (j = 0; j < 2; j++) { |
141 | 0 | ymm1[j * 2] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
142 | 0 | ymm1[j * 2 + 1] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
143 | 0 | } |
144 | | /* Transpose quad words */ |
145 | 0 | for (j = 0; j < 2; j++) { |
146 | 0 | ymm0[j * 2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j + 2]); |
147 | 0 | ymm0[j * 2 + 1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j + 2]); |
148 | 0 | } |
149 | 0 | for (j = 0; j < 4; j++) { |
150 | 0 | ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask); |
151 | 0 | } |
152 | | /* Store the result vectors */ |
153 | 0 | uint8_t* const dest_for_ith_element = dest + i; |
154 | 0 | for (j = 0; j < 4; j++) { |
155 | 0 | _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]); |
156 | 0 | } |
157 | 0 | } |
158 | 0 | } |
159 | | |
160 | | /* Routine optimized for shuffling a buffer for a type size of 8 bytes. */ |
161 | | static void |
162 | | shuffle8_avx2(uint8_t* const dest, const uint8_t* const src, |
163 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
164 | 0 | static const int32_t bytesoftype = 8; |
165 | 0 | int32_t j; |
166 | 0 | int k, l; |
167 | 0 | __m256i ymm0[8], ymm1[8]; |
168 | |
169 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { |
170 | | /* Fetch 32 elements (256 bytes) then transpose bytes. */ |
171 | 0 | for (k = 0; k < 8; k++) { |
172 | 0 | ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); |
173 | 0 | ymm1[k] = _mm256_shuffle_epi32(ymm0[k], 0x4e); |
174 | 0 | ymm1[k] = _mm256_unpacklo_epi8(ymm0[k], ymm1[k]); |
175 | 0 | } |
176 | | /* Transpose words */ |
177 | 0 | for (k = 0, l = 0; k < 4; k++, l += 2) { |
178 | 0 | ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 1]); |
179 | 0 | ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 1]); |
180 | 0 | } |
181 | | /* Transpose double words */ |
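 | | /* (the 'if (k == 2)' skip moves l past the first group of four registers, so |
 | | each unpack pairs ymm0[l] with ymm0[l + 2] from within the same group) */ |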
182 | 0 | for (k = 0, l = 0; k < 4; k++, l++) { |
183 | 0 | if (k == 2) l += 2; |
184 | 0 | ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 2]); |
185 | 0 | ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 2]); |
186 | 0 | } |
187 | | /* Transpose quad words */ |
188 | 0 | for (k = 0; k < 4; k++) { |
189 | 0 | ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 4]); |
190 | 0 | ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 4]); |
191 | 0 | } |
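 | | /* Final fix-up: the cross-lane permutes and the 16-bit interleave below put |
 | | each of the 8 registers into contiguous byte-plane order before storing. */ |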
192 | 0 | for (k = 0; k < 8; k++) { |
193 | 0 | ymm1[k] = _mm256_permute4x64_epi64(ymm0[k], 0x72); |
194 | 0 | ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8); |
195 | 0 | ymm0[k] = _mm256_unpacklo_epi16(ymm0[k], ymm1[k]); |
196 | 0 | } |
197 | | /* Store the result vectors */ |
198 | 0 | uint8_t* const dest_for_jth_element = dest + j; |
199 | 0 | for (k = 0; k < 8; k++) { |
200 | 0 | _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]); |
201 | 0 | } |
202 | 0 | } |
203 | 0 | } |
204 | | |
205 | | /* Routine optimized for shuffling a buffer for a type size of 16 bytes. */ |
206 | | static void |
207 | | shuffle16_avx2(uint8_t* const dest, const uint8_t* const src, |
208 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
209 | 0 | static const int32_t bytesoftype = 16; |
210 | 0 | int32_t j; |
211 | 0 | int k, l; |
212 | 0 | __m256i ymm0[16], ymm1[16]; |
213 | | |
214 | | /* Create the shuffle mask. |
215 | | NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from |
216 | | most to least significant (i.e., their order is reversed when compared to |
217 | | loading the mask from an array). */ |
218 | 0 | const __m256i shmask = _mm256_set_epi8( |
219 | 0 | 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, |
220 | 0 | 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00, |
221 | 0 | 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, |
222 | 0 | 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00); |
223 | |
224 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { |
225 | | /* Fetch 32 elements (512 bytes) into 16 YMM registers. */ |
226 | 0 | for (k = 0; k < 16; k++) { |
227 | 0 | ymm0[k] = _mm256_loadu_si256((__m256i*)(src + (j * bytesoftype) + (k * sizeof(__m256i)))); |
228 | 0 | } |
229 | | /* Transpose bytes */ |
230 | 0 | for (k = 0, l = 0; k < 8; k++, l += 2) { |
231 | 0 | ymm1[k * 2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l + 1]); |
232 | 0 | ymm1[k * 2 + 1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l + 1]); |
233 | 0 | } |
234 | | /* Transpose words */ |
235 | 0 | for (k = 0, l = -2; k < 8; k++, l++) { |
236 | 0 | if ((k % 2) == 0) l += 2; |
237 | 0 | ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 2]); |
238 | 0 | ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 2]); |
239 | 0 | } |
240 | | /* Transpose double words */ |
241 | 0 | for (k = 0, l = -4; k < 8; k++, l++) { |
242 | 0 | if ((k % 4) == 0) l += 4; |
243 | 0 | ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 4]); |
244 | 0 | ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 4]); |
245 | 0 | } |
246 | | /* Transpose quad words */ |
247 | 0 | for (k = 0; k < 8; k++) { |
248 | 0 | ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 8]); |
249 | 0 | ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 8]); |
250 | 0 | } |
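 | | /* Final fix-up: a cross-lane permute plus a byte shuffle put each of the 16 |
 | | registers into contiguous byte-plane order before the stores. */ |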
251 | 0 | for (k = 0; k < 16; k++) { |
252 | 0 | ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8); |
253 | 0 | ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask); |
254 | 0 | } |
255 | | /* Store the result vectors */ |
256 | 0 | uint8_t* const dest_for_jth_element = dest + j; |
257 | 0 | for (k = 0; k < 16; k++) { |
258 | 0 | _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (k * total_elements)), ymm0[k]); |
259 | 0 | } |
260 | 0 | } |
261 | 0 | } |
262 | | |
263 | | /* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */ |
264 | | static void |
265 | | shuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src, |
266 | 0 | const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) { |
267 | 0 | int32_t j; |
268 | 0 | int k, l; |
269 | 0 | __m256i ymm0[16], ymm1[16]; |
270 | |
271 | 0 | const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i)); |
272 | 0 | const int32_t vecs_rem = (int32_t)vecs_per_el.rem; |
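 | | /* vecs_rem = bytesoftype % sizeof(__m128i); a nonzero remainder is consumed |
 | | by the first pass of the offset loop below (see the comment just above that |
 | | loop). */ |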
273 | | |
274 | | /* Create the shuffle mask. |
275 | | NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from |
276 | | most to least significant (i.e., their order is reversed when compared to |
277 | | loading the mask from an array). */ |
278 | 0 | const __m256i shmask = _mm256_set_epi8( |
279 | 0 | 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, |
280 | 0 | 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00, |
281 | 0 | 0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04, |
282 | 0 | 0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00); |
283 | |
284 | 0 | for (j = 0; j < vectorizable_elements; j += sizeof(__m256i)) { |
285 | | /* Advance the offset into the type by the vector size (in bytes), unless this is |
286 | | the initial iteration and the type size is not a multiple of the vector size. |
287 | | In that case, only advance by the number of bytes necessary so that the number |
288 | | of remaining bytes in the type will be a multiple of the vector size. */ |
289 | 0 | int32_t offset_into_type; |
290 | 0 | for (offset_into_type = 0; offset_into_type < bytesoftype; |
291 | 0 | offset_into_type += (offset_into_type == 0 && vecs_rem > 0 ? vecs_rem : (int32_t)sizeof(__m128i))) { |
292 | | |
293 | | /* Fetch elements in groups of 512 bytes */ |
294 | 0 | const uint8_t* const src_with_offset = src + offset_into_type; |
295 | 0 | for (k = 0; k < 16; k++) { |
296 | 0 | ymm0[k] = _mm256_loadu2_m128i( |
297 | 0 | (__m128i*)(src_with_offset + (j + (2 * k) + 1) * bytesoftype), |
298 | 0 | (__m128i*)(src_with_offset + (j + (2 * k)) * bytesoftype)); |
299 | 0 | } |
300 | | /* Transpose bytes */ |
301 | 0 | for (k = 0, l = 0; k < 8; k++, l += 2) { |
302 | 0 | ymm1[k * 2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l + 1]); |
303 | 0 | ymm1[k * 2 + 1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l + 1]); |
304 | 0 | } |
305 | | /* Transpose words */ |
306 | 0 | for (k = 0, l = -2; k < 8; k++, l++) { |
307 | 0 | if ((k % 2) == 0) l += 2; |
308 | 0 | ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 2]); |
309 | 0 | ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 2]); |
310 | 0 | } |
311 | | /* Transpose double words */ |
312 | 0 | for (k = 0, l = -4; k < 8; k++, l++) { |
313 | 0 | if ((k % 4) == 0) l += 4; |
314 | 0 | ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 4]); |
315 | 0 | ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 4]); |
316 | 0 | } |
317 | | /* Transpose quad words */ |
318 | 0 | for (k = 0; k < 8; k++) { |
319 | 0 | ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 8]); |
320 | 0 | ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 8]); |
321 | 0 | } |
322 | 0 | for (k = 0; k < 16; k++) { |
323 | 0 | ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8); |
324 | 0 | ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask); |
325 | 0 | } |
326 | | /* Store the result vectors */ |
327 | 0 | uint8_t* const dest_for_jth_element = dest + j; |
328 | 0 | for (k = 0; k < 16; k++) { |
329 | 0 | _mm256_storeu_si256((__m256i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), ymm0[k]); |
330 | 0 | } |
331 | 0 | } |
332 | 0 | } |
333 | 0 | } |
334 | | |
335 | | /* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */ |
336 | | static void |
337 | | unshuffle2_avx2(uint8_t* const dest, const uint8_t* const src, |
338 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
339 | 0 | static const int32_t bytesoftype = 2; |
340 | 0 | int32_t i; |
341 | 0 | int j; |
342 | 0 | __m256i ymm0[2], ymm1[2]; |
343 | |
344 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
345 | | /* Load 32 elements (64 bytes) into 2 YMM registers. */ |
346 | 0 | const uint8_t* const src_for_ith_element = src + i; |
347 | 0 | for (j = 0; j < 2; j++) { |
348 | 0 | ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); |
349 | 0 | } |
350 | | /* Shuffle bytes */ |
351 | 0 | for (j = 0; j < 2; j++) { |
352 | 0 | ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8); |
353 | 0 | } |
354 | | /* Compute the low 32 bytes */ |
355 | 0 | ymm1[0] = _mm256_unpacklo_epi8(ymm0[0], ymm0[1]); |
356 | | /* Compute the hi 32 bytes */ |
357 | 0 | ymm1[1] = _mm256_unpackhi_epi8(ymm0[0], ymm0[1]); |
358 | | /* Store the result vectors in proper order */ |
359 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]); |
360 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[1]); |
361 | 0 | } |
362 | 0 | } |
363 | | |
364 | | /* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */ |
365 | | static void |
366 | | unshuffle4_avx2(uint8_t* const dest, const uint8_t* const src, |
367 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
368 | 0 | static const int32_t bytesoftype = 4; |
369 | 0 | int32_t i; |
370 | 0 | int j; |
371 | 0 | __m256i ymm0[4], ymm1[4]; |
372 | |
373 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
374 | | /* Load 32 elements (128 bytes) into 4 YMM registers. */ |
375 | 0 | const uint8_t* const src_for_ith_element = src + i; |
376 | 0 | for (j = 0; j < 4; j++) { |
377 | 0 | ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); |
378 | 0 | } |
379 | | /* Shuffle bytes */ |
380 | 0 | for (j = 0; j < 2; j++) { |
381 | | /* Compute the low 32 bytes */ |
382 | 0 | ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
383 | | /* Compute the hi 32 bytes */ |
384 | 0 | ymm1[2 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
385 | 0 | } |
386 | | /* Shuffle 2-byte words */ |
387 | 0 | for (j = 0; j < 2; j++) { |
388 | | /* Compute the low 32 bytes */ |
389 | 0 | ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
390 | | /* Compute the hi 32 bytes */ |
391 | 0 | ymm0[2 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
392 | 0 | } |
393 | 0 | ymm1[0] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x20); |
394 | 0 | ymm1[1] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x20); |
395 | 0 | ymm1[2] = _mm256_permute2x128_si256(ymm0[0], ymm0[2], 0x31); |
396 | 0 | ymm1[3] = _mm256_permute2x128_si256(ymm0[1], ymm0[3], 0x31); |
397 | | |
398 | | /* Store the result vectors in proper order */ |
399 | 0 | for (j = 0; j < 4; j++) { |
400 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (j * sizeof(__m256i))), ymm1[j]); |
401 | 0 | } |
402 | 0 | } |
403 | 0 | } |
404 | | |
405 | | /* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */ |
406 | | static void |
407 | | unshuffle8_avx2(uint8_t* const dest, const uint8_t* const src, |
408 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
409 | 0 | static const int32_t bytesoftype = 8; |
410 | 0 | int32_t i; |
411 | 0 | int j; |
412 | 0 | __m256i ymm0[8], ymm1[8]; |
413 | |
414 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
415 | | /* Fetch 32 elements (256 bytes) into 8 YMM registers. */ |
416 | 0 | const uint8_t* const src_for_ith_element = src + i; |
417 | 0 | for (j = 0; j < 8; j++) { |
418 | 0 | ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); |
419 | 0 | } |
420 | | /* Shuffle bytes */ |
421 | 0 | for (j = 0; j < 4; j++) { |
422 | | /* Compute the low 32 bytes */ |
423 | 0 | ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
424 | | /* Compute the hi 32 bytes */ |
425 | 0 | ymm1[4 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
426 | 0 | } |
427 | | /* Shuffle words */ |
428 | 0 | for (j = 0; j < 4; j++) { |
429 | | /* Compute the low 32 bytes */ |
430 | 0 | ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
431 | | /* Compute the hi 32 bytes */ |
432 | 0 | ymm0[4 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
433 | 0 | } |
434 | 0 | for (j = 0; j < 8; j++) { |
435 | 0 | ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8); |
436 | 0 | } |
437 | | |
438 | | /* Shuffle 4-byte dwords */ |
439 | 0 | for (j = 0; j < 4; j++) { |
440 | | /* Compute the low 32 bytes */ |
441 | 0 | ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
442 | | /* Compute the hi 32 bytes */ |
443 | 0 | ymm1[4 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
444 | 0 | } |
445 | | |
446 | | /* Store the result vectors in proper order */ |
447 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]); |
448 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[2]); |
449 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[1]); |
450 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[3]); |
451 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[4]); |
452 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[6]); |
453 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[5]); |
454 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]); |
455 | 0 | } |
456 | 0 | } |
457 | | |
458 | | /* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */ |
459 | | static void |
460 | | unshuffle16_avx2(uint8_t* const dest, const uint8_t* const src, |
461 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
462 | 0 | static const int32_t bytesoftype = 16; |
463 | 0 | int32_t i; |
464 | 0 | int j; |
465 | 0 | __m256i ymm0[16], ymm1[16]; |
466 | |
467 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
468 | | /* Fetch 32 elements (512 bytes) into 16 YMM registers. */ |
469 | 0 | const uint8_t* const src_for_ith_element = src + i; |
470 | 0 | for (j = 0; j < 16; j++) { |
471 | 0 | ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); |
472 | 0 | } |
473 | | |
474 | | /* Shuffle bytes */ |
475 | 0 | for (j = 0; j < 8; j++) { |
476 | | /* Compute the low 32 bytes */ |
477 | 0 | ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
478 | | /* Compute the hi 32 bytes */ |
479 | 0 | ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
480 | 0 | } |
481 | | /* Shuffle 2-byte words */ |
482 | 0 | for (j = 0; j < 8; j++) { |
483 | | /* Compute the low 32 bytes */ |
484 | 0 | ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
485 | | /* Compute the hi 32 bytes */ |
486 | 0 | ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
487 | 0 | } |
488 | | /* Shuffle 4-byte dwords */ |
489 | 0 | for (j = 0; j < 8; j++) { |
490 | | /* Compute the low 32 bytes */ |
491 | 0 | ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
492 | | /* Compute the hi 32 bytes */ |
493 | 0 | ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
494 | 0 | } |
495 | | |
496 | | /* Shuffle 8-byte qwords */ |
497 | 0 | for (j = 0; j < 8; j++) { |
498 | | /* Compute the low 32 bytes */ |
499 | 0 | ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]); |
500 | | /* Compute the hi 32 bytes */ |
501 | 0 | ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]); |
502 | 0 | } |
503 | |
504 | 0 | for (j = 0; j < 8; j++) { |
505 | 0 | ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20); |
506 | 0 | ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31); |
507 | 0 | } |
508 | | |
509 | | /* Store the result vectors in proper order */ |
510 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * sizeof(__m256i))), ymm1[0]); |
511 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * sizeof(__m256i))), ymm1[4]); |
512 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * sizeof(__m256i))), ymm1[2]); |
513 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * sizeof(__m256i))), ymm1[6]); |
514 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * sizeof(__m256i))), ymm1[1]); |
515 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * sizeof(__m256i))), ymm1[5]); |
516 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * sizeof(__m256i))), ymm1[3]); |
517 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * sizeof(__m256i))), ymm1[7]); |
518 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * sizeof(__m256i))), ymm1[8]); |
519 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * sizeof(__m256i))), ymm1[12]); |
520 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * sizeof(__m256i))), ymm1[10]); |
521 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * sizeof(__m256i))), ymm1[14]); |
522 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * sizeof(__m256i))), ymm1[9]); |
523 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * sizeof(__m256i))), ymm1[13]); |
524 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * sizeof(__m256i))), ymm1[11]); |
525 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (15 * sizeof(__m256i))), ymm1[15]); |
526 | 0 | } |
527 | 0 | } |
528 | | |
529 | | /* Routine optimized for unshuffling a buffer for a type size of 12 bytes. |
530 | | * Based on the 16-byte implementation. */ |
531 | | static void |
532 | | unshuffle12_avx2(uint8_t* const dest, const uint8_t* const src, |
533 | 0 | const int32_t vectorizable_elements, const int32_t total_elements) { |
534 | 0 | static const int32_t bytesoftype = 12; |
535 | 0 | int32_t i; |
536 | 0 | int j; |
537 | 0 | __m256i ymm0[16], ymm1[16]; |
538 | |
|
539 | 0 | const __m256i permute = _mm256_set_epi32(0, 0, 6, 5, 4, 2, 1, 0); |
540 | 0 | const __m256i store_mask = _mm256_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF); |
541 | 0 | const int32_t jump = 2 * bytesoftype; |
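 | | /* After the transposes below, each 32-byte register holds two 16-byte |
 | | results whose last 4 bytes are zero padding; 'permute' compacts them into |
 | | 24 contiguous bytes (two 12-byte elements). The stores therefore advance by |
 | | 'jump' = 24 bytes and overlap by 8 don't-care bytes, and the last store is |
 | | masked ('store_mask' enables only the low 6 dwords) so the iteration never |
 | | writes beyond its 384-byte output region. */ |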
542 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
543 | | /* Fetch 32 elements (384 bytes) into 12 YMM registers. */ |
544 | 0 | const uint8_t* const src_for_ith_element = src + i; |
545 | 0 | for (j = 0; j < bytesoftype; j++) { |
546 | 0 | ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (j * total_elements))); |
547 | 0 | } |
548 | | /* Initialize the last 4 registers (128 bytes) to zero */ |
549 | 0 | for (j = bytesoftype; j < 16; j++) { |
550 | 0 | ymm0[j] = _mm256_setzero_si256(); |
551 | 0 | } |
552 | | |
553 | | /* Shuffle bytes */ |
554 | 0 | for (j = 0; j < 8; j++) { |
555 | | /* Compute the low 32 bytes */ |
556 | 0 | ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
557 | | /* Compute the hi 32 bytes */ |
558 | 0 | ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
559 | 0 | } |
560 | | /* Shuffle 2-byte words */ |
561 | 0 | for (j = 0; j < 8; j++) { |
562 | | /* Compute the low 32 bytes */ |
563 | 0 | ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
564 | | /* Compute the hi 32 bytes */ |
565 | 0 | ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
566 | 0 | } |
567 | | /* Shuffle 4-byte dwords */ |
568 | 0 | for (j = 0; j < 8; j++) { |
569 | | /* Compute the low 32 bytes */ |
570 | 0 | ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
571 | | /* Compute the hi 32 bytes */ |
572 | 0 | ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
573 | 0 | } |
574 | | |
575 | | /* Shuffle 8-byte qwords */ |
576 | 0 | for (j = 0; j < 8; j++) { |
577 | | /* Compute the low 32 bytes */ |
578 | 0 | ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]); |
579 | | /* Compute the hi 32 bytes */ |
580 | 0 | ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]); |
581 | 0 | } |
582 | | |
583 | |
584 | 0 | for (j = 0; j < 8; j++) { |
585 | 0 | ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20); |
586 | 0 | ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31); |
587 | 0 | ymm1[j] = _mm256_permutevar8x32_epi32(ymm1[j], permute); |
588 | 0 | ymm1[j+8] = _mm256_permutevar8x32_epi32(ymm1[j+8], permute); |
589 | |
590 | |
591 | 0 | } |
592 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (0 * jump)), ymm1[0]); |
593 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (1 * jump)), ymm1[4]); |
594 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (2 * jump)), ymm1[2]); |
595 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (3 * jump)), ymm1[6]); |
596 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (4 * jump)), ymm1[1]); |
597 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (5 * jump)), ymm1[5]); |
598 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (6 * jump)), ymm1[3]); |
599 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (7 * jump)), ymm1[7]); |
600 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (8 * jump)), ymm1[8]); |
601 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (9 * jump)), ymm1[12]); |
602 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (10 * jump)), ymm1[10]); |
603 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (11 * jump)), ymm1[14]); |
604 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (12 * jump)), ymm1[9]); |
605 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (13 * jump)), ymm1[13]); |
606 | 0 | _mm256_storeu_si256((__m256i*)(dest + (i * bytesoftype) + (14 * jump)), ymm1[11]); |
607 | 0 | _mm256_maskstore_epi32((int *)(dest + (i * bytesoftype) + (15 * jump)), store_mask, ymm1[15]); |
608 | 0 | } |
609 | 0 | } |
610 | | /* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */ |
611 | | static void |
612 | | unshuffle16_tiled_avx2(uint8_t* const dest, const uint8_t* const src, |
613 | 0 | const int32_t vectorizable_elements, const int32_t total_elements, const int32_t bytesoftype) { |
614 | 0 | int32_t i; |
615 | 0 | int j; |
616 | 0 | __m256i ymm0[16], ymm1[16]; |
617 | |
618 | 0 | const lldiv_t vecs_per_el = lldiv(bytesoftype, sizeof(__m128i)); |
619 | 0 | const int32_t vecs_rem = (int32_t)vecs_per_el.rem; |
620 | | |
621 | | /* The unshuffle loops are inverted (compared to shuffle16_tiled_avx2) |
622 | | to optimize cache utilization. */ |
623 | 0 | int32_t offset_into_type; |
624 | 0 | for (offset_into_type = 0; offset_into_type < bytesoftype; |
625 | 0 | offset_into_type += (offset_into_type == 0 && vecs_rem > 0 ? vecs_rem : (int32_t)sizeof(__m128i))) { |
626 | 0 | for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { |
627 | | /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */ |
628 | 0 | const uint8_t* const src_for_ith_element = src + i; |
629 | 0 | for (j = 0; j < 16; j++) { |
630 | 0 | ymm0[j] = _mm256_loadu_si256((__m256i*)(src_for_ith_element + (total_elements * (offset_into_type + j)))); |
631 | 0 | } |
632 | | |
633 | | /* Shuffle bytes */ |
634 | 0 | for (j = 0; j < 8; j++) { |
635 | | /* Compute the low 32 bytes */ |
636 | 0 | ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
637 | | /* Compute the hi 32 bytes */ |
638 | 0 | ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]); |
639 | 0 | } |
640 | | /* Shuffle 2-byte words */ |
641 | 0 | for (j = 0; j < 8; j++) { |
642 | | /* Compute the low 32 bytes */ |
643 | 0 | ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
644 | | /* Compute the hi 32 bytes */ |
645 | 0 | ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]); |
646 | 0 | } |
647 | | /* Shuffle 4-byte dwords */ |
648 | 0 | for (j = 0; j < 8; j++) { |
649 | | /* Compute the low 32 bytes */ |
650 | 0 | ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
651 | | /* Compute the hi 32 bytes */ |
652 | 0 | ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]); |
653 | 0 | } |
654 | | |
655 | | /* Shuffle 8-byte qwords */ |
656 | 0 | for (j = 0; j < 8; j++) { |
657 | | /* Compute the low 32 bytes */ |
658 | 0 | ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]); |
659 | | /* Compute the hi 32 bytes */ |
660 | 0 | ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]); |
661 | 0 | } |
662 | |
663 | 0 | for (j = 0; j < 8; j++) { |
664 | 0 | ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20); |
665 | 0 | ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31); |
666 | 0 | } |
667 | | |
668 | | /* Store the result vectors in proper order */ |
669 | 0 | uint8_t* const dest_with_offset = dest + offset_into_type; |
670 | 0 | _mm256_storeu2_m128i( |
671 | 0 | (__m128i*)(dest_with_offset + (i + 0x01) * bytesoftype), |
672 | 0 | (__m128i*)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]); |
673 | 0 | _mm256_storeu2_m128i( |
674 | 0 | (__m128i*)(dest_with_offset + (i + 0x03) * bytesoftype), |
675 | 0 | (__m128i*)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]); |
676 | 0 | _mm256_storeu2_m128i( |
677 | 0 | (__m128i*)(dest_with_offset + (i + 0x05) * bytesoftype), |
678 | 0 | (__m128i*)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]); |
679 | 0 | _mm256_storeu2_m128i( |
680 | 0 | (__m128i*)(dest_with_offset + (i + 0x07) * bytesoftype), |
681 | 0 | (__m128i*)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]); |
682 | 0 | _mm256_storeu2_m128i( |
683 | 0 | (__m128i*)(dest_with_offset + (i + 0x09) * bytesoftype), |
684 | 0 | (__m128i*)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]); |
685 | 0 | _mm256_storeu2_m128i( |
686 | 0 | (__m128i*)(dest_with_offset + (i + 0x0b) * bytesoftype), |
687 | 0 | (__m128i*)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]); |
688 | 0 | _mm256_storeu2_m128i( |
689 | 0 | (__m128i*)(dest_with_offset + (i + 0x0d) * bytesoftype), |
690 | 0 | (__m128i*)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]); |
691 | 0 | _mm256_storeu2_m128i( |
692 | 0 | (__m128i*)(dest_with_offset + (i + 0x0f) * bytesoftype), |
693 | 0 | (__m128i*)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]); |
694 | 0 | _mm256_storeu2_m128i( |
695 | 0 | (__m128i*)(dest_with_offset + (i + 0x11) * bytesoftype), |
696 | 0 | (__m128i*)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]); |
697 | 0 | _mm256_storeu2_m128i( |
698 | 0 | (__m128i*)(dest_with_offset + (i + 0x13) * bytesoftype), |
699 | 0 | (__m128i*)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]); |
700 | 0 | _mm256_storeu2_m128i( |
701 | 0 | (__m128i*)(dest_with_offset + (i + 0x15) * bytesoftype), |
702 | 0 | (__m128i*)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]); |
703 | 0 | _mm256_storeu2_m128i( |
704 | 0 | (__m128i*)(dest_with_offset + (i + 0x17) * bytesoftype), |
705 | 0 | (__m128i*)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]); |
706 | 0 | _mm256_storeu2_m128i( |
707 | 0 | (__m128i*)(dest_with_offset + (i + 0x19) * bytesoftype), |
708 | 0 | (__m128i*)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]); |
709 | 0 | _mm256_storeu2_m128i( |
710 | 0 | (__m128i*)(dest_with_offset + (i + 0x1b) * bytesoftype), |
711 | 0 | (__m128i*)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]); |
712 | 0 | _mm256_storeu2_m128i( |
713 | 0 | (__m128i*)(dest_with_offset + (i + 0x1d) * bytesoftype), |
714 | 0 | (__m128i*)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]); |
715 | 0 | _mm256_storeu2_m128i( |
716 | 0 | (__m128i*)(dest_with_offset + (i + 0x1f) * bytesoftype), |
717 | 0 | (__m128i*)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]); |
718 | 0 | } |
719 | 0 | } |
720 | 0 | } |
721 | | |
722 | | /* Shuffle a block. This can never fail. */ |
723 | | void |
724 | | shuffle_avx2(const int32_t bytesoftype, const int32_t blocksize, |
725 | 0 | const uint8_t *_src, uint8_t *_dest) { |
726 | 0 | const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m256i); |
727 | | |
728 | | /* If the block size is too small to be vectorized, |
729 | | use the generic implementation. */ |
730 | 0 | if (blocksize < vectorized_chunk_size) { |
731 | 0 | shuffle_generic(bytesoftype, blocksize, _src, _dest); |
732 | 0 | return; |
733 | 0 | } |
734 | | |
735 | | /* If the blocksize is not a multiple of both the typesize and |
736 | | the vector size, round the blocksize down to the next value |
737 | | which is a multiple of both. The vectorized shuffle can be |
738 | | used for that portion of the data, and the naive implementation |
739 | | can be used for the remaining portion. */ |
740 | 0 | const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size); |
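 | | /* For example, bytesoftype = 4 and blocksize = 1000 give a chunk size of 128 |
 | | and vectorizable_bytes = 896: the first 224 elements are shuffled with AVX2 |
 | | and the trailing 104 bytes are handled by the generic code below. */ |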
741 | |
742 | 0 | const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype; |
743 | 0 | const int32_t total_elements = blocksize / bytesoftype; |
744 | | |
745 | | /* Optimized shuffle implementations */ |
746 | 0 | switch (bytesoftype) { |
747 | 0 | case 2: |
748 | 0 | shuffle2_avx2(_dest, _src, vectorizable_elements, total_elements); |
749 | 0 | break; |
750 | 0 | case 4: |
751 | 0 | shuffle4_avx2(_dest, _src, vectorizable_elements, total_elements); |
752 | 0 | break; |
753 | 0 | case 8: |
754 | 0 | shuffle8_avx2(_dest, _src, vectorizable_elements, total_elements); |
755 | 0 | break; |
756 | 0 | case 16: |
757 | 0 | shuffle16_avx2(_dest, _src, vectorizable_elements, total_elements); |
758 | 0 | break; |
759 | 0 | default: |
760 | | /* For types larger than 16 bytes, use the AVX2 tiled shuffle. */ |
761 | 0 | if (bytesoftype > (int32_t)sizeof(__m128i)) { |
762 | 0 | shuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype); |
763 | 0 | } |
764 | 0 | else { |
765 | | /* Non-optimized shuffle */ |
766 | 0 | shuffle_generic(bytesoftype, blocksize, _src, _dest); |
767 | | /* The non-optimized function covers the whole buffer, |
768 | | so we're done processing here. */ |
769 | 0 | return; |
770 | 0 | } |
771 | 0 | } |
772 | | |
773 | | /* If the buffer had any bytes at the end which couldn't be handled |
774 | | by the vectorized implementations, use the non-optimized version |
775 | | to finish them up. */ |
776 | 0 | if (vectorizable_bytes < blocksize) { |
777 | 0 | shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest); |
778 | 0 | } |
779 | 0 | } |
780 | | |
781 | | /* Unshuffle a block. This can never fail. */ |
782 | | void |
783 | | unshuffle_avx2(const int32_t bytesoftype, const int32_t blocksize, |
784 | 0 | const uint8_t *_src, uint8_t *_dest) { |
785 | 0 | const int32_t vectorized_chunk_size = bytesoftype * (int32_t)sizeof(__m256i); |
786 | | |
787 | | /* If the block size is too small to be vectorized, |
788 | | use the generic implementation. */ |
789 | 0 | if (blocksize < vectorized_chunk_size) { |
790 | 0 | unshuffle_generic(bytesoftype, blocksize, _src, _dest); |
791 | 0 | return; |
792 | 0 | } |
793 | | |
794 | | /* If the blocksize is not a multiple of both the typesize and |
795 | | the vector size, round the blocksize down to the next value |
796 | | which is a multiple of both. The vectorized unshuffle can be |
797 | | used for that portion of the data, and the naive implementation |
798 | | can be used for the remaining portion. */ |
799 | 0 | const int32_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size); |
800 | |
801 | 0 | const int32_t vectorizable_elements = vectorizable_bytes / bytesoftype; |
802 | 0 | const int32_t total_elements = blocksize / bytesoftype; |
803 | | |
804 | | /* Optimized unshuffle implementations */ |
805 | 0 | switch (bytesoftype) { |
806 | 0 | case 2: |
807 | 0 | unshuffle2_avx2(_dest, _src, vectorizable_elements, total_elements); |
808 | 0 | break; |
809 | 0 | case 4: |
810 | 0 | unshuffle4_avx2(_dest, _src, vectorizable_elements, total_elements); |
811 | 0 | break; |
812 | 0 | case 8: |
813 | 0 | unshuffle8_avx2(_dest, _src, vectorizable_elements, total_elements); |
814 | 0 | break; |
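 | | /* Note that 12-byte types have a dedicated unshuffle kernel here, whereas |
 | | the shuffle direction has no 12-byte case and uses shuffle_generic() for |
 | | that size. */ |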
815 | 0 | case 12: |
816 | 0 | unshuffle12_avx2(_dest, _src, vectorizable_elements, total_elements); |
817 | 0 | break; |
818 | 0 | case 16: |
819 | 0 | unshuffle16_avx2(_dest, _src, vectorizable_elements, total_elements); |
820 | 0 | break; |
821 | 0 | default: |
822 | | /* For types larger than 16 bytes, use the AVX2 tiled unshuffle. */ |
823 | 0 | if (bytesoftype > (int32_t)sizeof(__m128i)) { |
824 | 0 | unshuffle16_tiled_avx2(_dest, _src, vectorizable_elements, total_elements, bytesoftype); |
825 | 0 | } |
826 | 0 | else { |
827 | | /* Non-optimized unshuffle */ |
828 | 0 | unshuffle_generic(bytesoftype, blocksize, _src, _dest); |
829 | | /* The non-optimized function covers the whole buffer, |
830 | | so we're done processing here. */ |
831 | 0 | return; |
832 | 0 | } |
833 | 0 | } |
834 | | |
835 | | /* If the buffer had any bytes at the end which couldn't be handled |
836 | | by the vectorized implementations, use the non-optimized version |
837 | | to finish them up. */ |
838 | 0 | if (vectorizable_bytes < blocksize) { |
839 | 0 | unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest); |
840 | 0 | } |
841 | 0 | } |
842 | | |
843 | | const bool is_shuffle_avx2 = true; |
844 | | |
845 | | #else |
846 | | |
847 | | const bool is_shuffle_avx2 = false; |
848 | | |
849 | | void shuffle_avx2(const int32_t bytesoftype, const int32_t blocksize, |
850 | | const uint8_t *_src, uint8_t *_dest) { |
851 | | abort(); |
852 | | } |
853 | | |
854 | | void unshuffle_avx2(const int32_t bytesoftype, const int32_t blocksize, |
855 | | const uint8_t *_src, uint8_t *_dest) { |
856 | | abort(); |
857 | | } |
858 | | |
859 | | #endif /* defined(__AVX2__) */ |