/src/c-blosc2/blosc/bitshuffle-avx2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************* |
2 | | Blosc - Blocked Shuffling and Compression Library |
3 | | |
4 | | Copyright (c) 2021 Blosc Development Team <blosc@blosc.org> |
5 | | https://blosc.org |
6 | | License: BSD 3-Clause (see LICENSE.txt) |
7 | | |
8 | | See LICENSE.txt for details about copyright and rights to use. |
9 | | **********************************************************************/ |
10 | | |
11 | | /********************************************************************* |
12 | | Bitshuffle - Filter for improving compression of typed binary data. |
13 | | |
14 | | Author: Kiyoshi Masui <kiyo@physics.ubc.ca> |
15 | | Website: https://github.com/kiyo-masui/bitshuffle |
16 | | |
17 | | Note: Adapted for c-blosc by Francesc Alted. |
18 | | |
19 | | See LICENSES/BITSHUFFLE.txt file for details about copyright and |
20 | | rights to use. |
21 | | **********************************************************************/ |
22 | | |
#include "bitshuffle-avx2.h"
#include "bitshuffle-sse2.h"
#include "bitshuffle-generic.h"
#include <stdlib.h>
#include <string.h>
27 | | |
28 | | /* Make sure AVX2 is available for the compilation target and compiler. */ |
29 | | #if defined(__AVX2__) |
30 | | |
31 | | #include <immintrin.h> |
32 | | |
33 | | /* The next is useful for debugging purposes */ |
34 | | #if 0 |
35 | | #include <stdio.h> |
36 | | #include <string.h> |
37 | | |
/* Debug helper: print the 32 bytes of a 256-bit vector as comma-separated
 * hexadecimal values followed by a newline. */
static void printymm(__m256i ymm0)
{
  uint8_t buf[32];

  /* buf is a plain byte array with no 32-byte alignment guarantee, so use an
   * explicit unaligned store instead of assigning through a __m256i pointer
   * (which the compiler may turn into an aligned store that faults). */
  _mm256_storeu_si256((__m256i *) buf, ymm0);
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
          buf[0], buf[1], buf[2], buf[3],
          buf[4], buf[5], buf[6], buf[7],
          buf[8], buf[9], buf[10], buf[11],
          buf[12], buf[13], buf[14], buf[15],
          buf[16], buf[17], buf[18], buf[19],
          buf[20], buf[21], buf[22], buf[23],
          buf[24], buf[25], buf[26], buf[27],
          buf[28], buf[29], buf[30], buf[31]);
}
53 | | #endif |
54 | | |
55 | | |
56 | | /* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */ |
57 | | |
58 | | |
59 | | |
/* Transpose bits within bytes.
 *
 * The input is treated as nbyte = size * elem_size bytes and processed in
 * 32-byte chunks.  For each chunk, _mm256_movemask_epi8 gathers the top bit
 * of every byte into a 32-bit mask; the vector is then shifted left by one
 * so the next lower bit becomes the top bit.  Pass kk therefore yields bit
 * (7 - kk) of each byte, and its mask is written at byte offset
 * ((7 - kk) * nbyte + ii) / 8 of the output.
 *
 * Bytes past the last full 32-byte chunk are handed to
 * bshuf_trans_bit_byte_remainder(), whose return value is returned as-is.
 */
int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
                                 const size_t elem_size) {

  size_t ii, kk;
  const char* in_b = (const char*) in;
  char* out_b = (char*) out;

  size_t nbyte = elem_size * size;

  int64_t count;

  __m256i ymm;
  int32_t bt;

  for (ii = 0; ii + 31 < nbyte; ii += 32) {
    ymm = _mm256_loadu_si256((const __m256i *) &in_b[ii]);
    for (kk = 0; kk < 8; kk++) {
      bt = _mm256_movemask_epi8(ymm);
      ymm = _mm256_slli_epi16(ymm, 1);
      /* The destination offset is not guaranteed to be 4-byte aligned and
       * out_b is a char buffer, so copy the mask with memcpy rather than
       * storing through a casted int32_t pointer. */
      memcpy(&out_b[((7 - kk) * nbyte + ii) / 8], &bt, sizeof(bt));
    }
  }
  /* Let the generic helper finish the tail, starting at the first byte not
   * covered by a full 32-byte chunk. */
  count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
                                         nbyte - nbyte % 32);
  return count;
}
89 | | |
90 | | |
/* Transpose bits within elements.
 *
 * Three passes are chained through a temporary heap buffer of
 * size * elem_size bytes:
 *   1. bshuf_trans_byte_elem_SSE : in      -> out
 *   2. bshuf_trans_bit_byte_AVX  : out     -> scratch
 *   3. bshuf_trans_bitrow_eight  : scratch -> out
 *
 * Returns -1 if the scratch buffer cannot be allocated; otherwise the result
 * of the last pass, or whatever CHECK_MULT_EIGHT / CHECK_ERR_FREE (defined
 * elsewhere) return when they bail out early.
 */
int64_t bshuf_trans_bit_elem_AVX(const void* in, void* out, const size_t size,
                                 const size_t elem_size) {
  int64_t rc;

  CHECK_MULT_EIGHT(size);

  void* scratch = malloc(size * elem_size);
  if (scratch == NULL) {
    return -1;
  }

  /* Pass 1: byte-level transpose straight into the caller's output. */
  rc = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
  CHECK_ERR_FREE(rc, scratch);

  /* Pass 2: bit-within-byte transpose into the scratch buffer. */
  rc = bshuf_trans_bit_byte_AVX(out, scratch, size, elem_size);
  CHECK_ERR_FREE(rc, scratch);

  /* Pass 3: final bit-row pass back into the caller's output. */
  rc = bshuf_trans_bitrow_eight(scratch, out, size, elem_size);

  free(scratch);

  return rc;
}
112 | | |
113 | | |
/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes.
 *
 * The input is read as nrows = 8 * elem_size rows of size / 8 bytes each and
 * written out transposed (size / 8 rows of nrows bytes).  Element sizes that
 * are not a multiple of 4 are delegated to bshuf_trans_byte_bitrow_SSE().
 * Full 32-column tiles go through the AVX2 unpack/permute network; the
 * remaining columns are transposed one byte at a time.
 *
 * Returns size * elem_size on the AVX2 path.
 */
int64_t bshuf_trans_byte_bitrow_AVX(const void* in, void* out, const size_t size,
                                    const size_t elem_size) {
  const char* src = (const char*) in;
  char* dst = (char*) out;

  CHECK_MULT_EIGHT(size);

  const size_t nrows = 8 * elem_size;
  const size_t row_len = size / 8;

  if (elem_size % 4 != 0) {
    return bshuf_trans_byte_bitrow_SSE(in, out, size, elem_size);
  }

  __m256i va[8];
  __m256i vb[8];
  __m256i staged[8][4];

  for (size_t col = 0; col + 31 < row_len; col += 32) {
    for (size_t eb = 0; eb + 3 < elem_size; eb += 4) {

      /* Four groups of eight input rows each (32 rows per iteration of eb). */
      for (size_t grp = 0; grp < 4; grp++) {
        const size_t first_row = eb * 8 + grp * 8;

        /* Load 32 bytes from each of the eight rows in this group. */
        for (size_t k = 0; k < 8; k++) {
          va[k] = _mm256_loadu_si256(
              (__m256i *) &src[(first_row + k) * row_len + col]);
        }

        /* Interleave at byte granularity. */
        for (size_t k = 0; k < 4; k++) {
          const __m256i even = va[k * 2];
          const __m256i odd = va[k * 2 + 1];
          vb[k] = _mm256_unpacklo_epi8(even, odd);
          vb[k + 4] = _mm256_unpackhi_epi8(even, odd);
        }

        /* Interleave at 16-bit granularity. */
        for (size_t k = 0; k < 2; k++) {
          for (size_t m = 0; m < 2; m++) {
            const __m256i lhs = vb[k * 4 + m * 2];
            const __m256i rhs = vb[k * 4 + m * 2 + 1];
            va[k * 4 + m] = _mm256_unpacklo_epi16(lhs, rhs);
            va[k * 4 + m + 2] = _mm256_unpackhi_epi16(lhs, rhs);
          }
        }

        /* Interleave at 32-bit granularity. */
        for (size_t k = 0; k < 4; k++) {
          const __m256i lhs = va[k * 2];
          const __m256i rhs = va[k * 2 + 1];
          vb[k * 2] = _mm256_unpacklo_epi32(lhs, rhs);
          vb[k * 2 + 1] = _mm256_unpackhi_epi32(lhs, rhs);
        }

        /* Park this group's eight vectors until all four groups are done. */
        for (size_t k = 0; k < 8; k++) {
          staged[k][grp] = vb[k];
        }
      }

      /* Combine the four groups and emit four output rows per iteration. */
      for (size_t m = 0; m < 8; m++) {
        for (size_t k = 0; k < 4; k++) {
          va[k] = staged[m][k];
        }

        /* Interleave at 64-bit granularity. */
        vb[0] = _mm256_unpacklo_epi64(va[0], va[1]);
        vb[1] = _mm256_unpacklo_epi64(va[2], va[3]);
        vb[2] = _mm256_unpackhi_epi64(va[0], va[1]);
        vb[3] = _mm256_unpackhi_epi64(va[2], va[3]);

        /* 0x20 pairs the low 128-bit halves, 0x31 pairs the high halves. */
        va[0] = _mm256_permute2x128_si256(vb[0], vb[1], 0x20);
        va[1] = _mm256_permute2x128_si256(vb[2], vb[3], 0x20);
        va[2] = _mm256_permute2x128_si256(vb[0], vb[1], 0x31);
        va[3] = _mm256_permute2x128_si256(vb[2], vb[3], 0x31);

        const size_t out_row = col + m * 2;
        const size_t out_col = eb * 8;

        _mm256_storeu_si256(
            (__m256i *) &dst[(out_row + 0 * 16) * nrows + out_col], va[0]);
        _mm256_storeu_si256(
            (__m256i *) &dst[(out_row + 0 * 16 + 1) * nrows + out_col], va[1]);
        _mm256_storeu_si256(
            (__m256i *) &dst[(out_row + 1 * 16) * nrows + out_col], va[2]);
        _mm256_storeu_si256(
            (__m256i *) &dst[(out_row + 1 * 16 + 1) * nrows + out_col], va[3]);
      }
    }
  }

  /* Columns beyond the last full 32-wide tile: plain byte transpose. */
  const size_t tail_start = row_len - row_len % 32;
  for (size_t row = 0; row < nrows; row++) {
    for (size_t col = tail_start; col < row_len; col++) {
      dst[col * nrows + row] = src[row * row_len + col];
    }
  }
  return size * elem_size;
}
208 | | |
209 | | |
/* Shuffle bits within the bytes of eight element blocks.
 *
 * The buffer is walked in blocks of 8 * elem_size bytes.  Within a block,
 * each 32-byte slice is loaded and reduced with eight movemask/shift passes:
 * pass kk collects bit (7 - kk) of every byte into a 32-bit mask, which is
 * written at offset ii + jj / 8 + (7 - kk) * elem_size of the output.
 *
 * Element sizes that are not a multiple of 4 are delegated to
 * bshuf_shuffle_bit_eightelem_SSE().  On the AVX2 path the function returns
 * size * elem_size.
 */
int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t size,
                                        const size_t elem_size) {

  CHECK_MULT_EIGHT(size);

  // With a bit of care, this could be written such that it is
  // in_buf = out_buf safe.
  const char* in_b = (const char*) in;
  char* out_b = (char*) out;

  size_t ii, jj, kk;
  size_t nbyte = elem_size * size;

  __m256i ymm;
  int32_t bt;

  if (elem_size % 4) {
    return bshuf_shuffle_bit_eightelem_SSE(in, out, size, elem_size);
  } else {
    for (jj = 0; jj + 31 < 8 * elem_size; jj += 32) {
      for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
           ii += 8 * elem_size) {
        ymm = _mm256_loadu_si256((const __m256i *) &in_b[ii + jj]);
        for (kk = 0; kk < 8; kk++) {
          bt = _mm256_movemask_epi8(ymm);
          ymm = _mm256_slli_epi16(ymm, 1);
          size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
          /* out_b is a char buffer of unknown alignment: copy the mask with
           * memcpy instead of storing through a casted int32_t pointer. */
          memcpy(&out_b[ind], &bt, sizeof(bt));
        }
      }
    }
  }
  return size * elem_size;
}
245 | | |
246 | | |
/* Untranspose bits within elements.
 *
 * Two passes are chained through a temporary heap buffer of
 * size * elem_size bytes:
 *   1. bshuf_trans_byte_bitrow_AVX     : in      -> scratch
 *   2. bshuf_shuffle_bit_eightelem_AVX : scratch -> out
 *
 * Returns -1 if the scratch buffer cannot be allocated; otherwise the result
 * of the second pass, or whatever CHECK_MULT_EIGHT / CHECK_ERR_FREE (defined
 * elsewhere) return when they bail out early.
 */
int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
                                   const size_t elem_size) {
  int64_t rc;

  CHECK_MULT_EIGHT(size);

  void* scratch = malloc(size * elem_size);
  if (scratch == NULL) {
    return -1;
  }

  /* Pass 1: byte transpose of the bit rows into the scratch buffer. */
  rc = bshuf_trans_byte_bitrow_AVX(in, scratch, size, elem_size);
  CHECK_ERR_FREE(rc, scratch);

  /* Pass 2: bit shuffle within eight-element blocks into the output. */
  rc = bshuf_shuffle_bit_eightelem_AVX(scratch, out, size, elem_size);

  free(scratch);
  return rc;
}
265 | | |
/* Set to true in builds where __AVX2__ is defined, i.e. where the AVX2
 * routines in this file are compiled as real implementations. */
const bool is_bshuf_AVX = true;
267 | | |
268 | | #else /* defined(__AVX2__) */ |
269 | | |
/* Set to false in builds where __AVX2__ is not defined; the *_AVX entry
 * points in this branch are stubs that abort. */
const bool is_bshuf_AVX = false;
271 | | |
/* Stub used when the file is built without AVX2 support: it keeps the symbol
 * defined but terminates the process via abort() if it is ever called. */
int64_t
bshuf_trans_bit_elem_AVX(const void* in, void* out, const size_t size,
                         const size_t elem_size) {
  /* Parameters are intentionally unused in this stub. */
  (void) in;
  (void) out;
  (void) size;
  (void) elem_size;
  abort();
}
277 | | |
/* Stub used when the file is built without AVX2 support: it keeps the symbol
 * defined but terminates the process via abort() if it is ever called. */
int64_t
bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
                           const size_t elem_size) {
  /* Parameters are intentionally unused in this stub. */
  (void) in;
  (void) out;
  (void) size;
  (void) elem_size;
  abort();
}
283 | | |
284 | | #endif /* defined(__AVX2__) */ |