/src/c-blosc2/blosc/bitshuffle-avx512.c
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************* |
2 | | Blosc - Blocked Shuffling and Compression Library |
3 | | |
4 | | Copyright (c) 2021 Blosc Development Team <blosc@blosc.org> |
5 | | https://blosc.org |
6 | | License: BSD 3-Clause (see LICENSE.txt) |
7 | | |
8 | | See LICENSE.txt for details about copyright and rights to use. |
9 | | **********************************************************************/ |
10 | | |
11 | | /********************************************************************* |
12 | | Bitshuffle - Filter for improving compression of typed binary data. |
13 | | |
14 | | Author: Kiyoshi Masui <kiyo@physics.ubc.ca> |
15 | | Website: https://github.com/kiyo-masui/bitshuffle |
16 | | |
17 | | Note: Adapted for c-blosc2 by Francesc Alted. |
18 | | |
19 | | See LICENSES/BITSHUFFLE.txt file for details about copyright and |
20 | | rights to use. |
21 | | **********************************************************************/ |
22 | | |
#include "bitshuffle-avx512.h"
#include "bitshuffle-avx2.h"
#include "bitshuffle-sse2.h"
#include "bitshuffle-generic.h"
#include <stdlib.h>
#include <string.h>
28 | | |
29 | | /* Make sure AVX512 is available for the compilation target and compiler. */ |
30 | | #if defined(__AVX512F__) && defined (__AVX512BW__) |
31 | | #include <immintrin.h> |
32 | | |
33 | | |
/**
 * Transpose bits within bytes.
 *
 * The input is viewed as nbyte = size * elem_size bytes.  For each bit
 * position b (0..7), bit b of input bytes ii..ii+63 (or ii..ii+31 in the
 * 256-bit tail loop) is packed into one mask word, with input byte ii + j
 * landing in bit j of that word.  The word is written to the output at byte
 * offset (b * nbyte + ii) / 8.
 *
 * Bytes are handled in three stages: 64 at a time with AVX512, then at most
 * one group of 32 with AVX2, then whatever is left is handed to
 * bshuf_trans_bit_byte_remainder() starting at byte nbyte - nbyte % 32.
 *
 * @param in        source buffer, at least size * elem_size bytes.
 * @param out       destination buffer, at least size * elem_size bytes.
 * @param size      number of elements.
 * @param elem_size size of one element in bytes.
 * @return the value returned by bshuf_trans_bit_byte_remainder()
 *         (NOTE(review): presumably a byte count or a negative error code --
 *         confirm against bitshuffle-generic.h).
 */
int64_t bshuf_trans_bit_byte_AVX512(const void* in, void* out, const size_t size,
                                    const size_t elem_size) {
  const char* in_b = (const char*) in;
  char* out_b = (char*) out;
  const size_t nbyte = elem_size * size;
  size_t ii, kk;

  /* 512-bit stage: the loop condition already skips inputs shorter than 64. */
  const __m512i zero = _mm512_set1_epi8(0);
  for (ii = 0; ii + 63 < nbyte; ii += 64) {
    __m512i zmm = _mm512_loadu_si512(&in_b[ii]);
    for (kk = 0; kk < 8; kk++) {
      /* A signed "less than zero" test extracts the current top bit of every
       * byte; shifting left by one then exposes the next lower bit. */
      const __mmask64 bt = _mm512_cmp_epi8_mask(zmm, zero, _MM_CMPINT_LT);
      zmm = _mm512_slli_epi16(zmm, 1);
      /* The destination offset is in general not a multiple of 8, so the
       * mask is stored with memcpy rather than through an int64_t pointer
       * (which would be a misaligned, aliasing-violating access). */
      memcpy(&out_b[((7 - kk) * nbyte + ii) / 8], &bt, sizeof bt);
    }
  }

  /* 256-bit stage: picks up where the 512-bit loop stopped. */
  for (ii = nbyte - nbyte % 64; ii + 31 < nbyte; ii += 32) {
    __m256i ymm = _mm256_loadu_si256((const __m256i*) &in_b[ii]);
    for (kk = 0; kk < 8; kk++) {
      const int32_t bt32 = _mm256_movemask_epi8(ymm);
      ymm = _mm256_slli_epi16(ymm, 1);
      memcpy(&out_b[((7 - kk) * nbyte + ii) / 8], &bt32, sizeof bt32);
    }
  }

  /* nbyte % 64 % 32 == nbyte % 32, so this is the first byte not yet done. */
  return bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
                                        nbyte - nbyte % 32);
}
81 | | |
82 | | |
/**
 * Transpose bits within elements.
 *
 * Chains three passes, each reading what the previous one wrote:
 *   1. bshuf_trans_byte_elem_SSE    : in      -> out
 *   2. bshuf_trans_bit_byte_AVX512  : out     -> scratch
 *   3. bshuf_trans_bitrow_eight     : scratch -> out
 *
 * A scratch buffer of size * elem_size bytes is allocated for the duration
 * of the call and released on every path past the allocation.
 *
 * @return -1 if the scratch buffer cannot be allocated; otherwise the result
 *         of the last pass, or whatever CHECK_MULT_EIGHT / CHECK_ERR_FREE
 *         return when they bail out early.
 */
int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size,
                                    const size_t elem_size) {
  int64_t status;
  void* scratch;

  CHECK_MULT_EIGHT(size);

  scratch = malloc(size * elem_size);
  if (scratch == NULL) {
    return -1;
  }

  status = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
  CHECK_ERR_FREE(status, scratch);

  status = bshuf_trans_bit_byte_AVX512(out, scratch, size, elem_size);
  CHECK_ERR_FREE(status, scratch);

  status = bshuf_trans_bitrow_eight(scratch, out, size, elem_size);
  free(scratch);

  return status;
}
105 | | |
/**
 * Shuffle bits within the bytes of eight element blocks.
 *
 * When elem_size is not a multiple of 8 the work is delegated to
 * bshuf_shuffle_bit_eightelem_AVX().  Otherwise the input is processed in
 * blocks of 8 * elem_size bytes: each 64-byte chunk at offset jj inside the
 * block starting at ii is loaded, and for every bit position b (0..7) the
 * 64-bit mask of bit b across those bytes is written to
 * out[ii + jj / 8 + b * elem_size].
 *
 * With a bit of care, this could be written such that it is
 * in_buf = out_buf safe.
 *
 * @param in        source buffer, size * elem_size bytes.
 * @param out       destination buffer, size * elem_size bytes.
 * @param size      number of elements; checked by CHECK_MULT_EIGHT.
 * @param elem_size size of one element in bytes.
 * @return size * elem_size on the AVX512 path, or the result of
 *         bshuf_shuffle_bit_eightelem_AVX() on the fallback path.
 */
int64_t bshuf_shuffle_bit_eightelem_AVX512(const void* in, void* out, const size_t size,
                                           const size_t elem_size) {
  CHECK_MULT_EIGHT(size);

  if (elem_size % 8) {
    return bshuf_shuffle_bit_eightelem_AVX(in, out, size, elem_size);
  }

  const char* in_b = (const char*) in;
  char* out_b = (char*) out;
  const size_t nbyte = elem_size * size;
  const size_t block = 8 * elem_size;  /* bytes in one eight-element block */
  const __m512i zero = _mm512_set1_epi8(0);
  size_t ii, jj, kk;

  for (jj = 0; jj + 63 < block; jj += 64) {
    for (ii = 0; ii + block - 1 < nbyte; ii += block) {
      __m512i zmm = _mm512_loadu_si512(&in_b[ii + jj]);
      for (kk = 0; kk < 8; kk++) {
        /* Signed "less than zero" yields the current top bit of each byte;
         * the shift then moves the next lower bit into the top position. */
        const __mmask64 bt = _mm512_cmp_epi8_mask(zmm, zero, _MM_CMPINT_LT);
        zmm = _mm512_slli_epi16(zmm, 1);
        /* memcpy instead of a store through an int64_t pointer: `out` is a
         * byte buffer with no alignment guarantee. */
        memcpy(&out_b[ii + jj / 8 + (7 - kk) * elem_size], &bt, sizeof bt);
      }
    }
  }

  return size * elem_size;
}
143 | | |
/**
 * Untranspose bits within elements.
 *
 * Runs two passes through a temporary buffer of size * elem_size bytes:
 *   1. bshuf_trans_byte_bitrow_AVX         : in      -> scratch
 *   2. bshuf_shuffle_bit_eightelem_AVX512  : scratch -> out
 *
 * @return -1 if the temporary buffer cannot be allocated; otherwise the
 *         result of the second pass, or whatever CHECK_MULT_EIGHT /
 *         CHECK_ERR_FREE return when they bail out early.
 */
int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size,
                                      const size_t elem_size) {
  int64_t status;
  void* scratch;

  CHECK_MULT_EIGHT(size);

  scratch = malloc(size * elem_size);
  if (scratch == NULL) {
    return -1;
  }

  status = bshuf_trans_byte_bitrow_AVX(in, scratch, size, elem_size);
  CHECK_ERR_FREE(status, scratch);

  status = bshuf_shuffle_bit_eightelem_AVX512(scratch, out, size, elem_size);
  free(scratch);

  return status;
}
162 | | |
/* Build-time flag: true in this branch, which is compiled only when both
 * __AVX512F__ and __AVX512BW__ are defined, i.e. the AVX512 routines above
 * are real implementations. */
163 | | const bool is_bshuf_AVX512 = true; |
164 | | |
165 | | #else /* defined(__AVX512F__) && defined (__AVX512BW__) */ |
166 | | |
/* Build-time flag: false in this branch, which is compiled when __AVX512F__
 * or __AVX512BW__ is not defined; the functions below are aborting stubs. */
167 | | const bool is_bshuf_AVX512 = false; |
168 | | |
/* Placeholder compiled when AVX512F/AVX512BW are not both available at build
 * time.  It performs no work: calling it terminates the process via abort(). */
int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size,
                                    const size_t elem_size) {
  /* Arguments are intentionally ignored. */
  (void) in;
  (void) out;
  (void) size;
  (void) elem_size;
  abort();
}
174 | | |
/* Placeholder compiled when AVX512F/AVX512BW are not both available at build
 * time.  It performs no work: calling it terminates the process via abort(). */
int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size,
                                      const size_t elem_size) {
  /* Arguments are intentionally ignored. */
  (void) in;
  (void) out;
  (void) size;
  (void) elem_size;
  abort();
}
180 | | |
181 | | #endif /* defined(__AVX512F__) && defined (__AVX512BW__) */ |