Coverage Report

Created: 2026-03-19 06:40

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/c-blosc/blosc/bitshuffle-avx2.c
Line
Count
Source
1
/*
2
 * Bitshuffle - Filter for improving compression of typed binary data.
3
 *
4
 * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
5
 * Website: https://github.com/kiyo-masui/bitshuffle
6
 * Created: 2014
7
 *
8
 * Note: Adapted for c-blosc by Francesc Alted.
9
 *
10
 * See LICENSES/BITSHUFFLE.txt file for details about copyright and
11
 * rights to use.
12
 *
13
 */
14
15
#include "bitshuffle-generic.h"
#include "bitshuffle-sse2.h"
#include "bitshuffle-avx2.h"

#include <string.h>
18
19
20
/* Define dummy functions if AVX2 is not available for the compilation target and compiler. */
21
#if !defined(__AVX2__)
22
#include <stdlib.h>
23
24
/* Placeholder compiled when __AVX2__ is not defined.
 * Every argument is ignored; calling it terminates the process via abort(),
 * so it never returns a value. */
int64_t blosc_internal_bshuf_trans_bit_elem_avx2(void* in, void* out, const size_t size,
                                                 const size_t elem_size, void* tmp_buf) {
    (void) in;
    (void) out;
    (void) size;
    (void) elem_size;
    (void) tmp_buf;

    abort();
}
28
29
/* Placeholder compiled when __AVX2__ is not defined.
 * Every argument is ignored; calling it terminates the process via abort(),
 * so it never returns a value. */
int64_t blosc_internal_bshuf_untrans_bit_elem_avx2(void* in, void* out, const size_t size,
                                                   const size_t elem_size, void* tmp_buf) {
    (void) in;
    (void) out;
    (void) size;
    (void) elem_size;
    (void) tmp_buf;

    abort();
}
33
34
#else /* defined(__AVX2__) */
35
36
#include <immintrin.h>
37
38
/* The next is useful for debugging purposes */
39
#if 0
40
#include <stdio.h>
41
#include <string.h>
42
43
/* Debug helper: print the 32 bytes held in a 256-bit register to stdout as
 * comma-separated hexadecimal values, in memory order, followed by a newline. */
static void printymm(__m256i ymm0)
{
  uint8_t buf[32];

  /* buf is a plain byte array with no 32-byte alignment guarantee, so spill
   * the register with the unaligned store rather than by assigning through
   * a __m256i pointer. */
  _mm256_storeu_si256((__m256i *) buf, ymm0);
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
          buf[0], buf[1], buf[2], buf[3],
          buf[4], buf[5], buf[6], buf[7],
          buf[8], buf[9], buf[10], buf[11],
          buf[12], buf[13], buf[14], buf[15],
          buf[16], buf[17], buf[18], buf[19],
          buf[20], buf[21], buf[22], buf[23],
          buf[24], buf[25], buf[26], buf[27],
          buf[28], buf[29], buf[30], buf[31]);
}
58
#endif
59
60
61
/* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */
62
63
64
/* Transpose bits within bytes.
 *
 * The input is treated as nbyte = size * elem_size bytes. Each full 32-byte
 * chunk starting at byte ii yields eight 32-bit masks, one per bit position:
 * the mask collecting bit b of every byte in the chunk is written to `out`
 * at byte offset (b * nbyte + ii) / 8.
 *
 * Bytes past the last full 32-byte chunk are delegated to
 * blosc_internal_bshuf_trans_bit_byte_remainder(), which receives the offset
 * of the first unprocessed byte; its return value is returned unchanged.
 */
static int64_t bshuf_trans_bit_byte_avx2(void* in, void* out, const size_t size,
                                         const size_t elem_size) {

    char* in_b = (char*) in;
    char* out_b = (char*) out;

    size_t nbyte = elem_size * size;

    int64_t count;

    __m256i ymm;
    int32_t bt;
    size_t ii, kk;

    for (ii = 0; ii + 31 < nbyte; ii += 32) {
        ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
        for (kk = 0; kk < 8; kk++) {
            /* movemask gathers the top bit of each byte; shifting left by one
             * afterwards exposes the next lower bit for the following pass. */
            bt = _mm256_movemask_epi8(ymm);
            ymm = _mm256_slli_epi16(ymm, 1);
            /* The destination offset is not 4-byte aligned in general and the
             * buffer is byte-typed, so copy the mask instead of storing it
             * through an int32_t pointer. */
            memcpy(&out_b[((7 - kk) * nbyte + ii) / 8], &bt, sizeof(bt));
        }
    }
    count = blosc_internal_bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
            nbyte - nbyte % 32);
    return count;
}
93
94
/* Transpose bits within elements.
 *
 * Chains three passes, using `out` and `tmp_buf` alternately as scratch:
 *   1. blosc_internal_bshuf_trans_byte_elem_sse2:  in      -> out
 *   2. bshuf_trans_bit_byte_avx2:                  out     -> tmp_buf
 *   3. blosc_internal_bshuf_trans_bitrow_eight:    tmp_buf -> out
 *
 * `size` is validated with CHECK_MULT_EIGHT and the results of the first two
 * passes with CHECK_ERR (both macros are defined outside this file). The
 * value returned is whatever the third pass returns.
 */
int64_t blosc_internal_bshuf_trans_bit_elem_avx2(void* in, void* out, const size_t size,
                                                 const size_t elem_size, void* tmp_buf) {
    int64_t rc;

    CHECK_MULT_EIGHT(size);

    rc = blosc_internal_bshuf_trans_byte_elem_sse2(in, out, size, elem_size, tmp_buf);
    CHECK_ERR(rc);

    rc = bshuf_trans_bit_byte_avx2(out, tmp_buf, size, elem_size);
    CHECK_ERR(rc);

    return blosc_internal_bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
}
109
110
/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes.
 *
 * The input is read as 8 * elem_size rows of size / 8 bytes each; the output
 * is indexed as out[column * rows + row] (see the scalar tail loop below).
 * When elem_size is not a multiple of 4 the work is handed to
 * blosc_internal_bshuf_trans_byte_bitrow_sse2(). Otherwise the data is
 * processed in tiles of 32 columns by 32 rows using AVX2 unpack/permute
 * operations, and the columns left over after the last full group of 32 are
 * copied one byte at a time.
 *
 * Returns size * elem_size on the AVX2 path.
 */
static int64_t bshuf_trans_byte_bitrow_avx2(void* in, void* out, const size_t size,
                                            const size_t elem_size) {

    const char* src = (const char*) in;
    char* dst = (char*) out;

    const size_t row_count = 8 * elem_size;
    const size_t row_len = size / 8;
    const size_t tail_start = row_len - row_len % 32;
    size_t col, elem, grp, lane, pair, row;

    __m256i a[8];
    __m256i b[8];
    __m256i stash[8][4];

    CHECK_MULT_EIGHT(size);

    if (elem_size % 4) {
        return blosc_internal_bshuf_trans_byte_bitrow_sse2(in, out, size, elem_size);
    }

    for (col = 0; col + 31 < row_len; col += 32) {
        for (elem = 0; elem + 3 < elem_size; elem += 4) {

            /* Each group covers eight consecutive rows (one byte of the
             * element); four groups make up the 32-row tile. */
            for (grp = 0; grp < 4; grp++) {
                const char* rows = src + (elem + grp) * 8 * row_len + col;

                for (lane = 0; lane < 8; lane++) {
                    a[lane] = _mm256_loadu_si256(
                            (const __m256i *) (rows + lane * row_len));
                }

                /* Interleave at 8-bit granularity. */
                for (pair = 0; pair < 4; pair++) {
                    const __m256i even = a[2 * pair];
                    const __m256i odd = a[2 * pair + 1];
                    b[pair] = _mm256_unpacklo_epi8(even, odd);
                    b[pair + 4] = _mm256_unpackhi_epi8(even, odd);
                }

                /* Interleave at 16-bit granularity. */
                for (lane = 0; lane < 8; lane += 4) {
                    for (pair = 0; pair < 2; pair++) {
                        const __m256i even = b[lane + 2 * pair];
                        const __m256i odd = b[lane + 2 * pair + 1];
                        a[lane + pair] = _mm256_unpacklo_epi16(even, odd);
                        a[lane + pair + 2] = _mm256_unpackhi_epi16(even, odd);
                    }
                }

                /* Interleave at 32-bit granularity. */
                for (pair = 0; pair < 4; pair++) {
                    const __m256i even = a[2 * pair];
                    const __m256i odd = a[2 * pair + 1];
                    b[2 * pair] = _mm256_unpacklo_epi32(even, odd);
                    b[2 * pair + 1] = _mm256_unpackhi_epi32(even, odd);
                }

                /* Park this group's result until all four are available. */
                for (lane = 0; lane < 8; lane++) {
                    stash[lane][grp] = b[lane];
                }
            }

            /* Combine the four groups at 64- and 128-bit granularity and
             * write four output rows per iteration. */
            for (pair = 0; pair < 8; pair++) {
                const __m256i q0 = stash[pair][0];
                const __m256i q1 = stash[pair][1];
                const __m256i q2 = stash[pair][2];
                const __m256i q3 = stash[pair][3];

                const __m256i lo01 = _mm256_unpacklo_epi64(q0, q1);
                const __m256i lo23 = _mm256_unpacklo_epi64(q2, q3);
                const __m256i hi01 = _mm256_unpackhi_epi64(q0, q1);
                const __m256i hi23 = _mm256_unpackhi_epi64(q2, q3);

                char* dst_row = dst + (col + 2 * pair) * row_count + elem * 8;

                /* 0x20 selects the low 128-bit halves of both operands,
                 * 0x31 the high halves. */
                _mm256_storeu_si256((__m256i *) dst_row,
                        _mm256_permute2x128_si256(lo01, lo23, 0x20));
                _mm256_storeu_si256((__m256i *) (dst_row + row_count),
                        _mm256_permute2x128_si256(hi01, hi23, 0x20));
                _mm256_storeu_si256((__m256i *) (dst_row + 16 * row_count),
                        _mm256_permute2x128_si256(lo01, lo23, 0x31));
                _mm256_storeu_si256((__m256i *) (dst_row + 17 * row_count),
                        _mm256_permute2x128_si256(hi01, hi23, 0x31));
            }
        }
    }

    /* Scalar transpose of the columns that did not fill a 32-wide tile. */
    for (row = 0; row < row_count; row++) {
        for (col = tail_start; col < row_len; col++) {
            dst[col * row_count + row] = src[row * row_len + col];
        }
    }
    return size * elem_size;
}
204
205
206
/* Shuffle bits within the bytes of eight element blocks.
 *
 * The buffer is walked in blocks of 8 * elem_size bytes. Within each block,
 * every 32-byte chunk at offset jj is turned into eight 32-bit masks (one per
 * bit position, taken with movemask); the mask for bit b is written at byte
 * offset ii + jj / 8 + b * elem_size of `out`, where ii is the block start.
 *
 * When elem_size is not a multiple of 4 the work is handed to
 * blosc_internal_bshuf_shuffle_bit_eightelem_sse2() and its result returned.
 * Otherwise the function returns size * elem_size.
 */
static int64_t bshuf_shuffle_bit_eightelem_avx2(void* in, void* out, const size_t size,
                                                const size_t elem_size) {

    CHECK_MULT_EIGHT(size);

    /*  With a bit of care, this could be written such that it is */
    /*  in_buf = out_buf safe. */
    char* in_b = (char*) in;
    char* out_b = (char*) out;

    size_t nbyte = elem_size * size;
    size_t ii, jj, kk, ind;

    __m256i ymm;
    int32_t bt;

    if (elem_size % 4) {
        return blosc_internal_bshuf_shuffle_bit_eightelem_sse2(in, out, size, elem_size);
    } else {
        for (jj = 0; jj + 31 < 8 * elem_size; jj += 32) {
            for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
                    ii += 8 * elem_size) {
                ymm = _mm256_loadu_si256((__m256i *) &in_b[ii + jj]);
                for (kk = 0; kk < 8; kk++) {
                    /* Take the current top bit of every byte, then shift so
                     * the next lower bit becomes the top bit. */
                    bt = _mm256_movemask_epi8(ymm);
                    ymm = _mm256_slli_epi16(ymm, 1);
                    ind = (ii + jj / 8 + (7 - kk) * elem_size);
                    /* out_b is a byte buffer whose alignment is up to the
                     * caller; copy the mask rather than storing through an
                     * int32_t pointer. */
                    memcpy(&out_b[ind], &bt, sizeof(bt));
                }
            }
        }
    }
    return size * elem_size;
}
241
242
243
/* Untranspose bits within elements.
 *
 * Runs two passes:
 *   1. bshuf_trans_byte_bitrow_avx2:      in      -> tmp_buf
 *   2. bshuf_shuffle_bit_eightelem_avx2:  tmp_buf -> out
 *
 * `size` is validated with CHECK_MULT_EIGHT and the first pass's result with
 * CHECK_ERR (both macros are defined outside this file). The value returned
 * is whatever the second pass returns.
 */
int64_t blosc_internal_bshuf_untrans_bit_elem_avx2(void* in, void* out, const size_t size,
                                                   const size_t elem_size, void* tmp_buf) {
    int64_t rc;

    CHECK_MULT_EIGHT(size);

    rc = bshuf_trans_byte_bitrow_avx2(in, tmp_buf, size, elem_size);
    CHECK_ERR(rc);

    return bshuf_shuffle_bit_eightelem_avx2(tmp_buf, out, size, elem_size);
}
257
258
#endif /* !defined(__AVX2__) */