Coverage Report

Created: 2026-01-10 06:55

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/c-blosc/blosc/bitshuffle-avx2.c
Line
Count
Source
1
/*
2
 * Bitshuffle - Filter for improving compression of typed binary data.
3
 *
4
 * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
5
 * Website: https://github.com/kiyo-masui/bitshuffle
6
 * Created: 2014
7
 *
8
 * Note: Adapted for c-blosc by Francesc Alted.
9
 *
10
 * See LICENSES/BITSHUFFLE.txt file for details about copyright and
11
 * rights to use.
12
 *
13
 */
14
15
#include "bitshuffle-generic.h"
16
#include "bitshuffle-sse2.h"
17
#include "bitshuffle-avx2.h"
18
19
20
/* Define dummy functions if AVX2 is not available for the compilation target and compiler. */
21
#if !defined(__AVX2__)
22
#include <stdlib.h>
23
24
/* Placeholder compiled when the target/compiler has no AVX2 support.
 * It keeps the symbol available for linking; calling it terminates the
 * program through abort(). */
int64_t blosc_internal_bshuf_trans_bit_elem_avx2(void* in, void* out, const size_t size,
                                                 const size_t elem_size, void* tmp_buf) {
    /* None of the arguments are used by the stub. */
    (void) in;
    (void) out;
    (void) size;
    (void) elem_size;
    (void) tmp_buf;

    abort();
}
28
29
/* Placeholder compiled when the target/compiler has no AVX2 support.
 * It keeps the symbol available for linking; calling it terminates the
 * program through abort(). */
int64_t blosc_internal_bshuf_untrans_bit_elem_avx2(void* in, void* out, const size_t size,
                                                   const size_t elem_size, void* tmp_buf) {
    /* None of the arguments are used by the stub. */
    (void) in;
    (void) out;
    (void) size;
    (void) elem_size;
    (void) tmp_buf;

    abort();
}
33
34
#else /* defined(__AVX2__) */
35
36
#include <immintrin.h>
#include <string.h>
37
38
/* The next is useful for debugging purposes */
39
#if 0
40
#include <stdio.h>
41
#include <string.h>
42
43
/* Debug helper: print the 32 bytes of a 256-bit vector as comma-separated
 * hexadecimal values followed by a newline. */
static void printymm(__m256i ymm0)
{
  uint8_t buf[32];
  size_t i;

  /* Copy the vector out bytewise. `buf` carries no 32-byte alignment
   * guarantee, so assigning through a casted __m256i pointer (an aligned
   * vector store) could fault. */
  memcpy(buf, &ymm0, sizeof(buf));

  for (i = 0; i < sizeof(buf); i++) {
    /* Cast so the argument type matches the %x conversion. */
    printf("%s%x", i ? "," : "", (unsigned) buf[i]);
  }
  printf("\n");
}
58
#endif
59
60
61
/* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */
62
63
64
/* Transpose bits within bytes.
 *
 * Walks the `size * elem_size` input bytes in 32-byte vectors. For each
 * vector the most significant bit of every byte is gathered into a 32-bit
 * mask, the vector is shifted left by one bit, and this is repeated eight
 * times. The mask taken on pass kk is written into region (7 - kk) of `out`,
 * where each region is nbyte / 8 bytes long. Bytes beyond the last complete
 * 32-byte vector are handled by
 * blosc_internal_bshuf_trans_bit_byte_remainder(), whose result is returned.
 */
static int64_t bshuf_trans_bit_byte_avx2(void* in, void* out, const size_t size,
                                         const size_t elem_size) {

    const char* in_b = (const char*) in;
    char* out_b = (char*) out;

    const size_t nbyte = elem_size * size;
    size_t ii, kk;

    for (ii = 0; ii + 31 < nbyte; ii += 32) {
        __m256i ymm = _mm256_loadu_si256((const __m256i *) &in_b[ii]);
        for (kk = 0; kk < 8; kk++) {
            /* One bit per input byte: the current top bit of each lane. */
            const int32_t bt = _mm256_movemask_epi8(ymm);
            /* Bring the next lower bit of every byte into the top position. */
            ymm = _mm256_slli_epi16(ymm, 1);
            /* The destination offset is not guaranteed to be 4-byte aligned,
             * and `out` is a byte buffer, so store the mask with memcpy
             * rather than through a casted int32_t pointer. */
            memcpy(&out_b[((7 - kk) * nbyte + ii) / 8], &bt, sizeof(bt));
        }
    }

    /* Finish the trailing nbyte % 32 bytes with the generic helper. */
    return blosc_internal_bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
            nbyte - nbyte % 32);
}
93
94
/* Transpose bits within elements.
 *
 * Chains three passes, alternating between `out` and `tmp_buf`:
 *   1. blosc_internal_bshuf_trans_byte_elem_sse2():  in      -> out
 *      (tmp_buf is handed over as its scratch argument)
 *   2. bshuf_trans_bit_byte_avx2():                  out     -> tmp_buf
 *   3. blosc_internal_bshuf_trans_bitrow_eight():    tmp_buf -> out
 *
 * CHECK_MULT_EIGHT and CHECK_ERR come from a header not shown here;
 * presumably they return early when `size` is not a multiple of eight or
 * when a pass reports a negative count. The result of the last pass is
 * returned as is.
 */
int64_t blosc_internal_bshuf_trans_bit_elem_avx2(void* in, void* out, const size_t size,
                                                 const size_t elem_size, void* tmp_buf) {
    CHECK_MULT_EIGHT(size);

    int64_t rc = blosc_internal_bshuf_trans_byte_elem_sse2(in, out, size, elem_size, tmp_buf);
    CHECK_ERR(rc);

    rc = bshuf_trans_bit_byte_avx2(out, tmp_buf, size, elem_size);
    CHECK_ERR(rc);

    return blosc_internal_bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
}
109
110
/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes.
 *
 * `in` is read as a byte matrix of (8 * elem_size) rows by (size / 8)
 * columns; `out` receives the same bytes with rows and columns swapped.
 *
 * When elem_size is not a multiple of 4 the call is forwarded to the SSE2
 * routine. Otherwise the matrix is processed in tiles of 32 input rows by
 * 32 input columns using AVX2 unpack/permute instructions, and the columns
 * left over after the last complete 32-column tile are copied one byte at a
 * time.
 *
 * Returns size * elem_size on the AVX2 path, or the SSE2 routine's result.
 */
static int64_t bshuf_trans_byte_bitrow_avx2(void* in, void* out, const size_t size,
                                            const size_t elem_size) {

    char* src = (char*) in;
    char* dst = (char*) out;

    const size_t nrows = 8 * elem_size;
    const size_t row_len = size / 8;
    const size_t tail_start = row_len - row_len % 32;
    size_t col, el, grp, k, p, r;

    CHECK_MULT_EIGHT(size);

    if (elem_size % 4) {
        return blosc_internal_bshuf_trans_byte_bitrow_sse2(in, out, size, elem_size);
    }

    __m256i va[8];
    __m256i vb[8];
    __m256i stash[8][4];

    for (col = 0; col + 31 < row_len; col += 32) {
        for (el = 0; el + 3 < elem_size; el += 4) {

            /* Four groups of eight consecutive input rows make up one tile. */
            for (grp = 0; grp < 4; grp++) {
                const size_t first_row = (el + grp) * 8;

                for (k = 0; k < 8; k++) {
                    va[k] = _mm256_loadu_si256(
                            (__m256i *) &src[(first_row + k) * row_len + col]);
                }

                /* Interleave neighbouring rows at 8-bit granularity. */
                for (k = 0; k < 4; k++) {
                    vb[k]     = _mm256_unpacklo_epi8(va[2 * k], va[2 * k + 1]);
                    vb[k + 4] = _mm256_unpackhi_epi8(va[2 * k], va[2 * k + 1]);
                }

                /* Interleave at 16-bit granularity. Pair p reads vb[2p] and
                 * vb[2p + 1]; results land in slots {0,2}, {1,3}, {4,6},
                 * {5,7} for p = 0..3. */
                for (p = 0; p < 4; p++) {
                    const size_t slot = (p / 2) * 4 + (p % 2);
                    va[slot]     = _mm256_unpacklo_epi16(vb[2 * p], vb[2 * p + 1]);
                    va[slot + 2] = _mm256_unpackhi_epi16(vb[2 * p], vb[2 * p + 1]);
                }

                /* Interleave at 32-bit granularity. */
                for (k = 0; k < 8; k += 2) {
                    vb[k]     = _mm256_unpacklo_epi32(va[k], va[k + 1]);
                    vb[k + 1] = _mm256_unpackhi_epi32(va[k], va[k + 1]);
                }

                /* Park this group's result until all four groups are done. */
                for (k = 0; k < 8; k++) {
                    stash[k][grp] = vb[k];
                }
            }

            /* Merge the four groups at 64- and 128-bit granularity and write
             * the 32 output rows of this tile. */
            for (p = 0; p < 8; p++) {
                const __m256i lo01 = _mm256_unpacklo_epi64(stash[p][0], stash[p][1]);
                const __m256i lo23 = _mm256_unpacklo_epi64(stash[p][2], stash[p][3]);
                const __m256i hi01 = _mm256_unpackhi_epi64(stash[p][0], stash[p][1]);
                const __m256i hi23 = _mm256_unpackhi_epi64(stash[p][2], stash[p][3]);

                /* Output row (col + 2p), starting at byte column el * 8. */
                char* base = &dst[(col + p * 2) * nrows + el * 8];

                _mm256_storeu_si256((__m256i *) base,
                        _mm256_permute2x128_si256(lo01, lo23, 0x20));
                _mm256_storeu_si256((__m256i *) (base + nrows),
                        _mm256_permute2x128_si256(hi01, hi23, 0x20));
                _mm256_storeu_si256((__m256i *) (base + 16 * nrows),
                        _mm256_permute2x128_si256(lo01, lo23, 0x31));
                _mm256_storeu_si256((__m256i *) (base + 17 * nrows),
                        _mm256_permute2x128_si256(hi01, hi23, 0x31));
            }
        }
    }

    /* Columns past the last full 32-column tile: plain byte transpose. */
    for (r = 0; r < nrows; r++) {
        for (col = tail_start; col < row_len; col++) {
            dst[col * nrows + r] = src[r * row_len + col];
        }
    }
    return size * elem_size;
}
204
205
206
/* Shuffle bits within the bytes of eight element blocks.
 *
 * The buffer is treated as consecutive blocks of 8 * elem_size bytes. When
 * elem_size is not a multiple of 4 the call is forwarded to the SSE2
 * routine. Otherwise each block is read in 32-byte vectors; for every vector
 * the top bit of each byte is gathered into a 32-bit mask eight times (with
 * a one-bit left shift in between), and the mask from pass kk is written at
 * byte offset jj / 8 + (7 - kk) * elem_size inside the same block of `out`.
 *
 * Returns size * elem_size on the AVX2 path, or the SSE2 routine's result.
 */
static int64_t bshuf_shuffle_bit_eightelem_avx2(void* in, void* out, const size_t size,
                                                const size_t elem_size) {

    CHECK_MULT_EIGHT(size);

    /*  With a bit of care, this could be written such that it is */
    /*  in_buf = out_buf safe. */
    const char* in_b = (const char*) in;
    char* out_b = (char*) out;

    const size_t nbyte = elem_size * size;
    const size_t block = 8 * elem_size;
    size_t ii, jj, kk;

    if (elem_size % 4) {
        return blosc_internal_bshuf_shuffle_bit_eightelem_sse2(in, out, size, elem_size);
    }

    for (jj = 0; jj + 31 < block; jj += 32) {
        for (ii = 0; ii + block - 1 < nbyte; ii += block) {
            __m256i ymm = _mm256_loadu_si256((const __m256i *) &in_b[ii + jj]);
            for (kk = 0; kk < 8; kk++) {
                /* One bit per input byte: the current top bit of each lane. */
                const int32_t bt = _mm256_movemask_epi8(ymm);
                ymm = _mm256_slli_epi16(ymm, 1);
                /* `out` is a byte buffer with no alignment guarantee, so
                 * store the mask with memcpy rather than through a casted
                 * int32_t pointer. */
                memcpy(&out_b[ii + jj / 8 + (7 - kk) * elem_size], &bt, sizeof(bt));
            }
        }
    }
    return size * elem_size;
}
241
242
243
/* Untranspose bits within elements.
 *
 * Chains two passes through `tmp_buf`:
 *   1. bshuf_trans_byte_bitrow_avx2():      in      -> tmp_buf
 *   2. bshuf_shuffle_bit_eightelem_avx2():  tmp_buf -> out
 *
 * CHECK_MULT_EIGHT and CHECK_ERR come from a header not shown here;
 * presumably they return early when `size` is not a multiple of eight or
 * when the first pass reports a negative count. The result of the second
 * pass is returned as is.
 */
int64_t blosc_internal_bshuf_untrans_bit_elem_avx2(void* in, void* out, const size_t size,
                                                   const size_t elem_size, void* tmp_buf) {
    CHECK_MULT_EIGHT(size);

    int64_t rc = bshuf_trans_byte_bitrow_avx2(in, tmp_buf, size, elem_size);
    CHECK_ERR(rc);

    return bshuf_shuffle_bit_eightelem_avx2(tmp_buf, out, size, elem_size);
}
257
258
#endif /* !defined(__AVX2__) */