Coverage Report

Created: 2026-03-07 07:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/c-blosc2/blosc/bitshuffle-avx2.c
Line
Count
Source
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
/*********************************************************************
12
  Bitshuffle - Filter for improving compression of typed binary data.
13
14
  Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
15
  Website: https://github.com/kiyo-masui/bitshuffle
16
17
  Note: Adapted for c-blosc by Francesc Alted.
18
19
  See LICENSES/BITSHUFFLE.txt file for details about copyright and
20
  rights to use.
21
**********************************************************************/
22
23
#include "bitshuffle-avx2.h"
#include "bitshuffle-sse2.h"
#include "bitshuffle-generic.h"
#include <stdlib.h>
#include <string.h>
27
28
/* Make sure AVX2 is available for the compilation target and compiler. */
29
#if defined(__AVX2__)
30
31
#include <immintrin.h>
32
33
/* The next is useful for debugging purposes */
34
#if 0
35
#include <stdio.h>
36
#include <string.h>
37
38
/* Debug helper: print the 32 bytes held in `ymm0` as comma-separated
 * hexadecimal values, followed by a newline. */
static void printymm(__m256i ymm0)
{
  uint8_t buf[32];

  /* Copy the register out with memcpy: `buf` is only byte-aligned, so
   * assigning through a casted __m256i pointer would be a misaligned (and
   * aliasing-violating) vector store. */
  memcpy(buf, &ymm0, sizeof(buf));
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
          buf[0], buf[1], buf[2], buf[3],
          buf[4], buf[5], buf[6], buf[7],
          buf[8], buf[9], buf[10], buf[11],
          buf[12], buf[13], buf[14], buf[15],
          buf[16], buf[17], buf[18], buf[19],
          buf[20], buf[21], buf[22], buf[23],
          buf[24], buf[25], buf[26], buf[27],
          buf[28], buf[29], buf[30], buf[31]);
}
53
#endif
54
55
56
/* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */
57
58
59
60
/* Transpose bits within bytes.
 *
 * Treats `in` as `size * elem_size` bytes (nbyte). Every full group of 32
 * input bytes is loaded into a vector register; for each bit position b,
 * from 7 down to 0, bit b of the 32 bytes is gathered into one 32-bit mask
 * that is written to `out` at byte offset `(b * nbyte + ii) / 8`, where `ii`
 * is the offset of the group in `in`.
 *
 * Bytes past the last full 32-byte group are handed to
 * bshuf_trans_bit_byte_remainder(), and its return value is returned. */
int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
                                 const size_t elem_size) {

  const char* in_b = (const char*) in;
  char* out_b = (char*) out;
  const size_t nbyte = elem_size * size;

  for (size_t ii = 0; ii + 31 < nbyte; ii += 32) {
    __m256i ymm = _mm256_loadu_si256((const __m256i *) &in_b[ii]);
    for (size_t kk = 0; kk < 8; kk++) {
      /* movemask collects the top bit of every byte; the shift below moves
       * the next lower bit of every byte into that position. */
      const int32_t bt = _mm256_movemask_epi8(ymm);
      ymm = _mm256_slli_epi16(ymm, 1);
      /* The destination offset is not 4-byte aligned in general and the
       * buffer is accessed as bytes, so store the mask with memcpy rather
       * than through a casted int32_t pointer. */
      memcpy(&out_b[((7 - kk) * nbyte + ii) / 8], &bt, sizeof(bt));
    }
  }

  return bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
                                        nbyte - nbyte % 32);
}
89
90
91
/* Transpose bits within elements.
 *
 * Runs three passes:
 *   1. bshuf_trans_byte_elem_SSE()  : in      -> out
 *   2. bshuf_trans_bit_byte_AVX()   : out     -> scratch
 *   3. bshuf_trans_bitrow_eight()   : scratch -> out
 * where `scratch` is a heap buffer of `size * elem_size` bytes that is freed
 * before returning.
 *
 * Returns the value produced by the last pass, or -1 when the scratch size
 * does not fit in size_t or the allocation fails.
 * NOTE(review): CHECK_MULT_EIGHT and CHECK_ERR_FREE are macros defined outside
 * this file; presumably they return early with an error code (the latter after
 * freeing the scratch buffer) - confirm against their definitions. */
int64_t bshuf_trans_bit_elem_AVX(const void* in, void* out, const size_t size,
                                 const size_t elem_size) {

  int64_t count;

  CHECK_MULT_EIGHT(size);

  /* Refuse byte counts that would wrap around size_t: a wrapped product would
   * yield a scratch buffer smaller than what the passes below write. */
  if (elem_size != 0 && size > ((size_t) -1) / elem_size) return -1;

  void* tmp_buf = malloc(size * elem_size);
  if (tmp_buf == NULL) return -1;

  count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
  CHECK_ERR_FREE(count, tmp_buf);
  count = bshuf_trans_bit_byte_AVX(out, tmp_buf, size, elem_size);
  CHECK_ERR_FREE(count, tmp_buf);
  count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);

  free(tmp_buf);

  return count;
}
112
113
114
/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes.
 *
 * The input is read as `8 * elem_size` rows of `size / 8` bytes each, and
 * the output receives the byte at (row, col) at offset `col * nrows + row`.
 * Element sizes that are not a multiple of 4 are delegated to
 * bshuf_trans_byte_bitrow_SSE(). Otherwise full 32-column strips are
 * transposed with AVX2 unpack/permute steps, 32 rows at a time, and the
 * remaining columns are copied one byte at a time.
 *
 * Returns `size * elem_size` on the AVX2 path. */
int64_t bshuf_trans_byte_bitrow_AVX(const void* in, void* out, const size_t size,
                                    const size_t elem_size) {

  const char* src = (const char*) in;
  char* dst = (char*) out;

  CHECK_MULT_EIGHT(size);

  const size_t nrows = 8 * elem_size;
  const size_t row_len = size / 8;

  if (elem_size % 4 != 0) {
    return bshuf_trans_byte_bitrow_SSE(in, out, size, elem_size);
  }

  __m256i a[8];
  __m256i b[8];
  __m256i staged[8][4];

  for (size_t col = 0; col + 31 < row_len; col += 32) {
    for (size_t elem = 0; elem + 3 < elem_size; elem += 4) {

      /* Each pass handles 8 consecutive rows and fills one column of
       * `staged`; four passes cover the 32 rows of this group. */
      for (size_t grp = 0; grp < 4; grp++) {
        const size_t first_row = (elem + grp) * 8;

        for (size_t r = 0; r < 8; r++) {
          a[r] = _mm256_loadu_si256(
              (const __m256i *) &src[(first_row + r) * row_len + col]);
        }

        /* Interleave at 8-bit granularity. */
        for (size_t p = 0; p < 4; p++) {
          b[p] = _mm256_unpacklo_epi8(a[2 * p], a[2 * p + 1]);
          b[p + 4] = _mm256_unpackhi_epi8(a[2 * p], a[2 * p + 1]);
        }

        /* Interleave at 16-bit granularity, separately for each half of b. */
        for (size_t half = 0; half < 8; half += 4) {
          for (size_t p = 0; p < 2; p++) {
            a[half + p] = _mm256_unpacklo_epi16(b[half + 2 * p],
                                                b[half + 2 * p + 1]);
            a[half + p + 2] = _mm256_unpackhi_epi16(b[half + 2 * p],
                                                    b[half + 2 * p + 1]);
          }
        }

        /* Interleave at 32-bit granularity, straight into the staging area. */
        for (size_t p = 0; p < 4; p++) {
          staged[2 * p][grp] = _mm256_unpacklo_epi32(a[2 * p], a[2 * p + 1]);
          staged[2 * p + 1][grp] = _mm256_unpackhi_epi32(a[2 * p],
                                                         a[2 * p + 1]);
        }
      }

      /* Combine the four staged vectors of each slot at 64-bit and 128-bit
       * granularity and write out four 32-byte output rows per slot. */
      for (size_t slot = 0; slot < 8; slot++) {
        const __m256i lo01 = _mm256_unpacklo_epi64(staged[slot][0],
                                                   staged[slot][1]);
        const __m256i lo23 = _mm256_unpacklo_epi64(staged[slot][2],
                                                   staged[slot][3]);
        const __m256i hi01 = _mm256_unpackhi_epi64(staged[slot][0],
                                                   staged[slot][1]);
        const __m256i hi23 = _mm256_unpackhi_epi64(staged[slot][2],
                                                   staged[slot][3]);

        const size_t base = (col + slot * 2) * nrows + elem * 8;

        _mm256_storeu_si256((__m256i *) &dst[base],
                            _mm256_permute2x128_si256(lo01, lo23, 0x20));
        _mm256_storeu_si256((__m256i *) &dst[base + nrows],
                            _mm256_permute2x128_si256(hi01, hi23, 0x20));
        _mm256_storeu_si256((__m256i *) &dst[base + 16 * nrows],
                            _mm256_permute2x128_si256(lo01, lo23, 0x31));
        _mm256_storeu_si256((__m256i *) &dst[base + 17 * nrows],
                            _mm256_permute2x128_si256(hi01, hi23, 0x31));
      }
    }
  }

  /* Columns beyond the last full 32-column strip: plain byte transpose. */
  const size_t tail_start = row_len - row_len % 32;
  for (size_t row = 0; row < nrows; row++) {
    for (size_t col = tail_start; col < row_len; col++) {
      dst[col * nrows + row] = src[row * row_len + col];
    }
  }

  return size * elem_size;
}
208
209
210
/* Shuffle bits within the bytes of eight element blocks.
 *
 * Element sizes that are not a multiple of 4 are delegated to
 * bshuf_shuffle_bit_eightelem_SSE(). Otherwise the input is walked in blocks
 * of `8 * elem_size` bytes; within each block, 32 bytes at a time are loaded
 * and, for each bit position b from 7 down to 0, bit b of those 32 bytes is
 * gathered into a 32-bit mask stored at offset
 * `block_start + jj / 8 + b * elem_size` of `out`, where `jj` is the offset
 * of the 32-byte group inside the block.
 *
 * Returns `size * elem_size` on the AVX2 path.
 *
 * With a bit of care, this could be written such that it is
 * in_buf = out_buf safe. */
int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t size,
                                        const size_t elem_size) {

  CHECK_MULT_EIGHT(size);

  if (elem_size % 4) {
    return bshuf_shuffle_bit_eightelem_SSE(in, out, size, elem_size);
  }

  const char* in_b = (const char*) in;
  char* out_b = (char*) out;
  const size_t nbyte = elem_size * size;
  const size_t block = 8 * elem_size;

  for (size_t jj = 0; jj + 31 < block; jj += 32) {
    for (size_t ii = 0; ii + block - 1 < nbyte; ii += block) {
      __m256i ymm = _mm256_loadu_si256((const __m256i *) &in_b[ii + jj]);
      for (size_t kk = 0; kk < 8; kk++) {
        /* movemask collects the top bit of every byte; the shift below moves
         * the next lower bit of every byte into that position. */
        const int32_t bt = _mm256_movemask_epi8(ymm);
        ymm = _mm256_slli_epi16(ymm, 1);
        /* Store with memcpy instead of writing through a casted int32_t
         * pointer: the buffer is a byte buffer with no alignment guarantee. */
        memcpy(&out_b[ii + jj / 8 + (7 - kk) * elem_size], &bt, sizeof(bt));
      }
    }
  }

  return size * elem_size;
}
245
246
247
/* Untranspose bits within elements.
 *
 * Runs two passes:
 *   1. bshuf_trans_byte_bitrow_AVX()     : in      -> scratch
 *   2. bshuf_shuffle_bit_eightelem_AVX() : scratch -> out
 * where `scratch` is a heap buffer of `size * elem_size` bytes that is freed
 * before returning.
 *
 * Returns the value produced by the second pass, or -1 when the scratch size
 * does not fit in size_t or the allocation fails.
 * NOTE(review): CHECK_MULT_EIGHT and CHECK_ERR_FREE are macros defined outside
 * this file; presumably they return early with an error code (the latter after
 * freeing the scratch buffer) - confirm against their definitions. */
int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
                                   const size_t elem_size) {

  int64_t count;

  CHECK_MULT_EIGHT(size);

  /* Refuse byte counts that would wrap around size_t: a wrapped product would
   * yield a scratch buffer smaller than what the first pass writes. */
  if (elem_size != 0 && size > ((size_t) -1) / elem_size) return -1;

  void* tmp_buf = malloc(size * elem_size);
  if (tmp_buf == NULL) return -1;

  count = bshuf_trans_byte_bitrow_AVX(in, tmp_buf, size, elem_size);
  CHECK_ERR_FREE(count, tmp_buf);
  count = bshuf_shuffle_bit_eightelem_AVX(tmp_buf, out, size, elem_size);

  free(tmp_buf);
  return count;
}
265
266
const bool is_bshuf_AVX = true;
267
268
#else /* defined(__AVX2__) */
269
270
const bool is_bshuf_AVX = false;
271
272
/* Stub compiled when __AVX2__ is not defined: calling it aborts the
 * process. The arguments are ignored. */
int64_t
bshuf_trans_bit_elem_AVX(const void* in, void* out, const size_t size,
                         const size_t elem_size) {
  (void) in;
  (void) out;
  (void) size;
  (void) elem_size;
  abort();
}
277
278
/* Stub compiled when __AVX2__ is not defined: calling it aborts the
 * process. The arguments are ignored. */
int64_t
bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
                           const size_t elem_size) {
  (void) in;
  (void) out;
  (void) size;
  (void) elem_size;
  abort();
}
283
284
#endif /* defined(__AVX2__) */