Coverage Report

Created: 2025-06-20 06:13

/src/c-blosc2/blosc/bitshuffle-avx2.c
Line
Count
Source (jump to first uncovered line)
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
/*********************************************************************
12
  Bitshuffle - Filter for improving compression of typed binary data.
13
14
  Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
15
  Website: https://github.com/kiyo-masui/bitshuffle
16
17
  Note: Adapted for c-blosc by Francesc Alted.
18
19
  See LICENSES/BITSHUFFLE.txt file for details about copyright and
20
  rights to use.
21
**********************************************************************/
22
23
#include "bitshuffle-avx2.h"
24
#include "bitshuffle-sse2.h"
25
#include "bitshuffle-generic.h"
26
#include <stdlib.h>
27
28
/* Make sure AVX2 is available for the compilation target and compiler. */
29
#if defined(__AVX2__)
30
31
#include <immintrin.h>
32
33
/* The next is useful for debugging purposes */
34
#if 0
35
#include <stdio.h>
36
#include <string.h>
37
38
static void printymm(__m256i ymm0)
39
{
40
  uint8_t buf[32];
41
42
  ((__m256i *)buf)[0] = ymm0;
43
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
44
          buf[0], buf[1], buf[2], buf[3],
45
          buf[4], buf[5], buf[6], buf[7],
46
          buf[8], buf[9], buf[10], buf[11],
47
          buf[12], buf[13], buf[14], buf[15],
48
          buf[16], buf[17], buf[18], buf[19],
49
          buf[20], buf[21], buf[22], buf[23],
50
          buf[24], buf[25], buf[26], buf[27],
51
          buf[28], buf[29], buf[30], buf[31]);
52
}
53
#endif
54
55
56
/* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */
57
58
59
60
/* Transpose bits within bytes. */
61
int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
62
134k
                                 const size_t elem_size) {
63
64
134k
  size_t ii, kk;
65
134k
  const char* in_b = (const char*) in;
66
134k
  char* out_b = (char*) out;
67
134k
  int32_t* out_i32;
68
69
134k
  size_t nbyte = elem_size * size;
70
71
134k
  int64_t count;
72
73
134k
  __m256i ymm;
74
134k
  int32_t bt;
75
76
16.7M
  for (ii = 0; ii + 31 < nbyte; ii += 32) {
77
16.6M
    ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
78
149M
    for (kk = 0; kk < 8; kk++) {
79
132M
      bt = _mm256_movemask_epi8(ymm);
80
132M
      ymm = _mm256_slli_epi16(ymm, 1);
81
132M
      out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
82
132M
      *out_i32 = bt;
83
132M
    }
84
16.6M
  }
85
134k
  count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
86
134k
                                         nbyte - nbyte % 32);
87
134k
  return count;
88
134k
}
89
90
91
/* Transpose bits within elements. */
92
int64_t bshuf_trans_bit_elem_AVX(const void* in, void* out, const size_t size,
93
134k
                                 const size_t elem_size) {
94
95
134k
  int64_t count;
96
97
134k
  CHECK_MULT_EIGHT(size);
98
99
134k
  void* tmp_buf = malloc(size * elem_size);
100
134k
  if (tmp_buf == NULL) return -1;
101
102
134k
  count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
103
134k
  CHECK_ERR_FREE(count, tmp_buf);
104
134k
  count = bshuf_trans_bit_byte_AVX(out, tmp_buf, size, elem_size);
105
134k
  CHECK_ERR_FREE(count, tmp_buf);
106
134k
  count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
107
108
134k
  free(tmp_buf);
109
110
134k
  return count;
111
134k
}
112
113
114
/* For data organized into a row for each bit (8 * elem_size rows), transpose
115
 * the bytes. */
116
int64_t bshuf_trans_byte_bitrow_AVX(const void* in, void* out, const size_t size,
117
34.0k
                                    const size_t elem_size) {
118
119
34.0k
  size_t hh, ii, jj, kk, mm;
120
34.0k
  const char* in_b = (const char*) in;
121
34.0k
  char* out_b = (char*) out;
122
123
34.0k
  CHECK_MULT_EIGHT(size);
124
125
34.0k
  size_t nrows = 8 * elem_size;
126
34.0k
  size_t nbyte_row = size / 8;
127
128
34.0k
  if (elem_size % 4) return bshuf_trans_byte_bitrow_SSE(in, out, size,
129
34.0k
                                                        elem_size);
130
131
0
  __m256i ymm_0[8];
132
0
  __m256i ymm_1[8];
133
0
  __m256i ymm_storeage[8][4];
134
135
0
  for (jj = 0; jj + 31 < nbyte_row; jj += 32) {
136
0
    for (ii = 0; ii + 3 < elem_size; ii += 4) {
137
0
      for (hh = 0; hh < 4; hh ++) {
138
139
0
        for (kk = 0; kk < 8; kk ++){
140
0
          ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[
141
0
              (ii * 8 + hh * 8 + kk) * nbyte_row + jj]);
142
0
        }
143
144
0
        for (kk = 0; kk < 4; kk ++){
145
0
          ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2],
146
0
                                           ymm_0[kk * 2 + 1]);
147
0
          ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2],
148
0
                                               ymm_0[kk * 2 + 1]);
149
0
        }
150
151
0
        for (kk = 0; kk < 2; kk ++){
152
0
          for (mm = 0; mm < 2; mm ++){
153
0
            ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16(
154
0
                ymm_1[kk * 4 + mm * 2],
155
0
                ymm_1[kk * 4 + mm * 2 + 1]);
156
0
            ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16(
157
0
                ymm_1[kk * 4 + mm * 2],
158
0
                ymm_1[kk * 4 + mm * 2 + 1]);
159
0
          }
160
0
        }
161
162
0
        for (kk = 0; kk < 4; kk ++){
163
0
          ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2],
164
0
                                                ymm_0[kk * 2 + 1]);
165
0
          ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2],
166
0
                                                    ymm_0[kk * 2 + 1]);
167
0
        }
168
169
0
        for (kk = 0; kk < 8; kk ++){
170
0
          ymm_storeage[kk][hh] = ymm_1[kk];
171
0
        }
172
0
      }
173
174
0
      for (mm = 0; mm < 8; mm ++) {
175
176
0
        for (kk = 0; kk < 4; kk ++){
177
0
          ymm_0[kk] = ymm_storeage[mm][kk];
178
0
        }
179
180
0
        ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]);
181
0
        ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]);
182
0
        ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]);
183
0
        ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]);
184
185
0
        ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32);
186
0
        ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32);
187
0
        ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49);
188
0
        ymm_0[3] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49);
189
190
0
        _mm256_storeu_si256((__m256i *) &out_b[
191
0
            (jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]);
192
0
        _mm256_storeu_si256((__m256i *) &out_b[
193
0
            (jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]);
194
0
        _mm256_storeu_si256((__m256i *) &out_b[
195
0
            (jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]);
196
0
        _mm256_storeu_si256((__m256i *) &out_b[
197
0
            (jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]);
198
0
      }
199
0
    }
200
0
  }
201
0
  for (ii = 0; ii < nrows; ii ++ ) {
202
0
    for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj ++) {
203
0
      out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj];
204
0
    }
205
0
  }
206
0
  return size * elem_size;
207
34.0k
}
208
209
210
/* Shuffle bits within the bytes of eight element blocks. */
211
int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t size,
212
34.0k
                                        const size_t elem_size) {
213
214
34.0k
  CHECK_MULT_EIGHT(size);
215
216
  // With a bit of care, this could be written such that such that it is
217
  // in_buf = out_buf safe.
218
34.0k
  const char* in_b = (const char*) in;
219
34.0k
  char* out_b = (char*) out;
220
221
34.0k
  size_t ii, jj, kk;
222
34.0k
  size_t nbyte = elem_size * size;
223
224
34.0k
  __m256i ymm;
225
34.0k
  int32_t bt;
226
227
34.0k
  if (elem_size % 4) {
228
34.0k
    return bshuf_shuffle_bit_eightelem_SSE(in, out, size, elem_size);
229
34.0k
  } else {
230
0
    for (jj = 0; jj + 31 < 8 * elem_size; jj += 32) {
231
0
      for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
232
0
           ii += 8 * elem_size) {
233
0
        ymm = _mm256_loadu_si256((__m256i *) &in_b[ii + jj]);
234
0
        for (kk = 0; kk < 8; kk++) {
235
0
          bt = _mm256_movemask_epi8(ymm);
236
0
          ymm = _mm256_slli_epi16(ymm, 1);
237
0
          size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
238
0
          * (int32_t *) &out_b[ind] = bt;
239
0
        }
240
0
      }
241
0
    }
242
0
  }
243
0
  return size * elem_size;
244
34.0k
}
245
246
247
/* Untranspose bits within elements. */
248
int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
249
34.0k
                                   const size_t elem_size) {
250
251
34.0k
  int64_t count;
252
253
34.0k
  CHECK_MULT_EIGHT(size);
254
255
34.0k
  void* tmp_buf = malloc(size * elem_size);
256
34.0k
  if (tmp_buf == NULL) return -1;
257
258
34.0k
  count = bshuf_trans_byte_bitrow_AVX(in, tmp_buf, size, elem_size);
259
34.0k
  CHECK_ERR_FREE(count, tmp_buf);
260
34.0k
  count =  bshuf_shuffle_bit_eightelem_AVX(tmp_buf, out, size, elem_size);
261
262
34.0k
  free(tmp_buf);
263
34.0k
  return count;
264
34.0k
}
265
266
const bool is_bshuf_AVX = true;
267
268
#else /* defined(__AVX2__) */
269
270
const bool is_bshuf_AVX = false;
271
272
int64_t
273
bshuf_trans_bit_elem_AVX(const void* in, void* out, const size_t size,
274
                         const size_t elem_size) {
275
  abort();
276
}
277
278
int64_t
279
bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
280
                           const size_t elem_size) {
281
  abort();
282
}
283
284
#endif /* defined(__AVX2__) */