Coverage Report

Created: 2024-07-27 06:19

/src/c-blosc2/blosc/bitshuffle-avx512.c
Every instrumented line below has a hit count of 0; no line of this file was executed during the coverage run. Source listing:
/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
  https://blosc.org
  License: BSD 3-Clause (see LICENSE.txt)

  See LICENSE.txt for details about copyright and rights to use.
**********************************************************************/

/*********************************************************************
  Bitshuffle - Filter for improving compression of typed binary data.

  Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
  Website: https://github.com/kiyo-masui/bitshuffle

  Note: Adapted for c-blosc2 by Francesc Alted.

  See LICENSES/BITSHUFFLE.txt file for details about copyright and
  rights to use.
**********************************************************************/

#include "bitshuffle-avx512.h"
#include "bitshuffle-avx2.h"
#include "bitshuffle-sse2.h"
#include "bitshuffle-generic.h"
#include <stdlib.h>

/* Make sure AVX512 is available for the compilation target and compiler. */
#if defined(__AVX512F__) && defined(__AVX512BW__)
#include <immintrin.h>

/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_AVX512(const void* in, void* out, const size_t size,
                                    const size_t elem_size) {

  size_t ii, kk;
  const char* in_b = (const char*) in;
  char* out_b = (char*) out;
  size_t nbyte = elem_size * size;
  int64_t count;

  int64_t* out_i64;
  __m512i zmm;
  __mmask64 bt;
  if (nbyte >= 64) {
    const __m512i mask = _mm512_set1_epi8(0);

    for (ii = 0; ii + 63 < nbyte; ii += 64) {
      zmm = _mm512_loadu_si512((__m512i *) &in_b[ii]);
      for (kk = 0; kk < 8; kk++) {
        bt = _mm512_cmp_epi8_mask(zmm, mask, 1);
        zmm = _mm512_slli_epi16(zmm, 1);
        out_i64 = (int64_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
        *out_i64 = (int64_t)bt;
      }
    }
  }

  __m256i ymm;
  int32_t bt32;
  int32_t* out_i32;
  size_t start = nbyte - nbyte % 64;
  for (ii = start; ii + 31 < nbyte; ii += 32) {
    ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
    for (kk = 0; kk < 8; kk++) {
      bt32 = _mm256_movemask_epi8(ymm);
      ymm = _mm256_slli_epi16(ymm, 1);
      out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
      *out_i32 = bt32;
    }
  }

  count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
                                         nbyte - nbyte % 64 % 32);

  return count;
}
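
For orientation, the loop above extracts one bit plane per iteration: the signed compare against zero captures the most significant bit of every byte in the vector, and the shift left by one moves the next bit into that position, so output row (7 - kk) receives bit (7 - kk) of every input byte. A minimal scalar sketch of the same transform follows; it is not part of this file, the function name is made up for illustration, and it assumes the output buffer is zero-initialized and nbyte is a multiple of 8.

/* Illustrative scalar sketch only, not part of the listed file.
   Gathers bit r of every input byte into output row r, one bit per byte.
   Assumes out is zero-initialized and nbyte is a multiple of 8. */
static void trans_bit_byte_scalar_sketch(const void* in, void* out,
                                         const size_t size,
                                         const size_t elem_size) {
  const unsigned char* in_b = (const unsigned char*) in;
  unsigned char* out_b = (unsigned char*) out;
  size_t nbyte = elem_size * size;
  for (size_t r = 0; r < 8; r++) {          /* bit position within a byte */
    for (size_t j = 0; j < nbyte; j++) {    /* input byte index */
      unsigned char bit = (unsigned char)((in_b[j] >> r) & 1);
      out_b[(r * nbyte + j) / 8] |= (unsigned char)(bit << (j % 8));
    }
  }
}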

/* Transpose bits within elements. */
int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size,
                                    const size_t elem_size) {

  int64_t count;

  CHECK_MULT_EIGHT(size);

  void* tmp_buf = malloc(size * elem_size);
  if (tmp_buf == NULL) return -1;

  count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
  CHECK_ERR_FREE(count, tmp_buf);
  count = bshuf_trans_bit_byte_AVX512(out, tmp_buf, size, elem_size);
  CHECK_ERR_FREE(count, tmp_buf);
  count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);

  free(tmp_buf);

  return count;
}
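
As a usage illustration only, here is a hedged caller sketch for the entry point above: size must be a multiple of 8 (enforced by CHECK_MULT_EIGHT), the output buffer must hold size * elem_size bytes, and a negative return value indicates failure (as with the malloc check above). The sketch is not part of the file and assumes this AVX512 branch is compiled in and the CPU supports AVX512F/AVX512BW.

/* Hypothetical caller sketch, not part of the listed file. */
#include "bitshuffle-avx512.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int example_trans(void) {
  const size_t size = 1024;       /* element count; must be a multiple of 8 */
  const size_t elem_size = 4;     /* e.g. 32-bit values */
  void* in = calloc(size, elem_size);
  void* out = malloc(size * elem_size);
  if (in == NULL || out == NULL) { free(in); free(out); return -1; }

  int64_t rc = bshuf_trans_bit_elem_AVX512(in, out, size, elem_size);
  if (rc < 0)
    fprintf(stderr, "bitshuffle failed: %lld\n", (long long) rc);

  free(in);
  free(out);
  return rc < 0 ? -1 : 0;
}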

/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_AVX512(const void* in, void* out, const size_t size,
                                           const size_t elem_size) {

  CHECK_MULT_EIGHT(size);

  // With a bit of care, this could be written such that it is
  // in_buf = out_buf safe.
  const char* in_b = (const char*) in;
  char* out_b = (char*) out;

  size_t ii, jj, kk;
  size_t nbyte = elem_size * size;

  __m512i zmm;
  __mmask64 bt;

  if (elem_size % 8) {
    return bshuf_shuffle_bit_eightelem_AVX(in, out, size, elem_size);
  } else {
    const __m512i mask = _mm512_set1_epi8(0);
    for (jj = 0; jj + 63 < 8 * elem_size; jj += 64) {
      for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
           ii += 8 * elem_size) {
        zmm = _mm512_loadu_si512((__m512i *) &in_b[ii + jj]);
        for (kk = 0; kk < 8; kk++) {
          bt = _mm512_cmp_epi8_mask(zmm, mask, 1);
          zmm = _mm512_slli_epi16(zmm, 1);
          size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
          * (int64_t *) &out_b[ind] = bt;
        }
      }
    }
  }
  return size * elem_size;
}
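
Reading off the index arithmetic above for the elem_size % 8 == 0 path: within each block of eight elements (8 * elem_size bytes), bit r of input byte m lands in row r of the block (rows are elem_size bytes wide), at byte m / 8, bit m % 8. A scalar sketch of that mapping follows; it is an illustration only, not part of the file, the name is hypothetical, and it assumes a zero-initialized output buffer.

/* Illustrative scalar sketch only, not part of the listed file.
   Mirrors the elem_size % 8 == 0 branch above; assumes out is
   zero-initialized. */
static void shuffle_bit_eightelem_scalar_sketch(const void* in, void* out,
                                                const size_t size,
                                                const size_t elem_size) {
  const unsigned char* in_b = (const unsigned char*) in;
  unsigned char* out_b = (unsigned char*) out;
  size_t nbyte = elem_size * size;
  size_t block = 8 * elem_size;            /* bytes per 8-element block */
  for (size_t ii = 0; ii < nbyte; ii += block) {
    for (size_t m = 0; m < block; m++) {
      for (size_t r = 0; r < 8; r++) {
        unsigned char bit = (unsigned char)((in_b[ii + m] >> r) & 1);
        out_b[ii + r * elem_size + m / 8] |= (unsigned char)(bit << (m % 8));
      }
    }
  }
}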

/* Untranspose bits within elements. */
int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size,
                                      const size_t elem_size) {

  int64_t count;

  CHECK_MULT_EIGHT(size);

  void* tmp_buf = malloc(size * elem_size);
  if (tmp_buf == NULL) return -1;

  count = bshuf_trans_byte_bitrow_AVX(in, tmp_buf, size, elem_size);
  CHECK_ERR_FREE(count, tmp_buf);
  count = bshuf_shuffle_bit_eightelem_AVX512(tmp_buf, out, size, elem_size);

  free(tmp_buf);
  return count;
}
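
The transpose and untranspose above are intended as inverses, so a round trip through both should reproduce the original buffer. The following check is a hedged sketch, not part of the file; it assumes the AVX512 build is active, that the element count is a multiple of 8, and uses a hypothetical helper name.

/* Hypothetical round-trip check, not part of the listed file. */
#include <stdlib.h>
#include <string.h>

static int roundtrip_check(const void* src, size_t size, size_t elem_size) {
  size_t nbyte = size * elem_size;
  void* shuffled = malloc(nbyte);
  void* restored = malloc(nbyte);
  int ok = 0;
  if (shuffled != NULL && restored != NULL &&
      bshuf_trans_bit_elem_AVX512(src, shuffled, size, elem_size) >= 0 &&
      bshuf_untrans_bit_elem_AVX512(shuffled, restored, size, elem_size) >= 0)
    ok = (memcmp(src, restored, nbyte) == 0);
  free(shuffled);
  free(restored);
  return ok;
}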

const bool is_bshuf_AVX512 = true;

#else /* defined(__AVX512F__) && defined(__AVX512BW__) */

const bool is_bshuf_AVX512 = false;

int64_t
bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size,
                            const size_t elem_size) {
  abort();
}

int64_t
bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size,
                              const size_t elem_size) {
  abort();
}

#endif /* defined(__AVX512F__) && defined(__AVX512BW__) */
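
Outside the AVX512 build, the entry points above are stubs that abort(), and is_bshuf_AVX512 records which case was compiled. A hedged sketch of how a caller might branch on that flag follows; the wrapper name is hypothetical, the fallback is left as a comment rather than a real c-blosc2 call, and the sketch relies on the declarations above for the types and symbols it uses.

/* Hypothetical dispatch sketch, not part of the listed file. */
static int64_t trans_bit_elem_dispatch(const void* in, void* out,
                                       size_t size, size_t elem_size) {
  if (is_bshuf_AVX512)
    return bshuf_trans_bit_elem_AVX512(in, out, size, elem_size);
  return -1;  /* a caller would fall back to a non-AVX512 code path here */
}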