Coverage Report

Created: 2025-12-25 06:33

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/c-blosc/blosc/shuffle.c
Line
Count
Source
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Author: Francesc Alted <francesc@blosc.org>
5
  Creation date: 2009-05-20
6
7
  See LICENSE.txt for details about copyright and rights to use.
8
**********************************************************************/
9
10
#include "shuffle.h"
11
#include "blosc-common.h"
12
#include "shuffle-generic.h"
13
#include "bitshuffle-generic.h"
14
#include "blosc-comp-features.h"
15
#include <stdio.h>
16
17
#if defined(_WIN32)
18
#include "win32/pthread.h"
19
#else
20
#include <pthread.h>
21
#endif
22
23
/*  HAVE_CPU_FEAT_INTRIN marks compilers that provide __builtin_cpu_supports():
    genuine GCC (not clang) at version 4.8 or newer.
    The version test is parenthesized as a unit: && binds tighter than ||, so
    without the parentheses the "4.8+" alternative would bypass both the
    !defined(__clang__) and the defined(__GNUC__) checks. */
#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \
    (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
#define HAVE_CPU_FEAT_INTRIN
#endif
27
28
29
/*  Include hardware-accelerated shuffle/unshuffle routines based on
30
    the target architecture. Note that a target architecture may support
31
    more than one type of acceleration!*/
32
#if defined(SHUFFLE_AVX2_ENABLED)
33
  #include "shuffle-avx2.h"
34
  #include "bitshuffle-avx2.h"
35
#endif  /* defined(SHUFFLE_AVX2_ENABLED) */
36
37
#if defined(SHUFFLE_SSE2_ENABLED)
38
  #include "shuffle-sse2.h"
39
  #include "bitshuffle-sse2.h"
40
#endif  /* defined(SHUFFLE_SSE2_ENABLED) */
41
42
43
/*  Define function pointer types for shuffle/unshuffle routines. */
44
/*  Byte-shuffle routine.  blosc_internal_shuffle() calls it with
    (bytesoftype, blocksize, _src, _dest). */
typedef void(*shuffle_func)(const size_t, const size_t, const uint8_t*, const uint8_t*);
45
/*  Byte-unshuffle routine.  blosc_internal_unshuffle() calls it with
    (bytesoftype, blocksize, _src, _dest). */
typedef void(*unshuffle_func)(const size_t, const size_t, const uint8_t*, const uint8_t*);
46
/*  Bit-shuffle routine.  blosc_internal_bitshuffle() calls it with
    (_src, _dest, blocksize / bytesoftype, bytesoftype, _tmp) and narrows the
    int64_t result to int. */
typedef int64_t(*bitshuffle_func)(void*, void*, const size_t, const size_t, void*);
47
/*  Bit-unshuffle routine.  blosc_internal_bitunshuffle() calls it with the
    same argument layout as bitshuffle_func. */
typedef int64_t(*bitunshuffle_func)(void*, void*, const size_t, const size_t, void*);
48
49
/* An implementation of shuffle/unshuffle routines. */
50
/*  One complete set of shuffle routines.  get_shuffle_implementation() fills
    in an instance for the "avx2", "sse2" or "generic" variant, and the
    blosc_internal_* entry points call through the stored pointers. */
typedef struct shuffle_implementation {
51
  /* Human-readable name of this implementation ("avx2", "sse2", "generic"). */
52
  const char* name;
53
  /* Byte-shuffle routine; called by blosc_internal_shuffle(). */
54
  shuffle_func shuffle;
55
  /* Byte-unshuffle routine; called by blosc_internal_unshuffle(). */
56
  unshuffle_func unshuffle;
57
  /* Bit-shuffle routine; called by blosc_internal_bitshuffle(). */
58
  bitshuffle_func bitshuffle;
59
  /* Bit-unshuffle routine; called by blosc_internal_bitunshuffle(). */
60
  bitunshuffle_func bitunshuffle;
61
} shuffle_implementation_t;
62
63
/*  Bit flags naming the accelerated instruction sets usable on the host.
    blosc_get_cpu_features() combines them with |= and
    get_shuffle_implementation() tests them with &, so each enumerator other
    than BLOSC_HAVE_NOTHING occupies its own bit. */
typedef enum {
64
  /* No accelerated instruction set detected. */
  BLOSC_HAVE_NOTHING = 0,
65
  /* SSE2 may be used. */
  BLOSC_HAVE_SSE2 = 1,
66
  /* AVX2 may be used. */
  BLOSC_HAVE_AVX2 = 2
67
} blosc_cpu_features;
68
69
/*  Detect hardware and set function pointers to the best shuffle/unshuffle
70
    implementations supported by the host processor for Intel/i686
71
     */
72
#if (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)) \
73
    && (defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED))
74
75
/*  Disabled the __builtin_cpu_supports() call, as it has issues with
76
    new versions of gcc (like 5.3.1 in forthcoming ubuntu/xenial):
77
      "undefined symbol: __cpu_model"
78
    For a similar report, see:
79
    https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/thread/ZM2L65WIZEEQHHLFERZYD5FAG7QY2OGB/
80
*/
81
#if defined(HAVE_CPU_FEAT_INTRIN) && 0
82
static blosc_cpu_features blosc_get_cpu_features(void) {
83
  blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING;
84
  if (__builtin_cpu_supports("sse2")) {
85
    cpu_features |= BLOSC_HAVE_SSE2;
86
  }
87
  if (__builtin_cpu_supports("avx2")) {
88
    cpu_features |= BLOSC_HAVE_AVX2;
89
  }
90
  return cpu_features;
91
}
92
#else
93
94
#if defined(_MSC_VER) && !defined(__clang__)
95
  #include <intrin.h>     /* Needed for __cpuid */
96
97
/*  _xgetbv is only supported by VS2010 SP1 and newer versions of VS. */
98
#if _MSC_FULL_VER >= 160040219
99
  #include <immintrin.h>  /* Needed for _xgetbv */
100
  #define blosc_internal_xgetbv _xgetbv
101
#elif defined(_M_IX86)
102
103
/*  Implement _xgetbv for VS2008 and VS2010 RTM with 32-bit (x86) targets. */
104
105
/*  Read the extended control register selected by `xcr`.
    ECX is loaded with `xcr`, then the three bytes 0x0f 0x01 0xd0 are emitted
    directly (the same byte sequence the GCC/Clang path in this file labels
    "xgetbv").  The result is returned as a 64-bit value with EDX in the upper
    32 bits and EAX in the lower 32 bits. */
static uint64_t blosc_internal_xgetbv(uint32_t xcr) {
106
    /* xcr0 receives EAX (low half), xcr1 receives EDX (high half). */
    uint32_t xcr0, xcr1;
107
    __asm {
108
        mov        ecx, xcr
109
        _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
110
        mov        xcr0, eax
111
        mov        xcr1, edx
112
    }
113
    return ((uint64_t)xcr1 << 32) | xcr0;
114
}
115
116
#elif defined(_M_X64)
117
118
/*  Implement _xgetbv for VS2008 and VS2010 RTM with 64-bit (x64) targets.
119
    These compilers don't support any of the newer acceleration ISAs
120
    (e.g., AVX2) supported by blosc, and all x64 hardware supports SSE2
121
    which means we can get away with returning a hard-coded value from
122
    this implementation of _xgetbv. */
123
124
/*  Hard-coded stand-in for _xgetbv on 64-bit targets.
    For register 0 it reports only bit 1, the bit blosc_get_cpu_features()
    reads as "XMM state enabled"; for every other register it reports 0. */
static __inline uint64_t blosc_internal_xgetbv(uint32_t xcr) {
    if (xcr != 0) {
        return 0UL;
    }
    /* A 64-bit OS must have XMM save support. */
    return (1UL << 1);
}
128
129
#else
130
131
/* Hardware detection for any other MSVC targets (e.g., ARM)
132
   isn't implemented at this time. */
133
#error This version of c-blosc only supports x86 and x64 targets with MSVC.
134
135
#endif /* _MSC_FULL_VER >= 160040219 */
136
137
#define blosc_internal_cpuid __cpuid
138
139
#else
140
141
/*  Implement the __cpuid and __cpuidex intrinsics for GCC, Clang,
142
    and others using inline assembly. */
143
/*  Execute the `cpuid` instruction with EAX = function_id and
    ECX = subfunction_id, and store the resulting registers as
      cpuInfo[0] = EAX, cpuInfo[1] = EBX, cpuInfo[2] = ECX, cpuInfo[3] = EDX. */
__attribute__((always_inline))
144
static inline void
145
3
blosc_internal_cpuidex(int32_t cpuInfo[4], int32_t function_id, int32_t subfunction_id) {
146
3
  __asm__ __volatile__ (
147
# if defined(__i386__) && defined (__PIC__)
148
  /*  Can't clobber ebx with PIC running under 32-bit, so it needs to be manually restored.
      The sequence parks ebx in edi, runs cpuid, then swaps the two back: ebx
      gets its original value and cpuid's ebx result is delivered through edi
      (the "=D" output below).
      https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
  */
151
    "movl %%ebx, %%edi\n\t"
152
    "cpuid\n\t"
153
    "xchgl %%ebx, %%edi":
154
    "=D" (cpuInfo[1]),
155
#else
156
3
    "cpuid":
157
3
    "=b" (cpuInfo[1]),
158
3
#endif  /* defined(__i386__) && defined(__PIC__) */
159
3
    "=a" (cpuInfo[0]),
160
3
    "=c" (cpuInfo[2]),
161
3
    "=d" (cpuInfo[3]) :
162
3
    "a" (function_id), "c" (subfunction_id)
163
3
    );
164
3
}
165
166
3
#define blosc_internal_cpuid(cpuInfo, function_id) blosc_internal_cpuidex(cpuInfo, function_id, 0)
167
168
1
#define _XCR_XFEATURE_ENABLED_MASK 0
169
170
/* Reads the content of an extended control register.
171
   https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
172
*/
173
/*  Read the extended control register selected by `xcr` and return its
    64-bit contents (EDX supplies the upper 32 bits, EAX the lower 32 bits). */
static inline uint64_t
blosc_internal_xgetbv(uint32_t xcr) {
  uint32_t low_half;
  uint32_t high_half;
  uint64_t value;

  /*  The instruction is "xgetbv", written as raw instruction bytes because
      some older compilers have issues with the mnemonic form. */
  __asm__ __volatile__ (
    ".byte 0x0f, 0x01, 0xd0"
    : "=a" (low_half), "=d" (high_half)
    : "c" (xcr)
    );

  value = high_half;
  value <<= 32;
  value |= low_half;
  return value;
}
188
189
#endif  /* defined(_MSC_VER) && !defined(__clang__) */
190
191
#ifndef _XCR_XFEATURE_ENABLED_MASK
192
#define _XCR_XFEATURE_ENABLED_MASK 0x0
193
#endif
194
195
1
static blosc_cpu_features blosc_get_cpu_features(void) {
196
1
  blosc_cpu_features result = BLOSC_HAVE_NOTHING;
197
1
  int32_t max_basic_function_id;
198
  /* Holds the values of eax, ebx, ecx, edx set by the `cpuid` instruction */
199
1
  int32_t cpu_info[4];
200
1
  int sse2_available;
201
1
  int sse3_available;
202
1
  int ssse3_available;
203
1
  int sse41_available;
204
1
  int sse42_available;
205
1
  int xsave_available;
206
1
  int xsave_enabled_by_os;
207
1
  int avx2_available = 0;
208
1
  int avx512bw_available = 0;
209
1
  int xmm_state_enabled = 0;
210
1
  int ymm_state_enabled = 0;
211
1
  int zmm_state_enabled = 0;
212
1
  uint64_t xcr0_contents;
213
1
  char* envvar;
214
215
  /* Get the number of basic functions available. */
216
1
  blosc_internal_cpuid(cpu_info, 0);
217
1
  max_basic_function_id = cpu_info[0];
218
219
  /* Check for SSE-based features and required OS support */
220
1
  blosc_internal_cpuid(cpu_info, 1);
221
1
  sse2_available = (cpu_info[3] & (1 << 26)) != 0;
222
1
  sse3_available = (cpu_info[2] & (1 << 0)) != 0;
223
1
  ssse3_available = (cpu_info[2] & (1 << 9)) != 0;
224
1
  sse41_available = (cpu_info[2] & (1 << 19)) != 0;
225
1
  sse42_available = (cpu_info[2] & (1 << 20)) != 0;
226
227
1
  xsave_available = (cpu_info[2] & (1 << 26)) != 0;
228
1
  xsave_enabled_by_os = (cpu_info[2] & (1 << 27)) != 0;
229
230
  /* Check for AVX-based features, if the processor supports extended features. */
231
1
  if (max_basic_function_id >= 7) {
232
1
    blosc_internal_cpuid(cpu_info, 7);
233
1
    avx2_available = (cpu_info[1] & (1 << 5)) != 0;
234
1
    avx512bw_available = (cpu_info[1] & (1 << 30)) != 0;
235
1
  }
236
237
  /*  Even if certain features are supported by the CPU, they may not be supported
238
      by the OS (in which case using them would crash the process or system).
239
      If xsave is available and enabled by the OS, check the contents of the
240
      extended control register XCR0 to see if the CPU features are enabled. */
241
1
#if defined(_XCR_XFEATURE_ENABLED_MASK)
242
1
  if (xsave_available && xsave_enabled_by_os && (
243
1
      sse2_available || sse3_available || ssse3_available
244
0
      || sse41_available || sse42_available
245
1
      || avx2_available || avx512bw_available)) {
246
    /* Determine which register states can be restored by the OS. */
247
1
    xcr0_contents = blosc_internal_xgetbv(_XCR_XFEATURE_ENABLED_MASK);
248
249
1
    xmm_state_enabled = (xcr0_contents & (1UL << 1)) != 0;
250
1
    ymm_state_enabled = (xcr0_contents & (1UL << 2)) != 0;
251
252
    /*  Require support for both the upper 256-bits of zmm0-zmm15 to be
253
        restored as well as all of zmm16-zmm31 and the opmask registers. */
254
1
    zmm_state_enabled = (xcr0_contents & 0x70) == 0x70;
255
1
  }
256
1
#endif /* defined(_XCR_XFEATURE_ENABLED_MASK) */
257
258
1
  envvar = getenv("BLOSC_PRINT_SHUFFLE_ACCEL");
259
1
  if (envvar != NULL) {
260
0
    printf("Shuffle CPU Information:\n");
261
0
    printf("SSE2 available: %s\n", sse2_available ? "True" : "False");
262
0
    printf("SSE3 available: %s\n", sse3_available ? "True" : "False");
263
0
    printf("SSSE3 available: %s\n", ssse3_available ? "True" : "False");
264
0
    printf("SSE4.1 available: %s\n", sse41_available ? "True" : "False");
265
0
    printf("SSE4.2 available: %s\n", sse42_available ? "True" : "False");
266
0
    printf("AVX2 available: %s\n", avx2_available ? "True" : "False");
267
0
    printf("AVX512BW available: %s\n", avx512bw_available ? "True" : "False");
268
0
    printf("XSAVE available: %s\n", xsave_available ? "True" : "False");
269
0
    printf("XSAVE enabled: %s\n", xsave_enabled_by_os ? "True" : "False");
270
0
    printf("XMM state enabled: %s\n", xmm_state_enabled ? "True" : "False");
271
0
    printf("YMM state enabled: %s\n", ymm_state_enabled ? "True" : "False");
272
0
    printf("ZMM state enabled: %s\n", zmm_state_enabled ? "True" : "False");
273
0
  }
274
275
  /* Using the gathered CPU information, determine which implementation to use. */
276
  /* technically could fail on sse2 cpu on os without xmm support, but that
277
   * shouldn't exist anymore */
278
1
  if (sse2_available) {
279
1
    result |= BLOSC_HAVE_SSE2;
280
1
  }
281
1
  if (xmm_state_enabled && ymm_state_enabled && avx2_available) {
282
1
    result |= BLOSC_HAVE_AVX2;
283
1
  }
284
1
  return result;
285
1
}
286
#endif
287
288
#else   /* No hardware acceleration supported for the target architecture. */
289
  #if defined(_MSC_VER)
290
  #pragma message("Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.")
291
  #else
292
  #warning Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.
293
  #endif
294
295
/*  Variant used when no hardware detection is compiled in for the target:
    always reports BLOSC_HAVE_NOTHING, which makes
    get_shuffle_implementation() fall through to the "generic" routines. */
static blosc_cpu_features blosc_get_cpu_features(void) {
296
  return BLOSC_HAVE_NOTHING;
297
}
298
299
#endif
300
301
1
static shuffle_implementation_t get_shuffle_implementation(void) {
302
1
  blosc_cpu_features cpu_features = blosc_get_cpu_features();
303
1
  shuffle_implementation_t impl_generic;
304
305
1
#if defined(SHUFFLE_AVX2_ENABLED)
306
1
  if (cpu_features & BLOSC_HAVE_AVX2) {
307
1
    shuffle_implementation_t impl_avx2;
308
1
    impl_avx2.name = "avx2";
309
1
    impl_avx2.shuffle = (shuffle_func)blosc_internal_shuffle_avx2;
310
1
    impl_avx2.unshuffle = (unshuffle_func)blosc_internal_unshuffle_avx2;
311
1
    impl_avx2.bitshuffle = (bitshuffle_func)blosc_internal_bshuf_trans_bit_elem_avx2;
312
1
    impl_avx2.bitunshuffle = (bitunshuffle_func)blosc_internal_bshuf_untrans_bit_elem_avx2;
313
1
    return impl_avx2;
314
1
  }
315
0
#endif  /* defined(SHUFFLE_AVX2_ENABLED) */
316
317
0
#if defined(SHUFFLE_SSE2_ENABLED)
318
0
  if (cpu_features & BLOSC_HAVE_SSE2) {
319
0
    shuffle_implementation_t impl_sse2;
320
0
    impl_sse2.name = "sse2";
321
0
    impl_sse2.shuffle = (shuffle_func)blosc_internal_shuffle_sse2;
322
0
    impl_sse2.unshuffle = (unshuffle_func)blosc_internal_unshuffle_sse2;
323
0
    impl_sse2.bitshuffle = (bitshuffle_func)blosc_internal_bshuf_trans_bit_elem_sse2;
324
0
    impl_sse2.bitunshuffle = (bitunshuffle_func)blosc_internal_bshuf_untrans_bit_elem_sse2;
325
0
    return impl_sse2;
326
0
  }
327
0
#endif  /* defined(SHUFFLE_SSE2_ENABLED) */
328
329
  /*  Processor doesn't support any of the hardware-accelerated implementations,
330
      so use the generic implementation. */
331
0
  impl_generic.name = "generic";
332
0
  impl_generic.shuffle = (shuffle_func)blosc_internal_shuffle_generic;
333
0
  impl_generic.unshuffle = (unshuffle_func)blosc_internal_unshuffle_generic;
334
0
  impl_generic.bitshuffle = (bitshuffle_func)blosc_internal_bshuf_trans_bit_elem_scal;
335
0
  impl_generic.bitunshuffle = (bitunshuffle_func)blosc_internal_bshuf_untrans_bit_elem_scal;
336
0
  return impl_generic;
337
0
}
338
339
340
/*  Flag indicating whether the implementation has been initialized. */
341
/*  Once-control passed to pthread_once() by init_shuffle_implementation(). */
static pthread_once_t implementation_initialized = PTHREAD_ONCE_INIT;
342
343
/*  The dynamically-chosen shuffle/unshuffle implementation.
    Written only by set_host_implementation(); it is only safe to read after
    init_shuffle_implementation() has returned. */
static shuffle_implementation_t host_implementation;
346
347
1
/*  Initialization routine handed to pthread_once(): stores the result of
    get_shuffle_implementation() in host_implementation. */
static void set_host_implementation(void) {
348
1
  host_implementation = get_shuffle_implementation();
349
1
}
350
351
/*  Initialize the shuffle implementation, if necessary. */
352
/*  Make sure host_implementation has been filled in before it is used.
    Every public entry point in this file calls this first; pthread_once()
    runs set_host_implementation() on the first call only, so repeated calls
    are cheap.  The return value of pthread_once() is not checked.
    The preprocessor blocks below only select the compiler-specific spelling
    of "force inline" for this function. */
#if defined(__GNUC__) || defined(__clang__)
353
__attribute__((always_inline))
354
#endif
355
static
356
#if defined(_MSC_VER)
357
__forceinline
358
#else
359
BLOSC_INLINE
360
#endif
361
106k
void init_shuffle_implementation(void) {
362
106k
  pthread_once(&implementation_initialized, &set_host_implementation);
363
106k
}
364
365
/*  Shuffle a block by dynamically dispatching to the appropriate
366
    hardware-accelerated routine at run-time. */
367
void
368
blosc_internal_shuffle(const size_t bytesoftype, const size_t blocksize,
369
0
                       const uint8_t* _src, const uint8_t* _dest) {
370
  /* Initialize the shuffle implementation if necessary. */
371
0
  init_shuffle_implementation();
372
373
  /*  The implementation is initialized.
374
      Dispatch to it's shuffle routine. */
375
0
  (host_implementation.shuffle)(bytesoftype, blocksize, _src, _dest);
376
0
}
377
378
/*  Unshuffle a block by dynamically dispatching to the appropriate
379
    hardware-accelerated routine at run-time. */
380
void
381
blosc_internal_unshuffle(const size_t bytesoftype, const size_t blocksize,
382
0
                         const uint8_t* _src, const uint8_t* _dest) {
383
  /* Initialize the shuffle implementation if necessary. */
384
0
  init_shuffle_implementation();
385
386
  /*  The implementation is initialized.
387
      Dispatch to it's unshuffle routine. */
388
0
  (host_implementation.unshuffle)(bytesoftype, blocksize, _src, _dest);
389
0
}
390
391
/*  Bit-shuffle a block by dynamically dispatching to the appropriate
392
    hardware-accelerated routine at run-time. */
393
int
394
blosc_internal_bitshuffle(const size_t bytesoftype, const size_t blocksize,
395
                          const uint8_t* const _src, const uint8_t* _dest,
396
89.8k
                          const uint8_t* _tmp) {
397
89.8k
  int size = blocksize / bytesoftype;
398
  /* Initialize the shuffle implementation if necessary. */
399
89.8k
  init_shuffle_implementation();
400
401
89.8k
  if ((size % 8) == 0) {
402
    /* The number of elems is a multiple of 8 which is supported by
403
       bitshuffle. */
404
86.2k
    int ret = (int)(host_implementation.bitshuffle)((void *) _src, (void *) _dest,
405
86.2k
                                                    blocksize / bytesoftype,
406
86.2k
                                                    bytesoftype, (void *) _tmp);
407
    /* Copy the leftovers */
408
86.2k
    size_t offset = size * bytesoftype;
409
86.2k
    memcpy((void *) (_dest + offset), (void *) (_src + offset), blocksize - offset);
410
86.2k
    return ret;
411
86.2k
  }
412
3.54k
  else {
413
3.54k
    memcpy((void *) _dest, (void *) _src, blocksize);
414
3.54k
  }
415
3.54k
  return size;
416
89.8k
}
417
418
/*  Bit-unshuffle a block by dynamically dispatching to the appropriate
419
    hardware-accelerated routine at run-time. */
420
int
421
blosc_internal_bitunshuffle(const size_t bytesoftype, const size_t blocksize,
422
                            const uint8_t* const _src, const uint8_t* _dest,
423
16.4k
                            const uint8_t* _tmp) {
424
16.4k
  int size = blocksize / bytesoftype;
425
  /* Initialize the shuffle implementation if necessary. */
426
16.4k
  init_shuffle_implementation();
427
428
16.4k
  if ((size % 8) == 0) {
429
    /* The number of elems is a multiple of 8 which is supported by
430
       bitshuffle. */
431
16.0k
    int ret = (int) (host_implementation.bitunshuffle)((void *) _src, (void *) _dest,
432
16.0k
                                                       blocksize / bytesoftype,
433
16.0k
                                                       bytesoftype, (void *) _tmp);
434
    /* Copy the leftovers */
435
16.0k
    size_t offset = size * bytesoftype;
436
16.0k
    memcpy((void *) (_dest + offset), (void *) (_src + offset), blocksize - offset);
437
16.0k
    return ret;
438
16.0k
  }
439
379
  else {
440
379
    memcpy((void *) _dest, (void *) _src, blocksize);
441
379
  }
442
379
  return size;
443
16.4k
}