Coverage Report

Created: 2026-02-14 07:05

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/c-blosc2/blosc/shuffle.c
Line
Count
Source
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
#include "blosc2.h" /* needs to be included first to define macros */
12
#include "shuffle.h"
13
14
/*  Include hardware-accelerated shuffle/unshuffle routines based on
15
    the target architecture. Note that a target architecture may support
16
    more than one type of acceleration!*/
17
#if defined(SHUFFLE_AVX512_ENABLED)
18
  #include "bitshuffle-avx512.h"
19
#endif  /* defined(SHUFFLE_AVX512_ENABLED) */
20
21
#if defined(SHUFFLE_AVX2_ENABLED)
22
  #include "shuffle-avx2.h"
23
  #include "bitshuffle-avx2.h"
24
#endif  /* defined(SHUFFLE_AVX2_ENABLED) */
25
26
#if defined(SHUFFLE_SSE2_ENABLED)
27
  #include "shuffle-sse2.h"
28
  #include "bitshuffle-sse2.h"
29
#endif  /* defined(SHUFFLE_SSE2_ENABLED) */
30
31
#if defined(SHUFFLE_NEON_ENABLED)
32
  #if defined(__linux__)
33
    #include <sys/auxv.h>
34
    #ifdef ARM_ASM_HWCAP
35
      #include <asm/hwcap.h>
36
    #endif
37
  #endif
38
  #include "shuffle-neon.h"
39
  #include "bitshuffle-neon.h"
40
#endif  /* defined(SHUFFLE_NEON_ENABLED) */
41
42
#if defined(SHUFFLE_ALTIVEC_ENABLED)
43
  #include "shuffle-altivec.h"
44
  #include "bitshuffle-altivec.h"
45
#endif  /* defined(SHUFFLE_ALTIVEC_ENABLED) */
46
47
#include "shuffle-generic.h"
48
#include "bitshuffle-generic.h"
49
50
#include <stdio.h>
51
#include <string.h>
52
53
// __builtin_cpu_supports() fixed in GCC 8: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85100
54
// Also, clang added support for it in clang 10 at the very least (and possibly since 3.8)
55
// Handle clang on Windows without MSVC
56
#if (defined(__clang__) && (__clang_major__ >= 10)) && !(defined(__APPLE__) && defined(__x86_64__) && defined(BUILD_STATIC)) \
57
      && !defined(_WIN32)|| \
58
    (defined(__GNUC__) && defined(__GNUC_MINOR__) && __GNUC__ >= 8)
59
#define HAVE_CPU_FEAT_INTRIN
60
#endif
61
62
63
/*  Define function pointer types for shuffle/unshuffle routines. */
64
typedef void(* shuffle_func)(const int32_t, const int32_t, const uint8_t*, uint8_t*);
65
typedef void(* unshuffle_func)(const int32_t, const int32_t, const uint8_t*, uint8_t*);
66
// For bitshuffle, everything is done in terms of size_t and int64_t (return value)
67
// and although this is not strictly necessary for Blosc, it does not hurt either
68
typedef int64_t(* bitshuffle_func)(const void*, void*, const size_t, const size_t);
69
typedef int64_t(* bitunshuffle_func)(const void*, void*, const size_t, const size_t);
70
71
/* An implementation of shuffle/unshuffle routines. */
72
typedef struct shuffle_implementation {
73
  /* Name of this implementation. */
74
  const char* name;
75
  /* Function pointer to the shuffle routine for this implementation. */
76
  shuffle_func shuffle;
77
  /* Function pointer to the unshuffle routine for this implementation. */
78
  unshuffle_func unshuffle;
79
  /* Function pointer to the bitshuffle routine for this implementation. */
80
  bitshuffle_func bitshuffle;
81
  /* Function pointer to the bitunshuffle routine for this implementation. */
82
  bitunshuffle_func bitunshuffle;
83
} shuffle_implementation_t;
84
85
typedef enum {
86
  BLOSC_HAVE_NOTHING = 0,
87
  BLOSC_HAVE_SSE2 = 1,
88
  BLOSC_HAVE_AVX2 = 2,
89
  BLOSC_HAVE_NEON = 4,
90
  BLOSC_HAVE_ALTIVEC = 8,
91
  BLOSC_HAVE_AVX512 = 16,
92
} blosc_cpu_features;
93
94
/* Detect hardware and set function pointers to the best shuffle/unshuffle
95
   implementations supported by the host processor. */
96
#if (defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED)) && \
97
    (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64))  /* Intel/i686 */
98
99
#if defined(HAVE_CPU_FEAT_INTRIN)
100
1
static blosc_cpu_features blosc_get_cpu_features(void) {
101
1
  blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING;
102
1
  if (__builtin_cpu_supports("sse2")) {
103
1
    cpu_features |= BLOSC_HAVE_SSE2;
104
1
  }
105
1
  if (__builtin_cpu_supports("avx2")) {
106
1
    cpu_features |= BLOSC_HAVE_AVX2;
107
1
  }
108
1
  if (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw")) {
109
0
    cpu_features |= BLOSC_HAVE_AVX512;
110
0
  }
111
1
  return cpu_features;
112
1
}
113
#else
114
115
#if defined(_MSC_VER)
116
  #include <immintrin.h>  /* Needed for _xgetbv */
117
  #include <intrin.h>     /* Needed for __cpuid */
118
#else
119
120
/*  Implement the __cpuid and __cpuidex intrinsics for GCC, Clang,
121
    and others using inline assembly. */
122
__attribute__((always_inline))
123
static inline void
124
__cpuidex(int32_t cpuInfo[4], int32_t function_id, int32_t subfunction_id) {
125
  __asm__ __volatile__ (
126
# if defined(__i386__) && defined (__PIC__)
127
  /*  Can't clobber ebx with PIC running under 32-bit, so it needs to be manually restored.
128
      https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
129
  */
130
    "movl %%ebx, %%edi\n\t"
131
    "cpuid\n\t"
132
    "xchgl %%ebx, %%edi":
133
    "=D" (cpuInfo[1]),
134
#else
135
    "cpuid":
136
    "=b" (cpuInfo[1]),
137
#endif  /* defined(__i386) && defined(__PIC__) */
138
    "=a" (cpuInfo[0]),
139
    "=c" (cpuInfo[2]),
140
    "=d" (cpuInfo[3]) :
141
    "a" (function_id), "c" (subfunction_id)
142
    );
143
}
144
145
#define __cpuid(cpuInfo, function_id) __cpuidex(cpuInfo, function_id, 0)
146
147
#define _XCR_XFEATURE_ENABLED_MASK 0
148
149
// GCC folks added _xgetbv in immintrin.h starting in GCC 9
150
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71659
151
#if !(defined(_IMMINTRIN_H_INCLUDED) && (BLOSC_GCC_VERSION >= 900)) && !defined(__IMMINTRIN_H)
152
/* Reads the content of an extended control register.
153
   https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
154
*/
155
static inline uint64_t
156
_xgetbv(uint32_t xcr) {
157
  uint32_t eax, edx;
158
  __asm__ __volatile__ (
159
    /* "xgetbv"
160
       This is specified as raw instruction bytes due to some older compilers
161
       having issues with the mnemonic form.
162
    */
163
    ".byte 0x0f, 0x01, 0xd0":
164
    "=a" (eax),
165
    "=d" (edx) :
166
    "c" (xcr)
167
    );
168
  return ((uint64_t)edx << 32) | eax;
169
}
170
#endif  // !(defined(_IMMINTRIN_H_INCLUDED) && (BLOSC_GCC_VERSION >= 900)) && !defined(__IMMINTRIN_H)
171
#endif /* defined(_MSC_VER) */
172
173
#ifndef _XCR_XFEATURE_ENABLED_MASK
174
#define _XCR_XFEATURE_ENABLED_MASK 0x0
175
#endif
176
177
static blosc_cpu_features blosc_get_cpu_features(void) {
178
  blosc_cpu_features result = BLOSC_HAVE_NOTHING;
179
  /* Holds the values of eax, ebx, ecx, edx set by the `cpuid` instruction */
180
  int32_t cpu_info[4];
181
182
  /* Get the number of basic functions available. */
183
  __cpuid(cpu_info, 0);
184
  int32_t max_basic_function_id = cpu_info[0];
185
186
  /* Check for SSE-based features and required OS support */
187
  __cpuid(cpu_info, 1);
188
  const bool sse2_available = (cpu_info[3] & (1 << 26)) != 0;
189
  const bool sse3_available = (cpu_info[2] & (1 << 0)) != 0;
190
  const bool ssse3_available = (cpu_info[2] & (1 << 9)) != 0;
191
  const bool sse41_available = (cpu_info[2] & (1 << 19)) != 0;
192
  const bool sse42_available = (cpu_info[2] & (1 << 20)) != 0;
193
194
  const bool xsave_available = (cpu_info[2] & (1 << 26)) != 0;
195
  const bool xsave_enabled_by_os = (cpu_info[2] & (1 << 27)) != 0;
196
197
  /* Check for AVX-based features, if the processor supports extended features. */
198
  bool avx2_available = false;
199
  bool avx512f_available = false;
200
  bool avx512bw_available = false;
201
  if (max_basic_function_id >= 7) {
202
    __cpuid(cpu_info, 7);
203
    avx2_available = (cpu_info[1] & (1 << 5)) != 0;
204
    avx512f_available = (cpu_info[1] & (1 << 16)) != 0;
205
    avx512bw_available = (cpu_info[1] & (1 << 30)) != 0;
206
  }
207
208
  /*  Even if certain features are supported by the CPU, they may not be supported
209
      by the OS (in which case using them would crash the process or system).
210
      If xsave is available and enabled by the OS, check the contents of the
211
      extended control register XCR0 to see if the CPU features are enabled. */
212
  bool xmm_state_enabled = false;
213
  bool ymm_state_enabled = false;
214
  // Silence an unused variable compiler warning
215
  // bool zmm_state_enabled = false;
216
217
#if defined(_XCR_XFEATURE_ENABLED_MASK)
218
  if (xsave_available && xsave_enabled_by_os && (
219
      sse2_available || sse3_available || ssse3_available
220
      || sse41_available || sse42_available
221
      || avx2_available || avx512f_available || avx512bw_available)) {
222
    /* Determine which register states can be restored by the OS. */
223
    uint64_t xcr0_contents = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
224
225
    xmm_state_enabled = (xcr0_contents & (1UL << 1)) != 0;
226
    ymm_state_enabled = (xcr0_contents & (1UL << 2)) != 0;
227
228
    /*  Require support for both the upper 256-bits of zmm0-zmm15 to be
229
        restored as well as all of zmm16-zmm31 and the opmask registers. */
230
    // zmm_state_enabled = (xcr0_contents & 0x70) == 0x70;
231
  }
232
#endif /* defined(_XCR_XFEATURE_ENABLED_MASK) */
233
234
#if defined(BLOSC_DUMP_CPU_INFO)
235
  printf("Shuffle CPU Information:\n");
236
  printf("SSE2 available: %s\n", sse2_available ? "True" : "False");
237
  printf("SSE3 available: %s\n", sse3_available ? "True" : "False");
238
  printf("SSSE3 available: %s\n", ssse3_available ? "True" : "False");
239
  printf("SSE4.1 available: %s\n", sse41_available ? "True" : "False");
240
  printf("SSE4.2 available: %s\n", sse42_available ? "True" : "False");
241
  printf("AVX2 available: %s\n", avx2_available ? "True" : "False");
242
  printf("AVX512F available: %s\n", avx512f_available ? "True" : "False");
243
  printf("AVX512BW available: %s\n", avx512bw_available ? "True" : "False");
244
  printf("XSAVE available: %s\n", xsave_available ? "True" : "False");
245
  printf("XSAVE enabled: %s\n", xsave_enabled_by_os ? "True" : "False");
246
  printf("XMM state enabled: %s\n", xmm_state_enabled ? "True" : "False");
247
  printf("YMM state enabled: %s\n", ymm_state_enabled ? "True" : "False");
248
  // printf("ZMM state enabled: %s\n", zmm_state_enabled ? "True" : "False");
249
#endif /* defined(BLOSC_DUMP_CPU_INFO) */
250
251
  /* Using the gathered CPU information, determine which implementation to use. */
252
  /* technically could fail on sse2 cpu on os without xmm support, but that
253
   * shouldn't exist anymore */
254
  if (sse2_available) {
255
    result |= BLOSC_HAVE_SSE2;
256
  }
257
  if (xmm_state_enabled && ymm_state_enabled && avx2_available) {
258
    result |= BLOSC_HAVE_AVX2;
259
  }
260
  if (xmm_state_enabled && ymm_state_enabled && avx512f_available && avx512bw_available) {
261
    result |= BLOSC_HAVE_AVX512;
262
  }
263
  return result;
264
}
265
#endif /* HAVE_CPU_FEAT_INTRIN */
266
267
#elif defined(SHUFFLE_NEON_ENABLED) /* ARM-NEON */
268
static blosc_cpu_features blosc_get_cpu_features(void) {
269
  blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING;
270
#if defined(__aarch64__)
271
  /* aarch64 always has NEON */
272
  cpu_features |= BLOSC_HAVE_NEON;
273
#elif defined(__linux__)
274
  if (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) {
275
    cpu_features |= BLOSC_HAVE_NEON;
276
  }
277
#endif
278
  return cpu_features;
279
}
280
#elif defined(SHUFFLE_ALTIVEC_ENABLED) /* POWER9-ALTIVEC preliminary test*/
281
static blosc_cpu_features blosc_get_cpu_features(void) {
282
  blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING;
283
  cpu_features |= BLOSC_HAVE_ALTIVEC;
284
  return cpu_features;
285
}
286
#else   /* No hardware acceleration supported for the target architecture. */
287
  #if defined(_MSC_VER)
288
    #pragma message("Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.")
289
  #else
290
    #warning Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.
291
  #endif
292
293
static blosc_cpu_features blosc_get_cpu_features(void) {
294
return BLOSC_HAVE_NOTHING;
295
}
296
297
#endif /* defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED) */
298
299
1
static shuffle_implementation_t get_shuffle_implementation(void) {
300
1
  blosc_cpu_features cpu_features = blosc_get_cpu_features();
301
1
#if defined(SHUFFLE_AVX512_ENABLED)
302
1
  if (cpu_features & BLOSC_HAVE_AVX512 && is_shuffle_avx2 && is_bshuf_AVX512) {
303
0
    shuffle_implementation_t impl_avx512;
304
0
    impl_avx512.name = "avx512";
305
0
    impl_avx512.shuffle = (shuffle_func)shuffle_avx2;
306
0
    impl_avx512.unshuffle = (unshuffle_func)unshuffle_avx2;
307
0
    impl_avx512.bitshuffle = (bitshuffle_func) bshuf_trans_bit_elem_AVX512;
308
0
    impl_avx512.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_AVX512;
309
0
    return impl_avx512;
310
0
  }
311
1
#endif  /* defined(SHUFFLE_AVX512_ENABLED) */
312
313
1
#if defined(SHUFFLE_AVX2_ENABLED)
314
1
  if (cpu_features & BLOSC_HAVE_AVX2 && is_shuffle_avx2 && is_bshuf_AVX) {
315
1
    shuffle_implementation_t impl_avx2;
316
1
    impl_avx2.name = "avx2";
317
1
    impl_avx2.shuffle = (shuffle_func)shuffle_avx2;
318
1
    impl_avx2.unshuffle = (unshuffle_func)unshuffle_avx2;
319
1
    impl_avx2.bitshuffle = (bitshuffle_func) bshuf_trans_bit_elem_AVX;
320
1
    impl_avx2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_AVX;
321
1
    return impl_avx2;
322
1
  }
323
0
#endif  /* defined(SHUFFLE_AVX2_ENABLED) */
324
325
0
#if defined(SHUFFLE_SSE2_ENABLED)
326
0
  if (cpu_features & BLOSC_HAVE_SSE2 && is_shuffle_sse2 && is_bshuf_SSE) {
327
0
    shuffle_implementation_t impl_sse2;
328
0
    impl_sse2.name = "sse2";
329
0
    impl_sse2.shuffle = (shuffle_func)shuffle_sse2;
330
0
    impl_sse2.unshuffle = (unshuffle_func)unshuffle_sse2;
331
0
    impl_sse2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_SSE;
332
0
    impl_sse2.bitunshuffle = (bitunshuffle_func) bshuf_untrans_bit_elem_SSE;
333
0
    return impl_sse2;
334
0
  }
335
0
#endif  /* defined(SHUFFLE_SSE2_ENABLED) */
336
337
#if defined(SHUFFLE_NEON_ENABLED)
338
  if (cpu_features & BLOSC_HAVE_NEON && is_shuffle_neon) { // && is_bshuf_NEON if using NEON bitshuffle
339
    shuffle_implementation_t impl_neon;
340
    impl_neon.name = "neon";
341
    impl_neon.shuffle = (shuffle_func)shuffle_neon;
342
    impl_neon.unshuffle = (unshuffle_func)unshuffle_neon;
343
    //impl_neon.shuffle = (shuffle_func)shuffle_generic;
344
    //impl_neon.unshuffle = (unshuffle_func)unshuffle_generic;
345
    //impl_neon.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_NEON;
346
    //impl_neon.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_NEON;
347
    // The current bitshuffle optimized for NEON is not any faster
348
    // (in fact, it is pretty much slower) than the scalar implementation.
349
    // So, let's use the scalar one, which is pretty fast, at least on a M1 CPU.
350
    impl_neon.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_scal;
351
    impl_neon.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_scal;
352
    return impl_neon;
353
  }
354
#endif  /* defined(SHUFFLE_NEON_ENABLED) */
355
356
#if defined(SHUFFLE_ALTIVEC_ENABLED)
357
  if (cpu_features & BLOSC_HAVE_ALTIVEC && is_shuffle_altivec && is_bshuf_altivec) {
358
    shuffle_implementation_t impl_altivec;
359
    impl_altivec.name = "altivec";
360
    impl_altivec.shuffle = (shuffle_func)shuffle_altivec;
361
    impl_altivec.unshuffle = (unshuffle_func)unshuffle_altivec;
362
    impl_altivec.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_altivec;
363
    impl_altivec.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_altivec;
364
    return impl_altivec;
365
  }
366
#endif  /* defined(SHUFFLE_ALTIVEC_ENABLED) */
367
368
  /* Processor doesn't support any of the hardware-accelerated implementations,
369
     so use the generic implementation. */
370
0
  shuffle_implementation_t impl_generic;
371
0
  impl_generic.name = "generic";
372
0
  impl_generic.shuffle = (shuffle_func)shuffle_generic;
373
0
  impl_generic.unshuffle = (unshuffle_func)unshuffle_generic;
374
0
  impl_generic.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_scal;
375
0
  impl_generic.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_scal;
376
0
  return impl_generic;
377
0
}
378
379
380
/* Flag indicating whether the implementation has been initialized.
381
   Zero means it hasn't been initialized, non-zero means it has. */
382
static int32_t implementation_initialized;
383
384
/* The dynamically-chosen shuffle/unshuffle implementation.
385
   This is only safe to use once `implementation_initialized` is set. */
386
static shuffle_implementation_t host_implementation;
387
388
/* Initialize the shuffle implementation, if necessary. */
389
#if defined(__GNUC__) || defined(__clang__)
390
__attribute__((always_inline))
391
#endif
392
static
393
#if defined(_MSC_VER)
394
__forceinline
395
#else
396
inline
397
#endif
398
12.7k
void init_shuffle_implementation(void) {
399
  /* Initialization could (in rare cases) take place concurrently on
400
     multiple threads, but it shouldn't matter because the
401
     initialization should return the same result on each thread (so
402
     the implementation will be the same). Since that's the case we
403
     can avoid complicated synchronization here and get a small
404
     performance benefit because we don't need to perform a volatile
405
     load on the initialization variable each time this function is
406
     called. */
407
12.7k
#if defined(__GNUC__) || defined(__clang__)
408
12.7k
  if (__builtin_expect(!implementation_initialized, 0)) {
409
#else
410
    if (!implementation_initialized) {
411
#endif
412
    /* Initialize the implementation. */
413
1
    host_implementation = get_shuffle_implementation();
414
415
    /* Set the flag indicating the implementation has been initialized. */
416
1
    implementation_initialized = 1;
417
1
  }
418
12.7k
}
419
420
/* Shuffle a block by dynamically dispatching to the appropriate
421
   hardware-accelerated routine at run-time. */
422
int32_t
423
blosc2_shuffle(const int32_t typesize, const int32_t blocksize,
424
0
               const void* src, void* dest) {
425
  /* Initialize the shuffle implementation if necessary. */
426
0
  init_shuffle_implementation();
427
428
0
  if (typesize < 1 || typesize > 256 || blocksize < 0) {
429
0
    return BLOSC2_ERROR_INVALID_PARAM;
430
0
  }
431
432
  /* The implementation is initialized.
433
     Dispatch to its shuffle routine. */
434
0
  (host_implementation.shuffle)(typesize, blocksize, src, dest);
435
436
0
  return blocksize;
437
0
}
438
439
/* Unshuffle a block by dynamically dispatching to the appropriate
440
   hardware-accelerated routine at run-time. */
441
int32_t
442
blosc2_unshuffle(const int32_t typesize, const int32_t blocksize,
443
7.38k
                 const void* src, void* dest) {
444
  /* Initialize the shuffle implementation if necessary. */
445
7.38k
  init_shuffle_implementation();
446
447
7.38k
  if (typesize < 1 || typesize > 256 || blocksize < 0) {
448
0
    return BLOSC2_ERROR_INVALID_PARAM;
449
0
  }
450
451
  /* The implementation is initialized.
452
     Dispatch to its unshuffle routine. */
453
7.38k
  (host_implementation.unshuffle)(typesize, blocksize, src, dest);
454
455
7.38k
  return blocksize;
456
7.38k
}
457
458
/*  Bit-shuffle a block by dynamically dispatching to the appropriate
459
    hardware-accelerated routine at run-time. */
460
int32_t
461
blosc2_bitshuffle(const int32_t typesize, const int32_t blocksize,
462
0
                  const void* src, void* dest) {
463
  /* Initialize the shuffle implementation if necessary. */
464
0
  init_shuffle_implementation();
465
0
  size_t size = blocksize / typesize;
466
467
0
  if (typesize < 1 || typesize > 256 || blocksize < 0) {
468
0
    return BLOSC2_ERROR_INVALID_PARAM;
469
0
  }
470
471
  /* bitshuffle only supports a number of elements that is a multiple of 8. */
472
0
  size -= size % 8;
473
0
  int ret = (int) (host_implementation.bitshuffle)(src, dest, size, typesize);
474
0
  if (ret < 0) {
475
    // Some error in bitshuffle (should not happen)
476
0
    BLOSC_TRACE_ERROR("the impossible happened: the bitshuffle filter failed!");
477
0
    return ret;
478
0
  }
479
480
  // Copy the leftovers
481
0
  size_t offset = size * typesize;
482
0
  memcpy((uint8_t *) dest + offset, (const uint8_t *) src + offset, blocksize - offset);
483
484
0
  return blocksize;
485
0
}
486
487
/*  Bit-unshuffle a block by dynamically dispatching to the appropriate
488
    hardware-accelerated routine at run-time. */
489
int32_t bitunshuffle(const int32_t typesize, const int32_t blocksize,
490
                     const void* src, void* dest,
491
5.34k
                     const uint8_t format_version) {
492
  /* Initialize the shuffle implementation if necessary. */
493
5.34k
  init_shuffle_implementation();
494
5.34k
  size_t size = blocksize / typesize;
495
496
5.34k
  if (format_version == 2) {
497
    /* Starting from version 3, bitshuffle() works differently */
498
1.93k
    if ((size % 8) == 0) {
499
      /* The number of elems is a multiple of 8 which is supported by
500
         bitshuffle. */
501
1.45k
      int ret = (int) (host_implementation.bitunshuffle)
502
1.45k
          ((const void *) src, (void *) dest, blocksize / typesize, typesize);
503
1.45k
      if (ret < 0) {
504
        // Some error in bitshuffle (should not happen)
505
0
        BLOSC_TRACE_ERROR("the impossible happened: the bitunshuffle filter failed!");
506
0
        return ret;
507
0
      }
508
1.45k
    }
509
480
    else {
510
480
      memcpy(dest, src, blocksize);
511
480
    }
512
1.93k
  }
513
3.40k
  else {
514
    /* bitshuffle only supports a number of bytes that is a multiple of 8. */
515
3.40k
    size -= size % 8;
516
3.40k
    int ret = (int) (host_implementation.bitunshuffle)(src, dest, size, typesize);
517
3.40k
    if (ret < 0) {
518
0
      BLOSC_TRACE_ERROR("the impossible happened: the bitunshuffle filter failed!");
519
0
      return ret;
520
0
    }
521
522
    /* Copy the leftovers */
523
3.40k
    size_t offset = size * typesize;
524
3.40k
    memcpy((uint8_t *) dest + offset, (const uint8_t *) src + offset, blocksize - offset);
525
3.40k
  }
526
527
5.34k
  return blocksize;
528
5.34k
}
529
530
/* Stub public API that redirects to internal implementation. */
531
int32_t
532
blosc2_bitunshuffle(const int32_t typesize, const int32_t blocksize,
533
0
                    const void * src, void* dest) {
534
0
  return bitunshuffle(typesize, blocksize, src, dest, BLOSC2_VERSION_FORMAT);
535
0
}