Coverage Report

Created: 2025-06-20 06:13

/src/c-blosc2/blosc/shuffle.c
Line
Count
Source (jump to first uncovered line)
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
#include "shuffle.h" /* needs to be included first to define macros */
12
13
/*  Include hardware-accelerated shuffle/unshuffle routines based on
14
    the target architecture. Note that a target architecture may support
15
    more than one type of acceleration!*/
16
#if defined(SHUFFLE_AVX512_ENABLED)
17
  #include "bitshuffle-avx512.h"
18
#endif  /* defined(SHUFFLE_AVX512_ENABLED) */
19
20
#if defined(SHUFFLE_AVX2_ENABLED)
21
  #include "shuffle-avx2.h"
22
  #include "bitshuffle-avx2.h"
23
#endif  /* defined(SHUFFLE_AVX2_ENABLED) */
24
25
#if defined(SHUFFLE_SSE2_ENABLED)
26
  #include "shuffle-sse2.h"
27
  #include "bitshuffle-sse2.h"
28
#endif  /* defined(SHUFFLE_SSE2_ENABLED) */
29
30
#if defined(SHUFFLE_NEON_ENABLED)
31
  #if defined(__linux__)
32
    #include <sys/auxv.h>
33
    #ifdef ARM_ASM_HWCAP
34
      #include <asm/hwcap.h>
35
    #endif
36
  #endif
37
  #include "shuffle-neon.h"
38
  #include "bitshuffle-neon.h"
39
#endif  /* defined(SHUFFLE_NEON_ENABLED) */
40
41
#if defined(SHUFFLE_ALTIVEC_ENABLED)
42
  #include "shuffle-altivec.h"
43
  #include "bitshuffle-altivec.h"
44
#endif  /* defined(SHUFFLE_ALTIVEC_ENABLED) */
45
46
#include "shuffle-generic.h"
47
#include "bitshuffle-generic.h"
48
#include "blosc2.h"
49
50
#include <stdio.h>
51
#include <string.h>
52
53
// __builtin_cpu_supports() fixed in GCC 8: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85100
54
// Also, clang added support for it in clang 10 at very least (and possibly since 3.8)
55
#if (defined(__clang__) && (__clang_major__ >= 10)) && !(defined(__APPLE__) && defined(__x86_64__) && defined(BUILD_STATIC)) || \
56
    (defined(__GNUC__) && defined(__GNUC_MINOR__) && __GNUC__ >= 8)
57
#define HAVE_CPU_FEAT_INTRIN
58
#endif
59
60
61
/*  Define function pointer types for shuffle/unshuffle routines. */
62
typedef void(* shuffle_func)(const int32_t, const int32_t, const uint8_t*, uint8_t*);
63
typedef void(* unshuffle_func)(const int32_t, const int32_t, const uint8_t*, uint8_t*);
64
// For bitshuffle, everything is done in terms of size_t and int64_t (return value)
65
// and although this is not strictly necessary for Blosc, it does not hurt either
66
typedef int64_t(* bitshuffle_func)(const void*, void*, const size_t, const size_t);
67
typedef int64_t(* bitunshuffle_func)(const void*, void*, const size_t, const size_t);
68
69
/* An implementation of shuffle/unshuffle routines. */
70
typedef struct shuffle_implementation {
71
  /* Name of this implementation. */
72
  const char* name;
73
  /* Function pointer to the shuffle routine for this implementation. */
74
  shuffle_func shuffle;
75
  /* Function pointer to the unshuffle routine for this implementation. */
76
  unshuffle_func unshuffle;
77
  /* Function pointer to the bitshuffle routine for this implementation. */
78
  bitshuffle_func bitshuffle;
79
  /* Function pointer to the bitunshuffle routine for this implementation. */
80
  bitunshuffle_func bitunshuffle;
81
} shuffle_implementation_t;
82
83
typedef enum {
84
  BLOSC_HAVE_NOTHING = 0,
85
  BLOSC_HAVE_SSE2 = 1,
86
  BLOSC_HAVE_AVX2 = 2,
87
  BLOSC_HAVE_NEON = 4,
88
  BLOSC_HAVE_ALTIVEC = 8,
89
  BLOSC_HAVE_AVX512 = 16,
90
} blosc_cpu_features;
91
92
/* Detect hardware and set function pointers to the best shuffle/unshuffle
93
   implementations supported by the host processor. */
94
#if (defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED)) && \
95
    (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64))  /* Intel/i686 */
96
97
#if defined(HAVE_CPU_FEAT_INTRIN)
98
1
static blosc_cpu_features blosc_get_cpu_features(void) {
99
1
  blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING;
100
1
  if (__builtin_cpu_supports("sse2")) {
101
1
    cpu_features |= BLOSC_HAVE_SSE2;
102
1
  }
103
1
  if (__builtin_cpu_supports("avx2")) {
104
1
    cpu_features |= BLOSC_HAVE_AVX2;
105
1
  }
106
1
  if (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw")) {
107
0
    cpu_features |= BLOSC_HAVE_AVX512;
108
0
  }
109
1
  return cpu_features;
110
1
}
111
#else
112
113
#if defined(_MSC_VER) && !defined(__clang__)
114
  #include <immintrin.h>  /* Needed for _xgetbv */
115
  #include <intrin.h>     /* Needed for __cpuid */
116
#else
117
118
/*  Implement the __cpuid and __cpuidex intrinsics for GCC, Clang,
119
    and others using inline assembly. */
120
__attribute__((always_inline))
121
static inline void
122
__cpuidex(int32_t cpuInfo[4], int32_t function_id, int32_t subfunction_id) {
123
  __asm__ __volatile__ (
124
# if defined(__i386__) && defined (__PIC__)
125
  /*  Can't clobber ebx with PIC running under 32-bit, so it needs to be manually restored.
126
      https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
127
  */
128
    "movl %%ebx, %%edi\n\t"
129
    "cpuid\n\t"
130
    "xchgl %%ebx, %%edi":
131
    "=D" (cpuInfo[1]),
132
#else
133
    "cpuid":
134
    "=b" (cpuInfo[1]),
135
#endif  /* defined(__i386) && defined(__PIC__) */
136
    "=a" (cpuInfo[0]),
137
    "=c" (cpuInfo[2]),
138
    "=d" (cpuInfo[3]) :
139
    "a" (function_id), "c" (subfunction_id)
140
    );
141
}
142
143
#define __cpuid(cpuInfo, function_id) __cpuidex(cpuInfo, function_id, 0)
144
145
#define _XCR_XFEATURE_ENABLED_MASK 0
146
147
// GCC folks added _xgetbv in immintrin.h starting in GCC 9
148
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71659
149
#if !(defined(_IMMINTRIN_H_INCLUDED) && (BLOSC_GCC_VERSION >= 900)) && !defined(__IMMINTRIN_H)
150
/* Reads the content of an extended control register.
151
   https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
152
*/
153
static inline uint64_t
154
_xgetbv(uint32_t xcr) {
155
  uint32_t eax, edx;
156
  __asm__ __volatile__ (
157
    /* "xgetbv"
158
       This is specified as raw instruction bytes due to some older compilers
159
       having issues with the mnemonic form.
160
    */
161
    ".byte 0x0f, 0x01, 0xd0":
162
    "=a" (eax),
163
    "=d" (edx) :
164
    "c" (xcr)
165
    );
166
  return ((uint64_t)edx << 32) | eax;
167
}
168
#endif  // !(defined(_IMMINTRIN_H_INCLUDED) && (BLOSC_GCC_VERSION >= 900)) && !defined(__IMMINTRIN_H)
169
#endif /* defined(_MSC_VER) */
170
171
#ifndef _XCR_XFEATURE_ENABLED_MASK
172
#define _XCR_XFEATURE_ENABLED_MASK 0x0
173
#endif
174
175
static blosc_cpu_features blosc_get_cpu_features(void) {
176
  blosc_cpu_features result = BLOSC_HAVE_NOTHING;
177
  /* Holds the values of eax, ebx, ecx, edx set by the `cpuid` instruction */
178
  int32_t cpu_info[4];
179
180
  /* Get the number of basic functions available. */
181
  __cpuid(cpu_info, 0);
182
  int32_t max_basic_function_id = cpu_info[0];
183
184
  /* Check for SSE-based features and required OS support */
185
  __cpuid(cpu_info, 1);
186
  const bool sse2_available = (cpu_info[3] & (1 << 26)) != 0;
187
  const bool sse3_available = (cpu_info[2] & (1 << 0)) != 0;
188
  const bool ssse3_available = (cpu_info[2] & (1 << 9)) != 0;
189
  const bool sse41_available = (cpu_info[2] & (1 << 19)) != 0;
190
  const bool sse42_available = (cpu_info[2] & (1 << 20)) != 0;
191
192
  const bool xsave_available = (cpu_info[2] & (1 << 26)) != 0;
193
  const bool xsave_enabled_by_os = (cpu_info[2] & (1 << 27)) != 0;
194
195
  /* Check for AVX-based features, if the processor supports extended features. */
196
  bool avx2_available = false;
197
  bool avx512f_available = false;
198
  bool avx512bw_available = false;
199
  if (max_basic_function_id >= 7) {
200
    __cpuid(cpu_info, 7);
201
    avx2_available = (cpu_info[1] & (1 << 5)) != 0;
202
    avx512f_available = (cpu_info[1] & (1 << 16)) != 0;
203
    avx512bw_available = (cpu_info[1] & (1 << 30)) != 0;
204
  }
205
206
  /*  Even if certain features are supported by the CPU, they may not be supported
207
      by the OS (in which case using them would crash the process or system).
208
      If xsave is available and enabled by the OS, check the contents of the
209
      extended control register XCR0 to see if the CPU features are enabled. */
210
  bool xmm_state_enabled = false;
211
  bool ymm_state_enabled = false;
212
  // Silence an unused variable compiler warning
213
  // bool zmm_state_enabled = false;
214
215
#if defined(_XCR_XFEATURE_ENABLED_MASK)
216
  if (xsave_available && xsave_enabled_by_os && (
217
      sse2_available || sse3_available || ssse3_available
218
      || sse41_available || sse42_available
219
      || avx2_available || avx512f_available || avx512bw_available)) {
220
    /* Determine which register states can be restored by the OS. */
221
    uint64_t xcr0_contents = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
222
223
    xmm_state_enabled = (xcr0_contents & (1UL << 1)) != 0;
224
    ymm_state_enabled = (xcr0_contents & (1UL << 2)) != 0;
225
226
    /*  Require support for both the upper 256-bits of zmm0-zmm15 to be
227
        restored as well as all of zmm16-zmm31 and the opmask registers. */
228
    // zmm_state_enabled = (xcr0_contents & 0x70) == 0x70;
229
  }
230
#endif /* defined(_XCR_XFEATURE_ENABLED_MASK) */
231
232
#if defined(BLOSC_DUMP_CPU_INFO)
233
  printf("Shuffle CPU Information:\n");
234
  printf("SSE2 available: %s\n", sse2_available ? "True" : "False");
235
  printf("SSE3 available: %s\n", sse3_available ? "True" : "False");
236
  printf("SSSE3 available: %s\n", ssse3_available ? "True" : "False");
237
  printf("SSE4.1 available: %s\n", sse41_available ? "True" : "False");
238
  printf("SSE4.2 available: %s\n", sse42_available ? "True" : "False");
239
  printf("AVX2 available: %s\n", avx2_available ? "True" : "False");
240
  printf("AVX512F available: %s\n", avx512f_available ? "True" : "False");
241
  printf("AVX512BW available: %s\n", avx512bw_available ? "True" : "False");
242
  printf("XSAVE available: %s\n", xsave_available ? "True" : "False");
243
  printf("XSAVE enabled: %s\n", xsave_enabled_by_os ? "True" : "False");
244
  printf("XMM state enabled: %s\n", xmm_state_enabled ? "True" : "False");
245
  printf("YMM state enabled: %s\n", ymm_state_enabled ? "True" : "False");
246
  // printf("ZMM state enabled: %s\n", zmm_state_enabled ? "True" : "False");
247
#endif /* defined(BLOSC_DUMP_CPU_INFO) */
248
249
  /* Using the gathered CPU information, determine which implementation to use. */
250
  /* technically could fail on sse2 cpu on os without xmm support, but that
251
   * shouldn't exist anymore */
252
  if (sse2_available) {
253
    result |= BLOSC_HAVE_SSE2;
254
  }
255
  if (xmm_state_enabled && ymm_state_enabled && avx2_available) {
256
    result |= BLOSC_HAVE_AVX2;
257
  }
258
  if (xmm_state_enabled && ymm_state_enabled && avx512f_available && avx512bw_available) {
259
    result |= BLOSC_HAVE_AVX512;
260
  }
261
  return result;
262
}
263
#endif /* HAVE_CPU_FEAT_INTRIN */
264
265
#elif defined(SHUFFLE_NEON_ENABLED) /* ARM-NEON */
266
static blosc_cpu_features blosc_get_cpu_features(void) {
267
  blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING;
268
#if defined(__aarch64__)
269
  /* aarch64 always has NEON */
270
  cpu_features |= BLOSC_HAVE_NEON;
271
#elif defined(__linux__)
272
  if (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) {
273
    cpu_features |= BLOSC_HAVE_NEON;
274
  }
275
#endif
276
  return cpu_features;
277
}
278
#elif defined(SHUFFLE_ALTIVEC_ENABLED) /* POWER9-ALTIVEC preliminary test*/
279
static blosc_cpu_features blosc_get_cpu_features(void) {
280
  blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING;
281
  cpu_features |= BLOSC_HAVE_ALTIVEC;
282
  return cpu_features;
283
}
284
#else   /* No hardware acceleration supported for the target architecture. */
285
  #if defined(_MSC_VER)
286
    #pragma message("Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.")
287
  #else
288
    #warning Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.
289
  #endif
290
291
static blosc_cpu_features blosc_get_cpu_features(void) {
292
return BLOSC_HAVE_NOTHING;
293
}
294
295
#endif /* defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED) */
296
297
1
static shuffle_implementation_t get_shuffle_implementation(void) {
298
1
  blosc_cpu_features cpu_features = blosc_get_cpu_features();
299
1
#if defined(SHUFFLE_AVX512_ENABLED)
300
1
  if (cpu_features & BLOSC_HAVE_AVX512 && is_shuffle_avx2 && is_bshuf_AVX512) {
301
0
    shuffle_implementation_t impl_avx512;
302
0
    impl_avx512.name = "avx512";
303
0
    impl_avx512.shuffle = (shuffle_func)shuffle_avx2;
304
0
    impl_avx512.unshuffle = (unshuffle_func)unshuffle_avx2;
305
0
    impl_avx512.bitshuffle = (bitshuffle_func) bshuf_trans_bit_elem_AVX512;
306
0
    impl_avx512.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_AVX512;
307
0
    return impl_avx512;
308
0
  }
309
1
#endif  /* defined(SHUFFLE_AVX512_ENABLED) */
310
311
1
#if defined(SHUFFLE_AVX2_ENABLED)
312
1
  if (cpu_features & BLOSC_HAVE_AVX2 && is_shuffle_avx2 && is_bshuf_AVX) {
313
1
    shuffle_implementation_t impl_avx2;
314
1
    impl_avx2.name = "avx2";
315
1
    impl_avx2.shuffle = (shuffle_func)shuffle_avx2;
316
1
    impl_avx2.unshuffle = (unshuffle_func)unshuffle_avx2;
317
1
    impl_avx2.bitshuffle = (bitshuffle_func) bshuf_trans_bit_elem_AVX;
318
1
    impl_avx2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_AVX;
319
1
    return impl_avx2;
320
1
  }
321
0
#endif  /* defined(SHUFFLE_AVX2_ENABLED) */
322
323
0
#if defined(SHUFFLE_SSE2_ENABLED)
324
0
  if (cpu_features & BLOSC_HAVE_SSE2 && is_shuffle_sse2 && is_bshuf_SSE) {
325
0
    shuffle_implementation_t impl_sse2;
326
0
    impl_sse2.name = "sse2";
327
0
    impl_sse2.shuffle = (shuffle_func)shuffle_sse2;
328
0
    impl_sse2.unshuffle = (unshuffle_func)unshuffle_sse2;
329
0
    impl_sse2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_SSE;
330
0
    impl_sse2.bitunshuffle = (bitunshuffle_func) bshuf_untrans_bit_elem_SSE;
331
0
    return impl_sse2;
332
0
  }
333
0
#endif  /* defined(SHUFFLE_SSE2_ENABLED) */
334
335
#if defined(SHUFFLE_NEON_ENABLED)
336
  if (cpu_features & BLOSC_HAVE_NEON && is_shuffle_neon) { // && is_bshuf_NEON if using NEON bitshuffle
337
    shuffle_implementation_t impl_neon;
338
    impl_neon.name = "neon";
339
    impl_neon.shuffle = (shuffle_func)shuffle_neon;
340
    impl_neon.unshuffle = (unshuffle_func)unshuffle_neon;
341
    //impl_neon.shuffle = (shuffle_func)shuffle_generic;
342
    //impl_neon.unshuffle = (unshuffle_func)unshuffle_generic;
343
    //impl_neon.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_NEON;
344
    //impl_neon.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_NEON;
345
    // The current bitshuffle optimized for NEON is not any faster
346
    // (in fact, it is pretty much slower) than the scalar implementation.
347
    // So, let's use the scalar one, which is pretty fast, at least on a M1 CPU.
348
    impl_neon.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_scal;
349
    impl_neon.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_scal;
350
    return impl_neon;
351
  }
352
#endif  /* defined(SHUFFLE_NEON_ENABLED) */
353
354
#if defined(SHUFFLE_ALTIVEC_ENABLED)
355
  if (cpu_features & BLOSC_HAVE_ALTIVEC && is_shuffle_altivec && is_bshuf_altivec) {
356
    shuffle_implementation_t impl_altivec;
357
    impl_altivec.name = "altivec";
358
    impl_altivec.shuffle = (shuffle_func)shuffle_altivec;
359
    impl_altivec.unshuffle = (unshuffle_func)unshuffle_altivec;
360
    impl_altivec.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_altivec;
361
    impl_altivec.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_altivec;
362
    return impl_altivec;
363
  }
364
#endif  /* defined(SHUFFLE_ALTIVEC_ENABLED) */
365
366
  /* Processor doesn't support any of the hardware-accelerated implementations,
367
     so use the generic implementation. */
368
0
  shuffle_implementation_t impl_generic;
369
0
  impl_generic.name = "generic";
370
0
  impl_generic.shuffle = (shuffle_func)shuffle_generic;
371
0
  impl_generic.unshuffle = (unshuffle_func)unshuffle_generic;
372
0
  impl_generic.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_scal;
373
0
  impl_generic.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_scal;
374
0
  return impl_generic;
375
0
}
376
377
378
/* Flag indicating whether the implementation has been initialized.
379
   Zero means it hasn't been initialized, non-zero means it has. */
380
static int32_t implementation_initialized;
381
382
/* The dynamically-chosen shuffle/unshuffle implementation.
383
   This is only safe to use once `implementation_initialized` is set. */
384
static shuffle_implementation_t host_implementation;
385
386
/* Initialize the shuffle implementation, if necessary. */
387
#if defined(__GNUC__) || defined(__clang__)
388
__attribute__((always_inline))
389
#endif
390
static
391
#if defined(_MSC_VER)
392
__forceinline
393
#else
394
inline
395
#endif
396
168k
void init_shuffle_implementation(void) {
397
  /* Initialization could (in rare cases) take place concurrently on
398
     multiple threads, but it shouldn't matter because the
399
     initialization should return the same result on each thread (so
400
     the implementation will be the same). Since that's the case we
401
     can avoid complicated synchronization here and get a small
402
     performance benefit because we don't need to perform a volatile
403
     load on the initialization variable each time this function is
404
     called. */
405
168k
#if defined(__GNUC__) || defined(__clang__)
406
168k
  if (__builtin_expect(!implementation_initialized, 0)) {
407
#else
408
    if (!implementation_initialized) {
409
#endif
410
    /* Initialize the implementation. */
411
1
    host_implementation = get_shuffle_implementation();
412
413
    /* Set the flag indicating the implementation has been initialized. */
414
1
    implementation_initialized = 1;
415
1
  }
416
168k
}
417
418
/* Shuffle a block by dynamically dispatching to the appropriate
419
   hardware-accelerated routine at run-time. */
420
void
421
shuffle(const int32_t bytesoftype, const int32_t blocksize,
422
0
        const uint8_t* _src, uint8_t* _dest) {
423
  /* Initialize the shuffle implementation if necessary. */
424
0
  init_shuffle_implementation();
425
426
  /* The implementation is initialized.
427
     Dispatch to its shuffle routine. */
428
0
  (host_implementation.shuffle)(bytesoftype, blocksize, _src, _dest);
429
0
}
430
431
/* Unshuffle a block by dynamically dispatching to the appropriate
432
   hardware-accelerated routine at run-time. */
433
void
434
unshuffle(const int32_t bytesoftype, const int32_t blocksize,
435
0
          const uint8_t* _src, uint8_t* _dest) {
436
  /* Initialize the shuffle implementation if necessary. */
437
0
  init_shuffle_implementation();
438
439
  /* The implementation is initialized.
440
     Dispatch to its unshuffle routine. */
441
0
  (host_implementation.unshuffle)(bytesoftype, blocksize, _src, _dest);
442
0
}
443
444
/*  Bit-shuffle a block by dynamically dispatching to the appropriate
445
    hardware-accelerated routine at run-time. */
446
int32_t
447
bitshuffle(const int32_t bytesoftype, const int32_t blocksize,
448
134k
           const uint8_t *_src, uint8_t *_dest) {
449
  /* Initialize the shuffle implementation if necessary. */
450
134k
  init_shuffle_implementation();
451
134k
  size_t size = blocksize / bytesoftype;
452
  /* bitshuffle only supports a number of elements that is a multiple of 8. */
453
134k
  size -= size % 8;
454
134k
  int ret = (int) (host_implementation.bitshuffle)
455
134k
      ((const void *) _src, (void *) _dest, size, bytesoftype);
456
134k
  if (ret < 0) {
457
    // Some error in bitshuffle (should not happen)
458
0
    BLOSC_TRACE_ERROR("the impossible happened: the bitshuffle filter failed!");
459
0
    return ret;
460
0
  }
461
462
  // Copy the leftovers
463
134k
  size_t offset = size * bytesoftype;
464
134k
  memcpy((void *) (_dest + offset), (void *) (_src + offset), blocksize - offset);
465
466
134k
  return blocksize;
467
134k
}
468
469
/*  Bit-unshuffle a block by dynamically dispatching to the appropriate
470
    hardware-accelerated routine at run-time. */
471
int32_t bitunshuffle(const int32_t bytesoftype, const int32_t blocksize,
472
                     const uint8_t *_src, uint8_t *_dest,
473
34.0k
                     const uint8_t format_version) {
474
  /* Initialize the shuffle implementation if necessary. */
475
34.0k
  init_shuffle_implementation();
476
34.0k
  size_t size = blocksize / bytesoftype;
477
478
34.0k
  if (format_version == 2) {
479
    /* Starting from version 3, bitshuffle() works differently */
480
0
    if ((size % 8) == 0) {
481
      /* The number of elems is a multiple of 8 which is supported by
482
         bitshuffle. */
483
0
      int ret = (int) (host_implementation.bitunshuffle)
484
0
          ((const void *) _src, (void *) _dest, blocksize / bytesoftype, bytesoftype);
485
0
      if (ret < 0) {
486
        // Some error in bitshuffle (should not happen)
487
0
        BLOSC_TRACE_ERROR("the impossible happened: the bitunshuffle filter failed!");
488
0
        return ret;
489
0
      }
490
      /* Copy the leftovers (we do so starting from c-blosc 1.18 on) */
491
0
      size_t offset = size * bytesoftype;
492
0
      memcpy((void *) (_dest + offset), (void *) (_src + offset), blocksize - offset);
493
0
    }
494
0
    else {
495
0
      memcpy((void *) _dest, (void *) _src, blocksize);
496
0
    }
497
0
  }
498
34.0k
  else {
499
    /* bitshuffle only supports a number of bytes that is a multiple of 8. */
500
34.0k
    size -= size % 8;
501
34.0k
    int ret = (int) (host_implementation.bitunshuffle)
502
34.0k
        ((const void *) _src, (void *) _dest, size, bytesoftype);
503
34.0k
    if (ret < 0) {
504
0
      BLOSC_TRACE_ERROR("the impossible happened: the bitunshuffle filter failed!");
505
0
      return ret;
506
0
    }
507
508
    /* Copy the leftovers */
509
34.0k
    size_t offset = size * bytesoftype;
510
34.0k
    memcpy((void *) (_dest + offset), (void *) (_src + offset), blocksize - offset);
511
34.0k
  }
512
513
34.0k
  return blocksize;
514
34.0k
}