Coverage Report

Created: 2024-05-21 06:09

/src/c-blosc/blosc/shuffle.c
Line
Count
Source (jump to first uncovered line)
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Author: Francesc Alted <francesc@blosc.org>
5
  Creation date: 2009-05-20
6
7
  See LICENSE.txt for details about copyright and rights to use.
8
**********************************************************************/
9
10
#include "shuffle.h"
11
#include "blosc-common.h"
12
#include "shuffle-generic.h"
13
#include "bitshuffle-generic.h"
14
#include "blosc-comp-features.h"
15
#include <stdio.h>
16
17
#if defined(_WIN32)
18
#include "win32/pthread.h"
19
#else
20
#include <pthread.h>
21
#endif
22
23
/* Visual Studio < 2013 does not have stdbool.h so here it is a replacement: */
24
#if defined __STDC__ && defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
25
/* have a C99 compiler */
26
typedef _Bool bool;
27
#else
28
/* do not have a C99 compiler */
29
typedef unsigned char bool;
30
#endif
31
32
33
/*  HAVE_CPU_FEAT_INTRIN marks compilers on which __builtin_cpu_supports()
    may be used: GCC 4.8 or newer, excluding Clang (which also defines
    __GNUC__).  The version test is parenthesized as a whole so that the
    Clang and defined() guards apply to both the ">= 5" and the "4.8+"
    alternatives; without the parentheses `||` would bypass them. */
#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \
    (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
#define HAVE_CPU_FEAT_INTRIN
#endif
37
38
39
/*  Include hardware-accelerated shuffle/unshuffle routines based on
40
    the target architecture. Note that a target architecture may support
41
    more than one type of acceleration!*/
42
#if defined(SHUFFLE_AVX2_ENABLED)
43
  #include "shuffle-avx2.h"
44
  #include "bitshuffle-avx2.h"
45
#endif  /* defined(SHUFFLE_AVX2_ENABLED) */
46
47
#if defined(SHUFFLE_SSE2_ENABLED)
48
  #include "shuffle-sse2.h"
49
  #include "bitshuffle-sse2.h"
50
#endif  /* defined(SHUFFLE_SSE2_ENABLED) */
51
52
53
/*  Define function pointer types for shuffle/unshuffle routines. */
54
/*  Signatures of the dispatchable kernels.
    Byte shuffle/unshuffle kernels take two size_t values followed by two
    byte pointers and return nothing; bit shuffle/unshuffle kernels take two
    untyped buffers, two size_t values and a third untyped buffer, and
    return an int64_t. */
typedef void (*shuffle_func)(size_t, size_t, const uint8_t *, const uint8_t *);
typedef void (*unshuffle_func)(size_t, size_t, const uint8_t *, const uint8_t *);
typedef int64_t (*bitshuffle_func)(void *, void *, size_t, size_t, void *);
typedef int64_t (*bitunshuffle_func)(void *, void *, size_t, size_t, void *);
58
59
/* An implementation of shuffle/unshuffle routines. */
60
typedef struct shuffle_implementation {
61
  /* Name of this implementation. */
62
  const char* name;
63
  /* Function pointer to the shuffle routine for this implementation. */
64
  shuffle_func shuffle;
65
  /* Function pointer to the unshuffle routine for this implementation. */
66
  unshuffle_func unshuffle;
67
  /* Function pointer to the bitshuffle routine for this implementation. */
68
  bitshuffle_func bitshuffle;
69
  /* Function pointer to the bitunshuffle routine for this implementation. */
70
  bitunshuffle_func bitunshuffle;
71
} shuffle_implementation_t;
72
73
/*  Flags describing which accelerated instruction sets may be used.
    Each non-zero value is a distinct bit so the flags can be OR-ed together. */
typedef enum {
  BLOSC_HAVE_NOTHING = 0,
  BLOSC_HAVE_SSE2 = 1 << 0,
  BLOSC_HAVE_AVX2 = 1 << 1
} blosc_cpu_features;
78
79
/*  Detect hardware and set function pointers to the best shuffle/unshuffle
80
    implementations supported by the host processor for Intel/i686
81
     */
82
#if (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)) \
83
    && (defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED))
84
85
/*  Disabled the __builtin_cpu_supports() call, as it has issues with
86
    new versions of gcc (like 5.3.1 in forthcoming ubuntu/xenial:
87
      "undefined symbol: __cpu_model"
88
    For a similar report, see:
89
    https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/thread/ZM2L65WIZEEQHHLFERZYD5FAG7QY2OGB/
90
*/
91
#if defined(HAVE_CPU_FEAT_INTRIN) && 0
92
static blosc_cpu_features blosc_get_cpu_features(void) {
93
  blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING;
94
  if (__builtin_cpu_supports("sse2")) {
95
    cpu_features |= BLOSC_HAVE_SSE2;
96
  }
97
  if (__builtin_cpu_supports("avx2")) {
98
    cpu_features |= BLOSC_HAVE_AVX2;
99
  }
100
  return cpu_features;
101
}
102
#else
103
104
#if defined(_MSC_VER) && !defined(__clang__)
105
  #include <intrin.h>     /* Needed for __cpuid */
106
107
/*  _xgetbv is only supported by VS2010 SP1 and newer versions of VS. */
108
#if _MSC_FULL_VER >= 160040219
109
  #include <immintrin.h>  /* Needed for _xgetbv */
110
  #define blosc_internal_xgetbv _xgetbv
111
#elif defined(_M_IX86)
112
113
/*  Implement _xgetbv for VS2008 and VS2010 RTM with 32-bit (x86) targets. */
114
115
/*  Read the extended control register selected by `xcr` and return its
    64-bit contents.
    The three emitted bytes (0x0f 0x01 0xd0) encode the instruction directly
    rather than through a mnemonic.  The selector is loaded into ecx before
    the instruction, and the result is collected from eax (low half) and
    edx (high half) afterwards. */
static uint64_t blosc_internal_xgetbv(uint32_t xcr) {
116
    /* xcr0 receives eax, xcr1 receives edx. */
    uint32_t xcr0, xcr1;
117
    __asm {
118
        mov        ecx, xcr
119
        _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
120
        mov        xcr0, eax
121
        mov        xcr1, edx
122
    }
123
    /* Combine the halves: xcr1 forms the upper 32 bits, xcr0 the lower. */
    return ((uint64_t)xcr1 << 32) | xcr0;
124
}
125
126
#elif defined(_M_X64)
127
128
/*  Implement _xgetbv for VS2008 and VS2010 RTM with 64-bit (x64) targets.
129
    These compilers don't support any of the newer acceleration ISAs
130
    (e.g., AVX2) supported by blosc, and all x64 hardware supports SSE2
131
    which means we can get away with returning a hard-coded value from
132
    this implementation of _xgetbv. */
133
134
/*  Hard-coded stand-in for _xgetbv: reports only bit 1 for register index 0
    and no bits at all for any other register index. */
static __inline uint64_t blosc_internal_xgetbv(uint32_t xcr) {
    if (xcr != 0) {
        return 0UL;
    }
    /* A 64-bit OS must have XMM save support. */
    return (1UL << 1);
}
138
139
#else
140
141
/* Hardware detection for any other MSVC targets (e.g., ARM)
142
   isn't implemented at this time. */
143
#error This version of c-blosc only supports x86 and x64 targets with MSVC.
144
145
#endif /* _MSC_FULL_VER >= 160040219 */
146
147
#define blosc_internal_cpuid __cpuid
148
149
#else
150
151
/*  Implement the __cpuid and __cpuidex intrinsics for GCC, Clang,
152
    and others using inline assembly. */
153
/*  Execute the `cpuid` instruction with eax = function_id and
    ecx = subfunction_id, then store the resulting eax, ebx, ecx and edx
    into cpuInfo[0], cpuInfo[1], cpuInfo[2] and cpuInfo[3] respectively. */
__attribute__((always_inline))
static inline void
blosc_internal_cpuidex(int32_t cpuInfo[4], int32_t function_id, int32_t subfunction_id) {
  int32_t reg_a, reg_b, reg_c, reg_d;

#if defined(__i386__) && defined(__PIC__)
  /*  Can't clobber ebx with PIC running under 32-bit, so it is parked in edi
      around the instruction and the ebx result is handed back through edi.
      https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
  */
  __asm__ __volatile__ (
    "movl %%ebx, %%edi\n\t"
    "cpuid\n\t"
    "xchgl %%ebx, %%edi"
    : "=D" (reg_b), "=a" (reg_a), "=c" (reg_c), "=d" (reg_d)
    : "a" (function_id), "c" (subfunction_id)
    );
#else
  __asm__ __volatile__ (
    "cpuid"
    : "=b" (reg_b), "=a" (reg_a), "=c" (reg_c), "=d" (reg_d)
    : "a" (function_id), "c" (subfunction_id)
    );
#endif  /* defined(__i386__) && defined(__PIC__) */

  cpuInfo[0] = reg_a;
  cpuInfo[1] = reg_b;
  cpuInfo[2] = reg_c;
  cpuInfo[3] = reg_d;
}
175
176
3
#define blosc_internal_cpuid(cpuInfo, function_id) blosc_internal_cpuidex(cpuInfo, function_id, 0)
177
178
1
#define _XCR_XFEATURE_ENABLED_MASK 0
179
180
/* Reads the content of an extended control register.
181
   https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
182
*/
183
/*  Read the extended control register selected by `xcr` (passed in ecx) and
    return its contents, with edx as the upper and eax as the lower 32 bits.
    https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
*/
static inline uint64_t
blosc_internal_xgetbv(uint32_t xcr) {
  uint32_t low_half, high_half;
  uint64_t value;

  /*  The bytes below are the raw encoding of "xgetbv"; the mnemonic form is
      avoided because some older compilers have issues with it. */
  __asm__ __volatile__ (
    ".byte 0x0f, 0x01, 0xd0"
    : "=a" (low_half), "=d" (high_half)
    : "c" (xcr)
    );

  value = (uint64_t)high_half << 32;
  value |= low_half;
  return value;
}
198
199
#endif  /* defined(_MSC_VER) && !defined(__clang__) */
200
201
#ifndef _XCR_XFEATURE_ENABLED_MASK
202
#define _XCR_XFEATURE_ENABLED_MASK 0x0
203
#endif
204
205
1
static blosc_cpu_features blosc_get_cpu_features(void) {
206
1
  blosc_cpu_features result = BLOSC_HAVE_NOTHING;
207
1
  int32_t max_basic_function_id;
208
  /* Holds the values of eax, ebx, ecx, edx set by the `cpuid` instruction */
209
1
  int32_t cpu_info[4];
210
1
  int sse2_available;
211
1
  int sse3_available;
212
1
  int ssse3_available;
213
1
  int sse41_available;
214
1
  int sse42_available;
215
1
  int xsave_available;
216
1
  int xsave_enabled_by_os;
217
1
  int avx2_available = 0;
218
1
  int avx512bw_available = 0;
219
1
  int xmm_state_enabled = 0;
220
1
  int ymm_state_enabled = 0;
221
1
  int zmm_state_enabled = 0;
222
1
  uint64_t xcr0_contents;
223
1
  char* envvar;
224
225
  /* Get the number of basic functions available. */
226
1
  blosc_internal_cpuid(cpu_info, 0);
227
1
  max_basic_function_id = cpu_info[0];
228
229
  /* Check for SSE-based features and required OS support */
230
1
  blosc_internal_cpuid(cpu_info, 1);
231
1
  sse2_available = (cpu_info[3] & (1 << 26)) != 0;
232
1
  sse3_available = (cpu_info[2] & (1 << 0)) != 0;
233
1
  ssse3_available = (cpu_info[2] & (1 << 9)) != 0;
234
1
  sse41_available = (cpu_info[2] & (1 << 19)) != 0;
235
1
  sse42_available = (cpu_info[2] & (1 << 20)) != 0;
236
237
1
  xsave_available = (cpu_info[2] & (1 << 26)) != 0;
238
1
  xsave_enabled_by_os = (cpu_info[2] & (1 << 27)) != 0;
239
240
  /* Check for AVX-based features, if the processor supports extended features. */
241
1
  if (max_basic_function_id >= 7) {
242
1
    blosc_internal_cpuid(cpu_info, 7);
243
1
    avx2_available = (cpu_info[1] & (1 << 5)) != 0;
244
1
    avx512bw_available = (cpu_info[1] & (1 << 30)) != 0;
245
1
  }
246
247
  /*  Even if certain features are supported by the CPU, they may not be supported
248
      by the OS (in which case using them would crash the process or system).
249
      If xsave is available and enabled by the OS, check the contents of the
250
      extended control register XCR0 to see if the CPU features are enabled. */
251
1
#if defined(_XCR_XFEATURE_ENABLED_MASK)
252
1
  if (xsave_available && xsave_enabled_by_os && (
253
1
      sse2_available || sse3_available || ssse3_available
254
1
      || sse41_available || sse42_available
255
1
      || avx2_available || avx512bw_available)) {
256
    /* Determine which register states can be restored by the OS. */
257
1
    xcr0_contents = blosc_internal_xgetbv(_XCR_XFEATURE_ENABLED_MASK);
258
259
1
    xmm_state_enabled = (xcr0_contents & (1UL << 1)) != 0;
260
1
    ymm_state_enabled = (xcr0_contents & (1UL << 2)) != 0;
261
262
    /*  Require support for both the upper 256-bits of zmm0-zmm15 to be
263
        restored as well as all of zmm16-zmm31 and the opmask registers. */
264
1
    zmm_state_enabled = (xcr0_contents & 0x70) == 0x70;
265
1
  }
266
1
#endif /* defined(_XCR_XFEATURE_ENABLED_MASK) */
267
268
1
  envvar = getenv("BLOSC_PRINT_SHUFFLE_ACCEL");
269
1
  if (envvar != NULL) {
270
0
    printf("Shuffle CPU Information:\n");
271
0
    printf("SSE2 available: %s\n", sse2_available ? "True" : "False");
272
0
    printf("SSE3 available: %s\n", sse3_available ? "True" : "False");
273
0
    printf("SSSE3 available: %s\n", ssse3_available ? "True" : "False");
274
0
    printf("SSE4.1 available: %s\n", sse41_available ? "True" : "False");
275
0
    printf("SSE4.2 available: %s\n", sse42_available ? "True" : "False");
276
0
    printf("AVX2 available: %s\n", avx2_available ? "True" : "False");
277
0
    printf("AVX512BW available: %s\n", avx512bw_available ? "True" : "False");
278
0
    printf("XSAVE available: %s\n", xsave_available ? "True" : "False");
279
0
    printf("XSAVE enabled: %s\n", xsave_enabled_by_os ? "True" : "False");
280
0
    printf("XMM state enabled: %s\n", xmm_state_enabled ? "True" : "False");
281
0
    printf("YMM state enabled: %s\n", ymm_state_enabled ? "True" : "False");
282
0
    printf("ZMM state enabled: %s\n", zmm_state_enabled ? "True" : "False");
283
0
  }
284
285
  /* Using the gathered CPU information, determine which implementation to use. */
286
  /* technically could fail on sse2 cpu on os without xmm support, but that
287
   * shouldn't exist anymore */
288
1
  if (sse2_available) {
289
1
    result |= BLOSC_HAVE_SSE2;
290
1
  }
291
1
  if (xmm_state_enabled && ymm_state_enabled && avx2_available) {
292
1
    result |= BLOSC_HAVE_AVX2;
293
1
  }
294
1
  return result;
295
1
}
296
#endif
297
298
#else   /* No hardware acceleration supported for the target architecture. */
299
  #if defined(_MSC_VER)
300
  #pragma message("Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.")
301
  #else
302
  #warning Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.
303
  #endif
304
305
static blosc_cpu_features blosc_get_cpu_features(void) {
306
  return BLOSC_HAVE_NOTHING;
307
}
308
309
#endif
310
311
1
static shuffle_implementation_t get_shuffle_implementation(void) {
312
1
  blosc_cpu_features cpu_features = blosc_get_cpu_features();
313
1
  shuffle_implementation_t impl_generic;
314
315
1
#if defined(SHUFFLE_AVX2_ENABLED)
316
1
  if (cpu_features & BLOSC_HAVE_AVX2) {
317
1
    shuffle_implementation_t impl_avx2;
318
1
    impl_avx2.name = "avx2";
319
1
    impl_avx2.shuffle = (shuffle_func)blosc_internal_shuffle_avx2;
320
1
    impl_avx2.unshuffle = (unshuffle_func)blosc_internal_unshuffle_avx2;
321
1
    impl_avx2.bitshuffle = (bitshuffle_func)blosc_internal_bshuf_trans_bit_elem_avx2;
322
1
    impl_avx2.bitunshuffle = (bitunshuffle_func)blosc_internal_bshuf_untrans_bit_elem_avx2;
323
1
    return impl_avx2;
324
1
  }
325
0
#endif  /* defined(SHUFFLE_AVX2_ENABLED) */
326
327
0
#if defined(SHUFFLE_SSE2_ENABLED)
328
0
  if (cpu_features & BLOSC_HAVE_SSE2) {
329
0
    shuffle_implementation_t impl_sse2;
330
0
    impl_sse2.name = "sse2";
331
0
    impl_sse2.shuffle = (shuffle_func)blosc_internal_shuffle_sse2;
332
0
    impl_sse2.unshuffle = (unshuffle_func)blosc_internal_unshuffle_sse2;
333
0
    impl_sse2.bitshuffle = (bitshuffle_func)blosc_internal_bshuf_trans_bit_elem_sse2;
334
0
    impl_sse2.bitunshuffle = (bitunshuffle_func)blosc_internal_bshuf_untrans_bit_elem_sse2;
335
0
    return impl_sse2;
336
0
  }
337
0
#endif  /* defined(SHUFFLE_SSE2_ENABLED) */
338
339
  /*  Processor doesn't support any of the hardware-accelerated implementations,
340
      so use the generic implementation. */
341
0
  impl_generic.name = "generic";
342
0
  impl_generic.shuffle = (shuffle_func)blosc_internal_shuffle_generic;
343
0
  impl_generic.unshuffle = (unshuffle_func)blosc_internal_unshuffle_generic;
344
0
  impl_generic.bitshuffle = (bitshuffle_func)blosc_internal_bshuf_trans_bit_elem_scal;
345
0
  impl_generic.bitunshuffle = (bitunshuffle_func)blosc_internal_bshuf_untrans_bit_elem_scal;
346
0
  return impl_generic;
347
0
}
348
349
350
/*  Flag indicating whether the implementation has been initialized. */
351
static pthread_once_t implementation_initialized = PTHREAD_ONCE_INIT;
352
353
/*  The dynamically-chosen shuffle/unshuffle implementation.
354
    This is only safe to use once `implementation_initialized` is set. */
355
static shuffle_implementation_t host_implementation;
356
357
1
static void set_host_implementation(void) {
358
1
  host_implementation = get_shuffle_implementation();
359
1
}
360
361
/*  Initialize the shuffle implementation, if necessary. */
362
#if defined(__GNUC__) || defined(__clang__)
363
__attribute__((always_inline))
364
#endif
365
static
366
#if defined(_MSC_VER)
367
__forceinline
368
#else
369
BLOSC_INLINE
370
#endif
371
95.0k
void init_shuffle_implementation(void) {
372
95.0k
  pthread_once(&implementation_initialized, &set_host_implementation);
373
95.0k
}
374
375
/*  Shuffle a block by dynamically dispatching to the appropriate
376
    hardware-accelerated routine at run-time. */
377
void
378
blosc_internal_shuffle(const size_t bytesoftype, const size_t blocksize,
379
0
                       const uint8_t* _src, const uint8_t* _dest) {
380
  /* Initialize the shuffle implementation if necessary. */
381
0
  init_shuffle_implementation();
382
383
  /*  The implementation is initialized.
384
      Dispatch to it's shuffle routine. */
385
0
  (host_implementation.shuffle)(bytesoftype, blocksize, _src, _dest);
386
0
}
387
388
/*  Unshuffle a block by dynamically dispatching to the appropriate
389
    hardware-accelerated routine at run-time. */
390
void
391
blosc_internal_unshuffle(const size_t bytesoftype, const size_t blocksize,
392
0
                         const uint8_t* _src, const uint8_t* _dest) {
393
  /* Initialize the shuffle implementation if necessary. */
394
0
  init_shuffle_implementation();
395
396
  /*  The implementation is initialized.
397
      Dispatch to it's unshuffle routine. */
398
0
  (host_implementation.unshuffle)(bytesoftype, blocksize, _src, _dest);
399
0
}
400
401
/*  Bit-shuffle a block by dynamically dispatching to the appropriate
402
    hardware-accelerated routine at run-time. */
403
int
404
blosc_internal_bitshuffle(const size_t bytesoftype, const size_t blocksize,
405
                          const uint8_t* const _src, const uint8_t* _dest,
406
82.4k
                          const uint8_t* _tmp) {
407
82.4k
  int size = blocksize / bytesoftype;
408
  /* Initialize the shuffle implementation if necessary. */
409
82.4k
  init_shuffle_implementation();
410
411
82.4k
  if ((size % 8) == 0) {
412
    /* The number of elems is a multiple of 8 which is supported by
413
       bitshuffle. */
414
78.1k
    int ret = (int)(host_implementation.bitshuffle)((void *) _src, (void *) _dest,
415
78.1k
                                                    blocksize / bytesoftype,
416
78.1k
                                                    bytesoftype, (void *) _tmp);
417
    /* Copy the leftovers */
418
78.1k
    size_t offset = size * bytesoftype;
419
78.1k
    memcpy((void *) (_dest + offset), (void *) (_src + offset), blocksize - offset);
420
78.1k
    return ret;
421
78.1k
  }
422
4.29k
  else {
423
4.29k
    memcpy((void *) _dest, (void *) _src, blocksize);
424
4.29k
  }
425
4.29k
  return size;
426
82.4k
}
427
428
/*  Bit-unshuffle a block by dynamically dispatching to the appropriate
429
    hardware-accelerated routine at run-time. */
430
int
431
blosc_internal_bitunshuffle(const size_t bytesoftype, const size_t blocksize,
432
                            const uint8_t* const _src, const uint8_t* _dest,
433
12.6k
                            const uint8_t* _tmp) {
434
12.6k
  int size = blocksize / bytesoftype;
435
  /* Initialize the shuffle implementation if necessary. */
436
12.6k
  init_shuffle_implementation();
437
438
12.6k
  if ((size % 8) == 0) {
439
    /* The number of elems is a multiple of 8 which is supported by
440
       bitshuffle. */
441
12.1k
    int ret = (int) (host_implementation.bitunshuffle)((void *) _src, (void *) _dest,
442
12.1k
                                                       blocksize / bytesoftype,
443
12.1k
                                                       bytesoftype, (void *) _tmp);
444
    /* Copy the leftovers */
445
12.1k
    size_t offset = size * bytesoftype;
446
12.1k
    memcpy((void *) (_dest + offset), (void *) (_src + offset), blocksize - offset);
447
12.1k
    return ret;
448
12.1k
  }
449
536
  else {
450
536
    memcpy((void *) _dest, (void *) _src, blocksize);
451
536
  }
452
536
  return size;
453
12.6k
}