/src/c-blosc2/blosc/shuffle-generic.h
Line | Count | Source |
1 | | /********************************************************************* |
2 | | Blosc - Blocked Shuffling and Compression Library |
3 | | |
4 | | Copyright (c) 2021 Blosc Development Team <blosc@blosc.org> |
5 | | https://blosc.org |
6 | | License: BSD 3-Clause (see LICENSE.txt) |
7 | | |
8 | | See LICENSE.txt for details about copyright and rights to use. |
9 | | **********************************************************************/ |
10 | | |
11 | | /********************************************************************* |
12 | | Generic (non-hardware-accelerated) shuffle/unshuffle routines. |
13 | | These are used when hardware-accelerated functions aren't available |
14 | | for a particular platform; they are also used by the hardware- |
15 | | accelerated functions to handle any remaining elements in a block |
16 | | which isn't a multiple of the hardware's vector size. |
17 | | **********************************************************************/ |
18 | | |
19 | | #ifndef BLOSC_SHUFFLE_GENERIC_H |
20 | | #define BLOSC_SHUFFLE_GENERIC_H |
21 | | |
22 | | #include "blosc2/blosc2-common.h" |
23 | | |
24 | | #include <stdint.h> |
25 | | #include <string.h> |
26 | | |
/**
  Generic (non-hardware-accelerated) shuffle routine.
  This is the pure element-copying nested loop. It is used by the
  generic shuffle implementation and also by the vectorized shuffle
  implementations to process any remaining elements in a block which
  is not a multiple of (type_size * vector_size).

  With nelems = blocksize / type_size, every element index i in
  [vectorizable_blocksize / type_size, nelems) and every byte position j
  in [0, type_size) is copied as

      _dest[j * nelems + i] = _src[i * type_size + j]

  Elements below the starting index are not read or written here. The
  trailing (blocksize % type_size) bytes are copied verbatim.

  type_size               Size of one element, in bytes.
  vectorizable_blocksize  Number of leading bytes this routine skips.
  blocksize               Total number of bytes in the block.
  _src                    Input block; at least blocksize bytes are read.
  _dest                   Output block; at least blocksize bytes are written.
                          Must not overlap _src (memcpy is used).

  Degenerate arguments: a non-positive blocksize is a no-op, and a
  non-positive type_size copies the whole block verbatim.
*/
static inline void shuffle_generic_inline(const int32_t type_size,
    const int32_t vectorizable_blocksize, const int32_t blocksize,
    const uint8_t *_src, uint8_t *_dest) {
  /* An empty block needs no work. Rejecting negative sizes here also keeps a
     negative remainder from being converted to a huge size_t for memcpy. */
  if (blocksize <= 0) {
    return;
  }

  /* Dividing by a zero element size is undefined behaviour. Without a valid
     element size there is nothing to transpose, so treat every byte as
     leftover, exactly like the type_size > blocksize case does. */
  if (type_size <= 0) {
    memcpy(_dest, _src, (size_t)blocksize);
    return;
  }

  /* Number of whole elements, leftover bytes, and first element to process. */
  const int32_t nelems = blocksize / type_size;
  const int32_t leftover = blocksize % type_size;
  const int32_t first = vectorizable_blocksize / type_size;

  /* Non-optimized shuffle: gather byte j of every element into plane j. */
  for (int32_t j = 0; j < type_size; j++) {
    for (int32_t i = first; i < nelems; i++) {
      _dest[j * nelems + i] = _src[i * type_size + j];
    }
  }

  /* Copy any leftover bytes in the block without shuffling them. */
  if (leftover > 0) {
    const int32_t shuffled_bytes = blocksize - leftover;
    memcpy(_dest + shuffled_bytes, _src + shuffled_bytes, (size_t)leftover);
  }
}
54 | | |
/**
  Generic (non-hardware-accelerated) unshuffle routine.
  This is the pure element-copying nested loop. It is used by the
  generic unshuffle implementation and also by the vectorized unshuffle
  implementations to process any remaining elements in a block which
  is not a multiple of (type_size * vector_size).

  With nelems = blocksize / type_size, every element index i in
  [vectorizable_blocksize / type_size, nelems) and every byte position j
  in [0, type_size) is copied as

      _dest[i * type_size + j] = _src[j * nelems + i]

  Elements below the starting index are not written here. The trailing
  (blocksize % type_size) bytes are copied verbatim.

  type_size               Size of one element, in bytes.
  vectorizable_blocksize  Number of leading output bytes this routine skips.
  blocksize               Total number of bytes in the block.
  _src                    Shuffled input block; at least blocksize bytes.
  _dest                   Output block; at least blocksize bytes.
                          Must not overlap _src (memcpy is used).

  Degenerate arguments: a non-positive blocksize is a no-op, and a
  non-positive type_size copies the whole block verbatim.
*/
static inline void unshuffle_generic_inline(const int32_t type_size,
    const int32_t vectorizable_blocksize, const int32_t blocksize,
    const uint8_t *_src, uint8_t *_dest) {
  /* An empty block needs no work. Rejecting negative sizes here also keeps a
     negative remainder from being converted to a huge size_t for memcpy. */
  if (blocksize <= 0) {
    return;
  }

  /* Dividing by a zero element size is undefined behaviour. Without a valid
     element size there is nothing to transpose, so treat every byte as
     leftover, exactly like the type_size > blocksize case does. */
  if (type_size <= 0) {
    memcpy(_dest, _src, (size_t)blocksize);
    return;
  }

  /* Number of whole elements, leftover bytes, and first element to process. */
  const int32_t nelems = blocksize / type_size;
  const int32_t leftover = blocksize % type_size;
  const int32_t first = vectorizable_blocksize / type_size;

  /* Non-optimized unshuffle: rebuild each element from its byte planes. */
  for (int32_t i = first; i < nelems; i++) {
    for (int32_t j = 0; j < type_size; j++) {
      _dest[i * type_size + j] = _src[j * nelems + i];
    }
  }

  /* Copy any leftover bytes in the block without unshuffling them. */
  if (leftover > 0) {
    const int32_t shuffled_bytes = blocksize - leftover;
    memcpy(_dest + shuffled_bytes, _src + shuffled_bytes, (size_t)leftover);
  }
}
|
82 | | |
83 | | /** |
84 | | Generic (non-hardware-accelerated) shuffle routine. |
85 | | */ |
86 | | BLOSC_NO_EXPORT void shuffle_generic(const int32_t bytesoftype, const int32_t blocksize, |
87 | | const uint8_t *_src, uint8_t *_dest); |
88 | | |
89 | | /** |
90 | | Generic (non-hardware-accelerated) unshuffle routine. |
91 | | */ |
92 | | BLOSC_NO_EXPORT void unshuffle_generic(const int32_t bytesoftype, const int32_t blocksize, |
93 | | const uint8_t *_src, uint8_t *_dest); |
94 | | |
95 | | #endif /* BLOSC_SHUFFLE_GENERIC_H */ |