/src/zlib-ng/arch/x86/chunkset_avx512.c
Line  | Count  | Source  | 
1  |  | /* chunkset_avx512.c -- AVX512 inline functions to copy small data chunks.  | 
2  |  |  * For conditions of distribution and use, see copyright notice in zlib.h  | 
3  |  |  */  | 
4  |  | #include "zbuild.h"  | 
5  |  | #include "zmemory.h"  | 
6  |  |  | 
7  |  | #ifdef X86_AVX512  | 
8  |  |  | 
9  |  | #include "arch/generic/chunk_256bit_perm_idx_lut.h"  | 
10  |  | #include <immintrin.h>  | 
11  |  | #include "x86_intrins.h"  | 
12  |  |  | 
13  |  | typedef __m256i chunk_t;  | 
14  |  | typedef __m128i halfchunk_t;  | 
15  |  | typedef __mmask32 mask_t;  | 
16  |  | typedef __mmask16 halfmask_t;  | 
17  |  |  | 
18  |  | #define HAVE_CHUNKMEMSET_2  | 
19  |  | #define HAVE_CHUNKMEMSET_4  | 
20  |  | #define HAVE_CHUNKMEMSET_8  | 
21  |  | #define HAVE_CHUNKMEMSET_16  | 
22  |  | #define HAVE_CHUNK_MAG  | 
23  |  | #define HAVE_HALF_CHUNK  | 
24  |  | #define HAVE_MASKED_READWRITE  | 
25  |  | #define HAVE_CHUNKCOPY  | 
26  |  | #define HAVE_HALFCHUNKCOPY  | 
27  |  |  | 
28  | 0  | static inline halfmask_t gen_half_mask(unsigned len) { | 
29  | 0  |    return (halfmask_t)_bzhi_u32(0xFFFF, len);  | 
30  | 0  | }  | 
31  |  |  | 
32  | 0  | static inline mask_t gen_mask(unsigned len) { | 
33  | 0  |    return (mask_t)_bzhi_u32(0xFFFFFFFF, len);  | 
34  | 0  | }  | 
35  |  |  | 
36  | 0  | static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { | 
37  | 0  |     *chunk = _mm256_set1_epi16(zng_memread_2(from));  | 
38  | 0  | }  | 
39  |  |  | 
40  | 0  | static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { | 
41  | 0  |     *chunk = _mm256_set1_epi32(zng_memread_4(from));  | 
42  | 0  | }  | 
43  |  |  | 
44  | 0  | static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { | 
45  | 0  |     *chunk = _mm256_set1_epi64x(zng_memread_8(from));  | 
46  | 0  | }  | 
47  |  |  | 
48  | 0  | static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) { | 
49  |  |     /* Unfortunately there seems to be a compiler bug in Visual Studio 2015 where  | 
50  |  |      * the load is dumped to the stack with an aligned move for this memory-register  | 
51  |  |      * broadcast. The vbroadcasti128 instruction is 2 fewer cycles and this dump to  | 
52  |  |      * stack doesn't exist if compiled with optimizations. For the sake of working  | 
53  |  |      * properly in a debugger, let's take the 2 cycle penalty */  | 
54  |  | #if defined(_MSC_VER) && _MSC_VER <= 1900  | 
55  |  |     halfchunk_t half = _mm_loadu_si128((__m128i*)from);  | 
56  |  |     *chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1);  | 
57  |  | #else  | 
58  | 0  |     *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from));  | 
59  | 0  | #endif  | 
60  | 0  | }  | 
61  |  |  | 
62  | 0  | static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { | 
63  | 0  |     *chunk = _mm256_loadu_si256((__m256i *)s);  | 
64  | 0  | }  | 
65  |  |  | 
66  | 0  | static inline void storechunk(uint8_t *out, chunk_t *chunk) { | 
67  | 0  |     _mm256_storeu_si256((__m256i *)out, *chunk);  | 
68  | 0  | }  | 
69  |  |  | 
70  | 0  | static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { | 
71  | 0  |     Assert(len > 0, "chunkcopy should never have a length 0");  | 
72  |  |  | 
73  | 0  |     chunk_t chunk;  | 
74  | 0  |     uint32_t rem = len % sizeof(chunk_t);  | 
75  |  |  | 
76  | 0  |     if (len < sizeof(chunk_t)) { | 
77  | 0  |         mask_t rem_mask = gen_mask(rem);  | 
78  | 0  |         chunk = _mm256_maskz_loadu_epi8(rem_mask, from);  | 
79  | 0  |         _mm256_mask_storeu_epi8(out, rem_mask, chunk);  | 
80  | 0  |         return out + rem;  | 
81  | 0  |     }  | 
82  |  |  | 
83  | 0  |     loadchunk(from, &chunk);  | 
84  | 0  |     rem = (rem == 0) ? sizeof(chunk_t) : rem;  | 
85  | 0  |     storechunk(out, &chunk);  | 
86  | 0  |     out += rem;  | 
87  | 0  |     from += rem;  | 
88  | 0  |     len -= rem;  | 
89  |  |  | 
90  | 0  |     while (len > 0) { | 
91  | 0  |         loadchunk(from, &chunk);  | 
92  | 0  |         storechunk(out, &chunk);  | 
93  | 0  |         out += sizeof(chunk_t);  | 
94  | 0  |         from += sizeof(chunk_t);  | 
95  | 0  |         len -= sizeof(chunk_t);  | 
96  | 0  |     }  | 
97  |  |  | 
98  | 0  |     return out;  | 
99  | 0  | }  | 
100  |  |  | 
101  |  | /* MSVC miscompiles this function when optimizing for size, which breaks decompression */  | 
102  |  | #if defined(_MSC_VER) && _MSC_VER < 1943  | 
103  |  | #  pragma optimize("", off) | 
104  |  | #endif  | 
105  | 0  | static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) { | 
106  | 0  |     lut_rem_pair lut_rem = perm_idx_lut[dist - 3];  | 
107  | 0  |     __m256i ret_vec;  | 
108  | 0  |     *chunk_rem = lut_rem.remval;  | 
109  |  |  | 
110  |  |     /* See the AVX2 implementation for more detailed comments. This is the same approach,  | 
111  |  |      * plus masked loads to avoid an out-of-bounds read on the heap */  | 
112  |  |  | 
113  | 0  |     if (dist < 16) { | 
114  | 0  |         const __m256i permute_xform =  | 
115  | 0  |             _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
116  | 0  |                              16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);  | 
117  | 0  |         __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));  | 
118  | 0  |         halfmask_t load_mask = gen_half_mask(dist);  | 
119  | 0  |         __m128i ret_vec0 = _mm_maskz_loadu_epi8(load_mask, buf);  | 
120  | 0  |         perm_vec = _mm256_add_epi8(perm_vec, permute_xform);  | 
121  | 0  |         ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);  | 
122  | 0  |         ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);  | 
123  | 0  |     }  else { | 
124  | 0  |         halfmask_t load_mask = gen_half_mask(dist - 16);  | 
125  | 0  |         __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);  | 
126  | 0  |         __m128i ret_vec1 = _mm_maskz_loadu_epi8(load_mask, (__m128i*)(buf + 16));  | 
127  | 0  |         __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));  | 
128  | 0  |         halfmask_t xlane_mask = _mm_cmp_epi8_mask(perm_vec1, _mm_set1_epi8(15), _MM_CMPINT_LE);  | 
129  | 0  |         __m128i latter_half = _mm_mask_shuffle_epi8(ret_vec1, xlane_mask, ret_vec0, perm_vec1);  | 
130  | 0  |         ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);  | 
131  | 0  |     }  | 
132  |  |  | 
133  | 0  |     return ret_vec;  | 
134  | 0  | }  | 
135  |  | #if defined(_MSC_VER) && _MSC_VER < 1943  | 
136  |  | #  pragma optimize("", on) | 
137  |  | #endif  | 
138  |  |  | 
139  | 0  | static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) { | 
140  | 0  |     _mm_storeu_si128((__m128i *)out, *chunk);  | 
141  | 0  | }  | 
142  |  |  | 
143  | 0  | static inline chunk_t halfchunk2whole(halfchunk_t *chunk) { | 
144  |  |     /* We zero extend mostly to appease some memory sanitizers. The upper bytes are  | 
145  |  |      * unlikely to ever actually be written or read */  | 
146  | 0  |     return _mm256_zextsi128_si256(*chunk);  | 
147  | 0  | }  | 
148  |  |  | 
149  | 0  | static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) { | 
150  | 0  |     lut_rem_pair lut_rem = perm_idx_lut[dist - 3];  | 
151  | 0  |     __m128i perm_vec, ret_vec;  | 
152  | 0  |     halfmask_t load_mask = gen_half_mask(dist);  | 
153  | 0  |     ret_vec = _mm_maskz_loadu_epi8(load_mask, buf);  | 
154  | 0  |     *chunk_rem = half_rem_vals[dist - 3];  | 
155  |  |  | 
156  | 0  |     perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));  | 
157  | 0  |     ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);  | 
158  |  |  | 
159  | 0  |     return ret_vec;  | 
160  | 0  | }  | 
161  |  |  | 
162  | 0  | static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { | 
163  | 0  |     Assert(len > 0, "chunkcopy should never have a length 0");  | 
164  | 0  |     halfchunk_t chunk;  | 
165  |  |  | 
166  | 0  |     uint32_t rem = len % sizeof(halfchunk_t);  | 
167  | 0  |     if (rem == 0) { | 
168  | 0  |         rem = sizeof(halfchunk_t);  | 
169  | 0  |     }  | 
170  |  |  | 
171  | 0  |     halfmask_t rem_mask = gen_half_mask(rem);  | 
172  | 0  |     chunk = _mm_maskz_loadu_epi8(rem_mask, from);  | 
173  | 0  |     _mm_mask_storeu_epi8(out, rem_mask, chunk);  | 
174  |  |  | 
175  | 0  |     return out + rem;  | 
176  | 0  | }  | 
177  |  |  | 
178  | 0  | #define CHUNKSIZE        chunksize_avx512  | 
179  | 0  | #define CHUNKUNROLL      chunkunroll_avx512  | 
180  | 0  | #define CHUNKMEMSET      chunkmemset_avx512  | 
181  |  | #define CHUNKMEMSET_SAFE chunkmemset_safe_avx512  | 
182  |  |  | 
183  |  | #include "chunkset_tpl.h"  | 
184  |  |  | 
185  |  | #define INFLATE_FAST     inflate_fast_avx512  | 
186  |  |  | 
187  |  | #include "inffast_tpl.h"  | 
188  |  |  | 
189  |  | #endif  |
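
A minimal scalar sketch of the masked tail handling used by gen_mask/gen_half_mask and the short-length path in CHUNKCOPY above. The helper name short_copy_scalar is hypothetical and the loop only illustrates the lane-mask semantics, not the actual SIMD implementation: _bzhi_u32(0xFFFFFFFF, len) keeps the low len bits, and the masked load/store touch only the enabled byte lanes, so no bytes beyond len are read or written.

#include <stdint.h>

/* Hypothetical scalar equivalent of the len < sizeof(chunk_t) branch in CHUNKCOPY:
 * build a mask with the low `len` bits set and copy only the enabled byte lanes. */
static uint8_t *short_copy_scalar(uint8_t *out, const uint8_t *from, unsigned len) {
    /* Equivalent of _bzhi_u32(0xFFFFFFFF, len) for len <= 32 */
    uint32_t mask = (len >= 32) ? 0xFFFFFFFFu : ((1u << len) - 1u);

    for (unsigned i = 0; i < 32; i++) {
        if (mask & (1u << i))      /* lane i enabled */
            out[i] = from[i];      /* bytes past `len` are never touched */
    }
    return out + len;
}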
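Likewise, a scalar sketch of what GET_CHUNK_MAG computes for distances in [3, sizeof(chunk_t)): the returned chunk is the dist-byte pattern at buf repeated across all 32 bytes, and *chunk_rem records the size of the final partial repetition. The assumption that the LUT's remval equals sizeof(chunk_t) % dist follows a reading of the shared chunkset template; treat it, and the helper name chunk_mag_scalar, as illustrative only.

#include <stdint.h>

/* Hypothetical scalar model of GET_CHUNK_MAG: replicate a pattern of period
 * `dist` (3 <= dist < 32) across a 32-byte chunk. */
static void chunk_mag_scalar(const uint8_t *buf, uint8_t chunk[32],
                             uint32_t *chunk_rem, uint32_t dist) {
    for (uint32_t i = 0; i < 32; i++)
        chunk[i] = buf[i % dist];   /* repeat the dist-byte pattern */
    *chunk_rem = 32 % dist;         /* bytes of the last, partial repetition
                                       (assumed to match lut_rem.remval) */
}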