Line | Count | Source |
1 | | /* inflate_p.h -- Private inline functions and macros shared by more than one inflate implementation |
2 | | * |
3 | | */ |
4 | | |
5 | | #ifndef INFLATE_P_H |
6 | | #define INFLATE_P_H |
7 | | |
8 | | #include <stdlib.h> |
9 | | |
10 | | #include "zendian.h" |
11 | | #include "zmemory.h" |
12 | | #include "crc32_braid_tbl.h" |
13 | | |
/* Architecture-specific hooks.
 *
 * When S390_DFLTCC_INFLATE is defined the s390 header is pulled in and the window is padded and
 * aligned to 4096 bytes. Otherwise the window uses 64-byte padding and the hook macros below are
 * given their generic definitions (identity, constant 1, or an empty statement).
 */
#if defined(S390_DFLTCC_INFLATE)
#  include "arch/s390/dfltcc_inflate.h"
/* DFLTCC instructions require window to be page-aligned */
#  define PAD_WINDOW            PAD_4096
#  define WINDOW_PAD_SIZE       4096
#  define HINT_ALIGNED_WINDOW   HINT_ALIGNED_4096
#else
#  define PAD_WINDOW            PAD_64
#  define WINDOW_PAD_SIZE       64
#  define HINT_ALIGNED_WINDOW   HINT_ALIGNED_64

/* Window size adjustment for arch-specific inflate code; the generic version returns n unchanged. */
#  define INFLATE_ADJUST_WINDOW_SIZE(n) (n)

/* Runs at the end of inflateResetKeep(); a place to initialize arch-specific extension blocks. */
#  define INFLATE_RESET_KEEP_HOOK(strm) do {} while (0)

/* Runs at the beginning of inflatePrime(); a place to update arch-specific buffers. */
#  define INFLATE_PRIME_HOOK(strm, bits, value) do {} while (0)

/* Runs at the beginning of each block; a place to plug in arch-specific inflation code. */
#  define INFLATE_TYPEDO_HOOK(strm, flush) do {} while (0)

/* Non-zero when zlib-ng itself must compute the checksum (0 if arch-specific code already does). */
#  define INFLATE_NEED_CHECKSUM(strm) 1

/* Non-zero when zlib-ng itself must update the window (0 if arch-specific code already does). */
#  define INFLATE_NEED_UPDATEWINDOW(strm) 1

/* Runs at the beginning of inflateMark(); a place to update arch-specific pointers and offsets. */
#  define INFLATE_MARK_HOOK(strm) do {} while (0)

/* Runs at the beginning of inflateSyncPoint(); a place for arch-specific state checks. */
#  define INFLATE_SYNC_POINT_HOOK(strm) do {} while (0)

/* Runs at the beginning of inflateSetDictionary(); a place to check arch-specific window data. */
#  define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) do {} while (0)

/* Runs at the beginning of inflateGetDictionary(); a place to adjust arch-specific window data. */
#  define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) do {} while (0)
#endif
46 | | |
47 | | /* |
48 | | * Macros shared by inflate() and inflateBack() |
49 | | */ |
50 | | |
51 | | /* check macros for header crc */ |
52 | | #ifdef GUNZIP |
/* Fold the low 8 bits of b into the running CRC register c (c is updated in place).
 * c is the raw, already-inverted register value, not the finished checksum. */
# define CRC_DO1_B(c, b) ((c) = crc_table[((c) ^ (b)) & 0xff] ^ ((c) >> 8))

/* Update the checksum lvalue `check` with the two low-order bytes of `word`, lowest byte first.
 * `word` is evaluated exactly once. The helper locals carry a macro-specific suffix so that a
 * caller variable named `crc` can safely be passed as `check`. */
# define CRC2(check, word) \
    do { \
        const uint32_t crc2_word_ = (uint32_t)(word); \
        uint32_t crc2_acc_ = ~(uint32_t)(check); \
        CRC_DO1_B(crc2_acc_, crc2_word_); \
        CRC_DO1_B(crc2_acc_, crc2_word_ >> 8); \
        (check) = ~crc2_acc_; \
    } while (0)

/* Update the checksum lvalue `check` with the four low-order bytes of `word`, lowest byte first.
 * Same evaluation and naming guarantees as CRC2. */
# define CRC4(check, word) \
    do { \
        const uint32_t crc4_word_ = (uint32_t)(word); \
        uint32_t crc4_acc_ = ~(uint32_t)(check); \
        CRC_DO1_B(crc4_acc_, crc4_word_); \
        CRC_DO1_B(crc4_acc_, crc4_word_ >> 8); \
        CRC_DO1_B(crc4_acc_, crc4_word_ >> 16); \
        CRC_DO1_B(crc4_acc_, crc4_word_ >> 24); \
        (check) = ~crc4_acc_; \
    } while (0)
72 | | #endif |
73 | | |
/* Type of the local bit counter: one byte on x86 (a compiler optimization aid), unsigned elsewhere. */
#if defined(ARCH_X86)
typedef uint8_t bits_t;
#else
typedef unsigned bits_t;
#endif
80 | | |
/* Copy the stream pointers/counters and the saved bit accumulator into the local working
 * variables (next, have, put, left, hold, bits) that inflate() operates on, for speed. */
#define LOAD() \
    do { \
        next = strm->next_in; \
        have = strm->avail_in; \
        put = strm->next_out; \
        left = strm->avail_out; \
        hold = state->hold; \
        bits = (bits_t)state->bits; \
    } while (0)
91 | | |
/* Write the local working variables of inflate() back into the stream and the decoder state
 * (the inverse of LOAD). */
#define RESTORE() \
    do { \
        strm->next_in = (z_const unsigned char *)next; \
        strm->avail_in = have; \
        strm->next_out = put; \
        strm->avail_out = left; \
        state->hold = hold; \
        state->bits = bits; \
    } while (0)
102 | | |
/* Top up the bit accumulator so that it holds at least 56 bits: OR the next 64 input bits in
 * above the `bits` already held, advance `in` by (63 ^ bits) >> 3 bytes, and raise the bit
 * count by setting its 56 mask. Operates on the caller's locals hold, bits and in. */
#define REFILL() \
    do { \
        hold |= load_64_bits(in, bits); \
        in += (63 ^ bits) >> 3; \
        bits |= 56; \
    } while (0)
109 | | |
/* Empty the input bit accumulator: no bits held, bit count zero. */
#define INITBITS() \
    do { \
        bits = 0; \
        hold = 0; \
    } while (0)
116 | | |
/* Ensure that there are at least n bits in the bit accumulator, pulling input one byte at a
   time with PULLBYTE(). If there is not enough available input to do that, then return from
   inflate()/inflateBack(). */
#define NEEDBITS(n) \
    do { \
        const bits_t needbits_want_ = (bits_t)(unsigned)(n); \
        while (bits < needbits_want_) { \
            PULLBYTE(); \
        } \
    } while (0)
125 | | |
/* Value of the n lowest bits of the bit accumulator; the accumulator itself is untouched (n < 16). */
#define BITS(n) \
    (hold & ((1U << (unsigned)(n)) - 1U))
129 | | |
/* Discard the n lowest bits of the bit accumulator and lower the bit count accordingly. */
#define DROPBITS(n) \
    do { \
        const unsigned dropbits_n_ = (unsigned)(n); \
        hold >>= dropbits_n_; \
        bits -= (bits_t)dropbits_n_; \
    } while (0)
137 | | |
/* Discard the zero to seven bits needed to bring the bit accumulator to a byte boundary. */
#define BYTEBITS() \
    do { \
        const unsigned bytebits_partial_ = bits & 7; \
        hold >>= bytebits_partial_; \
        bits -= bytebits_partial_; \
    } while (0)
144 | | |
/* Put the decoder into the BAD mode and record errmsg as the stream's error message.
 * errmsg is parenthesized so that the (char *) cast applies to the whole argument expression,
 * not just to its first operand. */
#define SET_BAD(errmsg) \
    do { \
        state->mode = BAD; \
        strm->msg = (char *)(errmsg); \
    } while (0)
151 | | |
152 | | /* Huffman code table entry format for length/distance codes (op & 16 set): |
153 | | * bits = code_bits + extra_bits (combined for single-shift decode) |
154 | | * op = 16 | code_bits |
155 | | * val = base value |
156 | | * |
157 | | * For literals (op == 0): bits = code_bits, val = literal byte |
158 | | */ |
159 | | |
/* Code length in bits of a Huffman table entry: the low nibble of op for length/distance
 * entries (op & 16 set), otherwise the bits field. `here` may be any struct-valued expression. */
#define CODE_BITS(here) \
    ((unsigned)(((here).op & 16) ? ((here).op & 15) : (here).bits))

/* Number of extra bits of a length/distance entry (bits minus code length); 0 for other entries. */
#define CODE_EXTRA(here) \
    ((unsigned)(((here).op & 16) ? ((here).bits - ((here).op & 15)) : 0))

/* Extra-bits value taken from a saved bit accumulator: keep the low here.bits bits of `old`,
 * then shift out the (op & MAX_BITS) code bits. All arguments are parenthesized so compound
 * expressions such as `a | b` are handled as a unit. */
#define EXTRA_BITS(old, here, op) \
    (((old) & (((uint64_t)1 << (here).bits) - 1)) >> ((op) & MAX_BITS))
171 | | |
/* Combined op field: for a length/distance entry (extra has bit 16 set) the result is
 * code_bits with bit 16 set; for any other entry extra is passed through unchanged. */
#define COMBINE_OP(extra, code_bits) \
    ((unsigned char)(((extra) & 16) ? ((code_bits) | 16) : (extra)))

/* Combined bits field: code_bits plus the extra-bit count stored in the low nibble of extra. */
#define COMBINE_BITS(code_bits, extra) \
    ((unsigned char)((code_bits) + ((extra) & 15)))
179 | | |
/* Trace macros for debugging. Each one forwards a message to Tracevv() on stderr.
 * Arguments are parenthesized so that compound expressions are compared and printed as a unit. */

/* Report a literal: as a quoted character when it lies in 0x20..0x7e, otherwise as a hex byte. */
#define TRACE_LITERAL(val) \
    Tracevv((stderr, ((val) >= 0x20 && (val) < 0x7f) ? \
        "inflate:         literal '%c'\n" : \
        "inflate:         literal 0x%02x\n", (val)))

/* Report a match length. */
#define TRACE_LENGTH(len) \
    Tracevv((stderr, "inflate:         length %u\n", (len)))

/* Report a match distance. */
#define TRACE_DISTANCE(dist) \
    Tracevv((stderr, "inflate:         distance %u\n", (dist)))

/* Report the end-of-block code. */
#define TRACE_END_OF_BLOCK() \
    Tracevv((stderr, "inflate:         end of block\n"))
194 | | |
/* Thresholds for the fast inflate path. */
/* max input bits per length/distance pair */
#define INFLATE_FAST_MIN_HAVE 15
/* max output per token (258) + 2 */
#define INFLATE_FAST_MIN_LEFT 260
/* max unchecked literal writes per iteration */
#define INFLATE_FAST_MIN_SAFE 3
198 | | |
/* Load 64 bits from `in`, pass them through Z_U64_FROM_LE, and return the value shifted left
 * by `bits` so the loaded bytes sit at bit offset `bits` in the result.
 * The caller must keep bits below 64; a larger shift count is undefined. */
static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) {
    const uint64_t raw = zng_memread_8(in);
    const uint64_t value = Z_U64_FROM_LE(raw);

    return value << bits;
}
204 | | |
/* Behave like chunkcopy, but never write at or beyond `safe`.
 *
 * Copies up to `len` bytes from `from` to `out`, clamping `len` to the space between `out` and
 * `safe`. The regions may overlap; in that case the copy proceeds front to back in pieces no
 * larger than the distance between the two pointers, so bytes written earlier in the call can
 * be read again later in the same call.
 *
 * Returns the position just past the last byte written.
 */
static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, size_t len, uint8_t *safe) {
    /* Clamp the request to the room left before `safe`. */
    size_t room = safe - out;
    if (len > room) {
        len = room;
    }

    int32_t src_inside_dst = from >= out && from < out + len;
    int32_t dst_inside_src = out >= from && out < from + len;

    /* Disjoint regions: a single memcpy is ideal. */
    if (!src_inside_dst && !dst_inside_src) {
        memcpy(out, from, len);
        return out + len;
    }

    /* Complete overlap: source == destination, nothing to move. */
    if (from == out) {
        return out + len;
    }

    /* We are emulating a self-modifying copy loop here. To do that without undefined behavior,
     * every individual memcpy must cover regions that do not overlap. The distance between the
     * two pointers is the largest such size, so it acts as a safe look-behind / look-ahead
     * distance for each pass of the loop below. */
    size_t gap = (from > out) ? (size_t)(from - out) : (size_t)(out - from);

    /* So this doesn't give us a worst case scenario of function calls in a loop, each pass is
     * broken down into copy blocks of fixed lengths.
     *
     * TODO: The memcpy calls aren't inlined on architectures with strict memory alignment
     */
    while (len != 0) {
        size_t run = (gap > len) ? len : gap;
        len -= run;

        for (; run >= 16; run -= 16) {
            memcpy(out, from, 16);
            out += 16;
            from += 16;
        }

        if (run >= 8) {
            memcpy(out, from, 8);
            out += 8;
            from += 8;
            run -= 8;
        }

        if (run >= 4) {
            memcpy(out, from, 4);
            out += 4;
            from += 4;
            run -= 4;
        }

        for (; run != 0; run--) {
            *out++ = *from++;
        }
    }

    return out;
}
268 | | |
269 | | #endif |