/src/c-blosc/blosc/blosclz.c
Line | Count | Source |
1 | | /********************************************************************* |
2 | | Blosc - Blocked Shuffling and Compression Library |
3 | | |
4 | | Copyright (c) 2021 The Blosc Developers <blosc@blosc.org> |
5 | | https://blosc.org |
6 | | License: BSD 3-Clause (see LICENSE.txt) |
7 | | |
8 | | See LICENSE.txt for details about copyright and rights to use. |
9 | | **********************************************************************/ |
10 | | |
11 | | /********************************************************************* |
12 | | The code in this file is heavily based on FastLZ, a lightning-fast |
13 | | lossless compression library. See LICENSES/FASTLZ.txt for details. |
14 | | **********************************************************************/ |
15 | | |
16 | | |
17 | | #include <stdio.h> |
18 | | #include <stdbool.h> |
19 | | #include "blosclz.h" |
20 | | #include "fastcopy.h" |
21 | | #include "blosc-common.h" |
22 | | |
23 | | |
24 | | /* |
25 | | * Give hints to the compiler for branch prediction optimization. |
26 | | */ |
27 | | #if defined(__GNUC__) && (__GNUC__ > 2) |
28 | 18.5M | #define BLOSCLZ_LIKELY(c) (__builtin_expect((c), 1)) |
29 | 50.0M | #define BLOSCLZ_UNLIKELY(c) (__builtin_expect((c), 0)) |
30 | | #else |
31 | | #define BLOSCLZ_LIKELY(c) (c) |
32 | | #define BLOSCLZ_UNLIKELY(c) (c) |
33 | | #endif |
34 | | |
35 | | /* |
36 | | * Use inlined functions for supported systems. |
37 | | */ |
38 | | #if defined(_MSC_VER) && !defined(__cplusplus) /* Visual Studio */ |
39 | | #define inline __inline /* Visual C is not C99, but supports some kind of inline */ |
40 | | #endif |
41 | | |
42 | 923k | #define MAX_COPY 32U |
43 | 19.6M | #define MAX_DISTANCE 8191 |
44 | 18.5M | #define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1) |
45 | | |
46 | | #ifdef BLOSC_STRICT_ALIGN |
47 | | #define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8) |
48 | | #define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24) |
49 | | #else |
50 | | #define BLOSCLZ_READU16(p) *((const uint16_t*)(p)) |
51 | 26.2M | #define BLOSCLZ_READU32(p) *((const uint32_t*)(p)) |
52 | | #endif |
53 | | |
54 | 50.3k | #define HASH_LOG (14U) |
55 | 6.24k | #define HASH_LOG2 (12U) |
56 | | |
57 | | // This is used in LZ4 and seems to work pretty well here too |
58 | 19.6M | #define HASH_FUNCTION(v, s, h) { \ |
59 | 19.6M | (v) = ((s) * 2654435761U) >> (32U - (h)); \ |
60 | 19.6M | } |
61 | | |
62 | | |
63 | | #if defined(__AVX2__) |
64 | | static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) { |
65 | | uint8_t x = ip[-1]; |
66 | | |
67 | | while (ip < (ip_bound - (sizeof(__m256i)))) { |
68 | | __m256i value, value2, cmp; |
69 | | /* Broadcast the value for every byte in a 256-bit register */ |
70 | | memset(&value, x, sizeof(__m256i)); |
71 | | value2 = _mm256_loadu_si256((__m256i *)ref); |
72 | | cmp = _mm256_cmpeq_epi64(value, value2); |
73 | | if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) { |
74 | | /* Return the byte that starts to differ */ |
75 | | while (*ref++ == x) ip++; |
76 | | return ip; |
77 | | } |
78 | | else { |
79 | | ip += sizeof(__m256i); |
80 | | ref += sizeof(__m256i); |
81 | | } |
82 | | } |
83 | | /* Look into the remainder */ |
84 | | while ((ip < ip_bound) && (*ref++ == x)) ip++; |
85 | | return ip; |
86 | | } |
87 | | #endif |
88 | | |
89 | | #if defined(__SSE2__) |
90 | 0 | uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) { |
91 | 0 | uint8_t x = ip[-1]; |
92 | |
|
93 | 0 | while (ip < (ip_bound - sizeof(__m128i))) { |
94 | 0 | __m128i value, value2, cmp; |
95 | | /* Broadcast the value for every byte in a 128-bit register */ |
96 | 0 | memset(&value, x, sizeof(__m128i)); |
97 | 0 | value2 = _mm_loadu_si128((__m128i *)ref); |
98 | 0 | cmp = _mm_cmpeq_epi32(value, value2); |
99 | 0 | if (_mm_movemask_epi8(cmp) != 0xFFFF) { |
100 | | /* Return the byte that starts to differ */ |
101 | 0 | while (*ref++ == x) ip++; |
102 | 0 | return ip; |
103 | 0 | } |
104 | 0 | else { |
105 | 0 | ip += sizeof(__m128i); |
106 | 0 | ref += sizeof(__m128i); |
107 | 0 | } |
108 | 0 | } |
109 | | /* Look into the remainder */ |
110 | 0 | while ((ip < ip_bound) && (*ref++ == x)) ip++; |
111 | 0 | return ip; |
112 | 0 | } |
113 | | |
114 | | #endif |
115 | | |
116 | | |
117 | 104k | static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) { |
118 | 104k | uint8_t x = ip[-1]; |
119 | 104k | int64_t value, value2; |
120 | | /* Broadcast the value for every byte in a 64-bit register */ |
121 | 104k | memset(&value, x, 8); |
122 | | /* safe because the outer check against ip limit */ |
123 | 411k | while (ip < (ip_bound - sizeof(int64_t))) { |
124 | | #if defined(BLOSC_STRICT_ALIGN) |
125 | | memcpy(&value2, ref, 8); |
126 | | #else |
127 | 409k | value2 = ((int64_t*)ref)[0]; |
128 | 409k | #endif |
129 | 409k | if (value != value2) { |
130 | | /* Return the byte that starts to differ */ |
131 | 351k | while (*ref++ == x) ip++; |
132 | 103k | return ip; |
133 | 103k | } |
134 | 306k | else { |
135 | 306k | ip += 8; |
136 | 306k | ref += 8; |
137 | 306k | } |
138 | 409k | } |
139 | | /* Look into the remainder */ |
140 | 6.36k | while ((ip < ip_bound) && (*ref++ == x)) ip++; |
141 | 1.52k | return ip; |
142 | 104k | } |
143 | | |
144 | | |
145 | | /* Return the byte that starts to differ */ |
146 | 0 | uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) { |
147 | 0 | #if !defined(BLOSC_STRICT_ALIGN) |
148 | 0 | while (ip < (ip_bound - sizeof(int64_t))) { |
149 | 0 | if (*(int64_t*)ref != *(int64_t*)ip) { |
150 | | /* Return the byte that starts to differ */ |
151 | 0 | while (*ref++ == *ip++) {} |
152 | 0 | return ip; |
153 | 0 | } |
154 | 0 | else { |
155 | 0 | ip += sizeof(int64_t); |
156 | 0 | ref += sizeof(int64_t); |
157 | 0 | } |
158 | 0 | } |
159 | 0 | #endif |
160 | | /* Look into the remainder */ |
161 | 0 | while ((ip < ip_bound) && (*ref++ == *ip++)) {} |
162 | 0 | return ip; |
163 | 0 | } |
164 | | |
165 | | |
166 | | #if defined(__SSE2__) |
167 | 1.23M | static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) { |
168 | 1.23M | __m128i value, value2, cmp; |
169 | | |
170 | 2.19M | while (ip < (ip_bound - sizeof(__m128i))) { |
171 | 2.18M | value = _mm_loadu_si128((__m128i *) ip); |
172 | 2.18M | value2 = _mm_loadu_si128((__m128i *) ref); |
173 | 2.18M | cmp = _mm_cmpeq_epi32(value, value2); |
174 | 2.18M | if (_mm_movemask_epi8(cmp) != 0xFFFF) { |
175 | | /* Return the byte that starts to differ */ |
176 | 3.67M | while (*ref++ == *ip++) {} |
177 | 1.22M | return ip; |
178 | 1.22M | } |
179 | 963k | else { |
180 | 963k | ip += sizeof(__m128i); |
181 | 963k | ref += sizeof(__m128i); |
182 | 963k | } |
183 | 2.18M | } |
184 | | /* Look into the remainder */ |
185 | 44.4k | while ((ip < ip_bound) && (*ref++ == *ip++)) {} |
186 | 7.95k | return ip; |
187 | 1.23M | } |
188 | | #endif |
189 | | |
190 | | |
191 | | #if defined(__AVX2__) |
192 | | static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) { |
193 | | |
194 | | while (ip < (ip_bound - sizeof(__m256i))) { |
195 | | __m256i value, value2, cmp; |
196 | | value = _mm256_loadu_si256((__m256i *) ip); |
197 | | value2 = _mm256_loadu_si256((__m256i *)ref); |
198 | | cmp = _mm256_cmpeq_epi64(value, value2); |
199 | | if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) { |
200 | | /* Return the byte that starts to differ */ |
201 | | while (*ref++ == *ip++) {} |
202 | | return ip; |
203 | | } |
204 | | else { |
205 | | ip += sizeof(__m256i); |
206 | | ref += sizeof(__m256i); |
207 | | } |
208 | | } |
209 | | /* Look into the remainder */ |
210 | | while ((ip < ip_bound) && (*ref++ == *ip++)) {} |
211 | | return ip; |
212 | | } |
213 | | #endif |
214 | | |
215 | | |
216 | 1.33M | static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) { |
217 | 1.33M | if (BLOSCLZ_UNLIKELY(run)) { |
218 | | #if defined(__AVX2__) |
219 | | // Extensive experiments on AMD Ryzen3 say that regular get_run is faster |
220 | | // ip = get_run_32(ip, ip_bound, ref); |
221 | | ip = get_run(ip, ip_bound, ref); |
222 | | #elif defined(__SSE2__) |
223 | | // Extensive experiments on AMD Ryzen3 say that regular get_run is faster |
224 | | // ip = get_run_16(ip, ip_bound, ref); |
225 | 104k | ip = get_run(ip, ip_bound, ref); |
226 | | #else |
227 | | ip = get_run(ip, ip_bound, ref); |
228 | | #endif |
229 | 104k | } |
230 | 1.23M | else { |
231 | | #if defined(__AVX2__) |
232 | | // Extensive experiments on AMD Ryzen3 say that regular get_match_16 is faster |
233 | | // ip = get_match_32(ip, ip_bound, ref); |
234 | | ip = get_match_16(ip, ip_bound, ref); |
235 | | #elif defined(__SSE2__) |
236 | | ip = get_match_16(ip, ip_bound, ref); |
237 | | #else |
238 | | ip = get_match(ip, ip_bound, ref); |
239 | | #endif |
240 | 1.23M | } |
241 | | |
242 | 1.33M | return ip; |
243 | 1.33M | } |
244 | | |
245 | | |
246 | 14.4M | #define LITERAL(ip, op, op_limit, anchor, copy) { \ |
247 | 14.4M | if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit))) \ |
248 | 14.4M | goto out; \ |
249 | 14.4M | *(op)++ = *(anchor)++; \ |
250 | 14.4M | (ip) = (anchor); \ |
251 | 14.4M | (copy)++; \ |
252 | 14.4M | if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) { \ |
253 | 358k | (copy) = 0; \ |
254 | 358k | *(op)++ = MAX_COPY-1; \ |
255 | 358k | } \ |
256 | 14.4M | } |
257 | | |
258 | 3.29M | #define LITERAL2(ip, anchor, copy) { \ |
259 | 3.29M | oc++; (anchor)++; \ |
260 | 3.29M | (ip) = (anchor); \ |
261 | 3.29M | (copy)++; \ |
262 | 3.29M | if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) { \ |
263 | 67.2k | (copy) = 0; \ |
264 | 67.2k | oc++; \ |
265 | 67.2k | } \ |
266 | 3.29M | } |
267 | | |
268 | 380k | #define MATCH_SHORT(op, op_limit, len, distance) { \ |
269 | 380k | if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit))) \ |
270 | 380k | goto out; \ |
271 | 380k | *(op)++ = (uint8_t)(((len) << 5U) + ((distance) >> 8U));\ |
272 | 380k | *(op)++ = (uint8_t)(((distance) & 255U)); \ |
273 | 380k | } |
274 | | |
275 | 172k | #define MATCH_LONG(op, op_limit, len, distance) { \ |
276 | 172k | if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit))) \ |
277 | 172k | goto out; \ |
278 | 172k | *(op)++ = (uint8_t)((7U << 5U) + ((distance) >> 8U)); \ |
279 | 209k | for ((len) -= 7; (len) >= 255; (len) -= 255) { \ |
280 | 37.1k | if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit))) \ |
281 | 37.1k | goto out; \ |
282 | 37.1k | *(op)++ = 255; \ |
283 | 37.1k | } \ |
284 | 172k | if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit))) \ |
285 | 172k | goto out; \ |
286 | 172k | *(op)++ = (uint8_t)(len); \ |
287 | 172k | *(op)++ = (uint8_t)(((distance) & 255U)); \ |
288 | 172k | } |
289 | | |
290 | 1.08k | #define MATCH_SHORT_FAR(op, op_limit, len, distance) { \ |
291 | 1.08k | if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit))) \ |
292 | 1.08k | goto out; \ |
293 | 1.08k | *(op)++ = (uint8_t)(((len) << 5U) + 31); \ |
294 | 1.08k | *(op)++ = 255; \ |
295 | 1.08k | *(op)++ = (uint8_t)((distance) >> 8U); \ |
296 | 1.08k | *(op)++ = (uint8_t)((distance) & 255U); \ |
297 | 1.08k | } |
298 | | |
299 | 5.75k | #define MATCH_LONG_FAR(op, op_limit, len, distance) { \ |
300 | 5.75k | if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit))) \ |
301 | 5.75k | goto out; \ |
302 | 5.75k | *(op)++ = (7U << 5U) + 31; \ |
303 | 14.4k | for ((len) -= 7; (len) >= 255; (len) -= 255) { \ |
304 | 8.70k | if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit))) \ |
305 | 8.70k | goto out; \ |
306 | 8.70k | *(op)++ = 255; \ |
307 | 8.70k | } \ |
308 | 5.75k | if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit))) \ |
309 | 5.75k | goto out; \ |
310 | 5.75k | *(op)++ = (uint8_t)(len); \ |
311 | 5.75k | *(op)++ = 255; \ |
312 | 5.75k | *(op)++ = (uint8_t)((distance) >> 8U); \ |
313 | 5.75k | *(op)++ = (uint8_t)((distance) & 255U); \ |
314 | 5.75k | } |
315 | | |
316 | | |
317 | | // Get a guess for the compressed size of a buffer |
318 | 6.24k | static double get_cratio(uint8_t* ibase, int maxlen, int minlen, int ipshift) { |
319 | 6.24k | uint8_t* ip = ibase; |
320 | 6.24k | int32_t oc = 0; |
321 | 6.24k | const uint16_t hashlen = (1U << (uint8_t)HASH_LOG2); |
322 | 6.24k | uint16_t htab[1U << (uint8_t)HASH_LOG2]; |
323 | 6.24k | uint32_t hval; |
324 | 6.24k | uint32_t seq; |
325 | 6.24k | uint8_t copy; |
326 | | // Make a tradeoff between testing too much and too little |
327 | 6.24k | uint16_t limit = (maxlen > hashlen) ? hashlen : maxlen; |
328 | 6.24k | uint8_t* ip_bound = ibase + limit - 1; |
329 | 6.24k | uint8_t* ip_limit = ibase + limit - 12; |
330 | | |
331 | | // Initialize the hash table to distances of 0 |
332 | 6.24k | memset(htab, 0, hashlen * sizeof(uint16_t)); |
333 | | |
334 | | /* we start with literal copy */ |
335 | 6.24k | copy = 4; |
336 | 6.24k | oc += 5; |
337 | | |
338 | | /* main loop */ |
339 | 3.48M | while (BLOSCLZ_LIKELY(ip < ip_limit)) { |
340 | 3.48M | const uint8_t* ref; |
341 | 3.48M | unsigned distance; |
342 | 3.48M | uint8_t* anchor = ip; /* comparison starting-point */ |
343 | | |
344 | | /* find potential match */ |
345 | 3.48M | seq = BLOSCLZ_READU32(ip); |
346 | 3.48M | HASH_FUNCTION(hval, seq, HASH_LOG2) |
347 | 3.48M | ref = ibase + htab[hval]; |
348 | | |
349 | | /* calculate distance to the match */ |
350 | 3.48M | distance = (unsigned int)(anchor - ref); |
351 | | |
352 | | /* update hash table */ |
353 | 3.48M | htab[hval] = (uint16_t) (anchor - ibase); |
354 | | |
355 | 3.48M | if (distance == 0 || (distance >= MAX_FARDISTANCE)) { |
356 | 6.19k | LITERAL2(ip, anchor, copy) |
357 | 6.19k | continue; |
358 | 6.19k | } |
359 | | |
360 | | /* is this a match? check the first 4 bytes */ |
361 | 3.47M | if (BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip)) { |
362 | 337k | ref += 4; |
363 | 337k | } |
364 | 3.13M | else { |
365 | | /* no luck, copy as a literal */ |
366 | 3.13M | LITERAL2(ip, anchor, copy) |
367 | 3.13M | continue; |
368 | 3.13M | } |
369 | | |
370 | | /* last matched byte */ |
371 | 337k | ip = anchor + 4; |
372 | | |
373 | | /* distance is biased */ |
374 | 337k | distance--; |
375 | | |
376 | | /* get runs or matches; zero distance means a run */ |
377 | 337k | ip = get_run_or_match(ip, ip_bound, ref, !distance); |
378 | | |
379 | 337k | ip -= ipshift; |
380 | 337k | int len = (int)(ip - anchor); |
381 | 337k | if (len < minlen) { |
382 | 148k | LITERAL2(ip, anchor, copy) |
383 | 148k | continue; |
384 | 148k | } |
385 | | |
386 | | /* if we haven't copied anything, adjust the output counter */ |
387 | 189k | if (!copy) |
388 | 44.2k | oc--; |
389 | | /* reset literal counter */ |
390 | 189k | copy = 0; |
391 | | |
392 | | /* encode the match */ |
393 | 189k | if (distance < MAX_DISTANCE) { |
394 | 189k | if (len >= 7) { |
395 | 47.3k | oc += ((len - 7) / 255) + 1; |
396 | 47.3k | } |
397 | 189k | oc += 2; |
398 | 189k | } |
399 | 0 | else { |
400 | | /* far away, but not yet in the another galaxy... */ |
401 | 0 | if (len >= 7) { |
402 | 0 | oc += ((len - 7) / 255) + 1; |
403 | 0 | } |
404 | 0 | oc += 4; |
405 | 0 | } |
406 | | |
407 | | /* update the hash at match boundary */ |
408 | 189k | seq = BLOSCLZ_READU32(ip); |
409 | 189k | HASH_FUNCTION(hval, seq, HASH_LOG2) |
410 | 189k | htab[hval] = (uint16_t)(ip++ - ibase); |
411 | 189k | ip++; |
412 | | /* assuming literal copy */ |
413 | 189k | oc++; |
414 | 189k | } |
415 | | |
416 | 6.24k | double ic = (double)(ip - ibase); |
417 | 6.24k | return ic / (double)oc; |
418 | 6.24k | } |
419 | | |
420 | | |
421 | | int blosclz_compress(const int clevel, const void* input, int length, |
422 | 6.24k | void* output, int maxout, const int split_block) { |
423 | 6.24k | uint8_t* ibase = (uint8_t*)input; |
424 | | |
425 | | // Experiments say that checking 1/4 of the buffer is enough to figure out approx cratio |
426 | 6.24k | int maxlen = length / 4; |
427 | | // Start probing somewhere inside the buffer |
428 | 6.24k | int shift = length - maxlen; |
429 | | // Actual entropy probing! |
430 | 6.24k | double cratio = get_cratio(ibase + shift, maxlen, 3, 3); |
431 | | // discard probes with small compression ratios (too expensive) |
432 | 6.24k | double cratio_[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0}; |
433 | 6.24k | if (cratio < cratio_[clevel]) { |
434 | 649 | goto out; |
435 | 649 | } |
436 | | |
437 | | /* When we go back in a match (shift), we obtain quite different compression properties. |
438 | | * It looks like 4 is more useful in combination with bitshuffle and small typesizes |
439 | | * Fallback to 4 because it provides more consistent results for large cratios. |
440 | | * |
441 | | * In this block we also check cratios for the beginning of the buffers and |
442 | | * eventually discard those that are small (take too long to decompress). |
443 | | * This process is called _entropy probing_. |
444 | | */ |
445 | 5.59k | unsigned ipshift = 4; |
446 | | // Compute optimal shift and minimum lengths for encoding |
447 | | // Use 4 by default, except for low entropy data, where we should do a best effort |
448 | 5.59k | unsigned minlen = 4; |
449 | | // BloscLZ works better with splits mostly, so when data is not split, do a best effort |
450 | | // Why using cratio < 4 is based in experiments with low and high entropy |
451 | 5.59k | if (!split_block || cratio < 4) { |
452 | 5.41k | ipshift = 3; |
453 | 5.41k | minlen = 3; |
454 | 5.41k | } |
455 | 180 | else { |
456 | 180 | minlen = 4; |
457 | 180 | } |
458 | | |
459 | 5.59k | uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG, |
460 | 5.59k | HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG}; |
461 | 5.59k | uint8_t hashlog = hashlog_[clevel]; |
462 | | |
463 | 5.59k | uint8_t* ip = ibase; |
464 | 5.59k | uint8_t* ip_bound = ibase + length - 1; |
465 | 5.59k | uint8_t* ip_limit = ibase + length - 12; |
466 | 5.59k | uint8_t* op = (uint8_t*)output; |
467 | 5.59k | const uint8_t* op_limit = op + maxout; |
468 | 5.59k | uint32_t seq; |
469 | 5.59k | uint8_t copy; |
470 | 5.59k | uint32_t hval; |
471 | | |
472 | | /* input and output buffer cannot be less than 16 and 66 bytes or we can get into trouble */ |
473 | 5.59k | if (length < 16 || maxout < 66) { |
474 | 4 | return 0; |
475 | 4 | } |
476 | | |
477 | | // Initialize the hash table |
478 | 5.58k | uint32_t htab[1U << (uint8_t)HASH_LOG]; |
479 | 5.58k | memset(htab, 0, (1U << hashlog) * sizeof(uint32_t)); |
480 | | |
481 | | /* we start with literal copy */ |
482 | 5.58k | copy = 4; |
483 | 5.58k | *op++ = MAX_COPY - 1; |
484 | 5.58k | *op++ = *ip++; |
485 | 5.58k | *op++ = *ip++; |
486 | 5.58k | *op++ = *ip++; |
487 | 5.58k | *op++ = *ip++; |
488 | | |
489 | | /* main loop */ |
490 | 15.0M | while (BLOSCLZ_LIKELY(ip < ip_limit)) { |
491 | 15.0M | const uint8_t* ref; |
492 | 15.0M | unsigned distance; |
493 | 15.0M | uint8_t* anchor = ip; /* comparison starting-point */ |
494 | | |
495 | | /* find potential match */ |
496 | 15.0M | seq = BLOSCLZ_READU32(ip); |
497 | 15.0M | HASH_FUNCTION(hval, seq, hashlog) |
498 | 15.0M | ref = ibase + htab[hval]; |
499 | | |
500 | | /* calculate distance to the match */ |
501 | 15.0M | distance = (unsigned int)(anchor - ref); |
502 | | |
503 | | /* update hash table */ |
504 | 15.0M | htab[hval] = (uint32_t) (anchor - ibase); |
505 | | |
506 | 15.0M | if (distance == 0 || (distance >= MAX_FARDISTANCE)) { |
507 | 0 | LITERAL(ip, op, op_limit, anchor, copy) |
508 | 0 | continue; |
509 | 0 | } |
510 | | |
511 | | /* is this a match? check the first 4 bytes */ |
512 | 15.0M | if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) { |
513 | 998k | ref += 4; |
514 | 14.0M | } else { |
515 | | /* no luck, copy as a literal */ |
516 | 14.0M | LITERAL(ip, op, op_limit, anchor, copy) |
517 | 0 | continue; |
518 | 14.0M | } |
519 | | |
520 | | /* last matched byte */ |
521 | 998k | ip = anchor + 4; |
522 | | |
523 | | /* distance is biased */ |
524 | 998k | distance--; |
525 | | |
526 | | /* get runs or matches; zero distance means a run */ |
527 | 998k | ip = get_run_or_match(ip, ip_bound, ref, !distance); |
528 | | |
529 | | /* length is biased, '1' means a match of 3 bytes */ |
530 | 998k | ip -= ipshift; |
531 | | |
532 | 998k | unsigned len = (int)(ip - anchor); |
533 | | |
534 | | // Encoding short lengths is expensive during decompression |
535 | 998k | if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) { |
536 | 438k | LITERAL(ip, op, op_limit, anchor, copy) |
537 | 0 | continue; |
538 | 438k | } |
539 | | |
540 | | /* if we have copied something, adjust the copy count */ |
541 | 560k | if (copy) |
542 | | /* copy is biased, '0' means 1 byte copy */ |
543 | 400k | *(op - copy - 1) = (uint8_t)(copy - 1); |
544 | 159k | else |
545 | | /* back, to overwrite the copy count */ |
546 | 159k | op--; |
547 | | /* reset literal counter */ |
548 | 560k | copy = 0; |
549 | | |
550 | | /* encode the match */ |
551 | 560k | if (distance < MAX_DISTANCE) { |
552 | 553k | if (len < 7) { |
553 | 380k | MATCH_SHORT(op, op_limit, len, distance) |
554 | 380k | } else { |
555 | 516k | MATCH_LONG(op, op_limit, len, distance) |
556 | 516k | } |
557 | 553k | } else { |
558 | | /* far away, but not yet in the another galaxy... */ |
559 | 6.83k | distance -= MAX_DISTANCE; |
560 | 6.83k | if (len < 7) { |
561 | 1.08k | MATCH_SHORT_FAR(op, op_limit, len, distance) |
562 | 5.75k | } else { |
563 | 17.2k | MATCH_LONG_FAR(op, op_limit, len, distance) |
564 | 17.2k | } |
565 | 6.83k | } |
566 | | |
567 | | /* update the hash at match boundary */ |
568 | 559k | seq = BLOSCLZ_READU32(ip); |
569 | 559k | HASH_FUNCTION(hval, seq, hashlog) |
570 | 559k | htab[hval] = (uint32_t) (ip++ - ibase); |
571 | 559k | if (clevel == 9) { |
572 | | // In some situations, including a second hash proves to be useful, |
573 | | // but not in others. Activating here in max clevel only. |
574 | 344k | seq >>= 8U; |
575 | 344k | HASH_FUNCTION(hval, seq, hashlog) |
576 | 344k | htab[hval] = (uint32_t) (ip++ - ibase); |
577 | 344k | } |
578 | 215k | else { |
579 | 215k | ip++; |
580 | 215k | } |
581 | | |
582 | 559k | if (BLOSCLZ_UNLIKELY(op + 1 > op_limit)) |
583 | 97 | goto out; |
584 | | |
585 | | /* assuming literal copy */ |
586 | 559k | *op++ = MAX_COPY - 1; |
587 | 559k | } |
588 | | |
589 | | /* left-over as literal copy */ |
590 | 28.2k | while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) { |
591 | 24.0k | if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out; |
592 | 23.9k | *op++ = *ip++; |
593 | 23.9k | copy++; |
594 | 23.9k | if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) { |
595 | 224 | copy = 0; |
596 | 224 | *op++ = MAX_COPY - 1; |
597 | 224 | } |
598 | 23.9k | } |
599 | | |
600 | | /* if we have copied something, adjust the copy length */ |
601 | 4.15k | if (copy) |
602 | 4.10k | *(op - copy - 1) = (uint8_t)(copy - 1); |
603 | 51 | else |
604 | 51 | op--; |
605 | | |
606 | | /* marker for blosclz */ |
607 | 4.15k | *(uint8_t*)output |= (1U << 5U); |
608 | | |
609 | 4.15k | return (int)(op - (uint8_t*)output); |
610 | | |
611 | 2.08k | out: |
612 | 2.08k | return 0; |
613 | 4.25k | } |
614 | | |
615 | | // See https://habr.com/en/company/yandex/blog/457612/ |
616 | | #if defined(__AVX2__) |
617 | | |
618 | | #if defined(_MSC_VER) |
619 | | #define ALIGNED_(x) __declspec(align(x)) |
620 | | #else |
621 | | #if defined(__GNUC__) |
622 | | #define ALIGNED_(x) __attribute__ ((aligned(x))) |
623 | | #endif |
624 | | #endif |
625 | | #define ALIGNED_TYPE_(t, x) t ALIGNED_(x) |
626 | | |
627 | | static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len) |
628 | | { |
629 | | size_t offset = op - match; |
630 | | while (len >= 16) { |
631 | | |
632 | | static const ALIGNED_TYPE_(uint8_t, 16) masks[] = |
633 | | { |
634 | | 0, 1, 2, 1, 4, 1, 4, 2, 8, 7, 6, 5, 4, 3, 2, 1, // offset = 0, not used as mask, but for shift |
635 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // offset = 1 |
636 | | 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, |
637 | | 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, |
638 | | 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, |
639 | | 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, |
640 | | 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, |
641 | | 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, |
642 | | 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, |
643 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, |
644 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, |
645 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, |
646 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, |
647 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, |
648 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, |
649 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, |
650 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // offset = 16 |
651 | | }; |
652 | | |
653 | | _mm_storeu_si128((__m128i *)(op), |
654 | | _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)), |
655 | | _mm_load_si128((const __m128i *)(masks) + offset))); |
656 | | |
657 | | match += masks[offset]; |
658 | | |
659 | | op += 16; |
660 | | len -= 16; |
661 | | } |
662 | | // Deal with remainders |
663 | | for (; len > 0; len--) { |
664 | | *op++ = *match++; |
665 | | } |
666 | | return op; |
667 | | } |
668 | | #endif |
669 | | |
670 | | // LZ4 wildCopy which can reach excellent copy bandwidth (even if insecure) |
671 | 2.75k | static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) { |
672 | 2.75k | uint8_t* d = out; |
673 | 2.75k | const uint8_t* s = from; |
674 | 2.75k | uint8_t* const e = end; |
675 | | |
676 | 3.34k | do { memcpy(d,s,8); d+=8; s+=8; } while (d<e); |
677 | 2.75k | } |
678 | | |
679 | 113 | int blosclz_decompress(const void* input, int length, void* output, int maxout) { |
680 | 113 | const uint8_t* ip = (const uint8_t*)input; |
681 | 113 | const uint8_t* ip_limit = ip + length; |
682 | 113 | uint8_t* op = (uint8_t*)output; |
683 | 113 | uint32_t ctrl; |
684 | 113 | uint8_t* op_limit = op + maxout; |
685 | 113 | if (BLOSCLZ_UNLIKELY(length == 0)) { |
686 | 0 | return 0; |
687 | 0 | } |
688 | 113 | ctrl = (*ip++) & 31U; |
689 | | |
690 | 19.2k | while (1) { |
691 | 19.2k | if (ctrl >= 32) { |
692 | | // match |
693 | 3.72k | int32_t len = (int32_t)(ctrl >> 5U) - 1 ; |
694 | 3.72k | int32_t ofs = (int32_t)(ctrl & 31U) << 8U; |
695 | 3.72k | uint8_t code; |
696 | 3.72k | const uint8_t* ref = op - ofs; |
697 | | |
698 | 3.72k | if (len == 7 - 1) { |
699 | 648 | do { |
700 | 648 | if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) { |
701 | 0 | return 0; |
702 | 0 | } |
703 | 648 | code = *ip++; |
704 | 648 | len += code; |
705 | 648 | } while (code == 255); |
706 | 648 | } |
707 | 3.07k | else { |
708 | 3.07k | if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) { |
709 | 0 | return 0; |
710 | 0 | } |
711 | 3.07k | } |
712 | 3.72k | code = *ip++; |
713 | 3.72k | len += 3; |
714 | 3.72k | ref -= code; |
715 | | |
716 | | /* match from 16-bit distance */ |
717 | 3.72k | if (BLOSCLZ_UNLIKELY(code == 255)) { |
718 | 623 | if (ofs == (31U << 8U)) { |
719 | 0 | if (ip + 1 >= ip_limit) { |
720 | 0 | return 0; |
721 | 0 | } |
722 | 0 | ofs = (*ip++) << 8U; |
723 | 0 | ofs += *ip++; |
724 | 0 | ref = op - ofs - MAX_DISTANCE; |
725 | 0 | } |
726 | 623 | } |
727 | | |
728 | 3.72k | if (BLOSCLZ_UNLIKELY(op + len > op_limit)) { |
729 | 0 | return 0; |
730 | 0 | } |
731 | | |
732 | 3.72k | if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) { |
733 | 0 | return 0; |
734 | 0 | } |
735 | | |
736 | 3.72k | if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break; |
737 | 3.72k | ctrl = *ip++; |
738 | | |
739 | 3.72k | ref--; |
740 | 3.72k | if (ref == op - 1) { |
741 | | /* optimized copy for a run */ |
742 | 656 | memset(op, *ref, len); |
743 | 656 | op += len; |
744 | 656 | } |
745 | 3.07k | else if ((op - ref >= 8) && (op_limit - op >= len + 8)) { |
746 | | // copy with an overlap not larger than 8 |
747 | 2.75k | wild_copy(op, ref, op + len); |
748 | 2.75k | op += len; |
749 | 2.75k | } |
750 | 319 | else { |
751 | | // general copy with any overlap |
752 | | #if defined(__AVX2__) |
753 | | if (op - ref <= 16) { |
754 | | // This is not faster on a combination of compilers (clang, gcc, icc) or machines, but |
755 | | // it is not slower either. Let's activate here for experimentation. |
756 | | op = copy_match_16(op, ref, len); |
757 | | } |
758 | | else { |
759 | | #endif |
760 | 319 | op = copy_match(op, ref, (unsigned) len); |
761 | | #if defined(__AVX2__) |
762 | | } |
763 | | #endif |
764 | 319 | } |
765 | 3.72k | } |
766 | 15.5k | else { |
767 | | // literal |
768 | 15.5k | ctrl++; |
769 | 15.5k | if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) { |
770 | 0 | return 0; |
771 | 0 | } |
772 | 15.5k | if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) { |
773 | 0 | return 0; |
774 | 0 | } |
775 | | |
776 | 15.5k | memcpy(op, ip, ctrl); op += ctrl; ip += ctrl; |
777 | | // On GCC-6, fastcopy this is still faster than plain memcpy |
778 | | // However, using recent CLANG/LLVM 9.0, there is almost no difference |
779 | | // in performance. |
780 | | // And starting on CLANG/LLVM 10 and GCC 9, memcpy is generally faster. |
781 | | // op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl; |
782 | | |
783 | 15.5k | if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break; |
784 | 15.4k | ctrl = *ip++; |
785 | 15.4k | } |
786 | 19.2k | } |
787 | | |
788 | 113 | return (int)(op - (uint8_t*)output); |
789 | 113 | } |