/src/c-blosc2/blosc/blosclz.c
/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
  https://blosc.org
  License: BSD 3-Clause (see LICENSE.txt)

  See LICENSE.txt for details about copyright and rights to use.
**********************************************************************/

/*********************************************************************
  The code in this file is heavily based on FastLZ, a lightning-fast
  lossless compression library.  See LICENSES/FASTLZ.txt for details.
**********************************************************************/


#include "blosclz.h"
#include "fastcopy.h"
#include "blosc2/blosc2-common.h"

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>

/*
 * Give hints to the compiler for branch prediction optimization.
 * This is not necessary anymore with modern CPUs.
 */
#if 0 && defined(__GNUC__) && (__GNUC__ > 2)
#define BLOSCLZ_LIKELY(c) (__builtin_expect((c), 1))
#define BLOSCLZ_UNLIKELY(c) (__builtin_expect((c), 0))
#else
#define BLOSCLZ_LIKELY(c) (c)
#define BLOSCLZ_UNLIKELY(c) (c)
#endif

/*
 * Use inlined functions for supported systems.
 */
#if defined(_MSC_VER) && !defined(__cplusplus)  /* Visual Studio */
#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
#endif

#define MAX_COPY 32U
#define MAX_DISTANCE 8191
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)
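
/*
 * A note on the distance limits (as implied by the MATCH_* macros below,
 * not a normative spec): regular matches pack the distance into 5 bits of
 * the token plus one extra byte, hence MAX_DISTANCE = 2^13 - 1 = 8191.
 * The *_FAR variants append an explicit 16-bit distance that is biased by
 * MAX_DISTANCE, which gives MAX_FARDISTANCE.
 */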

#ifdef BLOSC_STRICT_ALIGN
#define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
#define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
#else
#define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
#define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
#endif

#define HASH_LOG (14U)

// This is used in LZ4 and seems to work pretty well here too
#define HASH_FUNCTION(v, s, h) {                   \
  (v) = ((s) * 2654435761U) >> (32U - (h));        \
}

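/*
 * 2654435761 is the 32-bit golden-ratio multiplier popularized by Knuth
 * (and also used by LZ4): multiplying a 4-byte sequence by it and keeping
 * the top h bits spreads similar sequences over the table.  For example,
 * with h = HASH_LOG = 14 the bucket index is the high 14 bits of
 * (seq * 2654435761) mod 2^32, i.e. a value in [0, 16383].
 */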

#if defined(__AVX2__)
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];

  while (ip < (ip_bound - (sizeof(__m256i)))) {
    __m256i value, value2, cmp;
    /* Broadcast the value for every byte in a 256-bit register */
    memset(&value, x, sizeof(__m256i));
    value2 = _mm256_loadu_si256((__m256i *)ref);
    cmp = _mm256_cmpeq_epi64(value, value2);
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += sizeof(__m256i);
      ref += sizeof(__m256i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}
#endif

#if defined(__SSE2__)
uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];

  while (ip < (ip_bound - sizeof(__m128i))) {
    __m128i value, value2, cmp;
    /* Broadcast the value for every byte in a 128-bit register */
    memset(&value, x, sizeof(__m128i));
    value2 = _mm_loadu_si128((__m128i *)ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}

#endif


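/*
 * The helpers below scan how far a candidate match extends.  The get_run*
 * variants handle the self-overlapping case (a run of the single repeated
 * byte ip[-1]), while the get_match* variants compare two distinct regions.
 * All of them return a position just past the matched bytes; the callers
 * compensate for any overshoot with their ipshift bias.
 */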
static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];
  int64_t value, value2;
  /* Broadcast the value for every byte in a 64-bit register */
  memset(&value, x, 8);
  /* safe because of the outer check against the ip limit */
  while (ip < (ip_bound - sizeof(int64_t))) {
#if defined(BLOSC_STRICT_ALIGN)
    memcpy(&value2, ref, 8);
#else
    value2 = ((int64_t*)ref)[0];
#endif
    if (value != value2) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += 8;
      ref += 8;
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}


/* Return the byte that starts to differ */
uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
#if !defined(BLOSC_STRICT_ALIGN)
  while (ip < (ip_bound - sizeof(int64_t))) {
    if (*(int64_t*)ref != *(int64_t*)ip) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(int64_t);
      ref += sizeof(int64_t);
    }
  }
#endif
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}


#if defined(__SSE2__)
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  __m128i value, value2, cmp;

  while (ip < (ip_bound - sizeof(__m128i))) {
    value = _mm_loadu_si128((__m128i *) ip);
    value2 = _mm_loadu_si128((__m128i *) ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
#endif


#if defined(__AVX2__)
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {

  while (ip < (ip_bound - sizeof(__m256i))) {
    __m256i value, value2, cmp;
    value = _mm256_loadu_si256((__m256i *) ip);
    value2 = _mm256_loadu_si256((__m256i *)ref);
    cmp = _mm256_cmpeq_epi64(value, value2);
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(__m256i);
      ref += sizeof(__m256i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
#endif


static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
  if (BLOSCLZ_UNLIKELY(run)) {
#if defined(__AVX2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
    // ip = get_run_32(ip, ip_bound, ref);
    ip = get_run(ip, ip_bound, ref);
#elif defined(__SSE2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
    // ip = get_run_16(ip, ip_bound, ref);
    ip = get_run(ip, ip_bound, ref);
#else
    ip = get_run(ip, ip_bound, ref);
#endif
  }
  else {
#if defined(__AVX2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_match_16 is faster
    // ip = get_match_32(ip, ip_bound, ref);
    ip = get_match_16(ip, ip_bound, ref);
#elif defined(__SSE2__)
    ip = get_match_16(ip, ip_bound, ref);
#else
    ip = get_match(ip, ip_bound, ref);
#endif
  }

  return ip;
}


#define LITERAL(ip, op, op_limit, anchor, copy) {          \
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))             \
    goto out;                                              \
  *(op)++ = *(anchor)++;                                   \
  (ip) = (anchor);                                         \
  (copy)++;                                                \
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {              \
    (copy) = 0;                                            \
    *(op)++ = MAX_COPY-1;                                  \
  }                                                        \
}

#define LITERAL2(ip, anchor, copy) {                       \
  oc++; (anchor)++;                                        \
  (ip) = (anchor);                                         \
  (copy)++;                                                \
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {              \
    (copy) = 0;                                            \
    oc++;                                                  \
  }                                                        \
}

#define MATCH_SHORT(op, op_limit, len, distance) {         \
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))             \
    goto out;                                              \
  *(op)++ = (uint8_t)(((len) << 5U) + ((distance) >> 8U)); \
  *(op)++ = (uint8_t)(((distance) & 255U));                \
}

#define MATCH_LONG(op, op_limit, len, distance) {          \
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))             \
    goto out;                                              \
  *(op)++ = (uint8_t)((7U << 5U) + ((distance) >> 8U));    \
  for ((len) -= 7; (len) >= 255; (len) -= 255) {           \
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))           \
      goto out;                                            \
    *(op)++ = 255;                                         \
  }                                                        \
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))             \
    goto out;                                              \
  *(op)++ = (uint8_t)(len);                                \
  *(op)++ = (uint8_t)(((distance) & 255U));                \
}

#define MATCH_SHORT_FAR(op, op_limit, len, distance) {     \
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))             \
    goto out;                                              \
  *(op)++ = (uint8_t)(((len) << 5U) + 31);                 \
  *(op)++ = 255;                                           \
  *(op)++ = (uint8_t)((distance) >> 8U);                   \
  *(op)++ = (uint8_t)((distance) & 255U);                  \
}

#define MATCH_LONG_FAR(op, op_limit, len, distance) {      \
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))             \
    goto out;                                              \
  *(op)++ = (7U << 5U) + 31;                               \
  for ((len) -= 7; (len) >= 255; (len) -= 255) {           \
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))           \
      goto out;                                            \
    *(op)++ = 255;                                         \
  }                                                        \
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))             \
    goto out;                                              \
  *(op)++ = (uint8_t)(len);                                \
  *(op)++ = 255;                                           \
  *(op)++ = (uint8_t)((distance) >> 8U);                   \
  *(op)++ = (uint8_t)((distance) & 255U);                  \
}

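/*
 * A sketch of the stream layout implied by the macros above and by the
 * decompressor below (a reading of the code, not a normative spec):
 *
 *   literal run:  [000ccccc]              copy ccccc+1 literal bytes
 *   short match:  [lll ddddd][dddddddd]   biased length lll ('1' means a
 *                 3-byte match) and a 13-bit distance
 *   long match:   [111 ddddd][255]...[nnnnnnnn][dddddddd]  the length
 *                 accumulates 255 per extension byte, plus nnnnnnnn
 *   far match:    the five distance bits are all ones (31) and a 255 byte
 *                 precedes an explicit 16-bit distance biased by MAX_DISTANCE
 */
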
// Get a guess for the compressed size of a buffer
static double get_cratio(uint8_t* ibase, int maxlen, int minlen, int ipshift, uint32_t htab[], int8_t hashlog) {
  uint8_t* ip = ibase;
  int32_t oc = 0;
  const uint16_t hashlen = (1U << (uint8_t)hashlog);
  uint32_t hval;
  uint32_t seq;
  uint8_t copy;
  // Make a tradeoff between testing too much and too little
  uint16_t limit = (maxlen > hashlen) ? hashlen : maxlen;
  uint8_t* ip_bound = ibase + limit - 1;
  uint8_t* ip_limit = ibase + limit - 12;

  // Initialize the hash table to distances of 0
  memset(htab, 0, hashlen * sizeof(uint32_t));

  /* we start with literal copy */
  copy = 4;
  oc += 5;

  /* main loop */
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
    const uint8_t* ref;
    unsigned distance;
    uint8_t* anchor = ip;  /* comparison starting-point */

    /* find potential match */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = (unsigned int)(anchor - ref);

    /* update hash table */
    htab[hval] = (uint32_t) (anchor - ibase);

    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* is this a match? check the first 4 bytes */
    if (BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip)) {
      ref += 4;
    }
    else {
      /* no luck, copy as a literal */
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* last matched byte */
    ip = anchor + 4;

    /* distance is biased */
    distance--;

    /* get runs or matches; zero distance means a run */
    ip = get_run_or_match(ip, ip_bound, ref, !distance);

    ip -= ipshift;
    int len = (int)(ip - anchor);
    if (len < minlen) {
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* if we haven't copied anything, adjust the output counter */
    if (!copy)
      oc--;
    /* reset literal counter */
    copy = 0;

    /* encode the match */
    if (distance < MAX_DISTANCE) {
      if (len >= 7) {
        oc += ((len - 7) / 255) + 1;
      }
      oc += 2;
    }
    else {
      /* far away, but not yet in another galaxy... */
      if (len >= 7) {
        oc += ((len - 7) / 255) + 1;
      }
      oc += 4;
    }

    /* update the hash at match boundary */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    htab[hval] = (uint32_t)(ip++ - ibase);
    ip++;
    /* assuming literal copy */
    oc++;
  }

  double ic = (double)(ip - ibase);
  return ic / (double)oc;
}


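/*
 * Illustration (not from the source): if the probe consumes ic = 16384
 * input bytes and estimates oc = 8192 output bytes, get_cratio() returns
 * 2.0.  blosclz_compress() below compares that against a per-clevel
 * threshold (e.g. 2.0 for clevel 1, 1.1 for clevel 8) and bails out early
 * when the block does not look compressible enough.
 */
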
int blosclz_compress(const int clevel, const void* input, int length,
                     void* output, int maxout, blosc2_context* ctx) {
  BLOSC_UNUSED_PARAM(ctx);
  uint8_t* ibase = (uint8_t*)input;
  uint32_t htab[1U << (uint8_t)HASH_LOG];

  /* When we go back in a match (shift), we obtain quite different compression properties.
   * It looks like 4 is more useful in combination with bitshuffle and small typesizes.
   * Fall back to 4 because it provides more consistent results for large cratios.
   * UPDATE: new experiments show that using a value of 3 is a bit better, at least for ERA5.
   * UPDATE 2: go back to 4, as it seems to provide better cratios in general.
   *
   * In this block we also check cratios for the beginning of the buffers and
   * eventually discard those that are small (take too long to decompress).
   * This process is called _entropy probing_.
   */
  unsigned ipshift = 4;
  // Minimum lengths for encoding (normally it is good to match the shift value)
  unsigned minlen = 4;

  uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
                          HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
  uint8_t hashlog = hashlog_[clevel];

  // Experiments say that checking 1/4 of the buffer is enough to figure out the approx cratio
  // UPDATE: new experiments with ERA5 datasets (float32) say that checking the whole buffer
  // is better (especially when combined with bitshuffle).
  // The loss in speed for checking the whole buffer is pretty negligible too.
  int maxlen = length;
  if (clevel < 2) {
    maxlen /= 8;
  }
  else if (clevel < 4) {
    maxlen /= 4;
  }
  else if (clevel < 7) {
    maxlen /= 2;
  }
  // Start probing somewhere inside the buffer
  int shift = length - maxlen;
  // Actual entropy probing!
  double cratio = get_cratio(ibase + shift, maxlen, minlen, ipshift, htab, hashlog);
  // Discard probes with small compression ratios (too expensive)
  double cratio_[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};
  if (cratio < cratio_[clevel]) {
    goto out;
  }

  uint8_t* ip = ibase;
  uint8_t* ip_bound = ibase + length - 1;
  uint8_t* ip_limit = ibase + length - 12;
  uint8_t* op = (uint8_t*)output;
  const uint8_t* op_limit = op + maxout;
  uint32_t seq;
  uint8_t copy;
  uint32_t hval;

  /* input and output buffers cannot be less than 16 and 66 bytes or we can get into trouble */
  if (length < 16 || maxout < 66) {
    return 0;
  }

  // Initialize the hash table
  memset(htab, 0, (1U << hashlog) * sizeof(uint32_t));

  /* we start with literal copy */
  copy = 4;
  *op++ = MAX_COPY - 1;
  *op++ = *ip++;
  *op++ = *ip++;
  *op++ = *ip++;
  *op++ = *ip++;

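  /* Note on the layout so far (as implied by the code above): the first
   * output byte is a literal-run token provisionally set to MAX_COPY-1;
   * `copy` counts the literals emitted since that token, and the token at
   * *(op - copy - 1) is later patched with the real, biased count
   * ('0' means a 1-byte copy). */
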
  /* main loop */
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
    const uint8_t* ref;
    unsigned distance;
    uint8_t* anchor = ip;  /* comparison starting-point */

    /* find potential match */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = (unsigned int)(anchor - ref);

    /* update hash table */
    htab[hval] = (uint32_t) (anchor - ibase);

    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* is this a match? check the first 4 bytes */
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
      ref += 4;
    } else {
      /* no luck, copy as a literal */
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* last matched byte */
    ip = anchor + 4;

    /* distance is biased */
    distance--;

    /* get runs or matches; zero distance means a run */
    ip = get_run_or_match(ip, ip_bound, ref, !distance);

    /* length is biased, '1' means a match of 3 bytes */
    ip -= ipshift;

    unsigned len = (unsigned)(ip - anchor);

    // Encoding short lengths is expensive during decompression
    if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* if we have copied something, adjust the copy count */
    if (copy)
      /* copy is biased, '0' means 1 byte copy */
      *(op - copy - 1) = (uint8_t)(copy - 1);
    else
      /* back, to overwrite the copy count */
      op--;
    /* reset literal counter */
    copy = 0;

    /* encode the match */
    if (distance < MAX_DISTANCE) {
      if (len < 7) {
        MATCH_SHORT(op, op_limit, len, distance)
      } else {
        MATCH_LONG(op, op_limit, len, distance)
      }
    } else {
      /* far away, but not yet in another galaxy... */
      distance -= MAX_DISTANCE;
      if (len < 7) {
        MATCH_SHORT_FAR(op, op_limit, len, distance)
      } else {
        MATCH_LONG_FAR(op, op_limit, len, distance)
      }
    }

    /* update the hash at match boundary */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    htab[hval] = (uint32_t) (ip++ - ibase);
    if (clevel == 9) {
      // In some situations, including a second hash proves to be useful,
      // but not in others.  Activating it here for max clevel only.
      seq >>= 8U;
      HASH_FUNCTION(hval, seq, hashlog)
      htab[hval] = (uint32_t) (ip++ - ibase);
    }
    else {
      ip++;
    }

    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
      goto out;

    /* assuming literal copy */
    *op++ = MAX_COPY - 1;
  }

  /* left-over as literal copy */
  while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) {
    if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out;
    *op++ = *ip++;
    copy++;
    if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {
      copy = 0;
      *op++ = MAX_COPY - 1;
    }
  }

  /* if we have copied something, adjust the copy length */
  if (copy)
    *(op - copy - 1) = (uint8_t)(copy - 1);
  else
    op--;

  /* marker for blosclz */
  *(uint8_t*)output |= (1U << 5U);

  return (int)(op - (uint8_t*)output);

  out:
  return 0;
}

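/*
 * A minimal usage sketch (illustrative, not from the source; inside Blosc2
 * these entry points are normally driven by blosc2_compress() rather than
 * being called directly).  ctx is unused by blosclz_compress(), so passing
 * NULL is assumed to be acceptable here:
 *
 *   uint8_t src[256], dst[512], back[256];
 *   // ... fill src ...
 *   int csize = blosclz_compress(5, src, sizeof(src), dst, sizeof(dst), NULL);
 *   if (csize > 0) {
 *     // 0 means "not compressible enough" (or an error); > 0 is the
 *     // compressed size
 *     int dsize = blosclz_decompress(dst, csize, back, sizeof(back));
 *     // on success, dsize == sizeof(src)
 *   }
 */
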
// See https://habr.com/en/company/yandex/blog/457612/
#if defined(__AVX2__)

#if defined(_MSC_VER)
#define ALIGNED_(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define ALIGNED_(x) __attribute__ ((aligned(x)))
#endif
#endif
#define ALIGNED_TYPE_(t, x) t ALIGNED_(x)

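/*
 * How the shuffle trick below works (a sketch; see the post linked above):
 * for an overlap of `offset` bytes (1 <= offset <= 16), the mask row
 * replicates source bytes 0..offset-1 cyclically over 16 lanes, so a single
 * PSHUFB emits 16 bytes of the repeating pattern.  The first row is not a
 * mask: masks[offset] is the amount to advance `match` (congruent to
 * 16 mod offset) so the next 16-byte block continues the pattern in phase.
 */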
static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
{
  size_t offset = op - match;
  while (len >= 16) {

    static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
      {
        0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, // offset = 0, not used as mask, but for shift
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // offset = 1
        0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
        0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
        0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
        0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
        0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
        0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
        0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, // offset = 16
      };

    _mm_storeu_si128((__m128i *)(op),
                     _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
                                      _mm_load_si128((const __m128i *)(masks) + offset)));

    match += masks[offset];

    op += 16;
    len -= 16;
  }
  // Deal with remainders
  for (; len > 0; len--) {
    *op++ = *match++;
  }
  return op;
}
#endif

// LZ4 wildCopy which can reach excellent copy bandwidth (even if insecure)
static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) {
  uint8_t* d = out;
  const uint8_t* s = from;
  uint8_t* const e = end;

  do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
}

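/*
 * wild_copy() writes in whole 8-byte blocks and may overrun `end` by up to
 * 7 bytes, so the decompressor below only uses it when the source/destination
 * overlap is at least 8 bytes and the output buffer has at least len + 8
 * spare bytes; everything else falls through to the safe copy paths.
 */
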
int blosclz_decompress(const void* input, int length, void* output, int maxout) {
  const uint8_t* ip = (const uint8_t*)input;
  const uint8_t* ip_limit = ip + length;
  uint8_t* op = (uint8_t*)output;
  uint32_t ctrl;
  uint8_t* op_limit = op + maxout;
  if (BLOSCLZ_UNLIKELY(length == 0)) {
    return 0;
  }
  ctrl = (*ip++) & 31U;

  while (1) {
    if (ctrl >= 32) {
      // match
      int32_t len = (int32_t)(ctrl >> 5U) - 1;
      int32_t ofs = (int32_t)(ctrl & 31U) << 8U;
      uint8_t code;
      const uint8_t* ref = op - ofs;

      if (len == 7 - 1) {
        do {
          if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
            return 0;
          }
          code = *ip++;
          len += code;
        } while (code == 255);
      }
      else {
        if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
          return 0;
        }
      }
      code = *ip++;
      len += 3;
      ref -= code;

      /* match from 16-bit distance */
      if (BLOSCLZ_UNLIKELY(code == 255)) {
        if (ofs == (31U << 8U)) {
          if (ip + 1 >= ip_limit) {
            return 0;
          }
          ofs = (*ip++) << 8U;
          ofs += *ip++;
          ref = op - ofs - MAX_DISTANCE;
        }
      }

      if (BLOSCLZ_UNLIKELY(op + len > op_limit)) {
        return 0;
      }

      if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) {
        return 0;
      }

      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
      ctrl = *ip++;

      ref--;
      if (ref == op - 1) {
        /* optimized copy for a run */
        memset(op, *ref, len);
        op += len;
      }
      else if ((op - ref >= 8) && (op_limit - op >= len + 8)) {
        // copy with an overlap not larger than 8
        wild_copy(op, ref, op + len);
        op += len;
      }
      else {
        // general copy with any overlap
#if 0 && defined(__AVX2__)
        if (op - ref <= 16) {
          // This is not faster on a combination of compilers (clang, gcc, icc) or machines, but
          // it is not much slower either.
          op = copy_match_16(op, ref, len);
        }
        else {
#endif
          op = copy_match(op, ref, (unsigned) len);
#if 0 && defined(__AVX2__)
        }
#endif
      }
    }
    else {
      // literal
      ctrl++;
      if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) {
        return 0;
      }
      if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) {
        return 0;
      }

      memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
      // On GCC-6, fastcopy is still faster than plain memcpy
      // However, using recent CLANG/LLVM 9.0, there is almost no difference
      // in performance.
      // And starting with CLANG/LLVM 10 and GCC 9, memcpy is generally faster.
      // op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl;

      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
      ctrl = *ip++;
    }
  }

  return (int)(op - (uint8_t*)output);
}