Coverage Report

Created: 2026-01-09 06:24

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/c-blosc2/blosc/blosclz.c
Line
Count
Source
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
/*********************************************************************
12
  The code in this file is heavily based on FastLZ, a lightning-fast
13
  lossless compression library.  See LICENSES/FASTLZ.txt for details.
14
**********************************************************************/
15
16
17
#include "blosclz.h"
18
#include "fastcopy.h"
19
#include "blosc2/blosc2-common.h"
20
21
#include <stdbool.h>
22
#include <stdio.h>
23
#include <stdint.h>
24
#include <string.h>
25
26
/*
27
 * Give hints to the compiler for branch prediction optimization.
28
 * This is not necessary anymore with modern CPUs.
29
 */
30
#if 0 && defined(__GNUC__) && (__GNUC__ > 2)
31
#define BLOSCLZ_LIKELY(c)    (__builtin_expect((c), 1))
32
#define BLOSCLZ_UNLIKELY(c)  (__builtin_expect((c), 0))
33
#else
34
19.1M
#define BLOSCLZ_LIKELY(c)    (c)
35
44.5M
#define BLOSCLZ_UNLIKELY(c)  (c)
36
#endif
37
38
/*
39
 * Use inlined functions for supported systems.
40
 */
41
#if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
42
#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
43
#endif
44
45
760k
#define MAX_COPY 32U
46
19.8M
#define MAX_DISTANCE 8191
47
19.0M
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)
48
49
#ifdef BLOSC_STRICT_ALIGN
50
  #define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
51
  #define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
52
#else
53
  #define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
54
36.9M
  #define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
55
#endif
56
57
316k
#define HASH_LOG (14U)
58
59
// This is used in LZ4 and seems to work pretty well here too
60
19.7M
#define HASH_FUNCTION(v, s, h) {      \
61
19.7M
  (v) = ((s) * 2654435761U) >> (32U - (h)); \
62
19.7M
}
63
64
65
#if defined(__AVX2__)
66
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
67
    uint8_t x = ip[-1];
68
69
    while (ip < (ip_bound - (sizeof(__m256i)))) {
70
        __m256i value, value2, cmp;
71
        /* Broadcast the value for every byte in a 256-bit register */
72
        memset(&value, x, sizeof(__m256i));
73
        value2 = _mm256_loadu_si256((__m256i *)ref);
74
        cmp = _mm256_cmpeq_epi64(value, value2);
75
        if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
76
            /* Return the byte that starts to differ */
77
            while (*ref++ == x) ip++;
78
            return ip;
79
        }
80
        else {
81
            ip += sizeof(__m256i);
82
            ref += sizeof(__m256i);
83
        }
84
    }
85
    /* Look into the remainder */
86
    while ((ip < ip_bound) && (*ref++ == x)) ip++;
87
    return ip;
88
}
89
#endif
90
91
#if defined(__SSE2__)
92
0
uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
93
0
  uint8_t x = ip[-1];
94
95
0
  while (ip < (ip_bound - sizeof(__m128i))) {
96
0
    __m128i value, value2, cmp;
97
    /* Broadcast the value for every byte in a 128-bit register */
98
0
    memset(&value, x, sizeof(__m128i));
99
0
    value2 = _mm_loadu_si128((__m128i *)ref);
100
0
    cmp = _mm_cmpeq_epi32(value, value2);
101
0
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
102
      /* Return the byte that starts to differ */
103
0
      while (*ref++ == x) ip++;
104
0
      return ip;
105
0
    }
106
0
    else {
107
0
      ip += sizeof(__m128i);
108
0
      ref += sizeof(__m128i);
109
0
    }
110
0
  }
111
  /* Look into the remainder */
112
0
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
113
0
  return ip;
114
0
}
115
116
#endif
117
118
119
319k
static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
120
319k
  uint8_t x = ip[-1];
121
319k
  int64_t value, value2;
122
  /* Broadcast the value for every byte in a 64-bit register */
123
319k
  memset(&value, x, 8);
124
  /* safe because the outer check against ip limit */
125
844k
  while (ip < (ip_bound - sizeof(int64_t))) {
126
#if defined(BLOSC_STRICT_ALIGN)
127
    memcpy(&value2, ref, 8);
128
#else
129
828k
    value2 = ((int64_t*)ref)[0];
130
828k
#endif
131
828k
    if (value != value2) {
132
      /* Return the byte that starts to differ */
133
977k
      while (*ref++ == x) ip++;
134
304k
      return ip;
135
304k
    }
136
524k
    else {
137
524k
      ip += 8;
138
524k
      ref += 8;
139
524k
    }
140
828k
  }
141
  /* Look into the remainder */
142
52.7k
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
143
15.3k
  return ip;
144
319k
}
145
146
147
/* Return the byte that starts to differ */
148
0
uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
149
0
#if !defined(BLOSC_STRICT_ALIGN)
150
0
  while (ip < (ip_bound - sizeof(int64_t))) {
151
0
    if (*(int64_t*)ref != *(int64_t*)ip) {
152
      /* Return the byte that starts to differ */
153
0
      while (*ref++ == *ip++) {}
154
0
      return ip;
155
0
    }
156
0
    else {
157
0
      ip += sizeof(int64_t);
158
0
      ref += sizeof(int64_t);
159
0
    }
160
0
  }
161
0
#endif
162
  /* Look into the remainder */
163
0
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
164
0
  return ip;
165
0
}
166
167
168
#if defined(__SSE2__)
169
1.53M
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
170
1.53M
  __m128i value, value2, cmp;
171
172
3.25M
  while (ip < (ip_bound - sizeof(__m128i))) {
173
3.21M
    value = _mm_loadu_si128((__m128i *) ip);
174
3.21M
    value2 = _mm_loadu_si128((__m128i *) ref);
175
3.21M
    cmp = _mm_cmpeq_epi32(value, value2);
176
3.21M
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
177
      /* Return the byte that starts to differ */
178
5.52M
      while (*ref++ == *ip++) {}
179
1.48M
      return ip;
180
1.48M
    }
181
1.72M
    else {
182
1.72M
      ip += sizeof(__m128i);
183
1.72M
      ref += sizeof(__m128i);
184
1.72M
    }
185
3.21M
  }
186
  /* Look into the remainder */
187
294k
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
188
46.2k
  return ip;
189
1.53M
}
190
#endif
191
192
193
#if defined(__AVX2__)
194
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
195
196
  while (ip < (ip_bound - sizeof(__m256i))) {
197
    __m256i value, value2, cmp;
198
    value = _mm256_loadu_si256((__m256i *) ip);
199
    value2 = _mm256_loadu_si256((__m256i *)ref);
200
    cmp = _mm256_cmpeq_epi64(value, value2);
201
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
202
      /* Return the byte that starts to differ */
203
      while (*ref++ == *ip++) {}
204
      return ip;
205
    }
206
    else {
207
      ip += sizeof(__m256i);
208
      ref += sizeof(__m256i);
209
    }
210
  }
211
  /* Look into the remainder */
212
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
213
  return ip;
214
}
215
#endif
216
217
218
1.85M
static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
219
1.85M
  if (BLOSCLZ_UNLIKELY(run)) {
220
#if defined(__AVX2__)
221
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
222
    // ip = get_run_32(ip, ip_bound, ref);
223
    ip = get_run(ip, ip_bound, ref);
224
#elif defined(__SSE2__)
225
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
226
    // ip = get_run_16(ip, ip_bound, ref);
227
319k
    ip = get_run(ip, ip_bound, ref);
228
#else
229
    ip = get_run(ip, ip_bound, ref);
230
#endif
231
319k
  }
232
1.53M
  else {
233
#if defined(__AVX2__)
234
    // Extensive experiments on AMD Ryzen3 say that regular get_match_16 is faster
235
    // ip = get_match_32(ip, ip_bound, ref);
236
    ip = get_match_16(ip, ip_bound, ref);
237
#elif defined(__SSE2__)
238
    ip = get_match_16(ip, ip_bound, ref);
239
#else
240
    ip = get_match(ip, ip_bound, ref);
241
#endif
242
1.53M
  }
243
244
1.85M
  return ip;
245
1.85M
}
246
247
248
9.95M
#define LITERAL(ip, op, op_limit, anchor, copy) {       \
249
9.95M
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
250
9.95M
    goto out;                                           \
251
9.95M
  *(op)++ = *(anchor)++;                                \
252
9.95M
  (ip) = (anchor);                                      \
253
9.95M
  (copy)++;                                             \
254
9.95M
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
255
249k
    (copy) = 0;                                         \
256
249k
    *(op)++ = MAX_COPY-1;                               \
257
249k
  }                                                     \
258
9.95M
}
259
260
8.46M
#define LITERAL2(ip, anchor, copy) {                    \
261
8.46M
  oc++; (anchor)++;                                     \
262
8.46M
  (ip) = (anchor);                                      \
263
8.46M
  (copy)++;                                             \
264
8.46M
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
265
236k
    (copy) = 0;                                         \
266
236k
    oc++;                                               \
267
236k
  }                                                     \
268
8.46M
}
269
270
148k
#define MATCH_SHORT(op, op_limit, len, distance) {        \
271
148k
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))            \
272
148k
    goto out;                                             \
273
148k
  *(op)++ = (uint8_t)(((len) << 5U) + ((distance) >> 8U));\
274
148k
  *(op)++ = (uint8_t)(((distance) & 255U));               \
275
148k
}
276
277
319k
#define MATCH_LONG(op, op_limit, len, distance) {       \
278
319k
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))          \
279
319k
    goto out;                                           \
280
319k
  *(op)++ = (uint8_t)((7U << 5U) + ((distance) >> 8U)); \
281
361k
  for ((len) -= 7; (len) >= 255; (len) -= 255) {        \
282
41.5k
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))        \
283
41.5k
      goto out;                                         \
284
41.5k
    *(op)++ = 255;                                      \
285
41.5k
  }                                                     \
286
319k
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
287
319k
    goto out;                                           \
288
319k
  *(op)++ = (uint8_t)(len);                             \
289
319k
  *(op)++ = (uint8_t)(((distance) & 255U));             \
290
319k
}
291
292
1.39k
#define MATCH_SHORT_FAR(op, op_limit, len, distance) {      \
293
1.39k
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
294
1.39k
    goto out;                                               \
295
1.39k
  *(op)++ = (uint8_t)(((len) << 5U) + 31);                  \
296
1.38k
  *(op)++ = 255;                                            \
297
1.38k
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
298
1.38k
  *(op)++ = (uint8_t)((distance) & 255U);                   \
299
1.38k
}
300
301
16.6k
#define MATCH_LONG_FAR(op, op_limit, len, distance) {       \
302
16.6k
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))              \
303
16.6k
    goto out;                                               \
304
16.6k
  *(op)++ = (7U << 5U) + 31;                                \
305
25.8k
  for ((len) -= 7; (len) >= 255; (len) -= 255) {            \
306
9.23k
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))            \
307
9.23k
      goto out;                                             \
308
9.23k
    *(op)++ = 255;                                          \
309
9.22k
  }                                                         \
310
16.6k
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
311
16.6k
    goto out;                                               \
312
16.6k
  *(op)++ = (uint8_t)(len);                                 \
313
16.5k
  *(op)++ = 255;                                            \
314
16.5k
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
315
16.5k
  *(op)++ = (uint8_t)((distance) & 255U);                   \
316
16.5k
}
317
318
319
// Get a guess for the compressed size of a buffer
320
35.2k
static double get_cratio(uint8_t* ibase, int maxlen, int minlen, int ipshift, uint32_t htab[], int8_t hashlog) {
321
35.2k
  uint8_t* ip = ibase;
322
35.2k
  int32_t oc = 0;
323
35.2k
  const uint16_t hashlen = (1U << (uint8_t)hashlog);
324
35.2k
  uint32_t hval;
325
35.2k
  uint32_t seq;
326
35.2k
  uint8_t copy;
327
  // Make a tradeoff between testing too much and too little
328
35.2k
  uint16_t limit = (maxlen > hashlen) ? hashlen : maxlen;
329
35.2k
  uint8_t* ip_bound = ibase + limit - 1;
330
35.2k
  uint8_t* ip_limit = ibase + limit - 12;
331
332
  // Initialize the hash table to distances of 0
333
35.2k
  memset(htab, 0, hashlen * sizeof(uint32_t));
334
335
  /* we start with literal copy */
336
35.2k
  copy = 4;
337
35.2k
  oc += 5;
338
339
  /* main loop */
340
8.66M
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
341
8.63M
    const uint8_t* ref;
342
8.63M
    unsigned distance;
343
8.63M
    uint8_t* anchor = ip;    /* comparison starting-point */
344
345
    /* find potential match */
346
8.63M
    seq = BLOSCLZ_READU32(ip);
347
8.63M
    HASH_FUNCTION(hval, seq, hashlog)
348
8.63M
    ref = ibase + htab[hval];
349
350
    /* calculate distance to the match */
351
8.63M
    distance = (unsigned int)(anchor - ref);
352
353
    /* update hash table */
354
8.63M
    htab[hval] = (uint32_t) (anchor - ibase);
355
356
8.63M
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
357
35.1k
      LITERAL2(ip, anchor, copy)
358
35.1k
      continue;
359
35.1k
    }
360
361
    /* is this a match? check the first 4 bytes */
362
8.59M
    if (BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip)) {
363
462k
      ref += 4;
364
462k
    }
365
8.13M
    else {
366
      /* no luck, copy as a literal */
367
8.13M
      LITERAL2(ip, anchor, copy)
368
8.13M
      continue;
369
8.13M
    }
370
371
    /* last matched byte */
372
462k
    ip = anchor + 4;
373
374
    /* distance is biased */
375
462k
    distance--;
376
377
    /* get runs or matches; zero distance means a run */
378
462k
    ip = get_run_or_match(ip, ip_bound, ref, !distance);
379
380
462k
    ip -= ipshift;
381
462k
    int len = (int)(ip - anchor);
382
462k
    if (len < minlen) {
383
291k
      LITERAL2(ip, anchor, copy)
384
291k
      continue;
385
291k
    }
386
387
    /* if we haven't copied anything, adjust the output counter */
388
170k
    if (!copy)
389
48.7k
      oc--;
390
    /* reset literal counter */
391
170k
    copy = 0;
392
393
    /* encode the match */
394
170k
    if (distance < MAX_DISTANCE) {
395
167k
      if (len >= 7) {
396
116k
        oc += ((len - 7) / 255) + 1;
397
116k
      }
398
167k
      oc += 2;
399
167k
    }
400
3.35k
    else {
401
      /* far away, but not yet in the another galaxy... */
402
3.35k
      if (len >= 7) {
403
2.35k
        oc += ((len - 7) / 255) + 1;
404
2.35k
      }
405
3.35k
      oc += 4;
406
3.35k
    }
407
408
    /* update the hash at match boundary */
409
170k
    seq = BLOSCLZ_READU32(ip);
410
170k
    HASH_FUNCTION(hval, seq, hashlog)
411
170k
    htab[hval] = (uint32_t)(ip++ - ibase);
412
170k
    ip++;
413
    /* assuming literal copy */
414
170k
    oc++;
415
170k
  }
416
417
35.2k
  double ic = (double)(ip - ibase);
418
35.2k
  return ic / (double)oc;
419
35.2k
}
420
421
422
int blosclz_compress(const int clevel, const void* input, int length,
423
35.2k
                     void* output, int maxout, blosc2_context* ctx) {
424
35.2k
  BLOSC_UNUSED_PARAM(ctx);
425
35.2k
  uint8_t* ibase = (uint8_t*)input;
426
35.2k
  uint32_t htab[1U << (uint8_t)HASH_LOG];
427
428
  /* When we go back in a match (shift), we obtain quite different compression properties.
429
   * It looks like 4 is more useful in combination with bitshuffle and small typesizes
430
   * Fallback to 4 because it provides more consistent results for large cratios.
431
   * UPDATE: new experiments show that using a value of 3 is a bit better, at least for ERA5.
432
   * UPDATE 2: go back to 4, as they seem to provide better cratios in general.
433
   *
434
   * In this block we also check cratios for the beginning of the buffers and
435
   * eventually discard those that are small (take too long to decompress).
436
   * This process is called _entropy probing_.
437
   */
438
35.2k
  unsigned ipshift = 4;
439
  // Minimum lengths for encoding (normally it is good to match the shift value)
440
35.2k
  unsigned minlen = 4;
441
442
35.2k
  uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
443
35.2k
                          HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
444
35.2k
  uint8_t hashlog = hashlog_[clevel];
445
446
  // Experiments say that checking 1/4 of the buffer is enough to figure out approx cratio
447
  // UPDATE: new experiments with ERA5 datasets (float32) say that checking the whole buffer
448
  // is better (specially when combined with bitshuffle).
449
  // The loss in speed for checking the whole buffer is pretty negligible too.
450
35.2k
  int maxlen = length;
451
35.2k
  if (clevel < 2) {
452
31.8k
    maxlen /= 8;
453
31.8k
  }
454
3.32k
  else if (clevel < 4) {
455
687
    maxlen /= 4;
456
687
  }
457
2.63k
  else if (clevel < 7) {
458
579
    maxlen /= 2;
459
579
  }
460
  // Start probing somewhere inside the buffer
461
35.2k
  int shift = length - maxlen;
462
  // Actual entropy probing!
463
35.2k
  double cratio = get_cratio(ibase + shift, maxlen, minlen, ipshift, htab, hashlog);
464
  // discard probes with small compression ratios (too expensive)
465
35.2k
  double cratio_[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};
466
35.2k
  if (cratio < cratio_[clevel]) {
467
10.8k
      goto out;
468
10.8k
  }
469
470
24.3k
  uint8_t* ip = ibase;
471
24.3k
  uint8_t* ip_bound = ibase + length - 1;
472
24.3k
  uint8_t* ip_limit = ibase + length - 12;
473
24.3k
  uint8_t* op = (uint8_t*)output;
474
24.3k
  const uint8_t* op_limit = op + maxout;
475
24.3k
  uint32_t seq;
476
24.3k
  uint8_t copy;
477
24.3k
  uint32_t hval;
478
479
  /* input and output buffer cannot be less than 16 and 66 bytes or we can get into trouble */
480
24.3k
  if (length < 16 || maxout < 66) {
481
96
    return 0;
482
96
  }
483
484
  // Initialize the hash table
485
24.2k
  memset(htab, 0, (1U << hashlog) * sizeof(uint32_t));
486
487
  /* we start with literal copy */
488
24.2k
  copy = 4;
489
24.2k
  *op++ = MAX_COPY - 1;
490
24.2k
  *op++ = *ip++;
491
24.2k
  *op++ = *ip++;
492
24.2k
  *op++ = *ip++;
493
24.2k
  *op++ = *ip++;
494
495
  /* main loop */
496
10.4M
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
497
10.4M
    const uint8_t* ref;
498
10.4M
    unsigned distance;
499
10.4M
    uint8_t* anchor = ip;    /* comparison starting-point */
500
501
    /* find potential match */
502
10.4M
    seq = BLOSCLZ_READU32(ip);
503
10.4M
    HASH_FUNCTION(hval, seq, hashlog)
504
10.4M
    ref = ibase + htab[hval];
505
506
    /* calculate distance to the match */
507
10.4M
    distance = (unsigned int)(anchor - ref);
508
509
    /* update hash table */
510
10.4M
    htab[hval] = (uint32_t) (anchor - ibase);
511
512
10.4M
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
513
35.6k
      LITERAL(ip, op, op_limit, anchor, copy)
514
0
      continue;
515
35.6k
    }
516
517
    /* is this a match? check the first 4 bytes */
518
10.4M
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
519
1.39M
      ref += 4;
520
9.01M
    } else {
521
      /* no luck, copy as a literal */
522
9.01M
      LITERAL(ip, op, op_limit, anchor, copy)
523
0
      continue;
524
9.01M
    }
525
526
    /* last matched byte */
527
1.39M
    ip = anchor + 4;
528
529
    /* distance is biased */
530
1.39M
    distance--;
531
532
    /* get runs or matches; zero distance means a run */
533
1.39M
    ip = get_run_or_match(ip, ip_bound, ref, !distance);
534
535
    /* length is biased, '1' means a match of 3 bytes */
536
1.39M
    ip -= ipshift;
537
538
1.39M
    unsigned len = (int)(ip - anchor);
539
540
    // Encoding short lengths is expensive during decompression
541
1.39M
    if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
542
905k
      LITERAL(ip, op, op_limit, anchor, copy)
543
0
      continue;
544
905k
    }
545
546
    /* if we have copied something, adjust the copy count */
547
486k
    if (copy)
548
      /* copy is biased, '0' means 1 byte copy */
549
313k
      *(op - copy - 1) = (uint8_t)(copy - 1);
550
172k
    else
551
      /* back, to overwrite the copy count */
552
172k
      op--;
553
    /* reset literal counter */
554
486k
    copy = 0;
555
556
    /* encode the match */
557
486k
    if (distance < MAX_DISTANCE) {
558
468k
      if (len < 7) {
559
148k
        MATCH_SHORT(op, op_limit, len, distance)
560
319k
      } else {
561
958k
        MATCH_LONG(op, op_limit, len, distance)
562
958k
      }
563
468k
    } else {
564
      /* far away, but not yet in the another galaxy... */
565
18.0k
      distance -= MAX_DISTANCE;
566
18.0k
      if (len < 7) {
567
1.39k
        MATCH_SHORT_FAR(op, op_limit, len, distance)
568
16.6k
      } else {
569
49.8k
        MATCH_LONG_FAR(op, op_limit, len, distance)
570
49.8k
      }
571
18.0k
    }
572
573
    /* update the hash at match boundary */
574
486k
    seq = BLOSCLZ_READU32(ip);
575
486k
    HASH_FUNCTION(hval, seq, hashlog)
576
486k
    htab[hval] = (uint32_t) (ip++ - ibase);
577
486k
    if (clevel == 9) {
578
      // In some situations, including a second hash proves to be useful,
579
      // but not in others.  Activating here in max clevel only.
580
63.4k
      seq >>= 8U;
581
63.4k
      HASH_FUNCTION(hval, seq, hashlog)
582
63.4k
      htab[hval] = (uint32_t) (ip++ - ibase);
583
63.4k
    }
584
422k
    else {
585
422k
      ip++;
586
422k
    }
587
588
486k
    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
589
72
      goto out;
590
591
    /* assuming literal copy */
592
486k
    *op++ = MAX_COPY - 1;
593
486k
  }
594
595
  /* left-over as literal copy */
596
127k
  while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) {
597
104k
    if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out;
598
104k
    *op++ = *ip++;
599
104k
    copy++;
600
104k
    if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {
601
561
      copy = 0;
602
561
      *op++ = MAX_COPY - 1;
603
561
    }
604
104k
  }
605
606
  /* if we have copied something, adjust the copy length */
607
23.4k
  if (copy)
608
23.2k
    *(op - copy - 1) = (uint8_t)(copy - 1);
609
237
  else
610
237
    op--;
611
612
  /* marker for blosclz */
613
23.4k
  *(uint8_t*)output |= (1U << 5U);
614
615
23.4k
  return (int)(op - (uint8_t*)output);
616
617
11.6k
  out:
618
11.6k
  return 0;
619
23.7k
}
620
621
// See https://habr.com/en/company/yandex/blog/457612/
622
#if defined(__AVX2__)
623
624
#if defined(_MSC_VER)
625
#define ALIGNED_(x) __declspec(align(x))
626
#else
627
#if defined(__GNUC__)
628
#define ALIGNED_(x) __attribute__ ((aligned(x)))
629
#endif
630
#endif
631
#define ALIGNED_TYPE_(t, x) t ALIGNED_(x)
632
633
static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
634
{
635
  size_t offset = op - match;
636
  while (len >= 16) {
637
638
    static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
639
      {
640
                0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, // offset = 0, not used as mask, but for shift
641
                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // offset = 1
642
                0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
643
                0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
644
                0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
645
                0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
646
                0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
647
                0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
648
                0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
649
                0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
650
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
651
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
652
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
653
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
654
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
655
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
656
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  15, // offset = 16
657
      };
658
659
    _mm_storeu_si128((__m128i *)(op),
660
                     _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
661
                                      _mm_load_si128((const __m128i *)(masks) + offset)));
662
663
    match += masks[offset];
664
665
    op += 16;
666
    len -= 16;
667
  }
668
  // Deal with remainders
669
  for (; len > 0; len--) {
670
    *op++ = *match++;
671
  }
672
  return op;
673
}
674
#endif
675
676
// LZ4 wildCopy which can reach excellent copy bandwidth (even if insecure)
677
189k
static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) {
678
189k
  uint8_t* d = out;
679
189k
  const uint8_t* s = from;
680
189k
  uint8_t* const e = end;
681
682
1.14M
  do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
683
189k
}
684
685
10.6k
int blosclz_decompress(const void* input, int length, void* output, int maxout) {
686
10.6k
  const uint8_t* ip = (const uint8_t*)input;
687
10.6k
  const uint8_t* ip_limit = ip + length;
688
10.6k
  uint8_t* op = (uint8_t*)output;
689
10.6k
  uint32_t ctrl;
690
10.6k
  uint8_t* op_limit = op + maxout;
691
10.6k
  if (BLOSCLZ_UNLIKELY(length == 0)) {
692
0
    return 0;
693
0
  }
694
10.6k
  ctrl = (*ip++) & 31U;
695
696
549k
  while (1) {
697
549k
    if (ctrl >= 32) {
698
      // match
699
269k
      int32_t len = (int32_t)(ctrl >> 5U) - 1 ;
700
269k
      int32_t ofs = (int32_t)(ctrl & 31U) << 8U;
701
269k
      uint8_t code;
702
269k
      const uint8_t* ref = op - ofs;
703
704
269k
      if (len == 7 - 1) {
705
210k
        do {
706
210k
          if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
707
0
            return 0;
708
0
          }
709
210k
          code = *ip++;
710
210k
          len += code;
711
210k
        } while (code == 255);
712
180k
      }
713
88.8k
      else {
714
88.8k
        if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
715
0
          return 0;
716
0
        }
717
88.8k
      }
718
269k
      code = *ip++;
719
269k
      len += 3;
720
269k
      ref -= code;
721
722
      /* match from 16-bit distance */
723
269k
      if (BLOSCLZ_UNLIKELY(code == 255)) {
724
22.6k
        if (ofs == (31U << 8U)) {
725
13.2k
          if (ip + 1 >= ip_limit) {
726
0
            return 0;
727
0
          }
728
13.2k
          ofs = (*ip++) << 8U;
729
13.2k
          ofs += *ip++;
730
13.2k
          ref = op - ofs - MAX_DISTANCE;
731
13.2k
        }
732
22.6k
      }
733
734
269k
      if (BLOSCLZ_UNLIKELY(op + len > op_limit)) {
735
0
        return 0;
736
0
      }
737
738
269k
      if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) {
739
0
        return 0;
740
0
      }
741
742
269k
      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
743
269k
      ctrl = *ip++;
744
745
269k
      ref--;
746
269k
      if (ref == op - 1) {
747
        /* optimized copy for a run */
748
29.8k
        memset(op, *ref, len);
749
29.8k
        op += len;
750
29.8k
      }
751
239k
      else if ((op - ref >= 8) && (op_limit - op >= len + 8)) {
752
        // copy with an overlap not larger than 8
753
189k
        wild_copy(op, ref, op + len);
754
189k
        op += len;
755
189k
      }
756
50.7k
      else {
757
        // general copy with any overlap
758
#if 0 && defined(__AVX2__)
759
        if (op - ref <= 16) {
760
          // This is not faster on a combination of compilers (clang, gcc, icc) or machines, but
761
          // it is not too slower either.
762
          op = copy_match_16(op, ref, len);
763
        }
764
        else {
765
#endif
766
50.7k
          op = copy_match(op, ref, (unsigned) len);
767
#if 0 && defined(__AVX2__)
768
        }
769
#endif
770
50.7k
      }
771
269k
    }
772
279k
    else {
773
      // literal
774
279k
      ctrl++;
775
279k
      if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) {
776
0
        return 0;
777
0
      }
778
279k
      if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) {
779
0
        return 0;
780
0
      }
781
782
279k
      memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
783
      // On GCC-6, fastcopy this is still faster than plain memcpy
784
      // However, using recent CLANG/LLVM 9.0, there is almost no difference
785
      // in performance.
786
      // And starting on CLANG/LLVM 10 and GCC 9, memcpy is generally faster.
787
      // op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl;
788
789
279k
      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
790
269k
      ctrl = *ip++;
791
269k
    }
792
549k
  }
793
794
10.6k
  return (int)(op - (uint8_t*)output);
795
10.6k
}