Coverage Report

Created: 2025-08-29 06:39

/src/c-blosc2/blosc/blosclz.c

/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
  https://blosc.org
  License: BSD 3-Clause (see LICENSE.txt)

  See LICENSE.txt for details about copyright and rights to use.
**********************************************************************/

/*********************************************************************
  The code in this file is heavily based on FastLZ, a lightning-fast
  lossless compression library.  See LICENSES/FASTLZ.txt for details.
**********************************************************************/


#include "blosclz.h"
#include "fastcopy.h"
#include "blosc2/blosc2-common.h"

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>

/*
 * Give hints to the compiler for branch prediction optimization.
 * This is not necessary anymore with modern CPUs.
 */
#if 0 && defined(__GNUC__) && (__GNUC__ > 2)
#define BLOSCLZ_LIKELY(c)    (__builtin_expect((c), 1))
#define BLOSCLZ_UNLIKELY(c)  (__builtin_expect((c), 0))
#else
#define BLOSCLZ_LIKELY(c)    (c)
#define BLOSCLZ_UNLIKELY(c)  (c)
#endif

/*
 * Use inlined functions for supported systems.
 */
#if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
#endif

#define MAX_COPY 32U
#define MAX_DISTANCE 8191
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)

#ifdef BLOSC_STRICT_ALIGN
  #define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
  #define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
#else
  #define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
  #define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
#endif

#define HASH_LOG (14U)

// This is used in LZ4 and seems to work pretty well here too
#define HASH_FUNCTION(v, s, h) {      \
  (v) = ((s) * 2654435761U) >> (32U - (h)); \
}
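
/*
 * Illustrative sketch, not part of the original file: HASH_FUNCTION is
 * multiplicative (Fibonacci) hashing.  The 32-bit constant 2654435761U is
 * derived from the golden ratio; multiplying by it mixes the low input
 * bytes into the high bits, and the right shift keeps only the top h bits,
 * so the result is always a valid index into a table of (1 << h) buckets.
 */
static inline uint32_t hash_sketch(uint32_t seq, uint8_t hashlog) {
  uint32_t hval;
  HASH_FUNCTION(hval, seq, hashlog)  /* with hashlog = HASH_LOG, hval < 16384 */
  return hval;
}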

#if defined(__AVX2__)
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
    uint8_t x = ip[-1];

    while (ip < (ip_bound - (sizeof(__m256i)))) {
        __m256i value, value2, cmp;
        /* Broadcast the value for every byte in a 256-bit register */
        memset(&value, x, sizeof(__m256i));
        value2 = _mm256_loadu_si256((__m256i *)ref);
        cmp = _mm256_cmpeq_epi64(value, value2);
        if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
            /* Return the byte that starts to differ */
            while (*ref++ == x) ip++;
            return ip;
        }
        else {
            ip += sizeof(__m256i);
            ref += sizeof(__m256i);
        }
    }
    /* Look into the remainder */
    while ((ip < ip_bound) && (*ref++ == x)) ip++;
    return ip;
}
#endif

#if defined(__SSE2__)
/* Coverage note: get_run_16 was never executed in this report's run. */
uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];

  while (ip < (ip_bound - sizeof(__m128i))) {
    __m128i value, value2, cmp;
    /* Broadcast the value for every byte in a 128-bit register */
    memset(&value, x, sizeof(__m128i));
    value2 = _mm_loadu_si128((__m128i *)ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}
#endif


static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];
  int64_t value, value2;
  /* Broadcast the value for every byte in a 64-bit register */
  memset(&value, x, 8);
  /* safe because of the outer check against the ip limit */
  while (ip < (ip_bound - sizeof(int64_t))) {
#if defined(BLOSC_STRICT_ALIGN)
    memcpy(&value2, ref, 8);
#else
    value2 = ((int64_t*)ref)[0];
#endif
    if (value != value2) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += 8;
      ref += 8;
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}
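
/*
 * Note on the broadcast trick above (illustrative, not an original comment):
 * memset(&value, x, 8) replicates the byte x into every byte of a 64-bit
 * word (e.g. x == 0xAB gives 0xABABABABABABABAB), so a run of x in ref can
 * be detected 8 bytes at a time with a single integer comparison.
 */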

/* Return the byte that starts to differ */
/* Coverage note: get_match was never executed in this report's run. */
uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
#if !defined(BLOSC_STRICT_ALIGN)
  while (ip < (ip_bound - sizeof(int64_t))) {
    if (*(int64_t*)ref != *(int64_t*)ip) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(int64_t);
      ref += sizeof(int64_t);
    }
  }
#endif
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}

#if defined(__SSE2__)
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  __m128i value, value2, cmp;

  while (ip < (ip_bound - sizeof(__m128i))) {
    value = _mm_loadu_si128((__m128i *) ip);
    value2 = _mm_loadu_si128((__m128i *) ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
#endif


#if defined(__AVX2__)
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {

  while (ip < (ip_bound - sizeof(__m256i))) {
    __m256i value, value2, cmp;
    value = _mm256_loadu_si256((__m256i *) ip);
    value2 = _mm256_loadu_si256((__m256i *)ref);
    cmp = _mm256_cmpeq_epi64(value, value2);
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(__m256i);
      ref += sizeof(__m256i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
#endif

static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
  if (BLOSCLZ_UNLIKELY(run)) {
#if defined(__AVX2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
    // ip = get_run_32(ip, ip_bound, ref);
    ip = get_run(ip, ip_bound, ref);
#elif defined(__SSE2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
    // ip = get_run_16(ip, ip_bound, ref);
    ip = get_run(ip, ip_bound, ref);
#else
    ip = get_run(ip, ip_bound, ref);
#endif
  }
  else {
#if defined(__AVX2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_match_16 is faster
    // ip = get_match_32(ip, ip_bound, ref);
    ip = get_match_16(ip, ip_bound, ref);
#elif defined(__SSE2__)
    ip = get_match_16(ip, ip_bound, ref);
#else
    ip = get_match(ip, ip_bound, ref);
#endif
  }

  return ip;
}

#define LITERAL(ip, op, op_limit, anchor, copy) {       \
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
    goto out;                                           \
  *(op)++ = *(anchor)++;                                \
  (ip) = (anchor);                                      \
  (copy)++;                                             \
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
    (copy) = 0;                                         \
    *(op)++ = MAX_COPY-1;                               \
  }                                                     \
}

#define LITERAL2(ip, anchor, copy) {                    \
  oc++; (anchor)++;                                     \
  (ip) = (anchor);                                      \
  (copy)++;                                             \
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
    (copy) = 0;                                         \
    oc++;                                               \
  }                                                     \
}

#define MATCH_SHORT(op, op_limit, len, distance) {        \
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))            \
    goto out;                                             \
  *(op)++ = (uint8_t)(((len) << 5U) + ((distance) >> 8U));\
  *(op)++ = (uint8_t)(((distance) & 255U));               \
}

#define MATCH_LONG(op, op_limit, len, distance) {       \
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))          \
    goto out;                                           \
  *(op)++ = (uint8_t)((7U << 5U) + ((distance) >> 8U)); \
  for ((len) -= 7; (len) >= 255; (len) -= 255) {        \
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))        \
      goto out;                                         \
    *(op)++ = 255;                                      \
  }                                                     \
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
    goto out;                                           \
  *(op)++ = (uint8_t)(len);                             \
  *(op)++ = (uint8_t)(((distance) & 255U));             \
}

#define MATCH_SHORT_FAR(op, op_limit, len, distance) {      \
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
    goto out;                                               \
  *(op)++ = (uint8_t)(((len) << 5U) + 31);                  \
  *(op)++ = 255;                                            \
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
  *(op)++ = (uint8_t)((distance) & 255U);                   \
}

#define MATCH_LONG_FAR(op, op_limit, len, distance) {       \
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))              \
    goto out;                                               \
  *(op)++ = (7U << 5U) + 31;                                \
  for ((len) -= 7; (len) >= 255; (len) -= 255) {            \
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))            \
      goto out;                                             \
    *(op)++ = 255;                                          \
  }                                                         \
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
    goto out;                                               \
  *(op)++ = (uint8_t)(len);                                 \
  *(op)++ = 255;                                            \
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
  *(op)++ = (uint8_t)((distance) & 255U);                   \
}
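
/*
 * Token layout implied by the macros above (a summary derived from the code,
 * not a comment in the original file).  The stored length is biased: a value
 * of 1 decodes to a 3-byte match.
 *   - literal run:  000lllll                   -> lllll+1 literal bytes follow
 *   - short match:  LLLddddd dddddddd          -> LLL in 1..6, 13-bit distance
 *   - long match:   111ddddd 255*k N dddddddd  -> stored length 7 + 255*k + N
 *   - far match:    the 13-bit distance field is saturated (ddddd == 31 and
 *     low byte == 255), and a full 16-bit distance beyond MAX_DISTANCE follows
 */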

// Get a guess for the compressed size of a buffer
static double get_cratio(uint8_t* ibase, int maxlen, int minlen, int ipshift, uint32_t htab[], int8_t hashlog) {
  uint8_t* ip = ibase;
  int32_t oc = 0;
  const uint16_t hashlen = (1U << (uint8_t)hashlog);
  uint32_t hval;
  uint32_t seq;
  uint8_t copy;
  // Make a tradeoff between testing too much and too little
  uint16_t limit = (maxlen > hashlen) ? hashlen : maxlen;
  uint8_t* ip_bound = ibase + limit - 1;
  uint8_t* ip_limit = ibase + limit - 12;

  // Initialize the hash table to distances of 0
  memset(htab, 0, hashlen * sizeof(uint32_t));

  /* we start with literal copy */
  copy = 4;
  oc += 5;

  /* main loop */
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
    const uint8_t* ref;
    unsigned distance;
    uint8_t* anchor = ip;    /* comparison starting-point */

    /* find potential match */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = (unsigned int)(anchor - ref);

    /* update hash table */
    htab[hval] = (uint32_t) (anchor - ibase);

    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* is this a match? check the first 4 bytes */
    if (BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip)) {
      ref += 4;
    }
    else {
      /* no luck, copy as a literal */
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* last matched byte */
    ip = anchor + 4;

    /* distance is biased */
    distance--;

    /* get runs or matches; zero distance means a run */
    ip = get_run_or_match(ip, ip_bound, ref, !distance);

    ip -= ipshift;
    int len = (int)(ip - anchor);
    if (len < minlen) {
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* if we haven't copied anything, adjust the output counter */
    if (!copy)
      oc--;
    /* reset literal counter */
    copy = 0;

    /* encode the match */
    if (distance < MAX_DISTANCE) {
      if (len >= 7) {
        oc += ((len - 7) / 255) + 1;
      }
      oc += 2;
    }
    else {
      /* far away, but not yet in another galaxy... */
      if (len >= 7) {
        oc += ((len - 7) / 255) + 1;
      }
      oc += 4;
    }

    /* update the hash at match boundary */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    htab[hval] = (uint32_t)(ip++ - ibase);
    ip++;
    /* assuming literal copy */
    oc++;
  }

  double ic = (double)(ip - ibase);
  return ic / (double)oc;
}
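
/*
 * Interpretation note (not an original comment): get_cratio returns the
 * number of input bytes scanned divided by the estimated number of output
 * bytes a real compression pass would emit, so a return value of 2.0 means
 * the probed region is expected to compress to roughly half its size.
 */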

int blosclz_compress(const int clevel, const void* input, int length,
                     void* output, int maxout, blosc2_context* ctx) {
  BLOSC_UNUSED_PARAM(ctx);
  uint8_t* ibase = (uint8_t*)input;
  uint32_t htab[1U << (uint8_t)HASH_LOG];

  /* When we go back in a match (shift), we obtain quite different compression properties.
   * It looks like 4 is more useful in combination with bitshuffle and small typesizes.
   * Fall back to 4 because it provides more consistent results for large cratios.
   * UPDATE: new experiments show that using a value of 3 is a bit better, at least for ERA5.
   * UPDATE 2: go back to 4, as it seems to provide better cratios in general.
   *
   * In this block we also check cratios for the beginning of the buffers and
   * eventually discard those that are small (take too long to decompress).
   * This process is called _entropy probing_.
   */
  unsigned ipshift = 4;
  // Minimum length for encoding (normally it is good to match the shift value)
  unsigned minlen = 4;

  uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
                          HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
  uint8_t hashlog = hashlog_[clevel];

  // Experiments say that checking 1/4 of the buffer is enough to figure out the approx cratio
  // UPDATE: new experiments with ERA5 datasets (float32) say that checking the whole buffer
  // is better (especially when combined with bitshuffle).
  // The loss in speed for checking the whole buffer is pretty negligible too.
  int maxlen = length;
  if (clevel < 2) {
    maxlen /= 8;
  }
  else if (clevel < 4) {
    maxlen /= 4;
  }
  else if (clevel < 7) {
    maxlen /= 2;
  }
  // Start probing somewhere inside the buffer
  int shift = length - maxlen;
  // Actual entropy probing!
  double cratio = get_cratio(ibase + shift, maxlen, minlen, ipshift, htab, hashlog);
  // discard probes with small compression ratios (too expensive)
  double cratio_[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};
  if (cratio < cratio_[clevel]) {
      goto out;
  }
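
  /*
   * Worked example of the probing policy above (derived from the code, not
   * an original comment): at clevel 1 only the last length/8 bytes are
   * probed and the buffer is rejected unless the estimated cratio reaches
   * 2.0; at clevel 9 the whole buffer is probed and any cratio >= 1.0 passes.
   */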

  uint8_t* ip = ibase;
  uint8_t* ip_bound = ibase + length - 1;
  uint8_t* ip_limit = ibase + length - 12;
  uint8_t* op = (uint8_t*)output;
  const uint8_t* op_limit = op + maxout;
  uint32_t seq;
  uint8_t copy;
  uint32_t hval;

  /* input and output buffers cannot be smaller than 16 and 66 bytes, respectively, or we can get into trouble */
  if (length < 16 || maxout < 66) {
    return 0;
  }

  // Initialize the hash table
  memset(htab, 0, (1U << hashlog) * sizeof(uint32_t));

  /* we start with literal copy */
  copy = 4;
  *op++ = MAX_COPY - 1;
  *op++ = *ip++;
  *op++ = *ip++;
  *op++ = *ip++;
  *op++ = *ip++;

  /* main loop */
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
    const uint8_t* ref;
    unsigned distance;
    uint8_t* anchor = ip;    /* comparison starting-point */

    /* find potential match */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = (unsigned int)(anchor - ref);

    /* update hash table */
    htab[hval] = (uint32_t) (anchor - ibase);

    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* is this a match? check the first 4 bytes */
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
      ref += 4;
    } else {
      /* no luck, copy as a literal */
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* last matched byte */
    ip = anchor + 4;

    /* distance is biased */
    distance--;

    /* get runs or matches; zero distance means a run */
    ip = get_run_or_match(ip, ip_bound, ref, !distance);

    /* length is biased, '1' means a match of 3 bytes */
    ip -= ipshift;

    unsigned len = (unsigned)(ip - anchor);

    // Encoding short lengths is expensive during decompression
    if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* if we have copied something, adjust the copy count */
    if (copy)
      /* copy is biased, '0' means 1 byte copy */
      *(op - copy - 1) = (uint8_t)(copy - 1);
    else
      /* back, to overwrite the copy count */
      op--;
    /* reset literal counter */
    copy = 0;

    /* encode the match */
    if (distance < MAX_DISTANCE) {
      if (len < 7) {
        MATCH_SHORT(op, op_limit, len, distance)
      } else {
        MATCH_LONG(op, op_limit, len, distance)
      }
    } else {
      /* far away, but not yet in another galaxy... */
      distance -= MAX_DISTANCE;
      if (len < 7) {
        MATCH_SHORT_FAR(op, op_limit, len, distance)
      } else {
        MATCH_LONG_FAR(op, op_limit, len, distance)
      }
    }

    /* update the hash at match boundary */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    htab[hval] = (uint32_t) (ip++ - ibase);
    if (clevel == 9) {
      // In some situations, including a second hash proves to be useful,
      // but not in others.  Activating here in max clevel only.
      seq >>= 8U;
      HASH_FUNCTION(hval, seq, hashlog)
      htab[hval] = (uint32_t) (ip++ - ibase);
    }
    else {
      ip++;
    }

    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
      goto out;

    /* assuming literal copy */
    *op++ = MAX_COPY - 1;
  }

  /* left-over as literal copy */
  while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) {
    if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out;
    *op++ = *ip++;
    copy++;
    if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {
      copy = 0;
      *op++ = MAX_COPY - 1;
    }
  }

  /* if we have copied something, adjust the copy length */
  if (copy)
    *(op - copy - 1) = (uint8_t)(copy - 1);
  else
    op--;

  /* marker for blosclz */
  *(uint8_t*)output |= (1U << 5U);

  return (int)(op - (uint8_t*)output);

  out:
  return 0;
}
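
/*
 * Format note (derived from the code, not an original comment): the first
 * output byte is always a literal-run token, and the marker above sets
 * bit 5 on it to tag the stream as blosclz; blosclz_decompress strips it
 * by masking the first control byte with `& 31U`.
 */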

// See https://habr.com/en/company/yandex/blog/457612/
#if defined(__AVX2__)

#if defined(_MSC_VER)
#define ALIGNED_(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define ALIGNED_(x) __attribute__ ((aligned(x)))
#endif
#endif
#define ALIGNED_TYPE_(t, x) t ALIGNED_(x)

static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
{
  size_t offset = op - match;
  while (len >= 16) {

    static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
      {
                0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, // offset = 0, not used as mask, but for shift
                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // offset = 1
                0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
                0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
                0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
                0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
                0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
                0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
                0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  15, // offset = 16
      };

    _mm_storeu_si128((__m128i *)(op),
                     _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
                                      _mm_load_si128((const __m128i *)(masks) + offset)));

    match += masks[offset];

    op += 16;
    len -= 16;
  }
  // Deal with remainders
  for (; len > 0; len--) {
    *op++ = *match++;
  }
  return op;
}
#endif
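
/*
 * How the shuffle masks work (explanatory note, not an original comment):
 * for a small offset, the 16 bytes starting at `match` contain only `offset`
 * valid pattern bytes.  The mask row for that offset repeats the indices
 * 0..offset-1, so _mm_shuffle_epi8 replicates the short pattern across the
 * whole 16-byte store; e.g. for offset = 2 the row 0,1,0,1,... turns "ab"
 * into "abababababababab".
 */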

// LZ4 wildCopy, which can reach excellent copy bandwidth (even if insecure)
static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) {
  uint8_t* d = out;
  const uint8_t* s = from;
  uint8_t* const e = end;

  do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
}
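
/*
 * Safety note (derived from the call site below, not an original comment):
 * wild_copy writes in 8-byte chunks and can overrun `end` by up to 7 bytes,
 * so the decompressor only calls it when the source/destination overlap is
 * at least 8 bytes and at least `len + 8` bytes of output room remain.
 */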
int blosclz_decompress(const void* input, int length, void* output, int maxout) {
  const uint8_t* ip = (const uint8_t*)input;
  const uint8_t* ip_limit = ip + length;
  uint8_t* op = (uint8_t*)output;
  uint32_t ctrl;
  uint8_t* op_limit = op + maxout;
  if (BLOSCLZ_UNLIKELY(length == 0)) {
    return 0;
  }
  ctrl = (*ip++) & 31U;

  while (1) {
    if (ctrl >= 32) {
      // match
      int32_t len = (int32_t)(ctrl >> 5U) - 1;
      int32_t ofs = (int32_t)(ctrl & 31U) << 8U;
      uint8_t code;
      const uint8_t* ref = op - ofs;

      if (len == 7 - 1) {
        do {
          if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
            return 0;
          }
          code = *ip++;
          len += code;
        } while (code == 255);
      }
      else {
        if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
          return 0;
        }
      }
      code = *ip++;
      len += 3;
      ref -= code;

      /* match from 16-bit distance */
      if (BLOSCLZ_UNLIKELY(code == 255)) {
        if (ofs == (31U << 8U)) {
          if (ip + 1 >= ip_limit) {
            return 0;
          }
          ofs = (*ip++) << 8U;
          ofs += *ip++;
          ref = op - ofs - MAX_DISTANCE;
        }
      }

      if (BLOSCLZ_UNLIKELY(op + len > op_limit)) {
        return 0;
      }

      if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) {
        return 0;
      }

      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
      ctrl = *ip++;

      ref--;
      if (ref == op - 1) {
        /* optimized copy for a run */
        memset(op, *ref, len);
        op += len;
      }
      else if ((op - ref >= 8) && (op_limit - op >= len + 8)) {
        // copy with an overlap not larger than 8
        wild_copy(op, ref, op + len);
        op += len;
      }
      else {
        // general copy with any overlap
#if 0 && defined(__AVX2__)
        if (op - ref <= 16) {
          // This is not faster on a combination of compilers (clang, gcc, icc) or machines, but
          // it is not much slower either.
          op = copy_match_16(op, ref, len);
        }
        else {
#endif
          op = copy_match(op, ref, (unsigned) len);
#if 0 && defined(__AVX2__)
        }
#endif
      }
    }
    else {
      // literal
      ctrl++;
      if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) {
        return 0;
      }
      if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) {
        return 0;
      }

      memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
      // On GCC 6, fastcopy is still faster than plain memcpy.
      // However, using recent CLANG/LLVM 9.0, there is almost no difference
      // in performance.
      // And starting on CLANG/LLVM 10 and GCC 9, memcpy is generally faster.
      // op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl;

      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
      ctrl = *ip++;
    }
  }

  return (int)(op - (uint8_t*)output);
}