Coverage Report

Created: 2025-10-10 06:32

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/c-blosc2/blosc/blosclz.c
Line
Count
Source
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
/*********************************************************************
12
  The code in this file is heavily based on FastLZ, a lightning-fast
13
  lossless compression library.  See LICENSES/FASTLZ.txt for details.
14
**********************************************************************/
15
16
17
#include "blosclz.h"
18
#include "fastcopy.h"
19
#include "blosc2/blosc2-common.h"
20
21
#include <stdbool.h>
22
#include <stdio.h>
23
#include <stdint.h>
24
#include <string.h>
25
26
/*
27
 * Give hints to the compiler for branch prediction optimization.
28
 * This is not necessary anymore with modern CPUs.
29
 */
30
#if 0 && defined(__GNUC__) && (__GNUC__ > 2)
31
#define BLOSCLZ_LIKELY(c)    (__builtin_expect((c), 1))
32
#define BLOSCLZ_UNLIKELY(c)  (__builtin_expect((c), 0))
33
#else
34
15.3M
#define BLOSCLZ_LIKELY(c)    (c)
35
32.4M
#define BLOSCLZ_UNLIKELY(c)  (c)
36
#endif
37
38
/*
39
 * Use inlined functions for supported systems.
40
 */
41
#if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
42
#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
43
#endif
44
45
478k
#define MAX_COPY 32U
46
16.2M
#define MAX_DISTANCE 8191
47
15.3M
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)
48
49
#ifdef BLOSC_STRICT_ALIGN
50
  #define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
51
  #define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
52
#else
53
  #define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
54
33.6M
  #define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
55
#endif
56
57
47.6k
#define HASH_LOG (14U)
58
59
// This is used in LZ4 and seems to work pretty well here too
60
16.3M
#define HASH_FUNCTION(v, s, h) {      \
61
16.3M
  (v) = ((s) * 2654435761U) >> (32U - (h)); \
62
16.3M
}
63
64
65
#if defined(__AVX2__)
66
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
67
    uint8_t x = ip[-1];
68
69
    while (ip < (ip_bound - (sizeof(__m256i)))) {
70
        __m256i value, value2, cmp;
71
        /* Broadcast the value for every byte in a 256-bit register */
72
        memset(&value, x, sizeof(__m256i));
73
        value2 = _mm256_loadu_si256((__m256i *)ref);
74
        cmp = _mm256_cmpeq_epi64(value, value2);
75
        if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
76
            /* Return the byte that starts to differ */
77
            while (*ref++ == x) ip++;
78
            return ip;
79
        }
80
        else {
81
            ip += sizeof(__m256i);
82
            ref += sizeof(__m256i);
83
        }
84
    }
85
    /* Look into the remainder */
86
    while ((ip < ip_bound) && (*ref++ == x)) ip++;
87
    return ip;
88
}
89
#endif
90
91
#if defined(__SSE2__)
92
0
uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
93
0
  uint8_t x = ip[-1];
94
95
0
  while (ip < (ip_bound - sizeof(__m128i))) {
96
0
    __m128i value, value2, cmp;
97
    /* Broadcast the value for every byte in a 128-bit register */
98
0
    memset(&value, x, sizeof(__m128i));
99
0
    value2 = _mm_loadu_si128((__m128i *)ref);
100
0
    cmp = _mm_cmpeq_epi32(value, value2);
101
0
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
102
      /* Return the byte that starts to differ */
103
0
      while (*ref++ == x) ip++;
104
0
      return ip;
105
0
    }
106
0
    else {
107
0
      ip += sizeof(__m128i);
108
0
      ref += sizeof(__m128i);
109
0
    }
110
0
  }
111
  /* Look into the remainder */
112
0
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
113
0
  return ip;
114
0
}
115
116
#endif
117
118
119
596k
static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
120
596k
  uint8_t x = ip[-1];
121
596k
  int64_t value, value2;
122
  /* Broadcast the value for every byte in a 64-bit register */
123
596k
  memset(&value, x, 8);
124
  /* safe because the outer check against ip limit */
125
865k
  while (ip < (ip_bound - sizeof(int64_t))) {
126
#if defined(BLOSC_STRICT_ALIGN)
127
    memcpy(&value2, ref, 8);
128
#else
129
863k
    value2 = ((int64_t*)ref)[0];
130
863k
#endif
131
863k
    if (value != value2) {
132
      /* Return the byte that starts to differ */
133
1.83M
      while (*ref++ == x) ip++;
134
595k
      return ip;
135
595k
    }
136
268k
    else {
137
268k
      ip += 8;
138
268k
      ref += 8;
139
268k
    }
140
863k
  }
141
  /* Look into the remainder */
142
6.58k
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
143
1.70k
  return ip;
144
596k
}
145
146
147
/* Return the byte that starts to differ */
148
0
uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
149
0
#if !defined(BLOSC_STRICT_ALIGN)
150
0
  while (ip < (ip_bound - sizeof(int64_t))) {
151
0
    if (*(int64_t*)ref != *(int64_t*)ip) {
152
      /* Return the byte that starts to differ */
153
0
      while (*ref++ == *ip++) {}
154
0
      return ip;
155
0
    }
156
0
    else {
157
0
      ip += sizeof(int64_t);
158
0
      ref += sizeof(int64_t);
159
0
    }
160
0
  }
161
0
#endif
162
  /* Look into the remainder */
163
0
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
164
0
  return ip;
165
0
}
166
167
168
#if defined(__SSE2__)
169
3.32M
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
170
3.32M
  __m128i value, value2, cmp;
171
172
3.85M
  while (ip < (ip_bound - sizeof(__m128i))) {
173
3.84M
    value = _mm_loadu_si128((__m128i *) ip);
174
3.84M
    value2 = _mm_loadu_si128((__m128i *) ref);
175
3.84M
    cmp = _mm_cmpeq_epi32(value, value2);
176
3.84M
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
177
      /* Return the byte that starts to differ */
178
7.64M
      while (*ref++ == *ip++) {}
179
3.30M
      return ip;
180
3.30M
    }
181
531k
    else {
182
531k
      ip += sizeof(__m128i);
183
531k
      ref += sizeof(__m128i);
184
531k
    }
185
3.84M
  }
186
  /* Look into the remainder */
187
46.1k
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
188
10.5k
  return ip;
189
3.32M
}
190
#endif
191
192
193
#if defined(__AVX2__)
194
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
195
196
  while (ip < (ip_bound - sizeof(__m256i))) {
197
    __m256i value, value2, cmp;
198
    value = _mm256_loadu_si256((__m256i *) ip);
199
    value2 = _mm256_loadu_si256((__m256i *)ref);
200
    cmp = _mm256_cmpeq_epi64(value, value2);
201
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
202
      /* Return the byte that starts to differ */
203
      while (*ref++ == *ip++) {}
204
      return ip;
205
    }
206
    else {
207
      ip += sizeof(__m256i);
208
      ref += sizeof(__m256i);
209
    }
210
  }
211
  /* Look into the remainder */
212
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
213
  return ip;
214
}
215
#endif
216
217
218
3.91M
static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
219
3.91M
  if (BLOSCLZ_UNLIKELY(run)) {
220
#if defined(__AVX2__)
221
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
222
    // ip = get_run_32(ip, ip_bound, ref);
223
    ip = get_run(ip, ip_bound, ref);
224
#elif defined(__SSE2__)
225
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
226
    // ip = get_run_16(ip, ip_bound, ref);
227
596k
    ip = get_run(ip, ip_bound, ref);
228
#else
229
    ip = get_run(ip, ip_bound, ref);
230
#endif
231
596k
  }
232
3.32M
  else {
233
#if defined(__AVX2__)
234
    // Extensive experiments on AMD Ryzen3 say that regular get_match_16 is faster
235
    // ip = get_match_32(ip, ip_bound, ref);
236
    ip = get_match_16(ip, ip_bound, ref);
237
#elif defined(__SSE2__)
238
    ip = get_match_16(ip, ip_bound, ref);
239
#else
240
    ip = get_match(ip, ip_bound, ref);
241
#endif
242
3.32M
  }
243
244
3.91M
  return ip;
245
3.91M
}
246
247
248
6.20M
#define LITERAL(ip, op, op_limit, anchor, copy) {       \
249
6.20M
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
250
6.20M
    goto out;                                           \
251
6.20M
  *(op)++ = *(anchor)++;                                \
252
6.20M
  (ip) = (anchor);                                      \
253
6.20M
  (copy)++;                                             \
254
6.20M
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
255
99.5k
    (copy) = 0;                                         \
256
99.5k
    *(op)++ = MAX_COPY-1;                               \
257
99.5k
  }                                                     \
258
6.20M
}
259
260
8.45M
#define LITERAL2(ip, anchor, copy) {                    \
261
8.45M
  oc++; (anchor)++;                                     \
262
8.45M
  (ip) = (anchor);                                      \
263
8.45M
  (copy)++;                                             \
264
8.45M
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
265
174k
    (copy) = 0;                                         \
266
174k
    oc++;                                               \
267
174k
  }                                                     \
268
8.45M
}
269
270
203k
#define MATCH_SHORT(op, op_limit, len, distance) {        \
271
203k
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))            \
272
203k
    goto out;                                             \
273
203k
  *(op)++ = (uint8_t)(((len) << 5U) + ((distance) >> 8U));\
274
203k
  *(op)++ = (uint8_t)(((distance) & 255U));               \
275
203k
}
276
277
171k
#define MATCH_LONG(op, op_limit, len, distance) {       \
278
171k
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))          \
279
171k
    goto out;                                           \
280
171k
  *(op)++ = (uint8_t)((7U << 5U) + ((distance) >> 8U)); \
281
186k
  for ((len) -= 7; (len) >= 255; (len) -= 255) {        \
282
14.7k
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))        \
283
14.7k
      goto out;                                         \
284
14.7k
    *(op)++ = 255;                                      \
285
14.7k
  }                                                     \
286
171k
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
287
171k
    goto out;                                           \
288
171k
  *(op)++ = (uint8_t)(len);                             \
289
171k
  *(op)++ = (uint8_t)(((distance) & 255U));             \
290
171k
}
291
292
0
#define MATCH_SHORT_FAR(op, op_limit, len, distance) {      \
293
0
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
294
0
    goto out;                                               \
295
0
  *(op)++ = (uint8_t)(((len) << 5U) + 31);                  \
296
0
  *(op)++ = 255;                                            \
297
0
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
298
0
  *(op)++ = (uint8_t)((distance) & 255U);                   \
299
0
}
300
301
0
#define MATCH_LONG_FAR(op, op_limit, len, distance) {       \
302
0
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))              \
303
0
    goto out;                                               \
304
0
  *(op)++ = (7U << 5U) + 31;                                \
305
0
  for ((len) -= 7; (len) >= 255; (len) -= 255) {            \
306
0
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))            \
307
0
      goto out;                                             \
308
0
    *(op)++ = 255;                                          \
309
0
  }                                                         \
310
0
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
311
0
    goto out;                                               \
312
0
  *(op)++ = (uint8_t)(len);                                 \
313
0
  *(op)++ = 255;                                            \
314
0
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
315
0
  *(op)++ = (uint8_t)((distance) & 255U);                   \
316
0
}
317
318
319
// Get a guess for the compressed size of a buffer
320
5.29k
static double get_cratio(uint8_t* ibase, int maxlen, int minlen, int ipshift, uint32_t htab[], int8_t hashlog) {
321
5.29k
  uint8_t* ip = ibase;
322
5.29k
  int32_t oc = 0;
323
5.29k
  const uint16_t hashlen = (1U << (uint8_t)hashlog);
324
5.29k
  uint32_t hval;
325
5.29k
  uint32_t seq;
326
5.29k
  uint8_t copy;
327
  // Make a tradeoff between testing too much and too little
328
5.29k
  uint16_t limit = (maxlen > hashlen) ? hashlen : maxlen;
329
5.29k
  uint8_t* ip_bound = ibase + limit - 1;
330
5.29k
  uint8_t* ip_limit = ibase + limit - 12;
331
332
  // Initialize the hash table to distances of 0
333
5.29k
  memset(htab, 0, hashlen * sizeof(uint32_t));
334
335
  /* we start with literal copy */
336
5.29k
  copy = 4;
337
5.29k
  oc += 5;
338
339
  /* main loop */
340
8.80M
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
341
8.79M
    const uint8_t* ref;
342
8.79M
    unsigned distance;
343
8.79M
    uint8_t* anchor = ip;    /* comparison starting-point */
344
345
    /* find potential match */
346
8.79M
    seq = BLOSCLZ_READU32(ip);
347
8.79M
    HASH_FUNCTION(hval, seq, hashlog)
348
8.79M
    ref = ibase + htab[hval];
349
350
    /* calculate distance to the match */
351
8.79M
    distance = (unsigned int)(anchor - ref);
352
353
    /* update hash table */
354
8.79M
    htab[hval] = (uint32_t) (anchor - ibase);
355
356
8.79M
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
357
5.25k
      LITERAL2(ip, anchor, copy)
358
5.25k
      continue;
359
5.25k
    }
360
361
    /* is this a match? check the first 4 bytes */
362
8.79M
    if (BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip)) {
363
1.92M
      ref += 4;
364
1.92M
    }
365
6.86M
    else {
366
      /* no luck, copy as a literal */
367
6.86M
      LITERAL2(ip, anchor, copy)
368
6.86M
      continue;
369
6.86M
    }
370
371
    /* last matched byte */
372
1.92M
    ip = anchor + 4;
373
374
    /* distance is biased */
375
1.92M
    distance--;
376
377
    /* get runs or matches; zero distance means a run */
378
1.92M
    ip = get_run_or_match(ip, ip_bound, ref, !distance);
379
380
1.92M
    ip -= ipshift;
381
1.92M
    int len = (int)(ip - anchor);
382
1.92M
    if (len < minlen) {
383
1.58M
      LITERAL2(ip, anchor, copy)
384
1.58M
      continue;
385
1.58M
    }
386
387
    /* if we haven't copied anything, adjust the output counter */
388
342k
    if (!copy)
389
76.1k
      oc--;
390
    /* reset literal counter */
391
342k
    copy = 0;
392
393
    /* encode the match */
394
342k
    if (distance < MAX_DISTANCE) {
395
342k
      if (len >= 7) {
396
145k
        oc += ((len - 7) / 255) + 1;
397
145k
      }
398
342k
      oc += 2;
399
342k
    }
400
0
    else {
401
      /* far away, but not yet in the another galaxy... */
402
0
      if (len >= 7) {
403
0
        oc += ((len - 7) / 255) + 1;
404
0
      }
405
0
      oc += 4;
406
0
    }
407
408
    /* update the hash at match boundary */
409
342k
    seq = BLOSCLZ_READU32(ip);
410
342k
    HASH_FUNCTION(hval, seq, hashlog)
411
342k
    htab[hval] = (uint32_t)(ip++ - ibase);
412
342k
    ip++;
413
    /* assuming literal copy */
414
342k
    oc++;
415
342k
  }
416
417
5.29k
  double ic = (double)(ip - ibase);
418
5.29k
  return ic / (double)oc;
419
5.29k
}
420
421
422
int blosclz_compress(const int clevel, const void* input, int length,
423
5.29k
                     void* output, int maxout, blosc2_context* ctx) {
424
5.29k
  BLOSC_UNUSED_PARAM(ctx);
425
5.29k
  uint8_t* ibase = (uint8_t*)input;
426
5.29k
  uint32_t htab[1U << (uint8_t)HASH_LOG];
427
428
  /* When we go back in a match (shift), we obtain quite different compression properties.
429
   * It looks like 4 is more useful in combination with bitshuffle and small typesizes
430
   * Fallback to 4 because it provides more consistent results for large cratios.
431
   * UPDATE: new experiments show that using a value of 3 is a bit better, at least for ERA5.
432
   * UPDATE 2: go back to 4, as they seem to provide better cratios in general.
433
   *
434
   * In this block we also check cratios for the beginning of the buffers and
435
   * eventually discard those that are small (take too long to decompress).
436
   * This process is called _entropy probing_.
437
   */
438
5.29k
  unsigned ipshift = 4;
439
  // Minimum lengths for encoding (normally it is good to match the shift value)
440
5.29k
  unsigned minlen = 4;
441
442
5.29k
  uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
443
5.29k
                          HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
444
5.29k
  uint8_t hashlog = hashlog_[clevel];
445
446
  // Experiments say that checking 1/4 of the buffer is enough to figure out approx cratio
447
  // UPDATE: new experiments with ERA5 datasets (float32) say that checking the whole buffer
448
  // is better (specially when combined with bitshuffle).
449
  // The loss in speed for checking the whole buffer is pretty negligible too.
450
5.29k
  int maxlen = length;
451
5.29k
  if (clevel < 2) {
452
573
    maxlen /= 8;
453
573
  }
454
4.72k
  else if (clevel < 4) {
455
627
    maxlen /= 4;
456
627
  }
457
4.09k
  else if (clevel < 7) {
458
725
    maxlen /= 2;
459
725
  }
460
  // Start probing somewhere inside the buffer
461
5.29k
  int shift = length - maxlen;
462
  // Actual entropy probing!
463
5.29k
  double cratio = get_cratio(ibase + shift, maxlen, minlen, ipshift, htab, hashlog);
464
  // discard probes with small compression ratios (too expensive)
465
5.29k
  double cratio_[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};
466
5.29k
  if (cratio < cratio_[clevel]) {
467
1.15k
      goto out;
468
1.15k
  }
469
470
4.14k
  uint8_t* ip = ibase;
471
4.14k
  uint8_t* ip_bound = ibase + length - 1;
472
4.14k
  uint8_t* ip_limit = ibase + length - 12;
473
4.14k
  uint8_t* op = (uint8_t*)output;
474
4.14k
  const uint8_t* op_limit = op + maxout;
475
4.14k
  uint32_t seq;
476
4.14k
  uint8_t copy;
477
4.14k
  uint32_t hval;
478
479
  /* input and output buffer cannot be less than 16 and 66 bytes or we can get into trouble */
480
4.14k
  if (length < 16 || maxout < 66) {
481
51
    return 0;
482
51
  }
483
484
  // Initialize the hash table
485
4.09k
  memset(htab, 0, (1U << hashlog) * sizeof(uint32_t));
486
487
  /* we start with literal copy */
488
4.09k
  copy = 4;
489
4.09k
  *op++ = MAX_COPY - 1;
490
4.09k
  *op++ = *ip++;
491
4.09k
  *op++ = *ip++;
492
4.09k
  *op++ = *ip++;
493
4.09k
  *op++ = *ip++;
494
495
  /* main loop */
496
6.58M
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
497
6.57M
    const uint8_t* ref;
498
6.57M
    unsigned distance;
499
6.57M
    uint8_t* anchor = ip;    /* comparison starting-point */
500
501
    /* find potential match */
502
6.57M
    seq = BLOSCLZ_READU32(ip);
503
6.57M
    HASH_FUNCTION(hval, seq, hashlog)
504
6.57M
    ref = ibase + htab[hval];
505
506
    /* calculate distance to the match */
507
6.57M
    distance = (unsigned int)(anchor - ref);
508
509
    /* update hash table */
510
6.57M
    htab[hval] = (uint32_t) (anchor - ibase);
511
512
6.57M
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
513
0
      LITERAL(ip, op, op_limit, anchor, copy)
514
0
      continue;
515
0
    }
516
517
    /* is this a match? check the first 4 bytes */
518
6.57M
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
519
1.98M
      ref += 4;
520
4.59M
    } else {
521
      /* no luck, copy as a literal */
522
4.59M
      LITERAL(ip, op, op_limit, anchor, copy)
523
0
      continue;
524
4.59M
    }
525
526
    /* last matched byte */
527
1.98M
    ip = anchor + 4;
528
529
    /* distance is biased */
530
1.98M
    distance--;
531
532
    /* get runs or matches; zero distance means a run */
533
1.98M
    ip = get_run_or_match(ip, ip_bound, ref, !distance);
534
535
    /* length is biased, '1' means a match of 3 bytes */
536
1.98M
    ip -= ipshift;
537
538
1.98M
    unsigned len = (int)(ip - anchor);
539
540
    // Encoding short lengths is expensive during decompression
541
1.98M
    if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
542
1.61M
      LITERAL(ip, op, op_limit, anchor, copy)
543
0
      continue;
544
1.61M
    }
545
546
    /* if we have copied something, adjust the copy count */
547
374k
    if (copy)
548
      /* copy is biased, '0' means 1 byte copy */
549
285k
      *(op - copy - 1) = (uint8_t)(copy - 1);
550
89.8k
    else
551
      /* back, to overwrite the copy count */
552
89.8k
      op--;
553
    /* reset literal counter */
554
374k
    copy = 0;
555
556
    /* encode the match */
557
374k
    if (distance < MAX_DISTANCE) {
558
374k
      if (len < 7) {
559
203k
        MATCH_SHORT(op, op_limit, len, distance)
560
203k
      } else {
561
514k
        MATCH_LONG(op, op_limit, len, distance)
562
514k
      }
563
374k
    } else {
564
      /* far away, but not yet in the another galaxy... */
565
0
      distance -= MAX_DISTANCE;
566
0
      if (len < 7) {
567
0
        MATCH_SHORT_FAR(op, op_limit, len, distance)
568
0
      } else {
569
0
        MATCH_LONG_FAR(op, op_limit, len, distance)
570
0
      }
571
0
    }
572
573
    /* update the hash at match boundary */
574
374k
    seq = BLOSCLZ_READU32(ip);
575
374k
    HASH_FUNCTION(hval, seq, hashlog)
576
374k
    htab[hval] = (uint32_t) (ip++ - ibase);
577
374k
    if (clevel == 9) {
578
      // In some situations, including a second hash proves to be useful,
579
      // but not in others.  Activating here in max clevel only.
580
277k
      seq >>= 8U;
581
277k
      HASH_FUNCTION(hval, seq, hashlog)
582
277k
      htab[hval] = (uint32_t) (ip++ - ibase);
583
277k
    }
584
97.3k
    else {
585
97.3k
      ip++;
586
97.3k
    }
587
588
374k
    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
589
4
      goto out;
590
591
    /* assuming literal copy */
592
374k
    *op++ = MAX_COPY - 1;
593
374k
  }
594
595
  /* left-over as literal copy */
596
31.2k
  while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) {
597
27.1k
    if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out;
598
27.1k
    *op++ = *ip++;
599
27.1k
    copy++;
600
27.1k
    if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {
601
218
      copy = 0;
602
218
      *op++ = MAX_COPY - 1;
603
218
    }
604
27.1k
  }
605
606
  /* if we have copied something, adjust the copy length */
607
4.04k
  if (copy)
608
4.01k
    *(op - copy - 1) = (uint8_t)(copy - 1);
609
32
  else
610
32
    op--;
611
612
  /* marker for blosclz */
613
4.04k
  *(uint8_t*)output |= (1U << 5U);
614
615
4.04k
  return (int)(op - (uint8_t*)output);
616
617
1.19k
  out:
618
1.19k
  return 0;
619
4.06k
}
620
621
// See https://habr.com/en/company/yandex/blog/457612/
622
#if defined(__AVX2__)
623
624
#if defined(_MSC_VER)
625
#define ALIGNED_(x) __declspec(align(x))
626
#else
627
#if defined(__GNUC__)
628
#define ALIGNED_(x) __attribute__ ((aligned(x)))
629
#endif
630
#endif
631
#define ALIGNED_TYPE_(t, x) t ALIGNED_(x)
632
633
static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
634
{
635
  size_t offset = op - match;
636
  while (len >= 16) {
637
638
    static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
639
      {
640
                0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, // offset = 0, not used as mask, but for shift
641
                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // offset = 1
642
                0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
643
                0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
644
                0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
645
                0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
646
                0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
647
                0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
648
                0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
649
                0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
650
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
651
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
652
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
653
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
654
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
655
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
656
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  15, // offset = 16
657
      };
658
659
    _mm_storeu_si128((__m128i *)(op),
660
                     _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
661
                                      _mm_load_si128((const __m128i *)(masks) + offset)));
662
663
    match += masks[offset];
664
665
    op += 16;
666
    len -= 16;
667
  }
668
  // Deal with remainders
669
  for (; len > 0; len--) {
670
    *op++ = *match++;
671
  }
672
  return op;
673
}
674
#endif
675
676
// LZ4 wildCopy which can reach excellent copy bandwidth (even if insecure)
677
753
static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) {
678
753
  uint8_t* d = out;
679
753
  const uint8_t* s = from;
680
753
  uint8_t* const e = end;
681
682
1.91k
  do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
683
753
}
684
685
68
int blosclz_decompress(const void* input, int length, void* output, int maxout) {
686
68
  const uint8_t* ip = (const uint8_t*)input;
687
68
  const uint8_t* ip_limit = ip + length;
688
68
  uint8_t* op = (uint8_t*)output;
689
68
  uint32_t ctrl;
690
68
  uint8_t* op_limit = op + maxout;
691
68
  if (BLOSCLZ_UNLIKELY(length == 0)) {
692
0
    return 0;
693
0
  }
694
68
  ctrl = (*ip++) & 31U;
695
696
10.0k
  while (1) {
697
10.0k
    if (ctrl >= 32) {
698
      // match
699
1.18k
      int32_t len = (int32_t)(ctrl >> 5U) - 1 ;
700
1.18k
      int32_t ofs = (int32_t)(ctrl & 31U) << 8U;
701
1.18k
      uint8_t code;
702
1.18k
      const uint8_t* ref = op - ofs;
703
704
1.18k
      if (len == 7 - 1) {
705
486
        do {
706
486
          if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
707
0
            return 0;
708
0
          }
709
486
          code = *ip++;
710
486
          len += code;
711
486
        } while (code == 255);
712
481
      }
713
700
      else {
714
700
        if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
715
0
          return 0;
716
0
        }
717
700
      }
718
1.18k
      code = *ip++;
719
1.18k
      len += 3;
720
1.18k
      ref -= code;
721
722
      /* match from 16-bit distance */
723
1.18k
      if (BLOSCLZ_UNLIKELY(code == 255)) {
724
268
        if (ofs == (31U << 8U)) {
725
0
          if (ip + 1 >= ip_limit) {
726
0
            return 0;
727
0
          }
728
0
          ofs = (*ip++) << 8U;
729
0
          ofs += *ip++;
730
0
          ref = op - ofs - MAX_DISTANCE;
731
0
        }
732
268
      }
733
734
1.18k
      if (BLOSCLZ_UNLIKELY(op + len > op_limit)) {
735
0
        return 0;
736
0
      }
737
738
1.18k
      if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) {
739
0
        return 0;
740
0
      }
741
742
1.18k
      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
743
1.18k
      ctrl = *ip++;
744
745
1.18k
      ref--;
746
1.18k
      if (ref == op - 1) {
747
        /* optimized copy for a run */
748
258
        memset(op, *ref, len);
749
258
        op += len;
750
258
      }
751
923
      else if ((op - ref >= 8) && (op_limit - op >= len + 8)) {
752
        // copy with an overlap not larger than 8
753
753
        wild_copy(op, ref, op + len);
754
753
        op += len;
755
753
      }
756
170
      else {
757
        // general copy with any overlap
758
#if 0 && defined(__AVX2__)
759
        if (op - ref <= 16) {
760
          // This is not faster on a combination of compilers (clang, gcc, icc) or machines, but
761
          // it is not too slower either.
762
          op = copy_match_16(op, ref, len);
763
        }
764
        else {
765
#endif
766
170
          op = copy_match(op, ref, (unsigned) len);
767
#if 0 && defined(__AVX2__)
768
        }
769
#endif
770
170
      }
771
1.18k
    }
772
8.83k
    else {
773
      // literal
774
8.83k
      ctrl++;
775
8.83k
      if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) {
776
0
        return 0;
777
0
      }
778
8.83k
      if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) {
779
0
        return 0;
780
0
      }
781
782
8.83k
      memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
783
      // On GCC-6, fastcopy this is still faster than plain memcpy
784
      // However, using recent CLANG/LLVM 9.0, there is almost no difference
785
      // in performance.
786
      // And starting on CLANG/LLVM 10 and GCC 9, memcpy is generally faster.
787
      // op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl;
788
789
8.83k
      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
790
8.76k
      ctrl = *ip++;
791
8.76k
    }
792
10.0k
  }
793
794
68
  return (int)(op - (uint8_t*)output);
795
68
}