Coverage Report

Created: 2025-11-23 06:32

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/c-blosc2/blosc/blosclz.c
Line
Count
Source
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
/*********************************************************************
12
  The code in this file is heavily based on FastLZ, a lightning-fast
13
  lossless compression library.  See LICENSES/FASTLZ.txt for details.
14
**********************************************************************/
15
16
17
#include "blosclz.h"
18
#include "fastcopy.h"
19
#include "blosc2/blosc2-common.h"
20
21
#include <stdbool.h>
22
#include <stdio.h>
23
#include <stdint.h>
24
#include <string.h>
25
26
/*
27
 * Give hints to the compiler for branch prediction optimization.
28
 * This is not necessary anymore with modern CPUs.
29
 */
30
#if 0 && defined(__GNUC__) && (__GNUC__ > 2)
31
#define BLOSCLZ_LIKELY(c)    (__builtin_expect((c), 1))
32
#define BLOSCLZ_UNLIKELY(c)  (__builtin_expect((c), 0))
33
#else
34
21.0M
#define BLOSCLZ_LIKELY(c)    (c)
35
47.7M
#define BLOSCLZ_UNLIKELY(c)  (c)
36
#endif
37
38
/*
39
 * Use inlined functions for supported systems.
40
 */
41
#if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
42
#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
43
#endif
44
45
804k
#define MAX_COPY 32U
46
21.7M
#define MAX_DISTANCE 8191
47
20.9M
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)
48
49
#ifdef BLOSC_STRICT_ALIGN
50
  #define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
51
  #define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
52
#else
53
  #define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
54
41.2M
  #define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
55
#endif
56
57
333k
#define HASH_LOG (14U)
58
59
// This is used in LZ4 and seems to work pretty well here too
60
21.6M
#define HASH_FUNCTION(v, s, h) {      \
61
21.6M
  (v) = ((s) * 2654435761U) >> (32U - (h)); \
62
21.6M
}
63
64
65
#if defined(__AVX2__)
66
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
67
    uint8_t x = ip[-1];
68
69
    while (ip < (ip_bound - (sizeof(__m256i)))) {
70
        __m256i value, value2, cmp;
71
        /* Broadcast the value for every byte in a 256-bit register */
72
        memset(&value, x, sizeof(__m256i));
73
        value2 = _mm256_loadu_si256((__m256i *)ref);
74
        cmp = _mm256_cmpeq_epi64(value, value2);
75
        if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
76
            /* Return the byte that starts to differ */
77
            while (*ref++ == x) ip++;
78
            return ip;
79
        }
80
        else {
81
            ip += sizeof(__m256i);
82
            ref += sizeof(__m256i);
83
        }
84
    }
85
    /* Look into the remainder */
86
    while ((ip < ip_bound) && (*ref++ == x)) ip++;
87
    return ip;
88
}
89
#endif
90
91
#if defined(__SSE2__)
92
0
uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
93
0
  uint8_t x = ip[-1];
94
95
0
  while (ip < (ip_bound - sizeof(__m128i))) {
96
0
    __m128i value, value2, cmp;
97
    /* Broadcast the value for every byte in a 128-bit register */
98
0
    memset(&value, x, sizeof(__m128i));
99
0
    value2 = _mm_loadu_si128((__m128i *)ref);
100
0
    cmp = _mm_cmpeq_epi32(value, value2);
101
0
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
102
      /* Return the byte that starts to differ */
103
0
      while (*ref++ == x) ip++;
104
0
      return ip;
105
0
    }
106
0
    else {
107
0
      ip += sizeof(__m128i);
108
0
      ref += sizeof(__m128i);
109
0
    }
110
0
  }
111
  /* Look into the remainder */
112
0
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
113
0
  return ip;
114
0
}
115
116
#endif
117
118
119
357k
static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
120
357k
  uint8_t x = ip[-1];
121
357k
  int64_t value, value2;
122
  /* Broadcast the value for every byte in a 64-bit register */
123
357k
  memset(&value, x, 8);
124
  /* safe because the outer check against ip limit */
125
883k
  while (ip < (ip_bound - sizeof(int64_t))) {
126
#if defined(BLOSC_STRICT_ALIGN)
127
    memcpy(&value2, ref, 8);
128
#else
129
869k
    value2 = ((int64_t*)ref)[0];
130
869k
#endif
131
869k
    if (value != value2) {
132
      /* Return the byte that starts to differ */
133
1.09M
      while (*ref++ == x) ip++;
134
343k
      return ip;
135
343k
    }
136
525k
    else {
137
525k
      ip += 8;
138
525k
      ref += 8;
139
525k
    }
140
869k
  }
141
  /* Look into the remainder */
142
50.1k
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
143
14.2k
  return ip;
144
357k
}
145
146
147
/* Return the byte that starts to differ */
148
0
uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
149
0
#if !defined(BLOSC_STRICT_ALIGN)
150
0
  while (ip < (ip_bound - sizeof(int64_t))) {
151
0
    if (*(int64_t*)ref != *(int64_t*)ip) {
152
      /* Return the byte that starts to differ */
153
0
      while (*ref++ == *ip++) {}
154
0
      return ip;
155
0
    }
156
0
    else {
157
0
      ip += sizeof(int64_t);
158
0
      ref += sizeof(int64_t);
159
0
    }
160
0
  }
161
0
#endif
162
  /* Look into the remainder */
163
0
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
164
0
  return ip;
165
0
}
166
167
168
#if defined(__SSE2__)
169
1.44M
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
170
1.44M
  __m128i value, value2, cmp;
171
172
3.08M
  while (ip < (ip_bound - sizeof(__m128i))) {
173
3.04M
    value = _mm_loadu_si128((__m128i *) ip);
174
3.04M
    value2 = _mm_loadu_si128((__m128i *) ref);
175
3.04M
    cmp = _mm_cmpeq_epi32(value, value2);
176
3.04M
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
177
      /* Return the byte that starts to differ */
178
5.39M
      while (*ref++ == *ip++) {}
179
1.39M
      return ip;
180
1.39M
    }
181
1.64M
    else {
182
1.64M
      ip += sizeof(__m128i);
183
1.64M
      ref += sizeof(__m128i);
184
1.64M
    }
185
3.04M
  }
186
  /* Look into the remainder */
187
322k
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
188
46.9k
  return ip;
189
1.44M
}
190
#endif
191
192
193
#if defined(__AVX2__)
194
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
195
196
  while (ip < (ip_bound - sizeof(__m256i))) {
197
    __m256i value, value2, cmp;
198
    value = _mm256_loadu_si256((__m256i *) ip);
199
    value2 = _mm256_loadu_si256((__m256i *)ref);
200
    cmp = _mm256_cmpeq_epi64(value, value2);
201
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
202
      /* Return the byte that starts to differ */
203
      while (*ref++ == *ip++) {}
204
      return ip;
205
    }
206
    else {
207
      ip += sizeof(__m256i);
208
      ref += sizeof(__m256i);
209
    }
210
  }
211
  /* Look into the remainder */
212
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
213
  return ip;
214
}
215
#endif
216
217
218
1.79M
static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
219
1.79M
  if (BLOSCLZ_UNLIKELY(run)) {
220
#if defined(__AVX2__)
221
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
222
    // ip = get_run_32(ip, ip_bound, ref);
223
    ip = get_run(ip, ip_bound, ref);
224
#elif defined(__SSE2__)
225
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
226
    // ip = get_run_16(ip, ip_bound, ref);
227
357k
    ip = get_run(ip, ip_bound, ref);
228
#else
229
    ip = get_run(ip, ip_bound, ref);
230
#endif
231
357k
  }
232
1.44M
  else {
233
#if defined(__AVX2__)
234
    // Extensive experiments on AMD Ryzen3 say that regular get_match_16 is faster
235
    // ip = get_match_32(ip, ip_bound, ref);
236
    ip = get_match_16(ip, ip_bound, ref);
237
#elif defined(__SSE2__)
238
    ip = get_match_16(ip, ip_bound, ref);
239
#else
240
    ip = get_match(ip, ip_bound, ref);
241
#endif
242
1.44M
  }
243
244
1.79M
  return ip;
245
1.79M
}
246
247
248
10.5M
#define LITERAL(ip, op, op_limit, anchor, copy) {       \
249
10.5M
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
250
10.5M
    goto out;                                           \
251
10.5M
  *(op)++ = *(anchor)++;                                \
252
10.5M
  (ip) = (anchor);                                      \
253
10.5M
  (copy)++;                                             \
254
10.5M
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
255
265k
    (copy) = 0;                                         \
256
265k
    *(op)++ = MAX_COPY-1;                               \
257
265k
  }                                                     \
258
10.5M
}
259
260
9.66M
#define LITERAL2(ip, anchor, copy) {                    \
261
9.66M
  oc++; (anchor)++;                                     \
262
9.66M
  (ip) = (anchor);                                      \
263
9.66M
  (copy)++;                                             \
264
9.66M
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
265
271k
    (copy) = 0;                                         \
266
271k
    oc++;                                               \
267
271k
  }                                                     \
268
9.66M
}
269
270
176k
#define MATCH_SHORT(op, op_limit, len, distance) {        \
271
176k
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))            \
272
176k
    goto out;                                             \
273
176k
  *(op)++ = (uint8_t)(((len) << 5U) + ((distance) >> 8U));\
274
176k
  *(op)++ = (uint8_t)(((distance) & 255U));               \
275
176k
}
276
277
322k
#define MATCH_LONG(op, op_limit, len, distance) {       \
278
322k
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))          \
279
322k
    goto out;                                           \
280
322k
  *(op)++ = (uint8_t)((7U << 5U) + ((distance) >> 8U)); \
281
363k
  for ((len) -= 7; (len) >= 255; (len) -= 255) {        \
282
41.0k
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))        \
283
41.0k
      goto out;                                         \
284
41.0k
    *(op)++ = 255;                                      \
285
41.0k
  }                                                     \
286
322k
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
287
322k
    goto out;                                           \
288
322k
  *(op)++ = (uint8_t)(len);                             \
289
322k
  *(op)++ = (uint8_t)(((distance) & 255U));             \
290
322k
}
291
292
1.06k
#define MATCH_SHORT_FAR(op, op_limit, len, distance) {      \
293
1.06k
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
294
1.06k
    goto out;                                               \
295
1.06k
  *(op)++ = (uint8_t)(((len) << 5U) + 31);                  \
296
1.05k
  *(op)++ = 255;                                            \
297
1.05k
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
298
1.05k
  *(op)++ = (uint8_t)((distance) & 255U);                   \
299
1.05k
}
300
301
14.0k
#define MATCH_LONG_FAR(op, op_limit, len, distance) {       \
302
14.0k
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))              \
303
14.0k
    goto out;                                               \
304
14.0k
  *(op)++ = (7U << 5U) + 31;                                \
305
23.2k
  for ((len) -= 7; (len) >= 255; (len) -= 255) {            \
306
9.19k
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))            \
307
9.19k
      goto out;                                             \
308
9.19k
    *(op)++ = 255;                                          \
309
9.18k
  }                                                         \
310
14.0k
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
311
14.0k
    goto out;                                               \
312
14.0k
  *(op)++ = (uint8_t)(len);                                 \
313
13.9k
  *(op)++ = 255;                                            \
314
13.9k
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
315
13.9k
  *(op)++ = (uint8_t)((distance) & 255U);                   \
316
13.9k
}
317
318
319
// Get a guess for the compressed size of a buffer
320
37.0k
static double get_cratio(uint8_t* ibase, int maxlen, int minlen, int ipshift, uint32_t htab[], int8_t hashlog) {
321
37.0k
  uint8_t* ip = ibase;
322
37.0k
  int32_t oc = 0;
323
37.0k
  const uint16_t hashlen = (1U << (uint8_t)hashlog);
324
37.0k
  uint32_t hval;
325
37.0k
  uint32_t seq;
326
37.0k
  uint8_t copy;
327
  // Make a tradeoff between testing too much and too little
328
37.0k
  uint16_t limit = (maxlen > hashlen) ? hashlen : maxlen;
329
37.0k
  uint8_t* ip_bound = ibase + limit - 1;
330
37.0k
  uint8_t* ip_limit = ibase + limit - 12;
331
332
  // Initialize the hash table to distances of 0
333
37.0k
  memset(htab, 0, hashlen * sizeof(uint32_t));
334
335
  /* we start with literal copy */
336
37.0k
  copy = 4;
337
37.0k
  oc += 5;
338
339
  /* main loop */
340
9.87M
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
341
9.83M
    const uint8_t* ref;
342
9.83M
    unsigned distance;
343
9.83M
    uint8_t* anchor = ip;    /* comparison starting-point */
344
345
    /* find potential match */
346
9.83M
    seq = BLOSCLZ_READU32(ip);
347
9.83M
    HASH_FUNCTION(hval, seq, hashlog)
348
9.83M
    ref = ibase + htab[hval];
349
350
    /* calculate distance to the match */
351
9.83M
    distance = (unsigned int)(anchor - ref);
352
353
    /* update hash table */
354
9.83M
    htab[hval] = (uint32_t) (anchor - ibase);
355
356
9.83M
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
357
36.9k
      LITERAL2(ip, anchor, copy)
358
36.9k
      continue;
359
36.9k
    }
360
361
    /* is this a match? check the first 4 bytes */
362
9.79M
    if (BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip)) {
363
468k
      ref += 4;
364
468k
    }
365
9.33M
    else {
366
      /* no luck, copy as a literal */
367
9.33M
      LITERAL2(ip, anchor, copy)
368
9.33M
      continue;
369
9.33M
    }
370
371
    /* last matched byte */
372
468k
    ip = anchor + 4;
373
374
    /* distance is biased */
375
468k
    distance--;
376
377
    /* get runs or matches; zero distance means a run */
378
468k
    ip = get_run_or_match(ip, ip_bound, ref, !distance);
379
380
468k
    ip -= ipshift;
381
468k
    int len = (int)(ip - anchor);
382
468k
    if (len < minlen) {
383
293k
      LITERAL2(ip, anchor, copy)
384
293k
      continue;
385
293k
    }
386
387
    /* if we haven't copied anything, adjust the output counter */
388
174k
    if (!copy)
389
46.7k
      oc--;
390
    /* reset literal counter */
391
174k
    copy = 0;
392
393
    /* encode the match */
394
174k
    if (distance < MAX_DISTANCE) {
395
171k
      if (len >= 7) {
396
115k
        oc += ((len - 7) / 255) + 1;
397
115k
      }
398
171k
      oc += 2;
399
171k
    }
400
2.75k
    else {
401
      /* far away, but not yet in the another galaxy... */
402
2.75k
      if (len >= 7) {
403
1.98k
        oc += ((len - 7) / 255) + 1;
404
1.98k
      }
405
2.75k
      oc += 4;
406
2.75k
    }
407
408
    /* update the hash at match boundary */
409
174k
    seq = BLOSCLZ_READU32(ip);
410
174k
    HASH_FUNCTION(hval, seq, hashlog)
411
174k
    htab[hval] = (uint32_t)(ip++ - ibase);
412
174k
    ip++;
413
    /* assuming literal copy */
414
174k
    oc++;
415
174k
  }
416
417
37.0k
  double ic = (double)(ip - ibase);
418
37.0k
  return ic / (double)oc;
419
37.0k
}
420
421
422
int blosclz_compress(const int clevel, const void* input, int length,
423
37.0k
                     void* output, int maxout, blosc2_context* ctx) {
424
37.0k
  BLOSC_UNUSED_PARAM(ctx);
425
37.0k
  uint8_t* ibase = (uint8_t*)input;
426
37.0k
  uint32_t htab[1U << (uint8_t)HASH_LOG];
427
428
  /* When we go back in a match (shift), we obtain quite different compression properties.
429
   * It looks like 4 is more useful in combination with bitshuffle and small typesizes
430
   * Fallback to 4 because it provides more consistent results for large cratios.
431
   * UPDATE: new experiments show that using a value of 3 is a bit better, at least for ERA5.
432
   * UPDATE 2: go back to 4, as they seem to provide better cratios in general.
433
   *
434
   * In this block we also check cratios for the beginning of the buffers and
435
   * eventually discard those that are small (take too long to decompress).
436
   * This process is called _entropy probing_.
437
   */
438
37.0k
  unsigned ipshift = 4;
439
  // Minimum lengths for encoding (normally it is good to match the shift value)
440
37.0k
  unsigned minlen = 4;
441
442
37.0k
  uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
443
37.0k
                          HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
444
37.0k
  uint8_t hashlog = hashlog_[clevel];
445
446
  // Experiments say that checking 1/4 of the buffer is enough to figure out approx cratio
447
  // UPDATE: new experiments with ERA5 datasets (float32) say that checking the whole buffer
448
  // is better (specially when combined with bitshuffle).
449
  // The loss in speed for checking the whole buffer is pretty negligible too.
450
37.0k
  int maxlen = length;
451
37.0k
  if (clevel < 2) {
452
33.3k
    maxlen /= 8;
453
33.3k
  }
454
3.70k
  else if (clevel < 4) {
455
798
    maxlen /= 4;
456
798
  }
457
2.90k
  else if (clevel < 7) {
458
551
    maxlen /= 2;
459
551
  }
460
  // Start probing somewhere inside the buffer
461
37.0k
  int shift = length - maxlen;
462
  // Actual entropy probing!
463
37.0k
  double cratio = get_cratio(ibase + shift, maxlen, minlen, ipshift, htab, hashlog);
464
  // discard probes with small compression ratios (too expensive)
465
37.0k
  double cratio_[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};
466
37.0k
  if (cratio < cratio_[clevel]) {
467
11.3k
      goto out;
468
11.3k
  }
469
470
25.6k
  uint8_t* ip = ibase;
471
25.6k
  uint8_t* ip_bound = ibase + length - 1;
472
25.6k
  uint8_t* ip_limit = ibase + length - 12;
473
25.6k
  uint8_t* op = (uint8_t*)output;
474
25.6k
  const uint8_t* op_limit = op + maxout;
475
25.6k
  uint32_t seq;
476
25.6k
  uint8_t copy;
477
25.6k
  uint32_t hval;
478
479
  /* input and output buffer cannot be less than 16 and 66 bytes or we can get into trouble */
480
25.6k
  if (length < 16 || maxout < 66) {
481
114
    return 0;
482
114
  }
483
484
  // Initialize the hash table
485
25.5k
  memset(htab, 0, (1U << hashlog) * sizeof(uint32_t));
486
487
  /* we start with literal copy */
488
25.5k
  copy = 4;
489
25.5k
  *op++ = MAX_COPY - 1;
490
25.5k
  *op++ = *ip++;
491
25.5k
  *op++ = *ip++;
492
25.5k
  *op++ = *ip++;
493
25.5k
  *op++ = *ip++;
494
495
  /* main loop */
496
11.1M
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
497
11.1M
    const uint8_t* ref;
498
11.1M
    unsigned distance;
499
11.1M
    uint8_t* anchor = ip;    /* comparison starting-point */
500
501
    /* find potential match */
502
11.1M
    seq = BLOSCLZ_READU32(ip);
503
11.1M
    HASH_FUNCTION(hval, seq, hashlog)
504
11.1M
    ref = ibase + htab[hval];
505
506
    /* calculate distance to the match */
507
11.1M
    distance = (unsigned int)(anchor - ref);
508
509
    /* update hash table */
510
11.1M
    htab[hval] = (uint32_t) (anchor - ibase);
511
512
11.1M
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
513
33.8k
      LITERAL(ip, op, op_limit, anchor, copy)
514
0
      continue;
515
33.8k
    }
516
517
    /* is this a match? check the first 4 bytes */
518
11.0M
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
519
1.32M
      ref += 4;
520
9.74M
    } else {
521
      /* no luck, copy as a literal */
522
9.74M
      LITERAL(ip, op, op_limit, anchor, copy)
523
0
      continue;
524
9.74M
    }
525
526
    /* last matched byte */
527
1.32M
    ip = anchor + 4;
528
529
    /* distance is biased */
530
1.32M
    distance--;
531
532
    /* get runs or matches; zero distance means a run */
533
1.32M
    ip = get_run_or_match(ip, ip_bound, ref, !distance);
534
535
    /* length is biased, '1' means a match of 3 bytes */
536
1.32M
    ip -= ipshift;
537
538
1.32M
    unsigned len = (int)(ip - anchor);
539
540
    // Encoding short lengths is expensive during decompression
541
1.32M
    if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
542
816k
      LITERAL(ip, op, op_limit, anchor, copy)
543
0
      continue;
544
816k
    }
545
546
    /* if we have copied something, adjust the copy count */
547
513k
    if (copy)
548
      /* copy is biased, '0' means 1 byte copy */
549
337k
      *(op - copy - 1) = (uint8_t)(copy - 1);
550
176k
    else
551
      /* back, to overwrite the copy count */
552
176k
      op--;
553
    /* reset literal counter */
554
513k
    copy = 0;
555
556
    /* encode the match */
557
513k
    if (distance < MAX_DISTANCE) {
558
498k
      if (len < 7) {
559
176k
        MATCH_SHORT(op, op_limit, len, distance)
560
322k
      } else {
561
967k
        MATCH_LONG(op, op_limit, len, distance)
562
967k
      }
563
498k
    } else {
564
      /* far away, but not yet in the another galaxy... */
565
15.0k
      distance -= MAX_DISTANCE;
566
15.0k
      if (len < 7) {
567
1.06k
        MATCH_SHORT_FAR(op, op_limit, len, distance)
568
14.0k
      } else {
569
42.0k
        MATCH_LONG_FAR(op, op_limit, len, distance)
570
42.0k
      }
571
15.0k
    }
572
573
    /* update the hash at match boundary */
574
513k
    seq = BLOSCLZ_READU32(ip);
575
513k
    HASH_FUNCTION(hval, seq, hashlog)
576
513k
    htab[hval] = (uint32_t) (ip++ - ibase);
577
513k
    if (clevel == 9) {
578
      // In some situations, including a second hash proves to be useful,
579
      // but not in others.  Activating here in max clevel only.
580
60.8k
      seq >>= 8U;
581
60.8k
      HASH_FUNCTION(hval, seq, hashlog)
582
60.8k
      htab[hval] = (uint32_t) (ip++ - ibase);
583
60.8k
    }
584
452k
    else {
585
452k
      ip++;
586
452k
    }
587
588
513k
    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
589
72
      goto out;
590
591
    /* assuming literal copy */
592
513k
    *op++ = MAX_COPY - 1;
593
513k
  }
594
595
  /* left-over as literal copy */
596
137k
  while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) {
597
112k
    if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out;
598
112k
    *op++ = *ip++;
599
112k
    copy++;
600
112k
    if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {
601
678
      copy = 0;
602
678
      *op++ = MAX_COPY - 1;
603
678
    }
604
112k
  }
605
606
  /* if we have copied something, adjust the copy length */
607
24.7k
  if (copy)
608
24.4k
    *(op - copy - 1) = (uint8_t)(copy - 1);
609
281
  else
610
281
    op--;
611
612
  /* marker for blosclz */
613
24.7k
  *(uint8_t*)output |= (1U << 5U);
614
615
24.7k
  return (int)(op - (uint8_t*)output);
616
617
12.1k
  out:
618
12.1k
  return 0;
619
25.0k
}
620
621
// See https://habr.com/en/company/yandex/blog/457612/
622
#if defined(__AVX2__)
623
624
#if defined(_MSC_VER)
625
#define ALIGNED_(x) __declspec(align(x))
626
#else
627
#if defined(__GNUC__)
628
#define ALIGNED_(x) __attribute__ ((aligned(x)))
629
#endif
630
#endif
631
#define ALIGNED_TYPE_(t, x) t ALIGNED_(x)
632
633
static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
634
{
635
  size_t offset = op - match;
636
  while (len >= 16) {
637
638
    static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
639
      {
640
                0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, // offset = 0, not used as mask, but for shift
641
                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // offset = 1
642
                0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
643
                0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
644
                0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
645
                0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
646
                0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
647
                0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
648
                0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
649
                0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
650
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
651
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
652
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
653
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
654
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
655
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
656
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  15, // offset = 16
657
      };
658
659
    _mm_storeu_si128((__m128i *)(op),
660
                     _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
661
                                      _mm_load_si128((const __m128i *)(masks) + offset)));
662
663
    match += masks[offset];
664
665
    op += 16;
666
    len -= 16;
667
  }
668
  // Deal with remainders
669
  for (; len > 0; len--) {
670
    *op++ = *match++;
671
  }
672
  return op;
673
}
674
#endif
675
676
// LZ4 wildCopy which can reach excellent copy bandwidth (even if insecure)
677
195k
static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) {
678
195k
  uint8_t* d = out;
679
195k
  const uint8_t* s = from;
680
195k
  uint8_t* const e = end;
681
682
1.15M
  do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
683
195k
}
684
685
11.0k
int blosclz_decompress(const void* input, int length, void* output, int maxout) {
686
11.0k
  const uint8_t* ip = (const uint8_t*)input;
687
11.0k
  const uint8_t* ip_limit = ip + length;
688
11.0k
  uint8_t* op = (uint8_t*)output;
689
11.0k
  uint32_t ctrl;
690
11.0k
  uint8_t* op_limit = op + maxout;
691
11.0k
  if (BLOSCLZ_UNLIKELY(length == 0)) {
692
0
    return 0;
693
0
  }
694
11.0k
  ctrl = (*ip++) & 31U;
695
696
541k
  while (1) {
697
541k
    if (ctrl >= 32) {
698
      // match
699
273k
      int32_t len = (int32_t)(ctrl >> 5U) - 1 ;
700
273k
      int32_t ofs = (int32_t)(ctrl & 31U) << 8U;
701
273k
      uint8_t code;
702
273k
      const uint8_t* ref = op - ofs;
703
704
273k
      if (len == 7 - 1) {
705
205k
        do {
706
205k
          if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
707
0
            return 0;
708
0
          }
709
205k
          code = *ip++;
710
205k
          len += code;
711
205k
        } while (code == 255);
712
175k
      }
713
97.5k
      else {
714
97.5k
        if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
715
0
          return 0;
716
0
        }
717
97.5k
      }
718
273k
      code = *ip++;
719
273k
      len += 3;
720
273k
      ref -= code;
721
722
      /* match from 16-bit distance */
723
273k
      if (BLOSCLZ_UNLIKELY(code == 255)) {
724
20.8k
        if (ofs == (31U << 8U)) {
725
11.9k
          if (ip + 1 >= ip_limit) {
726
0
            return 0;
727
0
          }
728
11.9k
          ofs = (*ip++) << 8U;
729
11.9k
          ofs += *ip++;
730
11.9k
          ref = op - ofs - MAX_DISTANCE;
731
11.9k
        }
732
20.8k
      }
733
734
273k
      if (BLOSCLZ_UNLIKELY(op + len > op_limit)) {
735
0
        return 0;
736
0
      }
737
738
273k
      if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) {
739
0
        return 0;
740
0
      }
741
742
273k
      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
743
273k
      ctrl = *ip++;
744
745
273k
      ref--;
746
273k
      if (ref == op - 1) {
747
        /* optimized copy for a run */
748
28.5k
        memset(op, *ref, len);
749
28.5k
        op += len;
750
28.5k
      }
751
244k
      else if ((op - ref >= 8) && (op_limit - op >= len + 8)) {
752
        // copy with an overlap not larger than 8
753
195k
        wild_copy(op, ref, op + len);
754
195k
        op += len;
755
195k
      }
756
49.3k
      else {
757
        // general copy with any overlap
758
#if 0 && defined(__AVX2__)
759
        if (op - ref <= 16) {
760
          // This is not faster on a combination of compilers (clang, gcc, icc) or machines, but
761
          // it is not too slower either.
762
          op = copy_match_16(op, ref, len);
763
        }
764
        else {
765
#endif
766
49.3k
          op = copy_match(op, ref, (unsigned) len);
767
#if 0 && defined(__AVX2__)
768
        }
769
#endif
770
49.3k
      }
771
273k
    }
772
268k
    else {
773
      // literal
774
268k
      ctrl++;
775
268k
      if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) {
776
0
        return 0;
777
0
      }
778
268k
      if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) {
779
0
        return 0;
780
0
      }
781
782
268k
      memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
783
      // On GCC-6, fastcopy this is still faster than plain memcpy
784
      // However, using recent CLANG/LLVM 9.0, there is almost no difference
785
      // in performance.
786
      // And starting on CLANG/LLVM 10 and GCC 9, memcpy is generally faster.
787
      // op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl;
788
789
268k
      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
790
257k
      ctrl = *ip++;
791
257k
    }
792
541k
  }
793
794
11.0k
  return (int)(op - (uint8_t*)output);
795
11.0k
}