Coverage Report

Created: 2025-12-11 06:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/c-blosc/blosc/blosclz.c
Line
Count
Source
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  The Blosc Developers <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
/*********************************************************************
12
  The code in this file is heavily based on FastLZ, a lightning-fast
13
  lossless compression library.  See LICENSES/FASTLZ.txt for details.
14
**********************************************************************/
15
16
17
#include <stdio.h>
18
#include <stdbool.h>
19
#include "blosclz.h"
20
#include "fastcopy.h"
21
#include "blosc-common.h"
22
23
24
/*
25
 * Give hints to the compiler for branch prediction optimization.
26
 */
27
/* Branch-prediction hints: compile to __builtin_expect on GCC-compatible
 * compilers, and to a plain expression elsewhere. */
#if defined(__GNUC__) && (__GNUC__ > 2)
#define BLOSCLZ_LIKELY(c)    (__builtin_expect((c), 1))
#define BLOSCLZ_UNLIKELY(c)  (__builtin_expect((c), 0))
#else
#define BLOSCLZ_LIKELY(c)    (c)
#define BLOSCLZ_UNLIKELY(c)  (c)
#endif

/*
 * Use inlined functions for supported systems.
 */
#if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
#endif

/* Maximum number of literals in one copy run (the control byte stores copy-1
 * in 5 bits, so runs are limited to 32 bytes). */
#define MAX_COPY 32U
/* Maximum match distance encodable in the short (2/3-byte) match format. */
#define MAX_DISTANCE 8191
/* Maximum distance for the extended "far" match format (extra 16-bit offset). */
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)

/* Unaligned little-endian reads.  The strict-align variants assemble the
 * value byte by byte; the fast variants do a single (possibly unaligned)
 * load.  NOTE(review): the fast BLOSCLZ_READU32 is a cast-based load, which
 * is a strict-aliasing/alignment liability on some targets — the
 * BLOSC_STRICT_ALIGN build exists precisely for those. */
#ifdef BLOSC_STRICT_ALIGN
  #define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
  #define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
#else
  #define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
  #define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
#endif

/* Hash table sizes (log2): HASH_LOG for the real compressor,
 * HASH_LOG2 for the lightweight entropy probe in get_cratio(). */
#define HASH_LOG (14U)
#define HASH_LOG2 (12U)

// This is used in LZ4 and seems to work pretty well here too
// Fibonacci-style multiplicative hash: v = top h bits of s * 2654435761.
#define HASH_FUNCTION(v, s, h) {      \
  (v) = ((s) * 2654435761U) >> (32U - (h)); \
}
61
62
63
#if defined(__AVX2__)
64
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
65
    uint8_t x = ip[-1];
66
67
    while (ip < (ip_bound - (sizeof(__m256i)))) {
68
        __m256i value, value2, cmp;
69
        /* Broadcast the value for every byte in a 256-bit register */
70
        memset(&value, x, sizeof(__m256i));
71
        value2 = _mm256_loadu_si256((__m256i *)ref);
72
        cmp = _mm256_cmpeq_epi64(value, value2);
73
        if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
74
            /* Return the byte that starts to differ */
75
            while (*ref++ == x) ip++;
76
            return ip;
77
        }
78
        else {
79
            ip += sizeof(__m256i);
80
            ref += sizeof(__m256i);
81
        }
82
    }
83
    /* Look into the remainder */
84
    while ((ip < ip_bound) && (*ref++ == x)) ip++;
85
    return ip;
86
}
87
#endif
88
89
#if defined(__SSE2__)
/* SSE2 run scanner: same contract as get_run() but compares 16 bytes per
 * iteration.  Currently unused (see get_run_or_match; the scalar get_run
 * measured faster on Ryzen3), which matches its zero coverage counts. */
uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];

  while (ip < (ip_bound - sizeof(__m128i))) {
    __m128i value, value2, cmp;
    /* Broadcast the value for every byte in a 128-bit register */
    memset(&value, x, sizeof(__m128i));
    value2 = _mm_loadu_si128((__m128i *)ref);
    /* 32-bit lane compare suffices because every byte of `value` is equal. */
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}
#endif
115
116
117
104k
/* Scalar run scanner: advance `ip` while the bytes at `ref` keep repeating
 * the run byte x == ip[-1], 8 bytes per iteration, and return the first
 * position where the run breaks (ip never passes ip_bound on the slow path;
 * on the fast path the break is located within the current 8-byte chunk).
 *
 * Fix: the previous non-BLOSC_STRICT_ALIGN path loaded via
 * ((int64_t*)ref)[0], which is undefined behavior (strict aliasing and
 * possible misalignment).  memcpy has identical semantics, is defined for
 * any alignment, and optimizing compilers lower it to the same single
 * 8-byte load — so the #ifdef split is no longer needed. */
static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];
  int64_t value, value2;
  /* Broadcast the value for every byte in a 64-bit register */
  memset(&value, x, 8);
  /* safe because the outer check against ip limit */
  while (ip < (ip_bound - sizeof(int64_t))) {
    /* alignment/aliasing-safe 8-byte load (replaces the old cast load) */
    memcpy(&value2, ref, 8);
    if (value != value2) {
      /* Return the byte that starts to differ; the mismatch is inside this
       * 8-byte chunk, so the scan stops within it. */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += 8;
      ref += 8;
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}
143
144
145
/* Return the byte that starts to differ */
146
0
/* Return the byte that starts to differ */
/* Scalar match scanner: advance `ip` (and `ref`) while *ip == *ref,
 * 8 bytes per iteration on the fast path, and return the position one past
 * the first differing byte (or ip_bound on the bounded tail).
 *
 * Fix: the previous fast path compared via *(int64_t*)ref != *(int64_t*)ip,
 * which is undefined behavior (strict aliasing, possible misalignment).
 * memcpy into locals is defined for any alignment and compiles to the same
 * two 8-byte loads, so the fast path is now used regardless of
 * BLOSC_STRICT_ALIGN. */
uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  while (ip < (ip_bound - sizeof(int64_t))) {
    int64_t vref, vip;
    /* alignment/aliasing-safe 8-byte loads (replace the old cast loads) */
    memcpy(&vref, ref, sizeof vref);
    memcpy(&vip, ip, sizeof vip);
    if (vref != vip) {
      /* Return the byte that starts to differ; the mismatch is inside this
       * 8-byte chunk, so the scan stops within it. */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(int64_t);
      ref += sizeof(int64_t);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
164
165
166
#if defined(__SSE2__)
/* SSE2 match scanner: same contract as get_match() but compares 16 bytes
 * per iteration; used as the default match finder on SSE2/AVX2 builds
 * (see get_run_or_match). */
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  __m128i value, value2, cmp;

  while (ip < (ip_bound - sizeof(__m128i))) {
    value = _mm_loadu_si128((__m128i *) ip);
    value2 = _mm_loadu_si128((__m128i *) ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      /* The differing byte is inside this 16-byte chunk, so the scalar
       * tail stops within it. */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
#endif
189
190
191
#if defined(__AVX2__)
192
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
193
194
  while (ip < (ip_bound - sizeof(__m256i))) {
195
    __m256i value, value2, cmp;
196
    value = _mm256_loadu_si256((__m256i *) ip);
197
    value2 = _mm256_loadu_si256((__m256i *)ref);
198
    cmp = _mm256_cmpeq_epi64(value, value2);
199
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
200
      /* Return the byte that starts to differ */
201
      while (*ref++ == *ip++) {}
202
      return ip;
203
    }
204
    else {
205
      ip += sizeof(__m256i);
206
      ref += sizeof(__m256i);
207
    }
208
  }
209
  /* Look into the remainder */
210
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
211
  return ip;
212
}
213
#endif
214
215
216
1.33M
/* Dispatch to a run scanner (run == true, i.e. match distance was 0 before
 * biasing: the match overlaps itself and repeats one byte) or a match
 * scanner.  Returns the advanced `ip`.  The SIMD run/match variants are
 * deliberately left disabled where benchmarks showed no win. */
static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
  if (BLOSCLZ_UNLIKELY(run)) {
#if defined(__AVX2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
    // ip = get_run_32(ip, ip_bound, ref);
    ip = get_run(ip, ip_bound, ref);
#elif defined(__SSE2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
    // ip = get_run_16(ip, ip_bound, ref);
    ip = get_run(ip, ip_bound, ref);
#else
    ip = get_run(ip, ip_bound, ref);
#endif
  }
  else {
#if defined(__AVX2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_match_16 is faster
    // ip = get_match_32(ip, ip_bound, ref);
    ip = get_match_16(ip, ip_bound, ref);
#elif defined(__SSE2__)
    ip = get_match_16(ip, ip_bound, ref);
#else
    ip = get_match(ip, ip_bound, ref);
#endif
  }

  return ip;
}
244
245
246
14.4M
/* Emission macros for the compressor.  All of them may expand `goto out;`
 * on output-buffer exhaustion, so the enclosing function MUST define an
 * `out:` label (blosclz_compress does).  They deliberately evaluate their
 * arguments multiple times; callers pass plain lvalues only. */

/* Emit one literal byte and advance; when the 32-byte literal run fills up,
 * start a new run by emitting a fresh control byte (MAX_COPY-1). */
#define LITERAL(ip, op, op_limit, anchor, copy) {       \
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
    goto out;                                           \
  *(op)++ = *(anchor)++;                                \
  (ip) = (anchor);                                      \
  (copy)++;                                             \
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
    (copy) = 0;                                         \
    *(op)++ = MAX_COPY-1;                               \
  }                                                     \
}

/* Counting-only twin of LITERAL used by get_cratio(): bumps the output
 * counter `oc` (captured from the enclosing scope) instead of writing. */
#define LITERAL2(ip, anchor, copy) {                    \
  oc++; (anchor)++;                                     \
  (ip) = (anchor);                                      \
  (copy)++;                                             \
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
    (copy) = 0;                                         \
    oc++;                                               \
  }                                                     \
}

/* Encode a short match (len < 7, distance < MAX_DISTANCE): 2 bytes. */
#define MATCH_SHORT(op, op_limit, len, distance) {        \
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))            \
    goto out;                                             \
  *(op)++ = (uint8_t)(((len) << 5U) + ((distance) >> 8U));\
  *(op)++ = (uint8_t)(((distance) & 255U));               \
}

/* Encode a long match (len >= 7, distance < MAX_DISTANCE): length tag 7
 * followed by as many 255-bytes as needed, then the remainder and the low
 * distance byte. */
#define MATCH_LONG(op, op_limit, len, distance) {       \
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))          \
    goto out;                                           \
  *(op)++ = (uint8_t)((7U << 5U) + ((distance) >> 8U)); \
  for ((len) -= 7; (len) >= 255; (len) -= 255) {        \
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))        \
      goto out;                                         \
    *(op)++ = 255;                                      \
  }                                                     \
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
    goto out;                                           \
  *(op)++ = (uint8_t)(len);                             \
  *(op)++ = (uint8_t)(((distance) & 255U));             \
}

/* Encode a short far match: distance tag 31 + marker 255 + explicit 16-bit
 * distance (caller has already subtracted MAX_DISTANCE). */
#define MATCH_SHORT_FAR(op, op_limit, len, distance) {      \
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
    goto out;                                               \
  *(op)++ = (uint8_t)(((len) << 5U) + 31);                  \
  *(op)++ = 255;                                            \
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
  *(op)++ = (uint8_t)((distance) & 255U);                   \
}

/* Encode a long far match: long-length encoding plus the far-distance
 * marker and explicit 16-bit distance. */
#define MATCH_LONG_FAR(op, op_limit, len, distance) {       \
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))              \
    goto out;                                               \
  *(op)++ = (7U << 5U) + 31;                                \
  for ((len) -= 7; (len) >= 255; (len) -= 255) {            \
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))            \
      goto out;                                             \
    *(op)++ = 255;                                          \
  }                                                         \
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
    goto out;                                               \
  *(op)++ = (uint8_t)(len);                                 \
  *(op)++ = 255;                                            \
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
  *(op)++ = (uint8_t)((distance) & 255U);                   \
}
315
316
317
// Get a guess for the compressed size of a buffer
/* Dry-run of the compressor over at most min(maxlen, 4096) bytes of `ibase`:
 * runs the same hashing/match-finding loop as blosclz_compress but only
 * counts output bytes in `oc` (via LITERAL2) instead of emitting them.
 * `minlen` is the minimum match length worth encoding and `ipshift` the
 * match-end bias, mirroring the compressor's parameters.
 * Returns inspected_bytes / estimated_output_bytes (the estimated
 * compression ratio).  NOTE(review): assumes maxlen >= 12 or so; with a
 * tiny maxlen, ip_limit underflows ibase — callers pass length/4 of
 * buffers >= 16 bytes. */
static double get_cratio(uint8_t* ibase, int maxlen, int minlen, int ipshift) {
  uint8_t* ip = ibase;
  int32_t oc = 0;
  const uint16_t hashlen = (1U << (uint8_t)HASH_LOG2);
  uint16_t htab[1U << (uint8_t)HASH_LOG2];
  uint32_t hval;
  uint32_t seq;
  uint8_t copy;
  // Make a tradeoff between testing too much and too little
  uint16_t limit = (maxlen > hashlen) ? hashlen : maxlen;
  uint8_t* ip_bound = ibase + limit - 1;
  uint8_t* ip_limit = ibase + limit - 12;

  // Initialize the hash table to distances of 0
  memset(htab, 0, hashlen * sizeof(uint16_t));

  /* we start with literal copy */
  copy = 4;
  oc += 5;

  /* main loop */
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
    const uint8_t* ref;
    unsigned distance;
    uint8_t* anchor = ip;    /* comparison starting-point */

    /* find potential match */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, HASH_LOG2)
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = (unsigned int)(anchor - ref);

    /* update hash table */
    htab[hval] = (uint16_t) (anchor - ibase);

    /* distance 0 = hash slot never filled (points at anchor itself) */
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* is this a match? check the first 4 bytes */
    if (BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip)) {
      ref += 4;
    }
    else {
      /* no luck, copy as a literal */
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* last matched byte */
    ip = anchor + 4;

    /* distance is biased */
    distance--;

    /* get runs or matches; zero distance means a run */
    ip = get_run_or_match(ip, ip_bound, ref, !distance);

    ip -= ipshift;
    int len = (int)(ip - anchor);
    if (len < minlen) {
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* if we haven't copied anything, adjust the output counter */
    if (!copy)
      oc--;
    /* reset literal counter */
    copy = 0;

    /* encode the match */
    /* Sizes below mirror MATCH_SHORT/LONG(_FAR): 2 (or 4 for far) bytes
     * plus one byte per 255 of extra length. */
    if (distance < MAX_DISTANCE) {
      if (len >= 7) {
        oc += ((len - 7) / 255) + 1;
      }
      oc += 2;
    }
    else {
      /* far away, but not yet in the another galaxy... */
      if (len >= 7) {
        oc += ((len - 7) / 255) + 1;
      }
      oc += 4;
    }

    /* update the hash at match boundary */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, HASH_LOG2)
    htab[hval] = (uint16_t)(ip++ - ibase);
    ip++;
    /* assuming literal copy */
    oc++;
  }

  double ic = (double)(ip - ibase);
  return ic / (double)oc;
}
419
420
421
/* Compress `length` bytes from `input` into `output` (capacity `maxout`)
 * with the BloscLZ codec.
 *
 * clevel      compression level; indexes 10-entry tables below, so it is
 *             assumed to be in 0..9 (NOTE(review): clevel == 0 would select
 *             hashlog 0 and shift by 32 in HASH_FUNCTION — presumably
 *             callers never pass 0; confirm upstream contract).
 * split_block whether the caller split the block (affects minlen/ipshift).
 *
 * Returns the compressed size in bytes, or 0 when the data is deemed not
 * compressible enough (entropy probe), the buffers are too small
 * (length < 16 or maxout < 66), or the output did not fit.
 *
 * NOTE(review): the entropy probe runs before the length < 16 guard; for
 * pathological tiny lengths get_cratio sees a near-empty window — consider
 * hoisting the guard. */
int blosclz_compress(const int clevel, const void* input, int length,
                     void* output, int maxout, const int split_block) {
  uint8_t* ibase = (uint8_t*)input;

  // Experiments say that checking 1/4 of the buffer is enough to figure out approx cratio
  int maxlen = length / 4;
  // Start probing somewhere inside the buffer
  int shift = length - maxlen;
  // Actual entropy probing!
  double cratio = get_cratio(ibase + shift, maxlen, 3, 3);
  // discard probes with small compression ratios (too expensive)
  double cratio_[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};
  if (cratio < cratio_[clevel]) {
      goto out;
  }

  /* When we go back in a match (shift), we obtain quite different compression properties.
   * It looks like 4 is more useful in combination with bitshuffle and small typesizes
   * Fallback to 4 because it provides more consistent results for large cratios.
   *
   * In this block we also check cratios for the beginning of the buffers and
   * eventually discard those that are small (take too long to decompress).
   * This process is called _entropy probing_.
   */
  unsigned ipshift = 4;
  // Compute optimal shift and minimum lengths for encoding
  // Use 4 by default, except for low entropy data, where we should do a best effort
  unsigned minlen = 4;
  // BloscLZ works better with splits mostly, so when data is not split, do a best effort
  // Why using cratio < 4 is based in experiments with low and high entropy
  if (!split_block || cratio < 4) {
    ipshift = 3;
    minlen = 3;
  }
  else {
    minlen = 4;
  }

  /* Per-clevel hash table size (log2): smaller tables for low levels. */
  uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
                          HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
  uint8_t hashlog = hashlog_[clevel];

  uint8_t* ip = ibase;
  uint8_t* ip_bound = ibase + length - 1;
  uint8_t* ip_limit = ibase + length - 12;
  uint8_t* op = (uint8_t*)output;
  const uint8_t* op_limit = op + maxout;
  uint32_t seq;
  uint8_t copy;
  uint32_t hval;

  /* input and output buffer cannot be less than 16 and 66 bytes or we can get into trouble */
  if (length < 16 || maxout < 66) {
    return 0;
  }

  // Initialize the hash table
  /* Only the first (1 << hashlog) slots are used/cleared; the array is
   * dimensioned for the maximum HASH_LOG (64 KiB of stack). */
  uint32_t htab[1U << (uint8_t)HASH_LOG];
  memset(htab, 0, (1U << hashlog) * sizeof(uint32_t));

  /* we start with literal copy */
  copy = 4;
  *op++ = MAX_COPY - 1;
  *op++ = *ip++;
  *op++ = *ip++;
  *op++ = *ip++;
  *op++ = *ip++;

  /* main loop */
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
    const uint8_t* ref;
    unsigned distance;
    uint8_t* anchor = ip;    /* comparison starting-point */

    /* find potential match */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = (unsigned int)(anchor - ref);

    /* update hash table */
    htab[hval] = (uint32_t) (anchor - ibase);

    /* distance 0 = empty hash slot (points at anchor itself) */
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* is this a match? check the first 4 bytes */
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
      ref += 4;
    } else {
      /* no luck, copy as a literal */
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* last matched byte */
    ip = anchor + 4;

    /* distance is biased */
    distance--;

    /* get runs or matches; zero distance means a run */
    ip = get_run_or_match(ip, ip_bound, ref, !distance);

    /* length is biased, '1' means a match of 3 bytes */
    ip -= ipshift;

    unsigned len = (int)(ip - anchor);

    // Encoding short lengths is expensive during decompression
    if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* if we have copied something, adjust the copy count */
    if (copy)
      /* copy is biased, '0' means 1 byte copy */
      *(op - copy - 1) = (uint8_t)(copy - 1);
    else
      /* back, to overwrite the copy count */
      op--;
    /* reset literal counter */
    copy = 0;

    /* encode the match */
    if (distance < MAX_DISTANCE) {
      if (len < 7) {
        MATCH_SHORT(op, op_limit, len, distance)
      } else {
        MATCH_LONG(op, op_limit, len, distance)
      }
    } else {
      /* far away, but not yet in the another galaxy... */
      distance -= MAX_DISTANCE;
      if (len < 7) {
        MATCH_SHORT_FAR(op, op_limit, len, distance)
      } else {
        MATCH_LONG_FAR(op, op_limit, len, distance)
      }
    }

    /* update the hash at match boundary */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    htab[hval] = (uint32_t) (ip++ - ibase);
    if (clevel == 9) {
      // In some situations, including a second hash proves to be useful,
      // but not in others.  Activating here in max clevel only.
      seq >>= 8U;
      HASH_FUNCTION(hval, seq, hashlog)
      htab[hval] = (uint32_t) (ip++ - ibase);
    }
    else {
      ip++;
    }

    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
      goto out;

    /* assuming literal copy */
    *op++ = MAX_COPY - 1;
  }

  /* left-over as literal copy */
  while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) {
    if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out;
    *op++ = *ip++;
    copy++;
    if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {
      copy = 0;
      *op++ = MAX_COPY - 1;
    }
  }

  /* if we have copied something, adjust the copy length */
  if (copy)
    *(op - copy - 1) = (uint8_t)(copy - 1);
  else
    op--;

  /* marker for blosclz */
  *(uint8_t*)output |= (1U << 5U);

  return (int)(op - (uint8_t*)output);

  out:
  return 0;
}
614
615
// See https://habr.com/en/company/yandex/blog/457612/
616
#if defined(__AVX2__)
617
618
#if defined(_MSC_VER)
619
#define ALIGNED_(x) __declspec(align(x))
620
#else
621
#if defined(__GNUC__)
622
#define ALIGNED_(x) __attribute__ ((aligned(x)))
623
#endif
624
#endif
625
#define ALIGNED_TYPE_(t, x) t ALIGNED_(x)
626
627
static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
628
{
629
  size_t offset = op - match;
630
  while (len >= 16) {
631
632
    static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
633
      {
634
                0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, // offset = 0, not used as mask, but for shift
635
                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // offset = 1
636
                0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
637
                0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
638
                0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
639
                0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
640
                0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
641
                0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
642
                0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
643
                0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
644
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
645
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
646
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
647
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
648
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
649
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
650
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  15, // offset = 16
651
      };
652
653
    _mm_storeu_si128((__m128i *)(op),
654
                     _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
655
                                      _mm_load_si128((const __m128i *)(masks) + offset)));
656
657
    match += masks[offset];
658
659
    op += 16;
660
    len -= 16;
661
  }
662
  // Deal with remainders
663
  for (; len > 0; len--) {
664
    *op++ = *match++;
665
  }
666
  return op;
667
}
668
#endif
669
670
// LZ4 wildCopy which can reach excellent copy bandwidth (even if insecure)
671
2.75k
// LZ4 wildCopy which can reach excellent copy bandwidth (even if insecure)
// Copies from `from` to `out` in 8-byte chunks until `end` is reached or
// passed; always copies at least one chunk and may write up to 7 bytes past
// `end`, so the caller must guarantee slack after the destination.
static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) {
  uint8_t *dst = out;
  const uint8_t *src = from;

  do {
    memcpy(dst, src, 8);
    dst += 8;
    src += 8;
  } while (dst < end);
}
678
679
113
/* Decompress `length` bytes of BloscLZ data from `input` into `output`
 * (capacity `maxout`).  Returns the decompressed size in bytes, or 0 on
 * malformed/truncated input or when the output would not fit.
 *
 * Stream format (as consumed below): a control byte whose top 3 bits are a
 * length tag — 0 means a literal run of (ctrl & 31) + 1 bytes, otherwise a
 * match whose distance is built from the low 5 bits plus following bytes
 * (tag 7 adds 255-extension length bytes; low-distance byte 255 plus
 * ofs == 31<<8 switches to the explicit 16-bit far-distance form). */
int blosclz_decompress(const void* input, int length, void* output, int maxout) {
  const uint8_t* ip = (const uint8_t*)input;
  const uint8_t* ip_limit = ip + length;
  uint8_t* op = (uint8_t*)output;
  uint32_t ctrl;
  uint8_t* op_limit = op + maxout;
  if (BLOSCLZ_UNLIKELY(length == 0)) {
    return 0;
  }
  /* first control byte: low 5 bits only (high bits carry the blosclz marker
   * set by the compressor) */
  ctrl = (*ip++) & 31U;

  while (1) {
    if (ctrl >= 32) {
      // match
      int32_t len = (int32_t)(ctrl >> 5U) - 1 ;
      int32_t ofs = (int32_t)(ctrl & 31U) << 8U;
      uint8_t code;
      const uint8_t* ref = op - ofs;

      if (len == 7 - 1) {
        /* long-match form: accumulate 255-extension bytes */
        do {
          if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
            return 0;
          }
          code = *ip++;
          len += code;
        } while (code == 255);
      }
      else {
        if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
          return 0;
        }
      }
      code = *ip++;
      len += 3;
      ref -= code;

      /* match from 16-bit distance */
      if (BLOSCLZ_UNLIKELY(code == 255)) {
        if (ofs == (31U << 8U)) {
          if (ip + 1 >= ip_limit) {
            return 0;
          }
          ofs = (*ip++) << 8U;
          ofs += *ip++;
          ref = op - ofs - MAX_DISTANCE;
        }
      }

      /* output bounds check before any copy */
      if (BLOSCLZ_UNLIKELY(op + len > op_limit)) {
        return 0;
      }

      /* reject references before the start of the output buffer */
      if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) {
        return 0;
      }

      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
      ctrl = *ip++;

      ref--;
      if (ref == op - 1) {
        /* optimized copy for a run */
        memset(op, *ref, len);
        op += len;
      }
      else if ((op - ref >= 8) && (op_limit - op >= len + 8)) {
        // copy with an overlap not larger than 8
        /* wild_copy may overshoot by up to 7 bytes; the len + 8 slack check
         * above makes that safe */
        wild_copy(op, ref, op + len);
        op += len;
      }
      else {
        // general copy with any overlap
#if defined(__AVX2__)
        if (op - ref <= 16) {
          // This is not faster on a combination of compilers (clang, gcc, icc) or machines, but
          // it is not slower either.  Let's activate here for experimentation.
          op = copy_match_16(op, ref, len);
        }
        else {
#endif
          op = copy_match(op, ref, (unsigned) len);
#if defined(__AVX2__)
        }
#endif
      }
    }
    else {
      // literal
      ctrl++;
      if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) {
        return 0;
      }
      if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) {
        return 0;
      }

      memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
      // On GCC-6, fastcopy this is still faster than plain memcpy
      // However, using recent CLANG/LLVM 9.0, there is almost no difference
      // in performance.
      // And starting on CLANG/LLVM 10 and GCC 9, memcpy is generally faster.
      // op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl;

      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
      ctrl = *ip++;
    }
  }

  return (int)(op - (uint8_t*)output);
}