Coverage Report

Created: 2026-03-19 06:40

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/c-blosc/blosc/blosclz.c
Line
Count
Source
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  The Blosc Developers <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
/*********************************************************************
12
  The code in this file is heavily based on FastLZ, a lightning-fast
13
  lossless compression library.  See LICENSES/FASTLZ.txt for details.
14
**********************************************************************/
15
16
17
#include <stdio.h>
18
#include <stdbool.h>
19
#include "blosclz.h"
20
#include "fastcopy.h"
21
#include "blosc-common.h"
22
23
24
/*
25
 * Give hints to the compiler for branch prediction optimization.
26
 */
27
#if defined(__GNUC__) && (__GNUC__ > 2)
28
26.3M
#define BLOSCLZ_LIKELY(c)    (__builtin_expect((c), 1))
29
70.1M
#define BLOSCLZ_UNLIKELY(c)  (__builtin_expect((c), 0))
30
#else
31
#define BLOSCLZ_LIKELY(c)    (c)
32
#define BLOSCLZ_UNLIKELY(c)  (c)
33
#endif
34
35
/*
36
 * Use inlined functions for supported systems.
37
 */
38
#if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
39
#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
40
#endif
41
42
1.02M
#define MAX_COPY 32U
43
27.2M
#define MAX_DISTANCE 8191
44
26.3M
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)
45
46
#ifdef BLOSC_STRICT_ALIGN
47
  #define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
48
  #define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
49
#else
50
  #define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
51
37.5M
  #define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
52
#endif
53
54
61.1k
#define HASH_LOG (14U)
55
7.60k
#define HASH_LOG2 (12U)
56
57
// This is used in LZ4 and seems to work pretty well here too
58
27.3M
#define HASH_FUNCTION(v, s, h) {      \
59
27.3M
  (v) = ((s) * 2654435761U) >> (32U - (h)); \
60
27.3M
}
61
62
63
#if defined(__AVX2__)
64
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
65
    uint8_t x = ip[-1];
66
67
    while (ip < (ip_bound - (sizeof(__m256i)))) {
68
        __m256i value, value2, cmp;
69
        /* Broadcast the value for every byte in a 256-bit register */
70
        memset(&value, x, sizeof(__m256i));
71
        value2 = _mm256_loadu_si256((__m256i *)ref);
72
        cmp = _mm256_cmpeq_epi64(value, value2);
73
        if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
74
            /* Return the byte that starts to differ */
75
            while (*ref++ == x) ip++;
76
            return ip;
77
        }
78
        else {
79
            ip += sizeof(__m256i);
80
            ref += sizeof(__m256i);
81
        }
82
    }
83
    /* Look into the remainder */
84
    while ((ip < ip_bound) && (*ref++ == x)) ip++;
85
    return ip;
86
}
87
#endif
88
89
#if defined(__SSE2__)
90
0
uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
91
0
  uint8_t x = ip[-1];
92
93
0
  while (ip < (ip_bound - sizeof(__m128i))) {
94
0
    __m128i value, value2, cmp;
95
    /* Broadcast the value for every byte in a 128-bit register */
96
0
    memset(&value, x, sizeof(__m128i));
97
0
    value2 = _mm_loadu_si128((__m128i *)ref);
98
0
    cmp = _mm_cmpeq_epi32(value, value2);
99
0
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
100
      /* Return the byte that starts to differ */
101
0
      while (*ref++ == x) ip++;
102
0
      return ip;
103
0
    }
104
0
    else {
105
0
      ip += sizeof(__m128i);
106
0
      ref += sizeof(__m128i);
107
0
    }
108
0
  }
109
  /* Look into the remainder */
110
0
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
111
0
  return ip;
112
0
}
113
114
#endif
115
116
117
123k
static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
118
123k
  uint8_t x = ip[-1];
119
123k
  int64_t value, value2;
120
  /* Broadcast the value for every byte in a 64-bit register */
121
123k
  memset(&value, x, 8);
122
  /* safe because the outer check against ip limit */
123
386k
  while (ip < (ip_bound - sizeof(int64_t))) {
124
#if defined(BLOSC_STRICT_ALIGN)
125
    memcpy(&value2, ref, 8);
126
#else
127
385k
    value2 = ((int64_t*)ref)[0];
128
385k
#endif
129
385k
    if (value != value2) {
130
      /* Return the byte that starts to differ */
131
406k
      while (*ref++ == x) ip++;
132
122k
      return ip;
133
122k
    }
134
263k
    else {
135
263k
      ip += 8;
136
263k
      ref += 8;
137
263k
    }
138
385k
  }
139
  /* Look into the remainder */
140
7.03k
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
141
1.52k
  return ip;
142
123k
}
143
144
145
/* Return the byte that starts to differ */
146
0
uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
147
0
#if !defined(BLOSC_STRICT_ALIGN)
148
0
  while (ip < (ip_bound - sizeof(int64_t))) {
149
0
    if (*(int64_t*)ref != *(int64_t*)ip) {
150
      /* Return the byte that starts to differ */
151
0
      while (*ref++ == *ip++) {}
152
0
      return ip;
153
0
    }
154
0
    else {
155
0
      ip += sizeof(int64_t);
156
0
      ref += sizeof(int64_t);
157
0
    }
158
0
  }
159
0
#endif
160
  /* Look into the remainder */
161
0
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
162
0
  return ip;
163
0
}
164
165
166
#if defined(__SSE2__)
167
1.16M
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
168
1.16M
  __m128i value, value2, cmp;
169
170
1.93M
  while (ip < (ip_bound - sizeof(__m128i))) {
171
1.93M
    value = _mm_loadu_si128((__m128i *) ip);
172
1.93M
    value2 = _mm_loadu_si128((__m128i *) ref);
173
1.93M
    cmp = _mm_cmpeq_epi32(value, value2);
174
1.93M
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
175
      /* Return the byte that starts to differ */
176
2.84M
      while (*ref++ == *ip++) {}
177
1.16M
      return ip;
178
1.16M
    }
179
769k
    else {
180
769k
      ip += sizeof(__m128i);
181
769k
      ref += sizeof(__m128i);
182
769k
    }
183
1.93M
  }
184
  /* Look into the remainder */
185
26.2k
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
186
7.07k
  return ip;
187
1.16M
}
188
#endif
189
190
191
#if defined(__AVX2__)
192
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
193
194
  while (ip < (ip_bound - sizeof(__m256i))) {
195
    __m256i value, value2, cmp;
196
    value = _mm256_loadu_si256((__m256i *) ip);
197
    value2 = _mm256_loadu_si256((__m256i *)ref);
198
    cmp = _mm256_cmpeq_epi64(value, value2);
199
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
200
      /* Return the byte that starts to differ */
201
      while (*ref++ == *ip++) {}
202
      return ip;
203
    }
204
    else {
205
      ip += sizeof(__m256i);
206
      ref += sizeof(__m256i);
207
    }
208
  }
209
  /* Look into the remainder */
210
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
211
  return ip;
212
}
213
#endif
214
215
216
1.29M
static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
217
1.29M
  if (BLOSCLZ_UNLIKELY(run)) {
218
#if defined(__AVX2__)
219
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
220
    // ip = get_run_32(ip, ip_bound, ref);
221
    ip = get_run(ip, ip_bound, ref);
222
#elif defined(__SSE2__)
223
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
224
    // ip = get_run_16(ip, ip_bound, ref);
225
123k
    ip = get_run(ip, ip_bound, ref);
226
#else
227
    ip = get_run(ip, ip_bound, ref);
228
#endif
229
123k
  }
230
1.16M
  else {
231
#if defined(__AVX2__)
232
    // Extensive experiments on AMD Ryzen3 say that regular get_match_16 is faster
233
    // ip = get_match_32(ip, ip_bound, ref);
234
    ip = get_match_16(ip, ip_bound, ref);
235
#elif defined(__SSE2__)
236
    ip = get_match_16(ip, ip_bound, ref);
237
#else
238
    ip = get_match(ip, ip_bound, ref);
239
#endif
240
1.16M
  }
241
242
1.29M
  return ip;
243
1.29M
}
244
245
246
20.6M
#define LITERAL(ip, op, op_limit, anchor, copy) {       \
247
20.6M
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
248
20.6M
    goto out;                                           \
249
20.6M
  *(op)++ = *(anchor)++;                                \
250
20.6M
  (ip) = (anchor);                                      \
251
20.6M
  (copy)++;                                             \
252
20.6M
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
253
566k
    (copy) = 0;                                         \
254
566k
    *(op)++ = MAX_COPY-1;                               \
255
566k
  }                                                     \
256
20.6M
}
257
258
5.11M
#define LITERAL2(ip, anchor, copy) {                    \
259
5.11M
  oc++; (anchor)++;                                     \
260
5.11M
  (ip) = (anchor);                                      \
261
5.11M
  (copy)++;                                             \
262
5.11M
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
263
116k
    (copy) = 0;                                         \
264
116k
    oc++;                                               \
265
116k
  }                                                     \
266
5.11M
}
267
268
322k
#define MATCH_SHORT(op, op_limit, len, distance) {        \
269
322k
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))            \
270
322k
    goto out;                                             \
271
322k
  *(op)++ = (uint8_t)(((len) << 5U) + ((distance) >> 8U));\
272
322k
  *(op)++ = (uint8_t)(((distance) & 255U));               \
273
322k
}
274
275
119k
#define MATCH_LONG(op, op_limit, len, distance) {       \
276
119k
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))          \
277
119k
    goto out;                                           \
278
119k
  *(op)++ = (uint8_t)((7U << 5U) + ((distance) >> 8U)); \
279
151k
  for ((len) -= 7; (len) >= 255; (len) -= 255) {        \
280
31.9k
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))        \
281
31.9k
      goto out;                                         \
282
31.9k
    *(op)++ = 255;                                      \
283
31.9k
  }                                                     \
284
119k
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
285
119k
    goto out;                                           \
286
119k
  *(op)++ = (uint8_t)(len);                             \
287
119k
  *(op)++ = (uint8_t)(((distance) & 255U));             \
288
119k
}
289
290
556
#define MATCH_SHORT_FAR(op, op_limit, len, distance) {      \
291
556
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
292
556
    goto out;                                               \
293
556
  *(op)++ = (uint8_t)(((len) << 5U) + 31);                  \
294
555
  *(op)++ = 255;                                            \
295
555
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
296
555
  *(op)++ = (uint8_t)((distance) & 255U);                   \
297
555
}
298
299
4.35k
#define MATCH_LONG_FAR(op, op_limit, len, distance) {       \
300
4.35k
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))              \
301
4.35k
    goto out;                                               \
302
4.35k
  *(op)++ = (7U << 5U) + 31;                                \
303
11.2k
  for ((len) -= 7; (len) >= 255; (len) -= 255) {            \
304
6.91k
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))            \
305
6.91k
      goto out;                                             \
306
6.91k
    *(op)++ = 255;                                          \
307
6.91k
  }                                                         \
308
4.35k
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
309
4.35k
    goto out;                                               \
310
4.35k
  *(op)++ = (uint8_t)(len);                                 \
311
4.35k
  *(op)++ = 255;                                            \
312
4.35k
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
313
4.35k
  *(op)++ = (uint8_t)((distance) & 255U);                   \
314
4.35k
}
315
316
317
// Get a guess for the compressed size of a buffer
318
7.60k
static double get_cratio(uint8_t* ibase, int maxlen, int minlen, int ipshift) {
319
7.60k
  uint8_t* ip = ibase;
320
7.60k
  int32_t oc = 0;
321
7.60k
  const uint16_t hashlen = (1U << (uint8_t)HASH_LOG2);
322
7.60k
  uint16_t htab[1U << (uint8_t)HASH_LOG2];
323
7.60k
  uint32_t hval;
324
7.60k
  uint32_t seq;
325
7.60k
  uint8_t copy;
326
  // Make a tradeoff between testing too much and too little
327
7.60k
  uint16_t limit = (maxlen > hashlen) ? hashlen : maxlen;
328
7.60k
  uint8_t* ip_bound = ibase + limit - 1;
329
7.60k
  uint8_t* ip_limit = ibase + limit - 12;
330
331
  // Initialize the hash table to distances of 0
332
7.60k
  memset(htab, 0, hashlen * sizeof(uint16_t));
333
334
  /* we start with literal copy */
335
7.60k
  copy = 4;
336
7.60k
  oc += 5;
337
338
  /* main loop */
339
5.28M
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
340
5.27M
    const uint8_t* ref;
341
5.27M
    unsigned distance;
342
5.27M
    uint8_t* anchor = ip;    /* comparison starting-point */
343
344
    /* find potential match */
345
5.27M
    seq = BLOSCLZ_READU32(ip);
346
5.27M
    HASH_FUNCTION(hval, seq, HASH_LOG2)
347
5.27M
    ref = ibase + htab[hval];
348
349
    /* calculate distance to the match */
350
5.27M
    distance = (unsigned int)(anchor - ref);
351
352
    /* update hash table */
353
5.27M
    htab[hval] = (uint16_t) (anchor - ibase);
354
355
5.27M
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
356
7.52k
      LITERAL2(ip, anchor, copy)
357
7.52k
      continue;
358
7.52k
    }
359
360
    /* is this a match? check the first 4 bytes */
361
5.26M
    if (BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip)) {
362
337k
      ref += 4;
363
337k
    }
364
4.93M
    else {
365
      /* no luck, copy as a literal */
366
4.93M
      LITERAL2(ip, anchor, copy)
367
4.93M
      continue;
368
4.93M
    }
369
370
    /* last matched byte */
371
337k
    ip = anchor + 4;
372
373
    /* distance is biased */
374
337k
    distance--;
375
376
    /* get runs or matches; zero distance means a run */
377
337k
    ip = get_run_or_match(ip, ip_bound, ref, !distance);
378
379
337k
    ip -= ipshift;
380
337k
    int len = (int)(ip - anchor);
381
337k
    if (len < minlen) {
382
173k
      LITERAL2(ip, anchor, copy)
383
173k
      continue;
384
173k
    }
385
386
    /* if we haven't copied anything, adjust the output counter */
387
164k
    if (!copy)
388
30.3k
      oc--;
389
    /* reset literal counter */
390
164k
    copy = 0;
391
392
    /* encode the match */
393
164k
    if (distance < MAX_DISTANCE) {
394
164k
      if (len >= 7) {
395
33.3k
        oc += ((len - 7) / 255) + 1;
396
33.3k
      }
397
164k
      oc += 2;
398
164k
    }
399
0
    else {
400
      /* far away, but not yet in another galaxy... */
401
0
      if (len >= 7) {
402
0
        oc += ((len - 7) / 255) + 1;
403
0
      }
404
0
      oc += 4;
405
0
    }
406
407
    /* update the hash at match boundary */
408
164k
    seq = BLOSCLZ_READU32(ip);
409
164k
    HASH_FUNCTION(hval, seq, HASH_LOG2)
410
164k
    htab[hval] = (uint16_t)(ip++ - ibase);
411
164k
    ip++;
412
    /* assuming literal copy */
413
164k
    oc++;
414
164k
  }
415
416
7.60k
  double ic = (double)(ip - ibase);
417
7.60k
  return ic / (double)oc;
418
7.60k
}
419
420
421
int blosclz_compress(const int clevel, const void* input, int length,
422
7.60k
                     void* output, int maxout, const int split_block) {
423
7.60k
  uint8_t* ibase = (uint8_t*)input;
424
425
  // Experiments say that checking 1/4 of the buffer is enough to figure out approx cratio
426
7.60k
  int maxlen = length / 4;
427
  // Start probing somewhere inside the buffer
428
7.60k
  int shift = length - maxlen;
429
  // Actual entropy probing!
430
7.60k
  double cratio = get_cratio(ibase + shift, maxlen, 3, 3);
431
  // discard probes with small compression ratios (too expensive)
432
7.60k
  double cratio_[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};
433
7.60k
  if (cratio < cratio_[clevel]) {
434
805
      goto out;
435
805
  }
436
437
  /* When we go back in a match (shift), we obtain quite different compression properties.
438
   * It looks like 4 is more useful in combination with bitshuffle and small typesizes
439
   * Fallback to 4 because it provides more consistent results for large cratios.
440
   *
441
   * In this block we also check cratios for the beginning of the buffers and
442
   * eventually discard those that are small (take too long to decompress).
443
   * This process is called _entropy probing_.
444
   */
445
6.79k
  unsigned ipshift = 4;
446
  // Compute optimal shift and minimum lengths for encoding
447
  // Use 4 by default, except for low entropy data, where we should do a best effort
448
6.79k
  unsigned minlen = 4;
449
  // BloscLZ works better with splits mostly, so when data is not split, do a best effort
450
  // Why using cratio < 4 is based in experiments with low and high entropy
451
6.79k
  if (!split_block || cratio < 4) {
452
6.61k
    ipshift = 3;
453
6.61k
    minlen = 3;
454
6.61k
  }
455
178
  else {
456
178
    minlen = 4;
457
178
  }
458
459
6.79k
  uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
460
6.79k
                          HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
461
6.79k
  uint8_t hashlog = hashlog_[clevel];
462
463
6.79k
  uint8_t* ip = ibase;
464
6.79k
  uint8_t* ip_bound = ibase + length - 1;
465
6.79k
  uint8_t* ip_limit = ibase + length - 12;
466
6.79k
  uint8_t* op = (uint8_t*)output;
467
6.79k
  const uint8_t* op_limit = op + maxout;
468
6.79k
  uint32_t seq;
469
6.79k
  uint8_t copy;
470
6.79k
  uint32_t hval;
471
472
  /* input and output buffer cannot be less than 16 and 66 bytes or we can get into trouble */
473
6.79k
  if (length < 16 || maxout < 66) {
474
8
    return 0;
475
8
  }
476
477
  // Initialize the hash table
478
6.78k
  uint32_t htab[1U << (uint8_t)HASH_LOG];
479
6.78k
  memset(htab, 0, (1U << hashlog) * sizeof(uint32_t));
480
481
  /* we start with literal copy */
482
6.78k
  copy = 4;
483
6.78k
  *op++ = MAX_COPY - 1;
484
6.78k
  *op++ = *ip++;
485
6.78k
  *op++ = *ip++;
486
6.78k
  *op++ = *ip++;
487
6.78k
  *op++ = *ip++;
488
489
  /* main loop */
490
21.0M
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
491
21.0M
    const uint8_t* ref;
492
21.0M
    unsigned distance;
493
21.0M
    uint8_t* anchor = ip;    /* comparison starting-point */
494
495
    /* find potential match */
496
21.0M
    seq = BLOSCLZ_READU32(ip);
497
21.0M
    HASH_FUNCTION(hval, seq, hashlog)
498
21.0M
    ref = ibase + htab[hval];
499
500
    /* calculate distance to the match */
501
21.0M
    distance = (unsigned int)(anchor - ref);
502
503
    /* update hash table */
504
21.0M
    htab[hval] = (uint32_t) (anchor - ibase);
505
506
21.0M
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
507
0
      LITERAL(ip, op, op_limit, anchor, copy)
508
0
      continue;
509
0
    }
510
511
    /* is this a match? check the first 4 bytes */
512
21.0M
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
513
953k
      ref += 4;
514
20.1M
    } else {
515
      /* no luck, copy as a literal */
516
20.1M
      LITERAL(ip, op, op_limit, anchor, copy)
517
0
      continue;
518
20.1M
    }
519
520
    /* last matched byte */
521
953k
    ip = anchor + 4;
522
523
    /* distance is biased */
524
953k
    distance--;
525
526
    /* get runs or matches; zero distance means a run */
527
953k
    ip = get_run_or_match(ip, ip_bound, ref, !distance);
528
529
    /* length is biased, '1' means a match of 3 bytes */
530
953k
    ip -= ipshift;
531
532
953k
    unsigned len = (int)(ip - anchor);
533
534
    // Encoding short lengths is expensive during decompression
535
953k
    if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
536
506k
      LITERAL(ip, op, op_limit, anchor, copy)
537
0
      continue;
538
506k
    }
539
540
    /* if we have copied something, adjust the copy count */
541
446k
    if (copy)
542
      /* copy is biased, '0' means 1 byte copy */
543
300k
      *(op - copy - 1) = (uint8_t)(copy - 1);
544
146k
    else
545
      /* back, to overwrite the copy count */
546
146k
      op--;
547
    /* reset literal counter */
548
446k
    copy = 0;
549
550
    /* encode the match */
551
446k
    if (distance < MAX_DISTANCE) {
552
441k
      if (len < 7) {
553
322k
        MATCH_SHORT(op, op_limit, len, distance)
554
322k
      } else {
555
357k
        MATCH_LONG(op, op_limit, len, distance)
556
357k
      }
557
441k
    } else {
558
      /* far away, but not yet in another galaxy... */
559
4.90k
      distance -= MAX_DISTANCE;
560
4.90k
      if (len < 7) {
561
556
        MATCH_SHORT_FAR(op, op_limit, len, distance)
562
4.35k
      } else {
563
13.0k
        MATCH_LONG_FAR(op, op_limit, len, distance)
564
13.0k
      }
565
4.90k
    }
566
567
    /* update the hash at match boundary */
568
446k
    seq = BLOSCLZ_READU32(ip);
569
446k
    HASH_FUNCTION(hval, seq, hashlog)
570
446k
    htab[hval] = (uint32_t) (ip++ - ibase);
571
446k
    if (clevel == 9) {
572
      // In some situations, including a second hash proves to be useful,
573
      // but not in others.  Activating here in max clevel only.
574
407k
      seq >>= 8U;
575
407k
      HASH_FUNCTION(hval, seq, hashlog)
576
407k
      htab[hval] = (uint32_t) (ip++ - ibase);
577
407k
    }
578
38.6k
    else {
579
38.6k
      ip++;
580
38.6k
    }
581
582
446k
    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
583
235
      goto out;
584
585
    /* assuming literal copy */
586
446k
    *op++ = MAX_COPY - 1;
587
446k
  }
588
589
  /* left-over as literal copy */
590
24.8k
  while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) {
591
21.7k
    if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out;
592
21.5k
    *op++ = *ip++;
593
21.5k
    copy++;
594
21.5k
    if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {
595
529
      copy = 0;
596
529
      *op++ = MAX_COPY - 1;
597
529
    }
598
21.5k
  }
599
600
  /* if we have copied something, adjust the copy length */
601
3.05k
  if (copy)
602
2.70k
    *(op - copy - 1) = (uint8_t)(copy - 1);
603
353
  else
604
353
    op--;
605
606
  /* marker for blosclz */
607
3.05k
  *(uint8_t*)output |= (1U << 5U);
608
609
3.05k
  return (int)(op - (uint8_t*)output);
610
611
4.53k
  out:
612
4.53k
  return 0;
613
3.25k
}
614
615
// See https://habr.com/en/company/yandex/blog/457612/
616
#if defined(__AVX2__)
617
618
#if defined(_MSC_VER)
619
#define ALIGNED_(x) __declspec(align(x))
620
#else
621
#if defined(__GNUC__)
622
#define ALIGNED_(x) __attribute__ ((aligned(x)))
623
#endif
624
#endif
625
#define ALIGNED_TYPE_(t, x) t ALIGNED_(x)
626
627
static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
628
{
629
  size_t offset = op - match;
630
  while (len >= 16) {
631
632
    static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
633
      {
634
                0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, // offset = 0, not used as mask, but for shift
635
                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // offset = 1
636
                0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
637
                0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
638
                0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
639
                0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
640
                0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
641
                0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
642
                0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
643
                0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
644
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
645
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
646
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
647
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
648
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
649
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
650
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  15, // offset = 16
651
      };
652
653
    _mm_storeu_si128((__m128i *)(op),
654
                     _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
655
                                      _mm_load_si128((const __m128i *)(masks) + offset)));
656
657
    match += masks[offset];
658
659
    op += 16;
660
    len -= 16;
661
  }
662
  // Deal with remainders
663
  for (; len > 0; len--) {
664
    *op++ = *match++;
665
  }
666
  return op;
667
}
668
#endif
669
670
// LZ4 wildCopy which can reach excellent copy bandwidth (even if insecure)
671
8.72k
static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) {
672
8.72k
  uint8_t* d = out;
673
8.72k
  const uint8_t* s = from;
674
8.72k
  uint8_t* const e = end;
675
676
11.4k
  do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
677
8.72k
}
678
679
363
int blosclz_decompress(const void* input, int length, void* output, int maxout) {
680
363
  const uint8_t* ip = (const uint8_t*)input;
681
363
  const uint8_t* ip_limit = ip + length;
682
363
  uint8_t* op = (uint8_t*)output;
683
363
  uint32_t ctrl;
684
363
  uint8_t* op_limit = op + maxout;
685
363
  if (BLOSCLZ_UNLIKELY(length == 0)) {
686
0
    return 0;
687
0
  }
688
363
  ctrl = (*ip++) & 31U;
689
690
62.0k
  while (1) {
691
62.0k
    if (ctrl >= 32) {
692
      // match
693
12.0k
      int32_t len = (int32_t)(ctrl >> 5U) - 1 ;
694
12.0k
      int32_t ofs = (int32_t)(ctrl & 31U) << 8U;
695
12.0k
      uint8_t code;
696
12.0k
      const uint8_t* ref = op - ofs;
697
698
12.0k
      if (len == 7 - 1) {
699
2.04k
        do {
700
2.04k
          if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
701
0
            return 0;
702
0
          }
703
2.04k
          code = *ip++;
704
2.04k
          len += code;
705
2.04k
        } while (code == 255);
706
2.03k
      }
707
10.0k
      else {
708
10.0k
        if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
709
0
          return 0;
710
0
        }
711
10.0k
      }
712
12.0k
      code = *ip++;
713
12.0k
      len += 3;
714
12.0k
      ref -= code;
715
716
      /* match from 16-bit distance */
717
12.0k
      if (BLOSCLZ_UNLIKELY(code == 255)) {
718
1.53k
        if (ofs == (31U << 8U)) {
719
0
          if (ip + 1 >= ip_limit) {
720
0
            return 0;
721
0
          }
722
0
          ofs = (*ip++) << 8U;
723
0
          ofs += *ip++;
724
0
          ref = op - ofs - MAX_DISTANCE;
725
0
        }
726
1.53k
      }
727
728
12.0k
      if (BLOSCLZ_UNLIKELY(op + len > op_limit)) {
729
0
        return 0;
730
0
      }
731
732
12.0k
      if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) {
733
0
        return 0;
734
0
      }
735
736
12.0k
      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
737
12.0k
      ctrl = *ip++;
738
739
12.0k
      ref--;
740
12.0k
      if (ref == op - 1) {
741
        /* optimized copy for a run */
742
2.39k
        memset(op, *ref, len);
743
2.39k
        op += len;
744
2.39k
      }
745
9.64k
      else if ((op - ref >= 8) && (op_limit - op >= len + 8)) {
746
        // copy with an overlap not larger than 8
747
8.72k
        wild_copy(op, ref, op + len);
748
8.72k
        op += len;
749
8.72k
      }
750
920
      else {
751
        // general copy with any overlap
752
#if defined(__AVX2__)
753
        if (op - ref <= 16) {
754
          // This is not faster on a combination of compilers (clang, gcc, icc) or machines, but
755
          // it is not slower either.  Let's activate here for experimentation.
756
          op = copy_match_16(op, ref, len);
757
        }
758
        else {
759
#endif
760
920
          op = copy_match(op, ref, (unsigned) len);
761
#if defined(__AVX2__)
762
        }
763
#endif
764
920
      }
765
12.0k
    }
766
49.9k
    else {
767
      // literal
768
49.9k
      ctrl++;
769
49.9k
      if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) {
770
0
        return 0;
771
0
      }
772
49.9k
      if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) {
773
0
        return 0;
774
0
      }
775
776
49.9k
      memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
777
      // On GCC-6, fastcopy is still faster than plain memcpy
778
      // However, using recent CLANG/LLVM 9.0, there is almost no difference
779
      // in performance.
780
      // And starting on CLANG/LLVM 10 and GCC 9, memcpy is generally faster.
781
      // op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl;
782
783
49.9k
      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
784
49.6k
      ctrl = *ip++;
785
49.6k
    }
786
62.0k
  }
787
788
363
  return (int)(op - (uint8_t*)output);
789
363
}