Coverage Report

Created: 2026-01-17 06:55

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/c-blosc/blosc/blosclz.c
Line
Count
Source
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  The Blosc Developers <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
/*********************************************************************
12
  The code in this file is heavily based on FastLZ, a lightning-fast
13
  lossless compression library.  See LICENSES/FASTLZ.txt for details.
14
**********************************************************************/
15
16
17
#include <stdio.h>
18
#include <stdbool.h>
19
#include "blosclz.h"
20
#include "fastcopy.h"
21
#include "blosc-common.h"
22
23
24
/*
25
 * Give hints to the compiler for branch prediction optimization.
26
 */
27
#if defined(__GNUC__) && (__GNUC__ > 2)
28
16.9M
#define BLOSCLZ_LIKELY(c)    (__builtin_expect((c), 1))
29
52.8M
#define BLOSCLZ_UNLIKELY(c)  (__builtin_expect((c), 0))
30
#else
31
#define BLOSCLZ_LIKELY(c)    (c)
32
#define BLOSCLZ_UNLIKELY(c)  (c)
33
#endif
34
35
/*
36
 * Use inlined functions for supported systems.
37
 */
38
#if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
39
#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
40
#endif
41
42
868k
#define MAX_COPY 32U
43
18.0M
#define MAX_DISTANCE 8191
44
16.9M
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)
45
46
#ifdef BLOSC_STRICT_ALIGN
47
  #define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
48
  #define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
49
#else
50
  #define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
51
24.1M
  #define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
52
#endif
53
54
46.9k
#define HASH_LOG (14U)
55
5.88k
#define HASH_LOG2 (12U)
56
57
// This is used in LZ4 and seems to work pretty well here too
58
18.0M
#define HASH_FUNCTION(v, s, h) {      \
59
18.0M
  (v) = ((s) * 2654435761U) >> (32U - (h)); \
60
18.0M
}
61
62
63
#if defined(__AVX2__)
64
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
65
    uint8_t x = ip[-1];
66
67
    while (ip < (ip_bound - (sizeof(__m256i)))) {
68
        __m256i value, value2, cmp;
69
        /* Broadcast the value for every byte in a 256-bit register */
70
        memset(&value, x, sizeof(__m256i));
71
        value2 = _mm256_loadu_si256((__m256i *)ref);
72
        cmp = _mm256_cmpeq_epi64(value, value2);
73
        if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
74
            /* Return the byte that starts to differ */
75
            while (*ref++ == x) ip++;
76
            return ip;
77
        }
78
        else {
79
            ip += sizeof(__m256i);
80
            ref += sizeof(__m256i);
81
        }
82
    }
83
    /* Look into the remainder */
84
    while ((ip < ip_bound) && (*ref++ == x)) ip++;
85
    return ip;
86
}
87
#endif
88
89
#if defined(__SSE2__)
90
0
uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
91
0
  uint8_t x = ip[-1];
92
93
0
  while (ip < (ip_bound - sizeof(__m128i))) {
94
0
    __m128i value, value2, cmp;
95
    /* Broadcast the value for every byte in a 128-bit register */
96
0
    memset(&value, x, sizeof(__m128i));
97
0
    value2 = _mm_loadu_si128((__m128i *)ref);
98
0
    cmp = _mm_cmpeq_epi32(value, value2);
99
0
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
100
      /* Return the byte that starts to differ */
101
0
      while (*ref++ == x) ip++;
102
0
      return ip;
103
0
    }
104
0
    else {
105
0
      ip += sizeof(__m128i);
106
0
      ref += sizeof(__m128i);
107
0
    }
108
0
  }
109
  /* Look into the remainder */
110
0
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
111
0
  return ip;
112
0
}
113
114
#endif
115
116
117
111k
static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
118
111k
  uint8_t x = ip[-1];
119
111k
  int64_t value, value2;
120
  /* Broadcast the value for every byte in a 64-bit register */
121
111k
  memset(&value, x, 8);
122
  /* safe because the outer check against ip limit */
123
381k
  while (ip < (ip_bound - sizeof(int64_t))) {
124
#if defined(BLOSC_STRICT_ALIGN)
125
    memcpy(&value2, ref, 8);
126
#else
127
379k
    value2 = ((int64_t*)ref)[0];
128
379k
#endif
129
379k
    if (value != value2) {
130
      /* Return the byte that starts to differ */
131
371k
      while (*ref++ == x) ip++;
132
110k
      return ip;
133
110k
    }
134
269k
    else {
135
269k
      ip += 8;
136
269k
      ref += 8;
137
269k
    }
138
379k
  }
139
  /* Look into the remainder */
140
6.25k
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
141
1.38k
  return ip;
142
111k
}
143
144
145
/* Return the byte that starts to differ */
146
0
uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
147
0
#if !defined(BLOSC_STRICT_ALIGN)
148
0
  while (ip < (ip_bound - sizeof(int64_t))) {
149
0
    if (*(int64_t*)ref != *(int64_t*)ip) {
150
      /* Return the byte that starts to differ */
151
0
      while (*ref++ == *ip++) {}
152
0
      return ip;
153
0
    }
154
0
    else {
155
0
      ip += sizeof(int64_t);
156
0
      ref += sizeof(int64_t);
157
0
    }
158
0
  }
159
0
#endif
160
  /* Look into the remainder */
161
0
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
162
0
  return ip;
163
0
}
164
165
166
#if defined(__SSE2__)
167
1.15M
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
168
1.15M
  __m128i value, value2, cmp;
169
170
2.01M
  while (ip < (ip_bound - sizeof(__m128i))) {
171
2.00M
    value = _mm_loadu_si128((__m128i *) ip);
172
2.00M
    value2 = _mm_loadu_si128((__m128i *) ref);
173
2.00M
    cmp = _mm_cmpeq_epi32(value, value2);
174
2.00M
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
175
      /* Return the byte that starts to differ */
176
3.43M
      while (*ref++ == *ip++) {}
177
1.14M
      return ip;
178
1.14M
    }
179
857k
    else {
180
857k
      ip += sizeof(__m128i);
181
857k
      ref += sizeof(__m128i);
182
857k
    }
183
2.00M
  }
184
  /* Look into the remainder */
185
38.3k
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
186
7.46k
  return ip;
187
1.15M
}
188
#endif
189
190
191
#if defined(__AVX2__)
192
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
193
194
  while (ip < (ip_bound - sizeof(__m256i))) {
195
    __m256i value, value2, cmp;
196
    value = _mm256_loadu_si256((__m256i *) ip);
197
    value2 = _mm256_loadu_si256((__m256i *)ref);
198
    cmp = _mm256_cmpeq_epi64(value, value2);
199
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
200
      /* Return the byte that starts to differ */
201
      while (*ref++ == *ip++) {}
202
      return ip;
203
    }
204
    else {
205
      ip += sizeof(__m256i);
206
      ref += sizeof(__m256i);
207
    }
208
  }
209
  /* Look into the remainder */
210
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
211
  return ip;
212
}
213
#endif
214
215
216
1.26M
static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
217
1.26M
  if (BLOSCLZ_UNLIKELY(run)) {
218
#if defined(__AVX2__)
219
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
220
    // ip = get_run_32(ip, ip_bound, ref);
221
    ip = get_run(ip, ip_bound, ref);
222
#elif defined(__SSE2__)
223
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
224
    // ip = get_run_16(ip, ip_bound, ref);
225
111k
    ip = get_run(ip, ip_bound, ref);
226
#else
227
    ip = get_run(ip, ip_bound, ref);
228
#endif
229
111k
  }
230
1.15M
  else {
231
#if defined(__AVX2__)
232
    // Extensive experiments on AMD Ryzen3 say that regular get_match_16 is faster
233
    // ip = get_match_32(ip, ip_bound, ref);
234
    ip = get_match_16(ip, ip_bound, ref);
235
#elif defined(__SSE2__)
236
    ip = get_match_16(ip, ip_bound, ref);
237
#else
238
    ip = get_match(ip, ip_bound, ref);
239
#endif
240
1.15M
  }
241
242
1.26M
  return ip;
243
1.26M
}
244
245
246
13.2M
#define LITERAL(ip, op, op_limit, anchor, copy) {       \
247
13.2M
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
248
13.2M
    goto out;                                           \
249
13.2M
  *(op)++ = *(anchor)++;                                \
250
13.2M
  (ip) = (anchor);                                      \
251
13.2M
  (copy)++;                                             \
252
13.2M
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
253
332k
    (copy) = 0;                                         \
254
332k
    *(op)++ = MAX_COPY-1;                               \
255
332k
  }                                                     \
256
13.2M
}
257
258
3.04M
#define LITERAL2(ip, anchor, copy) {                    \
259
3.04M
  oc++; (anchor)++;                                     \
260
3.04M
  (ip) = (anchor);                                      \
261
3.04M
  (copy)++;                                             \
262
3.04M
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
263
61.6k
    (copy) = 0;                                         \
264
61.6k
    oc++;                                               \
265
61.6k
  }                                                     \
266
3.04M
}
267
268
363k
#define MATCH_SHORT(op, op_limit, len, distance) {        \
269
363k
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))            \
270
363k
    goto out;                                             \
271
363k
  *(op)++ = (uint8_t)(((len) << 5U) + ((distance) >> 8U));\
272
363k
  *(op)++ = (uint8_t)(((distance) & 255U));               \
273
363k
}
274
275
159k
#define MATCH_LONG(op, op_limit, len, distance) {       \
276
159k
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))          \
277
159k
    goto out;                                           \
278
159k
  *(op)++ = (uint8_t)((7U << 5U) + ((distance) >> 8U)); \
279
193k
  for ((len) -= 7; (len) >= 255; (len) -= 255) {        \
280
33.6k
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))        \
281
33.6k
      goto out;                                         \
282
33.6k
    *(op)++ = 255;                                      \
283
33.6k
  }                                                     \
284
159k
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
285
159k
    goto out;                                           \
286
159k
  *(op)++ = (uint8_t)(len);                             \
287
159k
  *(op)++ = (uint8_t)(((distance) & 255U));             \
288
159k
}
289
290
1.52k
#define MATCH_SHORT_FAR(op, op_limit, len, distance) {      \
291
1.52k
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
292
1.52k
    goto out;                                               \
293
1.52k
  *(op)++ = (uint8_t)(((len) << 5U) + 31);                  \
294
1.52k
  *(op)++ = 255;                                            \
295
1.52k
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
296
1.52k
  *(op)++ = (uint8_t)((distance) & 255U);                   \
297
1.52k
}
298
299
5.82k
#define MATCH_LONG_FAR(op, op_limit, len, distance) {       \
300
5.82k
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))              \
301
5.82k
    goto out;                                               \
302
5.82k
  *(op)++ = (7U << 5U) + 31;                                \
303
13.5k
  for ((len) -= 7; (len) >= 255; (len) -= 255) {            \
304
7.69k
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))            \
305
7.69k
      goto out;                                             \
306
7.69k
    *(op)++ = 255;                                          \
307
7.69k
  }                                                         \
308
5.82k
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
309
5.82k
    goto out;                                               \
310
5.82k
  *(op)++ = (uint8_t)(len);                                 \
311
5.82k
  *(op)++ = 255;                                            \
312
5.82k
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
313
5.82k
  *(op)++ = (uint8_t)((distance) & 255U);                   \
314
5.82k
}
315
316
317
// Get a guess for the compressed size of a buffer
318
5.88k
static double get_cratio(uint8_t* ibase, int maxlen, int minlen, int ipshift) {
319
5.88k
  uint8_t* ip = ibase;
320
5.88k
  int32_t oc = 0;
321
5.88k
  const uint16_t hashlen = (1U << (uint8_t)HASH_LOG2);
322
5.88k
  uint16_t htab[1U << (uint8_t)HASH_LOG2];
323
5.88k
  uint32_t hval;
324
5.88k
  uint32_t seq;
325
5.88k
  uint8_t copy;
326
  // Make a tradeoff between testing too much and too little
327
5.88k
  uint16_t limit = (maxlen > hashlen) ? hashlen : maxlen;
328
5.88k
  uint8_t* ip_bound = ibase + limit - 1;
329
5.88k
  uint8_t* ip_limit = ibase + limit - 12;
330
331
  // Initialize the hash table to distances of 0
332
5.88k
  memset(htab, 0, hashlen * sizeof(uint16_t));
333
334
  /* we start with literal copy */
335
5.88k
  copy = 4;
336
5.88k
  oc += 5;
337
338
  /* main loop */
339
3.24M
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
340
3.23M
    const uint8_t* ref;
341
3.23M
    unsigned distance;
342
3.23M
    uint8_t* anchor = ip;    /* comparison starting-point */
343
344
    /* find potential match */
345
3.23M
    seq = BLOSCLZ_READU32(ip);
346
3.23M
    HASH_FUNCTION(hval, seq, HASH_LOG2)
347
3.23M
    ref = ibase + htab[hval];
348
349
    /* calculate distance to the match */
350
3.23M
    distance = (unsigned int)(anchor - ref);
351
352
    /* update hash table */
353
3.23M
    htab[hval] = (uint16_t) (anchor - ibase);
354
355
3.23M
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
356
5.81k
      LITERAL2(ip, anchor, copy)
357
5.81k
      continue;
358
5.81k
    }
359
360
    /* is this a match? check the first 4 bytes */
361
3.23M
    if (BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip)) {
362
342k
      ref += 4;
363
342k
    }
364
2.89M
    else {
365
      /* no luck, copy as a literal */
366
2.89M
      LITERAL2(ip, anchor, copy)
367
2.89M
      continue;
368
2.89M
    }
369
370
    /* last matched byte */
371
342k
    ip = anchor + 4;
372
373
    /* distance is biased */
374
342k
    distance--;
375
376
    /* get runs or matches; zero distance means a run */
377
342k
    ip = get_run_or_match(ip, ip_bound, ref, !distance);
378
379
342k
    ip -= ipshift;
380
342k
    int len = (int)(ip - anchor);
381
342k
    if (len < minlen) {
382
153k
      LITERAL2(ip, anchor, copy)
383
153k
      continue;
384
153k
    }
385
386
    /* if we haven't copied anything, adjust the output counter */
387
188k
    if (!copy)
388
47.3k
      oc--;
389
    /* reset literal counter */
390
188k
    copy = 0;
391
392
    /* encode the match */
393
188k
    if (distance < MAX_DISTANCE) {
394
188k
      if (len >= 7) {
395
47.0k
        oc += ((len - 7) / 255) + 1;
396
47.0k
      }
397
188k
      oc += 2;
398
188k
    }
399
0
    else {
400
      /* far away, but not yet in the another galaxy... */
401
0
      if (len >= 7) {
402
0
        oc += ((len - 7) / 255) + 1;
403
0
      }
404
0
      oc += 4;
405
0
    }
406
407
    /* update the hash at match boundary */
408
188k
    seq = BLOSCLZ_READU32(ip);
409
188k
    HASH_FUNCTION(hval, seq, HASH_LOG2)
410
188k
    htab[hval] = (uint16_t)(ip++ - ibase);
411
188k
    ip++;
412
    /* assuming literal copy */
413
188k
    oc++;
414
188k
  }
415
416
5.88k
  double ic = (double)(ip - ibase);
417
5.88k
  return ic / (double)oc;
418
5.88k
}
419
420
421
int blosclz_compress(const int clevel, const void* input, int length,
422
5.88k
                     void* output, int maxout, const int split_block) {
423
5.88k
  uint8_t* ibase = (uint8_t*)input;
424
425
  // Experiments say that checking 1/4 of the buffer is enough to figure out approx cratio
426
5.88k
  int maxlen = length / 4;
427
  // Start probing somewhere inside the buffer
428
5.88k
  int shift = length - maxlen;
429
  // Actual entropy probing!
430
5.88k
  double cratio = get_cratio(ibase + shift, maxlen, 3, 3);
431
  // discard probes with small compression ratios (too expensive)
432
5.88k
  double cratio_[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};
433
5.88k
  if (cratio < cratio_[clevel]) {
434
659
      goto out;
435
659
  }
436
437
  /* When we go back in a match (shift), we obtain quite different compression properties.
438
   * It looks like 4 is more useful in combination with bitshuffle and small typesizes
439
   * Fallback to 4 because it provides more consistent results for large cratios.
440
   *
441
   * In this block we also check cratios for the beginning of the buffers and
442
   * eventually discard those that are small (take too long to decompress).
443
   * This process is called _entropy probing_.
444
   */
445
5.22k
  unsigned ipshift = 4;
446
  // Compute optimal shift and minimum lengths for encoding
447
  // Use 4 by default, except for low entropy data, where we should do a best effort
448
5.22k
  unsigned minlen = 4;
449
  // BloscLZ works better with splits mostly, so when data is not split, do a best effort
450
  // Why using cratio < 4 is based in experiments with low and high entropy
451
5.22k
  if (!split_block || cratio < 4) {
452
5.04k
    ipshift = 3;
453
5.04k
    minlen = 3;
454
5.04k
  }
455
174
  else {
456
174
    minlen = 4;
457
174
  }
458
459
5.22k
  uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
460
5.22k
                          HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
461
5.22k
  uint8_t hashlog = hashlog_[clevel];
462
463
5.22k
  uint8_t* ip = ibase;
464
5.22k
  uint8_t* ip_bound = ibase + length - 1;
465
5.22k
  uint8_t* ip_limit = ibase + length - 12;
466
5.22k
  uint8_t* op = (uint8_t*)output;
467
5.22k
  const uint8_t* op_limit = op + maxout;
468
5.22k
  uint32_t seq;
469
5.22k
  uint8_t copy;
470
5.22k
  uint32_t hval;
471
472
  /* input and output buffer cannot be less than 16 and 66 bytes or we can get into trouble */
473
5.22k
  if (length < 16 || maxout < 66) {
474
8
    return 0;
475
8
  }
476
477
  // Initialize the hash table
478
5.21k
  uint32_t htab[1U << (uint8_t)HASH_LOG];
479
5.21k
  memset(htab, 0, (1U << hashlog) * sizeof(uint32_t));
480
481
  /* we start with literal copy */
482
5.21k
  copy = 4;
483
5.21k
  *op++ = MAX_COPY - 1;
484
5.21k
  *op++ = *ip++;
485
5.21k
  *op++ = *ip++;
486
5.21k
  *op++ = *ip++;
487
5.21k
  *op++ = *ip++;
488
489
  /* main loop */
490
13.7M
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
491
13.7M
    const uint8_t* ref;
492
13.7M
    unsigned distance;
493
13.7M
    uint8_t* anchor = ip;    /* comparison starting-point */
494
495
    /* find potential match */
496
13.7M
    seq = BLOSCLZ_READU32(ip);
497
13.7M
    HASH_FUNCTION(hval, seq, hashlog)
498
13.7M
    ref = ibase + htab[hval];
499
500
    /* calculate distance to the match */
501
13.7M
    distance = (unsigned int)(anchor - ref);
502
503
    /* update hash table */
504
13.7M
    htab[hval] = (uint32_t) (anchor - ibase);
505
506
13.7M
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
507
0
      LITERAL(ip, op, op_limit, anchor, copy)
508
0
      continue;
509
0
    }
510
511
    /* is this a match? check the first 4 bytes */
512
13.7M
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
513
923k
      ref += 4;
514
12.8M
    } else {
515
      /* no luck, copy as a literal */
516
12.8M
      LITERAL(ip, op, op_limit, anchor, copy)
517
0
      continue;
518
12.8M
    }
519
520
    /* last matched byte */
521
923k
    ip = anchor + 4;
522
523
    /* distance is biased */
524
923k
    distance--;
525
526
    /* get runs or matches; zero distance means a run */
527
923k
    ip = get_run_or_match(ip, ip_bound, ref, !distance);
528
529
    /* length is biased, '1' means a match of 3 bytes */
530
923k
    ip -= ipshift;
531
532
923k
    unsigned len = (int)(ip - anchor);
533
534
    // Encoding short lengths is expensive during decompression
535
923k
    if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
536
393k
      LITERAL(ip, op, op_limit, anchor, copy)
537
0
      continue;
538
393k
    }
539
540
    /* if we have copied something, adjust the copy count */
541
530k
    if (copy)
542
      /* copy is biased, '0' means 1 byte copy */
543
365k
      *(op - copy - 1) = (uint8_t)(copy - 1);
544
164k
    else
545
      /* back, to overwrite the copy count */
546
164k
      op--;
547
    /* reset literal counter */
548
530k
    copy = 0;
549
550
    /* encode the match */
551
530k
    if (distance < MAX_DISTANCE) {
552
523k
      if (len < 7) {
553
363k
        MATCH_SHORT(op, op_limit, len, distance)
554
363k
      } else {
555
479k
        MATCH_LONG(op, op_limit, len, distance)
556
479k
      }
557
523k
    } else {
558
      /* far away, but not yet in the another galaxy... */
559
7.35k
      distance -= MAX_DISTANCE;
560
7.35k
      if (len < 7) {
561
1.52k
        MATCH_SHORT_FAR(op, op_limit, len, distance)
562
5.82k
      } else {
563
17.4k
        MATCH_LONG_FAR(op, op_limit, len, distance)
564
17.4k
      }
565
7.35k
    }
566
567
    /* update the hash at match boundary */
568
530k
    seq = BLOSCLZ_READU32(ip);
569
530k
    HASH_FUNCTION(hval, seq, hashlog)
570
530k
    htab[hval] = (uint32_t) (ip++ - ibase);
571
530k
    if (clevel == 9) {
572
      // In some situations, including a second hash proves to be useful,
573
      // but not in others.  Activating here in max clevel only.
574
363k
      seq >>= 8U;
575
363k
      HASH_FUNCTION(hval, seq, hashlog)
576
363k
      htab[hval] = (uint32_t) (ip++ - ibase);
577
363k
    }
578
167k
    else {
579
167k
      ip++;
580
167k
    }
581
582
530k
    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
583
95
      goto out;
584
585
    /* assuming literal copy */
586
530k
    *op++ = MAX_COPY - 1;
587
530k
  }
588
589
  /* left-over as literal copy */
590
26.4k
  while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) {
591
22.7k
    if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out;
592
22.6k
    *op++ = *ip++;
593
22.6k
    copy++;
594
22.6k
    if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {
595
200
      copy = 0;
596
200
      *op++ = MAX_COPY - 1;
597
200
    }
598
22.6k
  }
599
600
  /* if we have copied something, adjust the copy length */
601
3.68k
  if (copy)
602
3.63k
    *(op - copy - 1) = (uint8_t)(copy - 1);
603
47
  else
604
47
    op--;
605
606
  /* marker for blosclz */
607
3.68k
  *(uint8_t*)output |= (1U << 5U);
608
609
3.68k
  return (int)(op - (uint8_t*)output);
610
611
2.18k
  out:
612
2.18k
  return 0;
613
3.82k
}
614
615
// See https://habr.com/en/company/yandex/blog/457612/
616
#if defined(__AVX2__)
617
618
#if defined(_MSC_VER)
619
#define ALIGNED_(x) __declspec(align(x))
620
#else
621
#if defined(__GNUC__)
622
#define ALIGNED_(x) __attribute__ ((aligned(x)))
623
#endif
624
#endif
625
#define ALIGNED_TYPE_(t, x) t ALIGNED_(x)
626
627
static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
628
{
629
  size_t offset = op - match;
630
  while (len >= 16) {
631
632
    static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
633
      {
634
                0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, // offset = 0, not used as mask, but for shift
635
                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // offset = 1
636
                0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
637
                0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
638
                0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
639
                0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
640
                0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
641
                0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
642
                0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
643
                0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
644
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
645
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
646
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
647
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
648
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
649
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
650
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  15, // offset = 16
651
      };
652
653
    _mm_storeu_si128((__m128i *)(op),
654
                     _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
655
                                      _mm_load_si128((const __m128i *)(masks) + offset)));
656
657
    match += masks[offset];
658
659
    op += 16;
660
    len -= 16;
661
  }
662
  // Deal with remainders
663
  for (; len > 0; len--) {
664
    *op++ = *match++;
665
  }
666
  return op;
667
}
668
#endif
669
670
// LZ4 wildCopy which can reach excellent copy bandwidth (even if insecure)
671
114k
static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) {
672
114k
  uint8_t* d = out;
673
114k
  const uint8_t* s = from;
674
114k
  uint8_t* const e = end;
675
676
722k
  do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
677
114k
}
678
679
1.47k
int blosclz_decompress(const void* input, int length, void* output, int maxout) {
680
1.47k
  const uint8_t* ip = (const uint8_t*)input;
681
1.47k
  const uint8_t* ip_limit = ip + length;
682
1.47k
  uint8_t* op = (uint8_t*)output;
683
1.47k
  uint32_t ctrl;
684
1.47k
  uint8_t* op_limit = op + maxout;
685
1.47k
  if (BLOSCLZ_UNLIKELY(length == 0)) {
686
9
    return 0;
687
9
  }
688
1.46k
  ctrl = (*ip++) & 31U;
689
690
2.23M
  while (1) {
691
2.23M
    if (ctrl >= 32) {
692
      // match
693
166k
      int32_t len = (int32_t)(ctrl >> 5U) - 1 ;
694
166k
      int32_t ofs = (int32_t)(ctrl & 31U) << 8U;
695
166k
      uint8_t code;
696
166k
      const uint8_t* ref = op - ofs;
697
698
166k
      if (len == 7 - 1) {
699
64.3k
        do {
700
64.3k
          if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
701
28
            return 0;
702
28
          }
703
64.3k
          code = *ip++;
704
64.3k
          len += code;
705
64.3k
        } while (code == 255);
706
40.7k
      }
707
125k
      else {
708
125k
        if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
709
46
          return 0;
710
46
        }
711
125k
      }
712
166k
      code = *ip++;
713
166k
      len += 3;
714
166k
      ref -= code;
715
716
      /* match from 16-bit distance */
717
166k
      if (BLOSCLZ_UNLIKELY(code == 255)) {
718
3.93k
        if (ofs == (31U << 8U)) {
719
1.28k
          if (ip + 1 >= ip_limit) {
720
3
            return 0;
721
3
          }
722
1.28k
          ofs = (*ip++) << 8U;
723
1.28k
          ofs += *ip++;
724
1.28k
          ref = op - ofs - MAX_DISTANCE;
725
1.28k
        }
726
3.93k
      }
727
728
166k
      if (BLOSCLZ_UNLIKELY(op + len > op_limit)) {
729
151
        return 0;
730
151
      }
731
732
166k
      if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) {
733
84
        return 0;
734
84
      }
735
736
166k
      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
737
166k
      ctrl = *ip++;
738
739
166k
      ref--;
740
166k
      if (ref == op - 1) {
741
        /* optimized copy for a run */
742
14.5k
        memset(op, *ref, len);
743
14.5k
        op += len;
744
14.5k
      }
745
151k
      else if ((op - ref >= 8) && (op_limit - op >= len + 8)) {
746
        // copy with an overlap not larger than 8
747
114k
        wild_copy(op, ref, op + len);
748
114k
        op += len;
749
114k
      }
750
36.8k
      else {
751
        // general copy with any overlap
752
#if defined(__AVX2__)
753
        if (op - ref <= 16) {
754
          // This is not faster on a combination of compilers (clang, gcc, icc) or machines, but
755
          // it is not slower either.  Let's activate here for experimentation.
756
          op = copy_match_16(op, ref, len);
757
        }
758
        else {
759
#endif
760
36.8k
          op = copy_match(op, ref, (unsigned) len);
761
#if defined(__AVX2__)
762
        }
763
#endif
764
36.8k
      }
765
166k
    }
766
2.06M
    else {
767
      // literal
768
2.06M
      ctrl++;
769
2.06M
      if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) {
770
115
        return 0;
771
115
      }
772
2.06M
      if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) {
773
16
        return 0;
774
16
      }
775
776
2.06M
      memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
777
      // On GCC-6, fastcopy this is still faster than plain memcpy
778
      // However, using recent CLANG/LLVM 9.0, there is almost no difference
779
      // in performance.
780
      // And starting on CLANG/LLVM 10 and GCC 9, memcpy is generally faster.
781
      // op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl;
782
783
2.06M
      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
784
2.06M
      ctrl = *ip++;
785
2.06M
    }
786
2.23M
  }
787
788
1.02k
  return (int)(op - (uint8_t*)output);
789
1.46k
}