Coverage Report

Created: 2026-02-14 07:12

/src/c-blosc/blosc/blosclz.c
Line | Count | Source
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  The Blosc Developers <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
/*********************************************************************
12
  The code in this file is heavily based on FastLZ, a lightning-fast
13
  lossless compression library.  See LICENSES/FASTLZ.txt for details.
14
**********************************************************************/
15
16
17
#include <stdio.h>
18
#include <stdbool.h>
19
#include "blosclz.h"
20
#include "fastcopy.h"
21
#include "blosc-common.h"
22
23
24
/*
25
 * Give hints to the compiler for branch prediction optimization.
26
 */
27
#if defined(__GNUC__) && (__GNUC__ > 2)
28
0
#define BLOSCLZ_LIKELY(c)    (__builtin_expect((c), 1))
29
4.80M
#define BLOSCLZ_UNLIKELY(c)  (__builtin_expect((c), 0))
30
#else
31
#define BLOSCLZ_LIKELY(c)    (c)
32
#define BLOSCLZ_UNLIKELY(c)  (c)
33
#endif
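
A minimal sketch of what these hints buy (assuming GCC/Clang): the second argument of __builtin_expect is the expected value, so the compiler keeps the expected path as the fall-through and moves the other branch off the hot path. A fragment of the pattern used throughout this file:

/* a bounds check that almost never fires: expands to __builtin_expect(cond, 0) */
if (BLOSCLZ_UNLIKELY(op + 2 > op_limit))
  goto out;  /* cold path */
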
34
35
/*
36
 * Use inlined functions for supported systems.
37
 */
38
#if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
39
#define inline __inline  /* Visual C is not C99, but it supports a form of inline (__inline) */
40
#endif
41
42
0
#define MAX_COPY 32U
43
1.00k
#define MAX_DISTANCE 8191
44
0
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)
45
46
#ifdef BLOSC_STRICT_ALIGN
47
  #define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
48
  #define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
49
#else
50
  #define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
51
0
  #define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
52
#endif
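
A small sketch of why both variants agree, assuming a little-endian target (the byte-composition form exists for platforms where unaligned word loads trap):

const uint8_t p[4] = {0x78, 0x56, 0x34, 0x12};
uint32_t v_bytes = p[0] | p[1] << 8 | p[2] << 16 | p[3] << 24;  /* 0x12345678 */
/* uint32_t v_word = *(const uint32_t*)p;  same value, but needs unaligned loads */
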
53
54
0
#define HASH_LOG (14U)
55
0
#define HASH_LOG2 (12U)
56
57
// This is used in LZ4 and seems to work pretty well here too
58
0
#define HASH_FUNCTION(v, s, h) {      \
59
0
  (v) = ((s) * 2654435761U) >> (32U - (h)); \
60
0
}
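
A minimal sketch of the hash in use: 2654435761 is the Knuth multiplicative constant (about 2^32 divided by the golden ratio), and the right shift keeps only the top h bits, so hval always indexes within a 2^h-entry table. With HASH_LOG = 14 the shift is 32 - 14 = 18:

uint32_t seq = BLOSCLZ_READU32(ip);  /* next 4 input bytes */
uint32_t hval;
HASH_FUNCTION(hval, seq, HASH_LOG)   /* hval = (seq * 2654435761U) >> 18, in [0, 16384) */
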
61
62
63
#if defined(__AVX2__)
64
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
65
    uint8_t x = ip[-1];
66
67
    while (ip < (ip_bound - (sizeof(__m256i)))) {
68
        __m256i value, value2, cmp;
69
        /* Broadcast the value for every byte in a 256-bit register */
70
        memset(&value, x, sizeof(__m256i));
71
        value2 = _mm256_loadu_si256((__m256i *)ref);
72
        cmp = _mm256_cmpeq_epi64(value, value2);
73
        if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
74
            /* Return the byte that starts to differ */
75
            while (*ref++ == x) ip++;
76
            return ip;
77
        }
78
        else {
79
            ip += sizeof(__m256i);
80
            ref += sizeof(__m256i);
81
        }
82
    }
83
    /* Look into the remainder */
84
    while ((ip < ip_bound) && (*ref++ == x)) ip++;
85
    return ip;
86
}
87
#endif
88
89
#if defined(__SSE2__)
90
0
uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
91
0
  uint8_t x = ip[-1];
92
93
0
  while (ip < (ip_bound - sizeof(__m128i))) {
94
0
    __m128i value, value2, cmp;
95
    /* Broadcast the value for every byte in a 128-bit register */
96
0
    memset(&value, x, sizeof(__m128i));
97
0
    value2 = _mm_loadu_si128((__m128i *)ref);
98
0
    cmp = _mm_cmpeq_epi32(value, value2);
99
0
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
100
      /* Return the byte that starts to differ */
101
0
      while (*ref++ == x) ip++;
102
0
      return ip;
103
0
    }
104
0
    else {
105
0
      ip += sizeof(__m128i);
106
0
      ref += sizeof(__m128i);
107
0
    }
108
0
  }
109
  /* Look into the remainder */
110
0
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
111
0
  return ip;
112
0
}
113
114
#endif
115
116
117
0
static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
118
0
  uint8_t x = ip[-1];
119
0
  int64_t value, value2;
120
  /* Broadcast the value for every byte in a 64-bit register */
121
0
  memset(&value, x, 8);
122
  /* safe because of the outer check against the ip limit */
123
0
  while (ip < (ip_bound - sizeof(int64_t))) {
124
#if defined(BLOSC_STRICT_ALIGN)
125
    memcpy(&value2, ref, 8);
126
#else
127
0
    value2 = ((int64_t*)ref)[0];
128
0
#endif
129
0
    if (value != value2) {
130
      /* Return the byte that starts to differ */
131
0
      while (*ref++ == x) ip++;
132
0
      return ip;
133
0
    }
134
0
    else {
135
0
      ip += 8;
136
0
      ref += 8;
137
0
    }
138
0
  }
139
  /* Look into the remainder */
140
0
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
141
0
  return ip;
142
0
}
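
A hypothetical driver for the scalar path (get_run is static, so this only illustrates the contract): for an unbroken run, ip advances to ip_bound; when the run breaks, ip may overshoot the first differing byte by design, and callers compensate via ipshift.

uint8_t buf[64];
memset(buf, 'a', sizeof(buf));
uint8_t *end = get_run(buf + 1, buf + sizeof(buf) - 1, buf);  /* x = buf[0] = 'a' */
/* end == buf + 63: the run never breaks before ip_bound */
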
143
144
145
/* Return the byte that starts to differ */
146
0
uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
147
0
#if !defined(BLOSC_STRICT_ALIGN)
148
0
  while (ip < (ip_bound - sizeof(int64_t))) {
149
0
    if (*(int64_t*)ref != *(int64_t*)ip) {
150
      /* Return the byte that starts to differ */
151
0
      while (*ref++ == *ip++) {}
152
0
      return ip;
153
0
    }
154
0
    else {
155
0
      ip += sizeof(int64_t);
156
0
      ref += sizeof(int64_t);
157
0
    }
158
0
  }
159
0
#endif
160
  /* Look into the remainder */
161
0
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
162
0
  return ip;
163
0
}
164
165
166
#if defined(__SSE2__)
167
0
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
168
0
  __m128i value, value2, cmp;
169
170
0
  while (ip < (ip_bound - sizeof(__m128i))) {
171
0
    value = _mm_loadu_si128((__m128i *) ip);
172
0
    value2 = _mm_loadu_si128((__m128i *) ref);
173
0
    cmp = _mm_cmpeq_epi32(value, value2);
174
0
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
175
      /* Return the byte that starts to differ */
176
0
      while (*ref++ == *ip++) {}
177
0
      return ip;
178
0
    }
179
0
    else {
180
0
      ip += sizeof(__m128i);
181
0
      ref += sizeof(__m128i);
182
0
    }
183
0
  }
184
  /* Look into the remainder */
185
0
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
186
0
  return ip;
187
0
}
188
#endif
189
190
191
#if defined(__AVX2__)
192
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
193
194
  while (ip < (ip_bound - sizeof(__m256i))) {
195
    __m256i value, value2, cmp;
196
    value = _mm256_loadu_si256((__m256i *) ip);
197
    value2 = _mm256_loadu_si256((__m256i *)ref);
198
    cmp = _mm256_cmpeq_epi64(value, value2);
199
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
200
      /* Return the byte that starts to differ */
201
      while (*ref++ == *ip++) {}
202
      return ip;
203
    }
204
    else {
205
      ip += sizeof(__m256i);
206
      ref += sizeof(__m256i);
207
    }
208
  }
209
  /* Look into the remainder */
210
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
211
  return ip;
212
}
213
#endif
214
215
216
0
static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
217
0
  if (BLOSCLZ_UNLIKELY(run)) {
218
#if defined(__AVX2__)
219
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
220
    // ip = get_run_32(ip, ip_bound, ref);
221
    ip = get_run(ip, ip_bound, ref);
222
#elif defined(__SSE2__)
223
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
224
    // ip = get_run_16(ip, ip_bound, ref);
225
0
    ip = get_run(ip, ip_bound, ref);
226
#else
227
    ip = get_run(ip, ip_bound, ref);
228
#endif
229
0
  }
230
0
  else {
231
#if defined(__AVX2__)
232
    // Extensive experiments on AMD Ryzen3 say that regular get_match_16 is faster
233
    // ip = get_match_32(ip, ip_bound, ref);
234
    ip = get_match_16(ip, ip_bound, ref);
235
#elif defined(__SSE2__)
236
    ip = get_match_16(ip, ip_bound, ref);
237
#else
238
    ip = get_match(ip, ip_bound, ref);
239
#endif
240
0
  }
241
242
0
  return ip;
243
0
}
244
245
246
0
#define LITERAL(ip, op, op_limit, anchor, copy) {       \
247
0
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
248
0
    goto out;                                           \
249
0
  *(op)++ = *(anchor)++;                                \
250
0
  (ip) = (anchor);                                      \
251
0
  (copy)++;                                             \
252
0
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
253
0
    (copy) = 0;                                         \
254
0
    *(op)++ = MAX_COPY-1;                               \
255
0
  }                                                     \
256
0
}
257
258
0
#define LITERAL2(ip, anchor, copy) {                    \
259
0
  oc++; (anchor)++;                                     \
260
0
  (ip) = (anchor);                                      \
261
0
  (copy)++;                                             \
262
0
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
263
0
    (copy) = 0;                                         \
264
0
    oc++;                                               \
265
0
  }                                                     \
266
0
}
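
For reference, the literal framing these two macros produce (and which the decompressor below undoes): a control byte c < 32 means "c + 1 literal bytes follow", which is why copy is written back as copy - 1 and a fresh MAX_COPY - 1 marker is emitted every 32 literals.

/* Worked example: the 5 literals 'h' 'e' 'l' 'l' 'o' are framed as
 *   0x04 'h' 'e' 'l' 'l' 'o'    (control 4 => 4 + 1 = 5 literal bytes) */
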
267
268
0
#define MATCH_SHORT(op, op_limit, len, distance) {        \
269
0
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))            \
270
0
    goto out;                                             \
271
0
  *(op)++ = (uint8_t)(((len) << 5U) + ((distance) >> 8U));\
272
0
  *(op)++ = (uint8_t)(((distance) & 255U));               \
273
0
}
274
275
0
#define MATCH_LONG(op, op_limit, len, distance) {       \
276
0
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))          \
277
0
    goto out;                                           \
278
0
  *(op)++ = (uint8_t)((7U << 5U) + ((distance) >> 8U)); \
279
0
  for ((len) -= 7; (len) >= 255; (len) -= 255) {        \
280
0
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))        \
281
0
      goto out;                                         \
282
0
    *(op)++ = 255;                                      \
283
0
  }                                                     \
284
0
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
285
0
    goto out;                                           \
286
0
  *(op)++ = (uint8_t)(len);                             \
287
0
  *(op)++ = (uint8_t)(((distance) & 255U));             \
288
0
}
289
290
0
#define MATCH_SHORT_FAR(op, op_limit, len, distance) {      \
291
0
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
292
0
    goto out;                                               \
293
0
  *(op)++ = (uint8_t)(((len) << 5U) + 31);                  \
294
0
  *(op)++ = 255;                                            \
295
0
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
296
0
  *(op)++ = (uint8_t)((distance) & 255U);                   \
297
0
}
298
299
0
#define MATCH_LONG_FAR(op, op_limit, len, distance) {       \
300
0
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))              \
301
0
    goto out;                                               \
302
0
  *(op)++ = (7U << 5U) + 31;                                \
303
0
  for ((len) -= 7; (len) >= 255; (len) -= 255) {            \
304
0
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))            \
305
0
      goto out;                                             \
306
0
    *(op)++ = 255;                                          \
307
0
  }                                                         \
308
0
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
309
0
    goto out;                                               \
310
0
  *(op)++ = (uint8_t)(len);                                 \
311
0
  *(op)++ = 255;                                            \
312
0
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
313
0
  *(op)++ = (uint8_t)((distance) & 255U);                   \
314
0
}
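
Taken together, the four macros above define the whole match-token layout. A worked example for the short path, with len = 4 and distance = 1000 (0x3E8):

/* MATCH_SHORT:  byte0 = (4 << 5) + (1000 >> 8) = 128 + 3 = 0x83
 *               byte1 = 1000 & 255             = 0xE8
 * MATCH_LONG (len >= 7): byte0 = (7 << 5) + hi(distance), then len - 7 is
 *   spread over 255-valued continuation bytes, then the remainder byte,
 *   then lo(distance).
 * The *_FAR variants flag distance >= MAX_DISTANCE with 31 in the low 5 bits
 * plus a 255 escape byte, followed by the rebased 16-bit distance. */
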
315
316
317
// Get a guess for the compressed size of a buffer
318
0
static double get_cratio(uint8_t* ibase, int maxlen, int minlen, int ipshift) {
319
0
  uint8_t* ip = ibase;
320
0
  int32_t oc = 0;
321
0
  const uint16_t hashlen = (1U << (uint8_t)HASH_LOG2);
322
0
  uint16_t htab[1U << (uint8_t)HASH_LOG2];
323
0
  uint32_t hval;
324
0
  uint32_t seq;
325
0
  uint8_t copy;
326
  // Make a tradeoff between testing too much and too little
327
0
  uint16_t limit = (maxlen > hashlen) ? hashlen : maxlen;
328
0
  uint8_t* ip_bound = ibase + limit - 1;
329
0
  uint8_t* ip_limit = ibase + limit - 12;
330
331
  // Initialize the hash table to distances of 0
332
0
  memset(htab, 0, hashlen * sizeof(uint16_t));
333
334
  /* we start with literal copy */
335
0
  copy = 4;
336
0
  oc += 5;
337
338
  /* main loop */
339
0
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
340
0
    const uint8_t* ref;
341
0
    unsigned distance;
342
0
    uint8_t* anchor = ip;    /* comparison starting-point */
343
344
    /* find potential match */
345
0
    seq = BLOSCLZ_READU32(ip);
346
0
    HASH_FUNCTION(hval, seq, HASH_LOG2)
347
0
    ref = ibase + htab[hval];
348
349
    /* calculate distance to the match */
350
0
    distance = (unsigned int)(anchor - ref);
351
352
    /* update hash table */
353
0
    htab[hval] = (uint16_t) (anchor - ibase);
354
355
0
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
356
0
      LITERAL2(ip, anchor, copy)
357
0
      continue;
358
0
    }
359
360
    /* is this a match? check the first 4 bytes */
361
0
    if (BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip)) {
362
0
      ref += 4;
363
0
    }
364
0
    else {
365
      /* no luck, copy as a literal */
366
0
      LITERAL2(ip, anchor, copy)
367
0
      continue;
368
0
    }
369
370
    /* last matched byte */
371
0
    ip = anchor + 4;
372
373
    /* distance is biased */
374
0
    distance--;
375
376
    /* get runs or matches; zero distance means a run */
377
0
    ip = get_run_or_match(ip, ip_bound, ref, !distance);
378
379
0
    ip -= ipshift;
380
0
    int len = (int)(ip - anchor);
381
0
    if (len < minlen) {
382
0
      LITERAL2(ip, anchor, copy)
383
0
      continue;
384
0
    }
385
386
    /* if we haven't copied anything, adjust the output counter */
387
0
    if (!copy)
388
0
      oc--;
389
    /* reset literal counter */
390
0
    copy = 0;
391
392
    /* encode the match */
393
0
    if (distance < MAX_DISTANCE) {
394
0
      if (len >= 7) {
395
0
        oc += ((len - 7) / 255) + 1;
396
0
      }
397
0
      oc += 2;
398
0
    }
399
0
    else {
400
      /* far away, but not yet in another galaxy... */
401
0
      if (len >= 7) {
402
0
        oc += ((len - 7) / 255) + 1;
403
0
      }
404
0
      oc += 4;
405
0
    }
406
407
    /* update the hash at match boundary */
408
0
    seq = BLOSCLZ_READU32(ip);
409
0
    HASH_FUNCTION(hval, seq, HASH_LOG2)
410
0
    htab[hval] = (uint16_t)(ip++ - ibase);
411
0
    ip++;
412
    /* assuming literal copy */
413
0
    oc++;
414
0
  }
415
416
0
  double ic = (double)(ip - ibase);
417
0
  return ic / (double)oc;
418
0
}
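
A worked example of the estimate: if the probe scans ic = 8192 input bytes while counting oc = 2048 output bytes, get_cratio() returns 8192 / 2048 = 4.0; blosclz_compress() below compares this guess against its per-clevel cratio_ thresholds and bails out early when the guess is too low.
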
419
420
421
int blosclz_compress(const int clevel, const void* input, int length,
422
0
                     void* output, int maxout, const int split_block) {
423
0
  uint8_t* ibase = (uint8_t*)input;
424
425
  // Experiments say that checking 1/4 of the buffer is enough to estimate the cratio
426
0
  int maxlen = length / 4;
427
  // Start probing somewhere inside the buffer
428
0
  int shift = length - maxlen;
429
  // Actual entropy probing!
430
0
  double cratio = get_cratio(ibase + shift, maxlen, 3, 3);
431
  // discard probes with small compression ratios (too expensive)
432
0
  double cratio_[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};
433
0
  if (cratio < cratio_[clevel]) {
434
0
      goto out;
435
0
  }
436
437
  /* When we go back in a match (shift), we obtain quite different compression properties.
438
 * It looks like 4 is more useful in combination with bitshuffle and small typesizes.
439
 * We fall back to 4 because it provides more consistent results for large cratios.
440
   *
441
 * In this function we also check the cratio of a probe of the buffer and
442
 * discard those that are too small (they take too long to decompress).
443
   * This process is called _entropy probing_.
444
   */
445
0
  unsigned ipshift = 4;
446
  // Compute optimal shift and minimum lengths for encoding
447
  // Use 4 by default, except for low entropy data, where we should do a best effort
448
0
  unsigned minlen = 4;
449
  // BloscLZ mostly works better with splits, so when the data is not split, do a best effort
450
  // The cratio < 4 threshold is based on experiments with low- and high-entropy data
451
0
  if (!split_block || cratio < 4) {
452
0
    ipshift = 3;
453
0
    minlen = 3;
454
0
  }
455
0
  else {
456
0
    minlen = 4;
457
0
  }
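
For example: with split_block = 1 and cratio = 5.0 the defaults stand (ipshift = minlen = 4); with split_block = 0, or cratio < 4, both drop to 3 so that shorter matches are still accepted (the best effort mentioned above).
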
458
459
0
  uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
460
0
                          HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
461
0
  uint8_t hashlog = hashlog_[clevel];
462
463
0
  uint8_t* ip = ibase;
464
0
  uint8_t* ip_bound = ibase + length - 1;
465
0
  uint8_t* ip_limit = ibase + length - 12;
466
0
  uint8_t* op = (uint8_t*)output;
467
0
  const uint8_t* op_limit = op + maxout;
468
0
  uint32_t seq;
469
0
  uint8_t copy;
470
0
  uint32_t hval;
471
472
  /* the input and output buffers must be at least 16 and 66 bytes, respectively, or we can get into trouble */
473
0
  if (length < 16 || maxout < 66) {
474
0
    return 0;
475
0
  }
476
477
  // Initialize the hash table
478
0
  uint32_t htab[1U << (uint8_t)HASH_LOG];
479
0
  memset(htab, 0, (1U << hashlog) * sizeof(uint32_t));
480
481
  /* we start with literal copy */
482
0
  copy = 4;
483
0
  *op++ = MAX_COPY - 1;
484
0
  *op++ = *ip++;
485
0
  *op++ = *ip++;
486
0
  *op++ = *ip++;
487
0
  *op++ = *ip++;
488
489
  /* main loop */
490
0
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
491
0
    const uint8_t* ref;
492
0
    unsigned distance;
493
0
    uint8_t* anchor = ip;    /* comparison starting-point */
494
495
    /* find potential match */
496
0
    seq = BLOSCLZ_READU32(ip);
497
0
    HASH_FUNCTION(hval, seq, hashlog)
498
0
    ref = ibase + htab[hval];
499
500
    /* calculate distance to the match */
501
0
    distance = (unsigned int)(anchor - ref);
502
503
    /* update hash table */
504
0
    htab[hval] = (uint32_t) (anchor - ibase);
505
506
0
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
507
0
      LITERAL(ip, op, op_limit, anchor, copy)
508
0
      continue;
509
0
    }
510
511
    /* is this a match? check the first 4 bytes */
512
0
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
513
0
      ref += 4;
514
0
    } else {
515
      /* no luck, copy as a literal */
516
0
      LITERAL(ip, op, op_limit, anchor, copy)
517
0
      continue;
518
0
    }
519
520
    /* last matched byte */
521
0
    ip = anchor + 4;
522
523
    /* distance is biased */
524
0
    distance--;
525
526
    /* get runs or matches; zero distance means a run */
527
0
    ip = get_run_or_match(ip, ip_bound, ref, !distance);
528
529
    /* length is biased, '1' means a match of 3 bytes */
530
0
    ip -= ipshift;
531
532
0
    unsigned len = (int)(ip - anchor);
533
534
    // Encoding short lengths is expensive during decompression
535
0
    if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
536
0
      LITERAL(ip, op, op_limit, anchor, copy)
537
0
      continue;
538
0
    }
539
540
    /* if we have copied something, adjust the copy count */
541
0
    if (copy)
542
      /* copy is biased, '0' means 1 byte copy */
543
0
      *(op - copy - 1) = (uint8_t)(copy - 1);
544
0
    else
545
      /* back, to overwrite the copy count */
546
0
      op--;
547
    /* reset literal counter */
548
0
    copy = 0;
549
550
    /* encode the match */
551
0
    if (distance < MAX_DISTANCE) {
552
0
      if (len < 7) {
553
0
        MATCH_SHORT(op, op_limit, len, distance)
554
0
      } else {
555
0
        MATCH_LONG(op, op_limit, len, distance)
556
0
      }
557
0
    } else {
558
      /* far away, but not yet in another galaxy... */
559
0
      distance -= MAX_DISTANCE;
560
0
      if (len < 7) {
561
0
        MATCH_SHORT_FAR(op, op_limit, len, distance)
562
0
      } else {
563
0
        MATCH_LONG_FAR(op, op_limit, len, distance)
564
0
      }
565
0
    }
566
567
    /* update the hash at match boundary */
568
0
    seq = BLOSCLZ_READU32(ip);
569
0
    HASH_FUNCTION(hval, seq, hashlog)
570
0
    htab[hval] = (uint32_t) (ip++ - ibase);
571
0
    if (clevel == 9) {
572
      // In some situations, including a second hash proves to be useful,
573
      // but not in others.  We activate it here, at max clevel only.
574
0
      seq >>= 8U;
575
0
      HASH_FUNCTION(hval, seq, hashlog)
576
0
      htab[hval] = (uint32_t) (ip++ - ibase);
577
0
    }
578
0
    else {
579
0
      ip++;
580
0
    }
581
582
0
    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
583
0
      goto out;
584
585
    /* assuming literal copy */
586
0
    *op++ = MAX_COPY - 1;
587
0
  }
588
589
  /* left-over as literal copy */
590
0
  while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) {
591
0
    if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out;
592
0
    *op++ = *ip++;
593
0
    copy++;
594
0
    if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {
595
0
      copy = 0;
596
0
      *op++ = MAX_COPY - 1;
597
0
    }
598
0
  }
599
600
  /* if we have copied something, adjust the copy length */
601
0
  if (copy)
602
0
    *(op - copy - 1) = (uint8_t)(copy - 1);
603
0
  else
604
0
    op--;
605
606
  /* marker for blosclz */
607
0
  *(uint8_t*)output |= (1U << 5U);
608
609
0
  return (int)(op - (uint8_t*)output);
610
611
0
  out:
612
0
  return 0;
613
0
}
614
615
// See https://habr.com/en/company/yandex/blog/457612/
616
#if defined(__AVX2__)
617
618
#if defined(_MSC_VER)
619
#define ALIGNED_(x) __declspec(align(x))
620
#else
621
#if defined(__GNUC__)
622
#define ALIGNED_(x) __attribute__ ((aligned(x)))
623
#endif
624
#endif
625
#define ALIGNED_TYPE_(t, x) t ALIGNED_(x)
626
627
static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
628
{
629
  size_t offset = op - match;
630
  while (len >= 16) {
631
632
    static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
633
      {
634
                0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, // offset = 0, not used as mask, but for shift
635
                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // offset = 1
636
                0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
637
                0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
638
                0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
639
                0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
640
                0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
641
                0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
642
                0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
643
                0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
644
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
645
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
646
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
647
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
648
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
649
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
650
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  15, // offset = 16
651
      };
652
653
    _mm_storeu_si128((__m128i *)(op),
654
                     _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
655
                                      _mm_load_si128((const __m128i *)(masks) + offset)));
656
657
    match += masks[offset];
658
659
    op += 16;
660
    len -= 16;
661
  }
662
  // Deal with remainders
663
  for (; len > 0; len--) {
664
    *op++ = *match++;
665
  }
666
  return op;
667
}
668
#endif
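
A sketch of how the masks table drives the copy, for an overlap distance offset in [1, 15]: row offset is a pshufb pattern that tiles the first offset bytes of match across a 16-byte store, while row 0 doubles as the per-iteration advance for match (a value congruent to 16 mod offset, which keeps the pattern in phase):

/* offset = 3: the mask row is 0,1,2,0,1,2,... so one store writes m0 m1 m2 m0 ...;
 * match then advances by masks[3] = 1 (== 16 mod 3), so the next 16-byte chunk
 * continues the 3-byte period exactly where this one left off. */
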
669
670
// LZ4-style wildCopy, which can reach excellent copy bandwidth (even if insecure)
671
102k
static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) {
672
102k
  uint8_t* d = out;
673
102k
  const uint8_t* s = from;
674
102k
  uint8_t* const e = end;
675
676
544k
  do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
677
102k
}
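
Note the contract this implies: the loop always copies whole 8-byte chunks, so for len > 0 it writes ((len + 7) / 8) * 8 bytes in total, i.e. up to 7 bytes past end. That is why the decompressor below only takes this path when op_limit - op >= len + 8.
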
678
679
1.18k
int blosclz_decompress(const void* input, int length, void* output, int maxout) {
680
1.18k
  const uint8_t* ip = (const uint8_t*)input;
681
1.18k
  const uint8_t* ip_limit = ip + length;
682
1.18k
  uint8_t* op = (uint8_t*)output;
683
1.18k
  uint32_t ctrl;
684
1.18k
  uint8_t* op_limit = op + maxout;
685
1.18k
  if (BLOSCLZ_UNLIKELY(length == 0)) {
686
8
    return 0;
687
8
  }
688
1.17k
  ctrl = (*ip++) & 31U;
689
690
1.49M
  while (1) {
691
1.49M
    if (ctrl >= 32) {
692
      // match
693
141k
      int32_t len = (int32_t)(ctrl >> 5U) - 1;
694
141k
      int32_t ofs = (int32_t)(ctrl & 31U) << 8U;
695
141k
      uint8_t code;
696
141k
      const uint8_t* ref = op - ofs;
697
698
141k
      if (len == 7 - 1) {
699
52.8k
        do {
700
52.8k
          if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
701
11
            return 0;
702
11
          }
703
52.8k
          code = *ip++;
704
52.8k
          len += code;
705
52.8k
        } while (code == 255);
706
31.3k
      }
707
110k
      else {
708
110k
        if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
709
38
          return 0;
710
38
        }
711
110k
      }
712
141k
      code = *ip++;
713
141k
      len += 3;
714
141k
      ref -= code;
715
716
      /* match from 16-bit distance */
717
141k
      if (BLOSCLZ_UNLIKELY(code == 255)) {
718
3.05k
        if (ofs == (31U << 8U)) {
719
1.00k
          if (ip + 1 >= ip_limit) {
720
3
            return 0;
721
3
          }
722
1.00k
          ofs = (*ip++) << 8U;
723
1.00k
          ofs += *ip++;
724
1.00k
          ref = op - ofs - MAX_DISTANCE;
725
1.00k
        }
726
3.05k
      }
727
728
141k
      if (BLOSCLZ_UNLIKELY(op + len > op_limit)) {
729
118
        return 0;
730
118
      }
731
732
141k
      if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) {
733
75
        return 0;
734
75
      }
735
736
141k
      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
737
141k
      ctrl = *ip++;
738
739
141k
      ref--;
740
141k
      if (ref == op - 1) {
741
        /* optimized copy for a run */
742
10.4k
        memset(op, *ref, len);
743
10.4k
        op += len;
744
10.4k
      }
745
130k
      else if ((op - ref >= 8) && (op_limit - op >= len + 8)) {
746
        // the match is at least 8 bytes back, so 8-byte wild_copy chunks are safe
747
102k
        wild_copy(op, ref, op + len);
748
102k
        op += len;
749
102k
      }
750
28.0k
      else {
751
        // general copy with any overlap
752
#if defined(__AVX2__)
753
        if (op - ref <= 16) {
754
          // This is not faster on any combination of compilers (clang, gcc, icc) and machines tested, but
755
          // it is not slower either.  Let's activate here for experimentation.
756
          op = copy_match_16(op, ref, len);
757
        }
758
        else {
759
#endif
760
28.0k
          op = copy_match(op, ref, (unsigned) len);
761
#if defined(__AVX2__)
762
        }
763
#endif
764
28.0k
      }
765
141k
    }
766
1.35M
    else {
767
      // literal
768
1.35M
      ctrl++;
769
1.35M
      if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) {
770
113
        return 0;
771
113
      }
772
1.35M
      if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) {
773
14
        return 0;
774
14
      }
775
776
1.35M
      memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
777
      // On GCC 6, fastcopy is still faster than plain memcpy
778
      // However, using recent CLANG/LLVM 9.0, there is almost no difference
779
      // in performance.
780
      // And starting on CLANG/LLVM 10 and GCC 9, memcpy is generally faster.
781
      // op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl;
782
783
1.35M
      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
784
1.35M
      ctrl = *ip++;
785
1.35M
    }
786
1.49M
  }
787
788
806
  return (int)(op - (uint8_t*)output);
789
1.17k
}
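
A minimal round-trip sketch using the two entry points above (buffer sizes are illustrative; blosclz_compress() itself rejects length < 16 or maxout < 66 and returns 0 when the output does not fit):

#include <stdint.h>
#include <string.h>
#include "blosclz.h"

int roundtrip_example(void) {
  uint8_t in[256], comp[512], out[256];
  memset(in, 'x', sizeof(in));  /* highly compressible input */
  int csize = blosclz_compress(5, in, (int)sizeof(in), comp, (int)sizeof(comp), 1);
  if (csize <= 0)
    return -1;  /* could not compress within maxout */
  int dsize = blosclz_decompress(comp, csize, out, (int)sizeof(out));
  return (dsize == (int)sizeof(in) && memcmp(in, out, sizeof(in)) == 0) ? 0 : -1;
}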