Coverage Report

Created: 2025-07-11 06:48

/src/c-blosc2/blosc/blosclz.c
/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
  https://blosc.org
  License: BSD 3-Clause (see LICENSE.txt)

  See LICENSE.txt for details about copyright and rights to use.
**********************************************************************/

/*********************************************************************
  The code in this file is heavily based on FastLZ, a lightning-fast
  lossless compression library.  See LICENSES/FASTLZ.txt for details.
**********************************************************************/


#include "blosclz.h"
#include "fastcopy.h"
#include "blosc2/blosc2-common.h"

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>

/*
 * Give hints to the compiler for branch prediction optimization.
 * This is not necessary anymore with modern CPUs.
 */
#if 0 && defined(__GNUC__) && (__GNUC__ > 2)
#define BLOSCLZ_LIKELY(c)    (__builtin_expect((c), 1))
#define BLOSCLZ_UNLIKELY(c)  (__builtin_expect((c), 0))
#else
#define BLOSCLZ_LIKELY(c)    (c)
#define BLOSCLZ_UNLIKELY(c)  (c)
#endif

/*
 * Use inlined functions for supported systems.
 */
#if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
#endif

#define MAX_COPY 32U
#define MAX_DISTANCE 8191
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)

#ifdef BLOSC_STRICT_ALIGN
  #define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
  #define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
#else
  #define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
  #define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
#endif

#define HASH_LOG (14U)

// This is used in LZ4 and seems to work pretty well here too
#define HASH_FUNCTION(v, s, h) {      \
  (v) = ((s) * 2654435761U) >> (32U - (h)); \
}
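
/*
 * Illustrative sketch (added for exposition, not in the original file):
 * HASH_FUNCTION is Knuth-style multiplicative hashing.  2654435761U is a
 * prime close to 2^32 divided by the golden ratio (the same constant LZ4
 * uses); the multiply mixes the 4-byte sequence and the right shift keeps
 * only the top `h` bits, yielding an index into a table of 2^h buckets.
 */
static inline uint32_t example_hash_seq(uint32_t seq) {
  uint32_t hval;
  HASH_FUNCTION(hval, seq, HASH_LOG)  /* hval in [0, 2^14) for HASH_LOG == 14 */
  return hval;
}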


#if defined(__AVX2__)
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
    uint8_t x = ip[-1];

    while (ip < (ip_bound - (sizeof(__m256i)))) {
        __m256i value, value2, cmp;
        /* Broadcast the value for every byte in a 256-bit register */
        memset(&value, x, sizeof(__m256i));
        value2 = _mm256_loadu_si256((__m256i *)ref);
        cmp = _mm256_cmpeq_epi64(value, value2);
        if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
            /* Return the byte that starts to differ */
            while (*ref++ == x) ip++;
            return ip;
        }
        else {
            ip += sizeof(__m256i);
            ref += sizeof(__m256i);
        }
    }
    /* Look into the remainder */
    while ((ip < ip_bound) && (*ref++ == x)) ip++;
    return ip;
}
#endif

#if defined(__SSE2__)
uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];

  while (ip < (ip_bound - sizeof(__m128i))) {
    __m128i value, value2, cmp;
    /* Broadcast the value for every byte in a 128-bit register */
    memset(&value, x, sizeof(__m128i));
    value2 = _mm_loadu_si128((__m128i *)ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}
#endif


static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];
  int64_t value, value2;
  /* Broadcast the value for every byte in a 64-bit register */
  memset(&value, x, 8);
  /* Safe because of the outer check against the ip limit */
  while (ip < (ip_bound - sizeof(int64_t))) {
#if defined(BLOSC_STRICT_ALIGN)
    memcpy(&value2, ref, 8);
#else
    value2 = ((int64_t*)ref)[0];
#endif
    if (value != value2) {
      /* Return the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += 8;
      ref += 8;
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}
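
/*
 * Illustrative sketch (added for exposition, not in the original file): a
 * run is scanned when the biased distance is zero, i.e. ref trails ip by
 * exactly one byte, so every matching byte equals x == ip[-1].  A
 * hypothetical helper measuring how far the leading byte of a buffer
 * repeats could call get_run() like this:
 */
static size_t example_run_length(uint8_t* buf, size_t len) {
  /* buf[0] is the candidate run byte; scanning starts right after it and
   * is capped near the end of the buffer */
  uint8_t* end = get_run(buf + 1, buf + len, buf);
  return (size_t)(end - (buf + 1));
}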


/* Return the byte that starts to differ */
uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
#if !defined(BLOSC_STRICT_ALIGN)
  while (ip < (ip_bound - sizeof(int64_t))) {
    if (*(int64_t*)ref != *(int64_t*)ip) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(int64_t);
      ref += sizeof(int64_t);
    }
  }
#endif
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}


#if defined(__SSE2__)
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  __m128i value, value2, cmp;

  while (ip < (ip_bound - sizeof(__m128i))) {
    value = _mm_loadu_si128((__m128i *) ip);
    value2 = _mm_loadu_si128((__m128i *) ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
#endif


#if defined(__AVX2__)
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {

  while (ip < (ip_bound - sizeof(__m256i))) {
    __m256i value, value2, cmp;
    value = _mm256_loadu_si256((__m256i *) ip);
    value2 = _mm256_loadu_si256((__m256i *)ref);
    cmp = _mm256_cmpeq_epi64(value, value2);
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
      /* Return the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(__m256i);
      ref += sizeof(__m256i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
#endif


static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
  if (BLOSCLZ_UNLIKELY(run)) {
#if defined(__AVX2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
    // ip = get_run_32(ip, ip_bound, ref);
    ip = get_run(ip, ip_bound, ref);
#elif defined(__SSE2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
    // ip = get_run_16(ip, ip_bound, ref);
    ip = get_run(ip, ip_bound, ref);
#else
    ip = get_run(ip, ip_bound, ref);
#endif
  }
  else {
#if defined(__AVX2__)
    // Extensive experiments on AMD Ryzen3 say that regular get_match_16 is faster
    // ip = get_match_32(ip, ip_bound, ref);
    ip = get_match_16(ip, ip_bound, ref);
#elif defined(__SSE2__)
    ip = get_match_16(ip, ip_bound, ref);
#else
    ip = get_match(ip, ip_bound, ref);
#endif
  }

  return ip;
}
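
/*
 * Illustrative sketch (added for exposition, not in the original file):
 * callers derive `run` from the biased distance, where 0 means ref trails
 * ip by exactly one byte (an RLE-style run of a single value) and anything
 * else is a regular match:
 */
static uint8_t* example_scan(uint8_t* ip, uint8_t* ip_bound,
                             const uint8_t* ref, unsigned biased_distance) {
  return get_run_or_match(ip, ip_bound, ref, biased_distance == 0);
}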


#define LITERAL(ip, op, op_limit, anchor, copy) {       \
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
    goto out;                                           \
  *(op)++ = *(anchor)++;                                \
  (ip) = (anchor);                                      \
  (copy)++;                                             \
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
    (copy) = 0;                                         \
    *(op)++ = MAX_COPY-1;                               \
  }                                                     \
}

#define LITERAL2(ip, anchor, copy) {                    \
  oc++; (anchor)++;                                     \
  (ip) = (anchor);                                      \
  (copy)++;                                             \
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
    (copy) = 0;                                         \
    oc++;                                               \
  }                                                     \
}
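
/*
 * Note (added for exposition): LITERAL2 is the dry-run twin of LITERAL.
 * It is used by get_cratio() below and only counts the bytes that LITERAL
 * would emit (via `oc`, a variable in the calling scope), without writing
 * to any output buffer and without bounds checks.
 */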

#define MATCH_SHORT(op, op_limit, len, distance) {        \
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))            \
    goto out;                                             \
  *(op)++ = (uint8_t)(((len) << 5U) + ((distance) >> 8U));\
  *(op)++ = (uint8_t)(((distance) & 255U));               \
}

#define MATCH_LONG(op, op_limit, len, distance) {       \
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))          \
    goto out;                                           \
  *(op)++ = (uint8_t)((7U << 5U) + ((distance) >> 8U)); \
  for ((len) -= 7; (len) >= 255; (len) -= 255) {        \
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))        \
      goto out;                                         \
    *(op)++ = 255;                                      \
  }                                                     \
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
    goto out;                                           \
  *(op)++ = (uint8_t)(len);                             \
  *(op)++ = (uint8_t)(((distance) & 255U));             \
}

#define MATCH_SHORT_FAR(op, op_limit, len, distance) {      \
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
    goto out;                                               \
  *(op)++ = (uint8_t)(((len) << 5U) + 31);                  \
  *(op)++ = 255;                                            \
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
  *(op)++ = (uint8_t)((distance) & 255U);                   \
}

#define MATCH_LONG_FAR(op, op_limit, len, distance) {       \
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))              \
    goto out;                                               \
  *(op)++ = (7U << 5U) + 31;                                \
  for ((len) -= 7; (len) >= 255; (len) -= 255) {            \
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))            \
      goto out;                                             \
    *(op)++ = 255;                                          \
  }                                                         \
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
    goto out;                                               \
  *(op)++ = (uint8_t)(len);                                 \
  *(op)++ = 255;                                            \
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
  *(op)++ = (uint8_t)((distance) & 255U);                   \
}
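
/*
 * Illustrative sketch (added for exposition, not in the original file):
 * the control byte emitted by the macros above packs a 3-bit length code
 * and the high 5 bits of the match distance.  Codes 1..6 are short
 * matches, 7 is a long match followed by extra 255-terminated length
 * bytes, and a distance-high field of 31 followed by a 255 byte flags a
 * far match with an explicit 16-bit offset.  A control byte below 32 is a
 * literal run of (byte + 1) bytes.
 */
static void example_split_ctrl(uint8_t ctrl, unsigned* len_code, unsigned* dist_high) {
  *len_code = ctrl >> 5U;   /* 0: literal run, 1..6: short match, 7: long match */
  *dist_high = ctrl & 31U;  /* 31 may flag a far match (see MATCH_*_FAR) */
}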


// Get a guess for the compressed size of a buffer
static double get_cratio(uint8_t* ibase, int maxlen, int minlen, int ipshift, uint32_t htab[], int8_t hashlog) {
  uint8_t* ip = ibase;
  int32_t oc = 0;
  const uint16_t hashlen = (1U << (uint8_t)hashlog);
  uint32_t hval;
  uint32_t seq;
  uint8_t copy;
  // Make a tradeoff between testing too much and too little
  uint16_t limit = (maxlen > hashlen) ? hashlen : maxlen;
  uint8_t* ip_bound = ibase + limit - 1;
  uint8_t* ip_limit = ibase + limit - 12;

  // Initialize the hash table to distances of 0
  memset(htab, 0, hashlen * sizeof(uint32_t));

  /* we start with literal copy */
  copy = 4;
  oc += 5;

  /* main loop */
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
    const uint8_t* ref;
    unsigned distance;
    uint8_t* anchor = ip;    /* comparison starting-point */

    /* find potential match */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = (unsigned int)(anchor - ref);

    /* update hash table */
    htab[hval] = (uint32_t) (anchor - ibase);

    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* is this a match? check the first 4 bytes */
    if (BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip)) {
      ref += 4;
    }
    else {
      /* no luck, copy as a literal */
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* last matched byte */
    ip = anchor + 4;

    /* distance is biased */
    distance--;

    /* get runs or matches; zero distance means a run */
    ip = get_run_or_match(ip, ip_bound, ref, !distance);

    ip -= ipshift;
    int len = (int)(ip - anchor);
    if (len < minlen) {
      LITERAL2(ip, anchor, copy)
      continue;
    }

    /* if we haven't copied anything, adjust the output counter */
    if (!copy)
      oc--;
    /* reset literal counter */
    copy = 0;

    /* encode the match */
    if (distance < MAX_DISTANCE) {
      if (len >= 7) {
        oc += ((len - 7) / 255) + 1;
      }
      oc += 2;
    }
    else {
      /* far away, but not yet in another galaxy... */
      if (len >= 7) {
        oc += ((len - 7) / 255) + 1;
      }
      oc += 4;
    }

    /* update the hash at match boundary */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    htab[hval] = (uint32_t)(ip++ - ibase);
    ip++;
    /* assuming literal copy */
    oc++;
  }

  double ic = (double)(ip - ibase);
  return ic / (double)oc;
}
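
/*
 * Illustrative sketch (added for exposition, not in the original file):
 * blosclz_compress() below uses get_cratio() as an entropy probe.  It runs
 * the same matching loop over a slice of the input while only counting the
 * bytes that would be emitted, and bails out when the estimate falls under
 * a clevel-dependent threshold.  A hypothetical gate mirroring the
 * cratio_[] thresholds could look like this:
 */
static bool example_worth_compressing(uint8_t* ibase, int maxlen,
                                      uint32_t htab[], int8_t hashlog, int clevel) {
  /* same thresholds as cratio_[] in blosclz_compress below */
  static const double threshold[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};
  /* minlen == ipshift == 4, as in blosclz_compress */
  return get_cratio(ibase, maxlen, 4, 4, htab, hashlog) >= threshold[clevel];
}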


int blosclz_compress(const int clevel, const void* input, int length,
                     void* output, int maxout, blosc2_context* ctx) {
  BLOSC_UNUSED_PARAM(ctx);
  uint8_t* ibase = (uint8_t*)input;
  uint32_t htab[1U << (uint8_t)HASH_LOG];

  /* When we go back in a match (shift), we obtain quite different compression properties.
   * It looks like 4 is more useful in combination with bitshuffle and small typesizes.
   * Fallback to 4 because it provides more consistent results for large cratios.
   * UPDATE: new experiments show that using a value of 3 is a bit better, at least for ERA5.
   * UPDATE 2: go back to 4, as it seems to provide better cratios in general.
   *
   * In this block we also check cratios for the beginning of the buffers and
   * eventually discard those that are small (take too long to decompress).
   * This process is called _entropy probing_.
   */
  unsigned ipshift = 4;
  // Minimum lengths for encoding (normally it is good to match the shift value)
  unsigned minlen = 4;

  uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
                          HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
  uint8_t hashlog = hashlog_[clevel];

  // Experiments say that checking 1/4 of the buffer is enough to figure out the approx cratio
  // UPDATE: new experiments with ERA5 datasets (float32) say that checking the whole buffer
  // is better (especially when combined with bitshuffle).
  // The loss in speed for checking the whole buffer is pretty negligible too.
  int maxlen = length;
  if (clevel < 2) {
    maxlen /= 8;
  }
  else if (clevel < 4) {
    maxlen /= 4;
  }
  else if (clevel < 7) {
    maxlen /= 2;
  }
  // Start probing somewhere inside the buffer
  int shift = length - maxlen;
  // Actual entropy probing!
  double cratio = get_cratio(ibase + shift, maxlen, minlen, ipshift, htab, hashlog);
  // discard probes with small compression ratios (too expensive)
  double cratio_[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};
  if (cratio < cratio_[clevel]) {
      goto out;
  }

  uint8_t* ip = ibase;
  uint8_t* ip_bound = ibase + length - 1;
  uint8_t* ip_limit = ibase + length - 12;
  uint8_t* op = (uint8_t*)output;
  const uint8_t* op_limit = op + maxout;
  uint32_t seq;
  uint8_t copy;
  uint32_t hval;

  /* the input and output buffers must be at least 16 and 66 bytes long, or we can get into trouble */
  if (length < 16 || maxout < 66) {
    return 0;
  }

  // Initialize the hash table
  memset(htab, 0, (1U << hashlog) * sizeof(uint32_t));

  /* we start with literal copy */
  copy = 4;
  *op++ = MAX_COPY - 1;
  *op++ = *ip++;
  *op++ = *ip++;
  *op++ = *ip++;
  *op++ = *ip++;

  /* main loop */
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
    const uint8_t* ref;
    unsigned distance;
    uint8_t* anchor = ip;    /* comparison starting-point */

    /* find potential match */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = (unsigned int)(anchor - ref);

    /* update hash table */
    htab[hval] = (uint32_t) (anchor - ibase);

    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* is this a match? check the first 4 bytes */
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
      ref += 4;
    } else {
      /* no luck, copy as a literal */
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* last matched byte */
    ip = anchor + 4;

    /* distance is biased */
    distance--;

    /* get runs or matches; zero distance means a run */
    ip = get_run_or_match(ip, ip_bound, ref, !distance);

    /* length is biased, '1' means a match of 3 bytes */
    ip -= ipshift;

    unsigned len = (int)(ip - anchor);

    // Encoding short lengths is expensive during decompression
    if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
      LITERAL(ip, op, op_limit, anchor, copy)
      continue;
    }

    /* if we have copied something, adjust the copy count */
    if (copy)
      /* copy is biased, '0' means 1 byte copy */
      *(op - copy - 1) = (uint8_t)(copy - 1);
    else
      /* back, to overwrite the copy count */
      op--;
    /* reset literal counter */
    copy = 0;

    /* encode the match */
    if (distance < MAX_DISTANCE) {
      if (len < 7) {
        MATCH_SHORT(op, op_limit, len, distance)
      } else {
        MATCH_LONG(op, op_limit, len, distance)
      }
    } else {
      /* far away, but not yet in another galaxy... */
      distance -= MAX_DISTANCE;
      if (len < 7) {
        MATCH_SHORT_FAR(op, op_limit, len, distance)
      } else {
        MATCH_LONG_FAR(op, op_limit, len, distance)
      }
    }

    /* update the hash at match boundary */
    seq = BLOSCLZ_READU32(ip);
    HASH_FUNCTION(hval, seq, hashlog)
    htab[hval] = (uint32_t) (ip++ - ibase);
    if (clevel == 9) {
      // In some situations, including a second hash proves to be useful,
      // but not in others.  Activating here in max clevel only.
      seq >>= 8U;
      HASH_FUNCTION(hval, seq, hashlog)
      htab[hval] = (uint32_t) (ip++ - ibase);
    }
    else {
      ip++;
    }

    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
      goto out;

    /* assuming literal copy */
    *op++ = MAX_COPY - 1;
  }

  /* left-over as literal copy */
  while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) {
    if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out;
    *op++ = *ip++;
    copy++;
    if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {
      copy = 0;
      *op++ = MAX_COPY - 1;
    }
  }

  /* if we have copied something, adjust the copy length */
  if (copy)
    *(op - copy - 1) = (uint8_t)(copy - 1);
  else
    op--;

  /* marker for blosclz */
  *(uint8_t*)output |= (1U << 5U);

  return (int)(op - (uint8_t*)output);

  out:
  return 0;
}

// See https://habr.com/en/company/yandex/blog/457612/
#if defined(__AVX2__)

#if defined(_MSC_VER)
#define ALIGNED_(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define ALIGNED_(x) __attribute__ ((aligned(x)))
#endif
#endif
#define ALIGNED_TYPE_(t, x) t ALIGNED_(x)

static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
{
  size_t offset = op - match;
  while (len >= 16) {

    static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
      {
                0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, // offset = 0, not used as mask, but for shift
                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // offset = 1
                0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
                0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
                0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
                0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
                0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
                0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
                0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  15, // offset = 16
      };

    _mm_storeu_si128((__m128i *)(op),
                     _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
                                      _mm_load_si128((const __m128i *)(masks) + offset)));

    match += masks[offset];

    op += 16;
    len -= 16;
  }
  // Deal with remainders
  for (; len > 0; len--) {
    *op++ = *match++;
  }
  return op;
}
#endif
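
/*
 * Illustrative sketch (added for exposition, not in the original file): an
 * LZ77 match copy must behave like a byte-by-byte loop, because op and
 * match may overlap (e.g. an offset of 1 replicates a single byte).
 * copy_match() from fastcopy.h and copy_match_16() above are accelerated
 * versions of this reference behavior:
 */
static unsigned char* example_overlap_copy(unsigned char* op,
                                           const unsigned char* match,
                                           unsigned len) {
  while (len-- > 0)
    *op++ = *match++;  /* overlap-safe: reads never get ahead of writes */
  return op;
}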

// LZ4 wildCopy which can reach excellent copy bandwidth (even if insecure)
static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) {
  uint8_t* d = out;
  const uint8_t* s = from;
  uint8_t* const e = end;

  do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
}
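
/*
 * Note (added for exposition): wild_copy() rounds the copy up to whole
 * 8-byte chunks, so it can write up to 7 bytes past `end`.  The
 * decompressor below only calls it when the gap between ref and op is at
 * least 8 (so no chunk reads bytes it has not yet written) and the output
 * has len + 8 bytes of slack, which makes the overrun harmless.
 */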

int blosclz_decompress(const void* input, int length, void* output, int maxout) {
  const uint8_t* ip = (const uint8_t*)input;
  const uint8_t* ip_limit = ip + length;
  uint8_t* op = (uint8_t*)output;
  uint32_t ctrl;
  uint8_t* op_limit = op + maxout;
  if (BLOSCLZ_UNLIKELY(length == 0)) {
    return 0;
  }
  ctrl = (*ip++) & 31U;

  while (1) {
    if (ctrl >= 32) {
      // match
      int32_t len = (int32_t)(ctrl >> 5U) - 1;
      int32_t ofs = (int32_t)(ctrl & 31U) << 8U;
      uint8_t code;
      const uint8_t* ref = op - ofs;

      if (len == 7 - 1) {
        do {
          if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
            return 0;
          }
          code = *ip++;
          len += code;
        } while (code == 255);
      }
      else {
        if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
          return 0;
        }
      }
      code = *ip++;
      len += 3;
      ref -= code;

      /* match from 16-bit distance */
      if (BLOSCLZ_UNLIKELY(code == 255)) {
        if (ofs == (31U << 8U)) {
          if (ip + 1 >= ip_limit) {
            return 0;
          }
          ofs = (*ip++) << 8U;
          ofs += *ip++;
          ref = op - ofs - MAX_DISTANCE;
        }
      }

      if (BLOSCLZ_UNLIKELY(op + len > op_limit)) {
        return 0;
      }

      if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) {
        return 0;
      }

      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
      ctrl = *ip++;

      ref--;
      if (ref == op - 1) {
        /* optimized copy for a run */
        memset(op, *ref, len);
        op += len;
      }
      else if ((op - ref >= 8) && (op_limit - op >= len + 8)) {
        // chunked copy; safe because the op - ref gap is at least 8
        wild_copy(op, ref, op + len);
        op += len;
      }
      else {
        // general copy with any overlap
#if 0 && defined(__AVX2__)
        if (op - ref <= 16) {
          // This is not faster on a combination of compilers (clang, gcc, icc) or machines, but
          // it is not much slower either.
          op = copy_match_16(op, ref, len);
        }
        else {
#endif
          op = copy_match(op, ref, (unsigned) len);
#if 0 && defined(__AVX2__)
        }
#endif
      }
    }
    else {
      // literal
      ctrl++;
      if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) {
        return 0;
      }
      if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) {
        return 0;
      }

      memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
      // On GCC 6, fastcopy is still faster than plain memcpy.
      // However, with recent CLANG/LLVM 9.0, there is almost no difference
      // in performance.
      // And starting with CLANG/LLVM 10 and GCC 9, memcpy is generally faster.
      // op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl;

      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
      ctrl = *ip++;
    }
  }

  return (int)(op - (uint8_t*)output);
}
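
/*
 * Illustrative usage sketch (added for exposition, not in the original
 * file): a round trip through the two entry points declared in blosclz.h.
 * A zero return from blosclz_compress() means "not compressible enough" or
 * "output too small", not an error, so callers fall back to storing the
 * input verbatim.
 */
static int example_round_trip(const uint8_t* src, int n,
                              uint8_t* tmp, int tmp_size,
                              uint8_t* dst, blosc2_context* ctx) {
  int csize = blosclz_compress(9, src, n, tmp, tmp_size, ctx);
  if (csize <= 0)
    return 0;  /* caller should store src uncompressed */
  return blosclz_decompress(tmp, csize, dst, n);
}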