Coverage Report

Created: 2025-12-11 06:39

/src/c-blosc2/blosc/blosclz.c
Line / Count / Source
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
/*********************************************************************
12
  The code in this file is heavily based on FastLZ, a lightning-fast
13
  lossless compression library.  See LICENSES/FASTLZ.txt for details.
14
**********************************************************************/
15
16
17
#include "blosclz.h"
18
#include "fastcopy.h"
19
#include "blosc2/blosc2-common.h"
20
21
#include <stdbool.h>
22
#include <stdio.h>
23
#include <stdint.h>
24
#include <string.h>
25
26
/*
27
 * Give hints to the compiler for branch prediction optimization.
28
 * This is no longer necessary on modern CPUs.
29
 */
30
#if 0 && defined(__GNUC__) && (__GNUC__ > 2)
31
#define BLOSCLZ_LIKELY(c)    (__builtin_expect((c), 1))
32
#define BLOSCLZ_UNLIKELY(c)  (__builtin_expect((c), 0))
33
#else
34
37.1M
#define BLOSCLZ_LIKELY(c)    (c)
35
81.4M
#define BLOSCLZ_UNLIKELY(c)  (c)
36
#endif
37
38
/*
39
 * Use inlined functions for supported systems.
40
 */
41
#if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
42
#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
43
#endif
44
45
1.30M
#define MAX_COPY 32U
46
38.7M
#define MAX_DISTANCE 8191
47
37.0M
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)
48
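These two limits follow from the match token layout defined further down: an ordinary match stores 5 distance bits in the token byte plus one full extra byte, so MAX_DISTANCE = 2^13 - 1 = 8191, and a far match adds an explicit 16-bit distance on top of that range, giving MAX_FARDISTANCE = 65535 + 8191 - 1 = 73725.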
49
#ifdef BLOSC_STRICT_ALIGN
50
  #define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
51
  #define BLOSCLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
52
#else
53
  #define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
54
76.7M
  #define BLOSCLZ_READU32(p) *((const uint32_t*)(p))
55
#endif
56
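The two BLOSCLZ_READU32 flavors above trade portability for speed: the BLOSC_STRICT_ALIGN variant assembles the word byte by byte (also pinning little-endian order), while the default variant type-puns the pointer. A third option is a memcpy-based read that mainstream compilers fold into a single load while staying well defined on strict-alignment targets; a minimal sketch (read_u32 is an illustrative name, not part of blosclz):

#include <stdint.h>
#include <string.h>

/* Portable unaligned 32-bit read in native byte order: the memcpy is
 * compiled down to one load by mainstream compilers, yet it avoids the
 * undefined behavior of dereferencing a misaligned pointer. */
static inline uint32_t read_u32(const void *p) {
  uint32_t v;
  memcpy(&v, p, sizeof v);
  return v;
}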
57
394k
#define HASH_LOG (14U)
58
59
// This multiplicative hash (with the 2654435761 golden-ratio constant) is used in LZ4 and seems to work pretty well here too
60
38.8M
#define HASH_FUNCTION(v, s, h) {      \
61
38.8M
  (v) = ((s) * 2654435761U) >> (32U - (h)); \
62
38.8M
}
63
64
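HASH_FUNCTION above is a multiplicative hash: multiplying the 4-byte sequence by 2654435761 scrambles the input, and the right shift keeps only the top h bits as the table index. A standalone sketch of the same computation (hash_seq is an illustrative name):

#include <stdint.h>
#include <stdio.h>

/* Keep the top hashlog bits of seq * 2654435761, as HASH_FUNCTION does. */
static uint32_t hash_seq(uint32_t seq, uint8_t hashlog) {
  return (seq * 2654435761U) >> (32U - hashlog);
}

int main(void) {
  /* Hash the little-endian bytes "abcd" into a 14-bit table index,
   * matching the HASH_LOG = 14 default below. */
  printf("%u\n", hash_seq(0x64636261U, 14));
  return 0;
}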
65
#if defined(__AVX2__)
66
static uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
67
    uint8_t x = ip[-1];
68
69
    while (ip < (ip_bound - (sizeof(__m256i)))) {
70
        __m256i value, value2, cmp;
71
        /* Broadcast the value to every byte in a 256-bit register */
72
        memset(&value, x, sizeof(__m256i));
73
        value2 = _mm256_loadu_si256((__m256i *)ref);
74
        cmp = _mm256_cmpeq_epi64(value, value2);
75
        if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
76
            /* Return the byte that starts to differ */
77
            while (*ref++ == x) ip++;
78
            return ip;
79
        }
80
        else {
81
            ip += sizeof(__m256i);
82
            ref += sizeof(__m256i);
83
        }
84
    }
85
    /* Look into the remainder */
86
    while ((ip < ip_bound) && (*ref++ == x)) ip++;
87
    return ip;
88
}
89
#endif
90
91
#if defined(__SSE2__)
92
0
uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
93
0
  uint8_t x = ip[-1];
94
95
0
  while (ip < (ip_bound - sizeof(__m128i))) {
96
0
    __m128i value, value2, cmp;
97
    /* Broadcast the value to every byte in a 128-bit register */
98
0
    memset(&value, x, sizeof(__m128i));
99
0
    value2 = _mm_loadu_si128((__m128i *)ref);
100
0
    cmp = _mm_cmpeq_epi32(value, value2);
101
0
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
102
      /* Return the byte that starts to differ */
103
0
      while (*ref++ == x) ip++;
104
0
      return ip;
105
0
    }
106
0
    else {
107
0
      ip += sizeof(__m128i);
108
0
      ref += sizeof(__m128i);
109
0
    }
110
0
  }
111
  /* Look into the remainder */
112
0
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
113
0
  return ip;
114
0
}
115
116
#endif
117
118
119
945k
static uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
120
945k
  uint8_t x = ip[-1];
121
945k
  int64_t value, value2;
122
  /* Broadcast the value to every byte in a 64-bit register */
123
945k
  memset(&value, x, 8);
124
  /* safe because of the outer check against the ip limit */
125
1.70M
  while (ip < (ip_bound - sizeof(int64_t))) {
126
#if defined(BLOSC_STRICT_ALIGN)
127
    memcpy(&value2, ref, 8);
128
#else
129
1.69M
    value2 = ((int64_t*)ref)[0];
130
1.69M
#endif
131
1.69M
    if (value != value2) {
132
      /* Return the byte that starts to differ */
133
2.90M
      while (*ref++ == x) ip++;
134
928k
      return ip;
135
928k
    }
136
761k
    else {
137
761k
      ip += 8;
138
761k
      ref += 8;
139
761k
    }
140
1.69M
  }
141
  /* Look into the remainder */
142
58.5k
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
143
16.4k
  return ip;
144
945k
}
145
146
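get_run above exploits a broadcast trick: memset fills every byte of a 64-bit word with the run byte, so one integer comparison checks eight input bytes at a time. The same idea in a self-contained sketch (run_length is an illustrative name):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Count how many leading bytes of buf[0..n) equal x, eight at a time. */
static size_t run_length(const uint8_t *buf, size_t n, uint8_t x) {
  uint64_t pattern;
  memset(&pattern, x, sizeof pattern);   /* broadcast x to all 8 bytes */
  size_t i = 0;
  while (i + 8 <= n) {
    uint64_t chunk;
    memcpy(&chunk, buf + i, 8);          /* alignment-safe load */
    if (chunk != pattern)
      break;                             /* the differing byte is in this word */
    i += 8;
  }
  while (i < n && buf[i] == x)           /* finish the tail byte by byte */
    i++;
  return i;
}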
147
/* Return the byte that starts to differ */
148
0
uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
149
0
#if !defined(BLOSC_STRICT_ALIGN)
150
0
  while (ip < (ip_bound - sizeof(int64_t))) {
151
0
    if (*(int64_t*)ref != *(int64_t*)ip) {
152
      /* Return the byte that starts to differ */
153
0
      while (*ref++ == *ip++) {}
154
0
      return ip;
155
0
    }
156
0
    else {
157
0
      ip += sizeof(int64_t);
158
0
      ref += sizeof(int64_t);
159
0
    }
160
0
  }
161
0
#endif
162
  /* Look into the remainder */
163
0
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
164
0
  return ip;
165
0
}
166
167
168
#if defined(__SSE2__)
169
4.81M
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
170
4.81M
  __m128i value, value2, cmp;
171
172
6.97M
  while (ip < (ip_bound - sizeof(__m128i))) {
173
6.91M
    value = _mm_loadu_si128((__m128i *) ip);
174
6.91M
    value2 = _mm_loadu_si128((__m128i *) ref);
175
6.91M
    cmp = _mm_cmpeq_epi32(value, value2);
176
6.91M
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
177
      /* Return the byte that starts to differ */
178
13.1M
      while (*ref++ == *ip++) {}
179
4.74M
      return ip;
180
4.74M
    }
181
2.16M
    else {
182
2.16M
      ip += sizeof(__m128i);
183
2.16M
      ref += sizeof(__m128i);
184
2.16M
    }
185
6.91M
  }
186
  /* Look into the remainder */
187
399k
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
188
62.3k
  return ip;
189
4.81M
}
190
#endif
191
192
193
#if defined(__AVX2__)
194
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
195
196
  while (ip < (ip_bound - sizeof(__m256i))) {
197
    __m256i value, value2, cmp;
198
    value = _mm256_loadu_si256((__m256i *) ip);
199
    value2 = _mm256_loadu_si256((__m256i *)ref);
200
    cmp = _mm256_cmpeq_epi64(value, value2);
201
    if ((unsigned)_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
202
      /* Return the byte that starts to differ */
203
      while (*ref++ == *ip++) {}
204
      return ip;
205
    }
206
    else {
207
      ip += sizeof(__m256i);
208
      ref += sizeof(__m256i);
209
    }
210
  }
211
  /* Look into the remainder */
212
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
213
  return ip;
214
}
215
#endif
216
217
218
5.75M
static uint8_t* get_run_or_match(uint8_t* ip, uint8_t* ip_bound, const uint8_t* ref, bool run) {
219
5.75M
  if (BLOSCLZ_UNLIKELY(run)) {
220
#if defined(__AVX2__)
221
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
222
    // ip = get_run_32(ip, ip_bound, ref);
223
    ip = get_run(ip, ip_bound, ref);
224
#elif defined(__SSE2__)
225
    // Extensive experiments on AMD Ryzen3 say that regular get_run is faster
226
    // ip = get_run_16(ip, ip_bound, ref);
227
945k
    ip = get_run(ip, ip_bound, ref);
228
#else
229
    ip = get_run(ip, ip_bound, ref);
230
#endif
231
945k
  }
232
4.81M
  else {
233
#if defined(__AVX2__)
234
    // Extensive experiments on AMD Ryzen3 say that regular get_match_16 is faster
235
    // ip = get_match_32(ip, ip_bound, ref);
236
    ip = get_match_16(ip, ip_bound, ref);
237
#elif defined(__SSE2__)
238
    ip = get_match_16(ip, ip_bound, ref);
239
#else
240
    ip = get_match(ip, ip_bound, ref);
241
#endif
242
4.81M
  }
243
244
5.75M
  return ip;
245
5.75M
}
246
247
248
17.0M
#define LITERAL(ip, op, op_limit, anchor, copy) {       \
249
17.0M
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
250
17.0M
    goto out;                                           \
251
17.0M
  *(op)++ = *(anchor)++;                                \
252
17.0M
  (ip) = (anchor);                                      \
253
17.0M
  (copy)++;                                             \
254
17.0M
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
255
372k
    (copy) = 0;                                         \
256
372k
    *(op)++ = MAX_COPY-1;                               \
257
372k
  }                                                     \
258
17.0M
}
259
260
18.6M
#define LITERAL2(ip, anchor, copy) {                    \
261
18.6M
  oc++; (anchor)++;                                     \
262
18.6M
  (ip) = (anchor);                                      \
263
18.6M
  (copy)++;                                             \
264
18.6M
  if (BLOSCLZ_UNLIKELY((copy) == MAX_COPY)) {           \
265
463k
    (copy) = 0;                                         \
266
463k
    oc++;                                               \
267
463k
  }                                                     \
268
18.6M
}
269
270
376k
#define MATCH_SHORT(op, op_limit, len, distance) {        \
271
376k
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))            \
272
376k
    goto out;                                             \
273
376k
  *(op)++ = (uint8_t)(((len) << 5U) + ((distance) >> 8U));\
274
376k
  *(op)++ = (uint8_t)(((distance) & 255U));               \
275
376k
}
276
277
505k
#define MATCH_LONG(op, op_limit, len, distance) {       \
278
505k
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))          \
279
505k
    goto out;                                           \
280
505k
  *(op)++ = (uint8_t)((7U << 5U) + ((distance) >> 8U)); \
281
558k
  for ((len) -= 7; (len) >= 255; (len) -= 255) {        \
282
52.9k
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))        \
283
52.9k
      goto out;                                         \
284
52.9k
    *(op)++ = 255;                                      \
285
52.9k
  }                                                     \
286
505k
  if (BLOSCLZ_UNLIKELY((op) + 2 > (op_limit)))          \
287
505k
    goto out;                                           \
288
505k
  *(op)++ = (uint8_t)(len);                             \
289
505k
  *(op)++ = (uint8_t)(((distance) & 255U));             \
290
505k
}
291
292
1.04k
#define MATCH_SHORT_FAR(op, op_limit, len, distance) {      \
293
1.04k
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
294
1.04k
    goto out;                                               \
295
1.04k
  *(op)++ = (uint8_t)(((len) << 5U) + 31);                  \
296
1.03k
  *(op)++ = 255;                                            \
297
1.03k
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
298
1.03k
  *(op)++ = (uint8_t)((distance) & 255U);                   \
299
1.03k
}
300
301
13.7k
#define MATCH_LONG_FAR(op, op_limit, len, distance) {       \
302
13.7k
  if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))              \
303
13.7k
    goto out;                                               \
304
13.7k
  *(op)++ = (7U << 5U) + 31;                                \
305
21.8k
  for ((len) -= 7; (len) >= 255; (len) -= 255) {            \
306
8.20k
    if (BLOSCLZ_UNLIKELY((op) + 1 > (op_limit)))            \
307
8.20k
      goto out;                                             \
308
8.20k
    *(op)++ = 255;                                          \
309
8.19k
  }                                                         \
310
13.7k
  if (BLOSCLZ_UNLIKELY((op) + 4 > (op_limit)))              \
311
13.6k
    goto out;                                               \
312
13.6k
  *(op)++ = (uint8_t)(len);                                 \
313
13.6k
  *(op)++ = 255;                                            \
314
13.6k
  *(op)++ = (uint8_t)((distance) >> 8U);                    \
315
13.6k
  *(op)++ = (uint8_t)((distance) & 255U);                   \
316
13.6k
}
317
318
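Together with LITERAL above, these macros define the complete blosclz token format: a control byte below 32 means "copy control+1 literal bytes", while a control byte of 32 or more carries a 3-bit length code in its high bits (7 selects the long form, extended with 255-valued bytes) plus the top 5 bits of a 13-bit distance; an all-ones distance field switches to the far forms, which append an explicit 16-bit distance. A minimal sketch that decodes one short-match token the same way blosclz_decompress does at the end of this file (illustrative only, not a library API):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* A short-match token: first byte 0x6A = 0b011'01010, second byte 0x10. */
  uint8_t b0 = 0x6A, b1 = 0x10;
  /* High 3 bits: length code 3 -> copy length = code + 2 = 5 bytes. */
  unsigned len = (b0 >> 5U) + 2;
  /* Low 5 bits plus the second byte give the biased distance
   * (the encoder stored distance - 1). */
  unsigned distance = (((unsigned)(b0 & 31U)) << 8U) + b1 + 1;
  printf("copy %u bytes from %u bytes back\n", len, distance);  /* 5, 2577 */
  return 0;
}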
319
// Get an estimate of the compression ratio of a buffer
320
43.8k
static double get_cratio(uint8_t* ibase, int maxlen, int minlen, int ipshift, uint32_t htab[], int8_t hashlog) {
321
43.8k
  uint8_t* ip = ibase;
322
43.8k
  int32_t oc = 0;
323
43.8k
  const uint16_t hashlen = (1U << (uint8_t)hashlog);
324
43.8k
  uint32_t hval;
325
43.8k
  uint32_t seq;
326
43.8k
  uint8_t copy;
327
  // Make a tradeoff between testing too much and too little
328
43.8k
  uint16_t limit = (maxlen > hashlen) ? hashlen : maxlen;
329
43.8k
  uint8_t* ip_bound = ibase + limit - 1;
330
43.8k
  uint8_t* ip_limit = ibase + limit - 12;
331
332
  // Initialize the hash table to distances of 0
333
43.8k
  memset(htab, 0, hashlen * sizeof(uint32_t));
334
335
  /* we start with literal copy */
336
43.8k
  copy = 4;
337
43.8k
  oc += 5;
338
339
  /* main loop */
340
19.2M
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
341
19.1M
    const uint8_t* ref;
342
19.1M
    unsigned distance;
343
19.1M
    uint8_t* anchor = ip;    /* comparison starting-point */
344
345
    /* find potential match */
346
19.1M
    seq = BLOSCLZ_READU32(ip);
347
19.1M
    HASH_FUNCTION(hval, seq, hashlog)
348
19.1M
    ref = ibase + htab[hval];
349
350
    /* calculate distance to the match */
351
19.1M
    distance = (unsigned int)(anchor - ref);
352
353
    /* update hash table */
354
19.1M
    htab[hval] = (uint32_t) (anchor - ibase);
355
356
19.1M
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
357
43.7k
      LITERAL2(ip, anchor, copy)
358
43.7k
      continue;
359
43.7k
    }
360
361
    /* is this a match? check the first 4 bytes */
362
19.1M
    if (BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip)) {
363
2.40M
      ref += 4;
364
2.40M
    }
365
16.7M
    else {
366
      /* no luck, copy as a literal */
367
16.7M
      LITERAL2(ip, anchor, copy)
368
16.7M
      continue;
369
16.7M
    }
370
371
    /* last matched byte */
372
2.40M
    ip = anchor + 4;
373
374
    /* distance is biased */
375
2.40M
    distance--;
376
377
    /* get runs or matches; zero distance means a run */
378
2.40M
    ip = get_run_or_match(ip, ip_bound, ref, !distance);
379
380
2.40M
    ip -= ipshift;
381
2.40M
    int len = (int)(ip - anchor);
382
2.40M
    if (len < minlen) {
383
1.88M
      LITERAL2(ip, anchor, copy)
384
1.88M
      continue;
385
1.88M
    }
386
387
    /* if we haven't copied anything, adjust the output counter */
388
518k
    if (!copy)
389
122k
      oc--;
390
    /* reset literal counter */
391
518k
    copy = 0;
392
393
    /* encode the match */
394
518k
    if (distance < MAX_DISTANCE) {
395
515k
      if (len >= 7) {
396
263k
        oc += ((len - 7) / 255) + 1;
397
263k
      }
398
515k
      oc += 2;
399
515k
    }
400
2.72k
    else {
401
      /* far away, but not yet in another galaxy... */
402
2.72k
      if (len >= 7) {
403
1.98k
        oc += ((len - 7) / 255) + 1;
404
1.98k
      }
405
2.72k
      oc += 4;
406
2.72k
    }
407
408
    /* update the hash at match boundary */
409
518k
    seq = BLOSCLZ_READU32(ip);
410
518k
    HASH_FUNCTION(hval, seq, hashlog)
411
518k
    htab[hval] = (uint32_t)(ip++ - ibase);
412
518k
    ip++;
413
    /* assuming literal copy */
414
518k
    oc++;
415
518k
  }
416
417
43.8k
  double ic = (double)(ip - ibase);
418
43.8k
  return ic / (double)oc;
419
43.8k
}
420
421
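get_cratio above runs the same matching loop as the real compressor but only counts output bytes (oc) instead of emitting them; the input size divided by that count is the estimated compression ratio. blosclz_compress below uses it as a gate. The gating logic in isolation (should_compress is an illustrative name; the thresholds mirror the cratio_ table in the function below):

#include <stdbool.h>

/* Per-clevel minimum estimated ratios, mirroring cratio_ below. */
static const double cratio_min[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};

/* Entropy probing: skip real compression when the estimate says the
 * payoff would not be worth the decompression cost. */
static bool should_compress(double estimated_cratio, int clevel) {
  return estimated_cratio >= cratio_min[clevel];
}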
422
int blosclz_compress(const int clevel, const void* input, int length,
423
43.8k
                     void* output, int maxout, blosc2_context* ctx) {
424
43.8k
  BLOSC_UNUSED_PARAM(ctx);
425
43.8k
  uint8_t* ibase = (uint8_t*)input;
426
43.8k
  uint32_t htab[1U << (uint8_t)HASH_LOG];
427
428
  /* When we go back in a match (shift), we obtain quite different compression properties.
429
   * It looks like 4 is more useful in combination with bitshuffle and small typesizes.
430
   * Fall back to 4 because it provides more consistent results for large cratios.
431
   * UPDATE: new experiments show that using a value of 3 is a bit better, at least for ERA5.
432
   * UPDATE 2: go back to 4, as it seems to provide better cratios in general.
433
   *
434
   * In this block we also check cratios for the beginning of the buffers and
435
   * possibly discard those that are small (they take too long to decompress).
436
   * This process is called _entropy probing_.
437
   */
438
43.8k
  unsigned ipshift = 4;
439
  // Minimum length for encoding (normally it is good to match the shift value)
440
43.8k
  unsigned minlen = 4;
441
442
43.8k
  uint8_t hashlog_[10] = {0, HASH_LOG - 2, HASH_LOG - 1, HASH_LOG, HASH_LOG,
443
43.8k
                          HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG, HASH_LOG};
444
43.8k
  uint8_t hashlog = hashlog_[clevel];
445
446
  // Experiments say that checking 1/4 of the buffer is enough to figure out the approximate cratio
447
  // UPDATE: new experiments with ERA5 datasets (float32) say that checking the whole buffer
448
  // is better (especially when combined with bitshuffle).
449
  // The loss in speed for checking the whole buffer is pretty negligible too.
450
43.8k
  int maxlen = length;
451
43.8k
  if (clevel < 2) {
452
35.2k
    maxlen /= 8;
453
35.2k
  }
454
8.64k
  else if (clevel < 4) {
455
1.67k
    maxlen /= 4;
456
1.67k
  }
457
6.97k
  else if (clevel < 7) {
458
1.27k
    maxlen /= 2;
459
1.27k
  }
460
  // Start probing somewhere inside the buffer
461
43.8k
  int shift = length - maxlen;
462
  // Actual entropy probing!
463
43.8k
  double cratio = get_cratio(ibase + shift, maxlen, minlen, ipshift, htab, hashlog);
464
  // discard probes with small compression ratios (too expensive)
465
43.8k
  double cratio_[10] = {0, 2, 1.5, 1.2, 1.2, 1.2, 1.2, 1.15, 1.1, 1.0};
466
43.8k
  if (cratio < cratio_[clevel]) {
467
12.1k
      goto out;
468
12.1k
  }
469
470
31.6k
  uint8_t* ip = ibase;
471
31.6k
  uint8_t* ip_bound = ibase + length - 1;
472
31.6k
  uint8_t* ip_limit = ibase + length - 12;
473
31.6k
  uint8_t* op = (uint8_t*)output;
474
31.6k
  const uint8_t* op_limit = op + maxout;
475
31.6k
  uint32_t seq;
476
31.6k
  uint8_t copy;
477
31.6k
  uint32_t hval;
478
479
  /* the input and output buffers cannot be smaller than 16 and 66 bytes, respectively, or we can get into trouble */
480
31.6k
  if (length < 16 || maxout < 66) {
481
138
    return 0;
482
138
  }
483
484
  // Initialize the hash table
485
31.5k
  memset(htab, 0, (1U << hashlog) * sizeof(uint32_t));
486
487
  /* we start with literal copy */
488
31.5k
  copy = 4;
489
31.5k
  *op++ = MAX_COPY - 1;
490
31.5k
  *op++ = *ip++;
491
31.5k
  *op++ = *ip++;
492
31.5k
  *op++ = *ip++;
493
31.5k
  *op++ = *ip++;
494
495
  /* main loop */
496
17.9M
  while (BLOSCLZ_LIKELY(ip < ip_limit)) {
497
17.9M
    const uint8_t* ref;
498
17.9M
    unsigned distance;
499
17.9M
    uint8_t* anchor = ip;    /* comparison starting-point */
500
501
    /* find potential match */
502
17.9M
    seq = BLOSCLZ_READU32(ip);
503
17.9M
    HASH_FUNCTION(hval, seq, hashlog)
504
17.9M
    ref = ibase + htab[hval];
505
506
    /* calculate distance to the match */
507
17.9M
    distance = (unsigned int)(anchor - ref);
508
509
    /* update hash table */
510
17.9M
    htab[hval] = (uint32_t) (anchor - ibase);
511
512
17.9M
    if (distance == 0 || (distance >= MAX_FARDISTANCE)) {
513
35.7k
      LITERAL(ip, op, op_limit, anchor, copy)
514
0
      continue;
515
35.7k
    }
516
517
    /* is this a match? check the first 4 bytes */
518
17.8M
    if (BLOSCLZ_UNLIKELY(BLOSCLZ_READU32(ref) == BLOSCLZ_READU32(ip))) {
519
3.35M
      ref += 4;
520
14.5M
    } else {
521
      /* no luck, copy as a literal */
522
14.5M
      LITERAL(ip, op, op_limit, anchor, copy)
523
0
      continue;
524
14.5M
    }
525
526
    /* last matched byte */
527
3.35M
    ip = anchor + 4;
528
529
    /* distance is biased */
530
3.35M
    distance--;
531
532
    /* get runs or matches; zero distance means a run */
533
3.35M
    ip = get_run_or_match(ip, ip_bound, ref, !distance);
534
535
    /* length is biased, '1' means a match of 3 bytes */
536
3.35M
    ip -= ipshift;
537
538
3.35M
    unsigned len = (int)(ip - anchor);
539
540
    // Encoding short lengths is expensive during decompression
541
3.35M
    if (len < minlen || (len <= 5 && distance >= MAX_DISTANCE)) {
542
2.45M
      LITERAL(ip, op, op_limit, anchor, copy)
543
0
      continue;
544
2.45M
    }
545
546
    /* if we have copied something, adjust the copy count */
547
896k
    if (copy)
548
      /* copy is biased, '0' means 1 byte copy */
549
629k
      *(op - copy - 1) = (uint8_t)(copy - 1);
550
266k
    else
551
      /* back, to overwrite the copy count */
552
266k
      op--;
553
    /* reset literal counter */
554
896k
    copy = 0;
555
556
    /* encode the match */
557
896k
    if (distance < MAX_DISTANCE) {
558
881k
      if (len < 7) {
559
376k
        MATCH_SHORT(op, op_limit, len, distance)
560
505k
      } else {
561
1.51M
        MATCH_LONG(op, op_limit, len, distance)
562
1.51M
      }
563
881k
    } else {
564
      /* far away, but not yet in another galaxy... */
565
14.7k
      distance -= MAX_DISTANCE;
566
14.7k
      if (len < 7) {
567
1.04k
        MATCH_SHORT_FAR(op, op_limit, len, distance)
568
13.7k
      } else {
569
41.0k
        MATCH_LONG_FAR(op, op_limit, len, distance)
570
41.0k
      }
571
14.7k
    }
572
573
    /* update the hash at match boundary */
574
896k
    seq = BLOSCLZ_READU32(ip);
575
896k
    HASH_FUNCTION(hval, seq, hashlog)
576
896k
    htab[hval] = (uint32_t) (ip++ - ibase);
577
896k
    if (clevel == 9) {
578
      // In some situations, including a second hash proves to be useful,
579
      // but not in others.  It is activated here at max clevel only.
580
341k
      seq >>= 8U;
581
341k
      HASH_FUNCTION(hval, seq, hashlog)
582
341k
      htab[hval] = (uint32_t) (ip++ - ibase);
583
341k
    }
584
555k
    else {
585
555k
      ip++;
586
555k
    }
587
588
896k
    if (BLOSCLZ_UNLIKELY(op + 1 > op_limit))
589
74
      goto out;
590
591
    /* assuming literal copy */
592
896k
    *op++ = MAX_COPY - 1;
593
896k
  }
594
595
  /* left-over as literal copy */
596
175k
  while (BLOSCLZ_UNLIKELY(ip <= ip_bound)) {
597
144k
    if (BLOSCLZ_UNLIKELY(op + 2 > op_limit)) goto out;
598
144k
    *op++ = *ip++;
599
144k
    copy++;
600
144k
    if (BLOSCLZ_UNLIKELY(copy == MAX_COPY)) {
601
902
      copy = 0;
602
902
      *op++ = MAX_COPY - 1;
603
902
    }
604
144k
  }
605
606
  /* if we have copied something, adjust the copy length */
607
30.5k
  if (copy)
608
30.2k
    *(op - copy - 1) = (uint8_t)(copy - 1);
609
307
  else
610
307
    op--;
611
612
  /* marker for blosclz */
613
30.5k
  *(uint8_t*)output |= (1U << 5U);
614
615
30.5k
  return (int)(op - (uint8_t*)output);
616
617
13.1k
  out:
618
13.1k
  return 0;
619
30.9k
}
620
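A round trip through the two entry points in this file, as a minimal sketch: blosclz_compress returns 0 when the input is too small or not compressible enough, and passing NULL for the unused ctx parameter is an assumption based on the BLOSC_UNUSED_PARAM above.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "blosclz.h"

int main(void) {
  uint8_t src[1024], comp[1024 + 66], back[1024];
  memset(src, 'A', sizeof src);              /* a highly compressible run */

  int csize = blosclz_compress(9, src, (int)sizeof src,
                               comp, (int)sizeof comp, NULL /* ctx unused */);
  if (csize <= 0) {                          /* 0 => store uncompressed instead */
    fprintf(stderr, "not compressible enough\n");
    return 1;
  }
  int dsize = blosclz_decompress(comp, csize, back, (int)sizeof back);
  printf("1024 -> %d -> %d bytes, ok=%d\n", csize, dsize,
         dsize == 1024 && memcmp(src, back, sizeof src) == 0);
  return 0;
}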
621
// See https://habr.com/en/company/yandex/blog/457612/
622
#if defined(__AVX2__)
623
624
#if defined(_MSC_VER)
625
#define ALIGNED_(x) __declspec(align(x))
626
#else
627
#if defined(__GNUC__)
628
#define ALIGNED_(x) __attribute__ ((aligned(x)))
629
#endif
630
#endif
631
#define ALIGNED_TYPE_(t, x) t ALIGNED_(x)
632
633
static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
634
{
635
  size_t offset = op - match;
636
  while (len >= 16) {
637
638
    static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
639
      {
640
                0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, // offset = 0, not used as mask, but for shift
641
                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // offset = 1
642
                0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
643
                0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
644
                0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
645
                0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
646
                0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
647
                0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
648
                0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
649
                0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
650
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
651
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
652
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
653
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
654
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
655
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
656
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  15, // offset = 16
657
      };
658
659
    _mm_storeu_si128((__m128i *)(op),
660
                     _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
661
                                      _mm_load_si128((const __m128i *)(masks) + offset)));
662
663
    match += masks[offset];
664
665
    op += 16;
666
    len -= 16;
667
  }
668
  // Deal with remainders
669
  for (; len > 0; len--) {
670
    *op++ = *match++;
671
  }
672
  return op;
673
}
674
#endif
675
676
// LZ4-style wildCopy, which can reach excellent copy bandwidth (even if insecure)
677
201k
static inline void wild_copy(uint8_t *out, const uint8_t* from, uint8_t* end) {
678
201k
  uint8_t* d = out;
679
201k
  const uint8_t* s = from;
680
201k
  uint8_t* const e = end;
681
682
1.13M
  do { memcpy(d,s,8); d+=8; s+=8; } while (d<e);
683
201k
}
684
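wild_copy above always moves whole 8-byte chunks and can write up to 7 bytes past end, which is why its caller in blosclz_decompress demands both op - ref >= 8 (each memcpy then reads bytes that are already in place, so overlapping LZ copies stay correct) and op_limit - op >= len + 8 (room for the overrun). A self-contained demonstration of the same pattern under those assumptions:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  /* Emulate an LZ match with distance 8 and length 16: source and
   * destination overlap, but by no less than 8 bytes. */
  uint8_t buf[8 + 16 + 8] = "abcdefgh";      /* 8 spare bytes for the overrun */
  uint8_t *op = buf + 8;
  const uint8_t *ref = buf;                  /* op - ref == 8: safe here */
  uint8_t *const end = op + 16;
  do { memcpy(op, ref, 8); op += 8; ref += 8; } while (op < end);
  printf("%.24s\n", (char *)buf);            /* abcdefghabcdefghabcdefgh */
  return 0;
}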
685
12.3k
int blosclz_decompress(const void* input, int length, void* output, int maxout) {
686
12.3k
  const uint8_t* ip = (const uint8_t*)input;
687
12.3k
  const uint8_t* ip_limit = ip + length;
688
12.3k
  uint8_t* op = (uint8_t*)output;
689
12.3k
  uint32_t ctrl;
690
12.3k
  uint8_t* op_limit = op + maxout;
691
12.3k
  if (BLOSCLZ_UNLIKELY(length == 0)) {
692
0
    return 0;
693
0
  }
694
12.3k
  ctrl = (*ip++) & 31U;
695
696
563k
  while (1) {
697
563k
    if (ctrl >= 32) {
698
      // match
699
285k
      int32_t len = (int32_t)(ctrl >> 5U) - 1 ;
700
285k
      int32_t ofs = (int32_t)(ctrl & 31U) << 8U;
701
285k
      uint8_t code;
702
285k
      const uint8_t* ref = op - ofs;
703
704
285k
      if (len == 7 - 1) {
705
216k
        do {
706
216k
          if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
707
0
            return 0;
708
0
          }
709
216k
          code = *ip++;
710
216k
          len += code;
711
216k
        } while (code == 255);
712
187k
      }
713
97.7k
      else {
714
97.7k
        if (BLOSCLZ_UNLIKELY(ip + 1 >= ip_limit)) {
715
0
          return 0;
716
0
        }
717
97.7k
      }
718
285k
      code = *ip++;
719
285k
      len += 3;
720
285k
      ref -= code;
721
722
      /* match from 16-bit distance */
723
285k
      if (BLOSCLZ_UNLIKELY(code == 255)) {
724
21.2k
        if (ofs == (31U << 8U)) {
725
11.5k
          if (ip + 1 >= ip_limit) {
726
0
            return 0;
727
0
          }
728
11.5k
          ofs = (*ip++) << 8U;
729
11.5k
          ofs += *ip++;
730
11.5k
          ref = op - ofs - MAX_DISTANCE;
731
11.5k
        }
732
21.2k
      }
733
734
285k
      if (BLOSCLZ_UNLIKELY(op + len > op_limit)) {
735
0
        return 0;
736
0
      }
737
738
285k
      if (BLOSCLZ_UNLIKELY(ref - 1 < (uint8_t*)output)) {
739
0
        return 0;
740
0
      }
741
742
285k
      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
743
285k
      ctrl = *ip++;
744
745
285k
      ref--;
746
285k
      if (ref == op - 1) {
747
        /* optimized copy for a run */
748
30.4k
        memset(op, *ref, len);
749
30.4k
        op += len;
750
30.4k
      }
751
254k
      else if ((op - ref >= 8) && (op_limit - op >= len + 8)) {
752
        // copy with an overlap not larger than 8
753
201k
        wild_copy(op, ref, op + len);
754
201k
        op += len;
755
201k
      }
756
53.4k
      else {
757
        // general copy with any overlap
758
#if 0 && defined(__AVX2__)
759
        if (op - ref <= 16) {
760
          // This is not faster on a combination of compilers (clang, gcc, icc) or machines, but
761
          // it is not much slower either.
762
          op = copy_match_16(op, ref, len);
763
        }
764
        else {
765
#endif
766
53.4k
          op = copy_match(op, ref, (unsigned) len);
767
#if 0 && defined(__AVX2__)
768
        }
769
#endif
770
53.4k
      }
771
285k
    }
772
278k
    else {
773
      // literal
774
278k
      ctrl++;
775
278k
      if (BLOSCLZ_UNLIKELY(op + ctrl > op_limit)) {
776
0
        return 0;
777
0
      }
778
278k
      if (BLOSCLZ_UNLIKELY(ip + ctrl > ip_limit)) {
779
0
        return 0;
780
0
      }
781
782
278k
      memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
783
      // On GCC 6, fastcopy is still faster than plain memcpy
784
      // However, using recent CLANG/LLVM 9.0, there is almost no difference
785
      // in performance.
786
      // And starting with CLANG/LLVM 10 and GCC 9, memcpy is generally faster.
787
      // op = fastcopy(op, ip, (unsigned) ctrl); ip += ctrl;
788
789
278k
      if (BLOSCLZ_UNLIKELY(ip >= ip_limit)) break;
790
266k
      ctrl = *ip++;
791
266k
    }
792
563k
  }
793
794
12.3k
  return (int)(op - (uint8_t*)output);
795
12.3k
}