Coverage Report

Created: 2024-05-21 06:22

/src/c-blosc2/plugins/codecs/ndlz/ndlz4x4.c
Line
Count
Source (jump to first uncovered line)
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
/*********************************************************************
12
  This codec is meant to leverage multidimensionality for getting
13
  better compression ratios.  The idea is to look for similarities
14
  in places that are closer in a euclidean metric, not the typical
15
  linear one.
16
**********************************************************************/
17
18
#include "ndlz4x4.h"
19
#include "ndlz.h"
20
#include "xxhash.h"
21
#include "../plugins/plugin_utils.h"
22
23
#include <stdlib.h>
24
#include <string.h>
25
26
/*
27
 * Give hints to the compiler for branch prediction optimization.
28
 */
29
#if defined(__GNUC__) && (__GNUC__ > 2)
30
#define NDLZ_EXPECT_CONDITIONAL(c)    (__builtin_expect((c), 1))
31
3.72k
#define NDLZ_UNEXPECT_CONDITIONAL(c)  (__builtin_expect((c), 0))
32
#else
33
#define NDLZ_EXPECT_CONDITIONAL(c)    (c)
34
#define NDLZ_UNEXPECT_CONDITIONAL(c)  (c)
35
#endif
36
37
/*
38
 * Use inlined functions for supported systems.
39
 */
40
#if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
41
#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
42
#endif
43
44
#define MAX_COPY 32U
45
0
#define MAX_DISTANCE 65535
46
47
48
#ifdef BLOSC_STRICT_ALIGN
49
#define NDLZ_READU16(p) ((p)[0] | (p)[1]<<8)
50
#define NDLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24)
51
#else
52
#define NDLZ_READU16(p) *((const uint16_t*)(p))
53
#define NDLZ_READU32(p) *((const uint32_t*)(p))
54
#endif
55
56
#define HASH_LOG (12)
57
58
59
int ndlz4_compress(const uint8_t *input, int32_t input_len, uint8_t *output, int32_t output_len,
60
0
                   uint8_t meta, blosc2_cparams *cparams) {
61
0
  BLOSC_UNUSED_PARAM(meta);
62
0
  BLOSC_ERROR_NULL(cparams, BLOSC2_ERROR_NULL_POINTER);
63
0
  BLOSC_ERROR_NULL(cparams->schunk, BLOSC2_ERROR_NULL_POINTER);
64
0
  uint8_t *smeta;
65
0
  int32_t smeta_len;
66
67
0
  if (blosc2_meta_get(cparams->schunk, "b2nd", &smeta, &smeta_len) < 0) {
68
0
    BLOSC_TRACE_ERROR("b2nd layer not found!");
69
0
    return BLOSC2_ERROR_FAILURE;
70
0
  }
71
72
0
  int8_t ndim;
73
0
  int64_t *shape = malloc(8 * sizeof(int64_t));
74
0
  int32_t *chunkshape = malloc(8 * sizeof(int32_t));
75
0
  int32_t *blockshape = malloc(8 * sizeof(int32_t));
76
0
  deserialize_meta(smeta, smeta_len, &ndim, shape, chunkshape, blockshape);
77
0
  free(smeta);
78
79
0
  if (ndim != 2) {
80
0
    BLOSC_TRACE_ERROR("This codec only works for ndim = 2");
81
0
    return BLOSC2_ERROR_FAILURE;
82
0
  }
83
84
0
  if (input_len != (blockshape[0] * blockshape[1])) {
85
0
    BLOSC_TRACE_ERROR("Length not equal to blocksize");
86
0
    return BLOSC2_ERROR_FAILURE;
87
0
  }
88
89
0
  if (NDLZ_UNEXPECT_CONDITIONAL(output_len < (int) (1 + ndim * sizeof(int32_t)))) {
90
0
    BLOSC_TRACE_ERROR("Output too small");
91
0
    return BLOSC2_ERROR_FAILURE;
92
0
  }
93
94
0
  uint8_t *ip = (uint8_t *) input;
95
0
  uint8_t *op = (uint8_t *) output;
96
0
  uint8_t *op_limit;
97
0
  uint32_t hval, hash_cell;
98
0
  uint32_t hash_triple[2] = {0};
99
0
  uint32_t hash_pair[3] = {0};
100
0
  uint8_t bufarea[16];
101
0
  uint8_t *buf_cell = bufarea;
102
0
  uint8_t buf_triple[12];
103
0
  uint8_t buf_pair[8];
104
0
  uint8_t *buf_aux;
105
0
  uint32_t tab_cell[1U << 12U] = {0};
106
0
  uint32_t tab_triple[1U << 12U] = {0};
107
0
  uint32_t tab_pair[1U << 12U] = {0};
108
0
  uint32_t update_triple[2] = {0};
109
0
  uint32_t update_pair[3] = {0};
110
111
  // Minimum cratios before issuing and _early giveup_
112
  // Remind that ndlz is not meant for cratios <= 2 (too costly to decompress)
113
114
0
  op_limit = op + output_len;
115
116
  // Initialize the hash table to distances of 0
117
0
  for (unsigned i = 0; i < (1U << 12U); i++) {
118
0
    tab_cell[i] = 0;
119
0
  }
120
121
  /* input and output buffer cannot be less than 16 and 66 bytes or we can get into trouble */
122
0
  int overhead = 17 + (blockshape[0] * blockshape[1] / 16 - 1) * 2;
123
0
  if (input_len < 16 || output_len < overhead) {
124
0
    BLOSC_TRACE_ERROR("Incorrect length or maxout");
125
0
    return 0;
126
0
  }
127
128
0
  uint8_t *obase = op;
129
130
  /* we start with literal copy */
131
0
  *op++ = ndim;
132
0
  memcpy(op, &blockshape[0], 4);
133
0
  op += 4;
134
0
  memcpy(op, &blockshape[1], 4);
135
0
  op += 4;
136
137
0
  uint32_t i_stop[2];
138
0
  for (int i = 0; i < 2; ++i) {
139
0
    i_stop[i] = (blockshape[i] + 3) / 4;
140
0
  }
141
142
  /* main loop */
143
0
  uint32_t padding[2];
144
0
  uint32_t ii[2];
145
0
  for (ii[0] = 0; ii[0] < i_stop[0]; ++ii[0]) {
146
0
    for (ii[1] = 0; ii[1] < i_stop[1]; ++ii[1]) {      // for each cell
147
0
      uint8_t token;
148
0
      for (int h = 0; h < 2; h++) {         // new cell -> new possible references
149
0
        update_triple[h] = 0;
150
0
        update_pair[h] = 0;
151
0
      }
152
0
      update_pair[2] = 0;
153
154
0
      if (NDLZ_UNEXPECT_CONDITIONAL(op + 16 + 1 > op_limit)) {
155
0
        free(shape);
156
0
        free(chunkshape);
157
0
        free(blockshape);
158
0
        return 0;
159
0
      }
160
161
0
      uint32_t orig = ii[0] * 4 * blockshape[1] + ii[1] * 4;
162
0
      if (((blockshape[0] % 4 != 0) && (ii[0] == i_stop[0] - 1)) ||
163
0
          ((blockshape[1] % 4 != 0) && (ii[1] == i_stop[1] - 1))) {
164
0
        token = 0;                                   // padding -> literal copy
165
0
        *op++ = token;
166
0
        if (ii[0] == i_stop[0] - 1) {
167
0
          padding[0] = (blockshape[0] % 4 == 0) ? 4 : blockshape[0] % 4;
168
0
        } else {
169
0
          padding[0] = 4;
170
0
        }
171
0
        if (ii[1] == i_stop[1] - 1) {
172
0
          padding[1] = (blockshape[1] % 4 == 0) ? 4 : blockshape[1] % 4;
173
0
        } else {
174
0
          padding[1] = 4;
175
0
        }
176
0
        for (uint32_t i = 0; i < padding[0]; i++) {
177
0
          memcpy(op, &ip[orig + i * blockshape[1]], padding[1]);
178
0
          op += padding[1];
179
0
        }
180
0
      } else {
181
0
        for (uint64_t i = 0; i < 4; i++) {           // fill cell buffer
182
0
          uint64_t ind = orig + i * blockshape[1];
183
0
          memcpy(buf_cell, &ip[ind], 4);
184
0
          buf_cell += 4;
185
0
        }
186
0
        buf_cell -= 16;
187
188
0
        const uint8_t *ref;
189
0
        uint32_t distance;
190
0
        uint8_t *anchor = op;    /* comparison starting-point */
191
192
        /* find potential match */
193
0
        hash_cell = XXH32(buf_cell, 16, 1);        // calculate cell hash
194
0
        hash_cell >>= 32U - 12U;
195
0
        ref = obase + tab_cell[hash_cell];
196
197
        /* calculate distance to the match */
198
0
        if (tab_cell[hash_cell] == 0) {
199
0
          distance = 0;
200
0
        } else {
201
0
          bool same = true;
202
0
          buf_aux = obase + tab_cell[hash_cell];
203
0
          for (int i = 0; i < 16; i++) {
204
0
            if (buf_cell[i] != buf_aux[i]) {
205
0
              same = false;
206
0
              break;
207
0
            }
208
0
          }
209
0
          if (same) {
210
0
            distance = (int32_t) (anchor - ref);
211
0
          } else {
212
0
            distance = 0;
213
0
          }
214
0
        }
215
216
0
        bool alleq = true;
217
0
        for (int i = 1; i < 16; i++) {
218
0
          if (buf_cell[i] != buf_cell[0]) {
219
0
            alleq = false;
220
0
            break;
221
0
          }
222
0
        }
223
0
        if (alleq) {                              // all elements of the cell equal
224
0
          token = (uint8_t) (1U << 6U);
225
0
          *op++ = token;
226
0
          *op++ = buf_cell[0];
227
228
0
        } else if (distance == 0 || (distance >= MAX_DISTANCE)) {   // no cell match
229
0
          bool literal = true;
230
231
          // 2 rows pairs matches
232
0
          for (int j = 1; j < 4; j++) {
233
0
            memcpy(buf_pair, buf_cell, 4);
234
0
            memcpy(&buf_pair[4], &buf_cell[j * 4], 4);
235
0
            hval = XXH32(buf_pair, 8, 1);        // calculate rows pair hash
236
0
            hval >>= 32U - 12U;
237
0
            ref = obase + tab_pair[hval];
238
            /* calculate distance to the match */
239
0
            bool same = true;
240
0
            uint16_t offset;
241
0
            if (tab_pair[hval] != 0) {
242
0
              buf_aux = obase + tab_pair[hval];
243
0
              for (int k = 0; k < 8; k++) {
244
0
                if (buf_pair[k] != buf_aux[k]) {
245
0
                  same = false;
246
0
                  break;
247
0
                }
248
0
              }
249
0
              offset = (uint16_t) (anchor - obase - tab_pair[hval]);
250
0
            } else {
251
0
              same = false;
252
0
            }
253
0
            if (same) {
254
0
              distance = (int32_t) (anchor - ref);
255
0
            } else {
256
0
              distance = 0;
257
0
            }
258
0
            if ((distance != 0) && (distance < MAX_DISTANCE)) {     /* rows pair match */
259
0
              int k, m, l = -1;
260
0
              for (k = 1; k < 4; k++) {
261
0
                if (k != j) {
262
0
                  if (l == -1) {
263
0
                    l = k;
264
0
                  } else {
265
0
                    m = k;
266
0
                  }
267
0
                }
268
0
              }
269
0
              memcpy(buf_pair, &buf_cell[l * 4], 4);
270
0
              memcpy(&buf_pair[4], &buf_cell[m * 4], 4);
271
0
              hval = XXH32(buf_pair, 8, 1);        // calculate rows pair hash
272
0
              hval >>= 32U - 12U;
273
0
              ref = obase + tab_pair[hval];
274
0
              same = true;
275
0
              if (tab_pair[hval] != 0) {
276
0
                buf_aux = obase + tab_pair[hval];
277
0
                for (k = 0; k < 8; k++) {
278
0
                  if (buf_pair[k] != buf_aux[k]) {
279
0
                    same = false;
280
0
                    break;
281
0
                  }
282
0
                }
283
0
              } else {
284
0
                same = false;
285
0
              }
286
0
              if (same) {
287
0
                distance = (int32_t) (anchor + l * 4 - ref);
288
0
              } else {
289
0
                distance = 0;
290
0
              }
291
0
              if ((distance != 0) && (distance < MAX_DISTANCE)) {   /* 2 pair matches */
292
0
                literal = false;
293
0
                token = (uint8_t) ((1U << 5U) | (j << 3U));
294
0
                *op++ = token;
295
0
                uint16_t offset_2 = (uint16_t) (anchor - obase - tab_pair[hval]);
296
0
                *(uint16_t *) op = offset;
297
0
                op += sizeof(offset);
298
0
                *(uint16_t *) op = offset_2;
299
0
                op += sizeof(offset_2);
300
0
                goto match;
301
0
              }
302
0
            }
303
0
          }
304
305
          // rows triples
306
0
          for (int i = 0; i < 2; i++) {
307
0
            memcpy(buf_triple, &buf_cell[i * 4], 4);
308
0
            for (int j = i + 1; j < 3; j++) {
309
0
              memcpy(&buf_triple[4], &buf_cell[j * 4], 4);
310
0
              for (int k = j + 1; k < 4; k++) {
311
0
                memcpy(&buf_triple[8], &buf_cell[k * 4], 4);
312
0
                hval = XXH32(buf_triple, 12, 1);        // calculate triple hash
313
0
                hval >>= 32U - 12U;
314
                /* calculate distance to the match */
315
0
                bool same = true;
316
0
                uint16_t offset;
317
0
                if (tab_triple[hval] != 0) {
318
0
                  buf_aux = obase + tab_triple[hval];
319
0
                  for (int l = 0; l < 12; l++) {
320
0
                    if (buf_triple[l] != buf_aux[l]) {
321
0
                      same = false;
322
0
                      break;
323
0
                    }
324
0
                  }
325
0
                  offset = (uint16_t) (anchor - obase - tab_triple[hval]);
326
0
                } else {
327
0
                  same = false;
328
0
                  if ((j - i == 1) && (k - j == 1)) {
329
0
                    update_triple[i] = (uint32_t) (anchor + 1 + i * 4 - obase);     /* update hash table */
330
0
                    hash_triple[i] = hval;
331
0
                  }
332
0
                }
333
0
                ref = obase + tab_triple[hval];
334
335
0
                if (same) {
336
0
                  distance = (int32_t) (anchor + i * 4 - ref);
337
0
                } else {
338
0
                  distance = 0;
339
0
                }
340
0
                if ((distance != 0) && (distance < MAX_DISTANCE)) {
341
0
                  literal = false;
342
0
                  if (i == 1) {
343
0
                    token = (uint8_t) (7U << 5U);
344
0
                  } else {
345
0
                    token = (uint8_t) ((7U << 5U) | ((j + k - 2) << 3U));
346
0
                  }
347
0
                  *op++ = token;
348
0
                  memcpy(op, &offset, 2);
349
0
                  op += 2;
350
0
                  for (int l = 0; l < 4; l++) {
351
0
                    if ((l != i) && (l != j) && (l != k)) {
352
0
                      memcpy(op, &buf_cell[4 * l], 4);
353
0
                      op += 4;
354
0
                      goto match;
355
0
                    }
356
0
                  }
357
0
                }
358
0
              }
359
0
            }
360
0
          }
361
362
          // rows pairs
363
0
          for (int i = 0; i < 3; i++) {
364
0
            memcpy(buf_pair, &buf_cell[i * 4], 4);
365
0
            for (int j = i + 1; j < 4; j++) {
366
0
              memcpy(&buf_pair[4], &buf_cell[j * 4], 4);
367
0
              hval = XXH32(buf_pair, 8, 1);        // calculate rows pair hash
368
0
              hval >>= 32U - 12U;
369
0
              ref = obase + tab_pair[hval];
370
              /* calculate distance to the match */
371
0
              bool same = true;
372
0
              uint16_t offset;
373
0
              if (tab_pair[hval] != 0) {
374
0
                buf_aux = obase + tab_pair[hval];
375
0
                for (int k = 0; k < 8; k++) {
376
0
                  if (buf_pair[k] != buf_aux[k]) {
377
0
                    same = false;
378
0
                    break;
379
0
                  }
380
0
                }
381
0
                offset = (uint16_t) (anchor - obase - tab_pair[hval]);
382
0
              } else {
383
0
                same = false;
384
0
                if (j - i == 1) {
385
0
                  update_pair[i] = (uint32_t) (anchor + 1 + i * 4 - obase);     /* update hash table */
386
0
                  hash_pair[i] = hval;
387
0
                }
388
0
              }
389
0
              if (same) {
390
0
                distance = (int32_t) (anchor + i * 4 - ref);
391
0
              } else {
392
0
                distance = 0;
393
0
              }
394
0
              if ((distance != 0) && (distance < MAX_DISTANCE)) {     /* rows pair match */
395
0
                literal = false;
396
0
                if (i == 2) {
397
0
                  token = (uint8_t) (1U << 7U);
398
0
                } else {
399
0
                  token = (uint8_t) ((1U << 7U) | (i << 5U) | (j << 3U));
400
0
                }
401
0
                *op++ = token;
402
0
                memcpy(op, &offset, 2);
403
0
                op += 2;
404
0
                for (int k = 0; k < 4; k++) {
405
0
                  if ((k != i) && (k != j)) {
406
0
                    memcpy(op, &buf_cell[4 * k], 4);
407
0
                    op += 4;
408
0
                  }
409
0
                }
410
0
                goto match;
411
0
              }
412
0
            }
413
0
          }
414
415
0
          match:
416
0
          if (literal) {
417
0
            tab_cell[hash_cell] = (uint32_t) (anchor + 1 - obase);     /* update hash tables */
418
0
            if (update_triple[0] != 0) {
419
0
              for (int h = 0; h < 2; h++) {
420
0
                tab_triple[hash_triple[h]] = update_triple[h];
421
0
              }
422
0
            }
423
0
            if (update_pair[0] != 0) {
424
0
              for (int h = 0; h < 3; h++) {
425
0
                tab_pair[hash_pair[h]] = update_pair[h];
426
0
              }
427
0
            }
428
0
            token = 0;
429
0
            *op++ = token;
430
0
            memcpy(op, buf_cell, 16);
431
0
            op += 16;
432
0
          }
433
434
0
        } else {   // cell match
435
0
          token = (uint8_t) ((1U << 7U) | (1U << 6U));
436
0
          *op++ = token;
437
0
          uint16_t offset = (uint16_t) (anchor - obase - tab_cell[hash_cell]);
438
0
          memcpy(op, &offset, 2);
439
0
          op += 2;
440
0
        }
441
442
0
      }
443
0
      if ((op - obase) > input_len) {
444
0
        BLOSC_TRACE_ERROR("Compressed data is bigger than input!");
445
0
        return 0;
446
0
      }
447
0
    }
448
0
  }
449
450
0
  free(shape);
451
0
  free(chunkshape);
452
0
  free(blockshape);
453
454
0
  return (int) (op - obase);
455
0
}
456
457
458
// See https://habr.com/en/company/yandex/blog/457612/
459
#ifdef __AVX2__
460
461
#if defined(_MSC_VER)
462
#define ALIGNED_(x) __declspec(align(x))
463
#else
464
#if defined(__GNUC__)
465
#define ALIGNED_(x) __attribute__ ((aligned(x)))
466
#endif
467
#endif
468
#define ALIGNED_TYPE_(t, x) t ALIGNED_(x)
469
470
static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len)
471
{
472
  size_t offset = op - match;
473
  while (len >= 16) {
474
475
    static const ALIGNED_TYPE_(uint8_t, 16) masks[] =
476
      {
477
                0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, // offset = 0, not used as mask, but for shift
478
                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // offset = 1
479
                0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
480
                0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
481
                0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
482
                0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
483
                0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
484
                0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
485
                0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
486
                0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
487
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
488
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
489
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
490
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
491
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
492
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
493
                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  15, // offset = 16
494
      };
495
496
    _mm_storeu_si128((__m128i *)(op),
497
                     _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)),
498
                                      _mm_load_si128((const __m128i *)(masks) + offset)));
499
500
    match += masks[offset];
501
502
    op += 16;
503
    len -= 16;
504
  }
505
  // Deal with remainders
506
  for (; len > 0; len--) {
507
    *op++ = *match++;
508
  }
509
  return op;
510
}
511
#endif
512
513
514
int ndlz4_decompress(const uint8_t *input, int32_t input_len, uint8_t *output, int32_t output_len,
515
1.03k
                     uint8_t meta, blosc2_dparams *dparams) {
516
1.03k
  BLOSC_UNUSED_PARAM(meta);
517
1.03k
  BLOSC_UNUSED_PARAM(dparams);
518
1.03k
  BLOSC_ERROR_NULL(input, BLOSC2_ERROR_NULL_POINTER);
519
1.03k
  BLOSC_ERROR_NULL(output, BLOSC2_ERROR_NULL_POINTER);
520
521
1.03k
  uint8_t *ip = (uint8_t *) input;
522
1.03k
  uint8_t *ip_limit = ip + input_len;
523
1.03k
  uint8_t *op = (uint8_t *) output;
524
1.03k
  uint8_t ndim;
525
1.03k
  int32_t blockshape[2];
526
1.03k
  int32_t eshape[2];
527
1.03k
  uint8_t *buffercpy;
528
1.03k
  uint8_t local_buffer[16];
529
1.03k
  uint8_t token;
530
1.03k
  if (NDLZ_UNEXPECT_CONDITIONAL(input_len < 8)) {
531
6
    return 0;
532
6
  }
533
534
  /* we start with literal copy */
535
1.02k
  ndim = *ip;
536
1.02k
  ip++;
537
1.02k
  if (ndim != 2) {
538
6
    BLOSC_TRACE_ERROR("This codec only works for ndim = 2");
539
0
    return BLOSC2_ERROR_FAILURE;
540
6
  }
541
1.02k
  memcpy(&blockshape[0], ip, 4);
542
1.02k
  ip += 4;
543
1.02k
  memcpy(&blockshape[1], ip, 4);
544
1.02k
  ip += 4;
545
546
  // Sanity check.  See https://www.cve.org/CVERecord?id=CVE-2024-3204
547
1.02k
  if (output_len < 0 || blockshape[0] < 0 || blockshape[1] < 0) {
548
64
    BLOSC_TRACE_ERROR("Output length or blockshape is negative");
549
0
    return BLOSC2_ERROR_FAILURE;
550
64
  }
551
552
956
  eshape[0] = ((blockshape[0] + 3) / 4) * 4;
553
956
  eshape[1] = ((blockshape[1] + 3) / 4) * 4;
554
555
956
  if (NDLZ_UNEXPECT_CONDITIONAL((int64_t)output_len < (int64_t)blockshape[0] * (int64_t)blockshape[1])) {
556
105
    BLOSC_TRACE_ERROR("The blockshape is bigger than the output buffer");
557
0
    return 0;
558
105
  }
559
851
  memset(op, 0, blockshape[0] * blockshape[1]);
560
561
851
  uint32_t i_stop[2];
562
2.55k
  for (int i = 0; i < 2; ++i) {
563
1.70k
    i_stop[i] = eshape[i] / 4;
564
1.70k
  }
565
566
  /* main loop */
567
851
  uint32_t ii[2];
568
851
  uint32_t padding[2] = {0};
569
851
  uint32_t ind = 0;
570
851
  uint8_t cell_aux[16];
571
8.95G
  for (ii[0] = 0; ii[0] < i_stop[0]; ++ii[0]) {
572
8.95G
    for (ii[1] = 0; ii[1] < i_stop[1]; ++ii[1]) {      // for each cell
573
1.73k
      if (NDLZ_UNEXPECT_CONDITIONAL(ip > ip_limit)) {
574
5
        BLOSC_TRACE_ERROR("Exceeding input length");
575
0
        return BLOSC2_ERROR_FAILURE;
576
5
      }
577
1.73k
      if (ii[0] == i_stop[0] - 1) {
578
952
        padding[0] = (blockshape[0] % 4 == 0) ? 4 : blockshape[0] % 4;
579
952
      } else {
580
779
        padding[0] = 4;
581
779
      }
582
1.73k
      if (ii[1] == i_stop[1] - 1) {
583
1.57k
        padding[1] = (blockshape[1] % 4 == 0) ? 4 : blockshape[1] % 4;
584
1.57k
      } else {
585
155
        padding[1] = 4;
586
155
      }
587
1.73k
      token = *ip++;
588
1.73k
      if (token == 0) {    // no match
589
316
        buffercpy = ip;
590
316
        ip += padding[0] * padding[1];
591
1.41k
      } else if (token == (uint8_t) ((1U << 7U) | (1U << 6U))) {  // cell match
592
73
        uint16_t offset = *((uint16_t *) ip);
593
73
        buffercpy = ip - offset - 1;
594
73
        ip += 2;
595
1.34k
      } else if (token == (uint8_t) (1U << 6U)) { // whole cell of same element
596
138
        buffercpy = cell_aux;
597
138
        memset(buffercpy, *ip, 16);
598
138
        ip++;
599
1.20k
      } else if (token >= 224) { // three rows match
600
392
        buffercpy = local_buffer;
601
392
        uint16_t offset = *((uint16_t *) ip);
602
392
        offset += 3;
603
392
        ip += 2;
604
392
        int i, j, k;
605
392
        if ((token >> 3U) == 28) {
606
84
          i = 1;
607
84
          j = 2;
608
84
          k = 3;
609
308
        } else {
610
308
          i = 0;
611
308
          if ((token >> 3U) < 30) {
612
83
            j = 1;
613
83
            k = 2;
614
225
          } else {
615
225
            k = 3;
616
225
            if ((token >> 3U) == 30) {
617
121
              j = 1;
618
121
            } else {
619
104
              j = 2;
620
104
            }
621
225
          }
622
308
        }
623
392
        memcpy(&buffercpy[i * 4], ip - offset, 4);
624
392
        memcpy(&buffercpy[j * 4], ip - offset + 4, 4);
625
392
        memcpy(&buffercpy[k * 4], ip - offset + 8, 4);
626
987
        for (int l = 0; l < 4; l++) {
627
987
          if ((l != i) && (l != j) && (l != k)) {
628
392
            memcpy(&buffercpy[l * 4], ip, 4);
629
392
            ip += 4;
630
392
            break;
631
392
          }
632
987
        }
633
634
812
      } else if ((token >= 128) && (token <= 191)) { // rows pair match
635
538
        buffercpy = local_buffer;
636
538
        uint16_t offset = *((uint16_t *) ip);
637
538
        offset += 3;
638
538
        ip += 2;
639
538
        int i, j;
640
538
        if (token == 128) {
641
100
          i = 2;
642
100
          j = 3;
643
438
        } else {
644
438
          i = (token - 128) >> 5U;
645
438
          j = ((token - 128) >> 3U) - (i << 2U);
646
438
        }
647
538
        memcpy(&buffercpy[i * 4], ip - offset, 4);
648
538
        memcpy(&buffercpy[j * 4], ip - offset + 4, 4);
649
2.69k
        for (int k = 0; k < 4; k++) {
650
2.15k
          if ((k != i) && (k != j)) {
651
1.13k
            memcpy(&buffercpy[k * 4], ip, 4);
652
1.13k
            ip += 4;
653
1.13k
          }
654
2.15k
        }
655
538
      } else if ((token >= 40) && (token <= 63)) {  // 2 rows pair matches
656
251
        buffercpy = local_buffer;
657
251
        uint16_t offset_1 = *((uint16_t *) ip);
658
251
        offset_1 += 5;
659
251
        ip += 2;
660
251
        uint16_t offset_2 = *((uint16_t *) ip);
661
251
        offset_2 += 5;
662
251
        ip += 2;
663
251
        int i, j, k, l, m;
664
251
        i = 0;
665
251
        j = ((token - 32) >> 3U);
666
251
        l = -1;
667
1.00k
        for (k = 1; k < 4; k++) {
668
753
          if ((k != i) && (k != j)) {
669
502
            if (l == -1) {
670
251
              l = k;
671
251
            } else {
672
251
              m = k;
673
251
            }
674
502
          }
675
753
        }
676
251
        memcpy(&buffercpy[i * 4], ip - offset_1, 4);
677
251
        memcpy(&buffercpy[j * 4], ip - offset_1 + 4, 4);
678
251
        memcpy(&buffercpy[l * 4], ip - offset_2, 4);
679
251
        memcpy(&buffercpy[m * 4], ip - offset_2 + 4, 4);
680
681
251
      } else {
682
23
        BLOSC_TRACE_ERROR("Invalid token: %u at cell [%d, %d]\n", token, ii[0], ii[1]);
683
0
        return BLOSC2_ERROR_FAILURE;
684
23
      }
685
      // fill op with buffercpy
686
1.70k
      uint32_t orig = ii[0] * 4 * blockshape[1] + ii[1] * 4;
687
8.54k
      for (uint32_t i = 0; i < 4; i++) {
688
6.83k
        if (i < padding[0]) {
689
5.43k
          ind = orig + i * blockshape[1];
690
5.43k
          memcpy(&op[ind], buffercpy, padding[1]);
691
5.43k
        }
692
6.83k
        buffercpy += padding[1];
693
6.83k
      }
694
1.70k
      if (ind > (uint32_t) output_len) {
695
0
        BLOSC_TRACE_ERROR("Exceeding output size");
696
0
        return BLOSC2_ERROR_FAILURE;
697
0
      }
698
1.70k
    }
699
8.95G
  }
700
823
  ind += padding[1];
701
702
823
  if ((int32_t)ind != (blockshape[0] * blockshape[1])) {
703
0
    BLOSC_TRACE_ERROR("Output size is not compatible with embedded blockshape");
704
0
    return BLOSC2_ERROR_FAILURE;
705
0
  }
706
823
  if (ind > (uint32_t) output_len) {
707
0
    BLOSC_TRACE_ERROR("Exceeding output size");
708
0
    return BLOSC2_ERROR_FAILURE;
709
0
  }
710
711
823
  return (int) ind;
712
823
}