/src/c-blosc2/plugins/codecs/ndlz/ndlz4x4.c
Line | Count | Source |
1 | | /********************************************************************* |
2 | | Blosc - Blocked Shuffling and Compression Library |
3 | | |
4 | | Copyright (c) 2021 Blosc Development Team <blosc@blosc.org> |
5 | | https://blosc.org |
6 | | License: BSD 3-Clause (see LICENSE.txt) |
7 | | |
8 | | See LICENSE.txt for details about copyright and rights to use. |
9 | | **********************************************************************/ |
10 | | |
11 | | /********************************************************************* |
12 | | This codec is meant to leverage multidimensionality for getting |
13 | | better compression ratios. The idea is to look for similarities |
14 | | in places that are closer in a euclidean metric, not the typical |
15 | | linear one. |
16 | | **********************************************************************/ |
17 | | |
18 | | #include "ndlz4x4.h" |
19 | | #include "xxhash.h" |
20 | | #include "b2nd.h" |
21 | | |
22 | | #include <stdlib.h> |
23 | | #include <string.h> |
24 | | |
25 | | /* |
26 | | * Give hints to the compiler for branch prediction optimization. |
27 | | */ |
28 | | #if defined(__GNUC__) && (__GNUC__ > 2) |
29 | | #define NDLZ_EXPECT_CONDITIONAL(c) (__builtin_expect((c), 1)) |
30 | 2.39k | #define NDLZ_UNEXPECT_CONDITIONAL(c) (__builtin_expect((c), 0)) |
31 | | #else |
32 | | #define NDLZ_EXPECT_CONDITIONAL(c) (c) |
33 | | #define NDLZ_UNEXPECT_CONDITIONAL(c) (c) |
34 | | #endif |
35 | | |
36 | | /* |
37 | | * Use inlined functions for supported systems. |
38 | | */ |
39 | | #if defined(_MSC_VER) && !defined(__cplusplus) /* Visual Studio */ |
40 | | #define inline __inline /* Visual C is not C99, but supports some kind of inline */ |
41 | | #endif |
42 | | |
43 | | #define MAX_COPY 32U |
44 | 0 | #define MAX_DISTANCE 65535 |
45 | | |
46 | | |
47 | | #ifdef BLOSC_STRICT_ALIGN |
48 | | #define NDLZ_READU16(p) ((p)[0] | (p)[1]<<8) |
49 | | #define NDLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24) |
50 | | #else |
51 | | #define NDLZ_READU16(p) *((const uint16_t*)(p)) |
52 | | #define NDLZ_READU32(p) *((const uint32_t*)(p)) |
53 | | #endif |
54 | | |
55 | | #define HASH_LOG (12) |
56 | | |
57 | | |
58 | | int ndlz4_compress(const uint8_t *input, int32_t input_len, uint8_t *output, int32_t output_len, |
59 | 0 | uint8_t meta, blosc2_cparams *cparams) { |
60 | 0 | BLOSC_UNUSED_PARAM(meta); |
61 | 0 | BLOSC_ERROR_NULL(cparams, BLOSC2_ERROR_NULL_POINTER); |
62 | 0 | BLOSC_ERROR_NULL(cparams->schunk, BLOSC2_ERROR_NULL_POINTER); |
63 | 0 | uint8_t *smeta; |
64 | 0 | int32_t smeta_len; |
65 | |
|
66 | 0 | if (blosc2_meta_get(cparams->schunk, "b2nd", &smeta, &smeta_len) < 0) { |
67 | 0 | BLOSC_TRACE_ERROR("b2nd layer not found!"); |
68 | 0 | return BLOSC2_ERROR_FAILURE; |
69 | 0 | } |
70 | | |
71 | 0 | int8_t ndim; |
72 | 0 | int64_t *shape = malloc(8 * sizeof(int64_t)); |
73 | 0 | int32_t *chunkshape = malloc(8 * sizeof(int32_t)); |
74 | 0 | int32_t *blockshape = malloc(8 * sizeof(int32_t)); |
75 | 0 | b2nd_deserialize_meta(smeta, smeta_len, &ndim, shape, chunkshape, blockshape, NULL, NULL); |
76 | 0 | free(smeta); |
77 | |
|
78 | 0 | if (ndim != 2) { |
79 | 0 | BLOSC_TRACE_ERROR("This codec only works for ndim = 2"); |
80 | 0 | return BLOSC2_ERROR_FAILURE; |
81 | 0 | } |
82 | | |
83 | 0 | if (input_len != (blockshape[0] * blockshape[1])) { |
84 | 0 | BLOSC_TRACE_ERROR("Length not equal to blocksize"); |
85 | 0 | return BLOSC2_ERROR_FAILURE; |
86 | 0 | } |
87 | | |
88 | 0 | if (NDLZ_UNEXPECT_CONDITIONAL(output_len < (int) (1 + ndim * sizeof(int32_t)))) { |
89 | 0 | BLOSC_TRACE_ERROR("Output too small"); |
90 | 0 | return BLOSC2_ERROR_FAILURE; |
91 | 0 | } |
92 | | |
93 | 0 | uint8_t *ip = (uint8_t *) input; |
94 | 0 | uint8_t *op = (uint8_t *) output; |
95 | 0 | uint8_t *op_limit; |
96 | 0 | uint32_t hval, hash_cell; |
97 | 0 | uint32_t hash_triple[2] = {0}; |
98 | 0 | uint32_t hash_pair[3] = {0}; |
99 | 0 | uint8_t bufarea[16]; |
100 | 0 | uint8_t *buf_cell = bufarea; |
101 | 0 | uint8_t buf_triple[12]; |
102 | 0 | uint8_t buf_pair[8]; |
103 | 0 | uint8_t *buf_aux; |
104 | 0 | uint32_t tab_cell[1U << 12U] = {0}; |
105 | 0 | uint32_t tab_triple[1U << 12U] = {0}; |
106 | 0 | uint32_t tab_pair[1U << 12U] = {0}; |
107 | 0 | uint32_t update_triple[2] = {0}; |
108 | 0 | uint32_t update_pair[3] = {0}; |
109 | | |
110 | | // Minimum cratios before issuing and _early giveup_ |
111 | | // Remind that ndlz is not meant for cratios <= 2 (too costly to decompress) |
112 | |
|
113 | 0 | op_limit = op + output_len; |
114 | | |
115 | | // Initialize the hash table to distances of 0 |
116 | 0 | for (unsigned i = 0; i < (1U << 12U); i++) { |
117 | 0 | tab_cell[i] = 0; |
118 | 0 | } |
119 | | |
120 | | /* input and output buffer cannot be less than 16 and 66 bytes or we can get into trouble */ |
121 | 0 | int overhead = 17 + (blockshape[0] * blockshape[1] / 16 - 1) * 2; |
122 | 0 | if (input_len < 16 || output_len < overhead) { |
123 | 0 | BLOSC_TRACE_ERROR("Incorrect length or maxout"); |
124 | 0 | return 0; |
125 | 0 | } |
126 | | |
127 | 0 | uint8_t *obase = op; |
128 | | |
129 | | /* we start with literal copy */ |
130 | 0 | *op++ = ndim; |
131 | 0 | memcpy(op, &blockshape[0], 4); |
132 | 0 | op += 4; |
133 | 0 | memcpy(op, &blockshape[1], 4); |
134 | 0 | op += 4; |
135 | |
|
136 | 0 | uint32_t i_stop[2]; |
137 | 0 | for (int i = 0; i < 2; ++i) { |
138 | 0 | i_stop[i] = (blockshape[i] + 3) / 4; |
139 | 0 | } |
140 | | |
141 | | /* main loop */ |
142 | 0 | uint32_t padding[2]; |
143 | 0 | uint32_t ii[2]; |
144 | 0 | for (ii[0] = 0; ii[0] < i_stop[0]; ++ii[0]) { |
145 | 0 | for (ii[1] = 0; ii[1] < i_stop[1]; ++ii[1]) { // for each cell |
146 | 0 | uint8_t token; |
147 | 0 | for (int h = 0; h < 2; h++) { // new cell -> new possible references |
148 | 0 | update_triple[h] = 0; |
149 | 0 | update_pair[h] = 0; |
150 | 0 | } |
151 | 0 | update_pair[2] = 0; |
152 | |
|
153 | 0 | if (NDLZ_UNEXPECT_CONDITIONAL(op + 16 + 1 > op_limit)) { |
154 | 0 | free(shape); |
155 | 0 | free(chunkshape); |
156 | 0 | free(blockshape); |
157 | 0 | return 0; |
158 | 0 | } |
159 | | |
160 | 0 | uint32_t orig = ii[0] * 4 * blockshape[1] + ii[1] * 4; |
161 | 0 | if (((blockshape[0] % 4 != 0) && (ii[0] == i_stop[0] - 1)) || |
162 | 0 | ((blockshape[1] % 4 != 0) && (ii[1] == i_stop[1] - 1))) { |
163 | 0 | token = 0; // padding -> literal copy |
164 | 0 | *op++ = token; |
165 | 0 | if (ii[0] == i_stop[0] - 1) { |
166 | 0 | padding[0] = (blockshape[0] % 4 == 0) ? 4 : blockshape[0] % 4; |
167 | 0 | } else { |
168 | 0 | padding[0] = 4; |
169 | 0 | } |
170 | 0 | if (ii[1] == i_stop[1] - 1) { |
171 | 0 | padding[1] = (blockshape[1] % 4 == 0) ? 4 : blockshape[1] % 4; |
172 | 0 | } else { |
173 | 0 | padding[1] = 4; |
174 | 0 | } |
175 | 0 | for (uint32_t i = 0; i < padding[0]; i++) { |
176 | 0 | memcpy(op, &ip[orig + i * blockshape[1]], padding[1]); |
177 | 0 | op += padding[1]; |
178 | 0 | } |
179 | 0 | } else { |
180 | 0 | for (uint64_t i = 0; i < 4; i++) { // fill cell buffer |
181 | 0 | uint64_t ind = orig + i * blockshape[1]; |
182 | 0 | memcpy(buf_cell, &ip[ind], 4); |
183 | 0 | buf_cell += 4; |
184 | 0 | } |
185 | 0 | buf_cell -= 16; |
186 | |
|
187 | 0 | const uint8_t *ref; |
188 | 0 | uint32_t distance; |
189 | 0 | uint8_t *anchor = op; /* comparison starting-point */ |
190 | | |
191 | | /* find potential match */ |
192 | 0 | hash_cell = XXH32(buf_cell, 16, 1); // calculate cell hash |
193 | 0 | hash_cell >>= 32U - 12U; |
194 | 0 | ref = obase + tab_cell[hash_cell]; |
195 | | |
196 | | /* calculate distance to the match */ |
197 | 0 | if (tab_cell[hash_cell] == 0) { |
198 | 0 | distance = 0; |
199 | 0 | } else { |
200 | 0 | bool same = true; |
201 | 0 | buf_aux = obase + tab_cell[hash_cell]; |
202 | 0 | for (int i = 0; i < 16; i++) { |
203 | 0 | if (buf_cell[i] != buf_aux[i]) { |
204 | 0 | same = false; |
205 | 0 | break; |
206 | 0 | } |
207 | 0 | } |
208 | 0 | if (same) { |
209 | 0 | distance = (int32_t) (anchor - ref); |
210 | 0 | } else { |
211 | 0 | distance = 0; |
212 | 0 | } |
213 | 0 | } |
214 | |
|
215 | 0 | bool alleq = true; |
216 | 0 | for (int i = 1; i < 16; i++) { |
217 | 0 | if (buf_cell[i] != buf_cell[0]) { |
218 | 0 | alleq = false; |
219 | 0 | break; |
220 | 0 | } |
221 | 0 | } |
222 | 0 | if (alleq) { // all elements of the cell equal |
223 | 0 | token = (uint8_t) (1U << 6U); |
224 | 0 | *op++ = token; |
225 | 0 | *op++ = buf_cell[0]; |
226 | |
|
227 | 0 | } else if (distance == 0 || (distance >= MAX_DISTANCE)) { // no cell match |
228 | 0 | bool literal = true; |
229 | | |
230 | | // 2 rows pairs matches |
231 | 0 | for (int j = 1; j < 4; j++) { |
232 | 0 | memcpy(buf_pair, buf_cell, 4); |
233 | 0 | memcpy(&buf_pair[4], &buf_cell[j * 4], 4); |
234 | 0 | hval = XXH32(buf_pair, 8, 1); // calculate rows pair hash |
235 | 0 | hval >>= 32U - 12U; |
236 | 0 | ref = obase + tab_pair[hval]; |
237 | | /* calculate distance to the match */ |
238 | 0 | bool same = true; |
239 | 0 | uint16_t offset; |
240 | 0 | if (tab_pair[hval] != 0) { |
241 | 0 | buf_aux = obase + tab_pair[hval]; |
242 | 0 | for (int k = 0; k < 8; k++) { |
243 | 0 | if (buf_pair[k] != buf_aux[k]) { |
244 | 0 | same = false; |
245 | 0 | break; |
246 | 0 | } |
247 | 0 | } |
248 | 0 | offset = (uint16_t) (anchor - obase - tab_pair[hval]); |
249 | 0 | } else { |
250 | 0 | same = false; |
251 | 0 | } |
252 | 0 | if (same) { |
253 | 0 | distance = (int32_t) (anchor - ref); |
254 | 0 | } else { |
255 | 0 | distance = 0; |
256 | 0 | } |
257 | 0 | if ((distance != 0) && (distance < MAX_DISTANCE)) { /* rows pair match */ |
258 | 0 | int k, m, l = -1; |
259 | 0 | for (k = 1; k < 4; k++) { |
260 | 0 | if (k != j) { |
261 | 0 | if (l == -1) { |
262 | 0 | l = k; |
263 | 0 | } else { |
264 | 0 | m = k; |
265 | 0 | } |
266 | 0 | } |
267 | 0 | } |
268 | 0 | memcpy(buf_pair, &buf_cell[l * 4], 4); |
269 | 0 | memcpy(&buf_pair[4], &buf_cell[m * 4], 4); |
270 | 0 | hval = XXH32(buf_pair, 8, 1); // calculate rows pair hash |
271 | 0 | hval >>= 32U - 12U; |
272 | 0 | ref = obase + tab_pair[hval]; |
273 | 0 | same = true; |
274 | 0 | if (tab_pair[hval] != 0) { |
275 | 0 | buf_aux = obase + tab_pair[hval]; |
276 | 0 | for (k = 0; k < 8; k++) { |
277 | 0 | if (buf_pair[k] != buf_aux[k]) { |
278 | 0 | same = false; |
279 | 0 | break; |
280 | 0 | } |
281 | 0 | } |
282 | 0 | } else { |
283 | 0 | same = false; |
284 | 0 | } |
285 | 0 | if (same) { |
286 | 0 | distance = (int32_t) (anchor + l * 4 - ref); |
287 | 0 | } else { |
288 | 0 | distance = 0; |
289 | 0 | } |
290 | 0 | if ((distance != 0) && (distance < MAX_DISTANCE)) { /* 2 pair matches */ |
291 | 0 | literal = false; |
292 | 0 | token = (uint8_t) ((1U << 5U) | (j << 3U)); |
293 | 0 | *op++ = token; |
294 | 0 | uint16_t offset_2 = (uint16_t) (anchor - obase - tab_pair[hval]); |
295 | 0 | *(uint16_t *) op = offset; |
296 | 0 | op += sizeof(offset); |
297 | 0 | *(uint16_t *) op = offset_2; |
298 | 0 | op += sizeof(offset_2); |
299 | 0 | goto match; |
300 | 0 | } |
301 | 0 | } |
302 | 0 | } |
303 | | |
304 | | // rows triples |
305 | 0 | for (int i = 0; i < 2; i++) { |
306 | 0 | memcpy(buf_triple, &buf_cell[i * 4], 4); |
307 | 0 | for (int j = i + 1; j < 3; j++) { |
308 | 0 | memcpy(&buf_triple[4], &buf_cell[j * 4], 4); |
309 | 0 | for (int k = j + 1; k < 4; k++) { |
310 | 0 | memcpy(&buf_triple[8], &buf_cell[k * 4], 4); |
311 | 0 | hval = XXH32(buf_triple, 12, 1); // calculate triple hash |
312 | 0 | hval >>= 32U - 12U; |
313 | | /* calculate distance to the match */ |
314 | 0 | bool same = true; |
315 | 0 | uint16_t offset; |
316 | 0 | if (tab_triple[hval] != 0) { |
317 | 0 | buf_aux = obase + tab_triple[hval]; |
318 | 0 | for (int l = 0; l < 12; l++) { |
319 | 0 | if (buf_triple[l] != buf_aux[l]) { |
320 | 0 | same = false; |
321 | 0 | break; |
322 | 0 | } |
323 | 0 | } |
324 | 0 | offset = (uint16_t) (anchor - obase - tab_triple[hval]); |
325 | 0 | } else { |
326 | 0 | same = false; |
327 | 0 | if ((j - i == 1) && (k - j == 1)) { |
328 | 0 | update_triple[i] = (uint32_t) (anchor + 1 + i * 4 - obase); /* update hash table */ |
329 | 0 | hash_triple[i] = hval; |
330 | 0 | } |
331 | 0 | } |
332 | 0 | ref = obase + tab_triple[hval]; |
333 | |
|
334 | 0 | if (same) { |
335 | 0 | distance = (int32_t) (anchor + i * 4 - ref); |
336 | 0 | } else { |
337 | 0 | distance = 0; |
338 | 0 | } |
339 | 0 | if ((distance != 0) && (distance < MAX_DISTANCE)) { |
340 | 0 | literal = false; |
341 | 0 | if (i == 1) { |
342 | 0 | token = (uint8_t) (7U << 5U); |
343 | 0 | } else { |
344 | 0 | token = (uint8_t) ((7U << 5U) | ((j + k - 2) << 3U)); |
345 | 0 | } |
346 | 0 | *op++ = token; |
347 | 0 | memcpy(op, &offset, 2); |
348 | 0 | op += 2; |
349 | 0 | for (int l = 0; l < 4; l++) { |
350 | 0 | if ((l != i) && (l != j) && (l != k)) { |
351 | 0 | memcpy(op, &buf_cell[4 * l], 4); |
352 | 0 | op += 4; |
353 | 0 | goto match; |
354 | 0 | } |
355 | 0 | } |
356 | 0 | } |
357 | 0 | } |
358 | 0 | } |
359 | 0 | } |
360 | | |
361 | | // rows pairs |
362 | 0 | for (int i = 0; i < 3; i++) { |
363 | 0 | memcpy(buf_pair, &buf_cell[i * 4], 4); |
364 | 0 | for (int j = i + 1; j < 4; j++) { |
365 | 0 | memcpy(&buf_pair[4], &buf_cell[j * 4], 4); |
366 | 0 | hval = XXH32(buf_pair, 8, 1); // calculate rows pair hash |
367 | 0 | hval >>= 32U - 12U; |
368 | 0 | ref = obase + tab_pair[hval]; |
369 | | /* calculate distance to the match */ |
370 | 0 | bool same = true; |
371 | 0 | uint16_t offset; |
372 | 0 | if (tab_pair[hval] != 0) { |
373 | 0 | buf_aux = obase + tab_pair[hval]; |
374 | 0 | for (int k = 0; k < 8; k++) { |
375 | 0 | if (buf_pair[k] != buf_aux[k]) { |
376 | 0 | same = false; |
377 | 0 | break; |
378 | 0 | } |
379 | 0 | } |
380 | 0 | offset = (uint16_t) (anchor - obase - tab_pair[hval]); |
381 | 0 | } else { |
382 | 0 | same = false; |
383 | 0 | if (j - i == 1) { |
384 | 0 | update_pair[i] = (uint32_t) (anchor + 1 + i * 4 - obase); /* update hash table */ |
385 | 0 | hash_pair[i] = hval; |
386 | 0 | } |
387 | 0 | } |
388 | 0 | if (same) { |
389 | 0 | distance = (int32_t) (anchor + i * 4 - ref); |
390 | 0 | } else { |
391 | 0 | distance = 0; |
392 | 0 | } |
393 | 0 | if ((distance != 0) && (distance < MAX_DISTANCE)) { /* rows pair match */ |
394 | 0 | literal = false; |
395 | 0 | if (i == 2) { |
396 | 0 | token = (uint8_t) (1U << 7U); |
397 | 0 | } else { |
398 | 0 | token = (uint8_t) ((1U << 7U) | (i << 5U) | (j << 3U)); |
399 | 0 | } |
400 | 0 | *op++ = token; |
401 | 0 | memcpy(op, &offset, 2); |
402 | 0 | op += 2; |
403 | 0 | for (int k = 0; k < 4; k++) { |
404 | 0 | if ((k != i) && (k != j)) { |
405 | 0 | memcpy(op, &buf_cell[4 * k], 4); |
406 | 0 | op += 4; |
407 | 0 | } |
408 | 0 | } |
409 | 0 | goto match; |
410 | 0 | } |
411 | 0 | } |
412 | 0 | } |
413 | | |
414 | 0 | match: |
415 | 0 | if (literal) { |
416 | 0 | tab_cell[hash_cell] = (uint32_t) (anchor + 1 - obase); /* update hash tables */ |
417 | 0 | if (update_triple[0] != 0) { |
418 | 0 | for (int h = 0; h < 2; h++) { |
419 | 0 | tab_triple[hash_triple[h]] = update_triple[h]; |
420 | 0 | } |
421 | 0 | } |
422 | 0 | if (update_pair[0] != 0) { |
423 | 0 | for (int h = 0; h < 3; h++) { |
424 | 0 | tab_pair[hash_pair[h]] = update_pair[h]; |
425 | 0 | } |
426 | 0 | } |
427 | 0 | token = 0; |
428 | 0 | *op++ = token; |
429 | 0 | memcpy(op, buf_cell, 16); |
430 | 0 | op += 16; |
431 | 0 | } |
432 | |
|
433 | 0 | } else { // cell match |
434 | 0 | token = (uint8_t) ((1U << 7U) | (1U << 6U)); |
435 | 0 | *op++ = token; |
436 | 0 | uint16_t offset = (uint16_t) (anchor - obase - tab_cell[hash_cell]); |
437 | 0 | memcpy(op, &offset, 2); |
438 | 0 | op += 2; |
439 | 0 | } |
440 | |
|
441 | 0 | } |
442 | 0 | if ((op - obase) > input_len) { |
443 | 0 | BLOSC_TRACE_ERROR("Compressed data is bigger than input!"); |
444 | 0 | return 0; |
445 | 0 | } |
446 | 0 | } |
447 | 0 | } |
448 | | |
449 | 0 | free(shape); |
450 | 0 | free(chunkshape); |
451 | 0 | free(blockshape); |
452 | |
|
453 | 0 | return (int) (op - obase); |
454 | 0 | } |
455 | | |
456 | | |
457 | | // See https://habr.com/en/company/yandex/blog/457612/ |
458 | | #ifdef __AVX2__ |
459 | | |
460 | | #if defined(_MSC_VER) |
461 | | #define ALIGNED_(x) __declspec(align(x)) |
462 | | #else |
463 | | #if defined(__GNUC__) |
464 | | #define ALIGNED_(x) __attribute__ ((aligned(x))) |
465 | | #endif |
466 | | #endif |
467 | | #define ALIGNED_TYPE_(t, x) t ALIGNED_(x) |
468 | | |
469 | | static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len) |
470 | | { |
471 | | size_t offset = op - match; |
472 | | while (len >= 16) { |
473 | | |
474 | | static const ALIGNED_TYPE_(uint8_t, 16) masks[] = |
475 | | { |
476 | | 0, 1, 2, 1, 4, 1, 4, 2, 8, 7, 6, 5, 4, 3, 2, 1, // offset = 0, not used as mask, but for shift |
477 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // offset = 1 |
478 | | 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, |
479 | | 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, |
480 | | 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, |
481 | | 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, |
482 | | 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, |
483 | | 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, |
484 | | 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, |
485 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, |
486 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, |
487 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, |
488 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, |
489 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, |
490 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, |
491 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, |
492 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // offset = 16 |
493 | | }; |
494 | | |
495 | | _mm_storeu_si128((__m128i *)(op), |
496 | | _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)), |
497 | | _mm_load_si128((const __m128i *)(masks) + offset))); |
498 | | |
499 | | match += masks[offset]; |
500 | | |
501 | | op += 16; |
502 | | len -= 16; |
503 | | } |
504 | | // Deal with remainders |
505 | | for (; len > 0; len--) { |
506 | | *op++ = *match++; |
507 | | } |
508 | | return op; |
509 | | } |
510 | | #endif |
511 | | |
512 | | |
513 | | int ndlz4_decompress(const uint8_t *input, int32_t input_len, uint8_t *output, int32_t output_len, |
514 | 608 | uint8_t meta, blosc2_dparams *dparams) { |
515 | 608 | BLOSC_UNUSED_PARAM(meta); |
516 | 608 | BLOSC_UNUSED_PARAM(dparams); |
517 | 608 | BLOSC_ERROR_NULL(input, BLOSC2_ERROR_NULL_POINTER); |
518 | 608 | BLOSC_ERROR_NULL(output, BLOSC2_ERROR_NULL_POINTER); |
519 | | |
520 | 608 | uint8_t *ip = (uint8_t *) input; |
521 | 608 | uint8_t *ip_limit = ip + input_len; |
522 | 608 | uint8_t *op = (uint8_t *) output; |
523 | 608 | uint8_t ndim; |
524 | 608 | int32_t blockshape[2]; |
525 | 608 | int32_t eshape[2]; |
526 | 608 | uint8_t *buffercpy; |
527 | 608 | uint8_t local_buffer[16]; |
528 | 608 | uint8_t token; |
529 | 608 | if (NDLZ_UNEXPECT_CONDITIONAL(input_len < 8)) { |
530 | 2 | return 0; |
531 | 2 | } |
532 | | |
533 | | /* we start with literal copy */ |
534 | 606 | ndim = *ip; |
535 | 606 | ip++; |
536 | 606 | if (ndim != 2) { |
537 | 0 | BLOSC_TRACE_ERROR("This codec only works for ndim = 2"); |
538 | 0 | return BLOSC2_ERROR_FAILURE; |
539 | 0 | } |
540 | 606 | memcpy(&blockshape[0], ip, 4); |
541 | 606 | ip += 4; |
542 | 606 | memcpy(&blockshape[1], ip, 4); |
543 | 606 | ip += 4; |
544 | | |
545 | | // Sanity check. See https://www.cve.org/CVERecord?id=CVE-2024-3204 |
546 | 606 | if (output_len < 0 || blockshape[0] < 0 || blockshape[1] < 0) { |
547 | 0 | BLOSC_TRACE_ERROR("Output length or blockshape is negative"); |
548 | 0 | return BLOSC2_ERROR_FAILURE; |
549 | 0 | } |
550 | | |
551 | 606 | eshape[0] = ((blockshape[0] + 3) / 4) * 4; |
552 | 606 | eshape[1] = ((blockshape[1] + 3) / 4) * 4; |
553 | | |
554 | 606 | if (NDLZ_UNEXPECT_CONDITIONAL((int64_t)output_len < (int64_t)blockshape[0] * (int64_t)blockshape[1])) { |
555 | 10 | BLOSC_TRACE_ERROR("The blockshape is bigger than the output buffer"); |
556 | 10 | return 0; |
557 | 10 | } |
558 | 596 | memset(op, 0, blockshape[0] * blockshape[1]); |
559 | | |
560 | 596 | uint32_t i_stop[2]; |
561 | 1.78k | for (int i = 0; i < 2; ++i) { |
562 | 1.19k | i_stop[i] = eshape[i] / 4; |
563 | 1.19k | } |
564 | | |
565 | | /* main loop */ |
566 | 596 | uint32_t ii[2]; |
567 | 596 | uint32_t padding[2] = {0}; |
568 | 596 | uint32_t ind = 0; |
569 | 596 | uint8_t cell_aux[16]; |
570 | 1.46k | for (ii[0] = 0; ii[0] < i_stop[0]; ++ii[0]) { |
571 | 2.04k | for (ii[1] = 0; ii[1] < i_stop[1]; ++ii[1]) { // for each cell |
572 | 1.18k | if (NDLZ_UNEXPECT_CONDITIONAL(ip > ip_limit)) { |
573 | 1 | BLOSC_TRACE_ERROR("Exceeding input length"); |
574 | 1 | return BLOSC2_ERROR_FAILURE; |
575 | 1 | } |
576 | 1.17k | if (ii[0] == i_stop[0] - 1) { |
577 | 880 | padding[0] = (blockshape[0] % 4 == 0) ? 4 : blockshape[0] % 4; |
578 | 880 | } else { |
579 | 299 | padding[0] = 4; |
580 | 299 | } |
581 | 1.17k | if (ii[1] == i_stop[1] - 1) { |
582 | 870 | padding[1] = (blockshape[1] % 4 == 0) ? 4 : blockshape[1] % 4; |
583 | 870 | } else { |
584 | 309 | padding[1] = 4; |
585 | 309 | } |
586 | 1.17k | token = *ip++; |
587 | 1.17k | if (token == 0) { // no match |
588 | 172 | buffercpy = ip; |
589 | 172 | ip += padding[0] * padding[1]; |
590 | 1.00k | } else if (token == (uint8_t) ((1U << 7U) | (1U << 6U))) { // cell match |
591 | 34 | uint16_t offset = *((uint16_t *) ip); |
592 | 34 | buffercpy = ip - offset - 1; |
593 | 34 | ip += 2; |
594 | 973 | } else if (token == (uint8_t) (1U << 6U)) { // whole cell of same element |
595 | 69 | buffercpy = cell_aux; |
596 | 69 | memset(buffercpy, *ip, 16); |
597 | 69 | ip++; |
598 | 904 | } else if (token >= 224) { // three rows match |
599 | 270 | buffercpy = local_buffer; |
600 | 270 | uint16_t offset = *((uint16_t *) ip); |
601 | 270 | offset += 3; |
602 | 270 | ip += 2; |
603 | 270 | int i, j, k; |
604 | 270 | if ((token >> 3U) == 28) { |
605 | 40 | i = 1; |
606 | 40 | j = 2; |
607 | 40 | k = 3; |
608 | 230 | } else { |
609 | 230 | i = 0; |
610 | 230 | if ((token >> 3U) < 30) { |
611 | 42 | j = 1; |
612 | 42 | k = 2; |
613 | 188 | } else { |
614 | 188 | k = 3; |
615 | 188 | if ((token >> 3U) == 30) { |
616 | 121 | j = 1; |
617 | 121 | } else { |
618 | 67 | j = 2; |
619 | 67 | } |
620 | 188 | } |
621 | 230 | } |
622 | 270 | memcpy(&buffercpy[i * 4], ip - offset, 4); |
623 | 270 | memcpy(&buffercpy[j * 4], ip - offset + 4, 4); |
624 | 270 | memcpy(&buffercpy[k * 4], ip - offset + 8, 4); |
625 | 705 | for (int l = 0; l < 4; l++) { |
626 | 705 | if ((l != i) && (l != j) && (l != k)) { |
627 | 270 | memcpy(&buffercpy[l * 4], ip, 4); |
628 | 270 | ip += 4; |
629 | 270 | break; |
630 | 270 | } |
631 | 705 | } |
632 | | |
633 | 634 | } else if ((token >= 128) && (token <= 191)) { // rows pair match |
634 | 482 | buffercpy = local_buffer; |
635 | 482 | uint16_t offset = *((uint16_t *) ip); |
636 | 482 | offset += 3; |
637 | 482 | ip += 2; |
638 | 482 | int i, j; |
639 | 482 | if (token == 128) { |
640 | 95 | i = 2; |
641 | 95 | j = 3; |
642 | 387 | } else { |
643 | 387 | i = (token - 128) >> 5U; |
644 | 387 | j = ((token - 128) >> 3U) - (i << 2U); |
645 | 387 | } |
646 | 482 | memcpy(&buffercpy[i * 4], ip - offset, 4); |
647 | 482 | memcpy(&buffercpy[j * 4], ip - offset + 4, 4); |
648 | 2.41k | for (int k = 0; k < 4; k++) { |
649 | 1.92k | if ((k != i) && (k != j)) { |
650 | 998 | memcpy(&buffercpy[k * 4], ip, 4); |
651 | 998 | ip += 4; |
652 | 998 | } |
653 | 1.92k | } |
654 | 482 | } else if ((token >= 40) && (token <= 63)) { // 2 rows pair matches |
655 | 147 | buffercpy = local_buffer; |
656 | 147 | uint16_t offset_1 = *((uint16_t *) ip); |
657 | 147 | offset_1 += 5; |
658 | 147 | ip += 2; |
659 | 147 | uint16_t offset_2 = *((uint16_t *) ip); |
660 | 147 | offset_2 += 5; |
661 | 147 | ip += 2; |
662 | 147 | int i, j, k, l, m; |
663 | 147 | i = 0; |
664 | 147 | j = ((token - 32) >> 3U); |
665 | 147 | l = -1; |
666 | 588 | for (k = 1; k < 4; k++) { |
667 | 441 | if ((k != i) && (k != j)) { |
668 | 294 | if (l == -1) { |
669 | 147 | l = k; |
670 | 147 | } else { |
671 | 147 | m = k; |
672 | 147 | } |
673 | 294 | } |
674 | 441 | } |
675 | 147 | memcpy(&buffercpy[i * 4], ip - offset_1, 4); |
676 | 147 | memcpy(&buffercpy[j * 4], ip - offset_1 + 4, 4); |
677 | 147 | memcpy(&buffercpy[l * 4], ip - offset_2, 4); |
678 | 147 | memcpy(&buffercpy[m * 4], ip - offset_2 + 4, 4); |
679 | | |
680 | 147 | } else { |
681 | 5 | BLOSC_TRACE_ERROR("Invalid token: %u at cell [%d, %d]\n", token, ii[0], ii[1]); |
682 | 5 | return BLOSC2_ERROR_FAILURE; |
683 | 5 | } |
684 | | // fill op with buffercpy |
685 | 1.17k | uint32_t orig = ii[0] * 4 * blockshape[1] + ii[1] * 4; |
686 | 5.87k | for (uint32_t i = 0; i < 4; i++) { |
687 | 4.69k | if (i < padding[0]) { |
688 | 2.65k | ind = orig + i * blockshape[1]; |
689 | 2.65k | memcpy(&op[ind], buffercpy, padding[1]); |
690 | 2.65k | } |
691 | 4.69k | buffercpy += padding[1]; |
692 | 4.69k | } |
693 | 1.17k | if (ind > (uint32_t) output_len) { |
694 | 0 | BLOSC_TRACE_ERROR("Exceeding output size"); |
695 | 0 | return BLOSC2_ERROR_FAILURE; |
696 | 0 | } |
697 | 1.17k | } |
698 | 873 | } |
699 | 590 | ind += padding[1]; |
700 | | |
701 | 590 | if ((int32_t)ind != (blockshape[0] * blockshape[1])) { |
702 | 0 | BLOSC_TRACE_ERROR("Output size is not compatible with embedded blockshape"); |
703 | 0 | return BLOSC2_ERROR_FAILURE; |
704 | 0 | } |
705 | 590 | if (ind > (uint32_t) output_len) { |
706 | 0 | BLOSC_TRACE_ERROR("Exceeding output size"); |
707 | 0 | return BLOSC2_ERROR_FAILURE; |
708 | 0 | } |
709 | | |
710 | 590 | return (int) ind; |
711 | 590 | } |