/src/c-blosc2/plugins/codecs/ndlz/ndlz4x4.c
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************* |
2 | | Blosc - Blocked Shuffling and Compression Library |
3 | | |
4 | | Copyright (c) 2021 Blosc Development Team <blosc@blosc.org> |
5 | | https://blosc.org |
6 | | License: BSD 3-Clause (see LICENSE.txt) |
7 | | |
8 | | See LICENSE.txt for details about copyright and rights to use. |
9 | | **********************************************************************/ |
10 | | |
11 | | /********************************************************************* |
12 | | This codec is meant to leverage multidimensionality for getting |
13 | | better compression ratios. The idea is to look for similarities |
14 | | in places that are closer in a euclidean metric, not the typical |
15 | | linear one. |
16 | | **********************************************************************/ |
17 | | |
18 | | #include "ndlz4x4.h" |
19 | | #include "ndlz.h" |
20 | | #include "xxhash.h" |
21 | | #include "../plugins/plugin_utils.h" |
22 | | |
23 | | #include <stdlib.h> |
24 | | #include <string.h> |
25 | | |
26 | | /* |
27 | | * Give hints to the compiler for branch prediction optimization. |
28 | | */ |
29 | | #if defined(__GNUC__) && (__GNUC__ > 2) |
30 | | #define NDLZ_EXPECT_CONDITIONAL(c) (__builtin_expect((c), 1)) |
31 | 3.72k | #define NDLZ_UNEXPECT_CONDITIONAL(c) (__builtin_expect((c), 0)) |
32 | | #else |
33 | | #define NDLZ_EXPECT_CONDITIONAL(c) (c) |
34 | | #define NDLZ_UNEXPECT_CONDITIONAL(c) (c) |
35 | | #endif |
36 | | |
37 | | /* |
38 | | * Use inlined functions for supported systems. |
39 | | */ |
40 | | #if defined(_MSC_VER) && !defined(__cplusplus) /* Visual Studio */ |
41 | | #define inline __inline /* Visual C is not C99, but supports some kind of inline */ |
42 | | #endif |
43 | | |
44 | | #define MAX_COPY 32U |
45 | 0 | #define MAX_DISTANCE 65535 |
46 | | |
47 | | |
48 | | #ifdef BLOSC_STRICT_ALIGN |
49 | | #define NDLZ_READU16(p) ((p)[0] | (p)[1]<<8) |
50 | | #define NDLZ_READU32(p) ((p)[0] | (p)[1]<<8 | (p)[2]<<16 | (p)[3]<<24) |
51 | | #else |
52 | | #define NDLZ_READU16(p) *((const uint16_t*)(p)) |
53 | | #define NDLZ_READU32(p) *((const uint32_t*)(p)) |
54 | | #endif |
55 | | |
56 | | #define HASH_LOG (12) |
57 | | |
58 | | |
59 | | int ndlz4_compress(const uint8_t *input, int32_t input_len, uint8_t *output, int32_t output_len, |
60 | 0 | uint8_t meta, blosc2_cparams *cparams) { |
61 | 0 | BLOSC_UNUSED_PARAM(meta); |
62 | 0 | BLOSC_ERROR_NULL(cparams, BLOSC2_ERROR_NULL_POINTER); |
63 | 0 | BLOSC_ERROR_NULL(cparams->schunk, BLOSC2_ERROR_NULL_POINTER); |
64 | 0 | uint8_t *smeta; |
65 | 0 | int32_t smeta_len; |
66 | |
|
67 | 0 | if (blosc2_meta_get(cparams->schunk, "b2nd", &smeta, &smeta_len) < 0) { |
68 | 0 | BLOSC_TRACE_ERROR("b2nd layer not found!"); |
69 | 0 | return BLOSC2_ERROR_FAILURE; |
70 | 0 | } |
71 | | |
72 | 0 | int8_t ndim; |
73 | 0 | int64_t *shape = malloc(8 * sizeof(int64_t)); |
74 | 0 | int32_t *chunkshape = malloc(8 * sizeof(int32_t)); |
75 | 0 | int32_t *blockshape = malloc(8 * sizeof(int32_t)); |
76 | 0 | deserialize_meta(smeta, smeta_len, &ndim, shape, chunkshape, blockshape); |
77 | 0 | free(smeta); |
78 | |
|
79 | 0 | if (ndim != 2) { |
80 | 0 | BLOSC_TRACE_ERROR("This codec only works for ndim = 2"); |
81 | 0 | return BLOSC2_ERROR_FAILURE; |
82 | 0 | } |
83 | | |
84 | 0 | if (input_len != (blockshape[0] * blockshape[1])) { |
85 | 0 | BLOSC_TRACE_ERROR("Length not equal to blocksize"); |
86 | 0 | return BLOSC2_ERROR_FAILURE; |
87 | 0 | } |
88 | | |
89 | 0 | if (NDLZ_UNEXPECT_CONDITIONAL(output_len < (int) (1 + ndim * sizeof(int32_t)))) { |
90 | 0 | BLOSC_TRACE_ERROR("Output too small"); |
91 | 0 | return BLOSC2_ERROR_FAILURE; |
92 | 0 | } |
93 | | |
94 | 0 | uint8_t *ip = (uint8_t *) input; |
95 | 0 | uint8_t *op = (uint8_t *) output; |
96 | 0 | uint8_t *op_limit; |
97 | 0 | uint32_t hval, hash_cell; |
98 | 0 | uint32_t hash_triple[2] = {0}; |
99 | 0 | uint32_t hash_pair[3] = {0}; |
100 | 0 | uint8_t bufarea[16]; |
101 | 0 | uint8_t *buf_cell = bufarea; |
102 | 0 | uint8_t buf_triple[12]; |
103 | 0 | uint8_t buf_pair[8]; |
104 | 0 | uint8_t *buf_aux; |
105 | 0 | uint32_t tab_cell[1U << 12U] = {0}; |
106 | 0 | uint32_t tab_triple[1U << 12U] = {0}; |
107 | 0 | uint32_t tab_pair[1U << 12U] = {0}; |
108 | 0 | uint32_t update_triple[2] = {0}; |
109 | 0 | uint32_t update_pair[3] = {0}; |
110 | | |
111 | | // Minimum cratios before issuing and _early giveup_ |
112 | | // Remind that ndlz is not meant for cratios <= 2 (too costly to decompress) |
113 | |
|
114 | 0 | op_limit = op + output_len; |
115 | | |
116 | | // Initialize the hash table to distances of 0 |
117 | 0 | for (unsigned i = 0; i < (1U << 12U); i++) { |
118 | 0 | tab_cell[i] = 0; |
119 | 0 | } |
120 | | |
121 | | /* input and output buffer cannot be less than 16 and 66 bytes or we can get into trouble */ |
122 | 0 | int overhead = 17 + (blockshape[0] * blockshape[1] / 16 - 1) * 2; |
123 | 0 | if (input_len < 16 || output_len < overhead) { |
124 | 0 | BLOSC_TRACE_ERROR("Incorrect length or maxout"); |
125 | 0 | return 0; |
126 | 0 | } |
127 | | |
128 | 0 | uint8_t *obase = op; |
129 | | |
130 | | /* we start with literal copy */ |
131 | 0 | *op++ = ndim; |
132 | 0 | memcpy(op, &blockshape[0], 4); |
133 | 0 | op += 4; |
134 | 0 | memcpy(op, &blockshape[1], 4); |
135 | 0 | op += 4; |
136 | |
|
137 | 0 | uint32_t i_stop[2]; |
138 | 0 | for (int i = 0; i < 2; ++i) { |
139 | 0 | i_stop[i] = (blockshape[i] + 3) / 4; |
140 | 0 | } |
141 | | |
142 | | /* main loop */ |
143 | 0 | uint32_t padding[2]; |
144 | 0 | uint32_t ii[2]; |
145 | 0 | for (ii[0] = 0; ii[0] < i_stop[0]; ++ii[0]) { |
146 | 0 | for (ii[1] = 0; ii[1] < i_stop[1]; ++ii[1]) { // for each cell |
147 | 0 | uint8_t token; |
148 | 0 | for (int h = 0; h < 2; h++) { // new cell -> new possible references |
149 | 0 | update_triple[h] = 0; |
150 | 0 | update_pair[h] = 0; |
151 | 0 | } |
152 | 0 | update_pair[2] = 0; |
153 | |
|
154 | 0 | if (NDLZ_UNEXPECT_CONDITIONAL(op + 16 + 1 > op_limit)) { |
155 | 0 | free(shape); |
156 | 0 | free(chunkshape); |
157 | 0 | free(blockshape); |
158 | 0 | return 0; |
159 | 0 | } |
160 | | |
161 | 0 | uint32_t orig = ii[0] * 4 * blockshape[1] + ii[1] * 4; |
162 | 0 | if (((blockshape[0] % 4 != 0) && (ii[0] == i_stop[0] - 1)) || |
163 | 0 | ((blockshape[1] % 4 != 0) && (ii[1] == i_stop[1] - 1))) { |
164 | 0 | token = 0; // padding -> literal copy |
165 | 0 | *op++ = token; |
166 | 0 | if (ii[0] == i_stop[0] - 1) { |
167 | 0 | padding[0] = (blockshape[0] % 4 == 0) ? 4 : blockshape[0] % 4; |
168 | 0 | } else { |
169 | 0 | padding[0] = 4; |
170 | 0 | } |
171 | 0 | if (ii[1] == i_stop[1] - 1) { |
172 | 0 | padding[1] = (blockshape[1] % 4 == 0) ? 4 : blockshape[1] % 4; |
173 | 0 | } else { |
174 | 0 | padding[1] = 4; |
175 | 0 | } |
176 | 0 | for (uint32_t i = 0; i < padding[0]; i++) { |
177 | 0 | memcpy(op, &ip[orig + i * blockshape[1]], padding[1]); |
178 | 0 | op += padding[1]; |
179 | 0 | } |
180 | 0 | } else { |
181 | 0 | for (uint64_t i = 0; i < 4; i++) { // fill cell buffer |
182 | 0 | uint64_t ind = orig + i * blockshape[1]; |
183 | 0 | memcpy(buf_cell, &ip[ind], 4); |
184 | 0 | buf_cell += 4; |
185 | 0 | } |
186 | 0 | buf_cell -= 16; |
187 | |
|
188 | 0 | const uint8_t *ref; |
189 | 0 | uint32_t distance; |
190 | 0 | uint8_t *anchor = op; /* comparison starting-point */ |
191 | | |
192 | | /* find potential match */ |
193 | 0 | hash_cell = XXH32(buf_cell, 16, 1); // calculate cell hash |
194 | 0 | hash_cell >>= 32U - 12U; |
195 | 0 | ref = obase + tab_cell[hash_cell]; |
196 | | |
197 | | /* calculate distance to the match */ |
198 | 0 | if (tab_cell[hash_cell] == 0) { |
199 | 0 | distance = 0; |
200 | 0 | } else { |
201 | 0 | bool same = true; |
202 | 0 | buf_aux = obase + tab_cell[hash_cell]; |
203 | 0 | for (int i = 0; i < 16; i++) { |
204 | 0 | if (buf_cell[i] != buf_aux[i]) { |
205 | 0 | same = false; |
206 | 0 | break; |
207 | 0 | } |
208 | 0 | } |
209 | 0 | if (same) { |
210 | 0 | distance = (int32_t) (anchor - ref); |
211 | 0 | } else { |
212 | 0 | distance = 0; |
213 | 0 | } |
214 | 0 | } |
215 | |
|
216 | 0 | bool alleq = true; |
217 | 0 | for (int i = 1; i < 16; i++) { |
218 | 0 | if (buf_cell[i] != buf_cell[0]) { |
219 | 0 | alleq = false; |
220 | 0 | break; |
221 | 0 | } |
222 | 0 | } |
223 | 0 | if (alleq) { // all elements of the cell equal |
224 | 0 | token = (uint8_t) (1U << 6U); |
225 | 0 | *op++ = token; |
226 | 0 | *op++ = buf_cell[0]; |
227 | |
|
228 | 0 | } else if (distance == 0 || (distance >= MAX_DISTANCE)) { // no cell match |
229 | 0 | bool literal = true; |
230 | | |
231 | | // 2 rows pairs matches |
232 | 0 | for (int j = 1; j < 4; j++) { |
233 | 0 | memcpy(buf_pair, buf_cell, 4); |
234 | 0 | memcpy(&buf_pair[4], &buf_cell[j * 4], 4); |
235 | 0 | hval = XXH32(buf_pair, 8, 1); // calculate rows pair hash |
236 | 0 | hval >>= 32U - 12U; |
237 | 0 | ref = obase + tab_pair[hval]; |
238 | | /* calculate distance to the match */ |
239 | 0 | bool same = true; |
240 | 0 | uint16_t offset; |
241 | 0 | if (tab_pair[hval] != 0) { |
242 | 0 | buf_aux = obase + tab_pair[hval]; |
243 | 0 | for (int k = 0; k < 8; k++) { |
244 | 0 | if (buf_pair[k] != buf_aux[k]) { |
245 | 0 | same = false; |
246 | 0 | break; |
247 | 0 | } |
248 | 0 | } |
249 | 0 | offset = (uint16_t) (anchor - obase - tab_pair[hval]); |
250 | 0 | } else { |
251 | 0 | same = false; |
252 | 0 | } |
253 | 0 | if (same) { |
254 | 0 | distance = (int32_t) (anchor - ref); |
255 | 0 | } else { |
256 | 0 | distance = 0; |
257 | 0 | } |
258 | 0 | if ((distance != 0) && (distance < MAX_DISTANCE)) { /* rows pair match */ |
259 | 0 | int k, m, l = -1; |
260 | 0 | for (k = 1; k < 4; k++) { |
261 | 0 | if (k != j) { |
262 | 0 | if (l == -1) { |
263 | 0 | l = k; |
264 | 0 | } else { |
265 | 0 | m = k; |
266 | 0 | } |
267 | 0 | } |
268 | 0 | } |
269 | 0 | memcpy(buf_pair, &buf_cell[l * 4], 4); |
270 | 0 | memcpy(&buf_pair[4], &buf_cell[m * 4], 4); |
271 | 0 | hval = XXH32(buf_pair, 8, 1); // calculate rows pair hash |
272 | 0 | hval >>= 32U - 12U; |
273 | 0 | ref = obase + tab_pair[hval]; |
274 | 0 | same = true; |
275 | 0 | if (tab_pair[hval] != 0) { |
276 | 0 | buf_aux = obase + tab_pair[hval]; |
277 | 0 | for (k = 0; k < 8; k++) { |
278 | 0 | if (buf_pair[k] != buf_aux[k]) { |
279 | 0 | same = false; |
280 | 0 | break; |
281 | 0 | } |
282 | 0 | } |
283 | 0 | } else { |
284 | 0 | same = false; |
285 | 0 | } |
286 | 0 | if (same) { |
287 | 0 | distance = (int32_t) (anchor + l * 4 - ref); |
288 | 0 | } else { |
289 | 0 | distance = 0; |
290 | 0 | } |
291 | 0 | if ((distance != 0) && (distance < MAX_DISTANCE)) { /* 2 pair matches */ |
292 | 0 | literal = false; |
293 | 0 | token = (uint8_t) ((1U << 5U) | (j << 3U)); |
294 | 0 | *op++ = token; |
295 | 0 | uint16_t offset_2 = (uint16_t) (anchor - obase - tab_pair[hval]); |
296 | 0 | *(uint16_t *) op = offset; |
297 | 0 | op += sizeof(offset); |
298 | 0 | *(uint16_t *) op = offset_2; |
299 | 0 | op += sizeof(offset_2); |
300 | 0 | goto match; |
301 | 0 | } |
302 | 0 | } |
303 | 0 | } |
304 | | |
305 | | // rows triples |
306 | 0 | for (int i = 0; i < 2; i++) { |
307 | 0 | memcpy(buf_triple, &buf_cell[i * 4], 4); |
308 | 0 | for (int j = i + 1; j < 3; j++) { |
309 | 0 | memcpy(&buf_triple[4], &buf_cell[j * 4], 4); |
310 | 0 | for (int k = j + 1; k < 4; k++) { |
311 | 0 | memcpy(&buf_triple[8], &buf_cell[k * 4], 4); |
312 | 0 | hval = XXH32(buf_triple, 12, 1); // calculate triple hash |
313 | 0 | hval >>= 32U - 12U; |
314 | | /* calculate distance to the match */ |
315 | 0 | bool same = true; |
316 | 0 | uint16_t offset; |
317 | 0 | if (tab_triple[hval] != 0) { |
318 | 0 | buf_aux = obase + tab_triple[hval]; |
319 | 0 | for (int l = 0; l < 12; l++) { |
320 | 0 | if (buf_triple[l] != buf_aux[l]) { |
321 | 0 | same = false; |
322 | 0 | break; |
323 | 0 | } |
324 | 0 | } |
325 | 0 | offset = (uint16_t) (anchor - obase - tab_triple[hval]); |
326 | 0 | } else { |
327 | 0 | same = false; |
328 | 0 | if ((j - i == 1) && (k - j == 1)) { |
329 | 0 | update_triple[i] = (uint32_t) (anchor + 1 + i * 4 - obase); /* update hash table */ |
330 | 0 | hash_triple[i] = hval; |
331 | 0 | } |
332 | 0 | } |
333 | 0 | ref = obase + tab_triple[hval]; |
334 | |
|
335 | 0 | if (same) { |
336 | 0 | distance = (int32_t) (anchor + i * 4 - ref); |
337 | 0 | } else { |
338 | 0 | distance = 0; |
339 | 0 | } |
340 | 0 | if ((distance != 0) && (distance < MAX_DISTANCE)) { |
341 | 0 | literal = false; |
342 | 0 | if (i == 1) { |
343 | 0 | token = (uint8_t) (7U << 5U); |
344 | 0 | } else { |
345 | 0 | token = (uint8_t) ((7U << 5U) | ((j + k - 2) << 3U)); |
346 | 0 | } |
347 | 0 | *op++ = token; |
348 | 0 | memcpy(op, &offset, 2); |
349 | 0 | op += 2; |
350 | 0 | for (int l = 0; l < 4; l++) { |
351 | 0 | if ((l != i) && (l != j) && (l != k)) { |
352 | 0 | memcpy(op, &buf_cell[4 * l], 4); |
353 | 0 | op += 4; |
354 | 0 | goto match; |
355 | 0 | } |
356 | 0 | } |
357 | 0 | } |
358 | 0 | } |
359 | 0 | } |
360 | 0 | } |
361 | | |
362 | | // rows pairs |
363 | 0 | for (int i = 0; i < 3; i++) { |
364 | 0 | memcpy(buf_pair, &buf_cell[i * 4], 4); |
365 | 0 | for (int j = i + 1; j < 4; j++) { |
366 | 0 | memcpy(&buf_pair[4], &buf_cell[j * 4], 4); |
367 | 0 | hval = XXH32(buf_pair, 8, 1); // calculate rows pair hash |
368 | 0 | hval >>= 32U - 12U; |
369 | 0 | ref = obase + tab_pair[hval]; |
370 | | /* calculate distance to the match */ |
371 | 0 | bool same = true; |
372 | 0 | uint16_t offset; |
373 | 0 | if (tab_pair[hval] != 0) { |
374 | 0 | buf_aux = obase + tab_pair[hval]; |
375 | 0 | for (int k = 0; k < 8; k++) { |
376 | 0 | if (buf_pair[k] != buf_aux[k]) { |
377 | 0 | same = false; |
378 | 0 | break; |
379 | 0 | } |
380 | 0 | } |
381 | 0 | offset = (uint16_t) (anchor - obase - tab_pair[hval]); |
382 | 0 | } else { |
383 | 0 | same = false; |
384 | 0 | if (j - i == 1) { |
385 | 0 | update_pair[i] = (uint32_t) (anchor + 1 + i * 4 - obase); /* update hash table */ |
386 | 0 | hash_pair[i] = hval; |
387 | 0 | } |
388 | 0 | } |
389 | 0 | if (same) { |
390 | 0 | distance = (int32_t) (anchor + i * 4 - ref); |
391 | 0 | } else { |
392 | 0 | distance = 0; |
393 | 0 | } |
394 | 0 | if ((distance != 0) && (distance < MAX_DISTANCE)) { /* rows pair match */ |
395 | 0 | literal = false; |
396 | 0 | if (i == 2) { |
397 | 0 | token = (uint8_t) (1U << 7U); |
398 | 0 | } else { |
399 | 0 | token = (uint8_t) ((1U << 7U) | (i << 5U) | (j << 3U)); |
400 | 0 | } |
401 | 0 | *op++ = token; |
402 | 0 | memcpy(op, &offset, 2); |
403 | 0 | op += 2; |
404 | 0 | for (int k = 0; k < 4; k++) { |
405 | 0 | if ((k != i) && (k != j)) { |
406 | 0 | memcpy(op, &buf_cell[4 * k], 4); |
407 | 0 | op += 4; |
408 | 0 | } |
409 | 0 | } |
410 | 0 | goto match; |
411 | 0 | } |
412 | 0 | } |
413 | 0 | } |
414 | | |
415 | 0 | match: |
416 | 0 | if (literal) { |
417 | 0 | tab_cell[hash_cell] = (uint32_t) (anchor + 1 - obase); /* update hash tables */ |
418 | 0 | if (update_triple[0] != 0) { |
419 | 0 | for (int h = 0; h < 2; h++) { |
420 | 0 | tab_triple[hash_triple[h]] = update_triple[h]; |
421 | 0 | } |
422 | 0 | } |
423 | 0 | if (update_pair[0] != 0) { |
424 | 0 | for (int h = 0; h < 3; h++) { |
425 | 0 | tab_pair[hash_pair[h]] = update_pair[h]; |
426 | 0 | } |
427 | 0 | } |
428 | 0 | token = 0; |
429 | 0 | *op++ = token; |
430 | 0 | memcpy(op, buf_cell, 16); |
431 | 0 | op += 16; |
432 | 0 | } |
433 | |
|
434 | 0 | } else { // cell match |
435 | 0 | token = (uint8_t) ((1U << 7U) | (1U << 6U)); |
436 | 0 | *op++ = token; |
437 | 0 | uint16_t offset = (uint16_t) (anchor - obase - tab_cell[hash_cell]); |
438 | 0 | memcpy(op, &offset, 2); |
439 | 0 | op += 2; |
440 | 0 | } |
441 | |
|
442 | 0 | } |
443 | 0 | if ((op - obase) > input_len) { |
444 | 0 | BLOSC_TRACE_ERROR("Compressed data is bigger than input!"); |
445 | 0 | return 0; |
446 | 0 | } |
447 | 0 | } |
448 | 0 | } |
449 | | |
450 | 0 | free(shape); |
451 | 0 | free(chunkshape); |
452 | 0 | free(blockshape); |
453 | |
|
454 | 0 | return (int) (op - obase); |
455 | 0 | } |
456 | | |
457 | | |
458 | | // See https://habr.com/en/company/yandex/blog/457612/ |
459 | | #ifdef __AVX2__ |
460 | | |
461 | | #if defined(_MSC_VER) |
462 | | #define ALIGNED_(x) __declspec(align(x)) |
463 | | #else |
464 | | #if defined(__GNUC__) |
465 | | #define ALIGNED_(x) __attribute__ ((aligned(x))) |
466 | | #endif |
467 | | #endif |
468 | | #define ALIGNED_TYPE_(t, x) t ALIGNED_(x) |
469 | | |
470 | | static unsigned char* copy_match_16(unsigned char *op, const unsigned char *match, int32_t len) |
471 | | { |
472 | | size_t offset = op - match; |
473 | | while (len >= 16) { |
474 | | |
475 | | static const ALIGNED_TYPE_(uint8_t, 16) masks[] = |
476 | | { |
477 | | 0, 1, 2, 1, 4, 1, 4, 2, 8, 7, 6, 5, 4, 3, 2, 1, // offset = 0, not used as mask, but for shift |
478 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // offset = 1 |
479 | | 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, |
480 | | 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, |
481 | | 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, |
482 | | 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, |
483 | | 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, |
484 | | 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, |
485 | | 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, |
486 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, |
487 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, |
488 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, |
489 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, |
490 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, |
491 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, |
492 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, |
493 | | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // offset = 16 |
494 | | }; |
495 | | |
496 | | _mm_storeu_si128((__m128i *)(op), |
497 | | _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(match)), |
498 | | _mm_load_si128((const __m128i *)(masks) + offset))); |
499 | | |
500 | | match += masks[offset]; |
501 | | |
502 | | op += 16; |
503 | | len -= 16; |
504 | | } |
505 | | // Deal with remainders |
506 | | for (; len > 0; len--) { |
507 | | *op++ = *match++; |
508 | | } |
509 | | return op; |
510 | | } |
511 | | #endif |
512 | | |
513 | | |
514 | | int ndlz4_decompress(const uint8_t *input, int32_t input_len, uint8_t *output, int32_t output_len, |
515 | 1.03k | uint8_t meta, blosc2_dparams *dparams) { |
516 | 1.03k | BLOSC_UNUSED_PARAM(meta); |
517 | 1.03k | BLOSC_UNUSED_PARAM(dparams); |
518 | 1.03k | BLOSC_ERROR_NULL(input, BLOSC2_ERROR_NULL_POINTER); |
519 | 1.03k | BLOSC_ERROR_NULL(output, BLOSC2_ERROR_NULL_POINTER); |
520 | | |
521 | 1.03k | uint8_t *ip = (uint8_t *) input; |
522 | 1.03k | uint8_t *ip_limit = ip + input_len; |
523 | 1.03k | uint8_t *op = (uint8_t *) output; |
524 | 1.03k | uint8_t ndim; |
525 | 1.03k | int32_t blockshape[2]; |
526 | 1.03k | int32_t eshape[2]; |
527 | 1.03k | uint8_t *buffercpy; |
528 | 1.03k | uint8_t local_buffer[16]; |
529 | 1.03k | uint8_t token; |
530 | 1.03k | if (NDLZ_UNEXPECT_CONDITIONAL(input_len < 8)) { |
531 | 6 | return 0; |
532 | 6 | } |
533 | | |
534 | | /* we start with literal copy */ |
535 | 1.02k | ndim = *ip; |
536 | 1.02k | ip++; |
537 | 1.02k | if (ndim != 2) { |
538 | 6 | BLOSC_TRACE_ERROR("This codec only works for ndim = 2"); |
539 | 0 | return BLOSC2_ERROR_FAILURE; |
540 | 6 | } |
541 | 1.02k | memcpy(&blockshape[0], ip, 4); |
542 | 1.02k | ip += 4; |
543 | 1.02k | memcpy(&blockshape[1], ip, 4); |
544 | 1.02k | ip += 4; |
545 | | |
546 | | // Sanity check. See https://www.cve.org/CVERecord?id=CVE-2024-3204 |
547 | 1.02k | if (output_len < 0 || blockshape[0] < 0 || blockshape[1] < 0) { |
548 | 64 | BLOSC_TRACE_ERROR("Output length or blockshape is negative"); |
549 | 0 | return BLOSC2_ERROR_FAILURE; |
550 | 64 | } |
551 | | |
552 | 956 | eshape[0] = ((blockshape[0] + 3) / 4) * 4; |
553 | 956 | eshape[1] = ((blockshape[1] + 3) / 4) * 4; |
554 | | |
555 | 956 | if (NDLZ_UNEXPECT_CONDITIONAL((int64_t)output_len < (int64_t)blockshape[0] * (int64_t)blockshape[1])) { |
556 | 105 | BLOSC_TRACE_ERROR("The blockshape is bigger than the output buffer"); |
557 | 0 | return 0; |
558 | 105 | } |
559 | 851 | memset(op, 0, blockshape[0] * blockshape[1]); |
560 | | |
561 | 851 | uint32_t i_stop[2]; |
562 | 2.55k | for (int i = 0; i < 2; ++i) { |
563 | 1.70k | i_stop[i] = eshape[i] / 4; |
564 | 1.70k | } |
565 | | |
566 | | /* main loop */ |
567 | 851 | uint32_t ii[2]; |
568 | 851 | uint32_t padding[2] = {0}; |
569 | 851 | uint32_t ind = 0; |
570 | 851 | uint8_t cell_aux[16]; |
571 | 8.95G | for (ii[0] = 0; ii[0] < i_stop[0]; ++ii[0]) { |
572 | 8.95G | for (ii[1] = 0; ii[1] < i_stop[1]; ++ii[1]) { // for each cell |
573 | 1.73k | if (NDLZ_UNEXPECT_CONDITIONAL(ip > ip_limit)) { |
574 | 5 | BLOSC_TRACE_ERROR("Exceeding input length"); |
575 | 0 | return BLOSC2_ERROR_FAILURE; |
576 | 5 | } |
577 | 1.73k | if (ii[0] == i_stop[0] - 1) { |
578 | 952 | padding[0] = (blockshape[0] % 4 == 0) ? 4 : blockshape[0] % 4; |
579 | 952 | } else { |
580 | 779 | padding[0] = 4; |
581 | 779 | } |
582 | 1.73k | if (ii[1] == i_stop[1] - 1) { |
583 | 1.57k | padding[1] = (blockshape[1] % 4 == 0) ? 4 : blockshape[1] % 4; |
584 | 1.57k | } else { |
585 | 155 | padding[1] = 4; |
586 | 155 | } |
587 | 1.73k | token = *ip++; |
588 | 1.73k | if (token == 0) { // no match |
589 | 316 | buffercpy = ip; |
590 | 316 | ip += padding[0] * padding[1]; |
591 | 1.41k | } else if (token == (uint8_t) ((1U << 7U) | (1U << 6U))) { // cell match |
592 | 73 | uint16_t offset = *((uint16_t *) ip); |
593 | 73 | buffercpy = ip - offset - 1; |
594 | 73 | ip += 2; |
595 | 1.34k | } else if (token == (uint8_t) (1U << 6U)) { // whole cell of same element |
596 | 138 | buffercpy = cell_aux; |
597 | 138 | memset(buffercpy, *ip, 16); |
598 | 138 | ip++; |
599 | 1.20k | } else if (token >= 224) { // three rows match |
600 | 392 | buffercpy = local_buffer; |
601 | 392 | uint16_t offset = *((uint16_t *) ip); |
602 | 392 | offset += 3; |
603 | 392 | ip += 2; |
604 | 392 | int i, j, k; |
605 | 392 | if ((token >> 3U) == 28) { |
606 | 84 | i = 1; |
607 | 84 | j = 2; |
608 | 84 | k = 3; |
609 | 308 | } else { |
610 | 308 | i = 0; |
611 | 308 | if ((token >> 3U) < 30) { |
612 | 83 | j = 1; |
613 | 83 | k = 2; |
614 | 225 | } else { |
615 | 225 | k = 3; |
616 | 225 | if ((token >> 3U) == 30) { |
617 | 121 | j = 1; |
618 | 121 | } else { |
619 | 104 | j = 2; |
620 | 104 | } |
621 | 225 | } |
622 | 308 | } |
623 | 392 | memcpy(&buffercpy[i * 4], ip - offset, 4); |
624 | 392 | memcpy(&buffercpy[j * 4], ip - offset + 4, 4); |
625 | 392 | memcpy(&buffercpy[k * 4], ip - offset + 8, 4); |
626 | 987 | for (int l = 0; l < 4; l++) { |
627 | 987 | if ((l != i) && (l != j) && (l != k)) { |
628 | 392 | memcpy(&buffercpy[l * 4], ip, 4); |
629 | 392 | ip += 4; |
630 | 392 | break; |
631 | 392 | } |
632 | 987 | } |
633 | | |
634 | 812 | } else if ((token >= 128) && (token <= 191)) { // rows pair match |
635 | 538 | buffercpy = local_buffer; |
636 | 538 | uint16_t offset = *((uint16_t *) ip); |
637 | 538 | offset += 3; |
638 | 538 | ip += 2; |
639 | 538 | int i, j; |
640 | 538 | if (token == 128) { |
641 | 100 | i = 2; |
642 | 100 | j = 3; |
643 | 438 | } else { |
644 | 438 | i = (token - 128) >> 5U; |
645 | 438 | j = ((token - 128) >> 3U) - (i << 2U); |
646 | 438 | } |
647 | 538 | memcpy(&buffercpy[i * 4], ip - offset, 4); |
648 | 538 | memcpy(&buffercpy[j * 4], ip - offset + 4, 4); |
649 | 2.69k | for (int k = 0; k < 4; k++) { |
650 | 2.15k | if ((k != i) && (k != j)) { |
651 | 1.13k | memcpy(&buffercpy[k * 4], ip, 4); |
652 | 1.13k | ip += 4; |
653 | 1.13k | } |
654 | 2.15k | } |
655 | 538 | } else if ((token >= 40) && (token <= 63)) { // 2 rows pair matches |
656 | 251 | buffercpy = local_buffer; |
657 | 251 | uint16_t offset_1 = *((uint16_t *) ip); |
658 | 251 | offset_1 += 5; |
659 | 251 | ip += 2; |
660 | 251 | uint16_t offset_2 = *((uint16_t *) ip); |
661 | 251 | offset_2 += 5; |
662 | 251 | ip += 2; |
663 | 251 | int i, j, k, l, m; |
664 | 251 | i = 0; |
665 | 251 | j = ((token - 32) >> 3U); |
666 | 251 | l = -1; |
667 | 1.00k | for (k = 1; k < 4; k++) { |
668 | 753 | if ((k != i) && (k != j)) { |
669 | 502 | if (l == -1) { |
670 | 251 | l = k; |
671 | 251 | } else { |
672 | 251 | m = k; |
673 | 251 | } |
674 | 502 | } |
675 | 753 | } |
676 | 251 | memcpy(&buffercpy[i * 4], ip - offset_1, 4); |
677 | 251 | memcpy(&buffercpy[j * 4], ip - offset_1 + 4, 4); |
678 | 251 | memcpy(&buffercpy[l * 4], ip - offset_2, 4); |
679 | 251 | memcpy(&buffercpy[m * 4], ip - offset_2 + 4, 4); |
680 | | |
681 | 251 | } else { |
682 | 23 | BLOSC_TRACE_ERROR("Invalid token: %u at cell [%d, %d]\n", token, ii[0], ii[1]); |
683 | 0 | return BLOSC2_ERROR_FAILURE; |
684 | 23 | } |
685 | | // fill op with buffercpy |
686 | 1.70k | uint32_t orig = ii[0] * 4 * blockshape[1] + ii[1] * 4; |
687 | 8.54k | for (uint32_t i = 0; i < 4; i++) { |
688 | 6.83k | if (i < padding[0]) { |
689 | 5.43k | ind = orig + i * blockshape[1]; |
690 | 5.43k | memcpy(&op[ind], buffercpy, padding[1]); |
691 | 5.43k | } |
692 | 6.83k | buffercpy += padding[1]; |
693 | 6.83k | } |
694 | 1.70k | if (ind > (uint32_t) output_len) { |
695 | 0 | BLOSC_TRACE_ERROR("Exceeding output size"); |
696 | 0 | return BLOSC2_ERROR_FAILURE; |
697 | 0 | } |
698 | 1.70k | } |
699 | 8.95G | } |
700 | 823 | ind += padding[1]; |
701 | | |
702 | 823 | if ((int32_t)ind != (blockshape[0] * blockshape[1])) { |
703 | 0 | BLOSC_TRACE_ERROR("Output size is not compatible with embedded blockshape"); |
704 | 0 | return BLOSC2_ERROR_FAILURE; |
705 | 0 | } |
706 | 823 | if (ind > (uint32_t) output_len) { |
707 | 0 | BLOSC_TRACE_ERROR("Exceeding output size"); |
708 | 0 | return BLOSC2_ERROR_FAILURE; |
709 | 0 | } |
710 | | |
711 | 823 | return (int) ind; |
712 | 823 | } |