/src/htslib/htscodecs/htscodecs/rANS_static4x16pr.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2017-2023, 2025 Genome Research Ltd. |
3 | | * Author(s): James Bonfield |
4 | | * |
5 | | * Redistribution and use in source and binary forms, with or without |
6 | | * modification, are permitted provided that the following conditions are met: |
7 | | * |
8 | | * 1. Redistributions of source code must retain the above copyright notice, |
9 | | * this list of conditions and the following disclaimer. |
10 | | * |
11 | | * 2. Redistributions in binary form must reproduce the above |
12 | | * copyright notice, this list of conditions and the following |
13 | | * disclaimer in the documentation and/or other materials provided |
14 | | * with the distribution. |
15 | | * |
16 | | * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
17 | | * Institute nor the names of its contributors may be used to endorse |
18 | | * or promote products derived from this software without specific |
19 | | * prior written permission. |
20 | | * |
21 | | * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS |
22 | | * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
23 | | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
24 | | * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH |
25 | | * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
26 | | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
27 | | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
28 | | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
29 | | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
30 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
31 | | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
32 | | */ |
33 | | |
34 | | // FIXME Can we get decoder to return the compressed sized read, avoiding |
35 | | // us needing to store it? Yes we can. See c-size comments. If we added all these |
36 | | // together we could get rans_uncompress_to_4x16 to return the number of bytes |
37 | | // consumed, avoiding the calling code from needed to explicitly stored the size. |
38 | | // However the effect on name tokeniser is to save 0.1 to 0.2% so not worth it. |
39 | | |
40 | | /*-------------------------------------------------------------------------- */ |
41 | | /* |
42 | | * Example wrapper to use the rans_byte.h functions included above. |
43 | | * |
44 | | * This demonstrates how to use, and unroll, an order-0 and order-1 frequency |
45 | | * model. |
46 | | */ |
47 | | |
48 | | #include "config.h" |
49 | | |
50 | | #include <stdint.h> |
51 | | #include <stdlib.h> |
52 | | #include <stdio.h> |
53 | | #include <unistd.h> |
54 | | #include <assert.h> |
55 | | #include <string.h> |
56 | | #include <sys/time.h> |
57 | | #include <limits.h> |
58 | | #include <math.h> |
59 | | |
60 | | #ifndef NO_THREADS |
61 | | #include <pthread.h> |
62 | | #endif |
63 | | |
64 | | #include "rANS_word.h" |
65 | | #include "rANS_static4x16.h" |
66 | | #include "rANS_static16_int.h" |
67 | | #include "pack.h" |
68 | | #include "rle.h" |
69 | | #include "utils.h" |
70 | | |
71 | 3.18M | #define TF_SHIFT 12 |
72 | 981k | #define TOTFREQ (1<<TF_SHIFT) |
73 | | |
74 | | // 9-11 is considerably faster in the O1 variant due to reduced table size. |
75 | | // We auto-tune between 10 and 12 though. Anywhere from 9 to 14 are viable. |
76 | | #ifndef TF_SHIFT_O1 |
77 | | #define TF_SHIFT_O1 12 |
78 | | #endif |
79 | | #ifndef TF_SHIFT_O1_FAST |
80 | | #define TF_SHIFT_O1_FAST 10 |
81 | | #endif |
82 | 3.08M | #define TOTFREQ_O1 (1<<TF_SHIFT_O1) |
83 | 2.68M | #define TOTFREQ_O1_FAST (1<<TF_SHIFT_O1_FAST) |
84 | | |
85 | | |
86 | | /*----------------------------------------------------------------------------- |
87 | | * Memory to memory compression functions. |
88 | | * |
89 | | * These are original versions without any manual loop unrolling. They |
90 | | * are easier to understand, but can be up to 2x slower. |
91 | | */ |
92 | | |
93 | 1.02M | unsigned int rans_compress_bound_4x16(unsigned int size, int order) { |
94 | 1.02M | int N = (order>>8) & 0xff; |
95 | 1.02M | if (!N) N=4; |
96 | | |
97 | 1.02M | order &= 0xff; |
98 | 1.02M | unsigned int sz = (order == 0 |
99 | 1.02M | ? 1.05*size + 257*3 + 4 |
100 | 1.02M | : 1.05*size + 257*257*3 + 4 + 257*3+4) + |
101 | 1.02M | ((order & RANS_ORDER_PACK) ? 1 : 0) + |
102 | 1.02M | ((order & RANS_ORDER_RLE) ? 1 + 257*3+4: 0) + 20 + |
103 | 1.02M | ((order & RANS_ORDER_X32) ? (32-4)*4 : 0) + |
104 | 1.02M | ((order & RANS_ORDER_STRIPE) ? 7 + 5*N: 0); |
105 | 1.02M | return sz + (sz&1) + 2; // make this even so buffers are word aligned |
106 | 1.02M | } |
107 | | |
108 | | // Compresses in_size bytes from 'in' to *out_size bytes in 'out'. |
109 | | // |
110 | | // NB: The output buffer does not hold the original size, so it is up to |
111 | | // the caller to store this. |
112 | | unsigned char *rans_compress_O0_4x16(unsigned char *in, unsigned int in_size, |
113 | 568k | unsigned char *out, unsigned int *out_size) { |
114 | 568k | unsigned char *cp, *out_end; |
115 | 568k | RansEncSymbol syms[256]; |
116 | 568k | RansState rans0; |
117 | 568k | RansState rans2; |
118 | 568k | RansState rans1; |
119 | 568k | RansState rans3; |
120 | 568k | uint8_t* ptr; |
121 | 568k | uint32_t F[256+MAGIC] = {0}; |
122 | 568k | int i, j, tab_size = 0, rle, x; |
123 | | // -20 for order/size/meta |
124 | 568k | uint32_t bound = rans_compress_bound_4x16(in_size,0)-20; |
125 | | |
126 | 568k | if (!out) { |
127 | 338 | *out_size = bound; |
128 | 338 | out = malloc(*out_size); |
129 | 338 | } |
130 | 568k | if (!out || bound > *out_size) |
131 | 0 | return NULL; |
132 | | |
133 | | // If "out" isn't word aligned, tweak out_end/ptr to ensure it is. |
134 | | // We already added more round in bound to allow for this. |
135 | 568k | if (((size_t)out)&1) |
136 | 189k | bound--; |
137 | 568k | ptr = out_end = out + bound; |
138 | | |
139 | 568k | if (in_size == 0) |
140 | 84.9k | goto empty; |
141 | | |
142 | | // Compute statistics |
143 | 483k | if (hist8(in, in_size, F) < 0) |
144 | 0 | return NULL; |
145 | | |
146 | | // Normalise so frequences sum to power of 2 |
147 | 483k | uint32_t fsum = in_size; |
148 | 483k | uint32_t max_val = round2(fsum); |
149 | 483k | if (max_val > TOTFREQ) |
150 | 8.26k | max_val = TOTFREQ; |
151 | | |
152 | 483k | if (normalise_freq(F, fsum, max_val) < 0) |
153 | 0 | return NULL; |
154 | 483k | fsum=max_val; |
155 | | |
156 | 483k | cp = out; |
157 | 483k | cp += encode_freq(cp, F); |
158 | 483k | tab_size = cp-out; |
159 | | //write(2, out+4, cp-(out+4)); |
160 | | |
161 | 483k | if (normalise_freq(F, fsum, TOTFREQ) < 0) |
162 | 0 | return NULL; |
163 | | |
164 | | // Encode statistics. |
165 | 124M | for (x = rle = j = 0; j < 256; j++) { |
166 | 123M | if (F[j]) { |
167 | 1.92M | RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT); |
168 | 1.92M | x += F[j]; |
169 | 1.92M | } |
170 | 123M | } |
171 | | |
172 | 483k | RansEncInit(&rans0); |
173 | 483k | RansEncInit(&rans1); |
174 | 483k | RansEncInit(&rans2); |
175 | 483k | RansEncInit(&rans3); |
176 | | |
177 | 483k | switch (i=(in_size&3)) { |
178 | 88.1k | case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]); |
179 | | // fall-through |
180 | 213k | case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]); |
181 | | // fall-through |
182 | 397k | case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]); |
183 | | // fall-through |
184 | 483k | case 0: |
185 | 483k | break; |
186 | 483k | } |
187 | 886M | for (i=(in_size &~3); i>0; i-=4) { |
188 | 886M | RansEncSymbol *s3 = &syms[in[i-1]]; |
189 | 886M | RansEncSymbol *s2 = &syms[in[i-2]]; |
190 | 886M | RansEncSymbol *s1 = &syms[in[i-3]]; |
191 | 886M | RansEncSymbol *s0 = &syms[in[i-4]]; |
192 | | |
193 | 886M | #if 1 |
194 | 886M | RansEncPutSymbol(&rans3, &ptr, s3); |
195 | 886M | RansEncPutSymbol(&rans2, &ptr, s2); |
196 | 886M | RansEncPutSymbol(&rans1, &ptr, s1); |
197 | 886M | RansEncPutSymbol(&rans0, &ptr, s0); |
198 | | #else |
199 | | // Slightly beter on gcc, much better on clang |
200 | | uint16_t *ptr16 = (uint16_t *)ptr; |
201 | | |
202 | | if (rans3 >= s3->x_max) *--ptr16 = (uint16_t)rans3, rans3 >>= 16; |
203 | | if (rans2 >= s2->x_max) *--ptr16 = (uint16_t)rans2, rans2 >>= 16; |
204 | | uint32_t q3 = (uint32_t) (((uint64_t)rans3 * s3->rcp_freq) >> s3->rcp_shift); |
205 | | uint32_t q2 = (uint32_t) (((uint64_t)rans2 * s2->rcp_freq) >> s2->rcp_shift); |
206 | | rans3 += s3->bias + q3 * s3->cmpl_freq; |
207 | | rans2 += s2->bias + q2 * s2->cmpl_freq; |
208 | | |
209 | | if (rans1 >= s1->x_max) *--ptr16 = (uint16_t)rans1, rans1 >>= 16; |
210 | | if (rans0 >= s0->x_max) *--ptr16 = (uint16_t)rans0, rans0 >>= 16; |
211 | | uint32_t q1 = (uint32_t) (((uint64_t)rans1 * s1->rcp_freq) >> s1->rcp_shift); |
212 | | uint32_t q0 = (uint32_t) (((uint64_t)rans0 * s0->rcp_freq) >> s0->rcp_shift); |
213 | | rans1 += s1->bias + q1 * s1->cmpl_freq; |
214 | | rans0 += s0->bias + q0 * s0->cmpl_freq; |
215 | | |
216 | | ptr = (uint8_t *)ptr16; |
217 | | #endif |
218 | 886M | } |
219 | | |
220 | 483k | RansEncFlush(&rans3, &ptr); |
221 | 483k | RansEncFlush(&rans2, &ptr); |
222 | 483k | RansEncFlush(&rans1, &ptr); |
223 | 483k | RansEncFlush(&rans0, &ptr); |
224 | | |
225 | 568k | empty: |
226 | | // Finalise block size and return it |
227 | 568k | *out_size = (out_end - ptr) + tab_size; |
228 | | |
229 | 568k | memmove(out + tab_size, ptr, out_end-ptr); |
230 | | |
231 | 568k | return out; |
232 | 483k | } |
233 | | |
234 | | unsigned char *rans_uncompress_O0_4x16(unsigned char *in, unsigned int in_size, |
235 | 1.88k | unsigned char *out, unsigned int out_sz) { |
236 | 1.88k | if (in_size < 16) // 4-states at least |
237 | 43 | return NULL; |
238 | | |
239 | 1.84k | if (out_sz >= INT_MAX) |
240 | 4 | return NULL; // protect against some overflow cases |
241 | | |
242 | 1.83k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
243 | 1.83k | if (out_sz > 100000) |
244 | 12 | return NULL; |
245 | 1.82k | #endif |
246 | | |
247 | | /* Load in the static tables */ |
248 | 1.82k | unsigned char *cp = in, *out_free = NULL; |
249 | 1.82k | unsigned char *cp_end = in + in_size - 8; // within 8 => be extra safe |
250 | 1.82k | int i, j; |
251 | 1.82k | unsigned int x, y; |
252 | 1.82k | uint16_t sfreq[TOTFREQ+32]; |
253 | 1.82k | uint16_t sbase[TOTFREQ+32]; // faster to use 32-bit on clang |
254 | 1.82k | uint8_t ssym [TOTFREQ+64]; // faster to use 16-bit on clang |
255 | | |
256 | 1.82k | if (!out) |
257 | 133 | out_free = out = malloc(out_sz); |
258 | 1.82k | if (!out) |
259 | 0 | return NULL; |
260 | | |
261 | | // Precompute reverse lookup of frequency. |
262 | 1.82k | uint32_t F[256] = {0}, fsum; |
263 | 1.82k | int fsz = decode_freq(cp, cp_end, F, &fsum); |
264 | 1.82k | if (!fsz) |
265 | 0 | goto err; |
266 | 1.82k | cp += fsz; |
267 | | |
268 | 1.82k | normalise_freq_shift(F, fsum, TOTFREQ); |
269 | | |
270 | | // Build symbols; fixme, do as part of decode, see the _d variant |
271 | 443k | for (j = x = 0; j < 256; j++) { |
272 | 441k | if (F[j]) { |
273 | 2.85k | if (F[j] > TOTFREQ - x) |
274 | 133 | goto err; |
275 | 6.80M | for (y = 0; y < F[j]; y++) { |
276 | 6.79M | ssym [y + x] = j; |
277 | 6.79M | sfreq[y + x] = F[j]; |
278 | 6.79M | sbase[y + x] = y; |
279 | 6.79M | } |
280 | 2.71k | x += F[j]; |
281 | 2.71k | } |
282 | 441k | } |
283 | | |
284 | 1.69k | if (x != TOTFREQ) |
285 | 54 | goto err; |
286 | | |
287 | 1.64k | if (cp+16 > cp_end+8) |
288 | 7 | goto err; |
289 | | |
290 | 1.63k | RansState R[4]; |
291 | 1.63k | RansDecInit(&R[0], &cp); if (R[0] < RANS_BYTE_L) goto err; |
292 | 1.62k | RansDecInit(&R[1], &cp); if (R[1] < RANS_BYTE_L) goto err; |
293 | 1.62k | RansDecInit(&R[2], &cp); if (R[2] < RANS_BYTE_L) goto err; |
294 | 1.61k | RansDecInit(&R[3], &cp); if (R[3] < RANS_BYTE_L) goto err; |
295 | | |
296 | | // Simple version is comparable to below, but only with -O3 |
297 | | // |
298 | | // for (i = 0; cp < cp_end-8 && i < (out_sz&~7); i+=8) { |
299 | | // for(j=0; j<8;j++) { |
300 | | // RansState m = RansDecGet(&R[j%4], TF_SHIFT); |
301 | | // R[j%4] = sfreq[m] * (R[j%4] >> TF_SHIFT) + sbase[m]; |
302 | | // out[i+j] = ssym[m]; |
303 | | // RansDecRenorm(&R[j%4], &cp); |
304 | | // } |
305 | | // } |
306 | | |
307 | 12.6k | for (i = 0; cp < cp_end-8 && i < (out_sz&~7); i+=8) { |
308 | 33.0k | for (j = 0; j < 8; j+=4) { |
309 | 22.0k | RansState m0 = RansDecGet(&R[0], TF_SHIFT); |
310 | 22.0k | RansState m1 = RansDecGet(&R[1], TF_SHIFT); |
311 | 22.0k | out[i+j+0] = ssym[m0]; |
312 | 22.0k | out[i+j+1] = ssym[m1]; |
313 | | |
314 | 22.0k | R[0] = sfreq[m0] * (R[0] >> TF_SHIFT) + sbase[m0]; |
315 | 22.0k | R[1] = sfreq[m1] * (R[1] >> TF_SHIFT) + sbase[m1]; |
316 | | |
317 | 22.0k | RansState m2 = RansDecGet(&R[2], TF_SHIFT); |
318 | 22.0k | RansState m3 = RansDecGet(&R[3], TF_SHIFT); |
319 | | |
320 | 22.0k | RansDecRenorm(&R[0], &cp); |
321 | 22.0k | RansDecRenorm(&R[1], &cp); |
322 | | |
323 | 22.0k | R[2] = sfreq[m2] * (R[2] >> TF_SHIFT) + sbase[m2]; |
324 | 22.0k | R[3] = sfreq[m3] * (R[3] >> TF_SHIFT) + sbase[m3]; |
325 | | |
326 | 22.0k | RansDecRenorm(&R[2], &cp); |
327 | 22.0k | RansDecRenorm(&R[3], &cp); |
328 | | |
329 | 22.0k | out[i+j+2] = ssym[m2]; |
330 | 22.0k | out[i+j+3] = ssym[m3]; |
331 | 22.0k | } |
332 | 11.0k | } |
333 | | |
334 | | // remainder |
335 | 52.4k | for (; i < out_sz; i++) { |
336 | 50.8k | RansState m = RansDecGet(&R[i%4], TF_SHIFT); |
337 | 50.8k | R[i%4] = sfreq[m] * (R[i%4] >> TF_SHIFT) + sbase[m]; |
338 | 50.8k | out[i] = ssym[m]; |
339 | 50.8k | RansDecRenormSafe(&R[i%4], &cp, cp_end+8); |
340 | 50.8k | } |
341 | | |
342 | | //fprintf(stderr, " 0 Decoded %d bytes\n", (int)(cp-in)); //c-size |
343 | | |
344 | 1.61k | return out; |
345 | | |
346 | 216 | err: |
347 | 216 | free(out_free); |
348 | 216 | return NULL; |
349 | 1.61k | } |
350 | | |
351 | | //----------------------------------------------------------------------------- |
352 | | |
353 | | // Compute the entropy of 12-bit vs 10-bit frequency tables. |
354 | | // 10 bit means smaller memory footprint when decoding and |
355 | | // more speed due to cache hits, but it *may* be a poor |
356 | | // compression fit. |
357 | | int rans_compute_shift(uint32_t *F0, uint32_t (*F)[256], uint32_t *T, |
358 | 77.3k | uint32_t *S) { |
359 | 77.3k | int i, j; |
360 | | |
361 | 77.3k | double e10 = 0, e12 = 0; |
362 | 77.3k | int max_tot = 0; |
363 | 19.8M | for (i = 0; i < 256; i++) { |
364 | 19.8M | if (F0[i] == 0) |
365 | 19.2M | continue; |
366 | 587k | unsigned int max_val = round2(T[i]); |
367 | 587k | int ns = 0; |
368 | 2.53M | #define MAX(a,b) ((a)>(b)?(a):(b)) |
369 | | |
370 | | // Number of samples that get their freq bumped to 1 |
371 | 587k | int sm10 = 0, sm12 = 0; |
372 | 150M | for (j = 0; j < 256; j++) { |
373 | 150M | if (F[i][j] && max_val / F[i][j] > TOTFREQ_O1_FAST) |
374 | 31.7k | sm10++; |
375 | 150M | if (F[i][j] && max_val / F[i][j] > TOTFREQ_O1) |
376 | 11.4k | sm12++; |
377 | 150M | } |
378 | | |
379 | 587k | double l10 = log(TOTFREQ_O1_FAST + sm10); |
380 | 587k | double l12 = log(TOTFREQ_O1 + sm12); |
381 | 587k | double T_slow = (double)TOTFREQ_O1/T[i]; |
382 | 587k | double T_fast = (double)TOTFREQ_O1_FAST/T[i]; |
383 | | |
384 | 150M | for (j = 0; j < 256; j++) { |
385 | 150M | if (F[i][j]) { |
386 | 1.26M | ns++; |
387 | | |
388 | 1.26M | e10 -= F[i][j] * (fast_log(MAX(F[i][j]*T_fast,1)) - l10); |
389 | 1.26M | e12 -= F[i][j] * (fast_log(MAX(F[i][j]*T_slow,1)) - l12); |
390 | | |
391 | | // Estimation of compressed symbol freq table too. |
392 | 1.26M | e10 += 1.3; |
393 | 1.26M | e12 += 4.7; |
394 | 1.26M | } |
395 | 150M | } |
396 | | |
397 | | // Order-1 frequencies often end up totalling under TOTFREQ. |
398 | | // In this case it's smaller to output the real frequencies |
399 | | // prior to normalisation and normalise after (with an extra |
400 | | // normalisation step needed in the decoder too). |
401 | | // |
402 | | // Thus we normalise to a power of 2 only, store those, |
403 | | // and renormalise later here (and in decoder) by bit-shift |
404 | | // to get to the fixed size. |
405 | 587k | if (ns < 64 && max_val > 128) max_val /= 2; |
406 | 587k | if (max_val > 1024) max_val /= 2; |
407 | 587k | if (max_val > TOTFREQ_O1) max_val = TOTFREQ_O1; |
408 | 587k | S[i] = max_val; // scale to max this |
409 | 587k | if (max_tot < max_val) |
410 | 134k | max_tot = max_val; |
411 | 587k | } |
412 | 77.3k | int shift = e10/e12 < 1.01 || max_tot <= TOTFREQ_O1_FAST |
413 | 77.3k | ? TF_SHIFT_O1_FAST |
414 | 77.3k | : TF_SHIFT_O1; |
415 | | |
416 | | // fprintf(stderr, "e10/12 = %f %f %f, shift %d\n", |
417 | | // e10/log(256), e12/log(256), e10/e12, shift); |
418 | | |
419 | 77.3k | return shift; |
420 | 77.3k | } |
421 | | |
422 | | static |
423 | | unsigned char *rans_compress_O1_4x16(unsigned char *in, unsigned int in_size, |
424 | 75.5k | unsigned char *out, unsigned int *out_size) { |
425 | 75.5k | unsigned char *cp, *out_end, *out_free = NULL; |
426 | 75.5k | unsigned int tab_size; |
427 | | |
428 | | // -20 for order/size/meta |
429 | 75.5k | uint32_t bound = rans_compress_bound_4x16(in_size,1)-20; |
430 | | |
431 | 75.5k | if (!out) { |
432 | 0 | *out_size = bound; |
433 | 0 | out_free = out = malloc(*out_size); |
434 | 0 | } |
435 | 75.5k | if (!out || bound > *out_size) |
436 | 0 | return NULL; |
437 | | |
438 | 75.5k | if (((size_t)out)&1) |
439 | 20.5k | bound--; |
440 | 75.5k | out_end = out + bound; |
441 | | |
442 | 75.5k | RansEncSymbol (*syms)[256] = htscodecs_tls_alloc(256 * (sizeof(*syms))); |
443 | 75.5k | if (!syms) { |
444 | 0 | free(out_free); |
445 | 0 | return NULL; |
446 | 0 | } |
447 | | |
448 | 75.5k | cp = out; |
449 | 75.5k | int shift = encode_freq1(in, in_size, 4, syms, &cp); |
450 | 75.5k | if (shift < 0) { |
451 | 0 | htscodecs_tls_free(syms); |
452 | 0 | return NULL; |
453 | 0 | } |
454 | 75.5k | tab_size = cp - out; |
455 | | |
456 | 75.5k | RansState rans0, rans1, rans2, rans3; |
457 | 75.5k | RansEncInit(&rans0); |
458 | 75.5k | RansEncInit(&rans1); |
459 | 75.5k | RansEncInit(&rans2); |
460 | 75.5k | RansEncInit(&rans3); |
461 | | |
462 | 75.5k | uint8_t* ptr = out_end; |
463 | | |
464 | 75.5k | int isz4 = in_size>>2; |
465 | 75.5k | int i0 = 1*isz4-2; |
466 | 75.5k | int i1 = 2*isz4-2; |
467 | 75.5k | int i2 = 3*isz4-2; |
468 | 75.5k | int i3 = 4*isz4-2; |
469 | | |
470 | 75.5k | unsigned char l0 = in[i0+1]; |
471 | 75.5k | unsigned char l1 = in[i1+1]; |
472 | 75.5k | unsigned char l2 = in[i2+1]; |
473 | 75.5k | unsigned char l3 = in[i3+1]; |
474 | | |
475 | | // Deal with the remainder |
476 | 75.5k | l3 = in[in_size-1]; |
477 | 175k | for (i3 = in_size-2; i3 > 4*isz4-2; i3--) { |
478 | 99.8k | unsigned char c3 = in[i3]; |
479 | 99.8k | RansEncPutSymbol(&rans3, &ptr, &syms[c3][l3]); |
480 | 99.8k | l3 = c3; |
481 | 99.8k | } |
482 | | |
483 | 883M | for (; i0 >= 0; i0--, i1--, i2--, i3--) { |
484 | 883M | unsigned char c0, c1, c2, c3; |
485 | 883M | RansEncSymbol *s3 = &syms[c3 = in[i3]][l3]; |
486 | 883M | RansEncSymbol *s2 = &syms[c2 = in[i2]][l2]; |
487 | 883M | RansEncSymbol *s1 = &syms[c1 = in[i1]][l1]; |
488 | 883M | RansEncSymbol *s0 = &syms[c0 = in[i0]][l0]; |
489 | | |
490 | 883M | RansEncPutSymbol(&rans3, &ptr, s3); |
491 | 883M | RansEncPutSymbol(&rans2, &ptr, s2); |
492 | 883M | RansEncPutSymbol(&rans1, &ptr, s1); |
493 | 883M | RansEncPutSymbol(&rans0, &ptr, s0); |
494 | | |
495 | 883M | l0 = c0; |
496 | 883M | l1 = c1; |
497 | 883M | l2 = c2; |
498 | 883M | l3 = c3; |
499 | 883M | } |
500 | | |
501 | 75.5k | RansEncPutSymbol(&rans3, &ptr, &syms[0][l3]); |
502 | 75.5k | RansEncPutSymbol(&rans2, &ptr, &syms[0][l2]); |
503 | 75.5k | RansEncPutSymbol(&rans1, &ptr, &syms[0][l1]); |
504 | 75.5k | RansEncPutSymbol(&rans0, &ptr, &syms[0][l0]); |
505 | | |
506 | 75.5k | RansEncFlush(&rans3, &ptr); |
507 | 75.5k | RansEncFlush(&rans2, &ptr); |
508 | 75.5k | RansEncFlush(&rans1, &ptr); |
509 | 75.5k | RansEncFlush(&rans0, &ptr); |
510 | | |
511 | 75.5k | *out_size = (out_end - ptr) + tab_size; |
512 | | |
513 | 75.5k | cp = out; |
514 | 75.5k | memmove(out + tab_size, ptr, out_end-ptr); |
515 | | |
516 | 75.5k | htscodecs_tls_free(syms); |
517 | 75.5k | return out; |
518 | 75.5k | } |
519 | | |
520 | | //#define MAGIC2 111 |
521 | 291k | #define MAGIC2 179 |
522 | | //#define MAGIC2 0 |
523 | | |
524 | | static |
525 | | unsigned char *rans_uncompress_O1_4x16(unsigned char *in, unsigned int in_size, |
526 | 1.14k | unsigned char *out, unsigned int out_sz) { |
527 | 1.14k | if (in_size < 16) // 4-states at least |
528 | 10 | return NULL; |
529 | | |
530 | 1.13k | if (out_sz >= INT_MAX) |
531 | 0 | return NULL; // protect against some overflow cases |
532 | | |
533 | 1.13k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
534 | 1.13k | if (out_sz > 100000) |
535 | 0 | return NULL; |
536 | 1.13k | #endif |
537 | | |
538 | | /* Load in the static tables */ |
539 | 1.13k | unsigned char *cp = in, *cp_end = in+in_size, *out_free = NULL; |
540 | 1.13k | unsigned char *c_freq = NULL; |
541 | 1.13k | int i, j = -999; |
542 | 1.13k | unsigned int x; |
543 | | |
544 | 1.13k | uint8_t *sfb_ = htscodecs_tls_alloc(256*(TOTFREQ_O1+MAGIC2)*sizeof(*sfb_)); |
545 | 1.13k | uint32_t (*s3)[TOTFREQ_O1_FAST] = (uint32_t (*)[TOTFREQ_O1_FAST])sfb_; |
546 | | // reuse the same memory for the fast mode lookup, but this only works |
547 | | // if we're on e.g. 12-bit freqs vs 10-bit freqs as needs 4x larger array. |
548 | | //uint32_t s3[256][TOTFREQ_O1_FAST]; |
549 | | |
550 | 1.13k | if (!sfb_) |
551 | 0 | return NULL; |
552 | 1.13k | fb_t (*fb)[256] = htscodecs_tls_alloc(256 * sizeof(*fb)); |
553 | 1.13k | if (!fb) |
554 | 0 | goto err; |
555 | 1.13k | uint8_t *sfb[256]; |
556 | 1.13k | if ((*cp >> 4) == TF_SHIFT_O1) { |
557 | 51.9k | for (i = 0; i < 256; i++) |
558 | 51.7k | sfb[i]= sfb_ + i*(TOTFREQ_O1+MAGIC2); |
559 | 932 | } else { |
560 | 239k | for (i = 0; i < 256; i++) |
561 | 238k | sfb[i]= sfb_ + i*(TOTFREQ_O1_FAST+MAGIC2); |
562 | 932 | } |
563 | | |
564 | 1.13k | if (!out) |
565 | 0 | out_free = out = malloc(out_sz); |
566 | | |
567 | 1.13k | if (!out) |
568 | 0 | goto err; |
569 | | |
570 | | //fprintf(stderr, "out_sz=%d\n", out_sz); |
571 | | |
572 | | // compressed header? If so uncompress it |
573 | 1.13k | unsigned char *tab_end = NULL; |
574 | 1.13k | unsigned char *c_freq_end = cp_end; |
575 | 1.13k | unsigned int shift = *cp >> 4; |
576 | 1.13k | if (*cp++ & 1) { |
577 | 81 | uint32_t u_freq_sz, c_freq_sz; |
578 | 81 | cp += var_get_u32(cp, cp_end, &u_freq_sz); |
579 | 81 | cp += var_get_u32(cp, cp_end, &c_freq_sz); |
580 | 81 | if (c_freq_sz > cp_end - cp) |
581 | 18 | goto err; |
582 | 63 | tab_end = cp + c_freq_sz; |
583 | 63 | if (!(c_freq = rans_uncompress_O0_4x16(cp, c_freq_sz, NULL, u_freq_sz))) |
584 | 46 | goto err; |
585 | 17 | cp = c_freq; |
586 | 17 | c_freq_end = c_freq + u_freq_sz; |
587 | 17 | } |
588 | | |
589 | | // Decode order-0 symbol list; avoids needing in order-1 tables |
590 | 1.07k | uint32_t F0[256] = {0}; |
591 | 1.07k | int fsz = decode_alphabet(cp, c_freq_end, F0); |
592 | 1.07k | if (!fsz) |
593 | 19 | goto err; |
594 | 1.05k | cp += fsz; |
595 | | |
596 | 1.05k | if (cp >= c_freq_end) |
597 | 19 | goto err; |
598 | | |
599 | 1.03k | const int s3_fast_on = in_size >= 100000; |
600 | | |
601 | 235k | for (i = 0; i < 256; i++) { |
602 | 234k | if (F0[i] == 0) |
603 | 233k | continue; |
604 | | |
605 | 1.63k | uint32_t F[256] = {0}, T = 0; |
606 | 1.63k | fsz = decode_freq_d(cp, c_freq_end, F0, F, &T); |
607 | 1.63k | if (!fsz) |
608 | 3 | goto err; |
609 | 1.63k | cp += fsz; |
610 | | |
611 | 1.63k | if (!T) { |
612 | | //fprintf(stderr, "No freq for F_%d\n", i); |
613 | 847 | continue; |
614 | 847 | } |
615 | | |
616 | 788 | normalise_freq_shift(F, T, 1<<shift); |
617 | | |
618 | | // Build symbols; fixme, do as part of decode, see the _d variant |
619 | 178k | for (j = x = 0; j < 256; j++) { |
620 | 178k | if (F[j]) { |
621 | 1.86k | if (F[j] > (1<<shift) - x) |
622 | 123 | goto err; |
623 | | |
624 | 1.74k | if (shift == TF_SHIFT_O1_FAST && s3_fast_on) { |
625 | 0 | int y; |
626 | 0 | for (y = 0; y < F[j]; y++) |
627 | 0 | s3[i][y+x] = (((uint32_t)F[j])<<(shift+8)) |(y<<8) |j; |
628 | 1.74k | } else { |
629 | 1.74k | memset(&sfb[i][x], j, F[j]); |
630 | 1.74k | fb[i][j].f = F[j]; |
631 | 1.74k | fb[i][j].b = x; |
632 | 1.74k | } |
633 | 1.74k | x += F[j]; |
634 | 1.74k | } |
635 | 178k | } |
636 | 665 | if (x != (1<<shift)) |
637 | 0 | goto err; |
638 | 665 | } |
639 | | |
640 | 906 | if (tab_end) |
641 | 0 | cp = tab_end; |
642 | 906 | free(c_freq); |
643 | 906 | c_freq = NULL; |
644 | | |
645 | 906 | if (cp+16 > cp_end) |
646 | 6 | goto err; |
647 | | |
648 | 900 | RansState rans0, rans1, rans2, rans3; |
649 | 900 | uint8_t *ptr = cp, *ptr_end = in + in_size - 8; |
650 | 900 | RansDecInit(&rans0, &ptr); if (rans0 < RANS_BYTE_L) goto err; |
651 | 894 | RansDecInit(&rans1, &ptr); if (rans1 < RANS_BYTE_L) goto err; |
652 | 892 | RansDecInit(&rans2, &ptr); if (rans2 < RANS_BYTE_L) goto err; |
653 | 892 | RansDecInit(&rans3, &ptr); if (rans3 < RANS_BYTE_L) goto err; |
654 | | |
655 | 889 | unsigned int isz4 = out_sz>>2; |
656 | 889 | int l0 = 0, l1 = 0, l2 = 0, l3 = 0; |
657 | 889 | unsigned int i4[] = {0*isz4, 1*isz4, 2*isz4, 3*isz4}; |
658 | | |
659 | 889 | RansState R[4]; |
660 | 889 | R[0] = rans0; |
661 | 889 | R[1] = rans1; |
662 | 889 | R[2] = rans2; |
663 | 889 | R[3] = rans3; |
664 | | |
665 | | // Around 15% faster to specialise for 10/12 than to have one |
666 | | // loop with shift as a variable. |
667 | 889 | if (shift == TF_SHIFT_O1) { |
668 | | // TF_SHIFT_O1 = 12 |
669 | | |
670 | 172 | const uint32_t mask = ((1u << TF_SHIFT_O1)-1); |
671 | 38.9k | for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) { |
672 | 38.7k | uint16_t m, c; |
673 | 38.7k | c = sfb[l0][m = R[0] & mask]; |
674 | 38.7k | R[0] = fb[l0][c].f * (R[0]>>TF_SHIFT_O1) + m - fb[l0][c].b; |
675 | 38.7k | out[i4[0]] = l0 = c; |
676 | | |
677 | 38.7k | c = sfb[l1][m = R[1] & mask]; |
678 | 38.7k | R[1] = fb[l1][c].f * (R[1]>>TF_SHIFT_O1) + m - fb[l1][c].b; |
679 | 38.7k | out[i4[1]] = l1 = c; |
680 | | |
681 | 38.7k | c = sfb[l2][m = R[2] & mask]; |
682 | 38.7k | R[2] = fb[l2][c].f * (R[2]>>TF_SHIFT_O1) + m - fb[l2][c].b; |
683 | 38.7k | out[i4[2]] = l2 = c; |
684 | | |
685 | 38.7k | c = sfb[l3][m = R[3] & mask]; |
686 | 38.7k | R[3] = fb[l3][c].f * (R[3]>>TF_SHIFT_O1) + m - fb[l3][c].b; |
687 | 38.7k | out[i4[3]] = l3 = c; |
688 | | |
689 | 38.7k | if (ptr < ptr_end) { |
690 | 2.08k | RansDecRenorm(&R[0], &ptr); |
691 | 2.08k | RansDecRenorm(&R[1], &ptr); |
692 | 2.08k | RansDecRenorm(&R[2], &ptr); |
693 | 2.08k | RansDecRenorm(&R[3], &ptr); |
694 | 36.7k | } else { |
695 | 36.7k | RansDecRenormSafe(&R[0], &ptr, ptr_end+8); |
696 | 36.7k | RansDecRenormSafe(&R[1], &ptr, ptr_end+8); |
697 | 36.7k | RansDecRenormSafe(&R[2], &ptr, ptr_end+8); |
698 | 36.7k | RansDecRenormSafe(&R[3], &ptr, ptr_end+8); |
699 | 36.7k | } |
700 | 38.7k | } |
701 | | |
702 | | // Remainder |
703 | 538 | for (; i4[3] < out_sz; i4[3]++) { |
704 | 366 | uint32_t m3 = R[3] & ((1u<<TF_SHIFT_O1)-1); |
705 | 366 | unsigned char c3 = sfb[l3][m3]; |
706 | 366 | out[i4[3]] = c3; |
707 | 366 | R[3] = fb[l3][c3].f * (R[3]>>TF_SHIFT_O1) + m3 - fb[l3][c3].b; |
708 | 366 | RansDecRenormSafe(&R[3], &ptr, ptr_end + 8); |
709 | 366 | l3 = c3; |
710 | 366 | } |
711 | 717 | } else if (!s3_fast_on) { |
712 | | // TF_SHIFT_O1 = 10 with sfb[256][1024] & fb[256]256] array lookup |
713 | | // Slightly faster for -o193 on q4 (high comp), but also less |
714 | | // initialisation cost for smaller data |
715 | 717 | const uint32_t mask = ((1u << TF_SHIFT_O1_FAST)-1); |
716 | 255k | for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) { |
717 | 254k | uint16_t m, c; |
718 | 254k | c = sfb[l0][m = R[0] & mask]; |
719 | 254k | R[0] = fb[l0][c].f * (R[0]>>TF_SHIFT_O1_FAST) + m - fb[l0][c].b; |
720 | 254k | out[i4[0]] = l0 = c; |
721 | | |
722 | 254k | c = sfb[l1][m = R[1] & mask]; |
723 | 254k | R[1] = fb[l1][c].f * (R[1]>>TF_SHIFT_O1_FAST) + m - fb[l1][c].b; |
724 | 254k | out[i4[1]] = l1 = c; |
725 | | |
726 | 254k | c = sfb[l2][m = R[2] & mask]; |
727 | 254k | R[2] = fb[l2][c].f * (R[2]>>TF_SHIFT_O1_FAST) + m - fb[l2][c].b; |
728 | 254k | out[i4[2]] = l2 = c; |
729 | | |
730 | 254k | c = sfb[l3][m = R[3] & mask]; |
731 | 254k | R[3] = fb[l3][c].f * (R[3]>>TF_SHIFT_O1_FAST) + m - fb[l3][c].b; |
732 | 254k | out[i4[3]] = l3 = c; |
733 | | |
734 | 254k | if (ptr < ptr_end) { |
735 | 10.6k | RansDecRenorm(&R[0], &ptr); |
736 | 10.6k | RansDecRenorm(&R[1], &ptr); |
737 | 10.6k | RansDecRenorm(&R[2], &ptr); |
738 | 10.6k | RansDecRenorm(&R[3], &ptr); |
739 | 244k | } else { |
740 | 244k | RansDecRenormSafe(&R[0], &ptr, ptr_end+8); |
741 | 244k | RansDecRenormSafe(&R[1], &ptr, ptr_end+8); |
742 | 244k | RansDecRenormSafe(&R[2], &ptr, ptr_end+8); |
743 | 244k | RansDecRenormSafe(&R[3], &ptr, ptr_end+8); |
744 | 244k | } |
745 | 254k | } |
746 | | |
747 | | // Remainder |
748 | 1.54k | for (; i4[3] < out_sz; i4[3]++) { |
749 | 826 | uint32_t m3 = R[3] & ((1u<<TF_SHIFT_O1_FAST)-1); |
750 | 826 | unsigned char c3 = sfb[l3][m3]; |
751 | 826 | out[i4[3]] = c3; |
752 | 826 | R[3] = fb[l3][c3].f * (R[3]>>TF_SHIFT_O1_FAST) + m3 - fb[l3][c3].b; |
753 | 826 | RansDecRenormSafe(&R[3], &ptr, ptr_end + 8); |
754 | 826 | l3 = c3; |
755 | 826 | } |
756 | 717 | } else { |
757 | | // TF_SHIFT_O1_FAST. |
758 | | // Significantly faster for -o1 on q40 (low comp). |
759 | | // Higher initialisation cost, so only use if big blocks. |
760 | 0 | const uint32_t mask = ((1u << TF_SHIFT_O1_FAST)-1); |
761 | 0 | for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) { |
762 | 0 | uint32_t S0 = s3[l0][R[0] & mask]; |
763 | 0 | uint32_t S1 = s3[l1][R[1] & mask]; |
764 | 0 | l0 = out[i4[0]] = S0; |
765 | 0 | l1 = out[i4[1]] = S1; |
766 | 0 | uint16_t F0 = S0>>(TF_SHIFT_O1_FAST+8); |
767 | 0 | uint16_t F1 = S1>>(TF_SHIFT_O1_FAST+8); |
768 | 0 | uint16_t B0 = (S0>>8) & mask; |
769 | 0 | uint16_t B1 = (S1>>8) & mask; |
770 | |
|
771 | 0 | R[0] = F0 * (R[0]>>TF_SHIFT_O1_FAST) + B0; |
772 | 0 | R[1] = F1 * (R[1]>>TF_SHIFT_O1_FAST) + B1; |
773 | |
|
774 | 0 | uint32_t S2 = s3[l2][R[2] & mask]; |
775 | 0 | uint32_t S3 = s3[l3][R[3] & mask]; |
776 | 0 | l2 = out[i4[2]] = S2; |
777 | 0 | l3 = out[i4[3]] = S3; |
778 | 0 | uint16_t F2 = S2>>(TF_SHIFT_O1_FAST+8); |
779 | 0 | uint16_t F3 = S3>>(TF_SHIFT_O1_FAST+8); |
780 | 0 | uint16_t B2 = (S2>>8) & mask; |
781 | 0 | uint16_t B3 = (S3>>8) & mask; |
782 | |
|
783 | 0 | R[2] = F2 * (R[2]>>TF_SHIFT_O1_FAST) + B2; |
784 | 0 | R[3] = F3 * (R[3]>>TF_SHIFT_O1_FAST) + B3; |
785 | |
|
786 | 0 | if (ptr < ptr_end) { |
787 | 0 | RansDecRenorm(&R[0], &ptr); |
788 | 0 | RansDecRenorm(&R[1], &ptr); |
789 | 0 | RansDecRenorm(&R[2], &ptr); |
790 | 0 | RansDecRenorm(&R[3], &ptr); |
791 | 0 | } else { |
792 | 0 | RansDecRenormSafe(&R[0], &ptr, ptr_end+8); |
793 | 0 | RansDecRenormSafe(&R[1], &ptr, ptr_end+8); |
794 | 0 | RansDecRenormSafe(&R[2], &ptr, ptr_end+8); |
795 | 0 | RansDecRenormSafe(&R[3], &ptr, ptr_end+8); |
796 | 0 | } |
797 | 0 | } |
798 | | |
799 | | // Remainder |
800 | 0 | for (; i4[3] < out_sz; i4[3]++) { |
801 | 0 | uint32_t S = s3[l3][R[3] & ((1u<<TF_SHIFT_O1_FAST)-1)]; |
802 | 0 | l3 = out[i4[3]] = S; |
803 | 0 | R[3] = (S>>(TF_SHIFT_O1_FAST+8)) * (R[3]>>TF_SHIFT_O1_FAST) |
804 | 0 | + ((S>>8) & ((1u<<TF_SHIFT_O1_FAST)-1)); |
805 | 0 | RansDecRenormSafe(&R[3], &ptr, ptr_end + 8); |
806 | 0 | } |
807 | 0 | } |
808 | | //fprintf(stderr, " 1 Decoded %d bytes\n", (int)(ptr-in)); //c-size |
809 | | |
810 | 889 | htscodecs_tls_free(fb); |
811 | 889 | htscodecs_tls_free(sfb_); |
812 | 889 | return out; |
813 | | |
814 | 245 | err: |
815 | 245 | htscodecs_tls_free(fb); |
816 | 245 | htscodecs_tls_free(sfb_); |
817 | 245 | free(out_free); |
818 | 245 | free(c_freq); |
819 | | |
820 | 245 | return NULL; |
821 | 892 | } |
822 | | |
823 | | /*----------------------------------------------------------------------------- |
824 | | * r32x16 implementation, included here for now for simplicity |
825 | | */ |
826 | | #include "rANS_static32x16pr.h" |
827 | | |
828 | | static int rans_cpu = 0xFFFF; // all |
829 | | |
830 | | #if defined(__x86_64__) && \ |
831 | | defined(HAVE_DECL___CPUID_COUNT) && HAVE_DECL___CPUID_COUNT && \ |
832 | | defined(HAVE_DECL___GET_CPUID_MAX) && HAVE_DECL___GET_CPUID_MAX |
833 | | #include <cpuid.h> |
834 | | |
835 | | #if defined(__clang__) && defined(__has_attribute) |
836 | | # if __has_attribute(unused) |
837 | | # define UNUSED __attribute__((unused)) |
838 | | # else |
839 | | # define UNUSED |
840 | | # endif |
841 | | #elif defined(__GNUC__) && __GNUC__ >= 3 |
842 | | # define UNUSED __attribute__((unused)) |
843 | | #else |
844 | | # define UNUSED |
845 | | #endif |
846 | | |
847 | | #if defined(__APPLE__) |
848 | | /* |
849 | | MacOS before 12.2 (a.k.a. Darwin 21.3) had a bug that could cause random |
850 | | failures on some AVX512 operations due to opmask registers not being restored |
851 | | correctly following interrupts. For simplicity, check that the major version |
852 | | is 22 or higher before using AVX512. |
853 | | See https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259 |
854 | | */ |
855 | | |
856 | | #include <sys/utsname.h> |
857 | | static inline int not_ancient_darwin(void) { |
858 | | static long version = 0; |
859 | | if (!version) { |
860 | | struct utsname uname_info; |
861 | | if (uname(&uname_info) == 0) { |
862 | | version = strtol(uname_info.release, NULL, 10); |
863 | | } |
864 | | } |
865 | | return version >= 22; |
866 | | } |
867 | | #else |
868 | 0 | static inline int not_ancient_darwin(void) { |
869 | 0 | return 1; |
870 | 0 | } |
871 | | #endif |
872 | | |
873 | | |
874 | | // CPU detection is performed once. NB this has an assumption that we're |
875 | | // not migrating between processes with different instruction stes, but |
876 | | // to date the only systems I know of that support this don't have different |
877 | | // capabilities (that we use) per core. |
878 | | #ifndef NO_THREADS |
879 | | static pthread_once_t rans_cpu_once = PTHREAD_ONCE_INIT; |
880 | | #endif |
881 | | |
882 | | static int have_ssse3 UNUSED = 0; |
883 | | static int have_sse4_1 UNUSED = 0; |
884 | | static int have_popcnt UNUSED = 0; |
885 | | static int have_avx2 UNUSED = 0; |
886 | | static int have_avx512f UNUSED = 0; |
887 | | static int is_amd UNUSED = 0; |
888 | | |
889 | | #define HAVE_HTSCODECS_TLS_CPU_INIT |
890 | 1 | static void htscodecs_tls_cpu_init(void) { |
891 | 1 | unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0; |
892 | 1 | unsigned int have_xsave UNUSED = 0; |
893 | 1 | unsigned int have_avx UNUSED = 0; |
894 | 1 | uint64_t xcr0 UNUSED = 0ULL; |
895 | 1 | const uint64_t xcr0_can_use_avx UNUSED = (1ULL << 2); |
896 | 1 | const uint64_t xcr0_can_use_avx512 UNUSED = (7ULL << 5); |
897 | | // These may be unused, depending on HAVE_* config.h macros |
898 | | |
899 | 1 | int level = __get_cpuid_max(0, NULL); |
900 | 1 | __cpuid_count(0, 0, eax, ebx, ecx, edx); |
901 | 1 | is_amd = (ecx == 0x444d4163); |
902 | 1 | if (level >= 1) { |
903 | 1 | __cpuid_count(1, 0, eax, ebx, ecx, edx); |
904 | 1 | #if defined(bit_SSSE3) |
905 | 1 | have_ssse3 = ecx & bit_SSSE3; |
906 | 1 | #endif |
907 | 1 | #if defined(bit_POPCNT) |
908 | 1 | have_popcnt = ecx & bit_POPCNT; |
909 | 1 | #endif |
910 | 1 | #if defined(bit_SSE4_1) |
911 | 1 | have_sse4_1 = ecx & bit_SSE4_1; |
912 | 1 | #endif |
913 | 1 | #if defined(bit_AVX) |
914 | 1 | have_avx = ecx & bit_AVX; |
915 | 1 | #endif |
916 | 1 | #if defined(bit_XSAVE) && defined(bit_OSXSAVE) |
917 | 1 | have_xsave = (ecx & bit_XSAVE) && (ecx & bit_OSXSAVE); |
918 | 1 | if (have_xsave) { |
919 | | /* OSXSAVE tells us it's safe to use XGETBV to read XCR0 |
920 | | which then describes if AVX / AVX512 instructions can be |
921 | | executed. See Intel 64 and IA-32 Architectures Software |
922 | | Developer’s Manual Vol. 1 sections 13.2 and 13.3. |
923 | | |
924 | | Use inline assembly for XGETBV here to avoid problems |
925 | | with builtins either not working correctly, or requiring |
926 | | specific compiler options to be in use. Also emit raw |
927 | | bytes here as older toolchains may not have the XGETBV |
928 | | instruction. |
929 | | */ |
930 | 1 | __asm__ volatile (".byte 0x0f, 0x01, 0xd0" : |
931 | 1 | "=d" (edx), "=a" (eax) : |
932 | 1 | "c" (0)); |
933 | 1 | xcr0 = ((uint64_t) edx << 32) | eax; |
934 | 1 | } |
935 | 1 | #endif |
936 | 1 | } |
937 | | // AVX2 and AVX512F depend on XSAVE, AVX and bit 2 of XCR0. |
938 | 1 | if (level >= 7 && have_xsave && have_avx |
939 | 1 | && (xcr0 & xcr0_can_use_avx) == xcr0_can_use_avx) { |
940 | 1 | __cpuid_count(7, 0, eax, ebx, ecx, edx); |
941 | 1 | #if defined(bit_AVX2) |
942 | 1 | have_avx2 = ebx & bit_AVX2; |
943 | 1 | #endif |
944 | 1 | #if defined(bit_AVX512F) |
945 | | // AVX512 depends on bits 5:7 of XCR0 |
946 | 1 | if ((xcr0 & xcr0_can_use_avx512) == xcr0_can_use_avx512 |
947 | 0 | && not_ancient_darwin()) |
948 | 0 | have_avx512f = ebx & bit_AVX512F; |
949 | 1 | #endif |
950 | 1 | } |
951 | | |
952 | 1 | if (!have_popcnt) have_avx512f = have_avx2 = have_sse4_1 = 0; |
953 | 1 | if (!have_ssse3) have_sse4_1 = 0; |
954 | 1 | } |
955 | | |
956 | | static inline |
957 | | unsigned char *(*rans_enc_func(int do_simd, int order)) |
958 | | (unsigned char *in, |
959 | | unsigned int in_size, |
960 | | unsigned char *out, |
961 | 649k | unsigned int *out_size) { |
962 | | |
963 | 649k | if (!do_simd) { // SIMD disabled |
964 | 643k | return order & 1 |
965 | 643k | ? rans_compress_O1_4x16 |
966 | 643k | : rans_compress_O0_4x16; |
967 | 643k | } |
968 | | |
969 | | #ifdef NO_THREADS |
970 | | htscodecs_tls_cpu_init(); |
971 | | #else |
972 | 5.68k | int err = pthread_once(&rans_cpu_once, htscodecs_tls_cpu_init); |
973 | 5.68k | if (err != 0) { |
974 | 0 | fprintf(stderr, "Initialising TLS data failed: pthread_once: %s\n", |
975 | 0 | strerror(err)); |
976 | 0 | fprintf(stderr, "Using scalar code only\n"); |
977 | 0 | } |
978 | 5.68k | #endif |
979 | | |
980 | 5.68k | int have_e_sse4_1 = have_sse4_1; |
981 | 5.68k | int have_e_avx2 = have_avx2; |
982 | 5.68k | int have_e_avx512f = have_avx512f; |
983 | | |
984 | 5.68k | if (!(rans_cpu & RANS_CPU_ENC_AVX512)) have_e_avx512f = 0; |
985 | 5.68k | if (!(rans_cpu & RANS_CPU_ENC_AVX2)) have_e_avx2 = 0; |
986 | 5.68k | if (!(rans_cpu & RANS_CPU_ENC_SSE4)) have_e_sse4_1 = 0; |
987 | | |
988 | 5.68k | if (order & 1) { |
989 | | // With simulated gathers, the AVX512 is now slower than AVX2, so |
990 | | // we avoid using it unless asking for the real avx512 gather. |
991 | | // Note for testing we do -c 0x0404 to enable AVX512 and disable AVX2. |
992 | | // We then need to call the avx512 func regardless. |
993 | 1.84k | int use_gather; |
994 | | #ifdef USE_GATHER |
995 | | use_gather = 1; |
996 | | #else |
997 | 1.84k | use_gather = !have_e_avx2; |
998 | 1.84k | #endif |
999 | | |
1000 | 1.84k | #if defined(HAVE_AVX512) |
1001 | 1.84k | if (have_e_avx512f && (!is_amd || !have_e_avx2) && use_gather) |
1002 | 0 | return rans_compress_O1_32x16_avx512; |
1003 | 1.84k | #endif |
1004 | 1.84k | #if defined(HAVE_AVX2) |
1005 | 1.84k | if (have_e_avx2) |
1006 | 1.84k | return rans_compress_O1_32x16_avx2; |
1007 | 0 | #endif |
1008 | 0 | #if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT) |
1009 | 0 | if (have_e_sse4_1) |
1010 | 0 | return rans_compress_O1_32x16; |
1011 | 0 | #endif |
1012 | 0 | return rans_compress_O1_32x16; |
1013 | 3.83k | } else { |
1014 | 3.83k | #if defined(HAVE_AVX512) |
1015 | 3.83k | if (have_e_avx512f && (!is_amd || !have_e_avx2)) |
1016 | 0 | return rans_compress_O0_32x16_avx512; |
1017 | 3.83k | #endif |
1018 | 3.83k | #if defined(HAVE_AVX2) |
1019 | 3.83k | if (have_e_avx2) |
1020 | 3.83k | return rans_compress_O0_32x16_avx2; |
1021 | 0 | #endif |
1022 | 0 | #if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT) |
1023 | 0 | if (have_e_sse4_1) |
1024 | 0 | return rans_compress_O0_32x16; |
1025 | 0 | #endif |
1026 | 0 | return rans_compress_O0_32x16; |
1027 | 0 | } |
1028 | 5.68k | } |
1029 | | |
1030 | | static inline |
1031 | | unsigned char *(*rans_dec_func(int do_simd, int order)) |
1032 | | (unsigned char *in, |
1033 | | unsigned int in_size, |
1034 | | unsigned char *out, |
1035 | 11.3k | unsigned int out_size) { |
1036 | | |
1037 | 11.3k | if (!do_simd) { // SIMD disabled |
1038 | 2.87k | return order & 1 |
1039 | 2.87k | ? rans_uncompress_O1_4x16 |
1040 | 2.87k | : rans_uncompress_O0_4x16; |
1041 | 2.87k | } |
1042 | | |
1043 | | #ifdef NO_THREADS |
1044 | | htscodecs_tls_cpu_init(); |
1045 | | #else |
1046 | 8.43k | int err = pthread_once(&rans_cpu_once, htscodecs_tls_cpu_init); |
1047 | 8.43k | if (err != 0) { |
1048 | 0 | fprintf(stderr, "Initialising TLS data failed: pthread_once: %s\n", |
1049 | 0 | strerror(err)); |
1050 | 0 | fprintf(stderr, "Using scalar code only\n"); |
1051 | 0 | } |
1052 | 8.43k | #endif |
1053 | | |
1054 | 8.43k | int have_d_sse4_1 = have_sse4_1; |
1055 | 8.43k | int have_d_avx2 = have_avx2; |
1056 | 8.43k | int have_d_avx512f = have_avx512f; |
1057 | | |
1058 | 8.43k | if (!(rans_cpu & RANS_CPU_DEC_AVX512)) have_d_avx512f = 0; |
1059 | 8.43k | if (!(rans_cpu & RANS_CPU_DEC_AVX2)) have_d_avx2 = 0; |
1060 | 8.43k | if (!(rans_cpu & RANS_CPU_DEC_SSE4)) have_d_sse4_1 = 0; |
1061 | | |
1062 | 8.43k | if (order & 1) { |
1063 | 7.16k | #if defined(HAVE_AVX512) |
1064 | 7.16k | if (have_d_avx512f) |
1065 | 0 | return rans_uncompress_O1_32x16_avx512; |
1066 | 7.16k | #endif |
1067 | 7.16k | #if defined(HAVE_AVX2) |
1068 | 7.16k | if (have_d_avx2) |
1069 | 7.16k | return rans_uncompress_O1_32x16_avx2; |
1070 | 0 | #endif |
1071 | 0 | #if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT) |
1072 | 0 | if (have_d_sse4_1) |
1073 | 0 | return rans_uncompress_O1_32x16_sse4; |
1074 | 0 | #endif |
1075 | 0 | return rans_uncompress_O1_32x16; |
1076 | 1.26k | } else { |
1077 | 1.26k | #if defined(HAVE_AVX512) |
1078 | 1.26k | if (have_d_avx512f) |
1079 | 0 | return rans_uncompress_O0_32x16_avx512; |
1080 | 1.26k | #endif |
1081 | 1.26k | #if defined(HAVE_AVX2) |
1082 | 1.26k | if (have_d_avx2) |
1083 | 1.26k | return rans_uncompress_O0_32x16_avx2; |
1084 | 0 | #endif |
1085 | 0 | #if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT) |
1086 | 0 | if (have_d_sse4_1) |
1087 | 0 | return rans_uncompress_O0_32x16_sse4; |
1088 | 0 | #endif |
1089 | 0 | return rans_uncompress_O0_32x16; |
1090 | 0 | } |
1091 | 8.43k | } |
1092 | | |
1093 | | #elif defined(__ARM_NEON) && defined(__aarch64__) |
1094 | | |
1095 | | #if defined(__linux__) || defined(__FreeBSD__) |
1096 | | #include <sys/auxv.h> |
1097 | | #elif defined(_WIN32) |
1098 | | #include <processthreadsapi.h> |
1099 | | #endif |
1100 | | |
1101 | | static inline int have_neon(void) { |
1102 | | #if defined(__linux__) && defined(__arm__) |
1103 | | return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0; |
1104 | | #elif defined(__linux__) && defined(__aarch64__) && defined(HWCAP_ASIMD) |
1105 | | return (getauxval(AT_HWCAP) & HWCAP_ASIMD) != 0; |
1106 | | #elif defined(__APPLE__) |
1107 | | return 1; |
1108 | | #elif defined(__FreeBSD__) && defined(__arm__) |
1109 | | unsigned long cap; |
1110 | | if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0; |
1111 | | return (cap & HWCAP_NEON) != 0; |
1112 | | #elif defined(__FreeBSD__) && defined(__aarch64__) && defined(HWCAP_ASIMD) |
1113 | | unsigned long cap; |
1114 | | if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0; |
1115 | | return (cap & HWCAP_ASIMD) != 0; |
1116 | | #elif defined(_WIN32) |
1117 | | return IsProcessorFeaturePresent(PF_ARM_V8_INSTRUCTIONS_AVAILABLE) != 0; |
1118 | | #else |
1119 | | return 0; |
1120 | | #endif |
1121 | | } |
1122 | | |
1123 | | static inline |
1124 | | unsigned char *(*rans_enc_func(int do_simd, int order)) |
1125 | | (unsigned char *in, |
1126 | | unsigned int in_size, |
1127 | | unsigned char *out, |
1128 | | unsigned int *out_size) { |
1129 | | |
1130 | | if (do_simd) { |
1131 | | if ((rans_cpu & RANS_CPU_ENC_NEON) && have_neon()) |
1132 | | return order & 1 |
1133 | | ? rans_compress_O1_32x16_neon |
1134 | | : rans_compress_O0_32x16_neon; |
1135 | | else |
1136 | | return order & 1 |
1137 | | ? rans_compress_O1_32x16 |
1138 | | : rans_compress_O0_32x16; |
1139 | | } else { |
1140 | | return order & 1 |
1141 | | ? rans_compress_O1_4x16 |
1142 | | : rans_compress_O0_4x16; |
1143 | | } |
1144 | | } |
1145 | | |
1146 | | static inline |
1147 | | unsigned char *(*rans_dec_func(int do_simd, int order)) |
1148 | | (unsigned char *in, |
1149 | | unsigned int in_size, |
1150 | | unsigned char *out, |
1151 | | unsigned int out_size) { |
1152 | | |
1153 | | if (do_simd) { |
1154 | | if ((rans_cpu & RANS_CPU_DEC_NEON) && have_neon()) |
1155 | | return order & 1 |
1156 | | ? rans_uncompress_O1_32x16_neon |
1157 | | : rans_uncompress_O0_32x16_neon; |
1158 | | else |
1159 | | return order & 1 |
1160 | | ? rans_uncompress_O1_32x16 |
1161 | | : rans_uncompress_O0_32x16; |
1162 | | } else { |
1163 | | return order & 1 |
1164 | | ? rans_uncompress_O1_4x16 |
1165 | | : rans_uncompress_O0_4x16; |
1166 | | } |
1167 | | } |
1168 | | |
1169 | | #else // !(defined(__GNUC__) && defined(__x86_64__)) && !defined(__ARM_NEON) |
1170 | | |
1171 | | static inline |
1172 | | unsigned char *(*rans_enc_func(int do_simd, int order)) |
1173 | | (unsigned char *in, |
1174 | | unsigned int in_size, |
1175 | | unsigned char *out, |
1176 | | unsigned int *out_size) { |
1177 | | |
1178 | | if (do_simd) { |
1179 | | return order & 1 |
1180 | | ? rans_compress_O1_32x16 |
1181 | | : rans_compress_O0_32x16; |
1182 | | } else { |
1183 | | return order & 1 |
1184 | | ? rans_compress_O1_4x16 |
1185 | | : rans_compress_O0_4x16; |
1186 | | } |
1187 | | } |
1188 | | |
1189 | | static inline |
1190 | | unsigned char *(*rans_dec_func(int do_simd, int order)) |
1191 | | (unsigned char *in, |
1192 | | unsigned int in_size, |
1193 | | unsigned char *out, |
1194 | | unsigned int out_size) { |
1195 | | |
1196 | | if (do_simd) { |
1197 | | return order & 1 |
1198 | | ? rans_uncompress_O1_32x16 |
1199 | | : rans_uncompress_O0_32x16; |
1200 | | } else { |
1201 | | return order & 1 |
1202 | | ? rans_uncompress_O1_4x16 |
1203 | | : rans_uncompress_O0_4x16; |
1204 | | } |
1205 | | } |
1206 | | |
1207 | | #endif |
1208 | | |
1209 | | // Test interface for restricting the auto-detection methods so we |
1210 | | // can forcibly compare different implementations on the same machine. |
1211 | | // See RANS_CPU_ defines in rANS_static4x16.h |
1212 | 0 | void rans_set_cpu(int opts) { |
1213 | 0 | rans_cpu = opts; |
1214 | 0 | #ifdef HAVE_HTSCODECS_TLS_CPU_INIT |
1215 | 0 | htscodecs_tls_cpu_init(); |
1216 | 0 | #endif |
1217 | 0 | } |
1218 | | |
1219 | | /*----------------------------------------------------------------------------- |
1220 | | * Simple interface to the order-0 vs order-1 encoders and decoders. |
1221 | | * |
1222 | | * Smallest is method, <in_size> <input>, so worst case 2 bytes longer. |
1223 | | */ |
1224 | | unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size, |
1225 | | unsigned char *out,unsigned int *out_size, |
1226 | 628k | int order) { |
1227 | 628k | if (in_size > INT_MAX || (out && *out_size == 0)) { |
1228 | 0 | *out_size = 0; |
1229 | 0 | return NULL; |
1230 | 0 | } |
1231 | | |
1232 | | #ifdef VALIDATE_RANS |
1233 | | int orig_order = order; |
1234 | | int orig_in_size = in_size; |
1235 | | unsigned char *orig_in = in; |
1236 | | #endif |
1237 | | |
1238 | 628k | unsigned int c_meta_len; |
1239 | 628k | uint8_t *meta = NULL, *rle = NULL, *packed = NULL; |
1240 | 628k | uint8_t *out_free = NULL; |
1241 | | |
1242 | 628k | if (!out) { |
1243 | 377k | *out_size = rans_compress_bound_4x16(in_size, order); |
1244 | 377k | if (*out_size == 0) |
1245 | 0 | return NULL; |
1246 | 377k | if (!(out_free = out = malloc(*out_size))) { |
1247 | 0 | *out_size = 0; |
1248 | 0 | return NULL; |
1249 | 0 | } |
1250 | 377k | } |
1251 | | |
1252 | 628k | unsigned char *out_end = out + *out_size; |
1253 | | |
1254 | | // Permit 32-way unrolling for large blocks, paving the way for |
1255 | | // AVX2 and AVX512 SIMD variants. |
1256 | 628k | if ((order & RANS_ORDER_SIMD_AUTO) && in_size >= 50000 |
1257 | 10.0k | && !(order & RANS_ORDER_STRIPE)) |
1258 | 8.46k | order |= X_32; |
1259 | | |
1260 | 628k | if (in_size <= 20) |
1261 | 470k | order &= ~RANS_ORDER_STRIPE; |
1262 | | #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
1263 | | if (in_size <= 1000) |
1264 | | order &= ~RANS_ORDER_X32; |
1265 | | #endif |
1266 | 628k | if (order & RANS_ORDER_STRIPE) { |
1267 | 15.3k | int N = (order>>8) & 0xff; |
1268 | 15.3k | if (N == 0) N = 4; // default for compatibility with old tests |
1269 | | |
1270 | 15.3k | if (N > in_size) |
1271 | 0 | N = in_size; |
1272 | | |
1273 | 15.3k | unsigned char *transposed = malloc(in_size); |
1274 | 15.3k | unsigned int part_len[256]; |
1275 | 15.3k | unsigned int idx[256]; |
1276 | 15.3k | if (!transposed) { |
1277 | 0 | free(out_free); |
1278 | 0 | *out_size = 0; |
1279 | 0 | return NULL; |
1280 | 0 | } |
1281 | 15.3k | int i, j, x; |
1282 | | |
1283 | 76.8k | for (i = 0; i < N; i++) { |
1284 | 61.4k | part_len[i] = in_size / N + ((in_size % N) > i); |
1285 | 61.4k | idx[i] = i ? idx[i-1] + part_len[i-1] : 0; // cumulative index |
1286 | 61.4k | } |
1287 | | |
1288 | 4.28G | #define KN 8 |
1289 | 15.3k | i = x = 0; |
1290 | 15.3k | if (in_size >= N*KN) { |
1291 | 109M | for (; i < in_size-N*KN;) { |
1292 | 109M | int k; |
1293 | 109M | unsigned char *ink = in+i; |
1294 | 549M | for (j = 0; j < N; j++) |
1295 | 3.95G | for (k = 0; k < KN; k++) |
1296 | 3.51G | transposed[idx[j]+x+k] = ink[j+N*k]; |
1297 | 109M | x += KN; i+=N*KN; |
1298 | 109M | } |
1299 | 11.1k | } |
1300 | 15.3k | #undef KN |
1301 | 78.2k | for (; i < in_size-N; i += N, x++) { |
1302 | 314k | for (j = 0; j < N; j++) |
1303 | 251k | transposed[idx[j]+x] = in[i+j]; |
1304 | 62.8k | } |
1305 | | |
1306 | 30.7k | for (; i < in_size; i += N, x++) { |
1307 | 58.4k | for (j = 0; i+j < in_size; j++) |
1308 | 43.0k | transposed[idx[j]+x] = in[i+j]; |
1309 | 15.3k | } |
1310 | | |
1311 | 15.3k | unsigned int olen2; |
1312 | 15.3k | unsigned char *out2, *out2_start; |
1313 | 15.3k | c_meta_len = 1; |
1314 | 15.3k | *out = order & ~RANS_ORDER_NOSZ; |
1315 | 15.3k | c_meta_len += var_put_u32(out+c_meta_len, out_end, in_size); |
1316 | 15.3k | if (c_meta_len >= *out_size) { |
1317 | 0 | free(out_free); |
1318 | 0 | free(transposed); |
1319 | 0 | *out_size = 0; |
1320 | 0 | return NULL; |
1321 | 0 | } |
1322 | | |
1323 | 15.3k | out[c_meta_len++] = N; |
1324 | | |
1325 | 15.3k | unsigned char *out_best = NULL; |
1326 | 15.3k | unsigned int out_best_len = 0; |
1327 | | |
1328 | 15.3k | out2_start = out2 = out+7+5*N; // shares a buffer with c_meta |
1329 | 76.8k | for (i = 0; i < N; i++) { |
1330 | | // Brute force try all methods. |
1331 | 61.4k | uint8_t *r; |
1332 | 61.4k | int j, m[] = {1,64,128,0}, best_j = 0, best_sz = INT_MAX; |
1333 | 307k | for (j = 0; j < sizeof(m)/sizeof(*m); j++) { |
1334 | 245k | if ((order & m[j]) != m[j]) |
1335 | 119k | continue; |
1336 | | |
1337 | | // order-1 *only*; bit check above cannot elide order-0 |
1338 | 126k | if ((order & RANS_ORDER_STRIPE_NO0) && (m[j]&1) == 0) |
1339 | 0 | continue; |
1340 | | |
1341 | 126k | if (out2 - out > *out_size) |
1342 | 0 | continue; // an error, but caught in best_sz check later |
1343 | | |
1344 | 126k | olen2 = *out_size - (out2 - out); |
1345 | 126k | r = rans_compress_to_4x16(transposed+idx[i], part_len[i], |
1346 | 126k | out2, &olen2, |
1347 | 126k | m[j] | RANS_ORDER_NOSZ |
1348 | 126k | | (order&RANS_ORDER_X32)); |
1349 | 126k | if (r && olen2 && best_sz > olen2) { |
1350 | 81.6k | best_sz = olen2; |
1351 | 81.6k | best_j = j; |
1352 | 81.6k | if (j < sizeof(m)/sizeof(*m) && olen2 > out_best_len) { |
1353 | 17.2k | unsigned char *tmp = realloc(out_best, olen2); |
1354 | 17.2k | if (!tmp) { |
1355 | 0 | free(out_free); |
1356 | 0 | free(transposed); |
1357 | 0 | *out_size = 0; |
1358 | 0 | return NULL; |
1359 | 0 | } |
1360 | 17.2k | out_best = tmp; |
1361 | 17.2k | out_best_len = olen2; |
1362 | 17.2k | } |
1363 | | |
1364 | | // Cache a copy of the best so far |
1365 | 81.6k | memcpy(out_best, out2, olen2); |
1366 | 81.6k | } |
1367 | 126k | } |
1368 | | |
1369 | 61.4k | if (best_sz == INT_MAX) { |
1370 | 0 | free(out_best); |
1371 | 0 | free(out_free); |
1372 | 0 | free(transposed); |
1373 | 0 | *out_size = 0; |
1374 | 0 | return NULL; |
1375 | 0 | } |
1376 | | |
1377 | 61.4k | if (best_j < sizeof(m)/sizeof(*m)) { |
1378 | | // Copy the best compression to output buffer if not current |
1379 | 61.4k | memcpy(out2, out_best, best_sz); |
1380 | 61.4k | olen2 = best_sz; |
1381 | 61.4k | } |
1382 | | |
1383 | 61.4k | out2 += olen2; |
1384 | 61.4k | c_meta_len += var_put_u32(out+c_meta_len, out_end, olen2); |
1385 | 61.4k | } |
1386 | 15.3k | if (out_best) |
1387 | 15.3k | free(out_best); |
1388 | | |
1389 | 15.3k | memmove(out+c_meta_len, out2_start, out2-out2_start); |
1390 | 15.3k | free(transposed); |
1391 | 15.3k | *out_size = c_meta_len + out2-out2_start; |
1392 | 15.3k | return out; |
1393 | 15.3k | } |
1394 | | |
1395 | 612k | if (order & RANS_ORDER_CAT) { |
1396 | 0 | out[0] = RANS_ORDER_CAT; |
1397 | 0 | c_meta_len = 1; |
1398 | 0 | c_meta_len += var_put_u32(&out[1], out_end, in_size); |
1399 | |
|
1400 | 0 | if (c_meta_len + in_size > *out_size) { |
1401 | 0 | free(out_free); |
1402 | 0 | *out_size = 0; |
1403 | 0 | return NULL; |
1404 | 0 | } |
1405 | 0 | if (in_size) |
1406 | 0 | memcpy(out+c_meta_len, in, in_size); |
1407 | 0 | *out_size = c_meta_len + in_size; |
1408 | 0 | return out; |
1409 | 0 | } |
1410 | | |
1411 | 612k | int do_pack = order & RANS_ORDER_PACK; |
1412 | 612k | int do_rle = order & RANS_ORDER_RLE; |
1413 | 612k | int no_size = order & RANS_ORDER_NOSZ; |
1414 | 612k | int do_simd = order & RANS_ORDER_X32; |
1415 | | |
1416 | 612k | out[0] = order; |
1417 | 612k | c_meta_len = 1; |
1418 | | |
1419 | 612k | if (!no_size) |
1420 | 485k | c_meta_len += var_put_u32(&out[1], out_end, in_size); |
1421 | | |
1422 | 612k | order &= 3; |
1423 | | |
1424 | | // Format is compressed meta-data, compressed data. |
1425 | | // Meta-data can be empty, pack, rle lengths, or pack + rle lengths. |
1426 | | // Data is either the original data, bit-packed packed, rle literals or |
1427 | | // packed + rle literals. |
1428 | | |
1429 | 612k | if (do_pack && in_size) { |
1430 | | // PACK 2, 4 or 8 symbols into one byte. |
1431 | 187k | int pmeta_len; |
1432 | 187k | uint64_t packed_len; |
1433 | 187k | if (c_meta_len + 256 > *out_size) { |
1434 | 0 | free(out_free); |
1435 | 0 | *out_size = 0; |
1436 | 0 | return NULL; |
1437 | 0 | } |
1438 | 187k | packed = hts_pack(in, in_size, out+c_meta_len, &pmeta_len, &packed_len); |
1439 | 187k | if (!packed) { |
1440 | 4.31k | out[0] &= ~RANS_ORDER_PACK; |
1441 | 4.31k | do_pack = 0; |
1442 | 4.31k | free(packed); |
1443 | 4.31k | packed = NULL; |
1444 | 183k | } else { |
1445 | 183k | in = packed; |
1446 | 183k | in_size = packed_len; |
1447 | 183k | c_meta_len += pmeta_len; |
1448 | | |
1449 | | // Could derive this rather than storing verbatim. |
1450 | | // Orig size * 8/nbits (+1 if not multiple of 8/n) |
1451 | 183k | int sz = var_put_u32(out+c_meta_len, out_end, in_size); |
1452 | 183k | c_meta_len += sz; |
1453 | 183k | *out_size -= sz; |
1454 | | |
1455 | 183k | if (do_simd && in_size < 32) { |
1456 | 1.12k | do_simd = 0; |
1457 | 1.12k | out[0] &= ~RANS_ORDER_X32; |
1458 | 1.12k | } |
1459 | 183k | } |
1460 | 425k | } else if (do_pack) { |
1461 | 0 | out[0] &= ~RANS_ORDER_PACK; |
1462 | 0 | } |
1463 | | |
1464 | 612k | if (do_rle && in_size) { |
1465 | | // RLE 'in' -> rle_length + rle_literals arrays |
1466 | 110k | unsigned int rmeta_len, c_rmeta_len; |
1467 | 110k | uint64_t rle_len; |
1468 | 110k | c_rmeta_len = in_size+257; |
1469 | 110k | if (!(meta = malloc(c_rmeta_len))) { |
1470 | 0 | free(out_free); |
1471 | 0 | *out_size = 0; |
1472 | 0 | return NULL; |
1473 | 0 | } |
1474 | | |
1475 | 110k | uint8_t rle_syms[256]; |
1476 | 110k | int rle_nsyms = 0; |
1477 | 110k | uint64_t rmeta_len64; |
1478 | 110k | rle = hts_rle_encode(in, in_size, meta, &rmeta_len64, |
1479 | 110k | rle_syms, &rle_nsyms, NULL, &rle_len); |
1480 | 110k | memmove(meta+1+rle_nsyms, meta, rmeta_len64); |
1481 | 110k | meta[0] = rle_nsyms; |
1482 | 110k | memcpy(meta+1, rle_syms, rle_nsyms); |
1483 | 110k | rmeta_len = rmeta_len64 + rle_nsyms+1; |
1484 | | |
1485 | 110k | if (!rle || rle_len + rmeta_len >= .99*in_size) { |
1486 | | // Not worth the speed hit. |
1487 | 74.2k | out[0] &= ~RANS_ORDER_RLE; |
1488 | 74.2k | do_rle = 0; |
1489 | 74.2k | free(rle); |
1490 | 74.2k | rle = NULL; |
1491 | 74.2k | } else { |
1492 | | // Compress lengths with O0 and literals with O0/O1 ("order" param) |
1493 | 36.2k | int sz = var_put_u32(out+c_meta_len, out_end, rmeta_len*2), sz2; |
1494 | 36.2k | sz += var_put_u32(out+c_meta_len+sz, out_end, rle_len); |
1495 | 36.2k | if ((c_meta_len+sz+5) > *out_size) { |
1496 | 0 | free(out_free); |
1497 | 0 | free(rle); |
1498 | 0 | free(meta); |
1499 | 0 | free(packed); |
1500 | 0 | *out_size = 0; |
1501 | 0 | return NULL; |
1502 | 0 | } |
1503 | 36.2k | c_rmeta_len = *out_size - (c_meta_len+sz+5); |
1504 | 36.2k | if (do_simd && (rmeta_len < 32 || rle_len < 32)) { |
1505 | 2.21k | do_simd = 0; |
1506 | 2.21k | out[0] &= ~RANS_ORDER_X32; |
1507 | 2.21k | } |
1508 | 36.2k | if (!rans_enc_func(do_simd, 0)(meta, rmeta_len, out+c_meta_len+sz+5, &c_rmeta_len)) { |
1509 | 0 | free(out_free); |
1510 | 0 | free(rle); |
1511 | 0 | free(meta); |
1512 | 0 | free(packed); |
1513 | 0 | *out_size = 0; |
1514 | 0 | return NULL; |
1515 | 0 | } |
1516 | 36.2k | if (c_rmeta_len < rmeta_len) { |
1517 | 2.03k | sz2 = var_put_u32(out+c_meta_len+sz, out_end, c_rmeta_len); |
1518 | 2.03k | memmove(out+c_meta_len+sz+sz2, out+c_meta_len+sz+5, c_rmeta_len); |
1519 | 34.2k | } else { |
1520 | | // Uncompressed RLE meta-data as too small |
1521 | 34.2k | sz = var_put_u32(out+c_meta_len, out_end, rmeta_len*2+1); |
1522 | 34.2k | sz2 = var_put_u32(out+c_meta_len+sz, out_end, rle_len); |
1523 | 34.2k | memcpy(out+c_meta_len+sz+sz2, meta, rmeta_len); |
1524 | 34.2k | c_rmeta_len = rmeta_len; |
1525 | 34.2k | } |
1526 | | |
1527 | 36.2k | c_meta_len += sz + sz2 + c_rmeta_len; |
1528 | | |
1529 | 36.2k | in = rle; |
1530 | 36.2k | in_size = rle_len; |
1531 | 36.2k | } |
1532 | | |
1533 | 110k | free(meta); |
1534 | 502k | } else if (do_rle) { |
1535 | 40.9k | out[0] &= ~RANS_ORDER_RLE; |
1536 | 40.9k | } |
1537 | | |
1538 | 612k | if (c_meta_len > *out_size) { |
1539 | 0 | free(out_free); |
1540 | 0 | free(rle); |
1541 | 0 | free(packed); |
1542 | 0 | *out_size = 0; |
1543 | 0 | return NULL; |
1544 | 0 | } |
1545 | | |
1546 | 612k | *out_size -= c_meta_len; |
1547 | 612k | if (order && in_size < 8) { |
1548 | 109k | out[0] &= ~1; |
1549 | 109k | order &= ~1; |
1550 | 109k | } |
1551 | | |
1552 | 612k | if (!rans_enc_func(do_simd, order)(in, in_size, out+c_meta_len, out_size)) { |
1553 | 0 | free(out_free); |
1554 | 0 | free(rle); |
1555 | 0 | free(packed); |
1556 | 0 | *out_size = 0; |
1557 | 0 | return NULL; |
1558 | 0 | } |
1559 | | |
1560 | 612k | if (*out_size >= in_size) { |
1561 | 543k | out[0] &= ~3; |
1562 | 543k | out[0] |= RANS_ORDER_CAT | no_size; |
1563 | | |
1564 | 543k | if (out + c_meta_len + in_size > out_end) { |
1565 | 0 | free(out_free); |
1566 | 0 | free(rle); |
1567 | 0 | free(packed); |
1568 | 0 | *out_size = 0; |
1569 | 0 | return NULL; |
1570 | 0 | } |
1571 | 543k | if (in_size) |
1572 | 459k | memcpy(out+c_meta_len, in, in_size); |
1573 | 543k | *out_size = in_size; |
1574 | 543k | } |
1575 | | |
1576 | 612k | free(rle); |
1577 | 612k | free(packed); |
1578 | | |
1579 | 612k | *out_size += c_meta_len; |
1580 | | |
1581 | | // Validation mode |
1582 | | #ifdef VALIDATE_RANS |
1583 | | unsigned int decoded_size = orig_in_size; |
1584 | | unsigned char *decoded = malloc(decoded_size); |
1585 | | decoded = rans_uncompress_to_4x16(out, *out_size, |
1586 | | decoded, &decoded_size); |
1587 | | if (!decoded || |
1588 | | decoded_size != orig_in_size || |
1589 | | memcmp(orig_in, decoded, orig_in_size) != 0) { |
1590 | | fprintf(stderr, "rans round trip failed for order %d. Written to fd 5\n", orig_order); |
1591 | | if (write(5, orig_in, orig_in_size) < 0) |
1592 | | abort(); |
1593 | | abort(); |
1594 | | } |
1595 | | free(decoded); |
1596 | | #endif |
1597 | | |
1598 | | |
1599 | 612k | return out; |
1600 | 612k | } |
1601 | | |
1602 | | unsigned char *rans_compress_4x16(unsigned char *in, unsigned int in_size, |
1603 | 377k | unsigned int *out_size, int order) { |
1604 | 377k | return rans_compress_to_4x16(in, in_size, NULL, out_size, order); |
1605 | 377k | } |
1606 | | |
1607 | | unsigned char *rans_uncompress_to_4x16(unsigned char *in, unsigned int in_size, |
1608 | 12.4k | unsigned char *out, unsigned int *out_size) { |
1609 | 12.4k | unsigned char *in_end = in + in_size; |
1610 | 12.4k | unsigned char *out_free = NULL, *tmp_free = NULL, *meta_free = NULL; |
1611 | | |
1612 | 12.4k | if (in_size == 0) |
1613 | 3 | return NULL; |
1614 | | |
1615 | 12.4k | if (*in & RANS_ORDER_STRIPE) { |
1616 | 172 | unsigned int ulen, olen, c_meta_len = 1; |
1617 | 172 | int i; |
1618 | 172 | uint64_t clen_tot = 0; |
1619 | | |
1620 | | // Decode lengths |
1621 | 172 | c_meta_len += var_get_u32(in+c_meta_len, in_end, &ulen); |
1622 | 172 | if (c_meta_len >= in_size) |
1623 | 1 | return NULL; |
1624 | 171 | unsigned int N = in[c_meta_len++]; |
1625 | 171 | if (N < 1) // Must be at least one stripe |
1626 | 5 | return NULL; |
1627 | 166 | unsigned int clenN[256], ulenN[256], idxN[256]; |
1628 | 166 | if (!out) { |
1629 | 15 | if (ulen >= INT_MAX) |
1630 | 1 | return NULL; |
1631 | 14 | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
1632 | 14 | if (ulen > 100000) |
1633 | 5 | return NULL; |
1634 | 9 | #endif |
1635 | 9 | if (!(out_free = out = malloc(ulen))) { |
1636 | 0 | return NULL; |
1637 | 0 | } |
1638 | 9 | *out_size = ulen; |
1639 | 9 | } |
1640 | 160 | if (ulen != *out_size) { |
1641 | 7 | free(out_free); |
1642 | 7 | return NULL; |
1643 | 7 | } |
1644 | | |
1645 | 1.37k | for (i = 0; i < N; i++) { |
1646 | 1.28k | ulenN[i] = ulen / N + ((ulen % N) > i); |
1647 | 1.28k | idxN[i] = i ? idxN[i-1] + ulenN[i-1] : 0; |
1648 | 1.28k | c_meta_len += var_get_u32(in+c_meta_len, in_end, &clenN[i]); |
1649 | 1.28k | clen_tot += clenN[i]; |
1650 | 1.28k | if (c_meta_len > in_size || clenN[i] > in_size || clenN[i] < 1) { |
1651 | 64 | free(out_free); |
1652 | 64 | return NULL; |
1653 | 64 | } |
1654 | 1.28k | } |
1655 | | |
1656 | | // We can call this with a larger buffer, but once we've determined |
1657 | | // how much we really use we limit it so the recursion becomes easier |
1658 | | // to limit. |
1659 | 89 | if (c_meta_len + clen_tot > in_size) { |
1660 | 14 | free(out_free); |
1661 | 14 | return NULL; |
1662 | 14 | } |
1663 | 75 | in_size = c_meta_len + clen_tot; |
1664 | | |
1665 | | //fprintf(stderr, " stripe meta %d\n", c_meta_len); //c-size |
1666 | | |
1667 | | // Uncompress the N streams |
1668 | 75 | unsigned char *outN = malloc(ulen); |
1669 | 75 | if (!outN) { |
1670 | 0 | free(out_free); |
1671 | 0 | return NULL; |
1672 | 0 | } |
1673 | 257 | for (i = 0; i < N; i++) { |
1674 | 203 | olen = ulenN[i]; |
1675 | 203 | if (in_size < c_meta_len) { |
1676 | 0 | free(out_free); |
1677 | 0 | free(outN); |
1678 | 0 | return NULL; |
1679 | 0 | } |
1680 | 203 | if (!rans_uncompress_to_4x16(in+c_meta_len, in_size-c_meta_len, outN + idxN[i], &olen) |
1681 | 182 | || olen != ulenN[i]) { |
1682 | 21 | free(out_free); |
1683 | 21 | free(outN); |
1684 | 21 | return NULL; |
1685 | 21 | } |
1686 | 182 | c_meta_len += clenN[i]; |
1687 | 182 | } |
1688 | | |
1689 | 54 | unstripe(out, outN, ulen, N, idxN); |
1690 | | |
1691 | 54 | free(outN); |
1692 | 54 | *out_size = ulen; |
1693 | 54 | return out; |
1694 | 75 | } |
1695 | | |
1696 | 12.3k | int order = *in++; in_size--; |
1697 | 12.3k | int do_pack = order & RANS_ORDER_PACK; |
1698 | 12.3k | int do_rle = order & RANS_ORDER_RLE; |
1699 | 12.3k | int do_cat = order & RANS_ORDER_CAT; |
1700 | 12.3k | int no_size = order & RANS_ORDER_NOSZ; |
1701 | 12.3k | int do_simd = order & RANS_ORDER_X32; |
1702 | 12.3k | order &= 1; |
1703 | | |
1704 | 12.3k | int sz = 0; |
1705 | 12.3k | unsigned int osz; |
1706 | 12.3k | if (!no_size) { |
1707 | 9.48k | sz = var_get_u32(in, in_end, &osz); |
1708 | 9.48k | } else |
1709 | 2.82k | sz = 0, osz = *out_size; |
1710 | 12.3k | in += sz; |
1711 | 12.3k | in_size -= sz; |
1712 | | |
1713 | 12.3k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
1714 | 12.3k | if (osz > 100000) |
1715 | 41 | return NULL; |
1716 | 12.2k | #endif |
1717 | | |
1718 | 12.2k | if (no_size && !out) |
1719 | 0 | goto err; // Need one or the other |
1720 | | |
1721 | 12.2k | if (!out) { |
1722 | 554 | *out_size = osz; |
1723 | 554 | if (!(out = out_free = malloc(*out_size))) |
1724 | 0 | return NULL; |
1725 | 11.7k | } else { |
1726 | 11.7k | if (*out_size < osz) |
1727 | 3 | goto err; |
1728 | 11.7k | *out_size = osz; |
1729 | 11.7k | } |
1730 | | |
1731 | | // if (do_pack || do_rle) { |
1732 | | // in += sz; // size field not needed when pure rANS |
1733 | | // in_size -= sz; |
1734 | | // } |
1735 | | |
1736 | 12.2k | uint32_t c_meta_size = 0; |
1737 | 12.2k | unsigned int tmp1_size = *out_size; |
1738 | 12.2k | unsigned int tmp2_size = *out_size; |
1739 | 12.2k | unsigned int tmp3_size = *out_size; |
1740 | 12.2k | unsigned char *tmp1 = NULL, *tmp2 = NULL, *tmp3 = NULL, *tmp = NULL; |
1741 | | |
1742 | | // Need In, Out and Tmp buffers with temporary buffer of the same size |
1743 | | // as output. All use rANS, but with optional transforms (none, RLE, |
1744 | | // Pack, or both). |
1745 | | // |
1746 | | // rans unrle unpack |
1747 | | // If none: in -> out |
1748 | | // If RLE: in -> tmp -> out |
1749 | | // If Pack: in -> tmp -> out |
1750 | | // If RLE+Pack: in -> out -> tmp -> out |
1751 | | // tmp1 tmp2 tmp3 |
1752 | | // |
1753 | | // So rans is in -> tmp1 |
1754 | | // RLE is tmp1 -> tmp2 |
1755 | | // Unpack is tmp2 -> tmp3 |
1756 | | |
1757 | | // Format is meta data (Pack and RLE in that order if present), |
1758 | | // followed by rANS compressed data. |
1759 | | |
1760 | 12.2k | if (do_pack || do_rle) { |
1761 | 1.84k | if (!(tmp = tmp_free = malloc(*out_size))) |
1762 | 0 | goto err; |
1763 | 1.84k | if (do_pack && do_rle) { |
1764 | 44 | tmp1 = out; |
1765 | 44 | tmp2 = tmp; |
1766 | 44 | tmp3 = out; |
1767 | 1.80k | } else if (do_pack) { |
1768 | 1.53k | tmp1 = tmp; |
1769 | 1.53k | tmp2 = tmp1; |
1770 | 1.53k | tmp3 = out; |
1771 | 1.53k | } else if (do_rle) { |
1772 | 266 | tmp1 = tmp; |
1773 | 266 | tmp2 = out; |
1774 | 266 | tmp3 = out; |
1775 | 266 | } |
1776 | 10.4k | } else { |
1777 | | // neither |
1778 | 10.4k | tmp = NULL; |
1779 | 10.4k | tmp1 = out; |
1780 | 10.4k | tmp2 = out; |
1781 | 10.4k | tmp3 = out; |
1782 | 10.4k | } |
1783 | | |
1784 | | // Decode the bit-packing map. |
1785 | 12.2k | uint8_t map[16] = {0}; |
1786 | 12.2k | int npacked_sym = 0; |
1787 | 12.2k | uint64_t unpacked_sz = 0; // FIXME: rename to packed_per_byte |
1788 | 12.2k | if (do_pack) { |
1789 | 1.58k | c_meta_size = hts_unpack_meta(in, in_size, *out_size, map, &npacked_sym); |
1790 | 1.58k | if (c_meta_size == 0) |
1791 | 3 | goto err; |
1792 | | |
1793 | 1.57k | unpacked_sz = osz; |
1794 | 1.57k | in += c_meta_size; |
1795 | 1.57k | in_size -= c_meta_size; |
1796 | | |
1797 | | // New unpacked size. We could derive this bit from *out_size |
1798 | | // and npacked_sym. |
1799 | 1.57k | unsigned int osz; |
1800 | 1.57k | sz = var_get_u32(in, in_end, &osz); |
1801 | 1.57k | in += sz; |
1802 | 1.57k | in_size -= sz; |
1803 | 1.57k | if (osz > tmp1_size) |
1804 | 33 | goto err; |
1805 | 1.54k | tmp1_size = osz; |
1806 | 1.54k | } |
1807 | | |
1808 | 12.2k | uint8_t *meta = NULL; |
1809 | 12.2k | uint32_t u_meta_size = 0; |
1810 | 12.2k | if (do_rle) { |
1811 | | // Uncompress meta data |
1812 | 310 | uint32_t c_meta_size, rle_len, sz; |
1813 | 310 | sz = var_get_u32(in, in_end, &u_meta_size); |
1814 | 310 | sz += var_get_u32(in+sz, in_end, &rle_len); |
1815 | 310 | if (rle_len > tmp1_size) // should never grow |
1816 | 24 | goto err; |
1817 | 286 | if (u_meta_size & 1) { |
1818 | 127 | meta = in + sz; |
1819 | 127 | u_meta_size = u_meta_size/2 > (in_end-meta) ? (in_end-meta) : u_meta_size/2; |
1820 | 127 | c_meta_size = u_meta_size; |
1821 | 159 | } else { |
1822 | 159 | sz += var_get_u32(in+sz, in_end, &c_meta_size); |
1823 | 159 | u_meta_size /= 2; |
1824 | | |
1825 | 159 | meta_free = meta = rans_dec_func(do_simd, 0)(in+sz, in_size-sz, NULL, u_meta_size); |
1826 | 159 | if (!meta) |
1827 | 111 | goto err; |
1828 | 159 | } |
1829 | 175 | if (c_meta_size+sz > in_size) |
1830 | 1 | goto err; |
1831 | 174 | in += c_meta_size+sz; |
1832 | 174 | in_size -= c_meta_size+sz; |
1833 | 174 | tmp1_size = rle_len; |
1834 | 174 | } |
1835 | | //fprintf(stderr, " meta_size %d bytes\n", (int)(in - orig_in)); //c-size |
1836 | | |
1837 | | // uncompress RLE data. in -> tmp1 |
1838 | 12.0k | if (in_size) { |
1839 | 12.0k | if (do_cat) { |
1840 | | //fprintf(stderr, " CAT %d\n", tmp1_size); //c-size |
1841 | 921 | if (tmp1_size > in_size) |
1842 | 12 | goto err; |
1843 | 909 | if (tmp1_size > *out_size) |
1844 | 0 | goto err; |
1845 | 909 | memcpy(tmp1, in, tmp1_size); |
1846 | 11.1k | } else { |
1847 | 11.1k | tmp1 = rans_dec_func(do_simd, order)(in, in_size, tmp1, tmp1_size); |
1848 | 11.1k | if (!tmp1) |
1849 | 771 | goto err; |
1850 | 11.1k | } |
1851 | 12.0k | } else { |
1852 | 16 | tmp1_size = 0; |
1853 | 16 | } |
1854 | 11.3k | tmp2_size = tmp3_size = tmp1_size; |
1855 | | |
1856 | 11.3k | if (do_rle) { |
1857 | | // Unpack RLE. tmp1 -> tmp2. |
1858 | 73 | if (u_meta_size == 0) |
1859 | 2 | goto err; |
1860 | 71 | uint64_t unrle_size = *out_size; |
1861 | 71 | int rle_nsyms = *meta ? *meta : 256; |
1862 | 71 | if (u_meta_size < 1+rle_nsyms) |
1863 | 22 | goto err; |
1864 | 49 | if (!hts_rle_decode(tmp1, tmp1_size, |
1865 | 49 | meta+1+rle_nsyms, u_meta_size-(1+rle_nsyms), |
1866 | 49 | meta+1, rle_nsyms, tmp2, &unrle_size)) |
1867 | 30 | goto err; |
1868 | 19 | tmp3_size = tmp2_size = unrle_size; |
1869 | 19 | free(meta_free); |
1870 | 19 | meta_free = NULL; |
1871 | 19 | } |
1872 | 11.2k | if (do_pack) { |
1873 | | // Unpack bits via pack-map. tmp2 -> tmp3 |
1874 | 1.42k | if (npacked_sym == 1) |
1875 | 867 | unpacked_sz = tmp2_size; |
1876 | | //uint8_t *porig = unpack(tmp2, tmp2_size, unpacked_sz, npacked_sym, map); |
1877 | | //memcpy(tmp3, porig, unpacked_sz); |
1878 | 1.42k | if (!hts_unpack(tmp2, tmp2_size, tmp3, unpacked_sz, npacked_sym, map)) |
1879 | 14 | goto err; |
1880 | 1.41k | tmp3_size = unpacked_sz; |
1881 | 1.41k | } |
1882 | | |
1883 | 11.2k | if (tmp) |
1884 | 1.42k | free(tmp); |
1885 | | |
1886 | 11.2k | *out_size = tmp3_size; |
1887 | 11.2k | return tmp3; |
1888 | | |
1889 | 1.02k | err: |
1890 | 1.02k | free(meta_free); |
1891 | 1.02k | free(out_free); |
1892 | 1.02k | free(tmp_free); |
1893 | 1.02k | return NULL; |
1894 | 11.2k | } |
1895 | | |
1896 | | unsigned char *rans_uncompress_4x16(unsigned char *in, unsigned int in_size, |
1897 | 571 | unsigned int *out_size) { |
1898 | | return rans_uncompress_to_4x16(in, in_size, NULL, out_size); |
1899 | 571 | } |