/src/htslib/htscodecs/htscodecs/rANS_static32x16pr.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2017-2021 Genome Research Ltd. |
3 | | * Author(s): James Bonfield |
4 | | * |
5 | | * Redistribution and use in source and binary forms, with or without |
6 | | * modification, are permitted provided that the following conditions are met: |
7 | | * |
8 | | * 1. Redistributions of source code must retain the above copyright notice, |
9 | | * this list of conditions and the following disclaimer. |
10 | | * |
11 | | * 2. Redistributions in binary form must reproduce the above |
12 | | * copyright notice, this list of conditions and the following |
13 | | * disclaimer in the documentation and/or other materials provided |
14 | | * with the distribution. |
15 | | * |
16 | | * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
17 | | * Institute nor the names of its contributors may be used to endorse |
18 | | * or promote products derived from this software without specific |
19 | | * prior written permission. |
20 | | * |
21 | | * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS |
22 | | * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
23 | | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
24 | | * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH |
25 | | * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
26 | | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
27 | | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
28 | | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
29 | | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
30 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
31 | | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
32 | | */ |
33 | | |
34 | | #include "config.h" |
35 | | |
36 | | #include <stdint.h> |
37 | | #include <stdlib.h> |
38 | | #include <stdio.h> |
39 | | #include <assert.h> |
40 | | #include <string.h> |
41 | | #include <limits.h> |
42 | | |
43 | | #include "rANS_word.h" |
44 | | #include "rANS_static4x16.h" |
45 | | #include "rANS_static16_int.h" |
46 | | #include "varint.h" |
47 | | #include "utils.h" |
48 | | |
49 | 0 | #define TF_SHIFT 12 |
50 | 0 | #define TOTFREQ (1<<TF_SHIFT) |
51 | | |
52 | | |
53 | | // 9-11 is considerably faster in the O1 variant due to reduced table size. |
54 | | // We auto-tune between 10 and 12 though. Anywhere from 9 to 14 are viable. |
55 | | #ifndef TF_SHIFT_O1 |
56 | | #define TF_SHIFT_O1 12 |
57 | | #endif |
58 | | #ifndef TF_SHIFT_O1_FAST |
59 | | #define TF_SHIFT_O1_FAST 10 |
60 | | #endif |
61 | 0 | #define TOTFREQ_O1 (1<<TF_SHIFT_O1) |
62 | 0 | #define TOTFREQ_O1_FAST (1<<TF_SHIFT_O1_FAST) |
63 | | |
64 | | |
65 | 0 | #define NX 32 |
66 | | |
67 | | unsigned char *rans_compress_O0_32x16(unsigned char *in, |
68 | | unsigned int in_size, |
69 | | unsigned char *out, |
70 | 0 | unsigned int *out_size) { |
71 | 0 | unsigned char *cp, *out_end, *out_free = NULL; |
72 | 0 | RansEncSymbol syms[256]; |
73 | 0 | RansState ransN[NX]; |
74 | 0 | uint8_t* ptr; |
75 | 0 | uint32_t F[256+MAGIC] = {0}; |
76 | 0 | int i, j, tab_size = 0, x, z; |
77 | | // -20 for order/size/meta |
78 | 0 | unsigned int bound = rans_compress_bound_4x16(in_size,0)-20; |
79 | |
|
80 | 0 | if (!out) { |
81 | 0 | *out_size = bound; |
82 | 0 | out = out_free = malloc(*out_size); |
83 | 0 | } |
84 | 0 | if (!out || bound > *out_size) |
85 | 0 | return NULL; |
86 | | |
87 | | // If "out" isn't word aligned, tweak out_end/ptr to ensure it is. |
88 | | // We already added more round in bound to allow for this. |
89 | 0 | if (((size_t)out)&1) |
90 | 0 | bound--; |
91 | 0 | ptr = out_end = out + bound; |
92 | |
|
93 | 0 | if (in_size == 0) |
94 | 0 | goto empty; |
95 | | |
96 | | // Compute statistics |
97 | 0 | double e = hist8e(in, in_size, F); |
98 | 0 | int low_ent = e < 2; |
99 | | //hist8(in, in_size, F); int low_ent = 0; |
100 | | |
101 | | // Normalise so frequences sum to power of 2 |
102 | 0 | uint32_t fsum = in_size; |
103 | 0 | uint32_t max_val = round2(fsum); |
104 | 0 | if (max_val > TOTFREQ) |
105 | 0 | max_val = TOTFREQ; |
106 | |
|
107 | 0 | if (normalise_freq(F, fsum, max_val) < 0) { |
108 | 0 | free(out_free); |
109 | 0 | return NULL; |
110 | 0 | } |
111 | 0 | fsum=max_val; |
112 | |
|
113 | 0 | cp = out; |
114 | 0 | cp += encode_freq(cp, F); |
115 | 0 | tab_size = cp-out; |
116 | | //write(2, out+4, cp-(out+4)); |
117 | |
|
118 | 0 | if (normalise_freq(F, fsum, TOTFREQ) < 0) { |
119 | 0 | free(out_free); |
120 | 0 | return NULL; |
121 | 0 | } |
122 | | |
123 | | // Encode statistics. |
124 | 0 | for (x = j = 0; j < 256; j++) { |
125 | 0 | if (F[j]) { |
126 | 0 | RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT); |
127 | 0 | x += F[j]; |
128 | 0 | } |
129 | 0 | } |
130 | |
|
131 | 0 | for (z = 0; z < NX; z++) |
132 | 0 | RansEncInit(&ransN[z]); |
133 | |
|
134 | 0 | z = i = in_size&(NX-1); |
135 | 0 | while (z-- > 0) |
136 | 0 | RansEncPutSymbol(&ransN[z], &ptr, &syms[in[in_size-(i-z)]]); |
137 | |
|
138 | 0 | if (low_ent) { |
139 | | // orig |
140 | | // gcc 446 |
141 | | // clang 427 |
142 | 0 | for (i=(in_size &~(NX-1)); likely(i>0); i-=NX) { |
143 | 0 | for (z = NX-1; z >= 0; z-=4) { |
144 | 0 | RansEncSymbol *s0 = &syms[in[i-(NX-z+0)]]; |
145 | 0 | RansEncSymbol *s1 = &syms[in[i-(NX-z+1)]]; |
146 | 0 | RansEncSymbol *s2 = &syms[in[i-(NX-z+2)]]; |
147 | 0 | RansEncSymbol *s3 = &syms[in[i-(NX-z+3)]]; |
148 | 0 | RansEncPutSymbol_branched(&ransN[z-0], &ptr, s0); |
149 | 0 | RansEncPutSymbol_branched(&ransN[z-1], &ptr, s1); |
150 | 0 | RansEncPutSymbol_branched(&ransN[z-2], &ptr, s2); |
151 | 0 | RansEncPutSymbol_branched(&ransN[z-3], &ptr, s3); |
152 | 0 | if (NX%8 == 0) { |
153 | 0 | z -= 4; |
154 | 0 | RansEncSymbol *s0 = &syms[in[i-(NX-z+0)]]; |
155 | 0 | RansEncSymbol *s1 = &syms[in[i-(NX-z+1)]]; |
156 | 0 | RansEncSymbol *s2 = &syms[in[i-(NX-z+2)]]; |
157 | 0 | RansEncSymbol *s3 = &syms[in[i-(NX-z+3)]]; |
158 | 0 | RansEncPutSymbol_branched(&ransN[z-0], &ptr, s0); |
159 | 0 | RansEncPutSymbol_branched(&ransN[z-1], &ptr, s1); |
160 | 0 | RansEncPutSymbol_branched(&ransN[z-2], &ptr, s2); |
161 | 0 | RansEncPutSymbol_branched(&ransN[z-3], &ptr, s3); |
162 | 0 | } |
163 | 0 | } |
164 | 0 | if (z < -1) abort(); |
165 | 0 | } |
166 | 0 | } else { |
167 | | // Branchless version optimises poorly with gcc unless we have |
168 | | // AVX2 capability, so have a custom rewrite of it. |
169 | 0 | uint16_t* ptr16 = (uint16_t *)ptr; |
170 | 0 | for (i=(in_size &~(NX-1)); likely(i>0); i-=NX) { |
171 | | // Unrolled copy of below, because gcc doesn't optimise this |
172 | | // well in the original form. |
173 | | // |
174 | | // Gcc11: 328 MB/s (this) vs 208 MB/s (orig) |
175 | | // Clang10: 352 MB/s (this) vs 340 MB/s (orig) |
176 | | // |
177 | | // for (z = NX-1; z >= 0; z-=4) { |
178 | | // RansEncSymbol *s0 = &syms[in[i-(NX-z+0)]]; |
179 | | // RansEncSymbol *s1 = &syms[in[i-(NX-z+1)]]; |
180 | | // RansEncSymbol *s2 = &syms[in[i-(NX-z+2)]]; |
181 | | // RansEncSymbol *s3 = &syms[in[i-(NX-z+3)]]; |
182 | | // RansEncPutSymbol(&ransN[z-0], &ptr, s0); |
183 | | // RansEncPutSymbol(&ransN[z-1], &ptr, s1); |
184 | | // RansEncPutSymbol(&ransN[z-2], &ptr, s2); |
185 | | // RansEncPutSymbol(&ransN[z-3], &ptr, s3); |
186 | | // } |
187 | |
|
188 | 0 | for (z = NX-1; z >= 0; z-=4) { |
189 | | // RansEncPutSymbol added in-situ |
190 | 0 | RansState *rp = &ransN[z]-3; |
191 | 0 | RansEncSymbol *sy[4]; |
192 | 0 | uint8_t *C = &in[i-(NX-z)]-3; |
193 | |
|
194 | 0 | sy[0] = &syms[C[3]]; |
195 | 0 | sy[1] = &syms[C[2]]; |
196 | |
|
197 | 0 | int c0 = rp[3-0] > sy[0]->x_max; |
198 | 0 | int c1 = rp[3-1] > sy[1]->x_max; |
199 | |
|
200 | 0 | #ifdef HTSCODECS_LITTLE_ENDIAN |
201 | 0 | ptr16[-1] = rp[3-0]; ptr16 -= c0; |
202 | 0 | ptr16[-1] = rp[3-1]; ptr16 -= c1; |
203 | | #else |
204 | | ((uint8_t *)&ptr16[-1])[0] = rp[3-0]; |
205 | | ((uint8_t *)&ptr16[-1])[1] = rp[3-0]>>8; |
206 | | ptr16 -= c0; |
207 | | ((uint8_t *)&ptr16[-1])[0] = rp[3-1]; |
208 | | ((uint8_t *)&ptr16[-1])[1] = rp[3-1]>>8; |
209 | | ptr16 -= c1; |
210 | | #endif |
211 | |
|
212 | 0 | rp[3-0] = c0 ? rp[3-0]>>16 : rp[3-0]; |
213 | 0 | rp[3-1] = c1 ? rp[3-1]>>16 : rp[3-1]; |
214 | |
|
215 | 0 | sy[2] = &syms[C[1]]; |
216 | 0 | sy[3] = &syms[C[0]]; |
217 | |
|
218 | 0 | int c2 = rp[3-2] > sy[2]->x_max; |
219 | 0 | int c3 = rp[3-3] > sy[3]->x_max; |
220 | 0 | #ifdef HTSCODECS_LITTLE_ENDIAN |
221 | 0 | ptr16[-1] = rp[3-2]; ptr16 -= c2; |
222 | 0 | ptr16[-1] = rp[3-3]; ptr16 -= c3; |
223 | | #else |
224 | | ((uint8_t *)&ptr16[-1])[0] = rp[3-2]; |
225 | | ((uint8_t *)&ptr16[-1])[1] = rp[3-2]>>8; |
226 | | ptr16 -= c2; |
227 | | ((uint8_t *)&ptr16[-1])[0] = rp[3-3]; |
228 | | ((uint8_t *)&ptr16[-1])[1] = rp[3-3]>>8; |
229 | | ptr16 -= c3; |
230 | | #endif |
231 | 0 | rp[3-2] = c2 ? rp[3-2]>>16 : rp[3-2]; |
232 | 0 | rp[3-3] = c3 ? rp[3-3]>>16 : rp[3-3]; |
233 | |
|
234 | 0 | int k; |
235 | 0 | for (k = 0; k < 4; k++) { |
236 | 0 | uint64_t r64 = (uint64_t)rp[3-k]; |
237 | 0 | uint32_t q = (r64 * sy[k]->rcp_freq) >> sy[k]->rcp_shift; |
238 | 0 | rp[3-k] += sy[k]->bias + q*sy[k]->cmpl_freq; |
239 | 0 | } |
240 | 0 | } |
241 | 0 | if (z < -1) abort(); |
242 | 0 | } |
243 | 0 | ptr = (uint8_t *)ptr16; |
244 | 0 | } |
245 | 0 | for (z = NX-1; z >= 0; z--) |
246 | 0 | RansEncFlush(&ransN[z], &ptr); |
247 | |
|
248 | 0 | empty: |
249 | | // Finalise block size and return it |
250 | 0 | *out_size = (out_end - ptr) + tab_size; |
251 | |
|
252 | 0 | memmove(out + tab_size, ptr, out_end-ptr); |
253 | |
|
254 | 0 | return out; |
255 | 0 | } |
256 | | |
257 | | unsigned char *rans_uncompress_O0_32x16(unsigned char *in, |
258 | | unsigned int in_size, |
259 | | unsigned char *out, |
260 | 0 | unsigned int out_sz) { |
261 | 0 | if (in_size < 16) // 4-states at least |
262 | 0 | return NULL; |
263 | | |
264 | 0 | if (out_sz >= INT_MAX) |
265 | 0 | return NULL; // protect against some overflow cases |
266 | | |
267 | 0 | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
268 | 0 | if (out_sz > 100000) |
269 | 0 | return NULL; |
270 | 0 | #endif |
271 | | |
272 | | /* Load in the static tables */ |
273 | 0 | unsigned char *cp = in, *out_free = NULL; |
274 | 0 | unsigned char *cp_end = in + in_size; |
275 | 0 | int i; |
276 | 0 | uint32_t s3[TOTFREQ]; // For TF_SHIFT <= 12 |
277 | |
|
278 | 0 | if (!out) |
279 | 0 | out_free = out = malloc(out_sz); |
280 | 0 | if (!out) |
281 | 0 | return NULL; |
282 | | |
283 | | // Precompute reverse lookup of frequency. |
284 | 0 | uint32_t F[256] = {0}, fsum; |
285 | 0 | int fsz = decode_freq(cp, cp_end, F, &fsum); |
286 | 0 | if (!fsz) |
287 | 0 | goto err; |
288 | 0 | cp += fsz; |
289 | |
|
290 | 0 | normalise_freq_shift(F, fsum, TOTFREQ); |
291 | | |
292 | | // Build symbols; fixme, do as part of decode, see the _d variant |
293 | 0 | if (rans_F_to_s3(F, TF_SHIFT, s3)) |
294 | 0 | goto err; |
295 | | |
296 | 0 | if (cp_end - cp < NX * 4) |
297 | 0 | goto err; |
298 | | |
299 | 0 | int z; |
300 | 0 | RansState R[NX]; |
301 | 0 | for (z = 0; z < NX; z++) { |
302 | 0 | RansDecInit(&R[z], &cp); |
303 | 0 | if (R[z] < RANS_BYTE_L) |
304 | 0 | goto err; |
305 | 0 | } |
306 | | |
307 | 0 | int out_end = (out_sz&~(NX-1)); |
308 | 0 | const uint32_t mask = (1u << TF_SHIFT)-1; |
309 | 0 | cp_end -= NX*2; // worst case for renorm bytes |
310 | | |
311 | | // assume NX is divisible by 4 |
312 | 0 | assert(NX%4==0); |
313 | | |
314 | | // Unsafe loop with no ptr overflow checking within loop itself |
315 | 0 | for (i=0; likely(i < out_end && cp < cp_end); i+=NX) { |
316 | 0 | for (z = 0; z < NX; z+=4) { |
317 | 0 | uint32_t S[4]; |
318 | 0 | S[0] = s3[R[z+0] & mask]; |
319 | 0 | S[1] = s3[R[z+1] & mask]; |
320 | 0 | S[2] = s3[R[z+2] & mask]; |
321 | 0 | S[3] = s3[R[z+3] & mask]; |
322 | |
|
323 | 0 | R[z+0] = (S[0]>>(TF_SHIFT+8)) * (R[z+0] >> TF_SHIFT) |
324 | 0 | + ((S[0]>>8) & mask); |
325 | 0 | R[z+1] = (S[1]>>(TF_SHIFT+8)) * (R[z+1] >> TF_SHIFT) |
326 | 0 | + ((S[1]>>8) & mask); |
327 | 0 | R[z+2] = (S[2]>>(TF_SHIFT+8)) * (R[z+2] >> TF_SHIFT) |
328 | 0 | + ((S[2]>>8) & mask); |
329 | 0 | R[z+3] = (S[3]>>(TF_SHIFT+8)) * (R[z+3] >> TF_SHIFT) |
330 | 0 | + ((S[3]>>8) & mask); |
331 | |
|
332 | 0 | out[i+z+0] = S[0]; |
333 | 0 | out[i+z+1] = S[1]; |
334 | 0 | out[i+z+2] = S[2]; |
335 | 0 | out[i+z+3] = S[3]; |
336 | |
|
337 | 0 | RansDecRenorm(&R[z+0], &cp); |
338 | 0 | RansDecRenorm(&R[z+1], &cp); |
339 | 0 | RansDecRenorm(&R[z+2], &cp); |
340 | 0 | RansDecRenorm(&R[z+3], &cp); |
341 | |
|
342 | 0 | if (NX%8==0) { |
343 | 0 | z += 4; |
344 | 0 | S[0] = s3[R[z+0] & mask]; |
345 | 0 | S[1] = s3[R[z+1] & mask]; |
346 | 0 | S[2] = s3[R[z+2] & mask]; |
347 | 0 | S[3] = s3[R[z+3] & mask]; |
348 | |
|
349 | 0 | R[z+0] = (S[0]>>(TF_SHIFT+8)) * (R[z+0] >> TF_SHIFT) |
350 | 0 | + ((S[0]>>8) & mask); |
351 | 0 | R[z+1] = (S[1]>>(TF_SHIFT+8)) * (R[z+1] >> TF_SHIFT) |
352 | 0 | + ((S[1]>>8) & mask); |
353 | 0 | R[z+2] = (S[2]>>(TF_SHIFT+8)) * (R[z+2] >> TF_SHIFT) |
354 | 0 | + ((S[2]>>8) & mask); |
355 | 0 | R[z+3] = (S[3]>>(TF_SHIFT+8)) * (R[z+3] >> TF_SHIFT) |
356 | 0 | + ((S[3]>>8) & mask); |
357 | |
|
358 | 0 | out[i+z+0] = S[0]; |
359 | 0 | out[i+z+1] = S[1]; |
360 | 0 | out[i+z+2] = S[2]; |
361 | 0 | out[i+z+3] = S[3]; |
362 | |
|
363 | 0 | RansDecRenorm(&R[z+0], &cp); |
364 | 0 | RansDecRenorm(&R[z+1], &cp); |
365 | 0 | RansDecRenorm(&R[z+2], &cp); |
366 | 0 | RansDecRenorm(&R[z+3], &cp); |
367 | 0 | } |
368 | 0 | } |
369 | 0 | } |
370 | | |
371 | | // Safe loop |
372 | 0 | for (; i < out_end; i+=NX) { |
373 | 0 | for (z = 0; z < NX; z+=4) { |
374 | 0 | uint32_t S[4]; |
375 | 0 | S[0] = s3[R[z+0] & mask]; |
376 | 0 | S[1] = s3[R[z+1] & mask]; |
377 | 0 | S[2] = s3[R[z+2] & mask]; |
378 | 0 | S[3] = s3[R[z+3] & mask]; |
379 | |
|
380 | 0 | R[z+0] = (S[0]>>(TF_SHIFT+8)) * (R[z+0] >> TF_SHIFT) |
381 | 0 | + ((S[0]>>8) & mask); |
382 | 0 | R[z+1] = (S[1]>>(TF_SHIFT+8)) * (R[z+1] >> TF_SHIFT) |
383 | 0 | + ((S[1]>>8) & mask); |
384 | 0 | R[z+2] = (S[2]>>(TF_SHIFT+8)) * (R[z+2] >> TF_SHIFT) |
385 | 0 | + ((S[2]>>8) & mask); |
386 | 0 | R[z+3] = (S[3]>>(TF_SHIFT+8)) * (R[z+3] >> TF_SHIFT) |
387 | 0 | + ((S[3]>>8) & mask); |
388 | |
|
389 | 0 | out[i+z+0] = S[0]; |
390 | 0 | out[i+z+1] = S[1]; |
391 | 0 | out[i+z+2] = S[2]; |
392 | 0 | out[i+z+3] = S[3]; |
393 | |
|
394 | 0 | RansDecRenormSafe(&R[z+0], &cp, cp_end+NX*2); |
395 | 0 | RansDecRenormSafe(&R[z+1], &cp, cp_end+NX*2); |
396 | 0 | RansDecRenormSafe(&R[z+2], &cp, cp_end+NX*2); |
397 | 0 | RansDecRenormSafe(&R[z+3], &cp, cp_end+NX*2); |
398 | 0 | } |
399 | 0 | } |
400 | |
|
401 | 0 | for (z = out_sz & (NX-1); z-- > 0; ) |
402 | 0 | out[out_end + z] = s3[R[z] & mask]; |
403 | | |
404 | | //fprintf(stderr, " 0 Decoded %d bytes\n", (int)(cp-in)); //c-size |
405 | |
|
406 | 0 | return out; |
407 | | |
408 | 0 | err: |
409 | 0 | free(out_free); |
410 | 0 | return NULL; |
411 | 0 | } |
412 | | |
413 | | |
414 | | //----------------------------------------------------------------------------- |
415 | | unsigned char *rans_compress_O1_32x16(unsigned char *in, |
416 | | unsigned int in_size, |
417 | | unsigned char *out, |
418 | 0 | unsigned int *out_size) { |
419 | 0 | unsigned char *cp, *out_end, *out_free = NULL; |
420 | 0 | unsigned int tab_size; |
421 | 0 | int bound = rans_compress_bound_4x16(in_size,1)-20, z; |
422 | 0 | RansState ransN[NX]; |
423 | |
|
424 | 0 | if (in_size < NX) // force O0 instead |
425 | 0 | return NULL; |
426 | | |
427 | 0 | if (!out) { |
428 | 0 | *out_size = bound; |
429 | 0 | out_free = out = malloc(*out_size); |
430 | 0 | } |
431 | 0 | if (!out || bound > *out_size) |
432 | 0 | return NULL; |
433 | | |
434 | 0 | if (((size_t)out)&1) |
435 | 0 | bound--; |
436 | 0 | out_end = out + bound; |
437 | |
|
438 | 0 | RansEncSymbol (*syms)[256] = htscodecs_tls_alloc(256 * (sizeof(*syms))); |
439 | 0 | if (!syms) { |
440 | 0 | free(out_free); |
441 | 0 | return NULL; |
442 | 0 | } |
443 | | |
444 | 0 | cp = out; |
445 | 0 | int shift = encode_freq1(in, in_size, 32, syms, &cp); |
446 | 0 | if (shift < 0) { |
447 | 0 | free(out_free); |
448 | 0 | htscodecs_tls_free(syms); |
449 | 0 | return NULL; |
450 | 0 | } |
451 | 0 | tab_size = cp - out; |
452 | |
|
453 | 0 | for (z = 0; z < NX; z++) |
454 | 0 | RansEncInit(&ransN[z]); |
455 | |
|
456 | 0 | uint8_t* ptr = out_end; |
457 | |
|
458 | 0 | int iN[NX], isz4 = in_size/NX, i; |
459 | 0 | for (z = 0; z < NX; z++) |
460 | 0 | iN[z] = (z+1)*isz4-2; |
461 | |
|
462 | 0 | unsigned char lN[NX]; |
463 | 0 | for (z = 0; z < NX; z++) |
464 | 0 | lN[z] = in[iN[z]+1]; |
465 | | |
466 | | // Deal with the remainder |
467 | 0 | z = NX-1; |
468 | 0 | lN[z] = in[in_size-1]; |
469 | 0 | for (iN[z] = in_size-2; iN[z] > NX*isz4-2; iN[z]--) { |
470 | 0 | unsigned char c = in[iN[z]]; |
471 | 0 | RansEncPutSymbol(&ransN[z], &ptr, &syms[c][lN[z]]); |
472 | 0 | lN[z] = c; |
473 | 0 | } |
474 | |
|
475 | 0 | unsigned char *i32[NX]; |
476 | 0 | for (i = 0; i < NX; i++) |
477 | 0 | i32[i] = &in[iN[i]]; |
478 | |
|
479 | 0 | for (; likely(i32[0] >= in); ) { |
480 | 0 | uint16_t *ptr16 = (uint16_t *)ptr; |
481 | 0 | for (z = NX-1; z >= 0; z-=4) { |
482 | 0 | RansEncSymbol *sy[4]; |
483 | 0 | int k; |
484 | |
|
485 | 0 | for (k = 0; k < 4; k++) { |
486 | 0 | sy[k] = &syms[*i32[z-k]][lN[z-k]]; |
487 | 0 | lN[z-k] = *i32[z-k]--; |
488 | 0 | } |
489 | | |
490 | | // RansEncPutSymbol added in-situ |
491 | 0 | for (k = 0; k < 4; k++) { |
492 | 0 | int c = ransN[z-k] > sy[k]->x_max; |
493 | 0 | #ifdef HTSCODECS_LITTLE_ENDIAN |
494 | 0 | ptr16[-1] = ransN[z-k]; |
495 | | #else |
496 | | ((uint8_t *)&ptr16[-1])[0] = ransN[z-k]; |
497 | | ((uint8_t *)&ptr16[-1])[1] = ransN[z-k]>>8; |
498 | | #endif |
499 | 0 | ptr16 -= c; |
500 | | //ransN[z-k] >>= c<<4; |
501 | 0 | ransN[z-k] = c ? ransN[z-k]>>16 : ransN[z-k]; |
502 | 0 | } |
503 | |
|
504 | 0 | for (k = 0; k < 4; k++) { |
505 | 0 | uint64_t r64 = ransN[z-k]; |
506 | 0 | uint32_t q = (r64 * sy[k]->rcp_freq) >> sy[k]->rcp_shift; |
507 | 0 | ransN[z-k] += sy[k]->bias + q*sy[k]->cmpl_freq; |
508 | 0 | } |
509 | 0 | } |
510 | 0 | ptr = (uint8_t *)ptr16; |
511 | 0 | } |
512 | |
|
513 | 0 | for (z = NX-1; z>=0; z--) |
514 | 0 | RansEncPutSymbol(&ransN[z], &ptr, &syms[0][lN[z]]); |
515 | |
|
516 | 0 | for (z = NX-1; z>=0; z--) |
517 | 0 | RansEncFlush(&ransN[z], &ptr); |
518 | |
|
519 | 0 | *out_size = (out_end - ptr) + tab_size; |
520 | |
|
521 | 0 | cp = out; |
522 | 0 | memmove(out + tab_size, ptr, out_end-ptr); |
523 | |
|
524 | 0 | htscodecs_tls_free(syms); |
525 | 0 | return out; |
526 | 0 | } |
527 | | |
528 | | //#define MAGIC2 111 |
529 | 0 | #define MAGIC2 179 |
530 | | //#define MAGIC2 0 |
531 | | |
532 | | unsigned char *rans_uncompress_O1_32x16(unsigned char *in, |
533 | | unsigned int in_size, |
534 | | unsigned char *out, |
535 | 0 | unsigned int out_sz) { |
536 | 0 | if (in_size < NX*4) // 4-states at least |
537 | 0 | return NULL; |
538 | | |
539 | 0 | if (out_sz >= INT_MAX) |
540 | 0 | return NULL; // protect against some overflow cases |
541 | | |
542 | 0 | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
543 | 0 | if (out_sz > 100000) |
544 | 0 | return NULL; |
545 | 0 | #endif |
546 | | |
547 | | /* Load in the static tables */ |
548 | 0 | unsigned char *cp = in, *cp_end = in+in_size, *out_free = NULL; |
549 | 0 | unsigned char *c_freq = NULL; |
550 | 0 | int i; |
551 | | |
552 | | /* |
553 | | * Somewhat complex memory layout. |
554 | | * With shift==12 (TF_SHIFT_O1) we fill out use both sfb and fb. |
555 | | * With shift==10 (...O1_FAST) we fill out and use s3 only. |
556 | | * |
557 | | * sfb+fb is larger, therefore we allocate this much memory. |
558 | | */ |
559 | 0 | uint8_t *sfb_ = htscodecs_tls_alloc(256* |
560 | 0 | ((TOTFREQ_O1+MAGIC2)*sizeof(*sfb_) |
561 | 0 | +256 * sizeof(fb_t))); |
562 | 0 | if (!sfb_) |
563 | 0 | return NULL; |
564 | | |
565 | | // sfb and fb are consecutive |
566 | 0 | uint8_t *sfb[257]; |
567 | 0 | if ((*cp >> 4) == TF_SHIFT_O1) { |
568 | 0 | for (i = 0; i <= 256; i++) |
569 | 0 | sfb[i]= sfb_ + i*(TOTFREQ_O1+MAGIC2); |
570 | 0 | } else { |
571 | 0 | for (i = 0; i <= 256; i++) |
572 | 0 | sfb[i]= sfb_ + i*(TOTFREQ_O1_FAST+MAGIC2); |
573 | 0 | } |
574 | 0 | fb_t (*fb)[256] = (fb_t (*)[256]) sfb[256]; |
575 | | |
576 | | // NOTE: s3 overlaps sfb/fb |
577 | 0 | uint32_t (*s3)[TOTFREQ_O1_FAST] = (uint32_t (*)[TOTFREQ_O1_FAST])sfb_; |
578 | |
|
579 | 0 | if (!out) |
580 | 0 | out_free = out = malloc(out_sz); |
581 | |
|
582 | 0 | if (!out) |
583 | 0 | goto err; |
584 | | |
585 | | //fprintf(stderr, "out_sz=%d\n", out_sz); |
586 | | |
587 | | // compressed header? If so uncompress it |
588 | 0 | unsigned char *tab_end = NULL; |
589 | 0 | unsigned char *c_freq_end = cp_end; |
590 | 0 | unsigned int shift = *cp >> 4; |
591 | 0 | if (*cp++ & 1) { |
592 | 0 | uint32_t u_freq_sz, c_freq_sz; |
593 | 0 | cp += var_get_u32(cp, cp_end, &u_freq_sz); |
594 | 0 | cp += var_get_u32(cp, cp_end, &c_freq_sz); |
595 | 0 | if (c_freq_sz >= cp_end - cp - 16) |
596 | 0 | goto err; |
597 | 0 | tab_end = cp + c_freq_sz; |
598 | 0 | if (!(c_freq = rans_uncompress_O0_4x16(cp, c_freq_sz, NULL,u_freq_sz))) |
599 | 0 | goto err; |
600 | 0 | cp = c_freq; |
601 | 0 | c_freq_end = c_freq + u_freq_sz; |
602 | 0 | } |
603 | | |
604 | | // Decode order-0 symbol list; avoids needing in order-1 tables |
605 | 0 | cp += decode_freq1(cp, c_freq_end, shift, NULL, s3, sfb, fb); |
606 | |
|
607 | 0 | if (tab_end) |
608 | 0 | cp = tab_end; |
609 | 0 | free(c_freq); |
610 | 0 | c_freq = NULL; |
611 | |
|
612 | 0 | if (cp_end - cp < NX * 4) |
613 | 0 | goto err; |
614 | | |
615 | 0 | RansState R[NX]; |
616 | 0 | uint8_t *ptr = cp, *ptr_end = in + in_size - 2*NX; |
617 | 0 | int z; |
618 | 0 | for (z = 0; z < NX; z++) { |
619 | 0 | RansDecInit(&R[z], &ptr); |
620 | 0 | if (R[z] < RANS_BYTE_L) |
621 | 0 | goto err; |
622 | 0 | } |
623 | | |
624 | 0 | int isz4 = out_sz/NX; |
625 | 0 | int i4[NX], l[NX] = {0}; |
626 | 0 | for (z = 0; z < NX; z++) |
627 | 0 | i4[z] = z*isz4; |
628 | |
|
629 | 0 | const int low_ent = in_size < 0.2 * out_sz; |
630 | | |
631 | | // Around 15% faster to specialise for 10/12 than to have one |
632 | | // loop with shift as a variable. |
633 | 0 | if (shift == TF_SHIFT_O1) { |
634 | | // TF_SHIFT_O1 = 12 |
635 | 0 | const uint32_t mask = ((1u << TF_SHIFT_O1)-1); |
636 | 0 | for (; likely(i4[0] < isz4);) { |
637 | 0 | for (z = 0; z < NX; z+=4) { |
638 | 0 | uint16_t m[4], c[4]; |
639 | |
|
640 | 0 | c[0] = sfb[l[z+0]][m[0] = R[z+0] & mask]; |
641 | 0 | c[1] = sfb[l[z+1]][m[1] = R[z+1] & mask]; |
642 | 0 | c[2] = sfb[l[z+2]][m[2] = R[z+2] & mask]; |
643 | 0 | c[3] = sfb[l[z+3]][m[3] = R[z+3] & mask]; |
644 | |
|
645 | 0 | R[z+0] = fb[l[z+0]][c[0]].f * (R[z+0]>>TF_SHIFT_O1); |
646 | 0 | R[z+0] += m[0] - fb[l[z+0]][c[0]].b; |
647 | |
|
648 | 0 | R[z+1] = fb[l[z+1]][c[1]].f * (R[z+1]>>TF_SHIFT_O1); |
649 | 0 | R[z+1] += m[1] - fb[l[z+1]][c[1]].b; |
650 | |
|
651 | 0 | R[z+2] = fb[l[z+2]][c[2]].f * (R[z+2]>>TF_SHIFT_O1); |
652 | 0 | R[z+2] += m[2] - fb[l[z+2]][c[2]].b; |
653 | |
|
654 | 0 | R[z+3] = fb[l[z+3]][c[3]].f * (R[z+3]>>TF_SHIFT_O1); |
655 | 0 | R[z+3] += m[3] - fb[l[z+3]][c[3]].b; |
656 | |
|
657 | 0 | out[i4[z+0]++] = l[z+0] = c[0]; |
658 | 0 | out[i4[z+1]++] = l[z+1] = c[1]; |
659 | 0 | out[i4[z+2]++] = l[z+2] = c[2]; |
660 | 0 | out[i4[z+3]++] = l[z+3] = c[3]; |
661 | |
|
662 | 0 | if (!low_ent && likely(ptr < ptr_end)) { |
663 | 0 | RansDecRenorm(&R[z+0], &ptr); |
664 | 0 | RansDecRenorm(&R[z+1], &ptr); |
665 | 0 | RansDecRenorm(&R[z+2], &ptr); |
666 | 0 | RansDecRenorm(&R[z+3], &ptr); |
667 | 0 | } else { |
668 | 0 | RansDecRenormSafe(&R[z+0], &ptr, ptr_end+2*NX); |
669 | 0 | RansDecRenormSafe(&R[z+1], &ptr, ptr_end+2*NX); |
670 | 0 | RansDecRenormSafe(&R[z+2], &ptr, ptr_end+2*NX); |
671 | 0 | RansDecRenormSafe(&R[z+3], &ptr, ptr_end+2*NX); |
672 | 0 | } |
673 | 0 | } |
674 | 0 | } |
675 | | |
676 | | // Remainder |
677 | 0 | for (; i4[NX-1] < out_sz; i4[NX-1]++) { |
678 | 0 | uint32_t m = R[NX-1] & ((1u<<TF_SHIFT_O1)-1); |
679 | 0 | unsigned char c = sfb[l[NX-1]][m]; |
680 | 0 | out[i4[NX-1]] = c; |
681 | 0 | R[NX-1] = fb[l[NX-1]][c].f * (R[NX-1]>>TF_SHIFT_O1) + |
682 | 0 | m - fb[l[NX-1]][c].b; |
683 | 0 | RansDecRenormSafe(&R[NX-1], &ptr, ptr_end + 2*NX); |
684 | 0 | l[NX-1] = c; |
685 | 0 | } |
686 | 0 | } else { |
687 | | // TF_SHIFT_O1 = 10 |
688 | 0 | const uint32_t mask = ((1u << TF_SHIFT_O1_FAST)-1); |
689 | 0 | for (; likely(i4[0] < isz4);) { |
690 | 0 | for (z = 0; z < NX; z+=4) { |
691 | | // Merged sfb and fb into single s3 lookup. |
692 | | // The m[4] array completely vanishes in this method. |
693 | 0 | uint32_t S[4] = { |
694 | 0 | s3[l[z+0]][R[z+0] & mask], |
695 | 0 | s3[l[z+1]][R[z+1] & mask], |
696 | 0 | s3[l[z+2]][R[z+2] & mask], |
697 | 0 | s3[l[z+3]][R[z+3] & mask], |
698 | 0 | }; |
699 | |
|
700 | 0 | l[z+0] = out[i4[z+0]++] = S[0]; |
701 | 0 | l[z+1] = out[i4[z+1]++] = S[1]; |
702 | 0 | l[z+2] = out[i4[z+2]++] = S[2]; |
703 | 0 | l[z+3] = out[i4[z+3]++] = S[3]; |
704 | |
|
705 | 0 | uint32_t F[4] = { |
706 | 0 | S[0]>>(TF_SHIFT_O1_FAST+8), |
707 | 0 | S[1]>>(TF_SHIFT_O1_FAST+8), |
708 | 0 | S[2]>>(TF_SHIFT_O1_FAST+8), |
709 | 0 | S[3]>>(TF_SHIFT_O1_FAST+8), |
710 | 0 | }; |
711 | 0 | uint32_t B[4] = { |
712 | 0 | (S[0]>>8) & mask, |
713 | 0 | (S[1]>>8) & mask, |
714 | 0 | (S[2]>>8) & mask, |
715 | 0 | (S[3]>>8) & mask, |
716 | 0 | }; |
717 | |
|
718 | 0 | R[z+0] = F[0] * (R[z+0]>>TF_SHIFT_O1_FAST) + B[0]; |
719 | 0 | R[z+1] = F[1] * (R[z+1]>>TF_SHIFT_O1_FAST) + B[1]; |
720 | 0 | R[z+2] = F[2] * (R[z+2]>>TF_SHIFT_O1_FAST) + B[2]; |
721 | 0 | R[z+3] = F[3] * (R[z+3]>>TF_SHIFT_O1_FAST) + B[3]; |
722 | |
|
723 | 0 | if (!low_ent && (ptr < ptr_end)) { |
724 | | // branchless & asm |
725 | 0 | RansDecRenorm(&R[z+0], &ptr); |
726 | 0 | RansDecRenorm(&R[z+1], &ptr); |
727 | 0 | RansDecRenorm(&R[z+2], &ptr); |
728 | 0 | RansDecRenorm(&R[z+3], &ptr); |
729 | 0 | } else { |
730 | | // branched, but better when predictable |
731 | 0 | RansDecRenormSafe(&R[z+0], &ptr, ptr_end+2*NX); |
732 | 0 | RansDecRenormSafe(&R[z+1], &ptr, ptr_end+2*NX); |
733 | 0 | RansDecRenormSafe(&R[z+2], &ptr, ptr_end+2*NX); |
734 | 0 | RansDecRenormSafe(&R[z+3], &ptr, ptr_end+2*NX); |
735 | 0 | } |
736 | 0 | } |
737 | 0 | } |
738 | | |
739 | | // Remainder |
740 | 0 | for (; i4[NX-1] < out_sz; i4[NX-1]++) { |
741 | 0 | uint32_t S = s3[l[NX-1]][R[NX-1] & ((1u<<TF_SHIFT_O1_FAST)-1)]; |
742 | 0 | out[i4[NX-1]] = l[NX-1] = S&0xff; |
743 | 0 | R[NX-1] = (S>>(TF_SHIFT_O1_FAST+8)) * (R[NX-1]>>TF_SHIFT_O1_FAST) |
744 | 0 | + ((S>>8) & ((1u<<TF_SHIFT_O1_FAST)-1)); |
745 | 0 | RansDecRenormSafe(&R[NX-1], &ptr, ptr_end + 2*NX); |
746 | 0 | } |
747 | 0 | } |
748 | | //fprintf(stderr, " 1 Decoded %d bytes\n", (int)(ptr-in)); //c-size |
749 | |
|
750 | 0 | htscodecs_tls_free(sfb_); |
751 | 0 | return out; |
752 | | |
753 | 0 | err: |
754 | 0 | htscodecs_tls_free(sfb_); |
755 | 0 | free(out_free); |
756 | 0 | free(c_freq); |
757 | |
|
758 | 0 | return NULL; |
759 | 0 | } |