Coverage Report

Created: 2025-10-10 07:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/htscodecs/htscodecs/rANS_static4x16pr.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017-2023, 2025 Genome Research Ltd.
3
 * Author(s): James Bonfield
4
 *
5
 * Redistribution and use in source and binary forms, with or without
6
 * modification, are permitted provided that the following conditions are met:
7
 *
8
 *    1. Redistributions of source code must retain the above copyright notice,
9
 *       this list of conditions and the following disclaimer.
10
 *
11
 *    2. Redistributions in binary form must reproduce the above
12
 *       copyright notice, this list of conditions and the following
13
 *       disclaimer in the documentation and/or other materials provided
14
 *       with the distribution.
15
 *
16
 *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17
 *       Institute nor the names of its contributors may be used to endorse
18
 *       or promote products derived from this software without specific
19
 *       prior written permission.
20
 *
21
 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22
 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25
 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
 */
33
34
// FIXME Can we get decoder to return the compressed sized read, avoiding
35
// us needing to store it?  Yes we can.  See c-size comments.  If we added all these
36
// together we could get rans_uncompress_to_4x16 to return the number of bytes
37
// consumed, avoiding the calling code from needed to explicitly stored the size.
38
// However the effect on name tokeniser is to save 0.1 to 0.2% so not worth it.
39
40
/*-------------------------------------------------------------------------- */
41
/*
42
 * Example wrapper to use the rans_byte.h functions included above.
43
 *
44
 * This demonstrates how to use, and unroll, an order-0 and order-1 frequency
45
 * model.
46
 */
47
48
#include "config.h"
49
50
#include <stdint.h>
51
#include <stdlib.h>
52
#include <stdio.h>
53
#include <unistd.h>
54
#include <assert.h>
55
#include <string.h>
56
#include <sys/time.h>
57
#include <limits.h>
58
#include <math.h>
59
60
#ifndef NO_THREADS
61
#include <pthread.h>
62
#endif
63
64
#include "rANS_word.h"
65
#include "rANS_static4x16.h"
66
#include "rANS_static16_int.h"
67
#include "pack.h"
68
#include "rle.h"
69
#include "utils.h"
70
71
3.18M
#define TF_SHIFT 12
72
981k
#define TOTFREQ (1<<TF_SHIFT)
73
74
// 9-11 is considerably faster in the O1 variant due to reduced table size.
75
// We auto-tune between 10 and 12 though.  Anywhere from 9 to 14 are viable.
76
#ifndef TF_SHIFT_O1
77
#define TF_SHIFT_O1 12
78
#endif
79
#ifndef TF_SHIFT_O1_FAST
80
#define TF_SHIFT_O1_FAST 10
81
#endif
82
3.08M
#define TOTFREQ_O1 (1<<TF_SHIFT_O1)
83
2.68M
#define TOTFREQ_O1_FAST (1<<TF_SHIFT_O1_FAST)
84
85
86
/*-----------------------------------------------------------------------------
87
 * Memory to memory compression functions.
88
 *
89
 * These are original versions without any manual loop unrolling. They
90
 * are easier to understand, but can be up to 2x slower.
91
 */
92
93
1.02M
unsigned int rans_compress_bound_4x16(unsigned int size, int order) {
94
1.02M
    int N = (order>>8) & 0xff;
95
1.02M
    if (!N) N=4;
96
97
1.02M
    order &= 0xff;
98
1.02M
    unsigned int sz = (order == 0
99
1.02M
        ? 1.05*size + 257*3 + 4
100
1.02M
        : 1.05*size + 257*257*3 + 4 + 257*3+4) +
101
1.02M
        ((order & RANS_ORDER_PACK) ? 1 : 0) +
102
1.02M
        ((order & RANS_ORDER_RLE) ? 1 + 257*3+4: 0) + 20 +
103
1.02M
        ((order & RANS_ORDER_X32) ? (32-4)*4 : 0) +
104
1.02M
        ((order & RANS_ORDER_STRIPE) ? 7 + 5*N: 0);
105
1.02M
    return sz + (sz&1) + 2; // make this even so buffers are word aligned
106
1.02M
}
107
108
// Compresses in_size bytes from 'in' to *out_size bytes in 'out'.
109
//
110
// NB: The output buffer does not hold the original size, so it is up to
111
// the caller to store this.
112
unsigned char *rans_compress_O0_4x16(unsigned char *in, unsigned int in_size,
113
568k
                                     unsigned char *out, unsigned int *out_size) {
114
568k
    unsigned char *cp, *out_end;
115
568k
    RansEncSymbol syms[256];
116
568k
    RansState rans0;
117
568k
    RansState rans2;
118
568k
    RansState rans1;
119
568k
    RansState rans3;
120
568k
    uint8_t* ptr;
121
568k
    uint32_t F[256+MAGIC] = {0};
122
568k
    int i, j, tab_size = 0, rle, x;
123
    // -20 for order/size/meta
124
568k
    uint32_t bound = rans_compress_bound_4x16(in_size,0)-20;
125
126
568k
    if (!out) {
127
338
        *out_size = bound;
128
338
        out = malloc(*out_size);
129
338
    }
130
568k
    if (!out || bound > *out_size)
131
0
        return NULL;
132
133
    // If "out" isn't word aligned, tweak out_end/ptr to ensure it is.
134
    // We already added more round in bound to allow for this.
135
568k
    if (((size_t)out)&1)
136
189k
        bound--;
137
568k
    ptr = out_end = out + bound;
138
139
568k
    if (in_size == 0)
140
84.9k
        goto empty;
141
142
    // Compute statistics
143
483k
    if (hist8(in, in_size, F) < 0)
144
0
        return NULL;
145
146
    // Normalise so frequences sum to power of 2
147
483k
    uint32_t fsum = in_size;
148
483k
    uint32_t max_val = round2(fsum);
149
483k
    if (max_val > TOTFREQ)
150
8.26k
        max_val = TOTFREQ;
151
152
483k
    if (normalise_freq(F, fsum, max_val) < 0)
153
0
        return NULL;
154
483k
    fsum=max_val;
155
156
483k
    cp = out;
157
483k
    cp += encode_freq(cp, F);
158
483k
    tab_size = cp-out;
159
    //write(2, out+4, cp-(out+4));
160
161
483k
    if (normalise_freq(F, fsum, TOTFREQ) < 0)
162
0
        return NULL;
163
164
    // Encode statistics.
165
124M
    for (x = rle = j = 0; j < 256; j++) {
166
123M
        if (F[j]) {
167
1.92M
            RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT);
168
1.92M
            x += F[j];
169
1.92M
        }
170
123M
    }
171
172
483k
    RansEncInit(&rans0);
173
483k
    RansEncInit(&rans1);
174
483k
    RansEncInit(&rans2);
175
483k
    RansEncInit(&rans3);
176
177
483k
    switch (i=(in_size&3)) {
178
88.1k
    case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]);
179
        // fall-through
180
213k
    case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]);
181
        // fall-through
182
397k
    case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]);
183
        // fall-through
184
483k
    case 0:
185
483k
        break;
186
483k
    }
187
886M
    for (i=(in_size &~3); i>0; i-=4) {
188
886M
        RansEncSymbol *s3 = &syms[in[i-1]];
189
886M
        RansEncSymbol *s2 = &syms[in[i-2]];
190
886M
        RansEncSymbol *s1 = &syms[in[i-3]];
191
886M
        RansEncSymbol *s0 = &syms[in[i-4]];
192
193
886M
#if 1
194
886M
        RansEncPutSymbol(&rans3, &ptr, s3);
195
886M
        RansEncPutSymbol(&rans2, &ptr, s2);
196
886M
        RansEncPutSymbol(&rans1, &ptr, s1);
197
886M
        RansEncPutSymbol(&rans0, &ptr, s0);
198
#else
199
        // Slightly beter on gcc, much better on clang
200
        uint16_t *ptr16 = (uint16_t *)ptr;
201
202
        if (rans3 >= s3->x_max) *--ptr16 = (uint16_t)rans3, rans3 >>= 16;
203
        if (rans2 >= s2->x_max) *--ptr16 = (uint16_t)rans2, rans2 >>= 16;
204
        uint32_t q3 = (uint32_t) (((uint64_t)rans3 * s3->rcp_freq) >> s3->rcp_shift);
205
        uint32_t q2 = (uint32_t) (((uint64_t)rans2 * s2->rcp_freq) >> s2->rcp_shift);
206
        rans3 += s3->bias + q3 * s3->cmpl_freq;
207
        rans2 += s2->bias + q2 * s2->cmpl_freq;
208
209
        if (rans1 >= s1->x_max) *--ptr16 = (uint16_t)rans1, rans1 >>= 16;
210
        if (rans0 >= s0->x_max) *--ptr16 = (uint16_t)rans0, rans0 >>= 16;
211
        uint32_t q1 = (uint32_t) (((uint64_t)rans1 * s1->rcp_freq) >> s1->rcp_shift);
212
        uint32_t q0 = (uint32_t) (((uint64_t)rans0 * s0->rcp_freq) >> s0->rcp_shift);
213
        rans1 += s1->bias + q1 * s1->cmpl_freq;
214
        rans0 += s0->bias + q0 * s0->cmpl_freq;
215
216
        ptr = (uint8_t *)ptr16;
217
#endif
218
886M
    }
219
220
483k
    RansEncFlush(&rans3, &ptr);
221
483k
    RansEncFlush(&rans2, &ptr);
222
483k
    RansEncFlush(&rans1, &ptr);
223
483k
    RansEncFlush(&rans0, &ptr);
224
225
568k
 empty:
226
    // Finalise block size and return it
227
568k
    *out_size = (out_end - ptr) + tab_size;
228
229
568k
    memmove(out + tab_size, ptr, out_end-ptr);
230
231
568k
    return out;
232
483k
}
233
234
unsigned char *rans_uncompress_O0_4x16(unsigned char *in, unsigned int in_size,
235
1.88k
                                       unsigned char *out, unsigned int out_sz) {
236
1.88k
    if (in_size < 16) // 4-states at least
237
43
        return NULL;
238
239
1.84k
    if (out_sz >= INT_MAX)
240
4
        return NULL; // protect against some overflow cases
241
242
1.83k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
243
1.83k
    if (out_sz > 100000)
244
12
        return NULL;
245
1.82k
#endif
246
247
    /* Load in the static tables */
248
1.82k
    unsigned char *cp = in, *out_free = NULL;
249
1.82k
    unsigned char *cp_end = in + in_size - 8; // within 8 => be extra safe
250
1.82k
    int i, j;
251
1.82k
    unsigned int x, y;
252
1.82k
    uint16_t sfreq[TOTFREQ+32];
253
1.82k
    uint16_t sbase[TOTFREQ+32]; // faster to use 32-bit on clang
254
1.82k
    uint8_t  ssym [TOTFREQ+64]; // faster to use 16-bit on clang
255
256
1.82k
    if (!out)
257
133
        out_free = out = malloc(out_sz);
258
1.82k
    if (!out)
259
0
        return NULL;
260
261
    // Precompute reverse lookup of frequency.
262
1.82k
    uint32_t F[256] = {0}, fsum;
263
1.82k
    int fsz = decode_freq(cp, cp_end, F, &fsum);
264
1.82k
    if (!fsz)
265
0
        goto err;
266
1.82k
    cp += fsz;
267
268
1.82k
    normalise_freq_shift(F, fsum, TOTFREQ);
269
270
    // Build symbols; fixme, do as part of decode, see the _d variant
271
443k
    for (j = x = 0; j < 256; j++) {
272
441k
        if (F[j]) {
273
2.85k
            if (F[j] > TOTFREQ - x)
274
133
                goto err;
275
6.80M
            for (y = 0; y < F[j]; y++) {
276
6.79M
                ssym [y + x] = j;
277
6.79M
                sfreq[y + x] = F[j];
278
6.79M
                sbase[y + x] = y;
279
6.79M
            }
280
2.71k
            x += F[j];
281
2.71k
        }
282
441k
    }
283
284
1.69k
    if (x != TOTFREQ)
285
54
        goto err;
286
287
1.64k
    if (cp+16 > cp_end+8)
288
7
        goto err;
289
290
1.63k
    RansState R[4];
291
1.63k
    RansDecInit(&R[0], &cp); if (R[0] < RANS_BYTE_L) goto err;
292
1.62k
    RansDecInit(&R[1], &cp); if (R[1] < RANS_BYTE_L) goto err;
293
1.62k
    RansDecInit(&R[2], &cp); if (R[2] < RANS_BYTE_L) goto err;
294
1.61k
    RansDecInit(&R[3], &cp); if (R[3] < RANS_BYTE_L) goto err;
295
296
// Simple version is comparable to below, but only with -O3
297
//
298
//    for (i = 0; cp < cp_end-8 && i < (out_sz&~7); i+=8) {
299
//        for(j=0; j<8;j++) {
300
//          RansState m = RansDecGet(&R[j%4], TF_SHIFT);
301
//          R[j%4] = sfreq[m] * (R[j%4] >> TF_SHIFT) + sbase[m];
302
//          out[i+j] = ssym[m];
303
//          RansDecRenorm(&R[j%4], &cp);
304
//        }
305
//    }
306
307
12.6k
    for (i = 0; cp < cp_end-8 && i < (out_sz&~7); i+=8) {
308
33.0k
        for (j = 0; j < 8; j+=4) {
309
22.0k
            RansState m0 = RansDecGet(&R[0], TF_SHIFT);
310
22.0k
            RansState m1 = RansDecGet(&R[1], TF_SHIFT);
311
22.0k
            out[i+j+0] = ssym[m0];
312
22.0k
            out[i+j+1] = ssym[m1];
313
314
22.0k
            R[0] = sfreq[m0] * (R[0] >> TF_SHIFT) + sbase[m0];
315
22.0k
            R[1] = sfreq[m1] * (R[1] >> TF_SHIFT) + sbase[m1];
316
317
22.0k
            RansState m2 = RansDecGet(&R[2], TF_SHIFT);
318
22.0k
            RansState m3 = RansDecGet(&R[3], TF_SHIFT);
319
320
22.0k
            RansDecRenorm(&R[0], &cp);
321
22.0k
            RansDecRenorm(&R[1], &cp);
322
323
22.0k
            R[2] = sfreq[m2] * (R[2] >> TF_SHIFT) + sbase[m2];
324
22.0k
            R[3] = sfreq[m3] * (R[3] >> TF_SHIFT) + sbase[m3];
325
326
22.0k
            RansDecRenorm(&R[2], &cp);
327
22.0k
            RansDecRenorm(&R[3], &cp);
328
329
22.0k
            out[i+j+2] = ssym[m2];
330
22.0k
            out[i+j+3] = ssym[m3];
331
22.0k
        }
332
11.0k
    }
333
334
    // remainder
335
52.4k
    for (; i < out_sz; i++) {
336
50.8k
        RansState m = RansDecGet(&R[i%4], TF_SHIFT);
337
50.8k
        R[i%4] = sfreq[m] * (R[i%4] >> TF_SHIFT) + sbase[m];
338
50.8k
        out[i] = ssym[m];
339
50.8k
        RansDecRenormSafe(&R[i%4], &cp, cp_end+8);
340
50.8k
    }
341
342
    //fprintf(stderr, "    0 Decoded %d bytes\n", (int)(cp-in)); //c-size
343
344
1.61k
    return out;
345
346
216
 err:
347
216
    free(out_free);
348
216
    return NULL;
349
1.61k
}
350
351
//-----------------------------------------------------------------------------
352
353
// Compute the entropy of 12-bit vs 10-bit frequency tables.
354
// 10 bit means smaller memory footprint when decoding and
355
// more speed due to cache hits, but it *may* be a poor
356
// compression fit.
357
int rans_compute_shift(uint32_t *F0, uint32_t (*F)[256], uint32_t *T,
358
77.3k
                       uint32_t *S) {
359
77.3k
    int i, j;
360
361
77.3k
    double e10 = 0, e12 = 0;
362
77.3k
    int max_tot = 0;
363
19.8M
    for (i = 0; i < 256; i++) {
364
19.8M
        if (F0[i] == 0)
365
19.2M
            continue;
366
587k
        unsigned int max_val = round2(T[i]);
367
587k
        int ns = 0;
368
2.53M
#define MAX(a,b) ((a)>(b)?(a):(b))
369
370
        // Number of samples that get their freq bumped to 1
371
587k
        int sm10 = 0, sm12 = 0;
372
150M
        for (j = 0; j < 256; j++) {
373
150M
            if (F[i][j] && max_val / F[i][j] > TOTFREQ_O1_FAST)
374
31.7k
                sm10++;
375
150M
            if (F[i][j] && max_val / F[i][j] > TOTFREQ_O1)
376
11.4k
                sm12++;
377
150M
        }
378
379
587k
        double l10 = log(TOTFREQ_O1_FAST + sm10);
380
587k
        double l12 = log(TOTFREQ_O1      + sm12);
381
587k
        double T_slow = (double)TOTFREQ_O1/T[i];
382
587k
        double T_fast = (double)TOTFREQ_O1_FAST/T[i];
383
384
150M
        for (j = 0; j < 256; j++) {
385
150M
            if (F[i][j]) {
386
1.26M
                ns++;
387
388
1.26M
                e10 -= F[i][j] * (fast_log(MAX(F[i][j]*T_fast,1)) - l10);
389
1.26M
                e12 -= F[i][j] * (fast_log(MAX(F[i][j]*T_slow,1)) - l12);
390
391
                // Estimation of compressed symbol freq table too.
392
1.26M
                e10 += 1.3;
393
1.26M
                e12 += 4.7;
394
1.26M
            }
395
150M
        }
396
397
        // Order-1 frequencies often end up totalling under TOTFREQ.
398
        // In this case it's smaller to output the real frequencies
399
        // prior to normalisation and normalise after (with an extra
400
        // normalisation step needed in the decoder too).
401
        //
402
        // Thus we normalise to a power of 2 only, store those,
403
        // and renormalise later here (and in decoder) by bit-shift
404
        // to get to the fixed size.
405
587k
        if (ns < 64 && max_val > 128) max_val /= 2;
406
587k
        if (max_val > 1024)           max_val /= 2;
407
587k
        if (max_val > TOTFREQ_O1)     max_val = TOTFREQ_O1;
408
587k
        S[i] = max_val; // scale to max this
409
587k
        if (max_tot < max_val)
410
134k
            max_tot = max_val;
411
587k
    }
412
77.3k
    int shift = e10/e12 < 1.01 || max_tot <= TOTFREQ_O1_FAST
413
77.3k
        ? TF_SHIFT_O1_FAST
414
77.3k
        : TF_SHIFT_O1;
415
416
//    fprintf(stderr, "e10/12 = %f %f %f, shift %d\n",
417
//          e10/log(256), e12/log(256), e10/e12, shift);
418
419
77.3k
    return shift;
420
77.3k
}
421
422
static
423
unsigned char *rans_compress_O1_4x16(unsigned char *in, unsigned int in_size,
424
75.5k
                                     unsigned char *out, unsigned int *out_size) {
425
75.5k
    unsigned char *cp, *out_end, *out_free = NULL;
426
75.5k
    unsigned int tab_size;
427
    
428
    // -20 for order/size/meta
429
75.5k
    uint32_t bound = rans_compress_bound_4x16(in_size,1)-20;
430
431
75.5k
    if (!out) {
432
0
        *out_size = bound;
433
0
        out_free = out = malloc(*out_size);
434
0
    }
435
75.5k
    if (!out || bound > *out_size)
436
0
        return NULL;
437
438
75.5k
    if (((size_t)out)&1)
439
20.5k
        bound--;
440
75.5k
    out_end = out + bound;
441
442
75.5k
    RansEncSymbol (*syms)[256] = htscodecs_tls_alloc(256 * (sizeof(*syms)));
443
75.5k
    if (!syms) {
444
0
        free(out_free);
445
0
        return NULL;
446
0
    }
447
448
75.5k
    cp = out;
449
75.5k
    int shift = encode_freq1(in, in_size, 4, syms, &cp); 
450
75.5k
    if (shift < 0) {
451
0
        htscodecs_tls_free(syms);
452
0
        return NULL;
453
0
    }
454
75.5k
    tab_size = cp - out;
455
456
75.5k
    RansState rans0, rans1, rans2, rans3;
457
75.5k
    RansEncInit(&rans0);
458
75.5k
    RansEncInit(&rans1);
459
75.5k
    RansEncInit(&rans2);
460
75.5k
    RansEncInit(&rans3);
461
462
75.5k
    uint8_t* ptr = out_end;
463
464
75.5k
    int isz4 = in_size>>2;
465
75.5k
    int i0 = 1*isz4-2;
466
75.5k
    int i1 = 2*isz4-2;
467
75.5k
    int i2 = 3*isz4-2;
468
75.5k
    int i3 = 4*isz4-2;
469
470
75.5k
    unsigned char l0 = in[i0+1];
471
75.5k
    unsigned char l1 = in[i1+1];
472
75.5k
    unsigned char l2 = in[i2+1];
473
75.5k
    unsigned char l3 = in[i3+1];
474
475
    // Deal with the remainder
476
75.5k
    l3 = in[in_size-1];
477
175k
    for (i3 = in_size-2; i3 > 4*isz4-2; i3--) {
478
99.8k
        unsigned char c3 = in[i3];
479
99.8k
        RansEncPutSymbol(&rans3, &ptr, &syms[c3][l3]);
480
99.8k
        l3 = c3;
481
99.8k
    }
482
483
883M
    for (; i0 >= 0; i0--, i1--, i2--, i3--) {
484
883M
        unsigned char c0, c1, c2, c3;
485
883M
        RansEncSymbol *s3 = &syms[c3 = in[i3]][l3];
486
883M
        RansEncSymbol *s2 = &syms[c2 = in[i2]][l2];
487
883M
        RansEncSymbol *s1 = &syms[c1 = in[i1]][l1];
488
883M
        RansEncSymbol *s0 = &syms[c0 = in[i0]][l0];
489
490
883M
        RansEncPutSymbol(&rans3, &ptr, s3);
491
883M
        RansEncPutSymbol(&rans2, &ptr, s2);
492
883M
        RansEncPutSymbol(&rans1, &ptr, s1);
493
883M
        RansEncPutSymbol(&rans0, &ptr, s0);
494
495
883M
        l0 = c0;
496
883M
        l1 = c1;
497
883M
        l2 = c2;
498
883M
        l3 = c3;
499
883M
    }
500
501
75.5k
    RansEncPutSymbol(&rans3, &ptr, &syms[0][l3]);
502
75.5k
    RansEncPutSymbol(&rans2, &ptr, &syms[0][l2]);
503
75.5k
    RansEncPutSymbol(&rans1, &ptr, &syms[0][l1]);
504
75.5k
    RansEncPutSymbol(&rans0, &ptr, &syms[0][l0]);
505
506
75.5k
    RansEncFlush(&rans3, &ptr);
507
75.5k
    RansEncFlush(&rans2, &ptr);
508
75.5k
    RansEncFlush(&rans1, &ptr);
509
75.5k
    RansEncFlush(&rans0, &ptr);
510
511
75.5k
    *out_size = (out_end - ptr) + tab_size;
512
513
75.5k
    cp = out;
514
75.5k
    memmove(out + tab_size, ptr, out_end-ptr);
515
516
75.5k
    htscodecs_tls_free(syms);
517
75.5k
    return out;
518
75.5k
}
519
520
//#define MAGIC2 111
521
291k
#define MAGIC2 179
522
//#define MAGIC2 0
523
524
static
525
unsigned char *rans_uncompress_O1_4x16(unsigned char *in, unsigned int in_size,
526
1.14k
                                       unsigned char *out, unsigned int out_sz) {
527
1.14k
    if (in_size < 16) // 4-states at least
528
10
        return NULL;
529
530
1.13k
    if (out_sz >= INT_MAX)
531
0
        return NULL; // protect against some overflow cases
532
533
1.13k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
534
1.13k
    if (out_sz > 100000)
535
0
        return NULL;
536
1.13k
#endif
537
538
    /* Load in the static tables */
539
1.13k
    unsigned char *cp = in, *cp_end = in+in_size, *out_free = NULL;
540
1.13k
    unsigned char *c_freq = NULL;
541
1.13k
    int i, j = -999;
542
1.13k
    unsigned int x;
543
544
1.13k
    uint8_t *sfb_ = htscodecs_tls_alloc(256*(TOTFREQ_O1+MAGIC2)*sizeof(*sfb_));
545
1.13k
    uint32_t (*s3)[TOTFREQ_O1_FAST] = (uint32_t (*)[TOTFREQ_O1_FAST])sfb_;
546
    // reuse the same memory for the fast mode lookup, but this only works
547
    // if we're on e.g. 12-bit freqs vs 10-bit freqs as needs 4x larger array.
548
    //uint32_t s3[256][TOTFREQ_O1_FAST];
549
550
1.13k
    if (!sfb_)
551
0
        return NULL;
552
1.13k
    fb_t (*fb)[256] = htscodecs_tls_alloc(256 * sizeof(*fb));
553
1.13k
    if (!fb)
554
0
        goto err;
555
1.13k
    uint8_t *sfb[256];
556
1.13k
    if ((*cp >> 4) == TF_SHIFT_O1) {
557
51.9k
        for (i = 0; i < 256; i++)
558
51.7k
            sfb[i]=  sfb_ + i*(TOTFREQ_O1+MAGIC2);
559
932
    } else {
560
239k
        for (i = 0; i < 256; i++)
561
238k
            sfb[i]=  sfb_ + i*(TOTFREQ_O1_FAST+MAGIC2);
562
932
    }
563
564
1.13k
    if (!out)
565
0
        out_free = out = malloc(out_sz);
566
567
1.13k
    if (!out)
568
0
        goto err;
569
570
    //fprintf(stderr, "out_sz=%d\n", out_sz);
571
572
    // compressed header? If so uncompress it
573
1.13k
    unsigned char *tab_end = NULL;
574
1.13k
    unsigned char *c_freq_end = cp_end;
575
1.13k
    unsigned int shift = *cp >> 4;
576
1.13k
    if (*cp++ & 1) {
577
81
        uint32_t u_freq_sz, c_freq_sz;
578
81
        cp += var_get_u32(cp, cp_end, &u_freq_sz);
579
81
        cp += var_get_u32(cp, cp_end, &c_freq_sz);
580
81
        if (c_freq_sz > cp_end - cp)
581
18
            goto err;
582
63
        tab_end = cp + c_freq_sz;
583
63
        if (!(c_freq = rans_uncompress_O0_4x16(cp, c_freq_sz, NULL, u_freq_sz)))
584
46
            goto err;
585
17
        cp = c_freq;
586
17
        c_freq_end = c_freq + u_freq_sz;
587
17
    }
588
589
    // Decode order-0 symbol list; avoids needing in order-1 tables
590
1.07k
    uint32_t F0[256] = {0};
591
1.07k
    int fsz = decode_alphabet(cp, c_freq_end, F0);
592
1.07k
    if (!fsz)
593
19
        goto err;
594
1.05k
    cp += fsz;
595
596
1.05k
    if (cp >= c_freq_end)
597
19
        goto err;
598
599
1.03k
    const int s3_fast_on = in_size >= 100000;
600
601
235k
    for (i = 0; i < 256; i++) {
602
234k
        if (F0[i] == 0)
603
233k
            continue;
604
605
1.63k
        uint32_t F[256] = {0}, T = 0;
606
1.63k
        fsz = decode_freq_d(cp, c_freq_end, F0, F, &T);
607
1.63k
        if (!fsz)
608
3
            goto err;
609
1.63k
        cp += fsz;
610
611
1.63k
        if (!T) {
612
            //fprintf(stderr, "No freq for F_%d\n", i);
613
847
            continue;
614
847
        }
615
616
788
        normalise_freq_shift(F, T, 1<<shift);
617
618
        // Build symbols; fixme, do as part of decode, see the _d variant
619
178k
        for (j = x = 0; j < 256; j++) {
620
178k
            if (F[j]) {
621
1.86k
                if (F[j] > (1<<shift) - x)
622
123
                    goto err;
623
624
1.74k
                if (shift == TF_SHIFT_O1_FAST && s3_fast_on) {
625
0
                    int y;
626
0
                    for (y = 0; y < F[j]; y++)
627
0
                        s3[i][y+x] = (((uint32_t)F[j])<<(shift+8)) |(y<<8) |j;
628
1.74k
                } else {
629
1.74k
                    memset(&sfb[i][x], j, F[j]);
630
1.74k
                    fb[i][j].f = F[j];
631
1.74k
                    fb[i][j].b = x;
632
1.74k
                }
633
1.74k
                x += F[j];
634
1.74k
            }
635
178k
        }
636
665
        if (x != (1<<shift))
637
0
            goto err;
638
665
    }
639
640
906
    if (tab_end)
641
0
        cp = tab_end;
642
906
    free(c_freq);
643
906
    c_freq = NULL;
644
645
906
    if (cp+16 > cp_end)
646
6
        goto err;
647
648
900
    RansState rans0, rans1, rans2, rans3;
649
900
    uint8_t *ptr = cp, *ptr_end = in + in_size - 8;
650
900
    RansDecInit(&rans0, &ptr); if (rans0 < RANS_BYTE_L) goto err;
651
894
    RansDecInit(&rans1, &ptr); if (rans1 < RANS_BYTE_L) goto err;
652
892
    RansDecInit(&rans2, &ptr); if (rans2 < RANS_BYTE_L) goto err;
653
892
    RansDecInit(&rans3, &ptr); if (rans3 < RANS_BYTE_L) goto err;
654
655
889
    unsigned int isz4 = out_sz>>2;
656
889
    int l0 = 0, l1 = 0, l2 = 0, l3 = 0;
657
889
    unsigned int i4[] = {0*isz4, 1*isz4, 2*isz4, 3*isz4};
658
659
889
    RansState R[4];
660
889
    R[0] = rans0;
661
889
    R[1] = rans1;
662
889
    R[2] = rans2;
663
889
    R[3] = rans3;
664
665
    // Around 15% faster to specialise for 10/12 than to have one
666
    // loop with shift as a variable.
667
889
    if (shift == TF_SHIFT_O1) {
668
        // TF_SHIFT_O1 = 12
669
670
172
        const uint32_t mask = ((1u << TF_SHIFT_O1)-1);
671
38.9k
        for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) {
672
38.7k
            uint16_t m, c;
673
38.7k
            c = sfb[l0][m = R[0] & mask];
674
38.7k
            R[0] = fb[l0][c].f * (R[0]>>TF_SHIFT_O1) + m - fb[l0][c].b;
675
38.7k
            out[i4[0]] = l0 = c;
676
677
38.7k
            c = sfb[l1][m = R[1] & mask];
678
38.7k
            R[1] = fb[l1][c].f * (R[1]>>TF_SHIFT_O1) + m - fb[l1][c].b;
679
38.7k
            out[i4[1]] = l1 = c;
680
681
38.7k
            c = sfb[l2][m = R[2] & mask];
682
38.7k
            R[2] = fb[l2][c].f * (R[2]>>TF_SHIFT_O1) + m - fb[l2][c].b;
683
38.7k
            out[i4[2]] = l2 = c;
684
685
38.7k
            c = sfb[l3][m = R[3] & mask];
686
38.7k
            R[3] = fb[l3][c].f * (R[3]>>TF_SHIFT_O1) + m - fb[l3][c].b;
687
38.7k
            out[i4[3]] = l3 = c;
688
689
38.7k
            if (ptr < ptr_end) {
690
2.08k
                RansDecRenorm(&R[0], &ptr);
691
2.08k
                RansDecRenorm(&R[1], &ptr);
692
2.08k
                RansDecRenorm(&R[2], &ptr);
693
2.08k
                RansDecRenorm(&R[3], &ptr);
694
36.7k
            } else {
695
36.7k
                RansDecRenormSafe(&R[0], &ptr, ptr_end+8);
696
36.7k
                RansDecRenormSafe(&R[1], &ptr, ptr_end+8);
697
36.7k
                RansDecRenormSafe(&R[2], &ptr, ptr_end+8);
698
36.7k
                RansDecRenormSafe(&R[3], &ptr, ptr_end+8);
699
36.7k
            }
700
38.7k
        }
701
702
        // Remainder
703
538
        for (; i4[3] < out_sz; i4[3]++) {
704
366
            uint32_t m3 = R[3] & ((1u<<TF_SHIFT_O1)-1);
705
366
            unsigned char c3 = sfb[l3][m3];
706
366
            out[i4[3]] = c3;
707
366
            R[3] = fb[l3][c3].f * (R[3]>>TF_SHIFT_O1) + m3 - fb[l3][c3].b;
708
366
            RansDecRenormSafe(&R[3], &ptr, ptr_end + 8);
709
366
            l3 = c3;
710
366
        }
711
717
    } else if (!s3_fast_on) {
712
        // TF_SHIFT_O1 = 10 with sfb[256][1024] & fb[256]256] array lookup
713
        // Slightly faster for -o193 on q4 (high comp), but also less
714
        // initialisation cost for smaller data
715
717
        const uint32_t mask = ((1u << TF_SHIFT_O1_FAST)-1);
716
255k
        for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) {
717
254k
            uint16_t m, c;
718
254k
            c = sfb[l0][m = R[0] & mask];
719
254k
            R[0] = fb[l0][c].f * (R[0]>>TF_SHIFT_O1_FAST) + m - fb[l0][c].b;
720
254k
            out[i4[0]] = l0 = c;
721
722
254k
            c = sfb[l1][m = R[1] & mask];
723
254k
            R[1] = fb[l1][c].f * (R[1]>>TF_SHIFT_O1_FAST) + m - fb[l1][c].b;
724
254k
            out[i4[1]] = l1 = c;
725
726
254k
            c = sfb[l2][m = R[2] & mask];
727
254k
            R[2] = fb[l2][c].f * (R[2]>>TF_SHIFT_O1_FAST) + m - fb[l2][c].b;
728
254k
            out[i4[2]] = l2 = c;
729
730
254k
            c = sfb[l3][m = R[3] & mask];
731
254k
            R[3] = fb[l3][c].f * (R[3]>>TF_SHIFT_O1_FAST) + m - fb[l3][c].b;
732
254k
            out[i4[3]] = l3 = c;
733
734
254k
            if (ptr < ptr_end) {
735
10.6k
                RansDecRenorm(&R[0], &ptr);
736
10.6k
                RansDecRenorm(&R[1], &ptr);
737
10.6k
                RansDecRenorm(&R[2], &ptr);
738
10.6k
                RansDecRenorm(&R[3], &ptr);
739
244k
            } else {
740
244k
                RansDecRenormSafe(&R[0], &ptr, ptr_end+8);
741
244k
                RansDecRenormSafe(&R[1], &ptr, ptr_end+8);
742
244k
                RansDecRenormSafe(&R[2], &ptr, ptr_end+8);
743
244k
                RansDecRenormSafe(&R[3], &ptr, ptr_end+8);
744
244k
            }
745
254k
        }
746
747
        // Remainder
748
1.54k
        for (; i4[3] < out_sz; i4[3]++) {
749
826
            uint32_t m3 = R[3] & ((1u<<TF_SHIFT_O1_FAST)-1);
750
826
            unsigned char c3 = sfb[l3][m3];
751
826
            out[i4[3]] = c3;
752
826
            R[3] = fb[l3][c3].f * (R[3]>>TF_SHIFT_O1_FAST) + m3 - fb[l3][c3].b;
753
826
            RansDecRenormSafe(&R[3], &ptr, ptr_end + 8);
754
826
            l3 = c3;
755
826
        }
756
717
    } else {
757
        // TF_SHIFT_O1_FAST.
758
        // Significantly faster for -o1 on q40 (low comp).
759
        // Higher initialisation cost, so only use if big blocks.
760
0
        const uint32_t mask = ((1u << TF_SHIFT_O1_FAST)-1);
761
0
        for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) {
762
0
            uint32_t S0 = s3[l0][R[0] & mask];
763
0
            uint32_t S1 = s3[l1][R[1] & mask];
764
0
            l0 = out[i4[0]] = S0;
765
0
            l1 = out[i4[1]] = S1;
766
0
            uint16_t F0 = S0>>(TF_SHIFT_O1_FAST+8);
767
0
            uint16_t F1 = S1>>(TF_SHIFT_O1_FAST+8);
768
0
            uint16_t B0 = (S0>>8) & mask;
769
0
            uint16_t B1 = (S1>>8) & mask;
770
771
0
            R[0] = F0 * (R[0]>>TF_SHIFT_O1_FAST) + B0;
772
0
            R[1] = F1 * (R[1]>>TF_SHIFT_O1_FAST) + B1;
773
774
0
            uint32_t S2 = s3[l2][R[2] & mask];
775
0
            uint32_t S3 = s3[l3][R[3] & mask];
776
0
            l2 = out[i4[2]] = S2;
777
0
            l3 = out[i4[3]] = S3;
778
0
            uint16_t F2 = S2>>(TF_SHIFT_O1_FAST+8);
779
0
            uint16_t F3 = S3>>(TF_SHIFT_O1_FAST+8);
780
0
            uint16_t B2 = (S2>>8) & mask;
781
0
            uint16_t B3 = (S3>>8) & mask;
782
783
0
            R[2] = F2 * (R[2]>>TF_SHIFT_O1_FAST) + B2;
784
0
            R[3] = F3 * (R[3]>>TF_SHIFT_O1_FAST) + B3;
785
786
0
            if (ptr < ptr_end) {
787
0
                RansDecRenorm(&R[0], &ptr);
788
0
                RansDecRenorm(&R[1], &ptr);
789
0
                RansDecRenorm(&R[2], &ptr);
790
0
                RansDecRenorm(&R[3], &ptr);
791
0
            } else {
792
0
                RansDecRenormSafe(&R[0], &ptr, ptr_end+8);
793
0
                RansDecRenormSafe(&R[1], &ptr, ptr_end+8);
794
0
                RansDecRenormSafe(&R[2], &ptr, ptr_end+8);
795
0
                RansDecRenormSafe(&R[3], &ptr, ptr_end+8);
796
0
            }
797
0
        }
798
799
        // Remainder
800
0
        for (; i4[3] < out_sz; i4[3]++) {
801
0
            uint32_t S = s3[l3][R[3] & ((1u<<TF_SHIFT_O1_FAST)-1)];
802
0
            l3 = out[i4[3]] = S;
803
0
            R[3] = (S>>(TF_SHIFT_O1_FAST+8)) * (R[3]>>TF_SHIFT_O1_FAST)
804
0
                + ((S>>8) & ((1u<<TF_SHIFT_O1_FAST)-1));
805
0
            RansDecRenormSafe(&R[3], &ptr, ptr_end + 8);
806
0
        }
807
0
    }
808
    //fprintf(stderr, "    1 Decoded %d bytes\n", (int)(ptr-in)); //c-size
809
810
889
    htscodecs_tls_free(fb);
811
889
    htscodecs_tls_free(sfb_);
812
889
    return out;
813
814
245
 err:
815
245
    htscodecs_tls_free(fb);
816
245
    htscodecs_tls_free(sfb_);
817
245
    free(out_free);
818
245
    free(c_freq);
819
820
245
    return NULL;
821
892
}
822
823
/*-----------------------------------------------------------------------------
824
 * r32x16 implementation, included here for now for simplicity
825
 */
826
#include "rANS_static32x16pr.h"
827
828
static int rans_cpu = 0xFFFF; // all
829
830
#if defined(__x86_64__) && \
831
    defined(HAVE_DECL___CPUID_COUNT)   && HAVE_DECL___CPUID_COUNT && \
832
    defined(HAVE_DECL___GET_CPUID_MAX) && HAVE_DECL___GET_CPUID_MAX
833
#include <cpuid.h>
834
835
#if defined(__clang__) && defined(__has_attribute)
836
#  if __has_attribute(unused)
837
#    define UNUSED __attribute__((unused))
838
#  else
839
#    define UNUSED
840
#  endif
841
#elif defined(__GNUC__) && __GNUC__ >= 3
842
#  define UNUSED __attribute__((unused))
843
#else
844
#  define UNUSED
845
#endif
846
847
#if defined(__APPLE__)
848
/*
849
   MacOS before 12.2 (a.k.a. Darwin 21.3) had a bug that could cause random
850
   failures on some AVX512 operations due to opmask registers not being restored
851
   correctly following interrupts.  For simplicity, check that the major version
852
   is 22 or higher before using AVX512.
853
   See https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259
854
*/
855
856
#include <sys/utsname.h>
857
// Returns true when the Darwin kernel major version is >= 22 (macOS 12.2+),
// i.e. when the AVX512 opmask save/restore kernel bug described above is
// known to be fixed.  The uname() result is cached after the first
// successful query so repeated calls are cheap.
static inline int not_ancient_darwin(void) {
    static long darwin_major = 0;
    if (darwin_major == 0) {
        struct utsname un;
        if (uname(&un) == 0)
            darwin_major = strtol(un.release, NULL, 10);
    }
    return darwin_major >= 22 ? 1 : 0;
}
867
#else
868
0
// Non-Darwin builds: the macOS AVX512 opmask bug does not apply, so the
// check always passes.
static inline int not_ancient_darwin(void) {
    return 1;
}
871
#endif
872
873
874
// CPU detection is performed once.  NB this has an assumption that we're
// not migrating between processors with different instruction sets, but
// to date the only systems I know of that support this don't have different
// capabilities (that we use) per core.
878
#ifndef NO_THREADS
879
static pthread_once_t rans_cpu_once = PTHREAD_ONCE_INIT;
880
#endif
881
882
static int have_ssse3   UNUSED = 0;
883
static int have_sse4_1  UNUSED = 0;
884
static int have_popcnt  UNUSED = 0;
885
static int have_avx2    UNUSED = 0;
886
static int have_avx512f UNUSED = 0;
887
static int is_amd       UNUSED = 0;
888
889
#define HAVE_HTSCODECS_TLS_CPU_INIT
890
1
// Probe the x86-64 CPU once and record which SIMD instruction sets are
// usable, filling in the file-scope have_* flags and is_amd.
//
// CPUID feature bits alone are not sufficient for AVX/AVX512: the OS must
// also have enabled the extended register state, which is verified by
// reading XCR0 via XGETBV below.
static void htscodecs_tls_cpu_init(void) {
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    unsigned int have_xsave UNUSED = 0;
    unsigned int have_avx   UNUSED = 0;
    uint64_t xcr0 UNUSED = 0ULL;
    // XCR0 bit 2: OS saves/restores AVX (YMM) state
    const uint64_t xcr0_can_use_avx UNUSED = (1ULL << 2);
    // XCR0 bits 5-7: opmask, ZMM_Hi256 and Hi16_ZMM state (AVX512)
    const uint64_t xcr0_can_use_avx512 UNUSED = (7ULL << 5);
    // These may be unused, depending on HAVE_* config.h macros

    int level = __get_cpuid_max(0, NULL);
    // Leaf 0: vendor string; ECX holds "cAMD" (0x444d4163) for AuthenticAMD.
    __cpuid_count(0, 0, eax, ebx, ecx, edx);
    is_amd = (ecx == 0x444d4163);
    if (level >= 1) {
        // Leaf 1: basic feature flags in ECX.
        __cpuid_count(1, 0, eax, ebx, ecx, edx);
#if defined(bit_SSSE3)
        have_ssse3 = ecx & bit_SSSE3;
#endif
#if defined(bit_POPCNT)
        have_popcnt = ecx & bit_POPCNT;
#endif
#if defined(bit_SSE4_1)
        have_sse4_1 = ecx & bit_SSE4_1;
#endif
#if defined(bit_AVX)
        have_avx = ecx & bit_AVX;
#endif
#if defined(bit_XSAVE) && defined(bit_OSXSAVE)
        have_xsave = (ecx & bit_XSAVE) && (ecx & bit_OSXSAVE);
        if (have_xsave) {
            /* OSXSAVE tells us it's safe to use XGETBV to read XCR0
               which then describes if AVX / AVX512 instructions can be
               executed.  See Intel 64 and IA-32 Architectures Software
               Developer's Manual Vol. 1 sections 13.2 and 13.3.

               Use inline assembly for XGETBV here to avoid problems
               with builtins either not working correctly, or requiring
               specific compiler options to be in use.  Also emit raw
               bytes here as older toolchains may not have the XGETBV
               instruction.
            */
            __asm__ volatile (".byte 0x0f, 0x01, 0xd0" :
                              "=d" (edx), "=a" (eax) :
                              "c" (0));
            xcr0 = ((uint64_t) edx << 32) | eax;
        }
#endif
    }
    // AVX2 and AVX512F depend on XSAVE, AVX and bit 2 of XCR0.
    if (level >= 7 && have_xsave && have_avx
        && (xcr0 & xcr0_can_use_avx) == xcr0_can_use_avx) {
        // Leaf 7: extended feature flags in EBX.
        __cpuid_count(7, 0, eax, ebx, ecx, edx);
#if defined(bit_AVX2)
        have_avx2 = ebx & bit_AVX2;
#endif
#if defined(bit_AVX512F)
        // AVX512 depends on bits 5:7 of XCR0
        if ((xcr0 & xcr0_can_use_avx512) == xcr0_can_use_avx512
            && not_ancient_darwin())
            have_avx512f = ebx & bit_AVX512F;
#endif
    }

    // The SIMD implementations assume POPCNT (and SSSE3 for SSE4.1 paths),
    // so demote the flags if those prerequisites are absent.
    if (!have_popcnt) have_avx512f = have_avx2 = have_sse4_1 = 0;
    if (!have_ssse3)  have_sse4_1 = 0;
}
955
956
// Returns a pointer to the best available order-0/order-1 encoder.
// do_simd == 0 forces the scalar 4-way codec; otherwise we pick among the
// 32-way SIMD implementations based on the capabilities detected by
// htscodecs_tls_cpu_init, filtered by the rans_cpu test mask
// (see rans_set_cpu).
static inline
unsigned char *(*rans_enc_func(int do_simd, int order))
    (unsigned char *in,
     unsigned int in_size,
     unsigned char *out,
     unsigned int *out_size) {

    if (!do_simd) { // SIMD disabled
        return order & 1
            ? rans_compress_O1_4x16
            : rans_compress_O0_4x16;
    }

#ifdef NO_THREADS
    htscodecs_tls_cpu_init();
#else
    // Run CPU detection exactly once across all threads.  On failure the
    // have_* flags stay zero, so we fall through to scalar 32x16 code.
    int err = pthread_once(&rans_cpu_once, htscodecs_tls_cpu_init);
    if (err != 0) {
        fprintf(stderr, "Initialising TLS data failed: pthread_once: %s\n",
                strerror(err));
        fprintf(stderr, "Using scalar code only\n");
    }
#endif

    // Apply the rans_cpu test-override mask to the detected capabilities.
    int have_e_sse4_1  = have_sse4_1;
    int have_e_avx2    = have_avx2;
    int have_e_avx512f = have_avx512f;

    if (!(rans_cpu & RANS_CPU_ENC_AVX512)) have_e_avx512f = 0;
    if (!(rans_cpu & RANS_CPU_ENC_AVX2))   have_e_avx2    = 0;
    if (!(rans_cpu & RANS_CPU_ENC_SSE4))   have_e_sse4_1  = 0;

    if (order & 1) {
        // With simulated gathers, the AVX512 is now slower than AVX2, so
        // we avoid using it unless asking for the real avx512 gather.
        // Note for testing we do -c 0x0404 to enable AVX512 and disable AVX2.
        // We then need to call the avx512 func regardless.
        int use_gather;
#ifdef USE_GATHER
        use_gather = 1;
#else
        use_gather = !have_e_avx2;
#endif

#if defined(HAVE_AVX512)
        // On AMD CPUs AVX2 is preferred over AVX512 when both are enabled.
        if (have_e_avx512f && (!is_amd || !have_e_avx2) && use_gather)
            return rans_compress_O1_32x16_avx512;
#endif
#if defined(HAVE_AVX2)
        if (have_e_avx2)
            return rans_compress_O1_32x16_avx2;
#endif
#if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
        // NOTE(review): this returns the same generic 32x16 encoder as the
        // fallback below; presumably no dedicated SSE4 encoder exists
        // (the decoder path does have _sse4 variants) - confirm.
        if (have_e_sse4_1)
            return rans_compress_O1_32x16;
#endif
        return rans_compress_O1_32x16;
    } else {
#if defined(HAVE_AVX512)
        if (have_e_avx512f && (!is_amd || !have_e_avx2))
            return rans_compress_O0_32x16_avx512;
#endif
#if defined(HAVE_AVX2)
        if (have_e_avx2)
            return rans_compress_O0_32x16_avx2;
#endif
#if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
        if (have_e_sse4_1)
            return rans_compress_O0_32x16;
#endif
        return rans_compress_O0_32x16;
    }
}
1029
1030
// Returns a pointer to the matching order-0/order-1 decoder.
// Mirrors rans_enc_func: do_simd == 0 selects the scalar 4-way decoder,
// otherwise the best permitted 32-way SIMD variant, filtered by the
// rans_cpu test mask (see rans_set_cpu).
static inline
unsigned char *(*rans_dec_func(int do_simd, int order))
    (unsigned char *in,
     unsigned int in_size,
     unsigned char *out,
     unsigned int out_size) {

    if (!do_simd) { // SIMD disabled
        return order & 1
            ? rans_uncompress_O1_4x16
            : rans_uncompress_O0_4x16;
    }

#ifdef NO_THREADS
    htscodecs_tls_cpu_init();
#else
    // Run CPU detection exactly once; on failure the have_* flags remain
    // zero and we fall back to the scalar 32x16 decoders.
    int err = pthread_once(&rans_cpu_once, htscodecs_tls_cpu_init);
    if (err != 0) {
        fprintf(stderr, "Initialising TLS data failed: pthread_once: %s\n",
                strerror(err));
        fprintf(stderr, "Using scalar code only\n");
    }
#endif

    // Apply the rans_cpu test-override mask to the detected capabilities.
    int have_d_sse4_1  = have_sse4_1;
    int have_d_avx2    = have_avx2;
    int have_d_avx512f = have_avx512f;

    if (!(rans_cpu & RANS_CPU_DEC_AVX512)) have_d_avx512f = 0;
    if (!(rans_cpu & RANS_CPU_DEC_AVX2))   have_d_avx2    = 0;
    if (!(rans_cpu & RANS_CPU_DEC_SSE4))   have_d_sse4_1  = 0;

    if (order & 1) {
#if defined(HAVE_AVX512)
        if (have_d_avx512f)
            return rans_uncompress_O1_32x16_avx512;
#endif
#if defined(HAVE_AVX2)
        if (have_d_avx2)
            return rans_uncompress_O1_32x16_avx2;
#endif
#if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
        if (have_d_sse4_1)
            return rans_uncompress_O1_32x16_sse4;
#endif
        return rans_uncompress_O1_32x16;
    } else {
#if defined(HAVE_AVX512)
        if (have_d_avx512f)
            return rans_uncompress_O0_32x16_avx512;
#endif
#if defined(HAVE_AVX2)
        if (have_d_avx2)
            return rans_uncompress_O0_32x16_avx2;
#endif
#if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
        if (have_d_sse4_1)
            return rans_uncompress_O0_32x16_sse4;
#endif
        return rans_uncompress_O0_32x16;
    }
}
1092
1093
#elif defined(__ARM_NEON) && defined(__aarch64__)
1094
1095
#if defined(__linux__) || defined(__FreeBSD__)
1096
#include <sys/auxv.h>
1097
#elif defined(_WIN32)
1098
#include <processthreadsapi.h>
1099
#endif
1100
1101
// Runtime check for NEON / Advanced SIMD support on the current CPU.
// The enclosing #if guarantees __aarch64__, but the __arm__ branches keep
// this correct if the code is ever reused on 32-bit ARM.
static inline int have_neon(void) {
#if defined(__linux__) && defined(__arm__)
    // 32-bit Linux: NEON is optional; query the kernel via the ELF auxv.
    return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0;
#elif defined(__linux__) && defined(__aarch64__) && defined(HWCAP_ASIMD)
    // 64-bit Linux: Advanced SIMD hardware capability flag.
    return (getauxval(AT_HWCAP) & HWCAP_ASIMD) != 0;
#elif defined(__APPLE__)
    // All Apple arm64 hardware supports NEON.
    return 1;
#elif defined(__FreeBSD__) && defined(__arm__)
    // FreeBSD exposes the auxv via elf_aux_info() instead of getauxval().
    unsigned long cap;
    if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0;
    return (cap & HWCAP_NEON) != 0;
#elif defined(__FreeBSD__) && defined(__aarch64__) && defined(HWCAP_ASIMD)
    unsigned long cap;
    if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0;
    return (cap & HWCAP_ASIMD) != 0;
#elif defined(_WIN32)
    return IsProcessorFeaturePresent(PF_ARM_V8_INSTRUCTIONS_AVAILABLE) != 0;
#else
    // Unknown platform: be conservative and use scalar code.
    return 0;
#endif
}
1122
1123
static inline
1124
unsigned char *(*rans_enc_func(int do_simd, int order))
1125
    (unsigned char *in,
1126
     unsigned int in_size,
1127
     unsigned char *out,
1128
     unsigned int *out_size) {
1129
1130
    if (do_simd) {
1131
        if ((rans_cpu & RANS_CPU_ENC_NEON) && have_neon())
1132
            return order & 1
1133
                ? rans_compress_O1_32x16_neon
1134
                : rans_compress_O0_32x16_neon;
1135
        else
1136
            return order & 1
1137
                ? rans_compress_O1_32x16
1138
                : rans_compress_O0_32x16;
1139
    } else {
1140
        return order & 1
1141
            ? rans_compress_O1_4x16
1142
            : rans_compress_O0_4x16;
1143
    }
1144
}
1145
1146
static inline
1147
unsigned char *(*rans_dec_func(int do_simd, int order))
1148
    (unsigned char *in,
1149
     unsigned int in_size,
1150
     unsigned char *out,
1151
     unsigned int out_size) {
1152
1153
    if (do_simd) {
1154
        if ((rans_cpu & RANS_CPU_DEC_NEON) && have_neon())
1155
            return order & 1
1156
                ? rans_uncompress_O1_32x16_neon
1157
                : rans_uncompress_O0_32x16_neon;
1158
        else
1159
            return order & 1
1160
                ? rans_uncompress_O1_32x16
1161
                : rans_uncompress_O0_32x16;
1162
    } else {
1163
        return order & 1
1164
            ? rans_uncompress_O1_4x16
1165
            : rans_uncompress_O0_4x16;
1166
    }
1167
}
1168
1169
#else // !(defined(__GNUC__) && defined(__x86_64__)) && !defined(__ARM_NEON)
1170
1171
static inline
1172
unsigned char *(*rans_enc_func(int do_simd, int order))
1173
    (unsigned char *in,
1174
     unsigned int in_size,
1175
     unsigned char *out,
1176
     unsigned int *out_size) {
1177
1178
    if (do_simd) {
1179
        return order & 1
1180
            ? rans_compress_O1_32x16
1181
            : rans_compress_O0_32x16;
1182
    } else {
1183
        return order & 1
1184
            ? rans_compress_O1_4x16
1185
            : rans_compress_O0_4x16;
1186
    }
1187
}
1188
1189
static inline
1190
unsigned char *(*rans_dec_func(int do_simd, int order))
1191
    (unsigned char *in,
1192
     unsigned int in_size,
1193
     unsigned char *out,
1194
     unsigned int out_size) {
1195
1196
    if (do_simd) {
1197
        return order & 1
1198
            ? rans_uncompress_O1_32x16
1199
            : rans_uncompress_O0_32x16;
1200
    } else {
1201
        return order & 1
1202
            ? rans_uncompress_O1_4x16
1203
            : rans_uncompress_O0_4x16;
1204
    }
1205
}
1206
1207
#endif
1208
1209
// Test interface for restricting the auto-detection methods so we
1210
// can forcibly compare different implementations on the same machine.
1211
// See RANS_CPU_ defines in rANS_static4x16.h
1212
0
void rans_set_cpu(int opts) {
    // Replace the auto-detected capability mask wholesale with the
    // caller-supplied RANS_CPU_* flags.
    rans_cpu = opts;
#ifdef HAVE_HTSCODECS_TLS_CPU_INIT
    // Re-run detection so the have_* flags are populated even if the
    // pthread_once in rans_enc_func/rans_dec_func has not fired yet.
    // NOTE(review): this bypasses the once-guard, so it appears intended
    // for single-threaded test setup only - confirm before concurrent use.
    htscodecs_tls_cpu_init();
#endif
}
1218
1219
/*-----------------------------------------------------------------------------
1220
 * Simple interface to the order-0 vs order-1 encoders and decoders.
1221
 *
1222
 * Smallest is method, <in_size> <input>, so worst case 2 bytes longer.
1223
 */
1224
/*
 * Compresses in[0..in_size-1] with the 4x16 rANS codec family, emitting a
 * self-describing stream: a format byte, optionally the original size,
 * optional pack/RLE meta-data, then the entropy-coded (or raw) payload.
 *
 *   in       - data to compress
 *   in_size  - length of in; must be <= INT_MAX
 *   out      - output buffer, or NULL to have one allocated here
 *   out_size - in: capacity of out (when out != NULL); out: bytes written
 *   order    - RANS_ORDER_* bit-flags plus the entropy order in bits 0-1
 *
 * Returns out (or the newly allocated buffer, owned by the caller) on
 * success, or NULL on failure with *out_size set to 0.
 */
unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size,
                                     unsigned char *out,unsigned int *out_size,
                                     int order) {
    if (in_size > INT_MAX || (out && *out_size == 0)) {
        *out_size = 0;
        return NULL;
    }

#ifdef VALIDATE_RANS
    int orig_order = order;
    int orig_in_size = in_size;
    unsigned char *orig_in = in;
#endif

    unsigned int c_meta_len;
    uint8_t *meta = NULL, *rle = NULL, *packed = NULL;
    uint8_t *out_free = NULL;   // only set when we allocated out ourselves

    if (!out) {
        // Allocate a worst-case sized output buffer on the caller's behalf.
        *out_size = rans_compress_bound_4x16(in_size, order);
        if (*out_size == 0)
            return NULL;
        if (!(out_free = out = malloc(*out_size))) {
            *out_size = 0;
            return NULL;
        }
    }

    unsigned char *out_end = out + *out_size;

    // Permit 32-way unrolling for large blocks, paving the way for
    // AVX2 and AVX512 SIMD variants.
    if ((order & RANS_ORDER_SIMD_AUTO) && in_size >= 50000
        && !(order & RANS_ORDER_STRIPE))
        order |= X_32;

    // Tiny inputs gain nothing from striping or 32-way SIMD.
    if (in_size <= 20)
        order &= ~RANS_ORDER_STRIPE;
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (in_size <= 1000)
        order &= ~RANS_ORDER_X32;
#endif
    if (order & RANS_ORDER_STRIPE) {
        // STRIPE: de-interleave into N sub-streams (byte i goes to stream
        // i%N), compress each independently, and concatenate the results.
        int N = (order>>8) & 0xff;
        if (N == 0) N = 4; // default for compatibility with old tests

        if (N > in_size)
            N = in_size;

        unsigned char *transposed = malloc(in_size);
        unsigned int part_len[256];
        unsigned int idx[256];
        if (!transposed) {
            free(out_free);
            *out_size = 0;
            return NULL;
        }
        int i, j, x;

        for (i = 0; i < N; i++) {
            part_len[i] = in_size / N + ((in_size % N) > i);
            idx[i] = i ? idx[i-1] + part_len[i-1] : 0; // cumulative index
        }

        // Transpose in blocks of KN columns at a time for cache efficiency.
#define KN 8
        i = x = 0;
        if (in_size >= N*KN) {
            for (; i < in_size-N*KN;) {
                int k;
                unsigned char *ink = in+i;
                for (j = 0; j < N; j++)
                    for (k = 0; k < KN; k++)
                        transposed[idx[j]+x+k] = ink[j+N*k];
                x += KN; i+=N*KN;
            }
        }
#undef KN
        // Whole rows remaining after the unrolled loop.
        for (; i < in_size-N; i += N, x++) {
            for (j = 0; j < N; j++)
                transposed[idx[j]+x] = in[i+j];
        }

        // Final partial row (fewer than N bytes).
        for (; i < in_size; i += N, x++) {
            for (j = 0; i+j < in_size; j++)
                transposed[idx[j]+x] = in[i+j];
        }

        unsigned int olen2;
        unsigned char *out2, *out2_start;
        c_meta_len = 1;
        *out = order & ~RANS_ORDER_NOSZ;
        c_meta_len += var_put_u32(out+c_meta_len, out_end, in_size);
        if (c_meta_len >= *out_size) {
            free(out_free);
            free(transposed);
            *out_size = 0;
            return NULL;
        }

        out[c_meta_len++] = N;

        // Scratch copy of the best candidate compression seen so far.
        unsigned char *out_best = NULL;
        unsigned int out_best_len = 0;

        // Worst-case header: 1 format + 5 size + 1 N + 5 bytes per stream.
        out2_start = out2 = out+7+5*N; // shares a buffer with c_meta
        for (i = 0; i < N; i++) {
            // Brute force try all methods.
            uint8_t *r;
            int j, m[] = {1,64,128,0}, best_j = 0, best_sz = INT_MAX;
            for (j = 0; j < sizeof(m)/sizeof(*m); j++) {
                if ((order & m[j]) != m[j])
                    continue;

                // order-1 *only*; bit check above cannot elide order-0
                if ((order & RANS_ORDER_STRIPE_NO0) && (m[j]&1) == 0)
                    continue;

                if (out2 - out > *out_size)
                    continue; // an error, but caught in best_sz check later

                // Recurse per stream; NOSZ as the size is in our header.
                olen2 = *out_size - (out2 - out);
                r = rans_compress_to_4x16(transposed+idx[i], part_len[i],
                                          out2, &olen2,
                                          m[j] | RANS_ORDER_NOSZ
                                          | (order&RANS_ORDER_X32));
                if (r && olen2 && best_sz > olen2) {
                    best_sz = olen2;
                    best_j = j;
                    if (j < sizeof(m)/sizeof(*m) && olen2 > out_best_len) {
                        unsigned char *tmp = realloc(out_best, olen2);
                        if (!tmp) {
                            free(out_free);
                            free(transposed);
                            *out_size = 0;
                            return NULL;
                        }
                        out_best = tmp;
                        out_best_len = olen2;
                    }

                    // Cache a copy of the best so far
                    memcpy(out_best, out2, olen2);
                }
            }

            // No method succeeded for this stream: give up entirely.
            if (best_sz == INT_MAX) {
                free(out_best);
                free(out_free);
                free(transposed);
                *out_size = 0;
                return NULL;
            }

            if (best_j < sizeof(m)/sizeof(*m)) {
                // Copy the best compression to output buffer if not current
                memcpy(out2, out_best, best_sz);
                olen2 = best_sz;
            }

            out2 += olen2;
            c_meta_len += var_put_u32(out+c_meta_len, out_end, olen2);
        }
        if (out_best)
            free(out_best);

        // Close the gap between the (variable-length) header and payload.
        memmove(out+c_meta_len, out2_start, out2-out2_start);
        free(transposed);
        *out_size = c_meta_len + out2-out2_start;
        return out;
    }

    if (order & RANS_ORDER_CAT) {
        // CAT: store the data verbatim (header + raw copy).
        out[0] = RANS_ORDER_CAT;
        c_meta_len = 1;
        c_meta_len += var_put_u32(&out[1], out_end, in_size);

        if (c_meta_len + in_size > *out_size) {
            free(out_free);
            *out_size = 0;
            return NULL;
        }
        if (in_size)
            memcpy(out+c_meta_len, in, in_size);
        *out_size = c_meta_len + in_size;
        return out;
    }

    int do_pack = order & RANS_ORDER_PACK;
    int do_rle  = order & RANS_ORDER_RLE;
    int no_size = order & RANS_ORDER_NOSZ;
    int do_simd = order & RANS_ORDER_X32;

    out[0] = order;
    c_meta_len = 1;

    if (!no_size)
        c_meta_len += var_put_u32(&out[1], out_end, in_size);

    // Only the entropy order (bits 0-1) remains relevant from here on.
    order &= 3;

    // Format is compressed meta-data, compressed data.
    // Meta-data can be empty, pack, rle lengths, or pack + rle lengths.
    // Data is either the original data, bit-packed packed, rle literals or
    // packed + rle literals.

    if (do_pack && in_size) {
        // PACK 2, 4 or 8 symbols into one byte.
        int pmeta_len;
        uint64_t packed_len;
        if (c_meta_len + 256 > *out_size) {
            free(out_free);
            *out_size = 0;
            return NULL;
        }
        packed = hts_pack(in, in_size, out+c_meta_len, &pmeta_len, &packed_len);
        if (!packed) {
            // Too many distinct symbols to pack; clear the flag and carry on.
            out[0] &= ~RANS_ORDER_PACK;
            do_pack = 0;
            free(packed);
            packed = NULL;
        } else {
            // Compress the packed form instead of the original input.
            in = packed;
            in_size = packed_len;
            c_meta_len += pmeta_len;

            // Could derive this rather than storing verbatim.
            // Orig size * 8/nbits (+1 if not multiple of 8/n)
            int sz = var_put_u32(out+c_meta_len, out_end, in_size);
            c_meta_len += sz;
            *out_size -= sz;

            // Packed stream now too small to benefit from 32-way SIMD.
            if (do_simd && in_size < 32) {
                do_simd = 0;
                out[0] &= ~RANS_ORDER_X32;
            }
        }
    } else if (do_pack) {
        out[0] &= ~RANS_ORDER_PACK;
    }

    if (do_rle && in_size) {
        // RLE 'in' -> rle_length + rle_literals arrays
        unsigned int rmeta_len, c_rmeta_len;
        uint64_t rle_len;
        c_rmeta_len = in_size+257;
        if (!(meta = malloc(c_rmeta_len))) {
            free(out_free);
            *out_size = 0;
            return NULL;
        }

        uint8_t rle_syms[256];
        int rle_nsyms = 0;
        uint64_t rmeta_len64;
        rle = hts_rle_encode(in, in_size, meta, &rmeta_len64,
                             rle_syms, &rle_nsyms, NULL, &rle_len);
        // Prefix the run-length meta-data with the symbol table:
        // [nsyms][syms...][lengths...].
        // NOTE(review): meta is rearranged before the !rle check below;
        // assumes hts_rle_encode sets rmeta_len64/rle_nsyms even on
        // failure - confirm against its contract.
        memmove(meta+1+rle_nsyms, meta, rmeta_len64);
        meta[0] = rle_nsyms;
        memcpy(meta+1, rle_syms, rle_nsyms);
        rmeta_len = rmeta_len64 + rle_nsyms+1;

        if (!rle || rle_len + rmeta_len >= .99*in_size) {
            // Not worth the speed hit.
            out[0] &= ~RANS_ORDER_RLE;
            do_rle = 0;
            free(rle);
            rle = NULL;
        } else {
            // Compress lengths with O0 and literals with O0/O1 ("order" param)
            // rmeta_len*2: low bit 0 => compressed meta follows; see below.
            int sz = var_put_u32(out+c_meta_len, out_end, rmeta_len*2), sz2;
            sz += var_put_u32(out+c_meta_len+sz, out_end, rle_len);
            if ((c_meta_len+sz+5) > *out_size) {
                free(out_free);
                free(rle);
                free(meta);
                free(packed);
                *out_size = 0;
                return NULL;
            }
            c_rmeta_len = *out_size - (c_meta_len+sz+5);
            if (do_simd && (rmeta_len < 32 || rle_len < 32)) {
                do_simd = 0;
                out[0] &= ~RANS_ORDER_X32;
            }
            if (!rans_enc_func(do_simd, 0)(meta, rmeta_len, out+c_meta_len+sz+5, &c_rmeta_len)) {
                free(out_free);
                free(rle);
                free(meta);
                free(packed);
                *out_size = 0;
                return NULL;
            }
            if (c_rmeta_len < rmeta_len) {
                // Compressed meta won: rewrite its true length and compact.
                sz2 = var_put_u32(out+c_meta_len+sz, out_end, c_rmeta_len);
                memmove(out+c_meta_len+sz+sz2, out+c_meta_len+sz+5, c_rmeta_len);
            } else {
                // Uncompressed RLE meta-data as too small
                // (rmeta_len*2+1: low bit 1 flags "stored uncompressed").
                sz = var_put_u32(out+c_meta_len, out_end, rmeta_len*2+1);
                sz2 = var_put_u32(out+c_meta_len+sz, out_end, rle_len);
                memcpy(out+c_meta_len+sz+sz2, meta, rmeta_len);
                c_rmeta_len = rmeta_len;
            }

            c_meta_len += sz + sz2 + c_rmeta_len;

            // Entropy-code the RLE literals instead of the previous input.
            in = rle;
            in_size = rle_len;
        }

        free(meta);
    } else if (do_rle) {
        out[0] &= ~RANS_ORDER_RLE;
    }

    if (c_meta_len > *out_size) {
        free(out_free);
        free(rle);
        free(packed);
        *out_size = 0;
        return NULL;
    }

    *out_size -= c_meta_len;
    // Order-1 needs context; tiny inputs are better served by order-0.
    if (order && in_size < 8) {
        out[0] &= ~1;
        order  &= ~1;
    }

    // Main entropy-coding pass.
    if (!rans_enc_func(do_simd, order)(in, in_size, out+c_meta_len, out_size)) {
        free(out_free);
        free(rle);
        free(packed);
        *out_size = 0;
        return NULL;
    }

    if (*out_size >= in_size) {
        // Entropy coding expanded the data: store it verbatim (CAT) instead.
        out[0] &= ~3;
        out[0] |= RANS_ORDER_CAT | no_size;

        if (out + c_meta_len + in_size > out_end) {
            free(out_free);
            free(rle);
            free(packed);
            *out_size = 0;
            return NULL;
        }
        if (in_size)
            memcpy(out+c_meta_len, in, in_size);
        *out_size = in_size;
    }

    free(rle);
    free(packed);

    *out_size += c_meta_len;

// Validation mode
#ifdef VALIDATE_RANS
    unsigned int decoded_size = orig_in_size;
    unsigned char *decoded = malloc(decoded_size);
    decoded = rans_uncompress_to_4x16(out, *out_size,
                                      decoded, &decoded_size);
    if (!decoded ||
        decoded_size != orig_in_size ||
        memcmp(orig_in, decoded, orig_in_size) != 0) {
        fprintf(stderr, "rans round trip failed for order %d. Written to fd 5\n", orig_order);
        if (write(5, orig_in, orig_in_size) < 0)
            abort();
        abort();
    }
    free(decoded);
#endif

    return out;
}
1601
1602
unsigned char *rans_compress_4x16(unsigned char *in, unsigned int in_size,
1603
377k
                                  unsigned int *out_size, int order) {
1604
377k
    return rans_compress_to_4x16(in, in_size, NULL, out_size, order);
1605
377k
}
1606
1607
unsigned char *rans_uncompress_to_4x16(unsigned char *in,  unsigned int in_size,
1608
12.4k
                                       unsigned char *out, unsigned int *out_size) {
1609
12.4k
    unsigned char *in_end = in + in_size;
1610
12.4k
    unsigned char *out_free = NULL, *tmp_free = NULL, *meta_free = NULL;
1611
1612
12.4k
    if (in_size == 0)
1613
3
        return NULL;
1614
1615
12.4k
    if (*in & RANS_ORDER_STRIPE) {
1616
172
        unsigned int ulen, olen, c_meta_len = 1;
1617
172
        int i;
1618
172
        uint64_t clen_tot = 0;
1619
1620
        // Decode lengths
1621
172
        c_meta_len += var_get_u32(in+c_meta_len, in_end, &ulen);
1622
172
        if (c_meta_len >= in_size)
1623
1
            return NULL;
1624
171
        unsigned int N = in[c_meta_len++];
1625
171
        if (N < 1)  // Must be at least one stripe
1626
5
            return NULL;
1627
166
        unsigned int clenN[256], ulenN[256], idxN[256];
1628
166
        if (!out) {
1629
15
            if (ulen >= INT_MAX)
1630
1
                return NULL;
1631
14
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1632
14
            if (ulen > 100000)
1633
5
                return NULL;
1634
9
#endif
1635
9
            if (!(out_free = out = malloc(ulen))) {
1636
0
                return NULL;
1637
0
            }
1638
9
            *out_size = ulen;
1639
9
        }
1640
160
        if (ulen != *out_size) {
1641
7
            free(out_free);
1642
7
            return NULL;
1643
7
        }
1644
1645
1.37k
        for (i = 0; i < N; i++) {
1646
1.28k
            ulenN[i] = ulen / N + ((ulen % N) > i);
1647
1.28k
            idxN[i] = i ? idxN[i-1] + ulenN[i-1] : 0;
1648
1.28k
            c_meta_len += var_get_u32(in+c_meta_len, in_end, &clenN[i]);
1649
1.28k
            clen_tot += clenN[i];
1650
1.28k
            if (c_meta_len > in_size || clenN[i] > in_size || clenN[i] < 1) {
1651
64
                free(out_free);
1652
64
                return NULL;
1653
64
            }
1654
1.28k
        }
1655
1656
        // We can call this with a larger buffer, but once we've determined
1657
        // how much we really use we limit it so the recursion becomes easier
1658
        // to limit.
1659
89
        if (c_meta_len + clen_tot > in_size) {
1660
14
            free(out_free);
1661
14
            return NULL;
1662
14
        }
1663
75
        in_size = c_meta_len + clen_tot;
1664
1665
        //fprintf(stderr, "    stripe meta %d\n", c_meta_len); //c-size
1666
1667
        // Uncompress the N streams
1668
75
        unsigned char *outN = malloc(ulen);
1669
75
        if (!outN) {
1670
0
            free(out_free);
1671
0
            return NULL;
1672
0
        }
1673
257
        for (i = 0; i < N; i++) {
1674
203
            olen = ulenN[i];
1675
203
            if (in_size < c_meta_len) {
1676
0
                free(out_free);
1677
0
                free(outN);
1678
0
                return NULL;
1679
0
            }
1680
203
            if (!rans_uncompress_to_4x16(in+c_meta_len, in_size-c_meta_len, outN + idxN[i], &olen)
1681
182
                || olen != ulenN[i]) {
1682
21
                free(out_free);
1683
21
                free(outN);
1684
21
                return NULL;
1685
21
            }
1686
182
            c_meta_len += clenN[i];
1687
182
        }
1688
1689
54
        unstripe(out, outN, ulen, N, idxN);
1690
1691
54
        free(outN);
1692
54
        *out_size = ulen;
1693
54
        return out;
1694
75
    }
1695
1696
12.3k
    int order = *in++;  in_size--;
1697
12.3k
    int do_pack = order & RANS_ORDER_PACK;
1698
12.3k
    int do_rle  = order & RANS_ORDER_RLE;
1699
12.3k
    int do_cat  = order & RANS_ORDER_CAT;
1700
12.3k
    int no_size = order & RANS_ORDER_NOSZ;
1701
12.3k
    int do_simd = order & RANS_ORDER_X32;
1702
12.3k
    order &= 1;
1703
1704
12.3k
    int sz = 0;
1705
12.3k
    unsigned int osz;
1706
12.3k
    if (!no_size) {
1707
9.48k
        sz = var_get_u32(in, in_end, &osz);
1708
9.48k
    } else
1709
2.82k
        sz = 0, osz = *out_size;
1710
12.3k
    in += sz;
1711
12.3k
    in_size -= sz;
1712
1713
12.3k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1714
12.3k
    if (osz > 100000)
1715
41
        return NULL;
1716
12.2k
#endif
1717
1718
12.2k
    if (no_size && !out)
1719
0
        goto err; // Need one or the other
1720
1721
12.2k
    if (!out) {
1722
554
        *out_size = osz;
1723
554
        if (!(out = out_free = malloc(*out_size)))
1724
0
            return NULL;
1725
11.7k
    } else {
1726
11.7k
        if (*out_size < osz)
1727
3
        goto err;
1728
11.7k
        *out_size = osz;
1729
11.7k
    }
1730
1731
//    if (do_pack || do_rle) {
1732
//      in += sz; // size field not needed when pure rANS
1733
//      in_size -= sz;
1734
//    }
1735
1736
12.2k
    uint32_t c_meta_size = 0;
1737
12.2k
    unsigned int tmp1_size = *out_size;
1738
12.2k
    unsigned int tmp2_size = *out_size;
1739
12.2k
    unsigned int tmp3_size = *out_size;
1740
12.2k
    unsigned char *tmp1 = NULL, *tmp2 = NULL, *tmp3 = NULL, *tmp = NULL;
1741
1742
    // Need In, Out and Tmp buffers with temporary buffer of the same size
1743
    // as output.  All use rANS, but with optional transforms (none, RLE,
1744
    // Pack, or both).
1745
    //
1746
    //                    rans   unrle  unpack
1747
    // If none:     in -> out
1748
    // If RLE:      in -> tmp -> out
1749
    // If Pack:     in -> tmp        -> out
1750
    // If RLE+Pack: in -> out -> tmp -> out
1751
    //                    tmp1   tmp2   tmp3
1752
    //
1753
    // So rans is in   -> tmp1
1754
    // RLE     is tmp1 -> tmp2
1755
    // Unpack  is tmp2 -> tmp3
1756
1757
    // Format is meta data (Pack and RLE in that order if present),
1758
    // followed by rANS compressed data.
1759
1760
12.2k
    if (do_pack || do_rle) {
1761
1.84k
        if (!(tmp = tmp_free = malloc(*out_size)))
1762
0
            goto err;
1763
1.84k
        if (do_pack && do_rle) {
1764
44
            tmp1 = out;
1765
44
            tmp2 = tmp;
1766
44
            tmp3 = out;
1767
1.80k
        } else if (do_pack) {
1768
1.53k
            tmp1 = tmp;
1769
1.53k
            tmp2 = tmp1;
1770
1.53k
            tmp3 = out;
1771
1.53k
        } else if (do_rle) {
1772
266
            tmp1 = tmp;
1773
266
            tmp2 = out;
1774
266
            tmp3 = out;
1775
266
        }
1776
10.4k
    } else {
1777
        // neither
1778
10.4k
        tmp  = NULL;
1779
10.4k
        tmp1 = out;
1780
10.4k
        tmp2 = out;
1781
10.4k
        tmp3 = out;
1782
10.4k
    }
1783
1784
    // Decode the bit-packing map.
1785
12.2k
    uint8_t map[16] = {0};
1786
12.2k
    int npacked_sym = 0;
1787
12.2k
    uint64_t unpacked_sz = 0; // FIXME: rename to packed_per_byte
1788
12.2k
    if (do_pack) {
1789
1.58k
        c_meta_size = hts_unpack_meta(in, in_size, *out_size, map, &npacked_sym);
1790
1.58k
        if (c_meta_size == 0)
1791
3
            goto err;
1792
1793
1.57k
        unpacked_sz = osz;
1794
1.57k
        in      += c_meta_size;
1795
1.57k
        in_size -= c_meta_size;
1796
1797
        // New unpacked size.  We could derive this bit from *out_size
1798
        // and npacked_sym.
1799
1.57k
        unsigned int osz;
1800
1.57k
        sz = var_get_u32(in, in_end, &osz);
1801
1.57k
        in += sz;
1802
1.57k
        in_size -= sz;
1803
1.57k
        if (osz > tmp1_size)
1804
33
            goto err;
1805
1.54k
        tmp1_size = osz;
1806
1.54k
    }
1807
1808
12.2k
    uint8_t *meta = NULL;
1809
12.2k
    uint32_t u_meta_size = 0;
1810
12.2k
    if (do_rle) {
1811
        // Uncompress meta data
1812
310
        uint32_t c_meta_size, rle_len, sz;
1813
310
        sz  = var_get_u32(in,    in_end, &u_meta_size);
1814
310
        sz += var_get_u32(in+sz, in_end, &rle_len);
1815
310
        if (rle_len > tmp1_size) // should never grow
1816
24
            goto err;
1817
286
        if (u_meta_size & 1) {
1818
127
            meta = in + sz;
1819
127
            u_meta_size = u_meta_size/2 > (in_end-meta) ? (in_end-meta) : u_meta_size/2;
1820
127
            c_meta_size = u_meta_size;
1821
159
        } else {
1822
159
            sz += var_get_u32(in+sz, in_end, &c_meta_size);
1823
159
            u_meta_size /= 2;
1824
1825
159
            meta_free = meta = rans_dec_func(do_simd, 0)(in+sz, in_size-sz, NULL, u_meta_size);
1826
159
            if (!meta)
1827
111
                goto err;
1828
159
        }
1829
175
        if (c_meta_size+sz > in_size)
1830
1
            goto err;
1831
174
        in      += c_meta_size+sz;
1832
174
        in_size -= c_meta_size+sz;
1833
174
        tmp1_size = rle_len;
1834
174
    }
1835
    //fprintf(stderr, "    meta_size %d bytes\n", (int)(in - orig_in)); //c-size
1836
1837
    // uncompress RLE data.  in -> tmp1
1838
12.0k
    if (in_size) {
1839
12.0k
        if (do_cat) {
1840
            //fprintf(stderr, "    CAT %d\n", tmp1_size); //c-size
1841
921
            if (tmp1_size > in_size)
1842
12
                goto err;
1843
909
            if (tmp1_size > *out_size)
1844
0
                goto err;
1845
909
            memcpy(tmp1, in, tmp1_size);
1846
11.1k
        } else {
1847
11.1k
            tmp1 = rans_dec_func(do_simd, order)(in, in_size, tmp1, tmp1_size);
1848
11.1k
            if (!tmp1)
1849
771
                goto err;
1850
11.1k
        }
1851
12.0k
    } else {
1852
16
        tmp1_size = 0;
1853
16
    }
1854
11.3k
    tmp2_size = tmp3_size = tmp1_size;
1855
1856
11.3k
    if (do_rle) {
1857
        // Unpack RLE.  tmp1 -> tmp2.
1858
73
        if (u_meta_size == 0)
1859
2
            goto err;
1860
71
        uint64_t unrle_size = *out_size;
1861
71
        int rle_nsyms = *meta ? *meta : 256;
1862
71
        if (u_meta_size < 1+rle_nsyms)
1863
22
            goto err;
1864
49
        if (!hts_rle_decode(tmp1, tmp1_size,
1865
49
                            meta+1+rle_nsyms, u_meta_size-(1+rle_nsyms),
1866
49
                            meta+1, rle_nsyms, tmp2, &unrle_size))
1867
30
            goto err;
1868
19
        tmp3_size = tmp2_size = unrle_size;
1869
19
        free(meta_free);
1870
19
        meta_free = NULL;
1871
19
    }
1872
11.2k
    if (do_pack) {
1873
        // Unpack bits via pack-map.  tmp2 -> tmp3
1874
1.42k
        if (npacked_sym == 1)
1875
867
            unpacked_sz = tmp2_size;
1876
        //uint8_t *porig = unpack(tmp2, tmp2_size, unpacked_sz, npacked_sym, map);
1877
        //memcpy(tmp3, porig, unpacked_sz);
1878
1.42k
        if (!hts_unpack(tmp2, tmp2_size, tmp3, unpacked_sz, npacked_sym, map))
1879
14
            goto err;
1880
1.41k
        tmp3_size = unpacked_sz;
1881
1.41k
    }
1882
1883
11.2k
    if (tmp)
1884
1.42k
        free(tmp);
1885
1886
11.2k
    *out_size = tmp3_size;
1887
11.2k
    return tmp3;
1888
1889
1.02k
 err:
1890
1.02k
    free(meta_free);
1891
1.02k
    free(out_free);
1892
1.02k
    free(tmp_free);
1893
1.02k
    return NULL;
1894
11.2k
}
1895
1896
unsigned char *rans_uncompress_4x16(unsigned char *in, unsigned int in_size,
1897
571
                                    unsigned int *out_size) {
1898
    return rans_uncompress_to_4x16(in, in_size, NULL, out_size);
1899
571
}