Coverage Report

Created: 2026-01-09 06:27

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/htscodecs/htscodecs/rANS_static16_int.h
Line
Count
Source
1
#ifndef RANS_INTERNAL_H
2
#define RANS_INTERNAL_H
3
4
#include "config.h"
5
#include "varint.h"
6
#include "utils.h"
7
8
/*
9
 * Copyright (c) 2017-2022 Genome Research Ltd.
10
 * Author(s): James Bonfield
11
 *
12
 * Redistribution and use in source and binary forms, with or without
13
 * modification, are permitted provided that the following conditions are met:
14
 *
15
 *    1. Redistributions of source code must retain the above copyright notice,
16
 *       this list of conditions and the following disclaimer.
17
 *
18
 *    2. Redistributions in binary form must reproduce the above
19
 *       copyright notice, this list of conditions and the following
20
 *       disclaimer in the documentation and/or other materials provided
21
 *       with the distribution.
22
 *
23
 *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
24
 *       Institute nor the names of its contributors may be used to endorse
25
 *       or promote products derived from this software without specific
26
 *       prior written permission.
27
 *
28
 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
29
 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
30
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
32
 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
33
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
34
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
35
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
36
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
37
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
38
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39
 */
40
41
// Internal: common parts to all the rANSNx16pr implementations.
42
43
// As per standard rANS_static but using optional RLE or bit-packing
44
// techniques prior to entropy encoding.  This is a significant
45
// reduction in some data sets.
46
47
// top bits in order byte
48
#define X_PACK   0x80    // Pack 2,4,8 or infinite symbols into a byte.
49
#define X_RLE    0x40    // Run length encoding with runs & lits encoded separately
50
#define X_CAT    0x20    // Nop; for tiny segments where rANS overhead is too big
51
#define X_NOSZ   0x10    // Don't store the original size; used by STRIPE mode
52
#define X_STRIPE 0x08    // For N-byte integer data; rotate & encode N streams.
53
9.49k
#define X_32     0x04    // 32-way unrolling instead of 4-way
54
55
// Not part of the file format, but used to direct the encoder
56
#define X_SIMD_AUTO 0x100 // automatically enable X_32 if we deem it worthy
57
#define X_SW32_ENC  0x200 // forcibly use the software version of X_32
58
#define X_SW32_DEC  0x400 // forcibly use the software version of X_32
59
#define X_NO_AVX512 0x800 // turn off avx512, but permits AVX2
60
61
125k
#define TF_SHIFT 12
62
13.4k
#define TOTFREQ (1<<TF_SHIFT)
63
64
65
// 9-11 is considerably faster in the O1 variant due to reduced table size.
66
// We auto-tune between 10 and 12 though.  Anywhere from 9 to 14 are viable.
67
#ifndef TF_SHIFT_O1
68
30.6M
#define TF_SHIFT_O1 12
69
#endif
70
#ifndef TF_SHIFT_O1_FAST
71
56.2M
#define TF_SHIFT_O1_FAST 10
72
#endif
73
402k
#define TOTFREQ_O1 (1<<TF_SHIFT_O1)
74
779k
#define TOTFREQ_O1_FAST (1<<TF_SHIFT_O1_FAST)
75
76
unsigned char *rans_compress_O0_4x16(unsigned char *in, unsigned int in_size,
77
                                     unsigned char *out, unsigned int *out_size);
78
unsigned char *rans_uncompress_O0_4x16(unsigned char *in, unsigned int in_size,
79
                                       unsigned char *out, unsigned int out_sz);
80
81
int rans_compute_shift(uint32_t *F0, uint32_t (*F)[256], uint32_t *T,
82
                       uint32_t *S);
83
84
// Rounds to next power of 2.
85
// credit to http://graphics.stanford.edu/~seander/bithacks.html
86
1.59M
static inline uint32_t round2(uint32_t v) {
87
1.59M
    v--;
88
1.59M
    v |= v >> 1;
89
1.59M
    v |= v >> 2;
90
1.59M
    v |= v >> 4;
91
1.59M
    v |= v >> 8;
92
1.59M
    v |= v >> 16;
93
1.59M
    v++;
94
1.59M
    return v;
95
1.59M
}
rANS_static4x16pr.c:round2
Line
Count
Source
86
1.59M
static inline uint32_t round2(uint32_t v) {
87
1.59M
    v--;
88
1.59M
    v |= v >> 1;
89
1.59M
    v |= v >> 2;
90
1.59M
    v |= v >> 4;
91
1.59M
    v |= v >> 8;
92
1.59M
    v |= v >> 16;
93
1.59M
    v++;
94
1.59M
    return v;
95
1.59M
}
rANS_static32x16pr_avx2.c:round2
Line
Count
Source
86
4.55k
static inline uint32_t round2(uint32_t v) {
87
4.55k
    v--;
88
4.55k
    v |= v >> 1;
89
4.55k
    v |= v >> 2;
90
4.55k
    v |= v >> 4;
91
4.55k
    v |= v >> 8;
92
4.55k
    v |= v >> 16;
93
4.55k
    v++;
94
4.55k
    return v;
95
4.55k
}
Unexecuted instantiation: rANS_static32x16pr_avx512.c:round2
Unexecuted instantiation: rANS_static32x16pr_sse4.c:round2
Unexecuted instantiation: rANS_static32x16pr.c:round2
96
97
2.40M
static inline int normalise_freq(uint32_t *F, int size, uint32_t tot) {
98
2.40M
    int m, M, j, loop = 0;
99
2.40M
    uint64_t tr;
100
2.40M
    if (!size)
101
0
        return 0;
102
103
2.40M
 again:
104
2.40M
    tr = ((uint64_t)tot<<31)/size + (1<<30)/size;
105
106
617M
    for (size = m = M = j = 0; j < 256; j++) {
107
615M
        if (!F[j])
108
606M
            continue;
109
110
8.43M
        if (m < F[j])
111
3.15M
            m = F[j], M = j;
112
113
8.43M
        if ((F[j] = (F[j]*tr)>>31) == 0)
114
132k
            F[j] = 1;
115
8.43M
        size += F[j];
116
//      if (F[j] == tot)
117
//          F[j]--;
118
8.43M
    }
119
120
2.40M
    int adjust = tot - size;
121
2.40M
    if (adjust > 0) {
122
511k
        F[M] += adjust;
123
1.89M
    } else if (adjust < 0) {
124
7.47k
        if (F[M] > -adjust && (loop == 1 || F[M]/2 >= -adjust)) {
125
7.45k
            F[M] += adjust;
126
7.45k
        } else {
127
27
            if (loop < 1) {
128
27
                loop++;
129
27
                goto again;
130
27
            }
131
0
            adjust += F[M]-1;
132
0
            F[M] = 1;
133
0
            for (j = 0; adjust && j < 256; j++) {
134
0
                if (F[j] < 2) continue;
135
136
0
                int d = F[j] > -adjust;
137
0
                int m = d ? adjust : 1-F[j];
138
0
                F[j]   += m;
139
0
                adjust -= m;
140
0
            }
141
0
        }
142
7.47k
    }
143
144
    //printf("F[%d]=%d\n", M, F[M]);
145
2.40M
    return F[M]>0 ? 0 : -1;
146
2.40M
}
rANS_static4x16pr.c:normalise_freq
Line
Count
Source
97
2.35M
static inline int normalise_freq(uint32_t *F, int size, uint32_t tot) {
98
2.35M
    int m, M, j, loop = 0;
99
2.35M
    uint64_t tr;
100
2.35M
    if (!size)
101
0
        return 0;
102
103
2.35M
 again:
104
2.35M
    tr = ((uint64_t)tot<<31)/size + (1<<30)/size;
105
106
605M
    for (size = m = M = j = 0; j < 256; j++) {
107
603M
        if (!F[j])
108
595M
            continue;
109
110
8.08M
        if (m < F[j])
111
3.08M
            m = F[j], M = j;
112
113
8.08M
        if ((F[j] = (F[j]*tr)>>31) == 0)
114
79.1k
            F[j] = 1;
115
8.08M
        size += F[j];
116
//      if (F[j] == tot)
117
//          F[j]--;
118
8.08M
    }
119
120
2.35M
    int adjust = tot - size;
121
2.35M
    if (adjust > 0) {
122
497k
        F[M] += adjust;
123
1.85M
    } else if (adjust < 0) {
124
3.94k
        if (F[M] > -adjust && (loop == 1 || F[M]/2 >= -adjust)) {
125
3.93k
            F[M] += adjust;
126
3.93k
        } else {
127
7
            if (loop < 1) {
128
7
                loop++;
129
7
                goto again;
130
7
            }
131
0
            adjust += F[M]-1;
132
0
            F[M] = 1;
133
0
            for (j = 0; adjust && j < 256; j++) {
134
0
                if (F[j] < 2) continue;
135
136
0
                int d = F[j] > -adjust;
137
0
                int m = d ? adjust : 1-F[j];
138
0
                F[j]   += m;
139
0
                adjust -= m;
140
0
            }
141
0
        }
142
3.94k
    }
143
144
    //printf("F[%d]=%d\n", M, F[M]);
145
2.35M
    return F[M]>0 ? 0 : -1;
146
2.35M
}
rANS_static32x16pr_avx2.c:normalise_freq
Line
Count
Source
97
46.2k
static inline int normalise_freq(uint32_t *F, int size, uint32_t tot) {
98
46.2k
    int m, M, j, loop = 0;
99
46.2k
    uint64_t tr;
100
46.2k
    if (!size)
101
0
        return 0;
102
103
46.2k
 again:
104
46.2k
    tr = ((uint64_t)tot<<31)/size + (1<<30)/size;
105
106
11.8M
    for (size = m = M = j = 0; j < 256; j++) {
107
11.8M
        if (!F[j])
108
11.4M
            continue;
109
110
354k
        if (m < F[j])
111
75.5k
            m = F[j], M = j;
112
113
354k
        if ((F[j] = (F[j]*tr)>>31) == 0)
114
53.3k
            F[j] = 1;
115
354k
        size += F[j];
116
//      if (F[j] == tot)
117
//          F[j]--;
118
354k
    }
119
120
46.2k
    int adjust = tot - size;
121
46.2k
    if (adjust > 0) {
122
14.0k
        F[M] += adjust;
123
32.1k
    } else if (adjust < 0) {
124
3.53k
        if (F[M] > -adjust && (loop == 1 || F[M]/2 >= -adjust)) {
125
3.51k
            F[M] += adjust;
126
3.51k
        } else {
127
20
            if (loop < 1) {
128
20
                loop++;
129
20
                goto again;
130
20
            }
131
0
            adjust += F[M]-1;
132
0
            F[M] = 1;
133
0
            for (j = 0; adjust && j < 256; j++) {
134
0
                if (F[j] < 2) continue;
135
136
0
                int d = F[j] > -adjust;
137
0
                int m = d ? adjust : 1-F[j];
138
0
                F[j]   += m;
139
0
                adjust -= m;
140
0
            }
141
0
        }
142
3.53k
    }
143
144
    //printf("F[%d]=%d\n", M, F[M]);
145
46.2k
    return F[M]>0 ? 0 : -1;
146
46.2k
}
Unexecuted instantiation: rANS_static32x16pr_avx512.c:normalise_freq
Unexecuted instantiation: rANS_static32x16pr_sse4.c:normalise_freq
Unexecuted instantiation: rANS_static32x16pr.c:normalise_freq
147
148
// A specialised version of normalise_freq_shift where the input size
149
// is already normalised to a power of 2, meaning we can just perform
150
// shifts instead of hard to define multiplications and adjustments.
151
static inline void normalise_freq_shift(uint32_t *F, uint32_t size,
152
795k
                                        uint32_t max_tot) {
153
795k
    if (size == 0 || size == max_tot)
154
19.5k
        return;
155
156
775k
    int shift = 0, i;
157
6.92M
    while (size < max_tot)
158
6.15M
        size*=2, shift++;
159
160
199M
    for (i = 0; i < 256; i++)
161
198M
        F[i] <<= shift;
162
775k
}
rANS_static4x16pr.c:normalise_freq_shift
Line
Count
Source
152
752k
                                        uint32_t max_tot) {
153
752k
    if (size == 0 || size == max_tot)
154
14.3k
        return;
155
156
738k
    int shift = 0, i;
157
6.70M
    while (size < max_tot)
158
5.96M
        size*=2, shift++;
159
160
189M
    for (i = 0; i < 256; i++)
161
189M
        F[i] <<= shift;
162
738k
}
rANS_static32x16pr_avx2.c:normalise_freq_shift
Line
Count
Source
152
42.1k
                                        uint32_t max_tot) {
153
42.1k
    if (size == 0 || size == max_tot)
154
5.13k
        return;
155
156
37.0k
    int shift = 0, i;
157
223k
    while (size < max_tot)
158
186k
        size*=2, shift++;
159
160
9.50M
    for (i = 0; i < 256; i++)
161
9.47M
        F[i] <<= shift;
162
37.0k
}
Unexecuted instantiation: rANS_static32x16pr_avx512.c:normalise_freq_shift
Unexecuted instantiation: rANS_static32x16pr_sse4.c:normalise_freq_shift
Unexecuted instantiation: rANS_static32x16pr.c:normalise_freq_shift
163
164
// symbols only
165
912k
static inline int encode_alphabet(uint8_t *cp, uint32_t *F) {
166
912k
    uint8_t *op = cp;
167
912k
    int rle, j;
168
169
234M
    for (rle = j = 0; j < 256; j++) {
170
233M
        if (F[j]) {
171
            // j
172
4.20M
            if (rle) {
173
396k
                rle--;
174
3.81M
            } else {
175
3.81M
                *cp++ = j;
176
3.81M
                if (!rle && j && F[j-1])  {
177
861k
                    for(rle=j+1; rle<256 && F[rle]; rle++)
178
396k
                        ;
179
464k
                    rle -= j+1;
180
464k
                    *cp++ = rle;
181
464k
                }
182
                //fprintf(stderr, "%d: %d %d\n", j, rle, N[j]);
183
3.81M
            }
184
4.20M
        }
185
233M
    }
186
912k
    *cp++ = 0;
187
    
188
912k
    return cp - op;
189
912k
}
rANS_static4x16pr.c:encode_alphabet
Line
Count
Source
165
905k
static inline int encode_alphabet(uint8_t *cp, uint32_t *F) {
166
905k
    uint8_t *op = cp;
167
905k
    int rle, j;
168
169
232M
    for (rle = j = 0; j < 256; j++) {
170
231M
        if (F[j]) {
171
            // j
172
4.07M
            if (rle) {
173
337k
                rle--;
174
3.73M
            } else {
175
3.73M
                *cp++ = j;
176
3.73M
                if (!rle && j && F[j-1])  {
177
781k
                    for(rle=j+1; rle<256 && F[rle]; rle++)
178
337k
                        ;
179
443k
                    rle -= j+1;
180
443k
                    *cp++ = rle;
181
443k
                }
182
                //fprintf(stderr, "%d: %d %d\n", j, rle, N[j]);
183
3.73M
            }
184
4.07M
        }
185
231M
    }
186
905k
    *cp++ = 0;
187
    
188
905k
    return cp - op;
189
905k
}
rANS_static32x16pr_avx2.c:encode_alphabet
Line
Count
Source
165
6.68k
static inline int encode_alphabet(uint8_t *cp, uint32_t *F) {
166
6.68k
    uint8_t *op = cp;
167
6.68k
    int rle, j;
168
169
1.71M
    for (rle = j = 0; j < 256; j++) {
170
1.71M
        if (F[j]) {
171
            // j
172
137k
            if (rle) {
173
59.3k
                rle--;
174
77.7k
            } else {
175
77.7k
                *cp++ = j;
176
77.7k
                if (!rle && j && F[j-1])  {
177
80.5k
                    for(rle=j+1; rle<256 && F[rle]; rle++)
178
59.3k
                        ;
179
21.1k
                    rle -= j+1;
180
21.1k
                    *cp++ = rle;
181
21.1k
                }
182
                //fprintf(stderr, "%d: %d %d\n", j, rle, N[j]);
183
77.7k
            }
184
137k
        }
185
1.71M
    }
186
6.68k
    *cp++ = 0;
187
    
188
6.68k
    return cp - op;
189
6.68k
}
Unexecuted instantiation: rANS_static32x16pr_avx512.c:encode_alphabet
Unexecuted instantiation: rANS_static32x16pr_sse4.c:encode_alphabet
Unexecuted instantiation: rANS_static32x16pr.c:encode_alphabet
190
191
7.82k
static inline int decode_alphabet(uint8_t *cp, uint8_t *cp_end, uint32_t *F) {
192
7.82k
    if (cp == cp_end)
193
0
        return 0;
194
195
7.82k
    uint8_t *op = cp;
196
7.82k
    int rle = 0;
197
7.82k
    int j = *cp++;
198
7.82k
    if (cp+2 >= cp_end)
199
0
        goto carefully;
200
201
806k
    do {
202
806k
        F[j] = 1;
203
806k
        if (!rle && j+1 == *cp) {
204
2.50k
            j = *cp++;
205
2.50k
            rle = *cp++;
206
804k
        } else if (rle) {
207
142k
            rle--;
208
142k
            j++;
209
142k
            if (j > 255)
210
144
                return 0;
211
661k
        } else {
212
661k
            j = *cp++;
213
661k
        }
214
806k
    } while(j && cp+2 < cp_end);
215
216
7.67k
 carefully:
217
7.67k
    if (j) {
218
574
        do {
219
574
            F[j] = 1;
220
574
            if(cp >= cp_end) return 0;
221
574
            if (!rle && j+1 == *cp) {
222
8
                if (cp+1 >= cp_end) return 0;
223
0
                j = *cp++;
224
0
                rle = *cp++;
225
566
            } else if (rle) {
226
528
                rle--;
227
528
                j++;
228
528
                if (j > 255)
229
6
                    return 0;
230
528
            } else {
231
38
                if (cp >= cp_end) return 0;
232
38
                j = *cp++;
233
38
            }
234
574
        } while(j && cp < cp_end);
235
32
    }
236
237
7.66k
    return cp - op;
238
7.67k
}
rANS_static4x16pr.c:decode_alphabet
Line
Count
Source
191
2.77k
static inline int decode_alphabet(uint8_t *cp, uint8_t *cp_end, uint32_t *F) {
192
2.77k
    if (cp == cp_end)
193
0
        return 0;
194
195
2.77k
    uint8_t *op = cp;
196
2.77k
    int rle = 0;
197
2.77k
    int j = *cp++;
198
2.77k
    if (cp+2 >= cp_end)
199
0
        goto carefully;
200
201
593k
    do {
202
593k
        F[j] = 1;
203
593k
        if (!rle && j+1 == *cp) {
204
694
            j = *cp++;
205
694
            rle = *cp++;
206
593k
        } else if (rle) {
207
46.2k
            rle--;
208
46.2k
            j++;
209
46.2k
            if (j > 255)
210
105
                return 0;
211
546k
        } else {
212
546k
            j = *cp++;
213
546k
        }
214
593k
    } while(j && cp+2 < cp_end);
215
216
2.67k
 carefully:
217
2.67k
    if (j) {
218
4
        do {
219
4
            F[j] = 1;
220
4
            if(cp >= cp_end) return 0;
221
4
            if (!rle && j+1 == *cp) {
222
2
                if (cp+1 >= cp_end) return 0;
223
0
                j = *cp++;
224
0
                rle = *cp++;
225
2
            } else if (rle) {
226
0
                rle--;
227
0
                j++;
228
0
                if (j > 255)
229
0
                    return 0;
230
2
            } else {
231
2
                if (cp >= cp_end) return 0;
232
2
                j = *cp++;
233
2
            }
234
4
        } while(j && cp < cp_end);
235
2
    }
236
237
2.67k
    return cp - op;
238
2.67k
}
rANS_static32x16pr_avx2.c:decode_alphabet
Line
Count
Source
191
5.04k
static inline int decode_alphabet(uint8_t *cp, uint8_t *cp_end, uint32_t *F) {
192
5.04k
    if (cp == cp_end)
193
0
        return 0;
194
195
5.04k
    uint8_t *op = cp;
196
5.04k
    int rle = 0;
197
5.04k
    int j = *cp++;
198
5.04k
    if (cp+2 >= cp_end)
199
0
        goto carefully;
200
201
213k
    do {
202
213k
        F[j] = 1;
203
213k
        if (!rle && j+1 == *cp) {
204
1.81k
            j = *cp++;
205
1.81k
            rle = *cp++;
206
211k
        } else if (rle) {
207
96.1k
            rle--;
208
96.1k
            j++;
209
96.1k
            if (j > 255)
210
39
                return 0;
211
115k
        } else {
212
115k
            j = *cp++;
213
115k
        }
214
213k
    } while(j && cp+2 < cp_end);
215
216
5.00k
 carefully:
217
5.00k
    if (j) {
218
570
        do {
219
570
            F[j] = 1;
220
570
            if(cp >= cp_end) return 0;
221
570
            if (!rle && j+1 == *cp) {
222
6
                if (cp+1 >= cp_end) return 0;
223
0
                j = *cp++;
224
0
                rle = *cp++;
225
564
            } else if (rle) {
226
528
                rle--;
227
528
                j++;
228
528
                if (j > 255)
229
6
                    return 0;
230
528
            } else {
231
36
                if (cp >= cp_end) return 0;
232
36
                j = *cp++;
233
36
            }
234
570
        } while(j && cp < cp_end);
235
30
    }
236
237
4.99k
    return cp - op;
238
5.00k
}
Unexecuted instantiation: rANS_static32x16pr_avx512.c:decode_alphabet
Unexecuted instantiation: rANS_static32x16pr_sse4.c:decode_alphabet
Unexecuted instantiation: rANS_static32x16pr.c:decode_alphabet
239
240
808k
static inline int encode_freq(uint8_t *cp, uint32_t *F) {
241
808k
    uint8_t *op = cp;
242
808k
    int j;
243
244
808k
    cp += encode_alphabet(cp, F);
245
246
207M
    for (j = 0; j < 256; j++) {
247
206M
        if (F[j])
248
3.42M
            cp += var_put_u32(cp, NULL, F[j]);
249
206M
    }
250
251
808k
    return cp - op;
252
808k
}
rANS_static4x16pr.c:encode_freq
Line
Count
Source
240
803k
static inline int encode_freq(uint8_t *cp, uint32_t *F) {
241
803k
    uint8_t *op = cp;
242
803k
    int j;
243
244
803k
    cp += encode_alphabet(cp, F);
245
246
206M
    for (j = 0; j < 256; j++) {
247
205M
        if (F[j])
248
3.32M
            cp += var_put_u32(cp, NULL, F[j]);
249
205M
    }
250
251
803k
    return cp - op;
252
803k
}
rANS_static32x16pr_avx2.c:encode_freq
Line
Count
Source
240
4.55k
static inline int encode_freq(uint8_t *cp, uint32_t *F) {
241
4.55k
    uint8_t *op = cp;
242
4.55k
    int j;
243
244
4.55k
    cp += encode_alphabet(cp, F);
245
246
1.17M
    for (j = 0; j < 256; j++) {
247
1.16M
        if (F[j])
248
100k
            cp += var_put_u32(cp, NULL, F[j]);
249
1.16M
    }
250
251
4.55k
    return cp - op;
252
4.55k
}
Unexecuted instantiation: rANS_static32x16pr_avx512.c:encode_freq
Unexecuted instantiation: rANS_static32x16pr_sse4.c:encode_freq
Unexecuted instantiation: rANS_static32x16pr.c:encode_freq
253
254
static inline int decode_freq(uint8_t *cp, uint8_t *cp_end, uint32_t *F,
255
2.54k
                              uint32_t *fsum) {
256
2.54k
    if (cp == cp_end)
257
0
        return 0;
258
259
2.54k
    uint8_t *op = cp;
260
2.54k
    cp += decode_alphabet(cp, cp_end, F);
261
262
2.54k
    int j, tot = 0;
263
652k
    for (j = 0; j < 256; j++) {
264
650k
        if (F[j]) {
265
24.8k
            cp += var_get_u32(cp, cp_end, (unsigned int *)&F[j]);
266
24.8k
            tot += F[j];
267
24.8k
        }
268
650k
    }
269
270
2.54k
    *fsum = tot;
271
2.54k
    return cp - op;
272
2.54k
}
rANS_static4x16pr.c:decode_freq
Line
Count
Source
255
2.45k
                              uint32_t *fsum) {
256
2.45k
    if (cp == cp_end)
257
0
        return 0;
258
259
2.45k
    uint8_t *op = cp;
260
2.45k
    cp += decode_alphabet(cp, cp_end, F);
261
262
2.45k
    int j, tot = 0;
263
631k
    for (j = 0; j < 256; j++) {
264
628k
        if (F[j]) {
265
19.8k
            cp += var_get_u32(cp, cp_end, (unsigned int *)&F[j]);
266
19.8k
            tot += F[j];
267
19.8k
        }
268
628k
    }
269
270
2.45k
    *fsum = tot;
271
2.45k
    return cp - op;
272
2.45k
}
rANS_static32x16pr_avx2.c:decode_freq
Line
Count
Source
255
84
                              uint32_t *fsum) {
256
84
    if (cp == cp_end)
257
0
        return 0;
258
259
84
    uint8_t *op = cp;
260
84
    cp += decode_alphabet(cp, cp_end, F);
261
262
84
    int j, tot = 0;
263
21.5k
    for (j = 0; j < 256; j++) {
264
21.5k
        if (F[j]) {
265
4.96k
            cp += var_get_u32(cp, cp_end, (unsigned int *)&F[j]);
266
4.96k
            tot += F[j];
267
4.96k
        }
268
21.5k
    }
269
270
84
    *fsum = tot;
271
84
    return cp - op;
272
84
}
Unexecuted instantiation: rANS_static32x16pr_avx512.c:decode_freq
Unexecuted instantiation: rANS_static32x16pr_sse4.c:decode_freq
Unexecuted instantiation: rANS_static32x16pr.c:decode_freq
273
274
275
// Use the order-0 freqs in F0 to encode the order-1 stats in F.
276
// All symbols present in F are present in F0, but some in F0 will
277
// be empty in F.  Thus we run-length encode the 0 frequencies.
278
787k
static inline int encode_freq_d(uint8_t *cp, uint32_t *F0, uint32_t *F) {
279
787k
    uint8_t *op = cp;
280
787k
    int j, dz;
281
282
202M
    for (dz = j = 0; j < 256; j++) {
283
201M
        if (F0[j]) {
284
21.8M
            if (F[j] != 0) {
285
1.60M
                if (dz) {
286
                    // Replace dz zeros with zero + dz-1 run length
287
1.10M
                    cp -= dz-1;
288
1.10M
                    *cp++ = dz-1;
289
1.10M
                }
290
1.60M
                dz = 0;
291
1.60M
                cp += var_put_u32(cp, NULL, F[j]);
292
20.2M
            } else {
293
                //fprintf(stderr, "2: j=%d F0[j]=%d, F[j]=%d, dz=%d\n", j, F0[j], F[j], dz);
294
20.2M
                dz++;
295
20.2M
                *cp++ = 0;
296
20.2M
            }
297
21.8M
        }
298
201M
    }
299
    
300
787k
    if (dz) {
301
555k
        cp -= dz-1;
302
555k
        *cp++ = dz-1;
303
555k
    }
304
305
787k
    return cp - op;
306
787k
}
rANS_static4x16pr.c:encode_freq_d
Line
Count
Source
278
750k
static inline int encode_freq_d(uint8_t *cp, uint32_t *F0, uint32_t *F) {
279
750k
    uint8_t *op = cp;
280
750k
    int j, dz;
281
282
192M
    for (dz = j = 0; j < 256; j++) {
283
192M
        if (F0[j]) {
284
18.4M
            if (F[j] != 0) {
285
1.45M
                if (dz) {
286
                    // Replace dz zeros with zero + dz-1 run length
287
1.00M
                    cp -= dz-1;
288
1.00M
                    *cp++ = dz-1;
289
1.00M
                }
290
1.45M
                dz = 0;
291
1.45M
                cp += var_put_u32(cp, NULL, F[j]);
292
16.9M
            } else {
293
                //fprintf(stderr, "2: j=%d F0[j]=%d, F[j]=%d, dz=%d\n", j, F0[j], F[j], dz);
294
16.9M
                dz++;
295
16.9M
                *cp++ = 0;
296
16.9M
            }
297
18.4M
        }
298
192M
    }
299
    
300
750k
    if (dz) {
301
524k
        cp -= dz-1;
302
524k
        *cp++ = dz-1;
303
524k
    }
304
305
750k
    return cp - op;
306
750k
}
rANS_static32x16pr_avx2.c:encode_freq_d
Line
Count
Source
278
37.1k
static inline int encode_freq_d(uint8_t *cp, uint32_t *F0, uint32_t *F) {
279
37.1k
    uint8_t *op = cp;
280
37.1k
    int j, dz;
281
282
9.54M
    for (dz = j = 0; j < 256; j++) {
283
9.50M
        if (F0[j]) {
284
3.35M
            if (F[j] != 0) {
285
150k
                if (dz) {
286
                    // Replace dz zeros with zero + dz-1 run length
287
106k
                    cp -= dz-1;
288
106k
                    *cp++ = dz-1;
289
106k
                }
290
150k
                dz = 0;
291
150k
                cp += var_put_u32(cp, NULL, F[j]);
292
3.20M
            } else {
293
                //fprintf(stderr, "2: j=%d F0[j]=%d, F[j]=%d, dz=%d\n", j, F0[j], F[j], dz);
294
3.20M
                dz++;
295
3.20M
                *cp++ = 0;
296
3.20M
            }
297
3.35M
        }
298
9.50M
    }
299
    
300
37.1k
    if (dz) {
301
30.4k
        cp -= dz-1;
302
30.4k
        *cp++ = dz-1;
303
30.4k
    }
304
305
37.1k
    return cp - op;
306
37.1k
}
Unexecuted instantiation: rANS_static32x16pr_avx512.c:encode_freq_d
Unexecuted instantiation: rANS_static32x16pr_sse4.c:encode_freq_d
Unexecuted instantiation: rANS_static32x16pr.c:encode_freq_d
307
308
// Normalise frequency total T[i] to match TOTFREQ_O1 and encode.
309
// Also initialises the RansEncSymbol structs.
310
//
311
// Returns the desired TF_SHIFT; 10 or 12 bit, or -1 on error.
312
static inline int encode_freq1(uint8_t *in, uint32_t in_size, int Nway,
313
104k
                               RansEncSymbol syms[256][256], uint8_t **cp_p) {
314
104k
    int i, j, z;
315
104k
    uint8_t *out = *cp_p, *cp = out;
316
317
    // Compute O1 frequency statistics
318
104k
    uint32_t (*F)[256] = htscodecs_tls_calloc(256, (sizeof(*F)));
319
104k
    if (!F)
320
0
        return -1;
321
104k
    uint32_t T[256+MAGIC] = {0};
322
104k
    int isz4 = in_size/Nway;
323
104k
    if (hist1_4(in, in_size, F, T) < 0)
324
0
        goto err;
325
478k
    for (z = 1; z < Nway; z++)
326
373k
        F[0][in[z*isz4]]++;
327
104k
    T[0]+=Nway-1;
328
329
    // Potential fix for the wrap-around bug in AVX2 O1 encoder with shift=12.
330
    // This occurs when we have one single symbol, giving freq=4096.
331
    // We fix it elsewhere for now by looking for the wrap-around.
332
    // See "if (1)" statements in the AVX2 code, which is an alternative
333
    // to the "if (0)" here.
334
//    if (0) {
335
//      int x = -1, y = -1;
336
//      int n1, n2;
337
//      for (x = 0; x < 256; x++) {
338
//          n1 = n2 = -1;
339
//          for (y = 0; y < 256; y++) {
340
//              if (F[x][y])
341
//                  n2 = n1, n1 = y;
342
//          }
343
//          if (n2!=-1 || n1 == -1)
344
//              continue;
345
//
346
//          for (y = 0; y < 256; y++)
347
//              if (!F[x][y])
348
//                  break;
349
//          assert(y<256);
350
//          F[x][y]++;
351
//          F[0][y]++; T[y]++; F0[y]=1;
352
//          F[0][x]++; T[x]++; F0[x]=1;
353
//      }
354
//    }
355
356
    // Encode the order-0 stats
357
104k
    int tmp_T0 = T[0];
358
104k
    T[0] = 1;
359
104k
    *cp++ = 0; // marker for uncompressed (may change)
360
104k
    cp += encode_alphabet(cp, T);
361
104k
    T[0] = tmp_T0;
362
363
    // Decide between 10-bit and 12-bit freqs.
364
    // Fills out S[] to hold the new scaled maximum value.
365
104k
    uint32_t S[256] = {0};
366
104k
    int shift = rans_compute_shift(T, F, T, S);
367
368
    // Normalise so T[i] == TOTFREQ_O1
369
26.8M
    for (i = 0; i < 256; i++) {
370
26.7M
        unsigned int x;
371
372
26.7M
        if (T[i] == 0)
373
26.0M
            continue;
374
375
787k
        uint32_t max_val = S[i];
376
787k
        if (shift == TF_SHIFT_O1_FAST && max_val > TOTFREQ_O1_FAST)
377
10.1k
            max_val = TOTFREQ_O1_FAST;
378
379
787k
        if (normalise_freq(F[i], T[i], max_val) < 0)
380
0
            goto err;
381
787k
        T[i]=max_val;
382
383
        // Encode our frequency array
384
787k
        cp += encode_freq_d(cp, T, F[i]);
385
386
787k
        normalise_freq_shift(F[i], T[i], 1<<shift); T[i]=1<<shift;
387
388
        // Initialise Rans Symbol struct too.
389
787k
        uint32_t *F_i_ = F[i];
390
202M
        for (x = j = 0; j < 256; j++) {
391
201M
            RansEncSymbolInit(&syms[i][j], x, F_i_[j], shift);
392
201M
            x += F_i_[j];
393
201M
        }
394
787k
    }
395
396
104k
    *out = shift<<4;
397
104k
    if (cp - out > 1000) {
398
317
        uint8_t *op = out;
399
        // try rans0 compression of header
400
317
        unsigned int u_freq_sz = cp-(op+1);
401
317
        unsigned int c_freq_sz;
402
317
        unsigned char *c_freq = rans_compress_O0_4x16(op+1, u_freq_sz, NULL,
403
317
                                                      &c_freq_sz);
404
317
        if (c_freq && c_freq_sz + 6 < cp-op) {
405
317
            *op++ |= 1; // compressed
406
317
            op += var_put_u32(op, NULL, u_freq_sz);
407
317
            op += var_put_u32(op, NULL, c_freq_sz);
408
317
            memcpy(op, c_freq, c_freq_sz);
409
317
            cp = op+c_freq_sz;
410
317
        }
411
317
        free(c_freq);
412
317
    }
413
414
104k
    *cp_p = cp;
415
104k
    htscodecs_tls_free(F);
416
104k
    return shift;
417
418
0
 err:
419
0
    htscodecs_tls_free(F);
420
0
    return -1;
421
104k
}
rANS_static4x16pr.c:encode_freq1
Line
Count
Source
313
102k
                               RansEncSymbol syms[256][256], uint8_t **cp_p) {
314
102k
    int i, j, z;
315
102k
    uint8_t *out = *cp_p, *cp = out;
316
317
    // Compute O1 frequency statistics
318
102k
    uint32_t (*F)[256] = htscodecs_tls_calloc(256, (sizeof(*F)));
319
102k
    if (!F)
320
0
        return -1;
321
102k
    uint32_t T[256+MAGIC] = {0};
322
102k
    int isz4 = in_size/Nway;
323
102k
    if (hist1_4(in, in_size, F, T) < 0)
324
0
        goto err;
325
410k
    for (z = 1; z < Nway; z++)
326
307k
        F[0][in[z*isz4]]++;
327
102k
    T[0]+=Nway-1;
328
329
    // Potential fix for the wrap-around bug in AVX2 O1 encoder with shift=12.
330
    // This occurs when we have one single symbol, giving freq=4096.
331
    // We fix it elsewhere for now by looking for the wrap-around.
332
    // See "if (1)" statements in the AVX2 code, which is an alternative
333
    // to the "if (0)" here.
334
//    if (0) {
335
//      int x = -1, y = -1;
336
//      int n1, n2;
337
//      for (x = 0; x < 256; x++) {
338
//          n1 = n2 = -1;
339
//          for (y = 0; y < 256; y++) {
340
//              if (F[x][y])
341
//                  n2 = n1, n1 = y;
342
//          }
343
//          if (n2!=-1 || n1 == -1)
344
//              continue;
345
//
346
//          for (y = 0; y < 256; y++)
347
//              if (!F[x][y])
348
//                  break;
349
//          assert(y<256);
350
//          F[x][y]++;
351
//          F[0][y]++; T[y]++; F0[y]=1;
352
//          F[0][x]++; T[x]++; F0[x]=1;
353
//      }
354
//    }
355
356
    // Encode the order-0 stats
357
102k
    int tmp_T0 = T[0];
358
102k
    T[0] = 1;
359
102k
    *cp++ = 0; // marker for uncompressed (may change)
360
102k
    cp += encode_alphabet(cp, T);
361
102k
    T[0] = tmp_T0;
362
363
    // Decide between 10-bit and 12-bit freqs.
364
    // Fills out S[] to hold the new scaled maximum value.
365
102k
    uint32_t S[256] = {0};
366
102k
    int shift = rans_compute_shift(T, F, T, S);
367
368
    // Normalise so T[i] == TOTFREQ_O1
369
26.3M
    for (i = 0; i < 256; i++) {
370
26.2M
        unsigned int x;
371
372
26.2M
        if (T[i] == 0)
373
25.4M
            continue;
374
375
750k
        uint32_t max_val = S[i];
376
750k
        if (shift == TF_SHIFT_O1_FAST && max_val > TOTFREQ_O1_FAST)
377
7.29k
            max_val = TOTFREQ_O1_FAST;
378
379
750k
        if (normalise_freq(F[i], T[i], max_val) < 0)
380
0
            goto err;
381
750k
        T[i]=max_val;
382
383
        // Encode our frequency array
384
750k
        cp += encode_freq_d(cp, T, F[i]);
385
386
750k
        normalise_freq_shift(F[i], T[i], 1<<shift); T[i]=1<<shift;
387
388
        // Initialise Rans Symbol struct too.
389
750k
        uint32_t *F_i_ = F[i];
390
192M
        for (x = j = 0; j < 256; j++) {
391
192M
            RansEncSymbolInit(&syms[i][j], x, F_i_[j], shift);
392
192M
            x += F_i_[j];
393
192M
        }
394
750k
    }
395
396
102k
    *out = shift<<4;
397
102k
    if (cp - out > 1000) {
398
205
        uint8_t *op = out;
399
        // try rans0 compression of header
400
205
        unsigned int u_freq_sz = cp-(op+1);
401
205
        unsigned int c_freq_sz;
402
205
        unsigned char *c_freq = rans_compress_O0_4x16(op+1, u_freq_sz, NULL,
403
205
                                                      &c_freq_sz);
404
205
        if (c_freq && c_freq_sz + 6 < cp-op) {
405
205
            *op++ |= 1; // compressed
406
205
            op += var_put_u32(op, NULL, u_freq_sz);
407
205
            op += var_put_u32(op, NULL, c_freq_sz);
408
205
            memcpy(op, c_freq, c_freq_sz);
409
205
            cp = op+c_freq_sz;
410
205
        }
411
205
        free(c_freq);
412
205
    }
413
414
102k
    *cp_p = cp;
415
102k
    htscodecs_tls_free(F);
416
102k
    return shift;
417
418
0
 err:
419
0
    htscodecs_tls_free(F);
420
0
    return -1;
421
102k
}
rANS_static32x16pr_avx2.c:encode_freq1
Line
Count
Source
313
2.12k
                               RansEncSymbol syms[256][256], uint8_t **cp_p) {
314
2.12k
    int i, j, z;
315
2.12k
    uint8_t *out = *cp_p, *cp = out;
316
317
    // Compute O1 frequency statistics
318
2.12k
    uint32_t (*F)[256] = htscodecs_tls_calloc(256, (sizeof(*F)));
319
2.12k
    if (!F)
320
0
        return -1;
321
2.12k
    uint32_t T[256+MAGIC] = {0};
322
2.12k
    int isz4 = in_size/Nway;
323
2.12k
    if (hist1_4(in, in_size, F, T) < 0)
324
0
        goto err;
325
68.0k
    for (z = 1; z < Nway; z++)
326
65.9k
        F[0][in[z*isz4]]++;
327
2.12k
    T[0]+=Nway-1;
328
329
    // Potential fix for the wrap-around bug in AVX2 O1 encoder with shift=12.
330
    // This occurs when we have one single symbol, giving freq=4096.
331
    // We fix it elsewhere for now by looking for the wrap-around.
332
    // See "if (1)" statements in the AVX2 code, which is an alternative
333
    // to the "if (0)" here.
334
//    if (0) {
335
//      int x = -1, y = -1;
336
//      int n1, n2;
337
//      for (x = 0; x < 256; x++) {
338
//          n1 = n2 = -1;
339
//          for (y = 0; y < 256; y++) {
340
//              if (F[x][y])
341
//                  n2 = n1, n1 = y;
342
//          }
343
//          if (n2!=-1 || n1 == -1)
344
//              continue;
345
//
346
//          for (y = 0; y < 256; y++)
347
//              if (!F[x][y])
348
//                  break;
349
//          assert(y<256);
350
//          F[x][y]++;
351
//          F[0][y]++; T[y]++; F0[y]=1;
352
//          F[0][x]++; T[x]++; F0[x]=1;
353
//      }
354
//    }
355
356
    // Encode the order-0 stats
357
2.12k
    int tmp_T0 = T[0];
358
2.12k
    T[0] = 1;
359
2.12k
    *cp++ = 0; // marker for uncompressed (may change)
360
2.12k
    cp += encode_alphabet(cp, T);
361
2.12k
    T[0] = tmp_T0;
362
363
    // Decide between 10-bit and 12-bit freqs.
364
    // Fills out S[] to hold the new scaled maximum value.
365
2.12k
    uint32_t S[256] = {0};
366
2.12k
    int shift = rans_compute_shift(T, F, T, S);
367
368
    // Normalise so T[i] == TOTFREQ_O1
369
546k
    for (i = 0; i < 256; i++) {
370
544k
        unsigned int x;
371
372
544k
        if (T[i] == 0)
373
507k
            continue;
374
375
37.1k
        uint32_t max_val = S[i];
376
37.1k
        if (shift == TF_SHIFT_O1_FAST && max_val > TOTFREQ_O1_FAST)
377
2.88k
            max_val = TOTFREQ_O1_FAST;
378
379
37.1k
        if (normalise_freq(F[i], T[i], max_val) < 0)
380
0
            goto err;
381
37.1k
        T[i]=max_val;
382
383
        // Encode our frequency array
384
37.1k
        cp += encode_freq_d(cp, T, F[i]);
385
386
37.1k
        normalise_freq_shift(F[i], T[i], 1<<shift); T[i]=1<<shift;
387
388
        // Initialise Rans Symbol struct too.
389
37.1k
        uint32_t *F_i_ = F[i];
390
9.54M
        for (x = j = 0; j < 256; j++) {
391
9.50M
            RansEncSymbolInit(&syms[i][j], x, F_i_[j], shift);
392
9.50M
            x += F_i_[j];
393
9.50M
        }
394
37.1k
    }
395
396
2.12k
    *out = shift<<4;
397
2.12k
    if (cp - out > 1000) {
398
112
        uint8_t *op = out;
399
        // try rans0 compression of header
400
112
        unsigned int u_freq_sz = cp-(op+1);
401
112
        unsigned int c_freq_sz;
402
112
        unsigned char *c_freq = rans_compress_O0_4x16(op+1, u_freq_sz, NULL,
403
112
                                                      &c_freq_sz);
404
112
        if (c_freq && c_freq_sz + 6 < cp-op) {
405
112
            *op++ |= 1; // compressed
406
112
            op += var_put_u32(op, NULL, u_freq_sz);
407
112
            op += var_put_u32(op, NULL, c_freq_sz);
408
112
            memcpy(op, c_freq, c_freq_sz);
409
112
            cp = op+c_freq_sz;
410
112
        }
411
112
        free(c_freq);
412
112
    }
413
414
2.12k
    *cp_p = cp;
415
2.12k
    htscodecs_tls_free(F);
416
2.12k
    return shift;
417
418
0
 err:
419
0
    htscodecs_tls_free(F);
420
0
    return -1;
421
2.12k
}
Unexecuted instantiation: rANS_static32x16pr_avx512.c:encode_freq1
Unexecuted instantiation: rANS_static32x16pr_sse4.c:encode_freq1
Unexecuted instantiation: rANS_static32x16pr.c:encode_freq1
422
423
// Part of decode_freq1 below.  This decodes an order-1 frequency table
424
// using an order-0 table to determine which stats may be stored.
425
static inline int decode_freq_d(uint8_t *cp, uint8_t *cp_end, uint32_t *F0,
426
5.64k
                                uint32_t *F, uint32_t *total) {
427
5.64k
    if (cp == cp_end)
428
3
        return 0;
429
430
5.63k
    uint8_t *op = cp;
431
5.63k
    int j, dz, T = 0;
432
433
1.44M
    for (j = dz = 0; j < 256 && cp < cp_end; j++) {
434
        //if (F0[j]) fprintf(stderr, "F0[%d]=%d\n", j, F0[j]);
435
1.44M
        if (!F0[j])
436
1.36M
            continue;
437
438
71.5k
        uint32_t f;
439
71.5k
        if (dz) {
440
27.0k
            f = 0;
441
27.0k
            dz--;
442
44.5k
        } else {
443
44.5k
            if (cp >= cp_end) return 0;
444
44.5k
            cp += var_get_u32(cp, cp_end, &f);
445
44.5k
            if (f == 0) {
446
3.12k
                if (cp >= cp_end) return 0;
447
3.12k
                dz = *cp++;
448
3.12k
            }
449
44.5k
        }
450
71.5k
        F[j] = f;
451
71.5k
        T += f;
452
71.5k
    }
453
454
5.63k
    if (total) *total = T;
455
5.63k
    return cp - op;
456
5.63k
}
rANS_static4x16pr.c:decode_freq_d
Line
Count
Source
426
694
                                uint32_t *F, uint32_t *total) {
427
694
    if (cp == cp_end)
428
3
        return 0;
429
430
691
    uint8_t *op = cp;
431
691
    int j, dz, T = 0;
432
433
176k
    for (j = dz = 0; j < 256 && cp < cp_end; j++) {
434
        //if (F0[j]) fprintf(stderr, "F0[%d]=%d\n", j, F0[j]);
435
176k
        if (!F0[j])
436
154k
            continue;
437
438
21.5k
        uint32_t f;
439
21.5k
        if (dz) {
440
18.4k
            f = 0;
441
18.4k
            dz--;
442
18.4k
        } else {
443
3.09k
            if (cp >= cp_end) return 0;
444
3.09k
            cp += var_get_u32(cp, cp_end, &f);
445
3.09k
            if (f == 0) {
446
1.07k
                if (cp >= cp_end) return 0;
447
1.07k
                dz = *cp++;
448
1.07k
            }
449
3.09k
        }
450
21.5k
        F[j] = f;
451
21.5k
        T += f;
452
21.5k
    }
453
454
691
    if (total) *total = T;
455
691
    return cp - op;
456
691
}
rANS_static32x16pr_avx2.c:decode_freq_d
Line
Count
Source
426
4.94k
                                uint32_t *F, uint32_t *total) {
427
4.94k
    if (cp == cp_end)
428
0
        return 0;
429
430
4.94k
    uint8_t *op = cp;
431
4.94k
    int j, dz, T = 0;
432
433
1.26M
    for (j = dz = 0; j < 256 && cp < cp_end; j++) {
434
        //if (F0[j]) fprintf(stderr, "F0[%d]=%d\n", j, F0[j]);
435
1.26M
        if (!F0[j])
436
1.21M
            continue;
437
438
50.0k
        uint32_t f;
439
50.0k
        if (dz) {
440
8.56k
            f = 0;
441
8.56k
            dz--;
442
41.4k
        } else {
443
41.4k
            if (cp >= cp_end) return 0;
444
41.4k
            cp += var_get_u32(cp, cp_end, &f);
445
41.4k
            if (f == 0) {
446
2.05k
                if (cp >= cp_end) return 0;
447
2.05k
                dz = *cp++;
448
2.05k
            }
449
41.4k
        }
450
50.0k
        F[j] = f;
451
50.0k
        T += f;
452
50.0k
    }
453
454
4.94k
    if (total) *total = T;
455
4.94k
    return cp - op;
456
4.94k
}
Unexecuted instantiation: rANS_static32x16pr_avx512.c:decode_freq_d
Unexecuted instantiation: rANS_static32x16pr_sse4.c:decode_freq_d
Unexecuted instantiation: rANS_static32x16pr.c:decode_freq_d
457
458
typedef struct {
459
    uint16_t f;
460
    uint16_t b;
461
} fb_t;
462
463
// Decode order-1 frequency table, filling out various lookup tables
464
// in the process. (Which will depend on shift and which values have
465
// been passed in.)
466
//
467
// Returns the number of bytes decoded.
468
static inline int decode_freq1(uint8_t *cp, uint8_t *cp_end, int shift,
469
                               uint32_t s3 [256][TOTFREQ_O1],
470
                               uint32_t s3F[256][TOTFREQ_O1_FAST],
471
4.96k
                               uint8_t *sfb[256], fb_t fb[256][256]) {
472
4.96k
    uint8_t *cp_start = cp;
473
4.96k
    int i, j, x;
474
4.96k
    uint32_t F0[256] = {0};
475
4.96k
    int fsz = decode_alphabet(cp, cp_end, F0);
476
4.96k
    if (!fsz)
477
33
        goto err;
478
4.92k
    cp += fsz;
479
480
4.92k
    if (cp >= cp_end)
481
18
        goto err;
482
483
    // silence false gcc warnings
484
4.91k
    if (fb) {fb [0][0].b= 0;}
485
4.91k
    if (s3) {s3 [0][0]  = 0;}
486
4.91k
    if (s3F){s3F[0][0]  = 0;}
487
488
16.1k
    for (i = 0; i < 256; i++) {
489
16.1k
        if (F0[i] == 0)
490
11.1k
            continue;
491
492
4.94k
        uint32_t F[256] = {0}, T = 0;
493
4.94k
        fsz = decode_freq_d(cp, cp_end, F0, F, &T);
494
4.94k
        if (!fsz)
495
0
            goto err;
496
4.94k
        cp += fsz;
497
498
4.94k
        if (!T) {
499
            //fprintf(stderr, "No freq for F_%d\n", i);
500
33
            continue;
501
33
        }
502
503
4.91k
        normalise_freq_shift(F, T, 1<<shift);
504
505
        // Build symbols; fixme, do as part of decode, see the _d variant
506
40.5k
        for (j = x = 0; j < 256; j++) {
507
40.5k
            if (F[j]) {
508
9.11k
                if (F[j] > (1<<shift) - x)
509
4.90k
                    goto err;
510
511
4.20k
                if (sfb && shift == TF_SHIFT_O1) {
512
0
                    memset(&sfb[i][x], j, F[j]);
513
0
                    fb[i][j].f = F[j];
514
0
                    fb[i][j].b = x;
515
4.20k
                } else if (s3 && shift == TF_SHIFT_O1) {
516
3.62k
                    int y;
517
2.37M
                    for (y = 0; y < F[j]; y++)
518
2.37M
                        s3[i][y+x] = (((uint32_t)F[j])<<(shift+8)) |(y<<8) |j;
519
3.62k
                } else if (s3F && shift == TF_SHIFT_O1_FAST) {
520
255
                    int y;
521
96.3k
                    for (y = 0; y < F[j]; y++)
522
96.1k
                        s3F[i][y+x] = (((uint32_t)F[j])<<(shift+8)) |(y<<8) |j;
523
255
                }
524
525
4.20k
                x += F[j];
526
4.20k
            }
527
40.5k
        }
528
5
        if (x != (1<<shift))
529
0
            goto err;
530
5
    }
531
532
2
    return cp - cp_start;
533
534
4.96k
 err:
535
4.96k
    return 0;
536
4.91k
}
Unexecuted instantiation: rANS_static4x16pr.c:decode_freq1
rANS_static32x16pr_avx2.c:decode_freq1
Line
Count
Source
471
4.96k
                               uint8_t *sfb[256], fb_t fb[256][256]) {
472
4.96k
    uint8_t *cp_start = cp;
473
4.96k
    int i, j, x;
474
4.96k
    uint32_t F0[256] = {0};
475
4.96k
    int fsz = decode_alphabet(cp, cp_end, F0);
476
4.96k
    if (!fsz)
477
33
        goto err;
478
4.92k
    cp += fsz;
479
480
4.92k
    if (cp >= cp_end)
481
18
        goto err;
482
483
    // silence false gcc warnings
484
4.91k
    if (fb) {fb [0][0].b= 0;}
485
4.91k
    if (s3) {s3 [0][0]  = 0;}
486
4.91k
    if (s3F){s3F[0][0]  = 0;}
487
488
16.1k
    for (i = 0; i < 256; i++) {
489
16.1k
        if (F0[i] == 0)
490
11.1k
            continue;
491
492
4.94k
        uint32_t F[256] = {0}, T = 0;
493
4.94k
        fsz = decode_freq_d(cp, cp_end, F0, F, &T);
494
4.94k
        if (!fsz)
495
0
            goto err;
496
4.94k
        cp += fsz;
497
498
4.94k
        if (!T) {
499
            //fprintf(stderr, "No freq for F_%d\n", i);
500
33
            continue;
501
33
        }
502
503
4.91k
        normalise_freq_shift(F, T, 1<<shift);
504
505
        // Build symbols; fixme, do as part of decode, see the _d variant
506
40.5k
        for (j = x = 0; j < 256; j++) {
507
40.5k
            if (F[j]) {
508
9.11k
                if (F[j] > (1<<shift) - x)
509
4.90k
                    goto err;
510
511
4.20k
                if (sfb && shift == TF_SHIFT_O1) {
512
0
                    memset(&sfb[i][x], j, F[j]);
513
0
                    fb[i][j].f = F[j];
514
0
                    fb[i][j].b = x;
515
4.20k
                } else if (s3 && shift == TF_SHIFT_O1) {
516
3.62k
                    int y;
517
2.37M
                    for (y = 0; y < F[j]; y++)
518
2.37M
                        s3[i][y+x] = (((uint32_t)F[j])<<(shift+8)) |(y<<8) |j;
519
3.62k
                } else if (s3F && shift == TF_SHIFT_O1_FAST) {
520
255
                    int y;
521
96.3k
                    for (y = 0; y < F[j]; y++)
522
96.1k
                        s3F[i][y+x] = (((uint32_t)F[j])<<(shift+8)) |(y<<8) |j;
523
255
                }
524
525
4.20k
                x += F[j];
526
4.20k
            }
527
40.5k
        }
528
5
        if (x != (1<<shift))
529
0
            goto err;
530
5
    }
531
532
2
    return cp - cp_start;
533
534
4.96k
 err:
535
4.96k
    return 0;
536
4.91k
}
Unexecuted instantiation: rANS_static32x16pr_avx512.c:decode_freq1
Unexecuted instantiation: rANS_static32x16pr_sse4.c:decode_freq1
Unexecuted instantiation: rANS_static32x16pr.c:decode_freq1
537
538
// Build s3 symbol lookup table.
539
// This is 12 bit freq, 12 bit bias and 8 bit symbol.
540
84
static inline int rans_F_to_s3(const uint32_t *F, int shift, uint32_t *s3) {
541
84
    int j, x;
542
21.5k
    for (j = x = 0; j < 256; j++) {
543
21.5k
        if (F[j] && F[j] <= (1<<shift) - x) {
544
3.02k
            uint32_t base = (((uint32_t)F[j])<<(shift+8))|j, y;
545
325k
            for (y = 0; y < F[j]; y++, x++)
546
322k
                s3[x] = base + (y<<8);
547
3.02k
        }
548
21.5k
    }
549
550
84
    return x == (1<<shift) ? 0 : 1;
551
84
}
Unexecuted instantiation: rANS_static4x16pr.c:rans_F_to_s3
rANS_static32x16pr_avx2.c:rans_F_to_s3
Line
Count
Source
540
84
static inline int rans_F_to_s3(const uint32_t *F, int shift, uint32_t *s3) {
541
84
    int j, x;
542
21.5k
    for (j = x = 0; j < 256; j++) {
543
21.5k
        if (F[j] && F[j] <= (1<<shift) - x) {
544
3.02k
            uint32_t base = (((uint32_t)F[j])<<(shift+8))|j, y;
545
325k
            for (y = 0; y < F[j]; y++, x++)
546
322k
                s3[x] = base + (y<<8);
547
3.02k
        }
548
21.5k
    }
549
550
84
    return x == (1<<shift) ? 0 : 1;
551
84
}
Unexecuted instantiation: rANS_static32x16pr_avx512.c:rans_F_to_s3
Unexecuted instantiation: rANS_static32x16pr_sse4.c:rans_F_to_s3
Unexecuted instantiation: rANS_static32x16pr.c:rans_F_to_s3
552
553
#ifdef ROT32_SIMD
554
#include <x86intrin.h>
555
556
// Our own implementation of _mm256_set_m128i as it's not there on older
557
// gcc implementations.  This is basically the same thing.
558
749k
static inline __m256i _mm256_set_m128ix(__m128i H, __m128i L) {
559
749k
    return _mm256_insertf128_si256(_mm256_castsi128_si256(L), H, 1);
560
749k
}
rANS_static32x16pr_avx2.c:_mm256_set_m128ix
Line
Count
Source
558
749k
static inline __m256i _mm256_set_m128ix(__m128i H, __m128i L) {
559
    return _mm256_insertf128_si256(_mm256_castsi128_si256(L), H, 1);
560
749k
}
Unexecuted instantiation: rANS_static32x16pr_avx512.c:_mm256_set_m128ix
561
562
23.4k
static inline void rot32_simd(uint8_t t[32][32], uint8_t *out, int iN[32]) {
563
23.4k
    int z;
564
565
23.4k
    __m256i lh8[32];
566
210k
    for (z = 0; z < 32/2; z+=2) {
567
187k
        __m256i a, b, c, d;
568
187k
        a = _mm256_loadu_si256((__m256i *)&t[z*2+0]);
569
187k
        b = _mm256_loadu_si256((__m256i *)&t[z*2+1]);
570
187k
        c = _mm256_loadu_si256((__m256i *)&t[z*2+2]);
571
187k
        d = _mm256_loadu_si256((__m256i *)&t[z*2+3]);
572
573
187k
        lh8[z+0]  = _mm256_unpacklo_epi8(a, b);
574
187k
        lh8[z+16] = _mm256_unpackhi_epi8(a, b);
575
187k
        lh8[z+1]  = _mm256_unpacklo_epi8(c, d);
576
187k
        lh8[z+17] = _mm256_unpackhi_epi8(c, d);
577
187k
    }
578
579
23.4k
    __m256i lh32[32];
580
117k
    for (z = 0; z < 32/4; z+=2) {
581
93.7k
        __m256i a, b, c, d;
582
93.7k
        a = _mm256_unpacklo_epi16(lh8[z*4+0], lh8[z*4+1]);
583
93.7k
        b = _mm256_unpacklo_epi16(lh8[z*4+2], lh8[z*4+3]);
584
93.7k
        c = _mm256_unpackhi_epi16(lh8[z*4+0], lh8[z*4+1]);
585
93.7k
        d = _mm256_unpackhi_epi16(lh8[z*4+2], lh8[z*4+3]);
586
587
93.7k
        __m256i e, f, g, h;
588
93.7k
        e = _mm256_unpacklo_epi16(lh8[(z+1)*4+0], lh8[(z+1)*4+1]);
589
93.7k
        f = _mm256_unpacklo_epi16(lh8[(z+1)*4+2], lh8[(z+1)*4+3]);
590
93.7k
        g = _mm256_unpackhi_epi16(lh8[(z+1)*4+0], lh8[(z+1)*4+1]);
591
93.7k
        h = _mm256_unpackhi_epi16(lh8[(z+1)*4+2], lh8[(z+1)*4+3]);
592
593
93.7k
        lh32[z+0]  = _mm256_unpacklo_epi32(a,b);
594
93.7k
        lh32[z+8]  = _mm256_unpacklo_epi32(c,d);
595
93.7k
        lh32[z+16] = _mm256_unpackhi_epi32(a,b);
596
93.7k
        lh32[z+24] = _mm256_unpackhi_epi32(c,d);
597
598
93.7k
        lh32[z+1+0]  = _mm256_unpacklo_epi32(e,f);
599
93.7k
        lh32[z+1+8]  = _mm256_unpacklo_epi32(g,h);
600
93.7k
        lh32[z+1+16] = _mm256_unpackhi_epi32(e,f);
601
93.7k
        lh32[z+1+24] = _mm256_unpackhi_epi32(g,h);
602
93.7k
    }
603
604
    // Final unpack 64 and store
605
23.4k
    int idx[] = {0, 8, 4, 12, 2, 10, 6, 14};
606
210k
    for (z = 0; z < 8; z++) {
607
187k
        int i = idx[z];
608
609
        // Putting this here doesn't soeed things up
610
187k
        __m256i a = _mm256_unpacklo_epi64(lh32[i*2+0], lh32[i*2+1]);
611
187k
        __m256i b = _mm256_unpacklo_epi64(lh32[i*2+2], lh32[i*2+3]);
612
187k
        __m256i c = _mm256_unpackhi_epi64(lh32[i*2+0], lh32[i*2+1]);
613
187k
        __m256i d = _mm256_unpackhi_epi64(lh32[i*2+2], lh32[i*2+3]);
614
615
187k
        __m256i p = _mm256_set_m128ix(_mm256_extracti128_si256(b,0),
616
187k
                                      _mm256_extracti128_si256(a,0));
617
187k
        __m256i q = _mm256_set_m128ix(_mm256_extracti128_si256(d,0),
618
187k
                                      _mm256_extracti128_si256(c,0));
619
187k
        __m256i r = _mm256_set_m128ix(_mm256_extracti128_si256(b,1),
620
187k
                                      _mm256_extracti128_si256(a,1));
621
187k
        __m256i s = _mm256_set_m128ix(_mm256_extracti128_si256(d,1),
622
187k
                                      _mm256_extracti128_si256(c,1));
623
624
187k
        _mm256_storeu_si256((__m256i *)(&out[iN[z*2+0]]),  p);
625
187k
        _mm256_storeu_si256((__m256i *)(&out[iN[z*2+1]]),  q);
626
187k
        _mm256_storeu_si256((__m256i *)(&out[iN[z*2+16]]), r);
627
187k
        _mm256_storeu_si256((__m256i *)(&out[iN[z*2+17]]), s);
628
187k
    }
629
630
    // Store
631
773k
    for (z = 0; z < 32; z++)
632
749k
        iN[z] += 32;
633
23.4k
}
rANS_static32x16pr_avx2.c:rot32_simd
Line
Count
Source
562
23.4k
static inline void rot32_simd(uint8_t t[32][32], uint8_t *out, int iN[32]) {
563
23.4k
    int z;
564
565
23.4k
    __m256i lh8[32];
566
210k
    for (z = 0; z < 32/2; z+=2) {
567
187k
        __m256i a, b, c, d;
568
187k
        a = _mm256_loadu_si256((__m256i *)&t[z*2+0]);
569
187k
        b = _mm256_loadu_si256((__m256i *)&t[z*2+1]);
570
187k
        c = _mm256_loadu_si256((__m256i *)&t[z*2+2]);
571
187k
        d = _mm256_loadu_si256((__m256i *)&t[z*2+3]);
572
573
187k
        lh8[z+0]  = _mm256_unpacklo_epi8(a, b);
574
187k
        lh8[z+16] = _mm256_unpackhi_epi8(a, b);
575
187k
        lh8[z+1]  = _mm256_unpacklo_epi8(c, d);
576
187k
        lh8[z+17] = _mm256_unpackhi_epi8(c, d);
577
187k
    }
578
579
23.4k
    __m256i lh32[32];
580
117k
    for (z = 0; z < 32/4; z+=2) {
581
93.7k
        __m256i a, b, c, d;
582
93.7k
        a = _mm256_unpacklo_epi16(lh8[z*4+0], lh8[z*4+1]);
583
93.7k
        b = _mm256_unpacklo_epi16(lh8[z*4+2], lh8[z*4+3]);
584
93.7k
        c = _mm256_unpackhi_epi16(lh8[z*4+0], lh8[z*4+1]);
585
93.7k
        d = _mm256_unpackhi_epi16(lh8[z*4+2], lh8[z*4+3]);
586
587
93.7k
        __m256i e, f, g, h;
588
93.7k
        e = _mm256_unpacklo_epi16(lh8[(z+1)*4+0], lh8[(z+1)*4+1]);
589
93.7k
        f = _mm256_unpacklo_epi16(lh8[(z+1)*4+2], lh8[(z+1)*4+3]);
590
93.7k
        g = _mm256_unpackhi_epi16(lh8[(z+1)*4+0], lh8[(z+1)*4+1]);
591
93.7k
        h = _mm256_unpackhi_epi16(lh8[(z+1)*4+2], lh8[(z+1)*4+3]);
592
593
93.7k
        lh32[z+0]  = _mm256_unpacklo_epi32(a,b);
594
93.7k
        lh32[z+8]  = _mm256_unpacklo_epi32(c,d);
595
93.7k
        lh32[z+16] = _mm256_unpackhi_epi32(a,b);
596
93.7k
        lh32[z+24] = _mm256_unpackhi_epi32(c,d);
597
598
93.7k
        lh32[z+1+0]  = _mm256_unpacklo_epi32(e,f);
599
93.7k
        lh32[z+1+8]  = _mm256_unpacklo_epi32(g,h);
600
93.7k
        lh32[z+1+16] = _mm256_unpackhi_epi32(e,f);
601
93.7k
        lh32[z+1+24] = _mm256_unpackhi_epi32(g,h);
602
93.7k
    }
603
604
    // Final unpack 64 and store
605
23.4k
    int idx[] = {0, 8, 4, 12, 2, 10, 6, 14};
606
210k
    for (z = 0; z < 8; z++) {
607
187k
        int i = idx[z];
608
609
        // Putting this here doesn't soeed things up
610
187k
        __m256i a = _mm256_unpacklo_epi64(lh32[i*2+0], lh32[i*2+1]);
611
187k
        __m256i b = _mm256_unpacklo_epi64(lh32[i*2+2], lh32[i*2+3]);
612
187k
        __m256i c = _mm256_unpackhi_epi64(lh32[i*2+0], lh32[i*2+1]);
613
187k
        __m256i d = _mm256_unpackhi_epi64(lh32[i*2+2], lh32[i*2+3]);
614
615
187k
        __m256i p = _mm256_set_m128ix(_mm256_extracti128_si256(b,0),
616
187k
                                      _mm256_extracti128_si256(a,0));
617
187k
        __m256i q = _mm256_set_m128ix(_mm256_extracti128_si256(d,0),
618
187k
                                      _mm256_extracti128_si256(c,0));
619
187k
        __m256i r = _mm256_set_m128ix(_mm256_extracti128_si256(b,1),
620
187k
                                      _mm256_extracti128_si256(a,1));
621
187k
        __m256i s = _mm256_set_m128ix(_mm256_extracti128_si256(d,1),
622
187k
                                      _mm256_extracti128_si256(c,1));
623
624
187k
        _mm256_storeu_si256((__m256i *)(&out[iN[z*2+0]]),  p);
625
187k
        _mm256_storeu_si256((__m256i *)(&out[iN[z*2+1]]),  q);
626
187k
        _mm256_storeu_si256((__m256i *)(&out[iN[z*2+16]]), r);
627
187k
        _mm256_storeu_si256((__m256i *)(&out[iN[z*2+17]]), s);
628
187k
    }
629
630
    // Store
631
773k
    for (z = 0; z < 32; z++)
632
749k
        iN[z] += 32;
633
23.4k
}
Unexecuted instantiation: rANS_static32x16pr_avx512.c:rot32_simd
634
#endif
635
636
#endif // RANS_INTERNAL_H