Coverage Report

Created: 2023-01-17 06:24

/src/htslib/htscodecs/htscodecs/rANS_word.h
Line
Count
Source (jump to first uncovered line)
1
/* rans_byte.h originally from https://github.com/rygorous/ryg_rans
2
 *
3
 * This is a public-domain implementation of several rANS variants. rANS is an
4
 * entropy coder from the ANS family, as described in Jarek Duda's paper
5
 * "Asymmetric numeral systems" (http://arxiv.org/abs/1311.2540).
6
 */
7
8
/*-------------------------------------------------------------------------- */
9
/* rans_byte.h from https://github.com/rygorous/ryg_rans */
10
11
// Simple byte-aligned rANS encoder/decoder - public domain - Fabian 'ryg' Giesen 2014
12
//
13
// Not intended to be "industrial strength"; just meant to illustrate the general
14
// idea.
15
16
#ifndef RANS_WORD_HEADER
17
#define RANS_WORD_HEADER
18
19
#include <stdio.h>
20
#include <stdint.h>
21
#include <string.h>
22
#include <assert.h>
23
#include "htscodecs_endian.h"
24
25
#ifdef assert
26
0
#define RansAssert assert
27
#else
28
#define RansAssert(x)
29
#endif
30
31
// READ ME FIRST:
32
//
33
// This is designed like a typical arithmetic coder API, but there's three
34
// twists you absolutely should be aware of before you start hacking:
35
//
36
// 1. You need to encode data in *reverse* - last symbol first. rANS works
37
//    like a stack: last in, first out.
38
// 2. Likewise, the encoder outputs bytes *in reverse* - that is, you give
39
//    it a pointer to the *end* of your buffer (exclusive), and it will
40
//    slowly move towards the beginning as more bytes are emitted.
41
// 3. Unlike basically any other entropy coder implementation you might
42
//    have used, you can interleave data from multiple independent rANS
43
//    encoders into the same bytestream without any extra signaling;
44
//    you can also just write some bytes by yourself in the middle if
45
//    you want to. This is in addition to the usual arithmetic encoder
46
//    property of being able to switch models on the fly. Writing raw
47
//    bytes can be useful when you have some data that you know is
48
//    incompressible, and is cheaper than going through the rANS encode
49
//    function. Using multiple rANS coders on the same byte stream wastes
50
//    a few bytes compared to using just one, but execution of two
51
//    independent encoders can happen in parallel on superscalar and
52
//    Out-of-Order CPUs, so this can be *much* faster in tight decoding
53
//    loops.
54
//
55
//    This is why all the rANS functions take the write pointer as an
56
//    argument instead of just storing it in some context struct.
57
58
// --------------------------------------------------------------------------
59
60
// L ('l' in the paper) is the lower bound of our normalization interval.
61
// Between this and our byte-aligned emission, we use 31 (not 32!) bits.
62
// This is done intentionally because exact reciprocals for 31-bit uints
63
// fit in 32-bit uints: this permits some optimizations during encoding.
64
11.1M
#define RANS_BYTE_L (1u << 15)  // lower bound of our normalization interval
65
66
// State for a rANS encoder. Yep, that's all there is to it.
67
typedef uint32_t RansState;
68
69
// Initialize a rANS encoder.
70
static inline void RansEncInit(RansState* r)
71
0
{
72
0
    *r = RANS_BYTE_L;
73
0
}
Unexecuted instantiation: rANS_static4x16pr.c:RansEncInit
Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansEncInit
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansEncInit
Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansEncInit
Unexecuted instantiation: rANS_static32x16pr.c:RansEncInit
74
75
// Renormalize the encoder. Internal function.
76
static inline RansState RansEncRenorm(RansState x, uint8_t** pptr, uint32_t freq, uint32_t scale_bits)
77
0
{
78
0
    uint32_t x_max = ((RANS_BYTE_L >> scale_bits) << 16) * freq-1; // this turns into a shift.
79
0
    if (x > x_max) {
80
0
        uint16_t* ptr = (uint16_t *)*pptr;
81
0
        *--ptr = (uint16_t) (x & 0xffff);
82
0
        x >>= 16;
83
0
        *pptr = (uint8_t *)ptr;
84
0
    }
85
0
    return x;
86
0
}
Unexecuted instantiation: rANS_static4x16pr.c:RansEncRenorm
Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansEncRenorm
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansEncRenorm
Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansEncRenorm
Unexecuted instantiation: rANS_static32x16pr.c:RansEncRenorm
87
88
// Encodes a single symbol with range start "start" and frequency "freq".
89
// All frequencies are assumed to sum to "1 << scale_bits", and the
90
// resulting bytes get written to ptr (which is updated).
91
//
92
// NOTE: With rANS, you need to encode symbols in *reverse order*, i.e. from
93
// beginning to end! Likewise, the output bytestream is written *backwards*:
94
// ptr starts pointing at the end of the output buffer and keeps decrementing.
95
static inline void RansEncPut(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits)
96
0
{
97
0
    // renormalize
98
0
    RansState x = RansEncRenorm(*r, pptr, freq, scale_bits);
99
0
100
0
    // x = C(s,x)
101
0
    *r = ((x / freq) << scale_bits) + (x % freq) + start;
102
0
}
Unexecuted instantiation: rANS_static4x16pr.c:RansEncPut
Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansEncPut
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansEncPut
Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansEncPut
Unexecuted instantiation: rANS_static32x16pr.c:RansEncPut
103
104
// Flushes the rANS encoder.
105
static inline void RansEncFlush(RansState* r, uint8_t** pptr)
106
0
{
107
0
    uint32_t x = *r;
108
0
    uint8_t* ptr = *pptr;
109
110
0
    ptr -= 4;
111
0
    ptr[0] = (uint8_t) (x >> 0);
112
0
    ptr[1] = (uint8_t) (x >> 8);
113
0
    ptr[2] = (uint8_t) (x >> 16);
114
0
    ptr[3] = (uint8_t) (x >> 24);
115
116
0
    *pptr = ptr;
117
0
}
Unexecuted instantiation: rANS_static4x16pr.c:RansEncFlush
Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansEncFlush
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansEncFlush
Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansEncFlush
Unexecuted instantiation: rANS_static32x16pr.c:RansEncFlush
118
119
// Initializes a rANS decoder.
120
// Unlike the encoder, the decoder works forwards as you'd expect.
121
static inline void RansDecInit(RansState* r, uint8_t** pptr)
122
81.1k
{
123
81.1k
    uint32_t x;
124
81.1k
    uint8_t* ptr = *pptr;
125
126
81.1k
    x  = ptr[0] << 0;
127
81.1k
    x |= ptr[1] << 8;
128
81.1k
    x |= ptr[2] << 16;
129
81.1k
    x |= ((uint32_t)ptr[3]) << 24;
130
81.1k
    ptr += 4;
131
132
81.1k
    *pptr = ptr;
133
81.1k
    *r = x;
134
81.1k
}
rANS_static4x16pr.c:RansDecInit
Line
Count
Source
122
9.33k
{
123
9.33k
    uint32_t x;
124
9.33k
    uint8_t* ptr = *pptr;
125
126
9.33k
    x  = ptr[0] << 0;
127
9.33k
    x |= ptr[1] << 8;
128
9.33k
    x |= ptr[2] << 16;
129
9.33k
    x |= ((uint32_t)ptr[3]) << 24;
130
9.33k
    ptr += 4;
131
132
9.33k
    *pptr = ptr;
133
9.33k
    *r = x;
134
9.33k
}
rANS_static32x16pr_avx2.c:RansDecInit
Line
Count
Source
122
71.8k
{
123
71.8k
    uint32_t x;
124
71.8k
    uint8_t* ptr = *pptr;
125
126
71.8k
    x  = ptr[0] << 0;
127
71.8k
    x |= ptr[1] << 8;
128
71.8k
    x |= ptr[2] << 16;
129
71.8k
    x |= ((uint32_t)ptr[3]) << 24;
130
71.8k
    ptr += 4;
131
132
71.8k
    *pptr = ptr;
133
71.8k
    *r = x;
134
71.8k
}
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecInit
Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecInit
Unexecuted instantiation: rANS_static32x16pr.c:RansDecInit
135
136
// Returns the current cumulative frequency (map it to a symbol yourself!)
137
static inline uint32_t RansDecGet(RansState* r, uint32_t scale_bits)
138
188k
{
139
188k
    return *r & ((1u << scale_bits) - 1);
140
188k
}
rANS_static4x16pr.c:RansDecGet
Line
Count
Source
138
188k
{
139
188k
    return *r & ((1u << scale_bits) - 1);
140
188k
}
Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansDecGet
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecGet
Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecGet
Unexecuted instantiation: rANS_static32x16pr.c:RansDecGet
141
142
// Advances in the bit stream by "popping" a single symbol with range start
143
// "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits",
144
// and the resulting bytes get written to ptr (which is updated).
145
static inline void RansDecAdvance(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits)
146
0
{
147
0
    uint32_t mask = (1u << scale_bits) - 1;
148
0
149
0
    // s, x = D(x)
150
0
    uint32_t x = *r;
151
0
    x = freq * (x >> scale_bits) + (x & mask) - start;
152
0
153
0
    // renormalize
154
0
    if (x < RANS_BYTE_L) {
155
0
        uint8_t* ptr = *pptr;
156
0
        do x = (x << 8) | *ptr++; while (x < RANS_BYTE_L);
157
0
        *pptr = ptr;
158
0
    }
159
0
160
0
    *r = x;
161
0
}
Unexecuted instantiation: rANS_static4x16pr.c:RansDecAdvance
Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansDecAdvance
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecAdvance
Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecAdvance
Unexecuted instantiation: rANS_static32x16pr.c:RansDecAdvance
162
163
// --------------------------------------------------------------------------
164
165
// That's all you need for a full encoder; below here are some utility
166
// functions with extra convenience or optimizations.
167
168
// Encoder symbol description
169
// This (admittedly odd) selection of parameters was chosen to make
170
// RansEncPutSymbol as cheap as possible.
171
typedef struct {
172
    uint32_t x_max;     // (Exclusive) upper bound of pre-normalization interval
173
    uint32_t rcp_freq;  // Fixed-point reciprocal frequency
174
    uint32_t bias;      // Bias
175
176
    // NB: This pair are read as a 32-bit value by the SIMD o1 encoder.
177
    uint16_t cmpl_freq; // Complement of frequency: (1 << scale_bits) - freq
178
    uint16_t rcp_shift; // Reciprocal shift
179
} RansEncSymbol;
180
181
// As above, but with cmpl_freq and rcp_shift combined into
182
// a single value.  This could be done with a cast, but it avoids
183
// a type punning error.  We could use a union, but anonymous unions
184
// are C11 only (still that's 10 year old!).  For now we just cheat
185
// instead.
186
typedef struct {
187
    uint32_t x_max;     // (Exclusive) upper bound of pre-normalization interval
188
    uint32_t rcp_freq;  // Fixed-point reciprocal frequency
189
    uint32_t bias;      // Bias
190
191
    uint32_t cmpl_freq; // cmpl_freq+rcp_shift
192
} RansEncSymbol_simd;
193
194
// Decoder symbols are straightforward.
195
typedef struct {
196
    uint16_t start;     // Start of range.
197
    uint16_t freq;      // Symbol frequency.
198
} RansDecSymbol;
199
200
// Initializes an encoder symbol to start "start" and frequency "freq"
201
static inline void RansEncSymbolInit(RansEncSymbol* s, uint32_t start, uint32_t freq, uint32_t scale_bits)
202
0
{
203
0
    RansAssert(scale_bits <= 16);
204
0
    RansAssert(start <= (1u << scale_bits));
205
0
    RansAssert(freq <= (1u << scale_bits) - start);
206
207
    // Say M := 1 << scale_bits.
208
    //
209
    // The original encoder does:
210
    //   x_new = (x/freq)*M + start + (x%freq)
211
    //
212
    // The fast encoder does (schematically):
213
    //   q     = mul_hi(x, rcp_freq) >> rcp_shift   (division)
214
    //   r     = x - q*freq                         (remainder)
215
    //   x_new = q*M + bias + r                     (new x)
216
    // plugging in r into x_new yields:
217
    //   x_new = bias + x + q*(M - freq)
218
    //        =: bias + x + q*cmpl_freq             (*)
219
    //
220
    // and we can just precompute cmpl_freq. Now we just need to
221
    // set up our parameters such that the original encoder and
222
    // the fast encoder agree.
223
    
224
0
    s->x_max = ((RANS_BYTE_L >> scale_bits) << 16) * freq -1;
225
0
    s->cmpl_freq = (uint16_t) ((1 << scale_bits) - freq);
226
0
    if (freq < 2) {
227
        // freq=0 symbols are never valid to encode, so it doesn't matter what
228
        // we set our values to.
229
        //
230
        // freq=1 is tricky, since the reciprocal of 1 is 1; unfortunately,
231
        // our fixed-point reciprocal approximation can only multiply by values
232
        // smaller than 1.
233
        //
234
        // So we use the "next best thing": rcp_freq=0xffffffff, rcp_shift=0.
235
        // This gives:
236
        //   q = mul_hi(x, rcp_freq) >> rcp_shift
237
        //     = mul_hi(x, (1<<32) - 1)) >> 0
238
        //     = floor(x - x/(2^32))
239
        //     = x - 1 if 1 <= x < 2^32
240
        // and we know that x>0 (x=0 is never in a valid normalization interval).
241
        //
242
        // So we now need to choose the other parameters such that
243
        //   x_new = x*M + start
244
        // plug it in:
245
        //     x*M + start                   (desired result)
246
        //   = bias + x + q*cmpl_freq        (*)
247
        //   = bias + x + (x - 1)*(M - 1)    (plug in q=x-1, cmpl_freq)
248
        //   = bias + 1 + (x - 1)*M
249
        //   = x*M + (bias + 1 - M)
250
        //
251
        // so we have start = bias + 1 - M, or equivalently
252
        //   bias = start + M - 1.
253
0
        s->rcp_freq = ~0u;
254
0
        s->rcp_shift = 0;
255
0
        s->bias = start + (1 << scale_bits) - 1;
256
0
    } else {
257
        // Alverson, "Integer Division using reciprocals"
258
        // shift=ceil(log2(freq))
259
0
        uint32_t shift = 0;
260
0
        while (freq > (1u << shift))
261
0
            shift++;
262
263
0
        s->rcp_freq = (uint32_t) (((1ull << (shift + 31)) + freq-1) / freq);
264
0
        s->rcp_shift = shift - 1;
265
266
        // With these values, 'q' is the correct quotient, so we
267
        // have bias=start.
268
0
        s->bias = start;
269
0
    }
270
271
0
    s->rcp_shift += 32; // Avoid the extra >>32 in RansEncPutSymbol
272
0
}
Unexecuted instantiation: rANS_static4x16pr.c:RansEncSymbolInit
Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansEncSymbolInit
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansEncSymbolInit
Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansEncSymbolInit
Unexecuted instantiation: rANS_static32x16pr.c:RansEncSymbolInit
273
274
// Initialize a decoder symbol to start "start" and frequency "freq"
275
static inline void RansDecSymbolInit(RansDecSymbol* s, uint32_t start, uint32_t freq)
276
0
{
277
0
    RansAssert(start <= (1 << 16));
278
0
    RansAssert(freq <= (1 << 16) - start);
279
0
    s->start = (uint16_t) start;
280
0
    s->freq = (uint16_t) freq;
281
0
}
Unexecuted instantiation: rANS_static4x16pr.c:RansDecSymbolInit
Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansDecSymbolInit
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecSymbolInit
Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecSymbolInit
Unexecuted instantiation: rANS_static32x16pr.c:RansDecSymbolInit
282
283
// Encodes a given symbol. This is faster than straight RansEnc since we can do
284
// multiplications instead of a divide.
285
//
286
// See RansEncSymbolInit for a description of how this works.
287
static inline void RansEncPutSymbol(RansState* r, uint8_t** pptr, RansEncSymbol const* sym)
288
0
{
289
    //RansAssert(sym->x_max != 0); // can't encode symbol with freq=0
290
291
    // renormalize
292
0
    uint32_t x = *r;
293
0
    uint32_t x_max = sym->x_max;
294
295
0
#ifdef HTSCODECS_LITTLE_ENDIAN
296
    // Branchless renorm.
297
    //
298
    // This works best on high entropy data where branch prediction
299
    // is poor.
300
    //
301
    // Note the bit-packing and RLE modes are more likely to be used on
302
    // low entropy data, making this assertion generally true.  See
303
    // RansEncPutSymbol_branched for a low-entropy optimised function.
304
305
    // NB: "(x > x_max)*2" turns back into branched code with gcc.
306
0
    int c = (x > x_max); c*=2;
307
0
    memcpy(*pptr-2, &x, 2);
308
0
    x >>= c*8;
309
0
    *pptr = *pptr - c;
310
#else
311
    if (x > x_max) {
312
        uint8_t* ptr = *pptr;
313
        ptr -= 2;
314
        ptr[0] = x & 0xff;
315
        ptr[1] = (x >> 8) & 0xff;
316
        x >>= 16;
317
        *pptr = ptr;
318
    }
319
#endif
320
321
    // x = C(s,x)
322
    // NOTE: written this way so we get a 32-bit "multiply high" when
323
    // available. If you're on a 64-bit platform with cheap multiplies
324
    // (e.g. x64), just bake the +32 into rcp_shift.
325
    //uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> 32) >> sym->rcp_shift;
326
327
    // Slow method, but robust
328
//    *r = ((x / sym->freq) << sym->scale_bits) + (x % sym->freq) + sym->start;
329
//    return;
330
331
    // The extra >>32 has already been added to RansEncSymbolInit
332
0
    uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> sym->rcp_shift);
333
0
    *r = x + sym->bias + q * sym->cmpl_freq;
334
335
//    assert(((x / sym->freq) << sym->scale_bits) + (x % sym->freq) + sym->start == *r);
336
0
}
Unexecuted instantiation: rANS_static4x16pr.c:RansEncPutSymbol
Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansEncPutSymbol
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansEncPutSymbol
Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansEncPutSymbol
Unexecuted instantiation: rANS_static32x16pr.c:RansEncPutSymbol
337
338
static inline void RansEncPutSymbol_branched(RansState* r, uint8_t** pptr, RansEncSymbol const* sym)
339
0
{
340
    //RansAssert(sym->x_max != 0); // can't encode symbol with freq=0
341
342
    // renormalize
343
0
    uint32_t x = *r;
344
0
    uint32_t x_max = sym->x_max;
345
346
0
#ifdef HTSCODECS_LITTLE_ENDIAN
347
    // The old non-branchless method
348
0
    if (x > x_max) {
349
0
        (*pptr) -= 2;
350
0
        memcpy(*pptr, &x, 2);
351
0
        x >>= 16;
352
0
    }
353
#else
354
    if (x > x_max) {
355
        uint8_t* ptr = *pptr;
356
        ptr -= 2;
357
        ptr[0] = x & 0xff;
358
        ptr[1] = (x >> 8) & 0xff;
359
        x >>= 16;
360
        *pptr = ptr;
361
    }
362
#endif
363
364
    // x = C(s,x)
365
    // NOTE: written this way so we get a 32-bit "multiply high" when
366
    // available. If you're on a 64-bit platform with cheap multiplies
367
    // (e.g. x64), just bake the +32 into rcp_shift.
368
    //uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> 32) >> sym->rcp_shift;
369
370
    // Slow method, but robust
371
//    *r = ((x / sym->freq) << sym->scale_bits) + (x % sym->freq) + sym->start;
372
//    return;
373
374
    // The extra >>32 has already been added to RansEncSymbolInit
375
0
    uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> sym->rcp_shift);
376
0
    *r = x + sym->bias + q * sym->cmpl_freq;
377
378
//    assert(((x / sym->freq) << sym->scale_bits) + (x % sym->freq) + sym->start == *r);
379
0
}
Unexecuted instantiation: rANS_static4x16pr.c:RansEncPutSymbol_branched
Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansEncPutSymbol_branched
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansEncPutSymbol_branched
Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansEncPutSymbol_branched
Unexecuted instantiation: rANS_static32x16pr.c:RansEncPutSymbol_branched
380
381
// Equivalent to RansDecAdvance that takes a symbol.
382
static inline void RansDecAdvanceSymbol(RansState* r, uint8_t** pptr, RansDecSymbol const* sym, uint32_t scale_bits)
383
0
{
384
0
    RansDecAdvance(r, pptr, sym->start, sym->freq, scale_bits);
385
0
}
Unexecuted instantiation: rANS_static4x16pr.c:RansDecAdvanceSymbol
Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansDecAdvanceSymbol
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecAdvanceSymbol
Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecAdvanceSymbol
Unexecuted instantiation: rANS_static32x16pr.c:RansDecAdvanceSymbol
386
387
// Advances in the bit stream by "popping" a single symbol with range start
388
// "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits".
389
// No renormalization or output happens.
390
static inline void RansDecAdvanceStep(RansState* r, uint32_t start, uint32_t freq, uint32_t scale_bits)
391
0
{
392
0
    uint32_t mask = (1u << scale_bits) - 1;
393
0
394
0
    // s, x = D(x)
395
0
    uint32_t x = *r;
396
0
    *r = freq * (x >> scale_bits) + (x & mask) - start;
397
0
}
Unexecuted instantiation: rANS_static4x16pr.c:RansDecAdvanceStep
Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansDecAdvanceStep
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecAdvanceStep
Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecAdvanceStep
Unexecuted instantiation: rANS_static32x16pr.c:RansDecAdvanceStep
398
399
// Equivalent to RansDecAdvanceStep that takes a symbol.
400
static inline void RansDecAdvanceSymbolStep(RansState* r, RansDecSymbol const* sym, uint32_t scale_bits)
401
0
{
402
0
    RansDecAdvanceStep(r, sym->start, sym->freq, scale_bits);
403
0
}
Unexecuted instantiation: rANS_static4x16pr.c:RansDecAdvanceSymbolStep
Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansDecAdvanceSymbolStep
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecAdvanceSymbolStep
Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecAdvanceSymbolStep
Unexecuted instantiation: rANS_static32x16pr.c:RansDecAdvanceSymbolStep
404
405
// Renormalize.
406
407
#if defined(__x86_64) && !defined(__ILP32__)
408
409
/*
410
 * Assembly variants of the RansDecRenorm code.
411
 * These are based on joint ideas from Rob Davies and from looking at
412
 * the clang assembly output.
413
 */
414
11.2M
static inline void RansDecRenorm(RansState* r, uint8_t** pptr) {
415
    //       q4        q40
416
    // clang 730/608   717/467
417
    // gcc8  733/588   737/458
418
11.2M
    uint32_t  x   = *r;
419
11.2M
    uint8_t  *ptr = *pptr;
420
11.2M
    __asm__ ("movzwl (%0),  %%eax\n\t"
421
11.2M
             "mov    %1,    %%edx\n\t"
422
11.2M
             "shl    $0x10, %%edx\n\t"
423
11.2M
             "or     %%eax, %%edx\n\t"
424
11.2M
             "xor    %%eax, %%eax\n\t"
425
11.2M
             "cmp    $0x8000,%1\n\t"
426
11.2M
             "cmovb  %%edx, %1\n\t"
427
11.2M
             "lea    2(%0), %%rax\n\t"
428
11.2M
             "cmovb  %%rax, %0\n\t"
429
11.2M
             : "=r" (ptr), "=r" (x)
430
11.2M
             : "0"  (ptr), "1"  (x)
431
11.2M
             : "eax", "edx"
432
11.2M
             );
433
11.2M
    *pptr = (uint8_t *)ptr;
434
11.2M
    *r = x;
435
11.2M
}
rANS_static4x16pr.c:RansDecRenorm
Line
Count
Source
414
11.2M
static inline void RansDecRenorm(RansState* r, uint8_t** pptr) {
415
    //       q4        q40
416
    // clang 730/608   717/467
417
    // gcc8  733/588   737/458
418
11.2M
    uint32_t  x   = *r;
419
11.2M
    uint8_t  *ptr = *pptr;
420
11.2M
    __asm__ ("movzwl (%0),  %%eax\n\t"
421
11.2M
             "mov    %1,    %%edx\n\t"
422
11.2M
             "shl    $0x10, %%edx\n\t"
423
11.2M
             "or     %%eax, %%edx\n\t"
424
11.2M
             "xor    %%eax, %%eax\n\t"
425
11.2M
             "cmp    $0x8000,%1\n\t"
426
11.2M
             "cmovb  %%edx, %1\n\t"
427
11.2M
             "lea    2(%0), %%rax\n\t"
428
11.2M
             "cmovb  %%rax, %0\n\t"
429
11.2M
             : "=r" (ptr), "=r" (x)
430
11.2M
             : "0"  (ptr), "1"  (x)
431
11.2M
             : "eax", "edx"
432
11.2M
             );
433
11.2M
    *pptr = (uint8_t *)ptr;
434
11.2M
    *r = x;
435
11.2M
}
Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansDecRenorm
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecRenorm
Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecRenorm
Unexecuted instantiation: rANS_static32x16pr.c:RansDecRenorm
436
437
#else /* __x86_64 */
438
439
static inline void RansDecRenorm(RansState* r, uint8_t** pptr)
440
{
441
    // renormalize, branchless
442
    uint32_t x = *r;
443
    int cmp = (x < RANS_BYTE_L)*2;
444
    uint32_t y = (*pptr)[0] + ((*pptr)[1]<<8);
445
    uint32_t x2 = (x << 16) | y;
446
    x = cmp ? x2 : x;
447
    (*pptr) += cmp;
448
    *r = x;
449
450
//    // renormalize, branched.  Faster on low-complexity data, but generally
451
//    // that is best compressed with PACK and/or RLE which turns it back
452
//    // into high complexity data.
453
//    uint32_t x = *r;
454
//    uint32_t y = (*pptr)[0] | ((*pptr)[1]<<8);
455
//
456
//    if (x < RANS_BYTE_L)
457
//      (*pptr)+=2;
458
//    if (x < RANS_BYTE_L)
459
//      x = (x << 16) | y;
460
//
461
//    *r = x;
462
}
463
#endif /* __x86_64 */
464
465
// Note the data may not be word aligned here.
466
// This function is only used sparingly, for the last few bytes in the buffer,
467
// so speed isn't critical.
468
static inline void RansDecRenormSafe(RansState* r, uint8_t** pptr, uint8_t *ptr_end)
469
5.49M
{
470
5.49M
    uint32_t x = *r;
471
5.49M
    if (x >= RANS_BYTE_L || *pptr+1 >= ptr_end) return;
472
9.58k
    uint16_t y = (*pptr)[0] + ((*pptr)[1]<<8);
473
9.58k
    x = (x << 16) | y;
474
9.58k
    (*pptr) += 2;
475
9.58k
    *r = x;
476
9.58k
}
rANS_static4x16pr.c:RansDecRenormSafe
Line
Count
Source
469
4.05M
{
470
4.05M
    uint32_t x = *r;
471
4.05M
    if (x >= RANS_BYTE_L || *pptr+1 >= ptr_end) return;
472
1.77k
    uint16_t y = (*pptr)[0] + ((*pptr)[1]<<8);
473
1.77k
    x = (x << 16) | y;
474
1.77k
    (*pptr) += 2;
475
1.77k
    *r = x;
476
1.77k
}
rANS_static32x16pr_avx2.c:RansDecRenormSafe
Line
Count
Source
469
1.44M
{
470
1.44M
    uint32_t x = *r;
471
1.44M
    if (x >= RANS_BYTE_L || *pptr+1 >= ptr_end) return;
472
7.81k
    uint16_t y = (*pptr)[0] + ((*pptr)[1]<<8);
473
7.81k
    x = (x << 16) | y;
474
7.81k
    (*pptr) += 2;
475
7.81k
    *r = x;
476
7.81k
}
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecRenormSafe
Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecRenormSafe
Unexecuted instantiation: rANS_static32x16pr.c:RansDecRenormSafe
477
478
#endif // RANS_WORD_HEADER