/src/htslib/htscodecs/htscodecs/rANS_word.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* rans_byte.h originally from https://github.com/rygorous/ryg_rans |
2 | | * |
3 | | * This is a public-domain implementation of several rANS variants. rANS is an |
4 | | * entropy coder from the ANS family, as described in Jarek Duda's paper |
5 | | * "Asymmetric numeral systems" (http://arxiv.org/abs/1311.2540). |
6 | | */ |
7 | | |
8 | | /*-------------------------------------------------------------------------- */ |
9 | | /* rans_byte.h from https://github.com/rygorous/ryg_rans */ |
10 | | |
11 | | // Simple byte-aligned rANS encoder/decoder - public domain - Fabian 'ryg' Giesen 2014 |
12 | | // |
13 | | // Not intended to be "industrial strength"; just meant to illustrate the general |
14 | | // idea. |
15 | | |
16 | | #ifndef RANS_WORD_HEADER |
17 | | #define RANS_WORD_HEADER |
18 | | |
19 | | #include <stdio.h> |
20 | | #include <stdint.h> |
21 | | #include <string.h> |
22 | | #include <assert.h> |
23 | | #include "htscodecs_endian.h" |
24 | | |
25 | | #ifdef assert |
26 | 0 | #define RansAssert assert |
27 | | #else |
28 | | #define RansAssert(x) |
29 | | #endif |
30 | | |
31 | | // READ ME FIRST: |
32 | | // |
33 | | // This is designed like a typical arithmetic coder API, but there's three |
34 | | // twists you absolutely should be aware of before you start hacking: |
35 | | // |
36 | | // 1. You need to encode data in *reverse* - last symbol first. rANS works |
37 | | // like a stack: last in, first out. |
38 | | // 2. Likewise, the encoder outputs bytes *in reverse* - that is, you give |
39 | | // it a pointer to the *end* of your buffer (exclusive), and it will |
40 | | // slowly move towards the beginning as more bytes are emitted. |
41 | | // 3. Unlike basically any other entropy coder implementation you might |
42 | | // have used, you can interleave data from multiple independent rANS |
43 | | // encoders into the same bytestream without any extra signaling; |
44 | | // you can also just write some bytes by yourself in the middle if |
45 | | // you want to. This is in addition to the usual arithmetic encoder |
46 | | // property of being able to switch models on the fly. Writing raw |
47 | | // bytes can be useful when you have some data that you know is |
48 | | // incompressible, and is cheaper than going through the rANS encode |
49 | | // function. Using multiple rANS coders on the same byte stream wastes |
50 | | // a few bytes compared to using just one, but execution of two |
51 | | // independent encoders can happen in parallel on superscalar and |
52 | | // Out-of-Order CPUs, so this can be *much* faster in tight decoding |
53 | | // loops. |
54 | | // |
55 | | // This is why all the rANS functions take the write pointer as an |
56 | | // argument instead of just storing it in some context struct. |
57 | | |
58 | | // -------------------------------------------------------------------------- |
59 | | |
60 | | // L ('l' in the paper) is the lower bound of our normalization interval. |
61 | | // Between this and our byte-aligned emission, we use 31 (not 32!) bits. |
62 | | // This is done intentionally because exact reciprocals for 31-bit uints |
63 | | // fit in 32-bit uints: this permits some optimizations during encoding. |
64 | 11.1M | #define RANS_BYTE_L (1u << 15) // lower bound of our normalization interval |
65 | | |
66 | | // State for a rANS encoder. Yep, that's all there is to it. |
67 | | typedef uint32_t RansState; |
68 | | |
69 | | // Initialize a rANS encoder. |
70 | | static inline void RansEncInit(RansState* r) |
71 | 0 | { |
72 | 0 | *r = RANS_BYTE_L; |
73 | 0 | } Unexecuted instantiation: rANS_static4x16pr.c:RansEncInit Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansEncInit Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansEncInit Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansEncInit Unexecuted instantiation: rANS_static32x16pr.c:RansEncInit |
74 | | |
75 | | // Renormalize the encoder. Internal function. |
76 | | static inline RansState RansEncRenorm(RansState x, uint8_t** pptr, uint32_t freq, uint32_t scale_bits) |
77 | 0 | { |
78 | 0 | uint32_t x_max = ((RANS_BYTE_L >> scale_bits) << 16) * freq-1; // this turns into a shift. |
79 | 0 | if (x > x_max) { |
80 | 0 | uint16_t* ptr = (uint16_t *)*pptr; |
81 | 0 | *--ptr = (uint16_t) (x & 0xffff); |
82 | 0 | x >>= 16; |
83 | 0 | *pptr = (uint8_t *)ptr; |
84 | 0 | } |
85 | 0 | return x; |
86 | 0 | } Unexecuted instantiation: rANS_static4x16pr.c:RansEncRenorm Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansEncRenorm Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansEncRenorm Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansEncRenorm Unexecuted instantiation: rANS_static32x16pr.c:RansEncRenorm |
87 | | |
88 | | // Encodes a single symbol with range start "start" and frequency "freq". |
89 | | // All frequencies are assumed to sum to "1 << scale_bits", and the |
90 | | // resulting bytes get written to ptr (which is updated). |
91 | | // |
92 | | // NOTE: With rANS, you need to encode symbols in *reverse order*, i.e. from |
93 | | // beginning to end! Likewise, the output bytestream is written *backwards*: |
94 | | // ptr starts pointing at the end of the output buffer and keeps decrementing. |
95 | | static inline void RansEncPut(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits) |
96 | 0 | { |
97 | 0 | // renormalize |
98 | 0 | RansState x = RansEncRenorm(*r, pptr, freq, scale_bits); |
99 | 0 |
|
100 | 0 | // x = C(s,x) |
101 | 0 | *r = ((x / freq) << scale_bits) + (x % freq) + start; |
102 | 0 | } Unexecuted instantiation: rANS_static4x16pr.c:RansEncPut Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansEncPut Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansEncPut Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansEncPut Unexecuted instantiation: rANS_static32x16pr.c:RansEncPut |
103 | | |
104 | | // Flushes the rANS encoder. |
105 | | static inline void RansEncFlush(RansState* r, uint8_t** pptr) |
106 | 0 | { |
107 | 0 | uint32_t x = *r; |
108 | 0 | uint8_t* ptr = *pptr; |
109 | |
|
110 | 0 | ptr -= 4; |
111 | 0 | ptr[0] = (uint8_t) (x >> 0); |
112 | 0 | ptr[1] = (uint8_t) (x >> 8); |
113 | 0 | ptr[2] = (uint8_t) (x >> 16); |
114 | 0 | ptr[3] = (uint8_t) (x >> 24); |
115 | |
|
116 | 0 | *pptr = ptr; |
117 | 0 | } Unexecuted instantiation: rANS_static4x16pr.c:RansEncFlush Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansEncFlush Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansEncFlush Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansEncFlush Unexecuted instantiation: rANS_static32x16pr.c:RansEncFlush |
118 | | |
119 | | // Initializes a rANS decoder. |
120 | | // Unlike the encoder, the decoder works forwards as you'd expect. |
121 | | static inline void RansDecInit(RansState* r, uint8_t** pptr) |
122 | 81.1k | { |
123 | 81.1k | uint32_t x; |
124 | 81.1k | uint8_t* ptr = *pptr; |
125 | | |
126 | 81.1k | x = ptr[0] << 0; |
127 | 81.1k | x |= ptr[1] << 8; |
128 | 81.1k | x |= ptr[2] << 16; |
129 | 81.1k | x |= ((uint32_t)ptr[3]) << 24; |
130 | 81.1k | ptr += 4; |
131 | | |
132 | 81.1k | *pptr = ptr; |
133 | 81.1k | *r = x; |
134 | 81.1k | } rANS_static4x16pr.c:RansDecInit Line | Count | Source | 122 | 9.33k | { | 123 | 9.33k | uint32_t x; | 124 | 9.33k | uint8_t* ptr = *pptr; | 125 | | | 126 | 9.33k | x = ptr[0] << 0; | 127 | 9.33k | x |= ptr[1] << 8; | 128 | 9.33k | x |= ptr[2] << 16; | 129 | 9.33k | x |= ((uint32_t)ptr[3]) << 24; | 130 | 9.33k | ptr += 4; | 131 | | | 132 | 9.33k | *pptr = ptr; | 133 | 9.33k | *r = x; | 134 | 9.33k | } |
rANS_static32x16pr_avx2.c:RansDecInit Line | Count | Source | 122 | 71.8k | { | 123 | 71.8k | uint32_t x; | 124 | 71.8k | uint8_t* ptr = *pptr; | 125 | | | 126 | 71.8k | x = ptr[0] << 0; | 127 | 71.8k | x |= ptr[1] << 8; | 128 | 71.8k | x |= ptr[2] << 16; | 129 | 71.8k | x |= ((uint32_t)ptr[3]) << 24; | 130 | 71.8k | ptr += 4; | 131 | | | 132 | 71.8k | *pptr = ptr; | 133 | 71.8k | *r = x; | 134 | 71.8k | } |
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecInit Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecInit Unexecuted instantiation: rANS_static32x16pr.c:RansDecInit |
135 | | |
136 | | // Returns the current cumulative frequency (map it to a symbol yourself!) |
137 | | static inline uint32_t RansDecGet(RansState* r, uint32_t scale_bits) |
138 | 188k | { |
139 | 188k | return *r & ((1u << scale_bits) - 1); |
140 | 188k | } rANS_static4x16pr.c:RansDecGet Line | Count | Source | 138 | 188k | { | 139 | 188k | return *r & ((1u << scale_bits) - 1); | 140 | 188k | } |
Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansDecGet Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecGet Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecGet Unexecuted instantiation: rANS_static32x16pr.c:RansDecGet |
141 | | |
142 | | // Advances in the bit stream by "popping" a single symbol with range start |
143 | | // "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits", |
144 | | // and the resulting bytes get written to ptr (which is updated). |
145 | | static inline void RansDecAdvance(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits) |
146 | 0 | { |
147 | 0 | uint32_t mask = (1u << scale_bits) - 1; |
148 | 0 |
|
149 | 0 | // s, x = D(x) |
150 | 0 | uint32_t x = *r; |
151 | 0 | x = freq * (x >> scale_bits) + (x & mask) - start; |
152 | 0 |
|
153 | 0 | // renormalize |
154 | 0 | if (x < RANS_BYTE_L) { |
155 | 0 | uint8_t* ptr = *pptr; |
156 | 0 | do x = (x << 8) | *ptr++; while (x < RANS_BYTE_L); |
157 | 0 | *pptr = ptr; |
158 | 0 | } |
159 | 0 |
|
160 | 0 | *r = x; |
161 | 0 | } Unexecuted instantiation: rANS_static4x16pr.c:RansDecAdvance Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansDecAdvance Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecAdvance Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecAdvance Unexecuted instantiation: rANS_static32x16pr.c:RansDecAdvance |
162 | | |
163 | | // -------------------------------------------------------------------------- |
164 | | |
165 | | // That's all you need for a full encoder; below here are some utility |
166 | | // functions with extra convenience or optimizations. |
167 | | |
168 | | // Encoder symbol description |
169 | | // This (admittedly odd) selection of parameters was chosen to make |
170 | | // RansEncPutSymbol as cheap as possible. |
171 | | typedef struct { |
172 | | uint32_t x_max; // (Exclusive) upper bound of pre-normalization interval |
173 | | uint32_t rcp_freq; // Fixed-point reciprocal frequency |
174 | | uint32_t bias; // Bias |
175 | | |
176 | | // NB: This pair are read as a 32-bit value by the SIMD o1 encoder. |
177 | | uint16_t cmpl_freq; // Complement of frequency: (1 << scale_bits) - freq |
178 | | uint16_t rcp_shift; // Reciprocal shift |
179 | | } RansEncSymbol; |
180 | | |
181 | | // As above, but with cmpl_freq and rcp_shift combined into |
182 | | // a single value. This could be done with a cast, but it avoids |
183 | | // a type punning error. We could use a union, but anonymous unions |
184 | | // are C11 only (still that's 10 year old!). For now we just cheat |
185 | | // instead. |
186 | | typedef struct { |
187 | | uint32_t x_max; // (Exclusive) upper bound of pre-normalization interval |
188 | | uint32_t rcp_freq; // Fixed-point reciprocal frequency |
189 | | uint32_t bias; // Bias |
190 | | |
191 | | uint32_t cmpl_freq; // cmpl_freq+rcp_shift |
192 | | } RansEncSymbol_simd; |
193 | | |
194 | | // Decoder symbols are straightforward. |
195 | | typedef struct { |
196 | | uint16_t start; // Start of range. |
197 | | uint16_t freq; // Symbol frequency. |
198 | | } RansDecSymbol; |
199 | | |
200 | | // Initializes an encoder symbol to start "start" and frequency "freq" |
201 | | static inline void RansEncSymbolInit(RansEncSymbol* s, uint32_t start, uint32_t freq, uint32_t scale_bits) |
202 | 0 | { |
203 | 0 | RansAssert(scale_bits <= 16); |
204 | 0 | RansAssert(start <= (1u << scale_bits)); |
205 | 0 | RansAssert(freq <= (1u << scale_bits) - start); |
206 | | |
207 | | // Say M := 1 << scale_bits. |
208 | | // |
209 | | // The original encoder does: |
210 | | // x_new = (x/freq)*M + start + (x%freq) |
211 | | // |
212 | | // The fast encoder does (schematically): |
213 | | // q = mul_hi(x, rcp_freq) >> rcp_shift (division) |
214 | | // r = x - q*freq (remainder) |
215 | | // x_new = q*M + bias + r (new x) |
216 | | // plugging in r into x_new yields: |
217 | | // x_new = bias + x + q*(M - freq) |
218 | | // =: bias + x + q*cmpl_freq (*) |
219 | | // |
220 | | // and we can just precompute cmpl_freq. Now we just need to |
221 | | // set up our parameters such that the original encoder and |
222 | | // the fast encoder agree. |
223 | | |
224 | 0 | s->x_max = ((RANS_BYTE_L >> scale_bits) << 16) * freq -1; |
225 | 0 | s->cmpl_freq = (uint16_t) ((1 << scale_bits) - freq); |
226 | 0 | if (freq < 2) { |
227 | | // freq=0 symbols are never valid to encode, so it doesn't matter what |
228 | | // we set our values to. |
229 | | // |
230 | | // freq=1 is tricky, since the reciprocal of 1 is 1; unfortunately, |
231 | | // our fixed-point reciprocal approximation can only multiply by values |
232 | | // smaller than 1. |
233 | | // |
234 | | // So we use the "next best thing": rcp_freq=0xffffffff, rcp_shift=0. |
235 | | // This gives: |
236 | | // q = mul_hi(x, rcp_freq) >> rcp_shift |
237 | | // = mul_hi(x, (1<<32) - 1)) >> 0 |
238 | | // = floor(x - x/(2^32)) |
239 | | // = x - 1 if 1 <= x < 2^32 |
240 | | // and we know that x>0 (x=0 is never in a valid normalization interval). |
241 | | // |
242 | | // So we now need to choose the other parameters such that |
243 | | // x_new = x*M + start |
244 | | // plug it in: |
245 | | // x*M + start (desired result) |
246 | | // = bias + x + q*cmpl_freq (*) |
247 | | // = bias + x + (x - 1)*(M - 1) (plug in q=x-1, cmpl_freq) |
248 | | // = bias + 1 + (x - 1)*M |
249 | | // = x*M + (bias + 1 - M) |
250 | | // |
251 | | // so we have start = bias + 1 - M, or equivalently |
252 | | // bias = start + M - 1. |
253 | 0 | s->rcp_freq = ~0u; |
254 | 0 | s->rcp_shift = 0; |
255 | 0 | s->bias = start + (1 << scale_bits) - 1; |
256 | 0 | } else { |
257 | | // Alverson, "Integer Division using reciprocals" |
258 | | // shift=ceil(log2(freq)) |
259 | 0 | uint32_t shift = 0; |
260 | 0 | while (freq > (1u << shift)) |
261 | 0 | shift++; |
262 | |
|
263 | 0 | s->rcp_freq = (uint32_t) (((1ull << (shift + 31)) + freq-1) / freq); |
264 | 0 | s->rcp_shift = shift - 1; |
265 | | |
266 | | // With these values, 'q' is the correct quotient, so we |
267 | | // have bias=start. |
268 | 0 | s->bias = start; |
269 | 0 | } |
270 | |
|
271 | 0 | s->rcp_shift += 32; // Avoid the extra >>32 in RansEncPutSymbol |
272 | 0 | } Unexecuted instantiation: rANS_static4x16pr.c:RansEncSymbolInit Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansEncSymbolInit Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansEncSymbolInit Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansEncSymbolInit Unexecuted instantiation: rANS_static32x16pr.c:RansEncSymbolInit |
273 | | |
274 | | // Initialize a decoder symbol to start "start" and frequency "freq" |
275 | | static inline void RansDecSymbolInit(RansDecSymbol* s, uint32_t start, uint32_t freq) |
276 | 0 | { |
277 | 0 | RansAssert(start <= (1 << 16)); |
278 | 0 | RansAssert(freq <= (1 << 16) - start); |
279 | 0 | s->start = (uint16_t) start; |
280 | 0 | s->freq = (uint16_t) freq; |
281 | 0 | } Unexecuted instantiation: rANS_static4x16pr.c:RansDecSymbolInit Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansDecSymbolInit Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecSymbolInit Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecSymbolInit Unexecuted instantiation: rANS_static32x16pr.c:RansDecSymbolInit |
282 | | |
283 | | // Encodes a given symbol. This is faster than straight RansEnc since we can do |
284 | | // multiplications instead of a divide. |
285 | | // |
286 | | // See RansEncSymbolInit for a description of how this works. |
287 | | static inline void RansEncPutSymbol(RansState* r, uint8_t** pptr, RansEncSymbol const* sym) |
288 | 0 | { |
289 | | //RansAssert(sym->x_max != 0); // can't encode symbol with freq=0 |
290 | | |
291 | | // renormalize |
292 | 0 | uint32_t x = *r; |
293 | 0 | uint32_t x_max = sym->x_max; |
294 | |
|
295 | 0 | #ifdef HTSCODECS_LITTLE_ENDIAN |
296 | | // Branchless renorm. |
297 | | // |
298 | | // This works best on high entropy data where branch prediction |
299 | | // is poor. |
300 | | // |
301 | | // Note the bit-packing and RLE modes are more likely to be used on |
302 | | // low entropy data, making this assertion generally true. See |
303 | | // RansEncPutSymbol_branched for a low-entropy optimised function. |
304 | | |
305 | | // NB: "(x > x_max)*2" turns back into branched code with gcc. |
306 | 0 | int c = (x > x_max); c*=2; |
307 | 0 | memcpy(*pptr-2, &x, 2); |
308 | 0 | x >>= c*8; |
309 | 0 | *pptr = *pptr - c; |
310 | | #else |
311 | | if (x > x_max) { |
312 | | uint8_t* ptr = *pptr; |
313 | | ptr -= 2; |
314 | | ptr[0] = x & 0xff; |
315 | | ptr[1] = (x >> 8) & 0xff; |
316 | | x >>= 16; |
317 | | *pptr = ptr; |
318 | | } |
319 | | #endif |
320 | | |
321 | | // x = C(s,x) |
322 | | // NOTE: written this way so we get a 32-bit "multiply high" when |
323 | | // available. If you're on a 64-bit platform with cheap multiplies |
324 | | // (e.g. x64), just bake the +32 into rcp_shift. |
325 | | //uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> 32) >> sym->rcp_shift; |
326 | | |
327 | | // Slow method, but robust |
328 | | // *r = ((x / sym->freq) << sym->scale_bits) + (x % sym->freq) + sym->start; |
329 | | // return; |
330 | | |
331 | | // The extra >>32 has already been added to RansEncSymbolInit |
332 | 0 | uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> sym->rcp_shift); |
333 | 0 | *r = x + sym->bias + q * sym->cmpl_freq; |
334 | | |
335 | | // assert(((x / sym->freq) << sym->scale_bits) + (x % sym->freq) + sym->start == *r); |
336 | 0 | } Unexecuted instantiation: rANS_static4x16pr.c:RansEncPutSymbol Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansEncPutSymbol Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansEncPutSymbol Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansEncPutSymbol Unexecuted instantiation: rANS_static32x16pr.c:RansEncPutSymbol |
337 | | |
338 | | static inline void RansEncPutSymbol_branched(RansState* r, uint8_t** pptr, RansEncSymbol const* sym) |
339 | 0 | { |
340 | | //RansAssert(sym->x_max != 0); // can't encode symbol with freq=0 |
341 | | |
342 | | // renormalize |
343 | 0 | uint32_t x = *r; |
344 | 0 | uint32_t x_max = sym->x_max; |
345 | |
|
346 | 0 | #ifdef HTSCODECS_LITTLE_ENDIAN |
347 | | // The old non-branchless method |
348 | 0 | if (x > x_max) { |
349 | 0 | (*pptr) -= 2; |
350 | 0 | memcpy(*pptr, &x, 2); |
351 | 0 | x >>= 16; |
352 | 0 | } |
353 | | #else |
354 | | if (x > x_max) { |
355 | | uint8_t* ptr = *pptr; |
356 | | ptr -= 2; |
357 | | ptr[0] = x & 0xff; |
358 | | ptr[1] = (x >> 8) & 0xff; |
359 | | x >>= 16; |
360 | | *pptr = ptr; |
361 | | } |
362 | | #endif |
363 | | |
364 | | // x = C(s,x) |
365 | | // NOTE: written this way so we get a 32-bit "multiply high" when |
366 | | // available. If you're on a 64-bit platform with cheap multiplies |
367 | | // (e.g. x64), just bake the +32 into rcp_shift. |
368 | | //uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> 32) >> sym->rcp_shift; |
369 | | |
370 | | // Slow method, but robust |
371 | | // *r = ((x / sym->freq) << sym->scale_bits) + (x % sym->freq) + sym->start; |
372 | | // return; |
373 | | |
374 | | // The extra >>32 has already been added to RansEncSymbolInit |
375 | 0 | uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> sym->rcp_shift); |
376 | 0 | *r = x + sym->bias + q * sym->cmpl_freq; |
377 | | |
378 | | // assert(((x / sym->freq) << sym->scale_bits) + (x % sym->freq) + sym->start == *r); |
379 | 0 | } Unexecuted instantiation: rANS_static4x16pr.c:RansEncPutSymbol_branched Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansEncPutSymbol_branched Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansEncPutSymbol_branched Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansEncPutSymbol_branched Unexecuted instantiation: rANS_static32x16pr.c:RansEncPutSymbol_branched |
380 | | |
381 | | // Equivalent to RansDecAdvance that takes a symbol. |
382 | | static inline void RansDecAdvanceSymbol(RansState* r, uint8_t** pptr, RansDecSymbol const* sym, uint32_t scale_bits) |
383 | 0 | { |
384 | 0 | RansDecAdvance(r, pptr, sym->start, sym->freq, scale_bits); |
385 | 0 | } Unexecuted instantiation: rANS_static4x16pr.c:RansDecAdvanceSymbol Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansDecAdvanceSymbol Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecAdvanceSymbol Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecAdvanceSymbol Unexecuted instantiation: rANS_static32x16pr.c:RansDecAdvanceSymbol |
386 | | |
387 | | // Advances in the bit stream by "popping" a single symbol with range start |
388 | | // "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits". |
389 | | // No renormalization or output happens. |
390 | | static inline void RansDecAdvanceStep(RansState* r, uint32_t start, uint32_t freq, uint32_t scale_bits) |
391 | 0 | { |
392 | 0 | uint32_t mask = (1u << scale_bits) - 1; |
393 | 0 |
|
394 | 0 | // s, x = D(x) |
395 | 0 | uint32_t x = *r; |
396 | 0 | *r = freq * (x >> scale_bits) + (x & mask) - start; |
397 | 0 | } Unexecuted instantiation: rANS_static4x16pr.c:RansDecAdvanceStep Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansDecAdvanceStep Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecAdvanceStep Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecAdvanceStep Unexecuted instantiation: rANS_static32x16pr.c:RansDecAdvanceStep |
398 | | |
399 | | // Equivalent to RansDecAdvanceStep that takes a symbol. |
400 | | static inline void RansDecAdvanceSymbolStep(RansState* r, RansDecSymbol const* sym, uint32_t scale_bits) |
401 | 0 | { |
402 | 0 | RansDecAdvanceStep(r, sym->start, sym->freq, scale_bits); |
403 | 0 | } Unexecuted instantiation: rANS_static4x16pr.c:RansDecAdvanceSymbolStep Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansDecAdvanceSymbolStep Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecAdvanceSymbolStep Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecAdvanceSymbolStep Unexecuted instantiation: rANS_static32x16pr.c:RansDecAdvanceSymbolStep |
404 | | |
405 | | // Renormalize. |
406 | | |
407 | | #if defined(__x86_64) && !defined(__ILP32__) |
408 | | |
409 | | /* |
410 | | * Assembly variants of the RansDecRenorm code. |
411 | | * These are based on joint ideas from Rob Davies and from looking at |
412 | | * the clang assembly output. |
413 | | */ |
414 | 11.2M | static inline void RansDecRenorm(RansState* r, uint8_t** pptr) { |
415 | | // q4 q40 |
416 | | // clang 730/608 717/467 |
417 | | // gcc8 733/588 737/458 |
418 | 11.2M | uint32_t x = *r; |
419 | 11.2M | uint8_t *ptr = *pptr; |
420 | 11.2M | __asm__ ("movzwl (%0), %%eax\n\t" |
421 | 11.2M | "mov %1, %%edx\n\t" |
422 | 11.2M | "shl $0x10, %%edx\n\t" |
423 | 11.2M | "or %%eax, %%edx\n\t" |
424 | 11.2M | "xor %%eax, %%eax\n\t" |
425 | 11.2M | "cmp $0x8000,%1\n\t" |
426 | 11.2M | "cmovb %%edx, %1\n\t" |
427 | 11.2M | "lea 2(%0), %%rax\n\t" |
428 | 11.2M | "cmovb %%rax, %0\n\t" |
429 | 11.2M | : "=r" (ptr), "=r" (x) |
430 | 11.2M | : "0" (ptr), "1" (x) |
431 | 11.2M | : "eax", "edx" |
432 | 11.2M | ); |
433 | 11.2M | *pptr = (uint8_t *)ptr; |
434 | 11.2M | *r = x; |
435 | 11.2M | } rANS_static4x16pr.c:RansDecRenorm Line | Count | Source | 414 | 11.2M | static inline void RansDecRenorm(RansState* r, uint8_t** pptr) { | 415 | | // q4 q40 | 416 | | // clang 730/608 717/467 | 417 | | // gcc8 733/588 737/458 | 418 | 11.2M | uint32_t x = *r; | 419 | 11.2M | uint8_t *ptr = *pptr; | 420 | 11.2M | __asm__ ("movzwl (%0), %%eax\n\t" | 421 | 11.2M | "mov %1, %%edx\n\t" | 422 | 11.2M | "shl $0x10, %%edx\n\t" | 423 | 11.2M | "or %%eax, %%edx\n\t" | 424 | 11.2M | "xor %%eax, %%eax\n\t" | 425 | 11.2M | "cmp $0x8000,%1\n\t" | 426 | 11.2M | "cmovb %%edx, %1\n\t" | 427 | 11.2M | "lea 2(%0), %%rax\n\t" | 428 | 11.2M | "cmovb %%rax, %0\n\t" | 429 | 11.2M | : "=r" (ptr), "=r" (x) | 430 | 11.2M | : "0" (ptr), "1" (x) | 431 | 11.2M | : "eax", "edx" | 432 | 11.2M | ); | 433 | 11.2M | *pptr = (uint8_t *)ptr; | 434 | 11.2M | *r = x; | 435 | 11.2M | } |
Unexecuted instantiation: rANS_static32x16pr_avx2.c:RansDecRenorm Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecRenorm Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecRenorm Unexecuted instantiation: rANS_static32x16pr.c:RansDecRenorm |
436 | | |
437 | | #else /* __x86_64 */ |
438 | | |
439 | | static inline void RansDecRenorm(RansState* r, uint8_t** pptr) |
440 | | { |
441 | | // renormalize, branchless |
442 | | uint32_t x = *r; |
443 | | int cmp = (x < RANS_BYTE_L)*2; |
444 | | uint32_t y = (*pptr)[0] + ((*pptr)[1]<<8); |
445 | | uint32_t x2 = (x << 16) | y; |
446 | | x = cmp ? x2 : x; |
447 | | (*pptr) += cmp; |
448 | | *r = x; |
449 | | |
450 | | // // renormalize, branched. Faster on low-complexity data, but generally |
451 | | // // that is best compressed with PACK and/or RLE which turns it back |
452 | | // // into high complexity data. |
453 | | // uint32_t x = *r; |
454 | | // uint32_t y = (*pptr)[0] | ((*pptr)[1]<<8); |
455 | | // |
456 | | // if (x < RANS_BYTE_L) |
457 | | // (*pptr)+=2; |
458 | | // if (x < RANS_BYTE_L) |
459 | | // x = (x << 16) | y; |
460 | | // |
461 | | // *r = x; |
462 | | } |
463 | | #endif /* __x86_64 */ |
464 | | |
465 | | // Note the data may not be word aligned here. |
466 | | // This function is only used sparingly, for the last few bytes in the buffer, |
467 | | // so speed isn't critical. |
468 | | static inline void RansDecRenormSafe(RansState* r, uint8_t** pptr, uint8_t *ptr_end) |
469 | 5.49M | { |
470 | 5.49M | uint32_t x = *r; |
471 | 5.49M | if (x >= RANS_BYTE_L || *pptr+1 >= ptr_end) return; |
472 | 9.58k | uint16_t y = (*pptr)[0] + ((*pptr)[1]<<8); |
473 | 9.58k | x = (x << 16) | y; |
474 | 9.58k | (*pptr) += 2; |
475 | 9.58k | *r = x; |
476 | 9.58k | } rANS_static4x16pr.c:RansDecRenormSafe Line | Count | Source | 469 | 4.05M | { | 470 | 4.05M | uint32_t x = *r; | 471 | 4.05M | if (x >= RANS_BYTE_L || *pptr+1 >= ptr_end) return; | 472 | 1.77k | uint16_t y = (*pptr)[0] + ((*pptr)[1]<<8); | 473 | 1.77k | x = (x << 16) | y; | 474 | 1.77k | (*pptr) += 2; | 475 | 1.77k | *r = x; | 476 | 1.77k | } |
rANS_static32x16pr_avx2.c:RansDecRenormSafe Line | Count | Source | 469 | 1.44M | { | 470 | 1.44M | uint32_t x = *r; | 471 | 1.44M | if (x >= RANS_BYTE_L || *pptr+1 >= ptr_end) return; | 472 | 7.81k | uint16_t y = (*pptr)[0] + ((*pptr)[1]<<8); | 473 | 7.81k | x = (x << 16) | y; | 474 | 7.81k | (*pptr) += 2; | 475 | 7.81k | *r = x; | 476 | 7.81k | } |
Unexecuted instantiation: rANS_static32x16pr_avx512.c:RansDecRenormSafe Unexecuted instantiation: rANS_static32x16pr_sse4.c:RansDecRenormSafe Unexecuted instantiation: rANS_static32x16pr.c:RansDecRenormSafe |
477 | | |
478 | | #endif // RANS_WORD_HEADER |