/src/theora/lib/mathops.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "internal.h" |
2 | | #include "mathops.h" |
3 | | |
4 | | /*The fastest fallback strategy for platforms with fast multiplication appears |
5 | | to be based on de Bruijn sequences~\cite{LP98}. |
6 | | Define OC_ILOG_NODEBRUIJN to use a simpler fallback on platforms where |
7 | | multiplication or table lookups are too expensive. |
8 | | |
9 | | @UNPUBLISHED{LP98, |
10 | | author="Charles E. Leiserson and Harald Prokop", |
11 | | title="Using de {Bruijn} Sequences to Index a 1 in a Computer Word", |
12 | | month=Jun, |
13 | | year=1998, |
14 | | note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}" |
15 | | }*/ |
16 | | #if !defined(OC_ILOG_NODEBRUIJN)&&!defined(OC_CLZ32) |
17 | | static const unsigned char OC_DEBRUIJN_IDX32[32]={ |
18 | | 0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8, |
19 | | 31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9 |
20 | | }; |
21 | | #endif |
22 | | |
23 | 0 | int oc_ilog32(ogg_uint32_t _v){ |
24 | 0 | #if defined(OC_CLZ32) |
25 | 0 | return OC_CLZ32_OFFS-OC_CLZ32(_v)&-!!_v; |
26 | | #else |
27 | | /*On a Pentium M, this branchless version tested as the fastest version without |
28 | | multiplications on 1,000,000,000 random 32-bit integers, edging out a |
29 | | similar version with branches, and a 256-entry LUT version.*/ |
30 | | # if defined(OC_ILOG_NODEBRUIJN) |
31 | | int ret; |
32 | | int m; |
33 | | ret=_v>0; |
34 | | m=(_v>0xFFFFU)<<4; |
35 | | _v>>=m; |
36 | | ret|=m; |
37 | | m=(_v>0xFFU)<<3; |
38 | | _v>>=m; |
39 | | ret|=m; |
40 | | m=(_v>0xFU)<<2; |
41 | | _v>>=m; |
42 | | ret|=m; |
43 | | m=(_v>3)<<1; |
44 | | _v>>=m; |
45 | | ret|=m; |
46 | | ret+=_v>1; |
47 | | return ret; |
48 | | /*This de Bruijn sequence version is faster if you have a fast multiplier.*/ |
49 | | # else |
50 | | int ret; |
51 | | _v|=_v>>1; |
52 | | _v|=_v>>2; |
53 | | _v|=_v>>4; |
54 | | _v|=_v>>8; |
55 | | _v|=_v>>16; |
56 | | ret=_v&1; |
57 | | _v=(_v>>1)+1; |
58 | | ret+=OC_DEBRUIJN_IDX32[_v*0x77CB531U>>27&0x1F]; |
59 | | return ret; |
60 | | # endif |
61 | | #endif |
62 | 0 | } |
63 | | |
64 | 1.45M | int oc_ilog64(ogg_int64_t _v){ |
65 | 1.45M | #if defined(OC_CLZ64) |
66 | 1.45M | return OC_CLZ64_OFFS-OC_CLZ64(_v)&-!!_v; |
67 | | #else |
68 | | /*If we don't have a fast 64-bit word implementation, split it into two 32-bit |
69 | | halves.*/ |
70 | | # if defined(OC_ILOG_NODEBRUIJN)|| \ |
71 | | defined(OC_CLZ32)||LONG_MAX<9223372036854775807LL |
72 | | ogg_uint32_t v; |
73 | | int ret; |
74 | | int m; |
75 | | m=(_v>0xFFFFFFFFU)<<5; |
76 | | v=(ogg_uint32_t)(_v>>m); |
77 | | # if defined(OC_CLZ32) |
78 | | ret=m+OC_CLZ32_OFFS-OC_CLZ32(v)&-!!v; |
79 | | # elif defined(OC_ILOG_NODEBRUIJN) |
80 | | ret=v>0|m; |
81 | | m=(v>0xFFFFU)<<4; |
82 | | v>>=m; |
83 | | ret|=m; |
84 | | m=(v>0xFFU)<<3; |
85 | | v>>=m; |
86 | | ret|=m; |
87 | | m=(v>0xFU)<<2; |
88 | | v>>=m; |
89 | | ret|=m; |
90 | | m=(v>3)<<1; |
91 | | v>>=m; |
92 | | ret|=m; |
93 | | ret+=v>1; |
94 | | return ret; |
95 | | # else |
96 | | v|=v>>1; |
97 | | v|=v>>2; |
98 | | v|=v>>4; |
99 | | v|=v>>8; |
100 | | v|=v>>16; |
101 | | ret=v&1|m; |
102 | | v=(v>>1)+1; |
103 | | ret+=OC_DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F]; |
104 | | # endif |
105 | | return ret; |
106 | | /*Otherwise do it in one 64-bit multiply.*/ |
107 | | # else |
108 | | static const unsigned char OC_DEBRUIJN_IDX64[64]={ |
109 | | 0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40, |
110 | | 5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57, |
111 | | 63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56, |
112 | | 62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58 |
113 | | }; |
114 | | int ret; |
115 | | _v|=_v>>1; |
116 | | _v|=_v>>2; |
117 | | _v|=_v>>4; |
118 | | _v|=_v>>8; |
119 | | _v|=_v>>16; |
120 | | _v|=_v>>32; |
121 | | ret=(int)_v&1; |
122 | | _v=(_v>>1)+1; |
123 | | ret+=OC_DEBRUIJN_IDX64[_v*0x218A392CD3D5DBF>>58&0x3F]; |
124 | | return ret; |
125 | | # endif |
126 | | #endif |
127 | 1.45M | } |
128 | | |
129 | | /*round(2**(62+i)*atanh(2**(-(i+1)))/log(2))*/ |
130 | | static const ogg_int64_t OC_ATANH_LOG2[32]={ |
131 | | 0x32B803473F7AD0F4LL,0x2F2A71BD4E25E916LL,0x2E68B244BB93BA06LL, |
132 | | 0x2E39FB9198CE62E4LL,0x2E2E683F68565C8FLL,0x2E2B850BE2077FC1LL, |
133 | | 0x2E2ACC58FE7B78DBLL,0x2E2A9E2DE52FD5F2LL,0x2E2A92A338D53EECLL, |
134 | | 0x2E2A8FC08F5E19B6LL,0x2E2A8F07E51A485ELL,0x2E2A8ED9BA8AF388LL, |
135 | | 0x2E2A8ECE2FE7384ALL,0x2E2A8ECB4D3E4B1ALL,0x2E2A8ECA94940FE8LL, |
136 | | 0x2E2A8ECA6669811DLL,0x2E2A8ECA5ADEDD6ALL,0x2E2A8ECA57FC347ELL, |
137 | | 0x2E2A8ECA57438A43LL,0x2E2A8ECA57155FB4LL,0x2E2A8ECA5709D510LL, |
138 | | 0x2E2A8ECA5706F267LL,0x2E2A8ECA570639BDLL,0x2E2A8ECA57060B92LL, |
139 | | 0x2E2A8ECA57060008LL,0x2E2A8ECA5705FD25LL,0x2E2A8ECA5705FC6CLL, |
140 | | 0x2E2A8ECA5705FC3ELL,0x2E2A8ECA5705FC33LL,0x2E2A8ECA5705FC30LL, |
141 | | 0x2E2A8ECA5705FC2FLL,0x2E2A8ECA5705FC2FLL |
142 | | }; |
143 | | |
144 | | /*Computes the binary exponential of _z, a log base 2 in Q57 format.*/ |
145 | 954k | ogg_int64_t oc_bexp64(ogg_int64_t _z){ |
146 | 954k | ogg_int64_t w; |
147 | 954k | ogg_int64_t z; |
148 | 954k | int ipart; |
149 | 954k | ipart=(int)(_z>>57); |
150 | 954k | if(ipart<0)return 0; |
151 | 865k | if(ipart>=63)return 0x7FFFFFFFFFFFFFFFLL; |
152 | 863k | z=_z-OC_Q57(ipart); |
153 | 863k | if(z){ |
154 | 860k | ogg_int64_t mask; |
155 | 860k | long wlo; |
156 | 860k | int i; |
157 | | /*C doesn't give us 64x64->128 muls, so we use CORDIC. |
158 | | This is not particularly fast, but it's not being used in time-critical |
159 | | code; it is very accurate.*/ |
160 | | /*z is the fractional part of the log in Q62 format. |
161 | | We need 1 bit of headroom since the magnitude can get larger than 1 |
162 | | during the iteration, and a sign bit.*/ |
163 | 860k | z<<=5; |
164 | | /*w is the exponential in Q61 format (since it also needs headroom and can |
165 | | get as large as 2.0); we could get another bit if we dropped the sign, |
166 | | but we'll recover that bit later anyway. |
167 | | Ideally this should start out as |
168 | | \lim_{n->\infty} 2^{61}/\product_{i=1}^n \sqrt{1-2^{-2i}} |
169 | | but in order to guarantee convergence we have to repeat iterations 4, |
170 | | 13 (=3*4+1), and 40 (=3*13+1, etc.), so it winds up somewhat larger.*/ |
171 | 860k | w=0x26A3D0E401DD846DLL; |
172 | 3.44M | for(i=0;;i++){ |
173 | 3.44M | mask=-(z<0); |
174 | 3.44M | w+=(w>>i+1)+mask^mask; |
175 | 3.44M | z-=OC_ATANH_LOG2[i]+mask^mask; |
176 | | /*Repeat iteration 4.*/ |
177 | 3.44M | if(i>=3)break; |
178 | 2.58M | z<<=1; |
179 | 2.58M | } |
180 | 8.60M | for(;;i++){ |
181 | 8.60M | mask=-(z<0); |
182 | 8.60M | w+=(w>>i+1)+mask^mask; |
183 | 8.60M | z-=OC_ATANH_LOG2[i]+mask^mask; |
184 | | /*Repeat iteration 13.*/ |
185 | 8.60M | if(i>=12)break; |
186 | 7.74M | z<<=1; |
187 | 7.74M | } |
188 | 18.0M | for(;i<32;i++){ |
189 | 17.2M | mask=-(z<0); |
190 | 17.2M | w+=(w>>i+1)+mask^mask; |
191 | 17.2M | z=z-(OC_ATANH_LOG2[i]+mask^mask)<<1; |
192 | 17.2M | } |
193 | 860k | wlo=0; |
194 | | /*Skip the remaining iterations unless we really require that much |
195 | | precision. |
196 | | We could have bailed out earlier for smaller iparts, but that would |
197 | | require initializing w from a table, as the limit doesn't converge to |
198 | | 61-bit precision until n=30.*/ |
199 | 860k | if(ipart>30){ |
200 | | /*For these iterations, we just update the low bits, as the high bits |
201 | | can't possibly be affected. |
202 | | OC_ATANH_LOG2 has also converged (it actually did so one iteration |
203 | | earlier, but that's no reason for an extra special case).*/ |
204 | 808k | for(;;i++){ |
205 | 808k | mask=-(z<0); |
206 | 808k | wlo+=(w>>i)+mask^mask; |
207 | 808k | z-=OC_ATANH_LOG2[31]+mask^mask; |
208 | | /*Repeat iteration 40.*/ |
209 | 808k | if(i>=39)break; |
210 | 707k | z<<=1; |
211 | 707k | } |
212 | 2.32M | for(;i<61;i++){ |
213 | 2.22M | mask=-(z<0); |
214 | 2.22M | wlo+=(w>>i)+mask^mask; |
215 | 2.22M | z=z-(OC_ATANH_LOG2[31]+mask^mask)<<1; |
216 | 2.22M | } |
217 | 101k | } |
218 | 860k | w=(w<<1)+wlo; |
219 | 860k | } |
220 | 2.89k | else w=(ogg_int64_t)1<<62; |
221 | 863k | if(ipart<62)w=(w>>61-ipart)+1>>1; |
222 | 863k | return w; |
223 | 865k | } |
224 | | |
225 | | /*Computes the binary logarithm of _w, returned in Q57 format.*/ |
226 | 1.48M | ogg_int64_t oc_blog64(ogg_int64_t _w){ |
227 | 1.48M | ogg_int64_t z; |
228 | 1.48M | int ipart; |
229 | 1.48M | if(_w<=0)return -1; |
230 | 1.45M | ipart=OC_ILOGNZ_64(_w)-1; |
231 | 1.45M | if(ipart>61)_w>>=ipart-61; |
232 | 1.45M | else _w<<=61-ipart; |
233 | 1.45M | z=0; |
234 | 1.45M | if(_w&_w-1){ |
235 | 1.43M | ogg_int64_t x; |
236 | 1.43M | ogg_int64_t y; |
237 | 1.43M | ogg_int64_t u; |
238 | 1.43M | ogg_int64_t mask; |
239 | 1.43M | int i; |
240 | | /*C doesn't give us 64x64->128 muls, so we use CORDIC. |
241 | | This is not particularly fast, but it's not being used in time-critical |
242 | | code; it is very accurate.*/ |
243 | | /*z is the fractional part of the log in Q61 format.*/ |
244 | | /*x and y are the cosh() and sinh(), respectively, in Q61 format. |
245 | | We are computing z=2*atanh(y/x)=2*atanh((_w-1)/(_w+1)).*/ |
246 | 1.43M | x=_w+((ogg_int64_t)1<<61); |
247 | 1.43M | y=_w-((ogg_int64_t)1<<61); |
248 | 7.15M | for(i=0;i<4;i++){ |
249 | 5.72M | mask=-(y<0); |
250 | 5.72M | z+=(OC_ATANH_LOG2[i]>>i)+mask^mask; |
251 | 5.72M | u=x>>i+1; |
252 | 5.72M | x-=(y>>i+1)+mask^mask; |
253 | 5.72M | y-=u+mask^mask; |
254 | 5.72M | } |
255 | | /*Repeat iteration 4.*/ |
256 | 15.7M | for(i--;i<13;i++){ |
257 | 14.3M | mask=-(y<0); |
258 | 14.3M | z+=(OC_ATANH_LOG2[i]>>i)+mask^mask; |
259 | 14.3M | u=x>>i+1; |
260 | 14.3M | x-=(y>>i+1)+mask^mask; |
261 | 14.3M | y-=u+mask^mask; |
262 | 14.3M | } |
263 | | /*Repeat iteration 13.*/ |
264 | 30.0M | for(i--;i<32;i++){ |
265 | 28.6M | mask=-(y<0); |
266 | 28.6M | z+=(OC_ATANH_LOG2[i]>>i)+mask^mask; |
267 | 28.6M | u=x>>i+1; |
268 | 28.6M | x-=(y>>i+1)+mask^mask; |
269 | 28.6M | y-=u+mask^mask; |
270 | 28.6M | } |
271 | | /*OC_ATANH_LOG2 has converged.*/ |
272 | 12.8M | for(;i<40;i++){ |
273 | 11.4M | mask=-(y<0); |
274 | 11.4M | z+=(OC_ATANH_LOG2[31]>>i)+mask^mask; |
275 | 11.4M | u=x>>i+1; |
276 | 11.4M | x-=(y>>i+1)+mask^mask; |
277 | 11.4M | y-=u+mask^mask; |
278 | 11.4M | } |
279 | | /*Repeat iteration 40.*/ |
280 | 34.3M | for(i--;i<62;i++){ |
281 | 32.9M | mask=-(y<0); |
282 | 32.9M | z+=(OC_ATANH_LOG2[31]>>i)+mask^mask; |
283 | 32.9M | u=x>>i+1; |
284 | 32.9M | x-=(y>>i+1)+mask^mask; |
285 | 32.9M | y-=u+mask^mask; |
286 | 32.9M | } |
287 | 1.43M | z=z+8>>4; |
288 | 1.43M | } |
289 | 1.45M | return OC_Q57(ipart)+z; |
290 | 1.48M | } |
291 | | |
292 | | /*Polynomial approximation of a binary exponential. |
293 | | Q10 input, Q0 output.*/ |
294 | 44.5k | ogg_uint32_t oc_bexp32_q10(int _z){ |
295 | 44.5k | unsigned n; |
296 | 44.5k | int ipart; |
297 | 44.5k | ipart=_z>>10; |
298 | 44.5k | n=(_z&(1<<10)-1)<<4; |
299 | 44.5k | n=(n*((n*((n*((n*3548>>15)+6817)>>15)+15823)>>15)+22708)>>15)+16384; |
300 | 44.5k | return 14-ipart>0?n+(1<<13-ipart)>>14-ipart:n<<ipart-14; |
301 | 44.5k | } |
302 | | |
303 | | /*Polynomial approximation of a binary logarithm. |
304 | | Q0 input, Q10 output.*/ |
305 | 1.36M | int oc_blog32_q10(ogg_uint32_t _w){ |
306 | 1.36M | int n; |
307 | 1.36M | int ipart; |
308 | 1.36M | int fpart; |
309 | 1.36M | if(_w<=0)return -1; |
310 | 1.36M | ipart=OC_ILOGNZ_32(_w); |
311 | 1.36M | n=(ipart-16>0?_w>>ipart-16:_w<<16-ipart)-32768-16384; |
312 | 1.36M | fpart=(n*((n*((n*((n*-1402>>15)+2546)>>15)-5216)>>15)+15745)>>15)-6793; |
313 | 1.36M | return (ipart<<10)+(fpart>>4); |
314 | 1.36M | } |