Coverage Report

Created: 2024-09-06 07:53

/src/theora/lib/mathops.c
Line
Count
Source (jump to first uncovered line)
1
#include "internal.h"
2
#include "mathops.h"
3
4
/*The fastest fallback strategy for platforms with fast multiplication appears
5
   to be based on de Bruijn sequences~\cite{LP98}.
6
  Define OC_ILOG_NODEBRUIJN to use a simpler fallback on platforms where
7
   multiplication or table lookups are too expensive.
8
9
  @UNPUBLISHED{LP98,
10
    author="Charles E. Leiserson and Harald Prokop",
11
    title="Using de {Bruijn} Sequences to Index a 1 in a Computer Word",
12
    month=Jun,
13
    year=1998,
14
    note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}"
15
  }*/
16
#if !defined(OC_ILOG_NODEBRUIJN)&&!defined(OC_CLZ32)
17
static const unsigned char OC_DEBRUIJN_IDX32[32]={
18
   0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8,
19
  31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9
20
};
21
#endif
22
23
0
int oc_ilog32(ogg_uint32_t _v){
24
0
#if defined(OC_CLZ32)
25
0
  return OC_CLZ32_OFFS-OC_CLZ32(_v)&-!!_v;
26
#else
27
/*On a Pentium M, this branchless version tested as the fastest version without
28
   multiplications on 1,000,000,000 random 32-bit integers, edging out a
29
   similar version with branches, and a 256-entry LUT version.*/
30
# if defined(OC_ILOG_NODEBRUIJN)
31
  int ret;
32
  int m;
33
  ret=_v>0;
34
  m=(_v>0xFFFFU)<<4;
35
  _v>>=m;
36
  ret|=m;
37
  m=(_v>0xFFU)<<3;
38
  _v>>=m;
39
  ret|=m;
40
  m=(_v>0xFU)<<2;
41
  _v>>=m;
42
  ret|=m;
43
  m=(_v>3)<<1;
44
  _v>>=m;
45
  ret|=m;
46
  ret+=_v>1;
47
  return ret;
48
/*This de Bruijn sequence version is faster if you have a fast multiplier.*/
49
# else
50
  int ret;
51
  _v|=_v>>1;
52
  _v|=_v>>2;
53
  _v|=_v>>4;
54
  _v|=_v>>8;
55
  _v|=_v>>16;
56
  ret=_v&1;
57
  _v=(_v>>1)+1;
58
  ret+=OC_DEBRUIJN_IDX32[_v*0x77CB531U>>27&0x1F];
59
  return ret;
60
# endif
61
#endif
62
0
}
63
64
1.45M
int oc_ilog64(ogg_int64_t _v){
65
1.45M
#if defined(OC_CLZ64)
66
1.45M
  return OC_CLZ64_OFFS-OC_CLZ64(_v)&-!!_v;
67
#else
68
/*If we don't have a fast 64-bit word implementation, split it into two 32-bit
69
   halves.*/
70
# if defined(OC_ILOG_NODEBRUIJN)|| \
71
 defined(OC_CLZ32)||LONG_MAX<9223372036854775807LL
72
  ogg_uint32_t v;
73
  int          ret;
74
  int          m;
75
  m=(_v>0xFFFFFFFFU)<<5;
76
  v=(ogg_uint32_t)(_v>>m);
77
#  if defined(OC_CLZ32)
78
  ret=m+OC_CLZ32_OFFS-OC_CLZ32(v)&-!!v;
79
#  elif defined(OC_ILOG_NODEBRUIJN)
80
  ret=v>0|m;
81
  m=(v>0xFFFFU)<<4;
82
  v>>=m;
83
  ret|=m;
84
  m=(v>0xFFU)<<3;
85
  v>>=m;
86
  ret|=m;
87
  m=(v>0xFU)<<2;
88
  v>>=m;
89
  ret|=m;
90
  m=(v>3)<<1;
91
  v>>=m;
92
  ret|=m;
93
  ret+=v>1;
94
  return ret;
95
#  else
96
  v|=v>>1;
97
  v|=v>>2;
98
  v|=v>>4;
99
  v|=v>>8;
100
  v|=v>>16;
101
  ret=v&1|m;
102
  v=(v>>1)+1;
103
  ret+=OC_DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F];
104
#  endif
105
  return ret;
106
/*Otherwise do it in one 64-bit multiply.*/
107
# else
108
  static const unsigned char OC_DEBRUIJN_IDX64[64]={
109
     0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
110
     5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
111
    63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56,
112
    62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
113
  };
114
  int ret;
115
  _v|=_v>>1;
116
  _v|=_v>>2;
117
  _v|=_v>>4;
118
  _v|=_v>>8;
119
  _v|=_v>>16;
120
  _v|=_v>>32;
121
  ret=(int)_v&1;
122
  _v=(_v>>1)+1;
123
  ret+=OC_DEBRUIJN_IDX64[_v*0x218A392CD3D5DBF>>58&0x3F];
124
  return ret;
125
# endif
126
#endif
127
1.45M
}
128
129
/*round(2**(62+i)*atanh(2**(-(i+1)))/log(2))*/
130
static const ogg_int64_t OC_ATANH_LOG2[32]={
131
  0x32B803473F7AD0F4LL,0x2F2A71BD4E25E916LL,0x2E68B244BB93BA06LL,
132
  0x2E39FB9198CE62E4LL,0x2E2E683F68565C8FLL,0x2E2B850BE2077FC1LL,
133
  0x2E2ACC58FE7B78DBLL,0x2E2A9E2DE52FD5F2LL,0x2E2A92A338D53EECLL,
134
  0x2E2A8FC08F5E19B6LL,0x2E2A8F07E51A485ELL,0x2E2A8ED9BA8AF388LL,
135
  0x2E2A8ECE2FE7384ALL,0x2E2A8ECB4D3E4B1ALL,0x2E2A8ECA94940FE8LL,
136
  0x2E2A8ECA6669811DLL,0x2E2A8ECA5ADEDD6ALL,0x2E2A8ECA57FC347ELL,
137
  0x2E2A8ECA57438A43LL,0x2E2A8ECA57155FB4LL,0x2E2A8ECA5709D510LL,
138
  0x2E2A8ECA5706F267LL,0x2E2A8ECA570639BDLL,0x2E2A8ECA57060B92LL,
139
  0x2E2A8ECA57060008LL,0x2E2A8ECA5705FD25LL,0x2E2A8ECA5705FC6CLL,
140
  0x2E2A8ECA5705FC3ELL,0x2E2A8ECA5705FC33LL,0x2E2A8ECA5705FC30LL,
141
  0x2E2A8ECA5705FC2FLL,0x2E2A8ECA5705FC2FLL
142
};
143
144
/*Computes the binary exponential of _z, a log base 2 in Q57 format.*/
145
954k
ogg_int64_t oc_bexp64(ogg_int64_t _z){
146
954k
  ogg_int64_t w;
147
954k
  ogg_int64_t z;
148
954k
  int         ipart;
149
954k
  ipart=(int)(_z>>57);
150
954k
  if(ipart<0)return 0;
151
865k
  if(ipart>=63)return 0x7FFFFFFFFFFFFFFFLL;
152
863k
  z=_z-OC_Q57(ipart);
153
863k
  if(z){
154
860k
    ogg_int64_t mask;
155
860k
    long        wlo;
156
860k
    int         i;
157
    /*C doesn't give us 64x64->128 muls, so we use CORDIC.
158
      This is not particularly fast, but it's not being used in time-critical
159
       code; it is very accurate.*/
160
    /*z is the fractional part of the log in Q62 format.
161
      We need 1 bit of headroom since the magnitude can get larger than 1
162
       during the iteration, and a sign bit.*/
163
860k
    z<<=5;
164
    /*w is the exponential in Q61 format (since it also needs headroom and can
165
       get as large as 2.0); we could get another bit if we dropped the sign,
166
       but we'll recover that bit later anyway.
167
      Ideally this should start out as
168
        \lim_{n->\infty} 2^{61}/\product_{i=1}^n \sqrt{1-2^{-2i}}
169
       but in order to guarantee convergence we have to repeat iterations 4,
170
        13 (=3*4+1), and 40 (=3*13+1, etc.), so it winds up somewhat larger.*/
171
860k
    w=0x26A3D0E401DD846DLL;
172
3.44M
    for(i=0;;i++){
173
3.44M
      mask=-(z<0);
174
3.44M
      w+=(w>>i+1)+mask^mask;
175
3.44M
      z-=OC_ATANH_LOG2[i]+mask^mask;
176
      /*Repeat iteration 4.*/
177
3.44M
      if(i>=3)break;
178
2.58M
      z<<=1;
179
2.58M
    }
180
8.60M
    for(;;i++){
181
8.60M
      mask=-(z<0);
182
8.60M
      w+=(w>>i+1)+mask^mask;
183
8.60M
      z-=OC_ATANH_LOG2[i]+mask^mask;
184
      /*Repeat iteration 13.*/
185
8.60M
      if(i>=12)break;
186
7.74M
      z<<=1;
187
7.74M
    }
188
18.0M
    for(;i<32;i++){
189
17.2M
      mask=-(z<0);
190
17.2M
      w+=(w>>i+1)+mask^mask;
191
17.2M
      z=z-(OC_ATANH_LOG2[i]+mask^mask)<<1;
192
17.2M
    }
193
860k
    wlo=0;
194
    /*Skip the remaining iterations unless we really require that much
195
       precision.
196
      We could have bailed out earlier for smaller iparts, but that would
197
       require initializing w from a table, as the limit doesn't converge to
198
       61-bit precision until n=30.*/
199
860k
    if(ipart>30){
200
      /*For these iterations, we just update the low bits, as the high bits
201
         can't possibly be affected.
202
        OC_ATANH_LOG2 has also converged (it actually did so one iteration
203
         earlier, but that's no reason for an extra special case).*/
204
808k
      for(;;i++){
205
808k
        mask=-(z<0);
206
808k
        wlo+=(w>>i)+mask^mask;
207
808k
        z-=OC_ATANH_LOG2[31]+mask^mask;
208
        /*Repeat iteration 40.*/
209
808k
        if(i>=39)break;
210
707k
        z<<=1;
211
707k
      }
212
2.32M
      for(;i<61;i++){
213
2.22M
        mask=-(z<0);
214
2.22M
        wlo+=(w>>i)+mask^mask;
215
2.22M
        z=z-(OC_ATANH_LOG2[31]+mask^mask)<<1;
216
2.22M
      }
217
101k
    }
218
860k
    w=(w<<1)+wlo;
219
860k
  }
220
2.89k
  else w=(ogg_int64_t)1<<62;
221
863k
  if(ipart<62)w=(w>>61-ipart)+1>>1;
222
863k
  return w;
223
865k
}
224
225
/*Computes the binary logarithm of _w, returned in Q57 format.*/
226
1.48M
ogg_int64_t oc_blog64(ogg_int64_t _w){
227
1.48M
  ogg_int64_t z;
228
1.48M
  int         ipart;
229
1.48M
  if(_w<=0)return -1;
230
1.45M
  ipart=OC_ILOGNZ_64(_w)-1;
231
1.45M
  if(ipart>61)_w>>=ipart-61;
232
1.45M
  else _w<<=61-ipart;
233
1.45M
  z=0;
234
1.45M
  if(_w&_w-1){
235
1.43M
    ogg_int64_t x;
236
1.43M
    ogg_int64_t y;
237
1.43M
    ogg_int64_t u;
238
1.43M
    ogg_int64_t mask;
239
1.43M
    int         i;
240
    /*C doesn't give us 64x64->128 muls, so we use CORDIC.
241
      This is not particularly fast, but it's not being used in time-critical
242
       code; it is very accurate.*/
243
    /*z is the fractional part of the log in Q61 format.*/
244
    /*x and y are the cosh() and sinh(), respectively, in Q61 format.
245
      We are computing z=2*atanh(y/x)=2*atanh((_w-1)/(_w+1)).*/
246
1.43M
    x=_w+((ogg_int64_t)1<<61);
247
1.43M
    y=_w-((ogg_int64_t)1<<61);
248
7.15M
    for(i=0;i<4;i++){
249
5.72M
      mask=-(y<0);
250
5.72M
      z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
251
5.72M
      u=x>>i+1;
252
5.72M
      x-=(y>>i+1)+mask^mask;
253
5.72M
      y-=u+mask^mask;
254
5.72M
    }
255
    /*Repeat iteration 4.*/
256
15.7M
    for(i--;i<13;i++){
257
14.3M
      mask=-(y<0);
258
14.3M
      z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
259
14.3M
      u=x>>i+1;
260
14.3M
      x-=(y>>i+1)+mask^mask;
261
14.3M
      y-=u+mask^mask;
262
14.3M
    }
263
    /*Repeat iteration 13.*/
264
30.0M
    for(i--;i<32;i++){
265
28.6M
      mask=-(y<0);
266
28.6M
      z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
267
28.6M
      u=x>>i+1;
268
28.6M
      x-=(y>>i+1)+mask^mask;
269
28.6M
      y-=u+mask^mask;
270
28.6M
    }
271
    /*OC_ATANH_LOG2 has converged.*/
272
12.8M
    for(;i<40;i++){
273
11.4M
      mask=-(y<0);
274
11.4M
      z+=(OC_ATANH_LOG2[31]>>i)+mask^mask;
275
11.4M
      u=x>>i+1;
276
11.4M
      x-=(y>>i+1)+mask^mask;
277
11.4M
      y-=u+mask^mask;
278
11.4M
    }
279
    /*Repeat iteration 40.*/
280
34.3M
    for(i--;i<62;i++){
281
32.9M
      mask=-(y<0);
282
32.9M
      z+=(OC_ATANH_LOG2[31]>>i)+mask^mask;
283
32.9M
      u=x>>i+1;
284
32.9M
      x-=(y>>i+1)+mask^mask;
285
32.9M
      y-=u+mask^mask;
286
32.9M
    }
287
1.43M
    z=z+8>>4;
288
1.43M
  }
289
1.45M
  return OC_Q57(ipart)+z;
290
1.48M
}
291
292
/*Polynomial approximation of a binary exponential.
293
  Q10 input, Q0 output.*/
294
44.5k
ogg_uint32_t oc_bexp32_q10(int _z){
295
44.5k
  unsigned n;
296
44.5k
  int      ipart;
297
44.5k
  ipart=_z>>10;
298
44.5k
  n=(_z&(1<<10)-1)<<4;
299
44.5k
  n=(n*((n*((n*((n*3548>>15)+6817)>>15)+15823)>>15)+22708)>>15)+16384;
300
44.5k
  return 14-ipart>0?n+(1<<13-ipart)>>14-ipart:n<<ipart-14;
301
44.5k
}
302
303
/*Polynomial approximation of a binary logarithm.
304
  Q0 input, Q10 output.*/
305
1.36M
int oc_blog32_q10(ogg_uint32_t _w){
306
1.36M
  int n;
307
1.36M
  int ipart;
308
1.36M
  int fpart;
309
1.36M
  if(_w<=0)return -1;
310
1.36M
  ipart=OC_ILOGNZ_32(_w);
311
1.36M
  n=(ipart-16>0?_w>>ipart-16:_w<<16-ipart)-32768-16384;
312
1.36M
  fpart=(n*((n*((n*((n*-1402>>15)+2546)>>15)-5216)>>15)+15745)>>15)-6793;
313
1.36M
  return (ipart<<10)+(fpart>>4);
314
1.36M
}