/src/theora/lib/mathops.c

Source (jump to first uncovered line)
#include "internal.h"
#include "mathops.h"

/*The fastest fallback strategy for platforms with fast multiplication appears
   to be based on de Bruijn sequences~\cite{LP98}.
  Define OC_ILOG_NODEBRUIJN to use a simpler fallback on platforms where
   multiplication or table lookups are too expensive.

  @UNPUBLISHED{LP98,
    author="Charles E. Leiserson and Harald Prokop",
    title="Using de {Bruijn} Sequences to Index a 1 in a Computer Word",
    month=Jun,
    year=1998,
    note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}"
  }*/
#if !defined(OC_ILOG_NODEBRUIJN)&&!defined(OC_CLZ32)
static const unsigned char OC_DEBRUIJN_IDX32[32]={
   0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8,
  31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9
};
#endif

int oc_ilog32(ogg_uint32_t _v){
#if defined(OC_CLZ32)
  return OC_CLZ32_OFFS-OC_CLZ32(_v)&-!!_v;
#else
/*On a Pentium M, this branchless version tested as the fastest version without
   multiplications on 1,000,000,000 random 32-bit integers, edging out a
   similar version with branches, and a 256-entry LUT version.*/
# if defined(OC_ILOG_NODEBRUIJN)
  int ret;
  int m;
  ret=_v>0;
  m=(_v>0xFFFFU)<<4;
  _v>>=m;
  ret|=m;
  m=(_v>0xFFU)<<3;
  _v>>=m;
  ret|=m;
  m=(_v>0xFU)<<2;
  _v>>=m;
  ret|=m;
  m=(_v>3)<<1;
  _v>>=m;
  ret|=m;
  ret+=_v>1;
  return ret;
/*This de Bruijn sequence version is faster if you have a fast multiplier.*/
# else
  int ret;
  _v|=_v>>1;
  _v|=_v>>2;
  _v|=_v>>4;
  _v|=_v>>8;
  _v|=_v>>16;
  ret=_v&1;
  _v=(_v>>1)+1;
  ret+=OC_DEBRUIJN_IDX32[_v*0x77CB531U>>27&0x1F];
  return ret;
# endif
#endif
}

int oc_ilog64(ogg_int64_t _v){
#if defined(OC_CLZ64)
  return OC_CLZ64_OFFS-OC_CLZ64(_v)&-!!_v;
#else
/*If we don't have a fast 64-bit word implementation, split it into two 32-bit
   halves.*/
# if defined(OC_ILOG_NODEBRUIJN)|| \
 defined(OC_CLZ32)||LONG_MAX<9223372036854775807LL
  ogg_uint32_t v;
  int          ret;
  int          m;
  m=(_v>0xFFFFFFFFU)<<5;
  v=(ogg_uint32_t)(_v>>m);
#  if defined(OC_CLZ32)
  ret=m+OC_CLZ32_OFFS-OC_CLZ32(v)&-!!v;
#  elif defined(OC_ILOG_NODEBRUIJN)
  ret=v>0|m;
  m=(v>0xFFFFU)<<4;
  v>>=m;
  ret|=m;
  m=(v>0xFFU)<<3;
  v>>=m;
  ret|=m;
  m=(v>0xFU)<<2;
  v>>=m;
  ret|=m;
  m=(v>3)<<1;
  v>>=m;
  ret|=m;
  ret+=v>1;
  return ret;
#  else
  v|=v>>1;
  v|=v>>2;
  v|=v>>4;
  v|=v>>8;
  v|=v>>16;
  ret=v&1|m;
  v=(v>>1)+1;
  ret+=OC_DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F];
#  endif
  return ret;
/*Otherwise do it in one 64-bit multiply.*/
# else
  static const unsigned char OC_DEBRUIJN_IDX64[64]={
     0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
     5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
    63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56,
    62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
  };
  int ret;
  _v|=_v>>1;
  _v|=_v>>2;
  _v|=_v>>4;
  _v|=_v>>8;
  _v|=_v>>16;
  _v|=_v>>32;
  ret=(int)_v&1;
  _v=(_v>>1)+1;
  ret+=OC_DEBRUIJN_IDX64[_v*0x218A392CD3D5DBF>>58&0x3F];
  return ret;
# endif
#endif
}

/*round(2**(62+i)*atanh(2**(-(i+1)))/log(2))*/
static const ogg_int64_t OC_ATANH_LOG2[32]={
  0x32B803473F7AD0F4LL,0x2F2A71BD4E25E916LL,0x2E68B244BB93BA06LL,
  0x2E39FB9198CE62E4LL,0x2E2E683F68565C8FLL,0x2E2B850BE2077FC1LL,
  0x2E2ACC58FE7B78DBLL,0x2E2A9E2DE52FD5F2LL,0x2E2A92A338D53EECLL,
  0x2E2A8FC08F5E19B6LL,0x2E2A8F07E51A485ELL,0x2E2A8ED9BA8AF388LL,
  0x2E2A8ECE2FE7384ALL,0x2E2A8ECB4D3E4B1ALL,0x2E2A8ECA94940FE8LL,
  0x2E2A8ECA6669811DLL,0x2E2A8ECA5ADEDD6ALL,0x2E2A8ECA57FC347ELL,
  0x2E2A8ECA57438A43LL,0x2E2A8ECA57155FB4LL,0x2E2A8ECA5709D510LL,
  0x2E2A8ECA5706F267LL,0x2E2A8ECA570639BDLL,0x2E2A8ECA57060B92LL,
  0x2E2A8ECA57060008LL,0x2E2A8ECA5705FD25LL,0x2E2A8ECA5705FC6CLL,
  0x2E2A8ECA5705FC3ELL,0x2E2A8ECA5705FC33LL,0x2E2A8ECA5705FC30LL,
  0x2E2A8ECA5705FC2FLL,0x2E2A8ECA5705FC2FLL
};

/*Computes the binary exponential of _z, a log base 2 in Q57 format.*/
ogg_int64_t oc_bexp64(ogg_int64_t _z){
  ogg_int64_t w;
  ogg_int64_t z;
  int         ipart;
  ipart=(int)(_z>>57);
  if(ipart<0)return 0;
  if(ipart>=63)return 0x7FFFFFFFFFFFFFFFLL;
  z=_z-OC_Q57(ipart);
  if(z){
    ogg_int64_t mask;
    long        wlo;
    int         i;
    /*C doesn't give us 64x64->128 muls, so we use CORDIC.
      This is not particularly fast, but it's not being used in time-critical
       code; it is very accurate.*/
    /*z is the fractional part of the log in Q62 format.
      We need 1 bit of headroom since the magnitude can get larger than 1
       during the iteration, and a sign bit.*/
    z<<=5;
    /*w is the exponential in Q61 format (since it also needs headroom and can
       get as large as 2.0); we could get another bit if we dropped the sign,
       but we'll recover that bit later anyway.
      Ideally this should start out as
        \lim_{n->\infty} 2^{61}/\product_{i=1}^n \sqrt{1-2^{-2i}}
       but in order to guarantee convergence we have to repeat iterations 4,
        13 (=3*4+1), and 40 (=3*13+1, etc.), so it winds up somewhat larger.*/
    w=0x26A3D0E401DD846DLL;
    for(i=0;;i++){
      mask=-(z<0);
      w+=(w>>i+1)+mask^mask;
      z-=OC_ATANH_LOG2[i]+mask^mask;
      /*Repeat iteration 4.*/
      if(i>=3)break;
      z<<=1;
    }
    for(;;i++){
      mask=-(z<0);
      w+=(w>>i+1)+mask^mask;
      z-=OC_ATANH_LOG2[i]+mask^mask;
      /*Repeat iteration 13.*/
      if(i>=12)break;
      z<<=1;
    }
    for(;i<32;i++){
      mask=-(z<0);
      w+=(w>>i+1)+mask^mask;
      z=z-(OC_ATANH_LOG2[i]+mask^mask)<<1;
    }
    wlo=0;
    /*Skip the remaining iterations unless we really require that much
       precision.
      We could have bailed out earlier for smaller iparts, but that would
       require initializing w from a table, as the limit doesn't converge to
       61-bit precision until n=30.*/
    if(ipart>30){
      /*For these iterations, we just update the low bits, as the high bits
         can't possibly be affected.
        OC_ATANH_LOG2 has also converged (it actually did so one iteration
         earlier, but that's no reason for an extra special case).*/
      for(;;i++){
        mask=-(z<0);
        wlo+=(w>>i)+mask^mask;
        z-=OC_ATANH_LOG2[31]+mask^mask;
        /*Repeat iteration 40.*/
        if(i>=39)break;
        z<<=1;
      }
      for(;i<61;i++){
        mask=-(z<0);
        wlo+=(w>>i)+mask^mask;
        z=z-(OC_ATANH_LOG2[31]+mask^mask)<<1;
      }
    }
    w=(w<<1)+wlo;
  }
  else w=(ogg_int64_t)1<<62;
  if(ipart<62)w=(w>>61-ipart)+1>>1;
  return w;
}

/*Computes the binary logarithm of _w, returned in Q57 format.*/
ogg_int64_t oc_blog64(ogg_int64_t _w){
  ogg_int64_t z;
  int         ipart;
  if(_w<=0)return -1;
  ipart=OC_ILOGNZ_64(_w)-1;
  if(ipart>61)_w>>=ipart-61;
  else _w<<=61-ipart;
  z=0;
  if(_w&_w-1){
    ogg_int64_t x;
    ogg_int64_t y;
    ogg_int64_t u;
    ogg_int64_t mask;
    int         i;
    /*C doesn't give us 64x64->128 muls, so we use CORDIC.
      This is not particularly fast, but it's not being used in time-critical
       code; it is very accurate.*/
    /*z is the fractional part of the log in Q61 format.*/
    /*x and y are the cosh() and sinh(), respectively, in Q61 format.
      We are computing z=2*atanh(y/x)=2*atanh((_w-1)/(_w+1)).*/
    x=_w+((ogg_int64_t)1<<61);
    y=_w-((ogg_int64_t)1<<61);
    for(i=0;i<4;i++){
      mask=-(y<0);
      z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
      u=x>>i+1;
      x-=(y>>i+1)+mask^mask;
      y-=u+mask^mask;
    }
    /*Repeat iteration 4.*/
    for(i--;i<13;i++){
      mask=-(y<0);
      z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
      u=x>>i+1;
      x-=(y>>i+1)+mask^mask;
      y-=u+mask^mask;
    }
    /*Repeat iteration 13.*/
    for(i--;i<32;i++){
      mask=-(y<0);
      z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
      u=x>>i+1;
      x-=(y>>i+1)+mask^mask;
      y-=u+mask^mask;
    }
    /*OC_ATANH_LOG2 has converged.*/
    for(;i<40;i++){
      mask=-(y<0);
      z+=(OC_ATANH_LOG2[31]>>i)+mask^mask;
      u=x>>i+1;
      x-=(y>>i+1)+mask^mask;
      y-=u+mask^mask;
    }
    /*Repeat iteration 40.*/
    for(i--;i<62;i++){
      mask=-(y<0);
      z+=(OC_ATANH_LOG2[31]>>i)+mask^mask;
      u=x>>i+1;
      x-=(y>>i+1)+mask^mask;
      y-=u+mask^mask;
    }
    z=z+8>>4;
  }
  return OC_Q57(ipart)+z;
}

/*Polynomial approximation of a binary exponential.
  Q10 input, Q0 output.*/
ogg_uint32_t oc_bexp32_q10(int _z){
  unsigned n;
  int      ipart;
  ipart=_z>>10;
  n=(_z&(1<<10)-1)<<4;
  n=(n*((n*((n*((n*3548>>15)+6817)>>15)+15823)>>15)+22708)>>15)+16384;
  return 14-ipart>0?n+(1<<13-ipart)>>14-ipart:n<<ipart-14;
}

/*Polynomial approximation of a binary logarithm.
  Q0 input, Q10 output.*/
int oc_blog32_q10(ogg_uint32_t _w){
  int n;
  int ipart;
  int fpart;
  if(_w<=0)return -1;
  ipart=OC_ILOGNZ_32(_w);
  n=(ipart-16>0?_w>>ipart-16:_w<<16-ipart)-32768-16384;
  fpart=(n*((n*((n*((n*-1402>>15)+2546)>>15)-5216)>>15)+15745)>>15)-6793;
  return (ipart<<10)+(fpart>>4);
}

Coverage Report

Created: 2024-09-06 07:53

Line	Count	Source (jump to first uncovered line)
1		#include "internal.h"
2		#include "mathops.h"
3
4		/*The fastest fallback strategy for platforms with fast multiplication appears
5		to be based on de Bruijn sequences~\cite{LP98}.
6		Define OC_ILOG_NODEBRUIJN to use a simpler fallback on platforms where
7		multiplication or table lookups are too expensive.
8
9		@UNPUBLISHED{LP98,
10		author="Charles E. Leiserson and Harald Prokop",
11		title="Using de {Bruijn} Sequences to Index a 1 in a Computer Word",
12		month=Jun,
13		year=1998,
14		note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}"
15		}*/
16		#if !defined(OC_ILOG_NODEBRUIJN)&&!defined(OC_CLZ32)
17		static const unsigned char OC_DEBRUIJN_IDX32[32]={
18		0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8,
19		31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9
20		};
21		#endif
22
23	0	int oc_ilog32(ogg_uint32_t _v){
24	0	#if defined(OC_CLZ32)
25	0	return OC_CLZ32_OFFS-OC_CLZ32(_v)&-!!_v;
26		#else
27		/*On a Pentium M, this branchless version tested as the fastest version without
28		multiplications on 1,000,000,000 random 32-bit integers, edging out a
29		similar version with branches, and a 256-entry LUT version.*/
30		# if defined(OC_ILOG_NODEBRUIJN)
31		int ret;
32		int m;
33		ret=_v>0;
34		m=(_v>0xFFFFU)<<4;
35		_v>>=m;
36		ret\|=m;
37		m=(_v>0xFFU)<<3;
38		_v>>=m;
39		ret\|=m;
40		m=(_v>0xFU)<<2;
41		_v>>=m;
42		ret\|=m;
43		m=(_v>3)<<1;
44		_v>>=m;
45		ret\|=m;
46		ret+=_v>1;
47		return ret;
48		/This de Bruijn sequence version is faster if you have a fast multiplier./
49		# else
50		int ret;
51		_v\|=_v>>1;
52		_v\|=_v>>2;
53		_v\|=_v>>4;
54		_v\|=_v>>8;
55		_v\|=_v>>16;
56		ret=_v&1;
57		_v=(_v>>1)+1;
58		ret+=OC_DEBRUIJN_IDX32[_v*0x77CB531U>>27&0x1F];
59		return ret;
60		# endif
61		#endif
62	0	}
63
64	1.45M	int oc_ilog64(ogg_int64_t _v){
65	1.45M	#if defined(OC_CLZ64)
66	1.45M	return OC_CLZ64_OFFS-OC_CLZ64(_v)&-!!_v;
67		#else
68		/*If we don't have a fast 64-bit word implementation, split it into two 32-bit
69		halves.*/
70		# if defined(OC_ILOG_NODEBRUIJN)\|\| \
71		defined(OC_CLZ32)\|\|LONG_MAX<9223372036854775807LL
72		ogg_uint32_t v;
73		int ret;
74		int m;
75		m=(_v>0xFFFFFFFFU)<<5;
76		v=(ogg_uint32_t)(_v>>m);
77		# if defined(OC_CLZ32)
78		ret=m+OC_CLZ32_OFFS-OC_CLZ32(v)&-!!v;
79		# elif defined(OC_ILOG_NODEBRUIJN)
80		ret=v>0\|m;
81		m=(v>0xFFFFU)<<4;
82		v>>=m;
83		ret\|=m;
84		m=(v>0xFFU)<<3;
85		v>>=m;
86		ret\|=m;
87		m=(v>0xFU)<<2;
88		v>>=m;
89		ret\|=m;
90		m=(v>3)<<1;
91		v>>=m;
92		ret\|=m;
93		ret+=v>1;
94		return ret;
95		# else
96		v\|=v>>1;
97		v\|=v>>2;
98		v\|=v>>4;
99		v\|=v>>8;
100		v\|=v>>16;
101		ret=v&1\|m;
102		v=(v>>1)+1;
103		ret+=OC_DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F];
104		# endif
105		return ret;
106		/Otherwise do it in one 64-bit multiply./
107		# else
108		static const unsigned char OC_DEBRUIJN_IDX64[64]={
109		0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
110		5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
111		63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56,
112		62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
113		};
114		int ret;
115		_v\|=_v>>1;
116		_v\|=_v>>2;
117		_v\|=_v>>4;
118		_v\|=_v>>8;
119		_v\|=_v>>16;
120		_v\|=_v>>32;
121		ret=(int)_v&1;
122		_v=(_v>>1)+1;
123		ret+=OC_DEBRUIJN_IDX64[_v*0x218A392CD3D5DBF>>58&0x3F];
124		return ret;
125		# endif
126		#endif
127	1.45M	}
128
129		/round(2(62+i)atanh(2*(-(i+1)))/log(2))/
130		static const ogg_int64_t OC_ATANH_LOG2[32]={
131		0x32B803473F7AD0F4LL,0x2F2A71BD4E25E916LL,0x2E68B244BB93BA06LL,
132		0x2E39FB9198CE62E4LL,0x2E2E683F68565C8FLL,0x2E2B850BE2077FC1LL,
133		0x2E2ACC58FE7B78DBLL,0x2E2A9E2DE52FD5F2LL,0x2E2A92A338D53EECLL,
134		0x2E2A8FC08F5E19B6LL,0x2E2A8F07E51A485ELL,0x2E2A8ED9BA8AF388LL,
135		0x2E2A8ECE2FE7384ALL,0x2E2A8ECB4D3E4B1ALL,0x2E2A8ECA94940FE8LL,
136		0x2E2A8ECA6669811DLL,0x2E2A8ECA5ADEDD6ALL,0x2E2A8ECA57FC347ELL,
137		0x2E2A8ECA57438A43LL,0x2E2A8ECA57155FB4LL,0x2E2A8ECA5709D510LL,
138		0x2E2A8ECA5706F267LL,0x2E2A8ECA570639BDLL,0x2E2A8ECA57060B92LL,
139		0x2E2A8ECA57060008LL,0x2E2A8ECA5705FD25LL,0x2E2A8ECA5705FC6CLL,
140		0x2E2A8ECA5705FC3ELL,0x2E2A8ECA5705FC33LL,0x2E2A8ECA5705FC30LL,
141		0x2E2A8ECA5705FC2FLL,0x2E2A8ECA5705FC2FLL
142		};
143
144		/Computes the binary exponential of _z, a log base 2 in Q57 format./
145	954k	ogg_int64_t oc_bexp64(ogg_int64_t _z){
146	954k	ogg_int64_t w;
147	954k	ogg_int64_t z;
148	954k	int ipart;
149	954k	ipart=(int)(_z>>57);
150	954k	if(ipart<0)return 0;
151	865k	if(ipart>=63)return 0x7FFFFFFFFFFFFFFFLL;
152	863k	z=_z-OC_Q57(ipart);
153	863k	if(z){
154	860k	ogg_int64_t mask;
155	860k	long wlo;
156	860k	int i;
157		/*C doesn't give us 64x64->128 muls, so we use CORDIC.
158		This is not particularly fast, but it's not being used in time-critical
159		code; it is very accurate.*/
160		/*z is the fractional part of the log in Q62 format.
161		We need 1 bit of headroom since the magnitude can get larger than 1
162		during the iteration, and a sign bit.*/
163	860k	z<<=5;
164		/*w is the exponential in Q61 format (since it also needs headroom and can
165		get as large as 2.0); we could get another bit if we dropped the sign,
166		but we'll recover that bit later anyway.
167		Ideally this should start out as
168		\lim_{n->\infty} 2^{61}/\product_{i=1}^n \sqrt{1-2^{-2i}}
169		but in order to guarantee convergence we have to repeat iterations 4,
170		13 (=34+1), and 40 (=313+1, etc.), so it winds up somewhat larger.*/
171	860k	w=0x26A3D0E401DD846DLL;
172	3.44M	for(i=0;;i++){
173	3.44M	mask=-(z<0);
174	3.44M	w+=(w>>i+1)+mask^mask;
175	3.44M	z-=OC_ATANH_LOG2[i]+mask^mask;
176		/Repeat iteration 4./
177	3.44M	if(i>=3)break;
178	2.58M	z<<=1;
179	2.58M	}
180	8.60M	for(;;i++){
181	8.60M	mask=-(z<0);
182	8.60M	w+=(w>>i+1)+mask^mask;
183	8.60M	z-=OC_ATANH_LOG2[i]+mask^mask;
184		/Repeat iteration 13./
185	8.60M	if(i>=12)break;
186	7.74M	z<<=1;
187	7.74M	}
188	18.0M	for(;i<32;i++){
189	17.2M	mask=-(z<0);
190	17.2M	w+=(w>>i+1)+mask^mask;
191	17.2M	z=z-(OC_ATANH_LOG2[i]+mask^mask)<<1;
192	17.2M	}
193	860k	wlo=0;
194		/*Skip the remaining iterations unless we really require that much
195		precision.
196		We could have bailed out earlier for smaller iparts, but that would
197		require initializing w from a table, as the limit doesn't converge to
198		61-bit precision until n=30.*/
199	860k	if(ipart>30){
200		/*For these iterations, we just update the low bits, as the high bits
201		can't possibly be affected.
202		OC_ATANH_LOG2 has also converged (it actually did so one iteration
203		earlier, but that's no reason for an extra special case).*/
204	808k	for(;;i++){
205	808k	mask=-(z<0);
206	808k	wlo+=(w>>i)+mask^mask;
207	808k	z-=OC_ATANH_LOG2[31]+mask^mask;
208		/Repeat iteration 40./
209	808k	if(i>=39)break;
210	707k	z<<=1;
211	707k	}
212	2.32M	for(;i<61;i++){
213	2.22M	mask=-(z<0);
214	2.22M	wlo+=(w>>i)+mask^mask;
215	2.22M	z=z-(OC_ATANH_LOG2[31]+mask^mask)<<1;
216	2.22M	}
217	101k	}
218	860k	w=(w<<1)+wlo;
219	860k	}
220	2.89k	else w=(ogg_int64_t)1<<62;
221	863k	if(ipart<62)w=(w>>61-ipart)+1>>1;
222	863k	return w;
223	865k	}
224
225		/Computes the binary logarithm of _w, returned in Q57 format./
226	1.48M	ogg_int64_t oc_blog64(ogg_int64_t _w){
227	1.48M	ogg_int64_t z;
228	1.48M	int ipart;
229	1.48M	if(_w<=0)return -1;
230	1.45M	ipart=OC_ILOGNZ_64(_w)-1;
231	1.45M	if(ipart>61)_w>>=ipart-61;
232	1.45M	else _w<<=61-ipart;
233	1.45M	z=0;
234	1.45M	if(_w&_w-1){
235	1.43M	ogg_int64_t x;
236	1.43M	ogg_int64_t y;
237	1.43M	ogg_int64_t u;
238	1.43M	ogg_int64_t mask;
239	1.43M	int i;
240		/*C doesn't give us 64x64->128 muls, so we use CORDIC.
241		This is not particularly fast, but it's not being used in time-critical
242		code; it is very accurate.*/
243		/z is the fractional part of the log in Q61 format./
244		/*x and y are the cosh() and sinh(), respectively, in Q61 format.
245		We are computing z=2atanh(y/x)=2atanh((_w-1)/(_w+1)).*/
246	1.43M	x=_w+((ogg_int64_t)1<<61);
247	1.43M	y=_w-((ogg_int64_t)1<<61);
248	7.15M	for(i=0;i<4;i++){
249	5.72M	mask=-(y<0);
250	5.72M	z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
251	5.72M	u=x>>i+1;
252	5.72M	x-=(y>>i+1)+mask^mask;
253	5.72M	y-=u+mask^mask;
254	5.72M	}
255		/Repeat iteration 4./
256	15.7M	for(i--;i<13;i++){
257	14.3M	mask=-(y<0);
258	14.3M	z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
259	14.3M	u=x>>i+1;
260	14.3M	x-=(y>>i+1)+mask^mask;
261	14.3M	y-=u+mask^mask;
262	14.3M	}
263		/Repeat iteration 13./
264	30.0M	for(i--;i<32;i++){
265	28.6M	mask=-(y<0);
266	28.6M	z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
267	28.6M	u=x>>i+1;
268	28.6M	x-=(y>>i+1)+mask^mask;
269	28.6M	y-=u+mask^mask;
270	28.6M	}
271		/OC_ATANH_LOG2 has converged./
272	12.8M	for(;i<40;i++){
273	11.4M	mask=-(y<0);
274	11.4M	z+=(OC_ATANH_LOG2[31]>>i)+mask^mask;
275	11.4M	u=x>>i+1;
276	11.4M	x-=(y>>i+1)+mask^mask;
277	11.4M	y-=u+mask^mask;
278	11.4M	}
279		/Repeat iteration 40./
280	34.3M	for(i--;i<62;i++){
281	32.9M	mask=-(y<0);
282	32.9M	z+=(OC_ATANH_LOG2[31]>>i)+mask^mask;
283	32.9M	u=x>>i+1;
284	32.9M	x-=(y>>i+1)+mask^mask;
285	32.9M	y-=u+mask^mask;
286	32.9M	}
287	1.43M	z=z+8>>4;
288	1.43M	}
289	1.45M	return OC_Q57(ipart)+z;
290	1.48M	}
291
292		/*Polynomial approximation of a binary exponential.
293		Q10 input, Q0 output.*/
294	44.5k	ogg_uint32_t oc_bexp32_q10(int _z){
295	44.5k	unsigned n;
296	44.5k	int ipart;
297	44.5k	ipart=_z>>10;
298	44.5k	n=(_z&(1<<10)-1)<<4;
299	44.5k	n=(n((n((n((n3548>>15)+6817)>>15)+15823)>>15)+22708)>>15)+16384;
300	44.5k	return 14-ipart>0?n+(1<<13-ipart)>>14-ipart:n<<ipart-14;
301	44.5k	}
302
303		/*Polynomial approximation of a binary logarithm.
304		Q0 input, Q10 output.*/
305	1.36M	int oc_blog32_q10(ogg_uint32_t _w){
306	1.36M	int n;
307	1.36M	int ipart;
308	1.36M	int fpart;
309	1.36M	if(_w<=0)return -1;
310	1.36M	ipart=OC_ILOGNZ_32(_w);
311	1.36M	n=(ipart-16>0?_w>>ipart-16:_w<<16-ipart)-32768-16384;
312	1.36M	fpart=(n((n((n((n-1402>>15)+2546)>>15)-5216)>>15)+15745)>>15)-6793;
313	1.36M	return (ipart<<10)+(fpart>>4);
314	1.36M	}