Line | Count | Source |
1 | | /* Copyright (c) 2007-2008 CSIRO |
2 | | Copyright (c) 2007-2009 Xiph.Org Foundation |
3 | | Written by Jean-Marc Valin */ |
4 | | /* |
5 | | Redistribution and use in source and binary forms, with or without |
6 | | modification, are permitted provided that the following conditions |
7 | | are met: |
8 | | |
9 | | - Redistributions of source code must retain the above copyright |
10 | | notice, this list of conditions and the following disclaimer. |
11 | | |
12 | | - Redistributions in binary form must reproduce the above copyright |
13 | | notice, this list of conditions and the following disclaimer in the |
14 | | documentation and/or other materials provided with the distribution. |
15 | | |
16 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
17 | | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
18 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
19 | | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER |
20 | | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
21 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
22 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
23 | | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
24 | | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
25 | | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
26 | | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
27 | | */ |
28 | | |
29 | | #ifdef HAVE_CONFIG_H |
30 | | #include "config.h" |
31 | | #endif |
32 | | |
33 | | #include "mathops.h" |
34 | | #include "cwrs.h" |
35 | | #include "vq.h" |
36 | | #include "arch.h" |
37 | | #include "os_support.h" |
38 | | #include "bands.h" |
39 | | #include "rate.h" |
40 | | #include "pitch.h" |
41 | | #include "SigProc_FIX.h" |
42 | | |
43 | | #if defined(FIXED_POINT) |
44 | | void norm_scaleup(celt_norm *X, int N, int shift) { |
45 | | int i; |
46 | | celt_assert(shift >= 0); |
47 | | if (shift <= 0) return; |
48 | | for (i=0;i<N;i++) X[i] = SHL32(X[i], shift); |
49 | | } |
50 | | |
51 | | void norm_scaledown(celt_norm *X, int N, int shift) { |
52 | | int i; |
53 | | celt_assert(shift >= 0); |
54 | | if (shift <= 0) return; |
55 | | for (i=0;i<N;i++) X[i] = PSHR32(X[i], shift); |
56 | | } |
57 | | |
58 | | opus_val32 celt_inner_prod_norm(const celt_norm *x, const celt_norm *y, int len, int arch) { |
59 | | int i; |
60 | | opus_val32 sum = 0; |
61 | | (void)arch; |
62 | | for (i=0;i<len;i++) sum += x[i]*y[i]; |
63 | | return sum; |
64 | | } |
65 | | opus_val32 celt_inner_prod_norm_shift(const celt_norm *x, const celt_norm *y, int len, int arch) { |
66 | | int i; |
67 | | opus_val64 sum = 0; |
68 | | (void)arch; |
69 | | for (i=0;i<len;i++) sum += x[i]*(opus_val64)y[i]; |
70 | | return sum>>2*(NORM_SHIFT-14); |
71 | | } |
72 | | #endif |
73 | | |
74 | | #ifndef OVERRIDE_vq_exp_rotation1 |
75 | | static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_val16 s) |
76 | 169k | { |
77 | 169k | int i; |
78 | 169k | opus_val16 ms; |
79 | 169k | celt_norm *Xptr; |
80 | 169k | Xptr = X; |
81 | 169k | ms = NEG16(s); |
82 | 169k | norm_scaledown(X, len, NORM_SHIFT-14); |
83 | 1.33M | for (i=0;i<len-stride;i++) |
84 | 1.16M | { |
85 | 1.16M | celt_norm x1, x2; |
86 | 1.16M | x1 = Xptr[0]; |
87 | 1.16M | x2 = Xptr[stride]; |
88 | 1.16M | Xptr[stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2), s, x1), 15)); |
89 | 1.16M | *Xptr++ = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15)); |
90 | 1.16M | } |
91 | 169k | Xptr = &X[len-2*stride-1]; |
92 | 1.09M | for (i=len-2*stride-1;i>=0;i--) |
93 | 929k | { |
94 | 929k | celt_norm x1, x2; |
95 | 929k | x1 = Xptr[0]; |
96 | 929k | x2 = Xptr[stride]; |
97 | 929k | Xptr[stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2), s, x1), 15)); |
98 | 929k | *Xptr-- = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15)); |
99 | 929k | } |
100 | 169k | norm_scaleup(X, len, NORM_SHIFT-14); |
101 | 169k | } |
102 | | #endif /* OVERRIDE_vq_exp_rotation1 */ |
103 | | |
104 | | void exp_rotation(celt_norm *X, int len, int dir, int stride, int K, int spread) |
105 | 858k | { |
106 | 858k | static const int SPREAD_FACTOR[3]={15,10,5}; |
107 | 858k | int i; |
108 | 858k | opus_val16 c, s; |
109 | 858k | opus_val16 gain, theta; |
110 | 858k | int stride2=0; |
111 | 858k | int factor; |
112 | | |
113 | 858k | if (2*K>=len || spread==SPREAD_NONE) |
114 | 789k | return; |
115 | 69.2k | factor = SPREAD_FACTOR[spread-1]; |
116 | | |
117 | 69.2k | gain = celt_div((opus_val32)MULT16_16(Q15_ONE,len),(opus_val32)(len+factor*K)); |
118 | 69.2k | theta = HALF16(MULT16_16_Q15(gain,gain)); |
119 | | |
120 | 69.2k | c = celt_cos_norm(EXTEND32(theta)); |
121 | 69.2k | s = celt_cos_norm(EXTEND32(SUB16(Q15ONE,theta))); /* sin(theta) */ |
122 | | |
123 | 69.2k | if (len>=8*stride) |
124 | 34.2k | { |
125 | 34.2k | stride2 = 1; |
126 | | /* This is just a simple (equivalent) way of computing sqrt(len/stride) with rounding. |
127 | | It's basically incrementing long as (stride2+0.5)^2 < len/stride. */ |
128 | 134k | while ((stride2*stride2+stride2)*stride + (stride>>2) < len) |
129 | 100k | stride2++; |
130 | 34.2k | } |
131 | | /*NOTE: As a minor optimization, we could be passing around log2(B), not B, for both this and for |
132 | | extract_collapse_mask().*/ |
133 | 69.2k | len = celt_udiv(len, stride); |
134 | 201k | for (i=0;i<stride;i++) |
135 | 132k | { |
136 | 132k | if (dir < 0) |
137 | 132k | { |
138 | 132k | if (stride2) |
139 | 36.9k | exp_rotation1(X+i*len, len, stride2, s, c); |
140 | 132k | exp_rotation1(X+i*len, len, 1, c, s); |
141 | 132k | } else { |
142 | 0 | exp_rotation1(X+i*len, len, 1, c, -s); |
143 | 0 | if (stride2) |
144 | 0 | exp_rotation1(X+i*len, len, stride2, s, -c); |
145 | 0 | } |
146 | 132k | } |
147 | 69.2k | } |
148 | | |
149 | | /** Normalizes the decoded integer pvq codeword to unit norm. */ |
150 | | static void normalise_residual(int * OPUS_RESTRICT iy, celt_norm * OPUS_RESTRICT X, |
151 | | int N, opus_val32 Ryy, opus_val32 gain, int shift) |
152 | 858k | { |
153 | 858k | int i; |
154 | | #ifdef FIXED_POINT |
155 | | int k; |
156 | | #endif |
157 | 858k | opus_val32 t; |
158 | 858k | opus_val32 g; |
159 | | |
160 | | #ifdef FIXED_POINT |
161 | | k = celt_ilog2(Ryy)>>1; |
162 | | #endif |
163 | 858k | t = VSHR32(Ryy, 2*(k-7)-15); |
164 | 858k | g = MULT32_32_Q31(celt_rsqrt_norm32(t),gain); |
165 | 858k | i=0; |
166 | 858k | (void)shift; |
167 | | #if defined(FIXED_POINT) && defined(ENABLE_QEXT) |
168 | | if (shift>0) { |
169 | | int tot_shift = NORM_SHIFT+1-k-shift; |
170 | | if (tot_shift >= 0) { |
171 | | do X[i] = MULT32_32_Q31(g, SHL32(iy[i], tot_shift)); |
172 | | while (++i < N); |
173 | | } else { |
174 | | do X[i] = MULT32_32_Q31(g, PSHR32(iy[i], -tot_shift)); |
175 | | while (++i < N); |
176 | | } |
177 | | } else |
178 | | #endif |
179 | 3.94M | do X[i] = VSHR32(MULT16_32_Q15(iy[i], g), k+15-NORM_SHIFT); |
180 | 3.94M | while (++i < N); |
181 | 858k | } |
182 | | |
183 | | static unsigned extract_collapse_mask(int *iy, int N, int B) |
184 | 858k | { |
185 | 858k | unsigned collapse_mask; |
186 | 858k | int N0; |
187 | 858k | int i; |
188 | 858k | if (B<=1) |
189 | 697k | return 1; |
190 | | /*NOTE: As a minor optimization, we could be passing around log2(B), not B, for both this and for |
191 | | exp_rotation().*/ |
192 | 160k | N0 = celt_udiv(N, B); |
193 | 160k | collapse_mask = 0; |
194 | 465k | i=0; do { |
195 | 465k | int j; |
196 | 465k | unsigned tmp=0; |
197 | 806k | j=0; do { |
198 | 806k | tmp |= iy[i*N0+j]; |
199 | 806k | } while (++j<N0); |
200 | 465k | collapse_mask |= (tmp!=0)<<i; |
201 | 465k | } while (++i<B); |
202 | 160k | return collapse_mask; |
203 | 858k | } |
204 | | |
205 | | opus_val16 op_pvq_search_c(celt_norm *X, int *iy, int K, int N, int arch) |
206 | 0 | { |
207 | 0 | VARDECL(celt_norm, y); |
208 | 0 | VARDECL(int, signx); |
209 | 0 | int i, j; |
210 | 0 | int pulsesLeft; |
211 | 0 | opus_val32 sum; |
212 | 0 | opus_val32 xy; |
213 | 0 | opus_val16 yy; |
214 | 0 | SAVE_STACK; |
215 | |
|
216 | 0 | (void)arch; |
217 | 0 | ALLOC(y, N, celt_norm); |
218 | 0 | ALLOC(signx, N, int); |
219 | | #ifdef FIXED_POINT |
220 | | { |
221 | | int shift = (celt_ilog2(1+celt_inner_prod_norm_shift(X, X, N, arch))+1)/2; |
222 | | shift = IMAX(0, shift+(NORM_SHIFT-14)-14); |
223 | | norm_scaledown(X, N, shift); |
224 | | } |
225 | | #endif |
226 | | /* Get rid of the sign */ |
227 | 0 | sum = 0; |
228 | 0 | j=0; do { |
229 | 0 | signx[j] = X[j]<0; |
230 | | /* OPT: Make sure the compiler doesn't use a branch on ABS16(). */ |
231 | 0 | X[j] = ABS16(X[j]); |
232 | 0 | iy[j] = 0; |
233 | 0 | y[j] = 0; |
234 | 0 | } while (++j<N); |
235 | |
|
236 | 0 | xy = yy = 0; |
237 | |
|
238 | 0 | pulsesLeft = K; |
239 | | |
240 | | /* Do a pre-search by projecting on the pyramid */ |
241 | 0 | if (K > (N>>1)) |
242 | 0 | { |
243 | 0 | opus_val16 rcp; |
244 | 0 | j=0; do { |
245 | 0 | sum += X[j]; |
246 | 0 | } while (++j<N); |
247 | | |
248 | | /* If X is too small, just replace it with a pulse at 0 */ |
249 | | #ifdef FIXED_POINT |
250 | | if (sum <= K) |
251 | | #else |
252 | | /* Prevents infinities and NaNs from causing too many pulses |
253 | | to be allocated. 64 is an approximation of infinity here. */ |
254 | 0 | if (!(sum > EPSILON && sum < 64)) |
255 | 0 | #endif |
256 | 0 | { |
257 | 0 | X[0] = QCONST16(1.f,14); |
258 | 0 | j=1; do |
259 | 0 | X[j]=0; |
260 | 0 | while (++j<N); |
261 | 0 | sum = QCONST16(1.f,14); |
262 | 0 | } |
263 | | #ifdef FIXED_POINT |
264 | | rcp = EXTRACT16(MULT16_32_Q16(K, celt_rcp(sum))); |
265 | | #else |
266 | | /* Using K+e with e < 1 guarantees we cannot get more than K pulses. */ |
267 | 0 | rcp = EXTRACT16(MULT16_32_Q16(K+0.8f, celt_rcp(sum))); |
268 | 0 | #endif |
269 | 0 | j=0; do { |
270 | | #ifdef FIXED_POINT |
271 | | /* It's really important to round *towards zero* here */ |
272 | | iy[j] = MULT16_16_Q15(X[j],rcp); |
273 | | #else |
274 | 0 | iy[j] = (int)floor(rcp*X[j]); |
275 | 0 | #endif |
276 | 0 | y[j] = (celt_norm)iy[j]; |
277 | 0 | yy = MAC16_16(yy, y[j],y[j]); |
278 | 0 | xy = MAC16_16(xy, X[j],y[j]); |
279 | 0 | y[j] *= 2; |
280 | 0 | pulsesLeft -= iy[j]; |
281 | 0 | } while (++j<N); |
282 | 0 | } |
283 | 0 | celt_sig_assert(pulsesLeft>=0); |
284 | | |
285 | | /* This should never happen, but just in case it does (e.g. on silence) |
286 | | we fill the first bin with pulses. */ |
287 | | #ifdef FIXED_POINT_DEBUG |
288 | | celt_sig_assert(pulsesLeft<=N+3); |
289 | | #endif |
290 | 0 | if (pulsesLeft > N+3) |
291 | 0 | { |
292 | 0 | opus_val16 tmp = (opus_val16)pulsesLeft; |
293 | 0 | yy = MAC16_16(yy, tmp, tmp); |
294 | 0 | yy = MAC16_16(yy, tmp, y[0]); |
295 | 0 | iy[0] += pulsesLeft; |
296 | 0 | pulsesLeft=0; |
297 | 0 | } |
298 | |
|
299 | 0 | for (i=0;i<pulsesLeft;i++) |
300 | 0 | { |
301 | 0 | opus_val16 Rxy, Ryy; |
302 | 0 | int best_id; |
303 | 0 | opus_val32 best_num; |
304 | 0 | opus_val16 best_den; |
305 | | #ifdef FIXED_POINT |
306 | | int rshift; |
307 | | #endif |
308 | | #ifdef FIXED_POINT |
309 | | rshift = 1+celt_ilog2(K-pulsesLeft+i+1); |
310 | | #endif |
311 | 0 | best_id = 0; |
312 | | /* The squared magnitude term gets added anyway, so we might as well |
313 | | add it outside the loop */ |
314 | 0 | yy = ADD16(yy, 1); |
315 | | |
316 | | /* Calculations for position 0 are out of the loop, in part to reduce |
317 | | mispredicted branches (since the if condition is usually false) |
318 | | in the loop. */ |
319 | | /* Temporary sums of the new pulse(s) */ |
320 | 0 | Rxy = EXTRACT16(SHR32(ADD32(xy, EXTEND32(X[0])),rshift)); |
321 | | /* We're multiplying y[j] by two so we don't have to do it here */ |
322 | 0 | Ryy = ADD16(yy, y[0]); |
323 | | |
324 | | /* Approximate score: we maximise Rxy/sqrt(Ryy) (we're guaranteed that |
325 | | Rxy is positive because the sign is pre-computed) */ |
326 | 0 | Rxy = MULT16_16_Q15(Rxy,Rxy); |
327 | 0 | best_den = Ryy; |
328 | 0 | best_num = Rxy; |
329 | 0 | j=1; |
330 | 0 | do { |
331 | | /* Temporary sums of the new pulse(s) */ |
332 | 0 | Rxy = EXTRACT16(SHR32(ADD32(xy, EXTEND32(X[j])),rshift)); |
333 | | /* We're multiplying y[j] by two so we don't have to do it here */ |
334 | 0 | Ryy = ADD16(yy, y[j]); |
335 | | |
336 | | /* Approximate score: we maximise Rxy/sqrt(Ryy) (we're guaranteed that |
337 | | Rxy is positive because the sign is pre-computed) */ |
338 | 0 | Rxy = MULT16_16_Q15(Rxy,Rxy); |
339 | | /* The idea is to check for num/den >= best_num/best_den, but that way |
340 | | we can do it without any division */ |
341 | | /* OPT: It's not clear whether a cmov is faster than a branch here |
342 | | since the condition is more often false than true and using |
343 | | a cmov introduces data dependencies across iterations. The optimal |
344 | | choice may be architecture-dependent. */ |
345 | 0 | if (opus_unlikely(MULT16_16(best_den, Rxy) > MULT16_16(Ryy, best_num))) |
346 | 0 | { |
347 | 0 | best_den = Ryy; |
348 | 0 | best_num = Rxy; |
349 | 0 | best_id = j; |
350 | 0 | } |
351 | 0 | } while (++j<N); |
352 | | |
353 | | /* Updating the sums of the new pulse(s) */ |
354 | 0 | xy = ADD32(xy, EXTEND32(X[best_id])); |
355 | | /* We're multiplying y[j] by two so we don't have to do it here */ |
356 | 0 | yy = ADD16(yy, y[best_id]); |
357 | | |
358 | | /* Only now that we've made the final choice, update y/iy */ |
359 | | /* Multiplying y[j] by 2 so we don't have to do it everywhere else */ |
360 | 0 | y[best_id] += 2; |
361 | 0 | iy[best_id]++; |
362 | 0 | } |
363 | | |
364 | | /* Put the original sign back */ |
365 | 0 | j=0; |
366 | 0 | do { |
367 | | /*iy[j] = signx[j] ? -iy[j] : iy[j];*/ |
368 | | /* OPT: The is more likely to be compiled without a branch than the code above |
369 | | but has the same performance otherwise. */ |
370 | 0 | iy[j] = (iy[j]^-signx[j]) + signx[j]; |
371 | 0 | } while (++j<N); |
372 | 0 | RESTORE_STACK; |
373 | 0 | return yy; |
374 | 0 | } |
375 | | |
376 | | #ifdef ENABLE_QEXT |
377 | | #include "macros.h" |
378 | | |
379 | | static opus_val32 op_pvq_search_N2(const celt_norm *X, int *iy, int *up_iy, int K, int up, int *refine, int shift) { |
380 | | opus_val32 sum; |
381 | | opus_val32 rcp_sum; |
382 | | int offset; |
383 | | sum = ABS32(X[0]) + ABS32(X[1]); |
384 | | if (sum < EPSILON) { |
385 | | iy[0] = K; |
386 | | up_iy[0] = up*K; |
387 | | iy[1]=up_iy[1]=0; |
388 | | *refine=0; |
389 | | #ifdef FIXED_POINT |
390 | | return (opus_val64)K*K*up*up>>2*shift; |
391 | | #else |
392 | | (void)shift; |
393 | | return K*(float)K*up*up; |
394 | | #endif |
395 | | } |
396 | | #ifdef FIXED_POINT |
397 | | int sum_shift; |
398 | | opus_val32 X0; |
399 | | sum_shift = 30-celt_ilog2(sum); |
400 | | rcp_sum = celt_rcp_norm32(SHL32(sum, sum_shift)); |
401 | | X0 = MULT32_32_Q31(SHL32(X[0], sum_shift), rcp_sum); |
402 | | iy[0] = PSHR32(MULT32_32_Q31(SHL32(K, 8), X0), 7); |
403 | | up_iy[0] = PSHR32(MULT32_32_Q31(SHL32(up*K, 8), X0), 7); |
404 | | #else |
405 | | rcp_sum = 1.f/sum; |
406 | | iy[0] = (int)floor(.5f+K*X[0]*rcp_sum); |
407 | | up_iy[0] = (int)floor(.5f+up*K*X[0]*rcp_sum); |
408 | | #endif |
409 | | up_iy[0] = IMAX(up*iy[0] - (up-1)/2, IMIN(up*iy[0] + (up-1)/2, up_iy[0])); |
410 | | offset = up_iy[0] - up*iy[0]; |
411 | | iy[1] = K-abs(iy[0]); |
412 | | up_iy[1] = up*K-abs(up_iy[0]); |
413 | | if (X[1] < 0) { |
414 | | iy[1] = -iy[1]; |
415 | | up_iy[1] = -up_iy[1]; |
416 | | offset = -offset; |
417 | | } |
418 | | *refine = offset; |
419 | | #ifdef FIXED_POINT |
420 | | return (up_iy[0]*(opus_val64)up_iy[0] + up_iy[1]*(opus_val64)up_iy[1] + (1<<2*shift>>1))>>2*shift; |
421 | | #else |
422 | | return up_iy[0]*(opus_val64)up_iy[0] + up_iy[1]*(opus_val64)up_iy[1]; |
423 | | #endif |
424 | | } |
425 | | |
426 | | static int op_pvq_refine(const opus_val32 *Xn, int *iy, int *iy0, int K, int up, int margin, int N) { |
427 | | int i; |
428 | | int dir; |
429 | | VARDECL(opus_val32, rounding); |
430 | | int iysum = 0; |
431 | | SAVE_STACK; |
432 | | ALLOC(rounding, N, opus_val32); |
433 | | for (i=0;i<N;i++) { |
434 | | opus_val32 tmp; |
435 | | tmp = MULT32_32_Q31(SHL32(K, 8), Xn[i]); |
436 | | #ifdef FIXED_POINT |
437 | | iy[i] = (tmp+64) >> 7; |
438 | | #else |
439 | | iy[i] = (int)floor(.5+tmp); |
440 | | #endif |
441 | | rounding[i] = tmp - SHL32(iy[i], 7); |
442 | | } |
443 | | if (iy != iy0) { |
444 | | for (i=0;i<N;i++) iy[i] = IMIN(up*iy0[i]+up-1, IMAX(up*iy0[i]-up+1, iy[i])); |
445 | | } |
446 | | for (i=0;i<N;i++) iysum += iy[i]; |
447 | | if (abs(iysum - K) > 32) { |
448 | | return 1; |
449 | | } |
450 | | dir = iysum < K ? 1 : -1; |
451 | | while (iysum != K) { |
452 | | opus_val32 roundval=-1000000*dir; |
453 | | int roundpos=0; |
454 | | for (i=0;i<N;i++) { |
455 | | if ((rounding[i]-roundval)*dir > 0 && abs(iy[i]-up*iy0[i]) < (margin-1) && !(dir==-1 && iy[i] == 0)) { |
456 | | roundval = rounding[i]; |
457 | | roundpos = i; |
458 | | } |
459 | | } |
460 | | iy[roundpos] += dir; |
461 | | rounding[roundpos] -= SHL32(dir, 15); |
462 | | iysum+=dir; |
463 | | } |
464 | | return 0; |
465 | | } |
466 | | |
467 | | static opus_val32 op_pvq_search_extra(const celt_norm *X, int *iy, int *up_iy, int K, int up, int *refine, int N, int shift) { |
468 | | opus_val32 rcp_sum; |
469 | | opus_val32 sum=0; |
470 | | int i; |
471 | | int failed=0; |
472 | | opus_val64 yy=0; |
473 | | VARDECL(opus_val32, Xn); |
474 | | SAVE_STACK; |
475 | | for (i=0;i<N;i++) sum += ABS32(X[i]); |
476 | | ALLOC(Xn, N, opus_val32); |
477 | | if (sum < EPSILON) |
478 | | failed = 1; |
479 | | else { |
480 | | #ifdef FIXED_POINT |
481 | | int sum_shift = 30-celt_ilog2(sum); |
482 | | rcp_sum = celt_rcp_norm32(SHL32(sum, sum_shift)); |
483 | | for (i=0;i<N;i++) { |
484 | | Xn[i] = MULT32_32_Q31(SHL32(ABS32(X[i]), sum_shift), rcp_sum); |
485 | | } |
486 | | #else |
487 | | rcp_sum = celt_rcp(sum); |
488 | | for (i=0;i<N;i++) { |
489 | | Xn[i] = ABS32(X[i])*rcp_sum; |
490 | | } |
491 | | #endif |
492 | | } |
493 | | failed = failed || op_pvq_refine(Xn, iy, iy, K, 1, K+1, N); |
494 | | failed = failed || op_pvq_refine(Xn, up_iy, iy, up*K, up, up, N); |
495 | | if (failed) { |
496 | | iy[0] = K; |
497 | | for (i=1;i<N;i++) iy[i] = 0; |
498 | | up_iy[0] = up*K; |
499 | | for (i=1;i<N;i++) up_iy[i] = 0; |
500 | | } |
501 | | for (i=0;i<N;i++) { |
502 | | yy += up_iy[i]*(opus_val64)up_iy[i]; |
503 | | if (X[i] < 0) { |
504 | | iy[i] = -iy[i]; |
505 | | up_iy[i] = -up_iy[i]; |
506 | | } |
507 | | refine[i] = up_iy[i]-up*iy[i]; |
508 | | } |
509 | | #ifdef FIXED_POINT |
510 | | return (yy + (1<<2*shift>>1))>>2*shift; |
511 | | #else |
512 | | (void)shift; |
513 | | return yy; |
514 | | #endif |
515 | | } |
516 | | #endif |
517 | | |
518 | | #ifdef ENABLE_QEXT |
519 | | /* Take advantage of the fact that "large" refine values are much less likely |
520 | | than smaller ones. */ |
521 | | static void ec_enc_refine(ec_enc *enc, opus_int32 refine, opus_int32 up, int extra_bits, int use_entropy) { |
522 | | int large; |
523 | | large = abs(refine)>up/2; |
524 | | ec_enc_bit_logp(enc, large, use_entropy ? 3 : 1); |
525 | | if (large) { |
526 | | ec_enc_bits(enc, refine < 0, 1); |
527 | | ec_enc_bits(enc, abs(refine)-up/2-1, extra_bits-1); |
528 | | } else { |
529 | | ec_enc_bits(enc, refine+up/2, extra_bits); |
530 | | } |
531 | | } |
532 | | |
533 | | static int ec_dec_refine(ec_enc *dec, opus_int32 up, int extra_bits, int use_entropy) { |
534 | | int large, refine; |
535 | | large = ec_dec_bit_logp(dec, use_entropy ? 3 : 1); |
536 | | if (large) { |
537 | | int sign = ec_dec_bits(dec, 1); |
538 | | refine = ec_dec_bits(dec, extra_bits-1) + up/2+1; |
539 | | if (sign) refine = -refine; |
540 | | } else { |
541 | | refine = (opus_int32)ec_dec_bits(dec, extra_bits)-up/2; |
542 | | } |
543 | | return refine; |
544 | | } |
545 | | #endif |
546 | | |
547 | | unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc, |
548 | | opus_val32 gain, int resynth |
549 | | ARG_QEXT(ec_enc *ext_enc) ARG_QEXT(int extra_bits), int arch) |
550 | 0 | { |
551 | 0 | VARDECL(int, iy); |
552 | 0 | opus_val32 yy; |
553 | 0 | unsigned collapse_mask; |
554 | | #ifdef ENABLE_QEXT |
555 | | int yy_shift = 0; |
556 | | #endif |
557 | 0 | SAVE_STACK; |
558 | |
|
559 | 0 | celt_assert2(K>0, "alg_quant() needs at least one pulse"); |
560 | 0 | celt_assert2(N>1, "alg_quant() needs at least two dimensions"); |
561 | | |
562 | | /* Covers vectorization by up to 4. */ |
563 | 0 | ALLOC(iy, N+3, int); |
564 | |
|
565 | 0 | exp_rotation(X, N, 1, B, K, spread); |
566 | |
|
567 | | #ifdef ENABLE_QEXT |
568 | | if (N==2 && extra_bits >= 2) { |
569 | | int refine; |
570 | | int up_iy[2]; |
571 | | int up; |
572 | | yy_shift = IMAX(0, extra_bits-7); |
573 | | up = (1<<extra_bits)-1; |
574 | | yy = op_pvq_search_N2(X, iy, up_iy, K, up, &refine, yy_shift); |
575 | | collapse_mask = extract_collapse_mask(up_iy, N, B); |
576 | | encode_pulses(iy, N, K, enc); |
577 | | ec_enc_uint(ext_enc, refine+(up-1)/2, up); |
578 | | if (resynth) |
579 | | normalise_residual(up_iy, X, N, yy, gain, yy_shift); |
580 | | } else if (extra_bits >= 2) { |
581 | | int i; |
582 | | VARDECL(int, up_iy); |
583 | | VARDECL(int, refine); |
584 | | int up, use_entropy; |
585 | | ALLOC(up_iy, N, int); |
586 | | ALLOC(refine, N, int); |
587 | | yy_shift = IMAX(0, extra_bits-7); |
588 | | up = (1<<extra_bits)-1; |
589 | | yy = op_pvq_search_extra(X, iy, up_iy, K, up, refine, N, yy_shift); |
590 | | collapse_mask = extract_collapse_mask(up_iy, N, B); |
591 | | encode_pulses(iy, N, K, enc); |
592 | | use_entropy = (ext_enc->storage*8 - ec_tell(ext_enc)) > (unsigned)(N-1)*(extra_bits+3)+1; |
593 | | for (i=0;i<N-1;i++) ec_enc_refine(ext_enc, refine[i], up, extra_bits, use_entropy); |
594 | | if (iy[N-1]==0) ec_enc_bits(ext_enc, up_iy[N-1]<0, 1); |
595 | | if (resynth) |
596 | | normalise_residual(up_iy, X, N, yy, gain, yy_shift); |
597 | | } else |
598 | | #endif |
599 | 0 | { |
600 | 0 | yy = op_pvq_search(X, iy, K, N, arch); |
601 | 0 | collapse_mask = extract_collapse_mask(iy, N, B); |
602 | 0 | encode_pulses(iy, N, K, enc); |
603 | 0 | if (resynth) |
604 | 0 | normalise_residual(iy, X, N, yy, gain, 0); |
605 | 0 | } |
606 | |
|
607 | 0 | if (resynth) |
608 | 0 | exp_rotation(X, N, -1, B, K, spread); |
609 | |
|
610 | 0 | RESTORE_STACK; |
611 | 0 | return collapse_mask; |
612 | 0 | } |
613 | | |
614 | | /** Decode pulse vector and combine the result with the pitch vector to produce |
615 | | the final normalised signal in the current band. */ |
616 | | unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B, |
617 | | ec_dec *dec, opus_val32 gain |
618 | | ARG_QEXT(ec_enc *ext_dec) ARG_QEXT(int extra_bits)) |
619 | 858k | { |
620 | 858k | opus_val32 Ryy; |
621 | 858k | unsigned collapse_mask; |
622 | 858k | VARDECL(int, iy); |
623 | 858k | int yy_shift=0; |
624 | 858k | SAVE_STACK; |
625 | | |
626 | 858k | celt_assert2(K>0, "alg_unquant() needs at least one pulse"); |
627 | 858k | celt_assert2(N>1, "alg_unquant() needs at least two dimensions"); |
628 | 858k | ALLOC(iy, N, int); |
629 | 858k | Ryy = decode_pulses(iy, N, K, dec); |
630 | | #ifdef ENABLE_QEXT |
631 | | if (N==2 && extra_bits >= 2) { |
632 | | int up; |
633 | | int refine; |
634 | | yy_shift = IMAX(0, extra_bits-7); |
635 | | up = (1<<extra_bits)-1; |
636 | | refine = (opus_int32)ec_dec_uint(ext_dec, up) - (up-1)/2; |
637 | | iy[0] *= up; |
638 | | iy[1] *= up; |
639 | | if (iy[1] == 0) { |
640 | | iy[1] = (iy[0] > 0) ? -refine : refine; |
641 | | iy[0] += (refine*(opus_int64)iy[0] > 0) ? -refine : refine; |
642 | | } else if (iy[1] > 0) { |
643 | | iy[0] += refine; |
644 | | iy[1] -= refine*(iy[0]>0?1:-1); |
645 | | } else { |
646 | | iy[0] -= refine; |
647 | | iy[1] -= refine*(iy[0]>0?1:-1); |
648 | | } |
649 | | #ifdef FIXED_POINT |
650 | | Ryy = (iy[0]*(opus_val64)iy[0] + iy[1]*(opus_val64)iy[1] + (1<<2*yy_shift>>1)) >> 2*yy_shift; |
651 | | #else |
652 | | Ryy = iy[0]*(opus_val64)iy[0] + iy[1]*(opus_val64)iy[1]; |
653 | | #endif |
654 | | } else if (extra_bits >= 2) { |
655 | | int i; |
656 | | opus_val64 yy64; |
657 | | VARDECL(int, refine); |
658 | | int up, use_entropy; |
659 | | int sign=0; |
660 | | ALLOC(refine, N, int); |
661 | | yy_shift = IMAX(0, extra_bits-7); |
662 | | up = (1<<extra_bits)-1; |
663 | | use_entropy = (ext_dec->storage*8 - ec_tell(ext_dec)) > (unsigned)(N-1)*(extra_bits+3)+1; |
664 | | for (i=0;i<N-1;i++) refine[i] = ec_dec_refine(ext_dec, up, extra_bits, use_entropy); |
665 | | if (iy[N-1]==0) sign = ec_dec_bits(ext_dec, 1); |
666 | | else sign = iy[N-1] < 0; |
667 | | for (i=0;i<N-1;i++) { |
668 | | iy[i] = iy[i]*up + refine[i]; |
669 | | } |
670 | | iy[N-1] = up*K; |
671 | | for (i=0;i<N-1;i++) iy[N-1] -= abs(iy[i]); |
672 | | if (sign) iy[N-1] = -iy[N-1]; |
673 | | yy64 = 0; |
674 | | for (i=0;i<N;i++) yy64 += iy[i]*(opus_val64)iy[i]; |
675 | | #ifdef FIXED_POINT |
676 | | Ryy = (yy64 + (1<<2*yy_shift>>1)) >> 2*yy_shift; |
677 | | #else |
678 | | Ryy = yy64; |
679 | | #endif |
680 | | } |
681 | | #endif |
682 | 858k | normalise_residual(iy, X, N, Ryy, gain, yy_shift); |
683 | 858k | exp_rotation(X, N, -1, B, K, spread); |
684 | 858k | collapse_mask = extract_collapse_mask(iy, N, B); |
685 | 858k | RESTORE_STACK; |
686 | 858k | return collapse_mask; |
687 | 858k | } |
688 | | |
689 | | #ifndef OVERRIDE_renormalise_vector |
690 | | void renormalise_vector(celt_norm *X, int N, opus_val32 gain, int arch) |
691 | 2.67M | { |
692 | 2.67M | int i; |
693 | | #ifdef FIXED_POINT |
694 | | int k; |
695 | | #endif |
696 | 2.67M | opus_val32 E; |
697 | 2.67M | opus_val16 g; |
698 | 2.67M | opus_val32 t; |
699 | 2.67M | celt_norm *xptr; |
700 | 2.67M | norm_scaledown(X, N, NORM_SHIFT-14); |
701 | 2.67M | E = EPSILON + celt_inner_prod_norm(X, X, N, arch); |
702 | | #ifdef FIXED_POINT |
703 | | k = celt_ilog2(E)>>1; |
704 | | #endif |
705 | 2.67M | t = VSHR32(E, 2*(k-7)); |
706 | 2.67M | g = MULT32_32_Q31(celt_rsqrt_norm(t),gain); |
707 | | |
708 | 2.67M | xptr = X; |
709 | 51.1M | for (i=0;i<N;i++) |
710 | 48.5M | { |
711 | 48.5M | *xptr = EXTRACT16(PSHR32(MULT16_16(g, *xptr), k+15-14)); |
712 | 48.5M | xptr++; |
713 | 48.5M | } |
714 | 2.67M | norm_scaleup(X, N, NORM_SHIFT-14); |
715 | | /*return celt_sqrt(E);*/ |
716 | 2.67M | } |
717 | | #endif /* OVERRIDE_renormalise_vector */ |
718 | | |
719 | | opus_int32 stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N, int arch) |
720 | 0 | { |
721 | 0 | int i; |
722 | 0 | int itheta; |
723 | 0 | opus_val32 mid, side; |
724 | 0 | opus_val32 Emid, Eside; |
725 | |
|
726 | 0 | Emid = Eside = 0; |
727 | 0 | if (stereo) |
728 | 0 | { |
729 | 0 | for (i=0;i<N;i++) |
730 | 0 | { |
731 | 0 | celt_norm m, s; |
732 | 0 | m = PSHR32(ADD32(X[i], Y[i]), NORM_SHIFT-13); |
733 | 0 | s = PSHR32(SUB32(X[i], Y[i]), NORM_SHIFT-13); |
734 | 0 | Emid = MAC16_16(Emid, m, m); |
735 | 0 | Eside = MAC16_16(Eside, s, s); |
736 | 0 | } |
737 | 0 | } else { |
738 | 0 | Emid += celt_inner_prod_norm_shift(X, X, N, arch); |
739 | 0 | Eside += celt_inner_prod_norm_shift(Y, Y, N, arch); |
740 | 0 | } |
741 | 0 | mid = celt_sqrt32(Emid); |
742 | 0 | side = celt_sqrt32(Eside); |
743 | | #if defined(FIXED_POINT) |
744 | | itheta = celt_atan2p_norm(side, mid); |
745 | | #else |
746 | 0 | itheta = (int)floor(.5f+65536.f*16384*celt_atan2p_norm(side,mid)); |
747 | 0 | #endif |
748 | |
|
749 | 0 | return itheta; |
750 | 0 | } |
751 | | |
752 | | #ifdef ENABLE_QEXT |
753 | | |
754 | | static void cubic_synthesis(celt_norm *X, int *iy, int N, int K, int face, int sign, opus_val32 gain) { |
755 | | int i; |
756 | | opus_val32 sum=0; |
757 | | opus_val32 mag; |
758 | | #ifdef FIXED_POINT |
759 | | int sum_shift; |
760 | | int shift = IMAX(celt_ilog2(K) + celt_ilog2(N)/2 - 13, 0); |
761 | | #endif |
762 | | for (i=0;i<N;i++) { |
763 | | X[i] = (1+2*iy[i])-K; |
764 | | } |
765 | | X[face] = sign ? -K : K; |
766 | | for (i=0;i<N;i++) { |
767 | | sum += PSHR32(MULT16_16(X[i],X[i]), 2*shift); |
768 | | } |
769 | | #ifdef FIXED_POINT |
770 | | sum_shift = (29-celt_ilog2(sum))>>1; |
771 | | mag = celt_rsqrt_norm32(SHL32(sum, 2*sum_shift+1)); |
772 | | for (i=0;i<N;i++) { |
773 | | X[i] = VSHR32(MULT16_32_Q15(X[i],MULT32_32_Q31(mag,gain)), shift-sum_shift+29-NORM_SHIFT); |
774 | | } |
775 | | #else |
776 | | mag = 1.f/sqrt(sum); |
777 | | for (i=0;i<N;i++) { |
778 | | X[i] *= mag*gain; |
779 | | } |
780 | | #endif |
781 | | } |
782 | | |
783 | | unsigned cubic_quant(celt_norm *X, int N, int res, int B, ec_enc *enc, opus_val32 gain, int resynth) { |
784 | | int i; |
785 | | int face=0; |
786 | | int K; |
787 | | VARDECL(int, iy); |
788 | | celt_norm faceval=-1; |
789 | | opus_val32 norm; |
790 | | int sign; |
791 | | SAVE_STACK; |
792 | | ALLOC(iy, N, int); |
793 | | K = 1<<res; |
794 | | /* Using odd K on transients to avoid adding pre-echo. */ |
795 | | if (B!=1) K=IMAX(1, K-1); |
796 | | if (K==1) { |
797 | | if (resynth) OPUS_CLEAR(X, N); |
798 | | RESTORE_STACK; |
799 | | return 0; |
800 | | } |
801 | | for (i=0;i<N;i++) { |
802 | | if (ABS32(X[i]) > faceval) { |
803 | | faceval = ABS32(X[i]); |
804 | | face = i; |
805 | | } |
806 | | } |
807 | | sign = X[face]<0; |
808 | | ec_enc_uint(enc, face, N); |
809 | | ec_enc_bits(enc, sign, 1); |
810 | | #ifdef FIXED_POINT |
811 | | if (faceval != 0) { |
812 | | int face_shift = 30-celt_ilog2(faceval); |
813 | | norm = celt_rcp_norm32(SHL32(faceval, face_shift)); |
814 | | norm = MULT16_32_Q15(K, norm); |
815 | | for (i=0;i<N;i++) { |
816 | | /* By computing X[i]+faceval inside the shift, the result is guaranteed non-negative. */ |
817 | | iy[i] = IMIN(K-1, (MULT32_32_Q31(SHL32(X[i]+faceval, face_shift-1), norm)) >> 15); |
818 | | } |
819 | | } else { |
820 | | OPUS_CLEAR(iy, N); |
821 | | } |
822 | | #else |
823 | | norm = .5f*K/(faceval+EPSILON); |
824 | | for (i=0;i<N;i++) { |
825 | | iy[i] = IMIN(K-1, (int)floor((X[i]+faceval)*norm)); |
826 | | } |
827 | | #endif |
828 | | for (i=0;i<N;i++) { |
829 | | if (i != face) ec_enc_bits(enc, iy[i], res); |
830 | | } |
831 | | if (resynth) { |
832 | | cubic_synthesis(X, iy, N, K, face, sign, gain); |
833 | | } |
834 | | RESTORE_STACK; |
835 | | return (1<<B)-1; |
836 | | } |
837 | | |
838 | | unsigned cubic_unquant(celt_norm *X, int N, int res, int B, ec_dec *dec, opus_val32 gain) { |
839 | | int i; |
840 | | int face; |
841 | | int sign; |
842 | | int K; |
843 | | VARDECL(int, iy); |
844 | | SAVE_STACK; |
845 | | ALLOC(iy, N, int); |
846 | | K = 1<<res; |
847 | | /* Using odd K on transients to avoid adding pre-echo. */ |
848 | | if (B!=1) K=IMAX(1, K-1); |
849 | | if (K==1) { |
850 | | OPUS_CLEAR(X, N); |
851 | | return 0; |
852 | | } |
853 | | face = ec_dec_uint(dec, N); |
854 | | sign = ec_dec_bits(dec, 1); |
855 | | for (i=0;i<N;i++) { |
856 | | if (i != face) iy[i] = ec_dec_bits(dec, res); |
857 | | } |
858 | | iy[face]=0; |
859 | | cubic_synthesis(X, iy, N, K, face, sign, gain); |
860 | | RESTORE_STACK; |
861 | | return (1<<B)-1; |
862 | | } |
863 | | #endif |